dbt-cube-sync 0.1.0a11__tar.gz → 0.1.0a13__tar.gz

This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.

Potentially problematic release: this version of dbt-cube-sync might be problematic.

Files changed (17)
  1. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/PKG-INFO +1 -1
  2. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/cli.py +91 -57
  3. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/superset.py +17 -7
  4. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/models.py +4 -12
  5. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/state_manager.py +97 -47
  6. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/pyproject.toml +1 -1
  7. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/README.md +0 -0
  8. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/__init__.py +0 -0
  9. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/config.py +0 -0
  10. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/__init__.py +0 -0
  11. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/base.py +0 -0
  12. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/powerbi.py +0 -0
  13. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/tableau.py +0 -0
  14. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/__init__.py +0 -0
  15. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/cube_generator.py +0 -0
  16. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/db_inspector.py +0 -0
  17. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/dbt_parser.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dbt-cube-sync
-Version: 0.1.0a11
+Version: 0.1.0a13
 Summary: Synchronization tool for dbt models to Cube.js schemas and BI tools
 Author: Ponder
 Requires-Python: >=3.9,<4.0
dbt_cube_sync/cli.py
@@ -444,16 +444,21 @@ def sync_all(
     manifest, manifest_nodes, {}
 )
 
-# Update cube_sync step state
-current_state = state_manager.update_step_state(
-    current_state,
-    'cube_sync',
-    'failed' if cube_sync_error else 'success',
-    cube_sync_error
-)
+# Save cube sync state
 state_manager.save_state(current_state)
 click.echo(f" State saved to {state_path}")
 
+if cube_sync_error:
+    click.echo(f" Error during cube generation: {cube_sync_error}", err=True)
+
+# Build a mapping from model name (file stem) to node_id for status updates
+model_name_to_node_id = {}
+for node_id in current_state.models.keys():
+    # Extract model name from output file (e.g., "model/cubes/ModelName.js" -> "ModelName")
+    output_file = current_state.models[node_id].output_file
+    model_name = Path(output_file).stem
+    model_name_to_node_id[model_name] = node_id
+
 # ============================================================
 # STEP 2: Sync to Superset (if configured)
 # ============================================================
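
The new mapping leans on a convention: the stem of each generated Cube.js file is the model's name, so connector results (which report file names) can later be traced back to dbt node_ids. A minimal sketch of that lookup, with hypothetical node_ids and paths:

    from pathlib import Path

    # Hypothetical state entries: node_id -> generated Cube.js file
    models = {
        "model.analytics.orders": "model/cubes/Orders.js",
        "model.analytics.customers": "model/cubes/Customers.js",
    }

    # Reverse lookup built the same way as above: file stem -> node_id
    model_name_to_node_id = {Path(f).stem: node_id for node_id, f in models.items()}

    assert model_name_to_node_id["Orders"] == "model.analytics.orders"
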
@@ -462,17 +467,21 @@ def sync_all(
 
 if not superset_url or not superset_username or not superset_password:
     click.echo(" Skipped - no Superset credentials provided")
-    current_state = state_manager.update_step_state(current_state, 'superset_sync', 'skipped')
-    state_manager.save_state(current_state)
 else:
-    should_run_superset = state_manager.should_run_step(
-        'superset_sync', previous_state, changes_detected
-    ) or force_full_sync
+    # Get models that need Superset sync (status is None or 'failed')
+    models_to_sync_ids = state_manager.get_models_needing_sync(current_state, 'superset')
 
-    if not should_run_superset:
-        click.echo(" Skipped - no changes and previous sync succeeded")
+    if not models_to_sync_ids and not force_full_sync:
+        click.echo(" Skipped - all models already synced successfully")
     else:
-        superset_error = None
+        # Convert node_ids to model names for filtering
+        models_to_sync_names = set()
+        for node_id in models_to_sync_ids:
+            if node_id in current_state.models:
+                output_file = current_state.models[node_id].output_file
+                model_name = Path(output_file).stem
+                models_to_sync_names.add(model_name)
+
         try:
             connector_config = {
                 'url': superset_url,
@@ -482,24 +491,34 @@ def sync_all(
             }
 
             connector = ConnectorRegistry.get_connector('superset', **connector_config)
-            results = connector.sync_cube_schemas(output)
+
+            if force_full_sync:
+                results = connector.sync_cube_schemas(output)
+            else:
+                results = connector.sync_cube_schemas(output, models_to_sync_names)
+
+            # Update per-model status
+            for r in results:
+                model_name = r.file_or_dataset.replace('.js', '')
+                node_id = model_name_to_node_id.get(model_name)
+                if node_id:
+                    state_manager.update_model_sync_status(
+                        current_state, node_id, 'superset',
+                        'success' if r.status == 'success' else 'failed'
+                    )
 
             successful = sum(1 for r in results if r.status == 'success')
             failed = sum(1 for r in results if r.status == 'failed')
             click.echo(f" Synced: {successful} successful, {failed} failed")
 
-            if failed > 0:
-                superset_error = f"{failed} datasets failed to sync"
         except Exception as e:
-            superset_error = str(e)
-            click.echo(f" Error: {superset_error}", err=True)
-
-        current_state = state_manager.update_step_state(
-            current_state,
-            'superset_sync',
-            'failed' if superset_error else 'success',
-            superset_error
-        )
+            click.echo(f" Error: {str(e)}", err=True)
+            # Mark all models we tried to sync as failed
+            for node_id in models_to_sync_ids:
+                state_manager.update_model_sync_status(
+                    current_state, node_id, 'superset', 'failed'
+                )
+
 
 state_manager.save_state(current_state)
 
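
The per-model bookkeeping assumes each SyncResult names the file it processed. A sketch of the round trip with a stand-in SyncResult carrying only the two fields the loop above reads (file_or_dataset and status):

    from dataclasses import dataclass

    @dataclass
    class SyncResult:  # stand-in; the real class lives in the connector package
        file_or_dataset: str
        status: str

    model_name_to_node_id = {"Orders": "model.analytics.orders"}
    results = [SyncResult("Orders.js", "success")]

    statuses = {}
    for r in results:
        node_id = model_name_to_node_id.get(r.file_or_dataset.replace('.js', ''))
        if node_id:
            statuses[node_id] = 'success' if r.status == 'success' else 'failed'

    assert statuses == {"model.analytics.orders": "success"}

One subtlety: replace('.js', '') strips the substring anywhere in the name, so a file like my.js.model.js would not round-trip; Path(name).stem, as used when building the map, would be the stricter choice.
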
@@ -510,17 +529,16 @@ def sync_all(
 
 if not rag_api_url:
     click.echo(" Skipped - no RAG API URL provided")
-    current_state = state_manager.update_step_state(current_state, 'rag_sync', 'skipped')
-    state_manager.save_state(current_state)
 else:
-    should_run_rag = state_manager.should_run_step(
-        'rag_sync', previous_state, changes_detected
-    ) or force_full_sync
+    # Get models that need RAG sync (status is None or 'failed')
+    models_to_embed_ids = state_manager.get_models_needing_sync(current_state, 'rag')
 
-    if not should_run_rag:
-        click.echo(" Skipped - no changes and previous sync succeeded")
+    if not models_to_embed_ids and not force_full_sync:
+        click.echo(" Skipped - all models already synced successfully")
     else:
-        rag_error = None
+        if force_full_sync:
+            models_to_embed_ids = set(current_state.models.keys())
+
         try:
             # Call the RAG API to re-ingest embeddings
             response = requests.post(
@@ -532,19 +550,26 @@ def sync_all(
             if response.status_code == 200:
                 result = response.json()
                 click.echo(f" Ingested {result.get('schemas_ingested', 0)} schema documents")
+                # Mark all models as succeeded
+                for node_id in models_to_embed_ids:
+                    state_manager.update_model_sync_status(
+                        current_state, node_id, 'rag', 'success'
+                    )
             else:
-                rag_error = f"RAG API returned {response.status_code}"
-                click.echo(f" Error: {rag_error}", err=True)
+                click.echo(f" Error: RAG API returned {response.status_code}", err=True)
+                # Mark all models as failed
+                for node_id in models_to_embed_ids:
+                    state_manager.update_model_sync_status(
+                        current_state, node_id, 'rag', 'failed'
+                    )
         except requests.RequestException as e:
-            rag_error = str(e)
-            click.echo(f" Error: Could not reach RAG API: {rag_error}", err=True)
-
-        current_state = state_manager.update_step_state(
-            current_state,
-            'rag_sync',
-            'failed' if rag_error else 'success',
-            rag_error
-        )
+            click.echo(f" Error: Could not reach RAG API: {e}", err=True)
+            # Mark all models as failed
+            for node_id in models_to_embed_ids:
+                state_manager.update_model_sync_status(
+                    current_state, node_id, 'rag', 'failed'
+                )
+
 state_manager.save_state(current_state)
 
 # ============================================================
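
Note the contrast with the Superset step: the RAG sync is all-or-nothing, since a single POST re-ingests every schema document, so every pending model is stamped with the same outcome. The bookkeeping reduces to a sketch like:

    def rag_mark_all(state_manager, state, node_ids, status):
        # One API call covers every schema, so all pending models share one result
        for node_id in node_ids:
            state_manager.update_model_sync_status(state, node_id, 'rag', status)
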
@@ -554,10 +579,23 @@ def sync_all(
 click.echo("SYNC COMPLETE")
 click.echo("=" * 60)
 
-# Show step statuses
-click.echo(f" Cube sync: {current_state.cube_sync.status if current_state.cube_sync else 'unknown'}")
-click.echo(f" Superset sync: {current_state.superset_sync.status if current_state.superset_sync else 'unknown'}")
-click.echo(f" RAG sync: {current_state.rag_sync.status if current_state.rag_sync else 'unknown'}")
+# Get per-model sync summaries
+superset_summary = state_manager.get_sync_summary(current_state, 'superset')
+rag_summary = state_manager.get_sync_summary(current_state, 'rag')
+
+def format_summary(summary, step_configured):
+    if not step_configured:
+        return "skipped (not configured)"
+    if summary['failed'] > 0:
+        return f"{summary['success']} success, {summary['failed']} failed (will retry)"
+    elif summary['pending'] > 0:
+        return f"{summary['success']} success, {summary['pending']} pending"
+    else:
+        return f"{summary['success']} success"
+
+click.echo(f" Cube.js files: {len(current_state.models)} models")
+click.echo(f" Superset sync: {format_summary(superset_summary, superset_url)}")
+click.echo(f" RAG sync: {format_summary(rag_summary, rag_api_url)}")
 
 if changes_detected or force_full_sync:
     click.echo(f" Models processed: {len(added_models) + len(modified_models)}")
@@ -566,14 +604,10 @@ def sync_all(
 else:
     click.echo(" No model changes detected")
 
-# Exit with error if any step failed
-any_failed = (
-    (current_state.cube_sync and current_state.cube_sync.status == 'failed') or
-    (current_state.superset_sync and current_state.superset_sync.status == 'failed') or
-    (current_state.rag_sync and current_state.rag_sync.status == 'failed')
-)
+# Exit with error if any models failed
+any_failed = superset_summary['failed'] > 0 or rag_summary['failed'] > 0
 if any_failed:
-    click.echo("\n ⚠️ Some steps failed - they will be retried on next run")
+    click.echo("\n ⚠️ Some models failed - they will be retried on next run")
     sys.exit(1)
 
 except Exception as e:
dbt_cube_sync/connectors/superset.py
@@ -123,20 +123,30 @@ class SupersetConnector(BaseConnector):
         self.database_id = result[0]['id']
         print(f"✓ Found database '{database_name}' with ID: {self.database_id}")
 
-    def sync_cube_schemas(self, cube_dir: str) -> List[SyncResult]:
-        """Sync all Cube.js schemas from directory to Superset"""
+    def sync_cube_schemas(self, cube_dir: str, models_filter: set = None) -> List[SyncResult]:
+        """Sync Cube.js schemas from directory to Superset
+
+        Args:
+            cube_dir: Directory containing Cube.js schema files
+            models_filter: Optional set of model names to sync. If None, sync all.
+        """
         results = []
         cube_files = self._get_cube_files(cube_dir)
-
+
         if not cube_files:
             return [SyncResult(
                 file_or_dataset="No files",
-                status="failed",
+                status="failed",
                 message=f"No .js files found in {cube_dir}"
             )]
-
-        print(f"🔍 Found {len(cube_files)} Cube.js files")
-
+
+        # Filter files if models_filter is provided
+        if models_filter:
+            cube_files = [f for f in cube_files if f.stem in models_filter]
+            print(f"🔍 Syncing {len(cube_files)} Cube.js files (filtered from {len(self._get_cube_files(cube_dir))})")
+        else:
+            print(f"🔍 Found {len(cube_files)} Cube.js files")
+
         for cube_file in cube_files:
             try:
                 print(f"\n{'='*60}")
dbt_cube_sync/core/models.py
@@ -112,13 +112,9 @@ class ModelState(BaseModel):
     has_metrics: bool
     last_generated: str
     output_file: str
-
-
-class StepState(BaseModel):
-    """Represents the state of a pipeline step"""
-    status: str  # 'success', 'failed', 'skipped'
-    last_run: Optional[str] = None
-    error: Optional[str] = None
+    # Per-model sync status for each step
+    superset_sync_status: Optional[str] = None  # 'success', 'failed', or None (not attempted)
+    rag_sync_status: Optional[str] = None  # 'success', 'failed', or None (not attempted)
 
 
 class SyncState(BaseModel):
@@ -126,8 +122,4 @@ class SyncState(BaseModel):
     version: str = "1.1"
     last_sync_timestamp: str
     manifest_path: str
-    models: Dict[str, ModelState] = {}
-    # Step states for tracking pipeline progress
-    cube_sync: Optional[StepState] = None
-    superset_sync: Optional[StepState] = None
-    rag_sync: Optional[StepState] = None
+    models: Dict[str, ModelState] = {}
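
For illustration, a state file under the new schema might look like the following (hypothetical values); the step-level cube_sync/superset_sync/rag_sync blocks are gone, replaced by two per-model fields:

    # Hypothetical contents of the saved sync state (schema version 1.1)
    state = {
        "version": "1.1",
        "last_sync_timestamp": "2024-01-01T00:00:00Z",
        "manifest_path": "target/manifest.json",
        "models": {
            "model.analytics.orders": {
                "checksum": "3fa2...",  # combined SHA-256, see state_manager.py below
                "has_metrics": True,
                "last_generated": "2024-01-01T00:00:00Z",
                "output_file": "model/cubes/Orders.js",
                "superset_sync_status": "success",
                "rag_sync_status": None,  # pending: retried on the next run
            },
        },
    }
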
dbt_cube_sync/core/state_manager.py
@@ -4,13 +4,42 @@ State management for incremental sync functionality.
 Tracks model checksums to enable incremental sync - only regenerate
 Cube.js files for models that have actually changed.
 """
+import hashlib
 import json
 import os
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, List, Optional, Set, Tuple
 
-from .models import ModelState, StepState, SyncState
+from .models import ModelState, SyncState
+
+
+def compute_model_checksum(node_data: dict) -> str:
+    """
+    Compute a checksum that includes both the dbt model checksum
+    and the metrics/meta configuration.
+
+    This ensures that changes to metrics (which don't change the SQL)
+    are still detected as modifications.
+
+    Args:
+        node_data: The node data from the dbt manifest
+
+    Returns:
+        A combined SHA256 checksum string
+    """
+    # Get the base dbt checksum
+    base_checksum = node_data.get("checksum", {}).get("checksum", "")
+
+    # Get the meta configuration (where metrics are defined)
+    meta = node_data.get("config", {}).get("meta", {})
+
+    # Serialize meta to a stable JSON string (sorted keys for consistency)
+    meta_json = json.dumps(meta, sort_keys=True, default=str)
+
+    # Combine and hash
+    combined = f"{base_checksum}:{meta_json}"
+    return hashlib.sha256(combined.encode()).hexdigest()
 
 
 class StateManager:
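
The point of the combined checksum is that a metrics-only edit changes the hash even though dbt's own SQL checksum is untouched. A self-contained check of that property, using toy manifest nodes rather than real dbt output:

    import hashlib
    import json

    def compute_model_checksum(node_data: dict) -> str:
        base = node_data.get("checksum", {}).get("checksum", "")
        meta = node_data.get("config", {}).get("meta", {})
        combined = f"{base}:{json.dumps(meta, sort_keys=True, default=str)}"
        return hashlib.sha256(combined.encode()).hexdigest()

    same_sql = {"checksum": {"checksum": "abc123"}}
    node_v1 = {**same_sql, "config": {"meta": {"metrics": [{"name": "revenue"}]}}}
    node_v2 = {**same_sql, "config": {"meta": {"metrics": [{"name": "revenue"}, {"name": "margin"}]}}}

    # Identical dbt checksum, different metrics -> different combined checksum
    assert compute_model_checksum(node_v1) != compute_model_checksum(node_v2)
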
@@ -86,11 +115,11 @@ class StateManager:
         removed = previous_node_ids - current_node_ids
 
         # Find modified models (in both, but checksum changed)
+        # Note: We compute a combined checksum that includes metrics/meta config,
+        # not just the dbt SQL checksum. This ensures metric changes are detected.
         modified = set()
         for node_id in current_node_ids & previous_node_ids:
-            current_checksum = manifest_nodes[node_id].get("checksum", {}).get(
-                "checksum", ""
-            )
+            current_checksum = compute_model_checksum(manifest_nodes[node_id])
             previous_checksum = previous_state.models[node_id].checksum
             if current_checksum != previous_checksum:
                 modified.add(node_id)
@@ -121,7 +150,8 @@ class StateManager:
             if node_id not in generated_files:
                 continue
 
-            checksum = node_data.get("checksum", {}).get("checksum", "")
+            # Use combined checksum that includes metrics/meta config
+            checksum = compute_model_checksum(node_data)
             has_metrics = bool(
                 node_data.get("config", {}).get("meta", {}).get("metrics")
             )
@@ -174,20 +204,24 @@ class StateManager:
         # Update/add newly generated models
         for node_id, output_file in generated_files.items():
             node_data = manifest_nodes.get(node_id, {})
-            checksum = node_data.get("checksum", {}).get("checksum", "")
+            # Use combined checksum that includes metrics/meta config
+            checksum = compute_model_checksum(node_data)
             has_metrics = bool(
                 node_data.get("config", {}).get("meta", {}).get("metrics")
            )
 
+            # For newly generated/modified models, reset sync status (they need to be re-synced)
             models[node_id] = ModelState(
                 checksum=checksum,
                 has_metrics=has_metrics,
                 last_generated=timestamp,
                 output_file=output_file,
+                superset_sync_status=None,  # Reset - needs sync
+                rag_sync_status=None,  # Reset - needs sync
             )
 
         return SyncState(
-            version="1.0",
+            version="1.1",
             last_sync_timestamp=timestamp,
             manifest_path=str(manifest_path),
             models=models,
@@ -220,64 +254,80 @@ class StateManager:
 
         return files_to_delete
 
-    def should_run_step(
+    def get_models_needing_sync(
         self,
-        step_name: str,
-        previous_state: Optional[SyncState],
-        changes_detected: bool,
-    ) -> bool:
+        state: SyncState,
+        step: str,
+    ) -> Set[str]:
         """
-        Determine if a pipeline step should run.
+        Get node_ids of models that need to be synced for a step.
 
-        A step should run if:
-        - There are changes detected, OR
-        - The previous run of this step failed
+        A model needs sync if:
+        - Its sync status is None (never synced)
+        - Its sync status is 'failed' (needs retry)
 
         Args:
-            step_name: Name of the step ('cube_sync', 'superset_sync', 'rag_sync')
-            previous_state: Previous sync state
-            changes_detected: Whether model changes were detected
+            state: Current sync state
+            step: Step name ('superset' or 'rag')
 
         Returns:
-            True if the step should run
+            Set of node_ids that need syncing
         """
-        if changes_detected:
-            return True
+        models_to_sync = set()
+        status_field = f"{step}_sync_status"
 
-        if previous_state is None:
-            return True
-
-        step_state = getattr(previous_state, step_name, None)
-        if step_state is None:
-            return True
+        for node_id, model_state in state.models.items():
+            status = getattr(model_state, status_field, None)
+            if status is None or status == 'failed':
+                models_to_sync.add(node_id)
 
-        # Re-run if previous attempt failed
-        return step_state.status == 'failed'
+        return models_to_sync
 
-    def update_step_state(
+    def update_model_sync_status(
         self,
         state: SyncState,
-        step_name: str,
+        node_id: str,
+        step: str,
         status: str,
-        error: Optional[str] = None,
-    ) -> SyncState:
+    ) -> None:
         """
-        Update the state of a pipeline step.
+        Update the sync status of a model for a specific step.
 
         Args:
             state: Current sync state
-            step_name: Name of the step ('cube_sync', 'superset_sync', 'rag_sync')
-            status: Step status ('success', 'failed', 'skipped')
-            error: Error message if failed
+            node_id: The model's node_id
+            step: Step name ('superset' or 'rag')
+            status: Status to set ('success' or 'failed')
+        """
+        if node_id in state.models:
+            status_field = f"{step}_sync_status"
+            setattr(state.models[node_id], status_field, status)
+
+    def get_sync_summary(
+        self,
+        state: SyncState,
+        step: str,
+    ) -> Dict[str, int]:
+        """
+        Get a summary of sync status for a step.
+
+        Args:
+            state: Current sync state
+            step: Step name ('superset' or 'rag')
 
         Returns:
-            Updated SyncState
+            Dict with counts: {'success': N, 'failed': N, 'pending': N}
         """
-        timestamp = datetime.utcnow().isoformat() + "Z"
-        step_state = StepState(
-            status=status,
-            last_run=timestamp,
-            error=error,
-        )
-        setattr(state, step_name, step_state)
-        return state
+        status_field = f"{step}_sync_status"
+        summary = {'success': 0, 'failed': 0, 'pending': 0}
+
+        for model_state in state.models.values():
+            status = getattr(model_state, status_field, None)
+            if status == 'success':
+                summary['success'] += 1
+            elif status == 'failed':
+                summary['failed'] += 1
+            else:
+                summary['pending'] += 1
+
+        return summary
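
Taken together, the three methods give each downstream step a uniform retry loop. A sketch of the intended call pattern, where sync_one is a hypothetical per-model callable standing in for the actual connector work:

    def retry_step(state_manager, state, step, sync_one):
        # sync_one(node_id) -> bool: hypothetical stand-in for the real sync call
        for node_id in state_manager.get_models_needing_sync(state, step):
            ok = sync_one(node_id)
            state_manager.update_model_sync_status(state, node_id, step, 'success' if ok else 'failed')
        return state_manager.get_sync_summary(state, step)  # {'success': N, 'failed': N, 'pending': N}
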
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dbt-cube-sync"
-version = "0.1.0a11"
+version = "0.1.0a13"
 description = "Synchronization tool for dbt models to Cube.js schemas and BI tools"
 authors = ["Ponder"]
 readme = "README.md"