dbt-cube-sync 0.1.0a6__py3-none-any.whl → 0.1.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dbt-cube-sync might be problematic. See the registry page for details.
- dbt_cube_sync/cli.py +362 -14
- dbt_cube_sync/core/cube_generator.py +14 -10
- dbt_cube_sync/core/db_inspector.py +60 -14
- dbt_cube_sync/core/dbt_parser.py +74 -8
- dbt_cube_sync/core/models.py +17 -1
- dbt_cube_sync/core/state_manager.py +221 -0
- {dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/METADATA +100 -19
- dbt_cube_sync-0.1.0a8.dist-info/RECORD +18 -0
- {dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/WHEEL +1 -1
- dbt_cube_sync-0.1.0a6.dist-info/RECORD +0 -17
- {dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/entry_points.txt +0 -0
dbt_cube_sync/cli.py
CHANGED

@@ -2,12 +2,14 @@
 CLI interface for dbt-cube-sync tool
 """
 import click
+import os
 import sys
 from pathlib import Path
 from typing import Optional
 
 from .core.dbt_parser import DbtParser
 from .core.cube_generator import CubeGenerator
+from .core.state_manager import StateManager
 from .connectors.base import ConnectorRegistry
 from .config import Config
 
@@ -62,47 +64,142 @@ def main():
 @click.option('--template-dir', '-t',
               default='./cube/templates',
               help='Directory containing Cube.js templates')
-
+@click.option('--state-path',
+              required=False,
+              default='.dbt-cube-sync-state.json',
+              help='Path to state file for incremental sync (default: .dbt-cube-sync-state.json)')
+@click.option('--force-full-sync',
+              is_flag=True,
+              default=False,
+              help='Force full regeneration, ignore cached state')
+@click.option('--no-state',
+              is_flag=True,
+              default=False,
+              help='Disable state tracking (legacy behavior)')
+def dbt_to_cube(
+    manifest: str,
+    catalog: Optional[str],
+    sqlalchemy_uri: Optional[str],
+    models: Optional[str],
+    output: str,
+    template_dir: str,
+    state_path: str,
+    force_full_sync: bool,
+    no_state: bool
+):
     """Generate Cube.js schemas from dbt models"""
     try:
         # Validate that at least one source of column types is provided
         if not catalog and not sqlalchemy_uri:
-            click.echo("
-            click.echo("
-            click.echo("
+            click.echo("Error: You must provide either --catalog or --sqlalchemy-uri to get column data types", err=True)
+            click.echo("Example with catalog: dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/", err=True)
+            click.echo("Example with database: dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@host:port/db -o output/", err=True)
             sys.exit(1)
 
         # Parse model filter if provided
         model_filter = None
         if models:
             model_filter = [m.strip() for m in models.split(',')]
-            click.echo(f"
+            click.echo(f"Filtering models: {', '.join(model_filter)}")
+
+        # Initialize state manager (if enabled)
+        state_manager = None
+        previous_state = None
+        use_incremental = not no_state and not force_full_sync
+
+        if not no_state:
+            state_manager = StateManager(state_path)
+            if not force_full_sync:
+                previous_state = state_manager.load_state()
+                if previous_state:
+                    click.echo(f"Loaded previous state from {state_path}")
 
-        click.echo("
+        click.echo("Parsing dbt manifest...")
         parser = DbtParser(
             manifest_path=manifest,
             catalog_path=catalog,
             sqlalchemy_uri=sqlalchemy_uri,
             model_filter=model_filter
         )
-        parsed_models = parser.parse_models()
 
-
+        # Get all manifest nodes with metrics (for checksum comparison)
+        manifest_nodes = parser.get_manifest_nodes_with_metrics()
+        click.echo(f"Found {len(manifest_nodes)} models with metrics in manifest")
+
+        # Determine which models need regeneration
+        if use_incremental and previous_state:
+            added, modified, removed = state_manager.get_changed_models(
+                manifest_nodes, previous_state
+            )
+
+            if not added and not modified and not removed:
+                click.echo("No changes detected. All models are up to date.")
+                sys.exit(0)
+
+            click.echo(f"Incremental sync: {len(added)} added, {len(modified)} modified, {len(removed)} removed")
+
+            # Clean up files for removed models
+            if removed:
+                files_to_delete = state_manager.get_files_to_delete(previous_state, removed)
+                for file_path in files_to_delete:
+                    try:
+                        os.remove(file_path)
+                        click.echo(f"  Deleted: {Path(file_path).name}")
+                    except OSError as e:
+                        click.echo(f"  Warning: Could not delete {file_path}: {e}")
+
+            # Only parse changed models
+            node_ids_to_process = list(added | modified)
+            if not node_ids_to_process:
+                # Only removals, no models to regenerate
+                if state_manager:
+                    new_state = state_manager.merge_state(
+                        previous_state, manifest, manifest_nodes, {}, removed
+                    )
+                    state_manager.save_state(new_state)
+                    click.echo(f"State saved to {state_path}")
+                click.echo("Sync complete (only removals)")
+                sys.exit(0)
+
+            parsed_models = parser.parse_models(node_ids_filter=node_ids_to_process)
+        else:
+            # Full sync - parse all models
+            if force_full_sync:
+                click.echo("Forcing full sync...")
+            parsed_models = parser.parse_models()
+
+        click.echo(f"Processing {len(parsed_models)} dbt models")
 
         if len(parsed_models) == 0:
-            click.echo("
+            click.echo("No models found. Make sure your models have both columns and metrics defined.")
             sys.exit(0)
 
-        click.echo("
+        click.echo("Generating Cube.js schemas...")
         generator = CubeGenerator(template_dir, output)
         generated_files = generator.generate_cube_files(parsed_models)
 
-        click.echo(f"
-        for file_path in generated_files:
-            click.echo(f"
+        click.echo(f"Generated {len(generated_files)} Cube.js files:")
+        for node_id, file_path in generated_files.items():
+            click.echo(f"  {file_path}")
+
+        # Save state (if enabled)
+        if state_manager:
+            if use_incremental and previous_state:
+                # Merge with previous state
+                removed_ids = removed if 'removed' in dir() else set()
+                new_state = state_manager.merge_state(
+                    previous_state, manifest, manifest_nodes, generated_files, removed_ids
+                )
+            else:
+                # Create fresh state
+                new_state = state_manager.create_state_from_results(
+                    manifest, manifest_nodes, generated_files
+                )
            state_manager.save_state(new_state)
+            click.echo(f"State saved to {state_path}")
 
     except Exception as e:
-        click.echo(f"
+        click.echo(f"Error: {str(e)}", err=True)
         sys.exit(1)
 
 
@@ -164,5 +261,256 @@ def version():
     click.echo(f"dbt-cube-sync version {__version__}")
 
 
+@main.command()
+@click.option('--manifest', '-m',
+              required=True,
+              help='Path to dbt manifest.json file')
+@click.option('--catalog', '-c',
+              required=False,
+              default=None,
+              help='Path to dbt catalog.json file')
+@click.option('--sqlalchemy-uri', '-s',
+              required=False,
+              default=None,
+              help='SQLAlchemy database URI for fetching column types')
+@click.option('--output', '-o',
+              required=True,
+              help='Output directory for Cube.js files')
+@click.option('--state-path',
+              required=False,
+              default='.dbt-cube-sync-state.json',
+              help='Path to state file for incremental sync')
+@click.option('--force-full-sync',
+              is_flag=True,
+              default=False,
+              help='Force full regeneration, ignore cached state')
+@click.option('--superset-url',
+              required=False,
+              default=None,
+              help='Superset URL (e.g., http://localhost:8088)')
+@click.option('--superset-username',
+              required=False,
+              default=None,
+              help='Superset username')
+@click.option('--superset-password',
+              required=False,
+              default=None,
+              help='Superset password')
+@click.option('--cube-connection-name',
+              default='Cube',
+              help='Name of Cube database connection in Superset')
+@click.option('--rag-api-url',
+              required=False,
+              default=None,
+              help='RAG API URL for embedding updates (e.g., http://localhost:8000)')
+def sync_all(
+    manifest: str,
+    catalog: Optional[str],
+    sqlalchemy_uri: Optional[str],
+    output: str,
+    state_path: str,
+    force_full_sync: bool,
+    superset_url: Optional[str],
+    superset_username: Optional[str],
+    superset_password: Optional[str],
+    cube_connection_name: str,
+    rag_api_url: Optional[str]
+):
+    """
+    Ultimate sync command: dbt → Cube.js → BI tools → RAG embeddings.
+
+    Incrementally syncs everything based on state file. Only processes
+    models that have changed since last sync.
+
+    Examples:
+
+        # Basic incremental sync (Cube.js only)
+        dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output
+
+        # Full pipeline with Superset
+        dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output \\
+            --superset-url http://localhost:8088 --superset-username admin --superset-password admin
+
+        # Full pipeline with Superset + RAG embeddings
+        dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output \\
+            --superset-url http://localhost:8088 --superset-username admin --superset-password admin \\
+            --rag-api-url http://localhost:8000
+
+        # Force full rebuild
+        dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output --force-full-sync
+    """
+    import requests
+
+    try:
+        # Validate that at least one source of column types is provided
+        if not catalog and not sqlalchemy_uri:
+            click.echo("Error: You must provide either --catalog or --sqlalchemy-uri", err=True)
+            sys.exit(1)
+
+        click.echo("=" * 60)
+        click.echo("SYNC-ALL: Incremental Pipeline")
+        click.echo("=" * 60)
+
+        # Track what changed for downstream updates
+        changes_detected = False
+        added_models = set()
+        modified_models = set()
+        removed_models = set()
+
+        # ============================================================
+        # STEP 1: Incremental dbt → Cube.js sync
+        # ============================================================
+        click.echo("\n[1/3] dbt → Cube.js schemas")
+        click.echo("-" * 40)
+
+        # Initialize state manager
+        state_manager = StateManager(state_path)
+        previous_state = None
+
+        if not force_full_sync:
+            previous_state = state_manager.load_state()
+            if previous_state:
+                click.echo(f"  Loaded state from {state_path}")
+
+        # Parse manifest
+        parser = DbtParser(
+            manifest_path=manifest,
+            catalog_path=catalog,
+            sqlalchemy_uri=sqlalchemy_uri
+        )
+
+        manifest_nodes = parser.get_manifest_nodes_with_metrics()
+        click.echo(f"  Found {len(manifest_nodes)} models with metrics")
+
+        # Determine what changed
+        if not force_full_sync and previous_state:
+            added_models, modified_models, removed_models = state_manager.get_changed_models(
+                manifest_nodes, previous_state
+            )
+
+            if not added_models and not modified_models and not removed_models:
+                click.echo("  No changes detected - all models up to date")
+            else:
+                changes_detected = True
+                click.echo(f"  Changes: {len(added_models)} added, {len(modified_models)} modified, {len(removed_models)} removed")
+
+                # Clean up removed model files
+                if removed_models:
+                    files_to_delete = state_manager.get_files_to_delete(previous_state, removed_models)
+                    for file_path in files_to_delete:
+                        try:
+                            os.remove(file_path)
+                            click.echo(f"  Deleted: {Path(file_path).name}")
+                        except OSError:
+                            pass
+
+            node_ids_to_process = list(added_models | modified_models)
+        else:
+            # Force full sync
+            changes_detected = True
+            added_models = set(manifest_nodes.keys())
+            node_ids_to_process = list(manifest_nodes.keys())
+            click.echo(f"  Full sync: processing all {len(node_ids_to_process)} models")
+
+        # Generate Cube.js files for changed models
+        generated_files = {}
+        if node_ids_to_process:
+            parsed_models = parser.parse_models(node_ids_filter=node_ids_to_process)
+
+            if parsed_models:
+                generator = CubeGenerator('./cube/templates', output)
+                generated_files = generator.generate_cube_files(parsed_models)
+                click.echo(f"  Generated {len(generated_files)} Cube.js files")
+
+        # Save state
+        if changes_detected or force_full_sync:
+            if previous_state and not force_full_sync:
+                new_state = state_manager.merge_state(
+                    previous_state, manifest, manifest_nodes, generated_files, removed_models
+                )
+            else:
+                new_state = state_manager.create_state_from_results(
+                    manifest, manifest_nodes, generated_files
+                )
+            state_manager.save_state(new_state)
+            click.echo(f"  State saved to {state_path}")
+
+        # ============================================================
+        # STEP 2: Sync to Superset (if configured)
+        # ============================================================
+        if superset_url and superset_username and superset_password:
+            click.echo("\n[2/3] Cube.js → Superset")
+            click.echo("-" * 40)
+
+            if not changes_detected and not force_full_sync:
+                click.echo("  Skipped - no changes detected")
+            else:
+                connector_config = {
+                    'url': superset_url,
+                    'username': superset_username,
+                    'password': superset_password,
+                    'database_name': cube_connection_name
+                }
+
+                connector = ConnectorRegistry.get_connector('superset', **connector_config)
+                results = connector.sync_cube_schemas(output)
+
+                successful = sum(1 for r in results if r.status == 'success')
+                failed = sum(1 for r in results if r.status == 'failed')
+                click.echo(f"  Synced: {successful} successful, {failed} failed")
+        else:
+            click.echo("\n[2/3] Cube.js → Superset")
+            click.echo("-" * 40)
+            click.echo("  Skipped - no Superset credentials provided")
+
+        # ============================================================
+        # STEP 3: Update RAG embeddings (if configured)
+        # ============================================================
+        if rag_api_url:
+            click.echo("\n[3/3] Update RAG embeddings")
+            click.echo("-" * 40)
+
+            if not changes_detected and not force_full_sync:
+                click.echo("  Skipped - no changes detected")
+            else:
+                try:
+                    # Call the RAG API to re-ingest embeddings
+                    response = requests.post(
+                        f"{rag_api_url.rstrip('/')}/embeddings/ingest",
+                        json={"schema_dir": output},
+                        timeout=120
+                    )
+
+                    if response.status_code == 200:
+                        result = response.json()
+                        click.echo(f"  Ingested {result.get('schemas_ingested', 0)} schema documents")
+                    else:
+                        click.echo(f"  Warning: RAG API returned {response.status_code}", err=True)
+                except requests.RequestException as e:
+                    click.echo(f"  Warning: Could not reach RAG API: {e}", err=True)
+        else:
+            click.echo("\n[3/3] Update RAG embeddings")
+            click.echo("-" * 40)
+            click.echo("  Skipped - no RAG API URL provided")
+
+        # ============================================================
+        # Summary
+        # ============================================================
+        click.echo("\n" + "=" * 60)
+        click.echo("SYNC COMPLETE")
+        click.echo("=" * 60)
+
+        if changes_detected or force_full_sync:
+            click.echo(f"  Models processed: {len(added_models) + len(modified_models)}")
+            click.echo(f"  Models removed: {len(removed_models)}")
+            click.echo(f"  Cube.js files generated: {len(generated_files)}")
+        else:
+            click.echo("  No changes - everything is up to date")
+
+    except Exception as e:
+        click.echo(f"Error: {str(e)}", err=True)
+        sys.exit(1)
+
+
 if __name__ == '__main__':
     main()
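The new flags can be exercised without a console install by driving the CLI through click's test runner. A minimal sketch, assuming a dbt project whose `target/manifest.json` and `target/catalog.json` already exist (the paths and output directory are placeholders):

```python
# Minimal sketch: exercising the new incremental flags via click's CliRunner.
# The manifest/catalog paths and output directory are placeholders for a real
# dbt project; the flag names come from the diff above.
from click.testing import CliRunner

from dbt_cube_sync.cli import main

runner = CliRunner()

result = runner.invoke(main, [
    "dbt-to-cube",
    "-m", "target/manifest.json",
    "-c", "target/catalog.json",
    "-o", "./cube_output",
    "--state-path", ".dbt-cube-sync-state.json",  # default, shown for clarity
])
print(result.output)

# A second run against an unchanged manifest should exit early with
# "No changes detected. All models are up to date."
result = runner.invoke(main, [
    "dbt-to-cube",
    "-m", "target/manifest.json",
    "-c", "target/catalog.json",
    "-o", "./cube_output",
])
print(result.output)
```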
dbt_cube_sync/core/cube_generator.py
CHANGED

@@ -29,27 +29,31 @@ class CubeGenerator:
         # Initialize Jinja2 environment
         self.env = Environment(loader=FileSystemLoader(str(self.template_dir)))
 
-    def generate_cube_files(
+    def generate_cube_files(
+        self, models: List[DbtModel], return_node_mapping: bool = False
+    ) -> Dict[str, str]:
         """
         Generate Cube.js files for all models
-
+
         Args:
             models: List of DbtModel instances
-
+            return_node_mapping: If True, returns dict mapping node_id -> file_path
+                                 If False (legacy), returns list of file paths
+
         Returns:
-
+            Dict mapping node_id -> file_path (for incremental sync support)
         """
-        generated_files =
-
+        generated_files = {}
+
         for model in models:
             try:
                 cube_schema = self._convert_model_to_cube(model)
                 file_path = self._write_cube_file(cube_schema)
-                generated_files.
-                print(f"
+                generated_files[model.node_id] = str(file_path)
+                print(f"  Generated: {file_path.name}")
             except Exception as e:
-                print(f"
-
+                print(f"  Error generating cube for {model.name}: {str(e)}")
+
         return generated_files
 
     def _convert_model_to_cube(self, model: DbtModel) -> CubeSchema:
dbt_cube_sync/core/db_inspector.py
CHANGED

@@ -1,27 +1,42 @@
 """
-Database inspector - fetches column types using SQLAlchemy
+Database inspector - fetches column types using SQLAlchemy MetaData reflection.
+
+Uses SQLAlchemy's Table(..., autoload_with=engine) for portable, database-agnostic
+column type extraction. This approach works consistently across PostgreSQL, MySQL,
+Snowflake, BigQuery, Redshift, and other databases.
 """
 from typing import Dict, Optional
-from sqlalchemy import create_engine,
+from sqlalchemy import create_engine, MetaData, Table
 from sqlalchemy.engine import Engine
 
 
 class DatabaseInspector:
-    """Inspects database schema to extract column type information"""
+    """Inspects database schema to extract column type information using SQLAlchemy reflection."""
 
     def __init__(self, sqlalchemy_uri: str):
         """
-        Initialize the database inspector
+        Initialize the database inspector.
 
         Args:
             sqlalchemy_uri: SQLAlchemy connection URI (e.g., postgresql://user:pass@host:port/db)
         """
-
-
+        # Add connect_args for Redshift compatibility
+        if 'redshift' in sqlalchemy_uri.lower():
+            self.engine: Engine = create_engine(
+                sqlalchemy_uri,
+                connect_args={'sslmode': 'prefer'}
+            )
+        else:
+            self.engine: Engine = create_engine(sqlalchemy_uri)
+
+        self.metadata = MetaData()
+        self._table_cache: Dict[str, Table] = {}
 
     def get_table_columns(self, schema: str, table_name: str) -> Dict[str, str]:
         """
-        Get column names and their data types for a specific table
+        Get column names and their data types for a specific table.
+
+        Uses SQLAlchemy MetaData reflection for portable column extraction.
 
         Args:
             schema: Database schema name
@@ -31,21 +46,52 @@ class DatabaseInspector:
             Dictionary mapping column names to data types
         """
         columns = {}
+        cache_key = f"{schema}.{table_name}"
 
         try:
-            #
-
+            # Check cache first
+            if cache_key in self._table_cache:
+                table = self._table_cache[cache_key]
+            else:
+                # Reflect table using SQLAlchemy MetaData
+                table = Table(
+                    table_name,
+                    self.metadata,
+                    autoload_with=self.engine,
+                    schema=schema
+                )
+                self._table_cache[cache_key] = table
 
-
-
-
-                columns[col_name] = col_type
+            # Extract column types
+            for column in table.columns:
+                columns[column.name] = str(column.type)
 
         except Exception as e:
             print(f"Warning: Could not inspect table {schema}.{table_name}: {e}")
 
         return columns
 
+    def reflect_multiple_tables(
+        self, tables: list[tuple[str, str]]
+    ) -> Dict[str, Dict[str, str]]:
+        """
+        Reflect multiple tables in bulk for performance optimization.
+
+        Args:
+            tables: List of (schema, table_name) tuples
+
+        Returns:
+            Dict mapping "schema.table_name" -> {column_name: column_type}
+        """
+        results = {}
+
+        for schema, table_name in tables:
+            cache_key = f"{schema}.{table_name}"
+            results[cache_key] = self.get_table_columns(schema, table_name)
+
+        return results
+
     def close(self):
-        """Close the database connection"""
+        """Close the database connection and clear cache."""
+        self._table_cache.clear()
         self.engine.dispose()
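The reflection added above is plain SQLAlchemy; a minimal standalone sketch of the same technique, with a placeholder connection URI, schema, and table name:

```python
# Minimal sketch of SQLAlchemy table reflection, the technique DatabaseInspector
# now uses. The URI, schema, and table name below are placeholders.
from sqlalchemy import MetaData, Table, create_engine

engine = create_engine("postgresql://user:pass@localhost:5432/analytics")
metadata = MetaData()

# autoload_with triggers reflection: column names and types are read from the
# live database, independent of the specific backend.
orders = Table("orders", metadata, autoload_with=engine, schema="analytics")

column_types = {column.name: str(column.type) for column in orders.columns}
print(column_types)  # e.g. {'order_id': 'INTEGER', 'amount': 'NUMERIC(12, 2)', ...}

engine.dispose()
```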
dbt_cube_sync/core/dbt_parser.py
CHANGED

@@ -57,10 +57,14 @@ class DbtParser:
             print(f"Warning: Could not load catalog file {self.catalog_path}: {e}")
             return None
 
-    def parse_models(self) -> List[DbtModel]:
+    def parse_models(self, node_ids_filter: Optional[List[str]] = None) -> List[DbtModel]:
         """
         Extract models with metrics and columns from manifest
 
+        Args:
+            node_ids_filter: Optional list of node_ids to parse (for incremental sync).
+                             If provided, only these specific nodes are processed.
+
         Returns:
             List of DbtModel instances
         """
@@ -72,6 +76,10 @@ class DbtParser:
             if node_data.get('resource_type') != 'model':
                 continue
 
+            # Apply node_ids filter if specified (for incremental sync)
+            if node_ids_filter is not None and node_id not in node_ids_filter:
+                continue
+
             # Apply model filter if specified
             model_name = node_data.get('name', '')
             if self.model_filter and model_name not in self.model_filter:
@@ -87,6 +95,38 @@ class DbtParser:
             self.db_inspector.close()
 
         return models
+
+    def get_manifest_nodes_with_metrics(self) -> Dict[str, dict]:
+        """
+        Get all manifest nodes that have metrics defined.
+
+        This is used by the StateManager to compare checksums for incremental sync.
+
+        Returns:
+            Dict of node_id -> node_data for all models with metrics
+        """
+        nodes_with_metrics = {}
+        nodes = self.manifest.get('nodes', {})
+
+        for node_id, node_data in nodes.items():
+            # Only process models
+            if node_data.get('resource_type') != 'model':
+                continue
+
+            # Apply model filter if specified
+            model_name = node_data.get('name', '')
+            if self.model_filter and model_name not in self.model_filter:
+                continue
+
+            # Check if model has metrics defined
+            config = node_data.get('config', {})
+            meta = config.get('meta', {})
+            metrics = meta.get('metrics', {})
+
+            if metrics:
+                nodes_with_metrics[node_id] = node_data
+
+        return nodes_with_metrics
 
     def _parse_model(self, node_id: str, node_data: dict) -> DbtModel:
         """Parse a single model from the manifest"""
@@ -114,7 +154,14 @@ class DbtParser:
         )
 
     def _parse_columns(self, node_id: str, node_data: dict) -> Dict[str, DbtColumn]:
-        """
+        """
+        Parse columns for a model using hybrid metadata approach.
+
+        Priority order for column types:
+        1. Manifest `data_type` - When explicitly defined in dbt .yml files
+        2. Catalog `type` - When catalog.json is provided
+        3. SQLAlchemy Reflection - Fallback using database inspector
+        """
         columns = {}
         manifest_columns = node_data.get('columns', {})
 
@@ -123,23 +170,42 @@ class DbtParser:
         if self.catalog and node_id in self.catalog.get('nodes', {}):
             catalog_columns = self.catalog['nodes'][node_id].get('columns', {})
 
-        #
+        # Check if we need database lookup - only if we have columns missing types
+        need_db_lookup = False
+        if manifest_columns:
+            for col_name, col_data in manifest_columns.items():
+                # Check manifest data_type first
+                manifest_data_type = col_data.get('data_type')
+                if manifest_data_type:
+                    continue
+                # Check catalog
+                if col_name in catalog_columns and catalog_columns[col_name].get('type'):
+                    continue
+                # Need database lookup for this column
+                need_db_lookup = True
+                break
+
+        # Get database columns only if needed (lazy loading)
         db_columns = {}
-        if
+        if need_db_lookup and self.db_inspector:
             schema = node_data.get('schema', '')
             table_name = node_data.get('name', '')
             if schema and table_name:
                 db_columns = self.db_inspector.get_table_columns(schema, table_name)
 
-        # If manifest has columns, use them with
+        # If manifest has columns, use them with hybrid type resolution
         if manifest_columns:
             for col_name, col_data in manifest_columns.items():
                 data_type = None
 
-                #
-
+                # Priority 1: Manifest data_type (explicitly defined in dbt .yml)
+                manifest_data_type = col_data.get('data_type')
+                if manifest_data_type:
+                    data_type = manifest_data_type
+                # Priority 2: Catalog type
+                elif col_name in catalog_columns:
                     data_type = catalog_columns[col_name].get('type', '')
-                #
+                # Priority 3: Database reflection
                 elif col_name in db_columns:
                     data_type = db_columns[col_name]
 
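The priority order described in the new `_parse_columns` docstring can be illustrated as a small resolver. A sketch with simplified inputs (plain dicts standing in for manifest, catalog, and reflected columns; not the package's actual helper):

```python
# Sketch of the hybrid column-type resolution used by _parse_columns above:
# manifest data_type wins, then catalog type, then database reflection.
from typing import Dict, Optional


def resolve_column_type(
    col_name: str,
    manifest_col: dict,
    catalog_columns: Dict[str, dict],
    db_columns: Dict[str, str],
) -> Optional[str]:
    # Priority 1: data_type declared in the dbt .yml (manifest)
    if manifest_col.get("data_type"):
        return manifest_col["data_type"]
    # Priority 2: type recorded in catalog.json
    if col_name in catalog_columns and catalog_columns[col_name].get("type"):
        return catalog_columns[col_name]["type"]
    # Priority 3: type reflected from the database
    return db_columns.get(col_name)


print(resolve_column_type(
    "amount",
    {"name": "amount"},               # no data_type in the manifest
    {"amount": {"type": "NUMERIC"}},  # catalog has it, so it wins here
    {"amount": "NUMERIC(12, 2)"},     # reflection would be the last resort
))  # -> "NUMERIC"
```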
dbt_cube_sync/core/models.py
CHANGED

@@ -103,4 +103,20 @@ class SyncResult(BaseModel):
     file_or_dataset: str
     status: str  # 'success' or 'failed'
     message: Optional[str] = None
-    error: Optional[str] = None
+    error: Optional[str] = None
+
+
+class ModelState(BaseModel):
+    """Represents the state of a single model for incremental sync"""
+    checksum: str
+    has_metrics: bool
+    last_generated: str
+    output_file: str
+
+
+class SyncState(BaseModel):
+    """Represents the overall state for incremental sync"""
+    version: str = "1.0"
+    last_sync_timestamp: str
+    manifest_path: str
+    models: Dict[str, ModelState] = {}
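These two pydantic models define the state file's schema. A short sketch of the round-trip the StateManager relies on (pydantic v2 `model_dump()` on save, keyword construction on load), with illustrative values only:

```python
# Sketch: serialising and re-loading the new state models with pydantic v2.
# All values are illustrative.
import json

from dbt_cube_sync.core.models import ModelState, SyncState

state = SyncState(
    last_sync_timestamp="2024-01-15T10:30:00Z",
    manifest_path="/path/to/manifest.json",
    models={
        "model.project.users": ModelState(
            checksum="abc123",
            has_metrics=True,
            last_generated="2024-01-15T10:30:00Z",
            output_file="./cube_output/Users.js",
        )
    },
)

# Save (what StateManager.save_state does)
payload = json.dumps(state.model_dump(), indent=2)

# Load (what StateManager.load_state does)
restored = SyncState(**json.loads(payload))
assert restored.models["model.project.users"].checksum == "abc123"
```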
dbt_cube_sync/core/state_manager.py
ADDED

@@ -0,0 +1,221 @@
+"""
+State management for incremental sync functionality.
+
+Tracks model checksums to enable incremental sync - only regenerate
+Cube.js files for models that have actually changed.
+"""
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Tuple
+
+from .models import ModelState, SyncState
+
+
+class StateManager:
+    """Manages sync state for incremental model generation."""
+
+    def __init__(self, state_path: str = ".dbt-cube-sync-state.json"):
+        """
+        Initialize the StateManager.
+
+        Args:
+            state_path: Path to the state file (default: .dbt-cube-sync-state.json)
+        """
+        self.state_path = Path(state_path)
+        self._state: Optional[SyncState] = None
+
+    def load_state(self) -> Optional[SyncState]:
+        """
+        Load state from file.
+
+        Returns:
+            SyncState if file exists and is valid, None otherwise
+        """
+        if not self.state_path.exists():
+            return None
+
+        try:
+            with open(self.state_path, "r") as f:
+                data = json.load(f)
+            self._state = SyncState(**data)
+            return self._state
+        except (json.JSONDecodeError, Exception) as e:
+            print(f"Warning: Could not load state file: {e}")
+            return None
+
+    def save_state(self, state: SyncState) -> None:
+        """
+        Save state to file.
+
+        Args:
+            state: The SyncState to save
+        """
+        self._state = state
+        with open(self.state_path, "w") as f:
+            json.dump(state.model_dump(), f, indent=2)
+
+    def get_changed_models(
+        self,
+        manifest_nodes: Dict[str, dict],
+        previous_state: Optional[SyncState] = None,
+    ) -> Tuple[Set[str], Set[str], Set[str]]:
+        """
+        Compare manifest nodes against stored state to identify changes.
+
+        Args:
+            manifest_nodes: Dict of node_id -> node data from manifest
+            previous_state: Previous sync state (if None, all models are "added")
+
+        Returns:
+            Tuple of (added_node_ids, modified_node_ids, removed_node_ids)
+        """
+        if previous_state is None:
+            # First run - all models with metrics are "added"
+            added = set(manifest_nodes.keys())
+            return added, set(), set()
+
+        current_node_ids = set(manifest_nodes.keys())
+        previous_node_ids = set(previous_state.models.keys())
+
+        # Find added models (in current but not in previous)
+        added = current_node_ids - previous_node_ids
+
+        # Find removed models (in previous but not in current)
+        removed = previous_node_ids - current_node_ids
+
+        # Find modified models (in both, but checksum changed)
+        modified = set()
+        for node_id in current_node_ids & previous_node_ids:
+            current_checksum = manifest_nodes[node_id].get("checksum", {}).get(
+                "checksum", ""
+            )
+            previous_checksum = previous_state.models[node_id].checksum
+            if current_checksum != previous_checksum:
+                modified.add(node_id)
+
+        return added, modified, removed
+
+    def create_state_from_results(
+        self,
+        manifest_path: str,
+        manifest_nodes: Dict[str, dict],
+        generated_files: Dict[str, str],
+    ) -> SyncState:
+        """
+        Build a new state from sync results.
+
+        Args:
+            manifest_path: Path to the manifest file used
+            manifest_nodes: Dict of node_id -> node data from manifest
+            generated_files: Dict of node_id -> output_file_path
+
+        Returns:
+            New SyncState representing the current state
+        """
+        timestamp = datetime.utcnow().isoformat() + "Z"
+
+        models: Dict[str, ModelState] = {}
+        for node_id, node_data in manifest_nodes.items():
+            if node_id not in generated_files:
+                continue
+
+            checksum = node_data.get("checksum", {}).get("checksum", "")
+            has_metrics = bool(
+                node_data.get("config", {}).get("meta", {}).get("metrics")
+            )
+
+            models[node_id] = ModelState(
+                checksum=checksum,
+                has_metrics=has_metrics,
+                last_generated=timestamp,
+                output_file=generated_files[node_id],
+            )
+
+        return SyncState(
+            version="1.0",
+            last_sync_timestamp=timestamp,
+            manifest_path=str(manifest_path),
+            models=models,
+        )
+
+    def merge_state(
+        self,
+        previous_state: Optional[SyncState],
+        manifest_path: str,
+        manifest_nodes: Dict[str, dict],
+        generated_files: Dict[str, str],
+        removed_node_ids: Set[str],
+    ) -> SyncState:
+        """
+        Merge new sync results with previous state for incremental updates.
+
+        Args:
+            previous_state: Previous sync state (or None for first run)
+            manifest_path: Path to the manifest file used
+            manifest_nodes: Dict of node_id -> node data from manifest
+            generated_files: Dict of node_id -> output_file_path (only newly generated)
+            removed_node_ids: Set of node_ids that were removed
+
+        Returns:
+            Merged SyncState
+        """
+        timestamp = datetime.utcnow().isoformat() + "Z"
+
+        models: Dict[str, ModelState] = {}
+
+        # Start with previous models (excluding removed ones)
+        if previous_state:
+            for node_id, model_state in previous_state.models.items():
+                if node_id not in removed_node_ids:
+                    models[node_id] = model_state
+
+        # Update/add newly generated models
+        for node_id, output_file in generated_files.items():
+            node_data = manifest_nodes.get(node_id, {})
+            checksum = node_data.get("checksum", {}).get("checksum", "")
+            has_metrics = bool(
+                node_data.get("config", {}).get("meta", {}).get("metrics")
+            )
+
+            models[node_id] = ModelState(
+                checksum=checksum,
+                has_metrics=has_metrics,
+                last_generated=timestamp,
+                output_file=output_file,
+            )
+
+        return SyncState(
+            version="1.0",
+            last_sync_timestamp=timestamp,
+            manifest_path=str(manifest_path),
+            models=models,
+        )
+
+    def get_files_to_delete(
+        self,
+        previous_state: Optional[SyncState],
+        removed_node_ids: Set[str],
+    ) -> List[str]:
+        """
+        Get list of output files that should be deleted for removed models.
+
+        Args:
+            previous_state: Previous sync state
+            removed_node_ids: Set of node_ids that were removed
+
+        Returns:
+            List of file paths to delete
+        """
+        if not previous_state:
+            return []
+
+        files_to_delete = []
+        for node_id in removed_node_ids:
+            if node_id in previous_state.models:
+                output_file = previous_state.models[node_id].output_file
+                if os.path.exists(output_file):
+                    files_to_delete.append(output_file)
+
+        return files_to_delete
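Taken together, the new module supports a load, diff, regenerate, merge, save loop. A condensed, self-contained sketch of that flow using the API above; the manifest node and generated file paths are illustrative stand-ins for DbtParser and CubeGenerator output:

```python
# Condensed sketch of the incremental loop built on StateManager above.
from dbt_cube_sync.core.state_manager import StateManager

# Illustrative manifest nodes (node_id -> manifest node dict); in the CLI these
# come from DbtParser.get_manifest_nodes_with_metrics().
manifest_nodes = {
    "model.project.users": {
        "checksum": {"checksum": "abc123"},
        "config": {"meta": {"metrics": {"count_users": {"type": "count"}}}},
    }
}

state_manager = StateManager(".dbt-cube-sync-state.json")
previous_state = state_manager.load_state()  # None on the first run

added, modified, removed = state_manager.get_changed_models(manifest_nodes, previous_state)

# Stand-in for CubeGenerator.generate_cube_files(): node_id -> output file path.
generated_files = {
    node_id: f"./cube_output/{node_id.split('.')[-1]}.js"
    for node_id in (added | modified)
}

if previous_state:
    new_state = state_manager.merge_state(
        previous_state, "target/manifest.json", manifest_nodes, generated_files, removed
    )
else:
    new_state = state_manager.create_state_from_results(
        "target/manifest.json", manifest_nodes, generated_files
    )

state_manager.save_state(new_state)
```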
{dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dbt-cube-sync
-Version: 0.1.0a6
+Version: 0.1.0a8
 Summary: Synchronization tool for dbt models to Cube.js schemas and BI tools
 Author: Ponder
 Requires-Python: >=3.9,<4.0
@@ -144,34 +144,101 @@ connectors:
 
 ## CLI Commands
 
+### Quick Reference
+
+| Command | Description |
+|---------|-------------|
+| `sync-all` | **Ultimate command** - Incremental sync: dbt → Cube.js → Superset → RAG |
+| `dbt-to-cube` | Generate Cube.js schemas from dbt models (with incremental support) |
+| `cube-to-bi` | Sync Cube.js schemas to BI tools (Superset, Tableau, PowerBI) |
+
+---
+
+### `sync-all` (Recommended)
+
+**Ultimate incremental sync command** - handles the complete pipeline with state tracking.
+
+```bash
+# Basic incremental sync (Cube.js only)
+dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output
+
+# Full pipeline: dbt → Cube.js → Superset
+dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output \
+  --superset-url http://localhost:8088 \
+  --superset-username admin \
+  --superset-password admin
+
+# Full pipeline: dbt → Cube.js → Superset → RAG embeddings
+dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output \
+  --superset-url http://localhost:8088 \
+  --superset-username admin \
+  --superset-password admin \
+  --rag-api-url http://localhost:8000
+
+# Force full rebuild (ignore state)
+dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output --force-full-sync
+```
+
+**Options:**
+| Option | Required | Description |
+|--------|----------|-------------|
+| `--manifest, -m` | Yes | Path to dbt manifest.json |
+| `--catalog, -c` | No* | Path to dbt catalog.json |
+| `--sqlalchemy-uri, -s` | No* | Database URI for column types |
+| `--output, -o` | Yes | Output directory for Cube.js files |
+| `--state-path` | No | State file path (default: `.dbt-cube-sync-state.json`) |
+| `--force-full-sync` | No | Force full rebuild, ignore state |
+| `--superset-url` | No | Superset URL |
+| `--superset-username` | No | Superset username |
+| `--superset-password` | No | Superset password |
+| `--cube-connection-name` | No | Cube database name in Superset (default: `Cube`) |
+| `--rag-api-url` | No | RAG API URL for embedding updates |
+
+*Either `--catalog` or `--sqlalchemy-uri` is required.
+
+**How Incremental Sync Works:**
+1. Reads state file (`.dbt-cube-sync-state.json`) with model checksums
+2. Compares against current manifest to detect changes
+3. Only processes **added** or **modified** models
+4. Deletes Cube.js files for **removed** models
+5. Updates state file with new checksums
+
+---
+
 ### `dbt-to-cube`
-
+
+Generate Cube.js schema files from dbt models with incremental support.
 
 **Options:**
 - `--manifest` / `-m`: Path to dbt manifest.json file (required)
-- `--catalog` / `-c`: Path to dbt catalog.json file
-- `--sqlalchemy-uri` / `-s`: SQLAlchemy database URI for fetching column types
-
-  - Example: `mysql://user:password@localhost:3306/database`
-  - Example: `snowflake://user:password@account/database/schema`
-- `--models`: Comma-separated list of model names to process (optional, processes all if not specified)
-  - Example: `--models model1,model2,model3`
+- `--catalog` / `-c`: Path to dbt catalog.json file
+- `--sqlalchemy-uri` / `-s`: SQLAlchemy database URI for fetching column types
+- `--models`: Comma-separated list of model names to process
 - `--output` / `-o`: Output directory for Cube.js files (required)
 - `--template-dir` / `-t`: Directory containing Cube.js templates (default: ./cube/templates)
+- `--state-path`: State file for incremental sync (default: `.dbt-cube-sync-state.json`)
+- `--force-full-sync`: Force full regeneration, ignore cached state
+- `--no-state`: Disable state tracking (legacy behavior)
 
 **Examples:**
 ```bash
-#
+# Incremental sync (default)
 dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/
 
+# Force full rebuild
+dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/ --force-full-sync
+
 # Using database connection (no catalog needed)
 dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@localhost/db -o output/
 
 # Filter specific models
-dbt-cube-sync dbt-to-cube -m manifest.json -
+dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/ --models users,orders
 ```
 
+---
+
 ### `cube-to-bi`
+
 Sync Cube.js schemas to BI tool datasets.
 
 **Arguments:**
@@ -189,15 +256,29 @@ Sync Cube.js schemas to BI tool datasets.
 dbt-cube-sync cube-to-bi superset -c cube_output/ -u http://localhost:8088 -n admin -p admin -d Cube
 ```
 
-
-
+---
+
+## State File
+
+The state file (`.dbt-cube-sync-state.json`) tracks:
+
+```json
+{
+  "version": "1.0",
+  "last_sync_timestamp": "2024-01-15T10:30:00Z",
+  "manifest_path": "/path/to/manifest.json",
+  "models": {
+    "model.project.users": {
+      "checksum": "abc123...",
+      "has_metrics": true,
+      "last_generated": "2024-01-15T10:30:00Z",
+      "output_file": "./cube_output/Users.js"
+    }
+  }
+}
+```
 
-
-- `--dbt-manifest` / `-m`: Path to dbt manifest.json file
-- `--cube-dir` / `-c`: Directory for Cube.js files
-- `--template-dir` / `-t`: Directory containing Cube.js templates
-- `--bi-connector` / `-b`: BI tool to sync to
-- `--config-file` / `-f`: Configuration file for BI tool connection
+Delete this file to force a full rebuild, or use `--force-full-sync`.
 
 ## Architecture
 
dbt_cube_sync-0.1.0a8.dist-info/RECORD
ADDED

@@ -0,0 +1,18 @@
+dbt_cube_sync/__init__.py,sha256=aifkfgUDRPL5v0LZzceH2LXu66YDkJjdpvKwXsdikbI,113
+dbt_cube_sync/cli.py,sha256=AxSVF3hJJqovk51mjA8Nyyte5NkfukSF3sAjk_VYJ6Y,20992
+dbt_cube_sync/config.py,sha256=qhGE7CxTmh0RhPizgd3x3Yj-3L2LoC00UQIDT0q9FlQ,3858
+dbt_cube_sync/connectors/__init__.py,sha256=NG6tYZ3CYD5bG_MfNLZrUM8YoBEKArG8-AOmJ8pwvQI,52
+dbt_cube_sync/connectors/base.py,sha256=JLzerxJdt34z0kWuyieL6UQhf5_dUYPGmwkiRWBuSPY,2802
+dbt_cube_sync/connectors/powerbi.py,sha256=2Y8fTfh_6Q_Myma1ymipPh1U3HsfQKcktVequXXnIXI,1275
+dbt_cube_sync/connectors/superset.py,sha256=5YEqadVZRPFAJkgvhqkse3JuGJkQHfyvT88jy52ow_0,21429
+dbt_cube_sync/connectors/tableau.py,sha256=jKve1zErzTbgPOtmPB92ZwZl4I6uEySedM51JiwlGrE,1261
+dbt_cube_sync/core/__init__.py,sha256=kgsawtU5dqEvnHz6dU8qwJbH3rtIV7QlK2MhtYVDCaY,46
+dbt_cube_sync/core/cube_generator.py,sha256=DtmaA_dtWmBVJnSWHVoQi-3KEsRc0axHZpCUEcKeYAk,11061
+dbt_cube_sync/core/db_inspector.py,sha256=Ccd9ieGNlwHDHdgMVDEOfjs7R9Mjj904OW1P-mDSsyo,3155
+dbt_cube_sync/core/dbt_parser.py,sha256=KbhDoB0ULP6JDUPZPDVbm9yCtRKrW17ptGoJvVLtueY,12763
+dbt_cube_sync/core/models.py,sha256=2s5iZ9MEBGfSzkB4HJB5vG0mZqNXNJSfAD3Byw1IVe4,3203
+dbt_cube_sync/core/state_manager.py,sha256=7uXJtlZBIWj6s6XgAhNlP6UHdfhH0y461iyQlfidqGI,7233
+dbt_cube_sync-0.1.0a8.dist-info/METADATA,sha256=fsb721DeeHXUeeeLIihijjIiM6x7Wl8fUTeBMzlyoZo,10680
+dbt_cube_sync-0.1.0a8.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
+dbt_cube_sync-0.1.0a8.dist-info/entry_points.txt,sha256=iEAB_nZ1AoSeFwSHPY2tr02xmTHLVFKp5CJeFh0AfCw,56
+dbt_cube_sync-0.1.0a8.dist-info/RECORD,,
@@ -1,17 +0,0 @@
|
|
|
1
|
-
dbt_cube_sync/__init__.py,sha256=aifkfgUDRPL5v0LZzceH2LXu66YDkJjdpvKwXsdikbI,113
|
|
2
|
-
dbt_cube_sync/cli.py,sha256=lZT9vYosnr5NbrMPRAxP_AOSvomqjoFGnPuu9d-vcTM,6896
|
|
3
|
-
dbt_cube_sync/config.py,sha256=qhGE7CxTmh0RhPizgd3x3Yj-3L2LoC00UQIDT0q9FlQ,3858
|
|
4
|
-
dbt_cube_sync/connectors/__init__.py,sha256=NG6tYZ3CYD5bG_MfNLZrUM8YoBEKArG8-AOmJ8pwvQI,52
|
|
5
|
-
dbt_cube_sync/connectors/base.py,sha256=JLzerxJdt34z0kWuyieL6UQhf5_dUYPGmwkiRWBuSPY,2802
|
|
6
|
-
dbt_cube_sync/connectors/powerbi.py,sha256=2Y8fTfh_6Q_Myma1ymipPh1U3HsfQKcktVequXXnIXI,1275
|
|
7
|
-
dbt_cube_sync/connectors/superset.py,sha256=5YEqadVZRPFAJkgvhqkse3JuGJkQHfyvT88jy52ow_0,21429
|
|
8
|
-
dbt_cube_sync/connectors/tableau.py,sha256=jKve1zErzTbgPOtmPB92ZwZl4I6uEySedM51JiwlGrE,1261
|
|
9
|
-
dbt_cube_sync/core/__init__.py,sha256=kgsawtU5dqEvnHz6dU8qwJbH3rtIV7QlK2MhtYVDCaY,46
|
|
10
|
-
dbt_cube_sync/core/cube_generator.py,sha256=o_-fa09F3RQADueIgou8EFhmxKd7PbQ-hCJmXvRuvWM,10839
|
|
11
|
-
dbt_cube_sync/core/db_inspector.py,sha256=eoJl7XG3dPcKg22SEX2dehC8Hvj5hgLR8sUgKiPCIGI,1540
|
|
12
|
-
dbt_cube_sync/core/dbt_parser.py,sha256=vQEUO19WYdeFNnulU2_PD4hdHUtTO-Y9BXfHuH6ZVnM,10192
|
|
13
|
-
dbt_cube_sync/core/models.py,sha256=JjiFAO0vbfVZkKOd6NcZb_JMGSVMTMfQiYjHcZbKtnI,2811
|
|
14
|
-
dbt_cube_sync-0.1.0a6.dist-info/METADATA,sha256=SgI2Sm6jQ748KKn3ZnFGz1qEoIaJmSfMm_Owv9hE5Hc,8274
|
|
15
|
-
dbt_cube_sync-0.1.0a6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
16
|
-
dbt_cube_sync-0.1.0a6.dist-info/entry_points.txt,sha256=iEAB_nZ1AoSeFwSHPY2tr02xmTHLVFKp5CJeFh0AfCw,56
|
|
17
|
-
dbt_cube_sync-0.1.0a6.dist-info/RECORD,,
|
|
{dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/entry_points.txt
File without changes