dbt-cube-sync 0.1.0a5__py3-none-any.whl → 0.1.0a7__py3-none-any.whl
- dbt_cube_sync/cli.py +47 -14
- dbt_cube_sync/core/db_inspector.py +112 -0
- dbt_cube_sync/core/dbt_parser.py +62 -21
- {dbt_cube_sync-0.1.0a5.dist-info → dbt_cube_sync-0.1.0a7.dist-info}/METADATA +86 -33
- {dbt_cube_sync-0.1.0a5.dist-info → dbt_cube_sync-0.1.0a7.dist-info}/RECORD +7 -6
- {dbt_cube_sync-0.1.0a5.dist-info → dbt_cube_sync-0.1.0a7.dist-info}/WHEEL +0 -0
- {dbt_cube_sync-0.1.0a5.dist-info → dbt_cube_sync-0.1.0a7.dist-info}/entry_points.txt +0 -0
dbt_cube_sync/cli.py CHANGED

@@ -24,10 +24,12 @@ class CustomGroup(click.Group):
             click.echo("\nAvailable commands:")
             click.echo("  dbt-cube-sync --help  # Show help")
             click.echo("  dbt-cube-sync --version  # Show version")
-            click.echo("  dbt-cube-sync dbt-to-cube -m manifest -c catalog -o output  # Generate …
+            click.echo("  dbt-cube-sync dbt-to-cube -m manifest -c catalog -o output  # Generate with catalog")
+            click.echo("  dbt-cube-sync dbt-to-cube -m manifest -s postgresql://user:pass@host/db -o output  # Generate with database")
+            click.echo("  dbt-cube-sync dbt-to-cube -m manifest -s <uri> --models model1,model2 -o output  # Filter specific models")
             click.echo("  dbt-cube-sync cube-to-bi superset -c cubes -u url -n user -p pass -d Cube  # Sync to BI tool")
             ctx.exit(1)
-
+
         return super().get_command(ctx, cmd_name)


@@ -39,35 +41,66 @@ def main():


 @main.command()
-@click.option('--manifest', '-m',
+@click.option('--manifest', '-m',
               required=True,
               help='Path to dbt manifest.json file')
 @click.option('--catalog', '-c',
-              required=…
-              …
+              required=False,
+              default=None,
+              help='Path to dbt catalog.json file (optional if --sqlalchemy-uri is provided)')
+@click.option('--sqlalchemy-uri', '-s',
+              required=False,
+              default=None,
+              help='SQLAlchemy database URI for fetching column types (e.g., postgresql://user:pass@host:port/db)')
+@click.option('--models',
+              required=False,
+              default=None,
+              help='Comma-separated list of model names to process (e.g., model1,model2). If not specified, processes all models')
 @click.option('--output', '-o',
               required=True,
               help='Output directory for Cube.js files')
 @click.option('--template-dir', '-t',
               default='./cube/templates',
               help='Directory containing Cube.js templates')
-def dbt_to_cube(manifest: str, catalog: str, output: str, template_dir: str):
+def dbt_to_cube(manifest: str, catalog: Optional[str], sqlalchemy_uri: Optional[str], models: Optional[str], output: str, template_dir: str):
     """Generate Cube.js schemas from dbt models"""
     try:
+        # Validate that at least one source of column types is provided
+        if not catalog and not sqlalchemy_uri:
+            click.echo("❌ Error: You must provide either --catalog or --sqlalchemy-uri to get column data types", err=True)
+            click.echo("💡 Example with catalog: dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/", err=True)
+            click.echo("💡 Example with database: dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@host:port/db -o output/", err=True)
+            sys.exit(1)
+
+        # Parse model filter if provided
+        model_filter = None
+        if models:
+            model_filter = [m.strip() for m in models.split(',')]
+            click.echo(f"🎯 Filtering models: {', '.join(model_filter)}")
+
         click.echo("🔄 Parsing dbt manifest...")
-        parser = DbtParser(…
-        …
-        …
-        …
-        …
+        parser = DbtParser(
+            manifest_path=manifest,
+            catalog_path=catalog,
+            sqlalchemy_uri=sqlalchemy_uri,
+            model_filter=model_filter
+        )
+        parsed_models = parser.parse_models()
+
+        click.echo(f"📊 Found {len(parsed_models)} dbt models")
+
+        if len(parsed_models) == 0:
+            click.echo("⚠️ No models found. Make sure your models have both columns and metrics defined.")
+            sys.exit(0)
+
         click.echo("🏗️ Generating Cube.js schemas...")
         generator = CubeGenerator(template_dir, output)
-        generated_files = generator.generate_cube_files(…
-        …
+        generated_files = generator.generate_cube_files(parsed_models)
+
         click.echo(f"✅ Generated {len(generated_files)} Cube.js files:")
         for file_path in generated_files:
             click.echo(f"  • {file_path}")
-
+
     except Exception as e:
         click.echo(f"❌ Error: {str(e)}", err=True)
         sys.exit(1)
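The change above makes `--catalog` and `--sqlalchemy-uri` individually optional while still requiring one of the two. A minimal standalone sketch of that validation pattern, with a hypothetical command name not part of this package:

```python
# Sketch of the "one of two optional sources" validation used in dbt_to_cube.
# The command name and messages here are illustrative only.
import sys
from typing import Optional

import click


@click.command()
@click.option('--catalog', '-c', default=None, help='Path to a catalog.json file')
@click.option('--sqlalchemy-uri', '-s', default=None, help='Database URI')
def demo(catalog: Optional[str], sqlalchemy_uri: Optional[str]):
    # Neither option is required on its own, but at least one must be supplied.
    if not catalog and not sqlalchemy_uri:
        click.echo("Error: provide either --catalog or --sqlalchemy-uri", err=True)
        sys.exit(1)
    click.echo(f"Column types will come from the {'catalog' if catalog else 'database'}")


if __name__ == '__main__':
    demo()
```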
dbt_cube_sync/core/db_inspector.py ADDED

@@ -0,0 +1,112 @@
+"""
+Database inspector - fetches column types using SQLAlchemy
+"""
+from typing import Dict, Optional
+from sqlalchemy import create_engine, inspect, MetaData, Table, text
+from sqlalchemy.engine import Engine
+
+
+class DatabaseInspector:
+    """Inspects database schema to extract column type information"""
+
+    def __init__(self, sqlalchemy_uri: str):
+        """
+        Initialize the database inspector
+
+        Args:
+            sqlalchemy_uri: SQLAlchemy connection URI (e.g., postgresql://user:pass@host:port/db)
+        """
+        # Add connect_args for Redshift compatibility
+        if 'redshift' in sqlalchemy_uri:
+            self.engine: Engine = create_engine(
+                sqlalchemy_uri,
+                connect_args={'sslmode': 'prefer'}
+            )
+        else:
+            self.engine: Engine = create_engine(sqlalchemy_uri)
+
+        self.inspector = inspect(self.engine)
+        self.is_redshift = 'redshift' in sqlalchemy_uri.lower()
+
+    def get_table_columns(self, schema: str, table_name: str) -> Dict[str, str]:
+        """
+        Get column names and their data types for a specific table
+
+        Args:
+            schema: Database schema name
+            table_name: Table name
+
+        Returns:
+            Dictionary mapping column names to data types
+        """
+        columns = {}
+
+        try:
+            # For Redshift, use direct SQL query to avoid pg_catalog issues
+            if self.is_redshift:
+                columns = self._get_redshift_columns(schema, table_name)
+            else:
+                # Get columns from the database using inspector
+                table_columns = self.inspector.get_columns(table_name, schema=schema)
+
+                for column in table_columns:
+                    col_name = column['name']
+                    col_type = str(column['type'])
+                    columns[col_name] = col_type
+
+        except Exception as e:
+            print(f"Warning: Could not inspect table {schema}.{table_name}: {e}")
+
+        return columns
+
+    def _get_redshift_columns(self, schema: str, table_name: str) -> Dict[str, str]:
+        """
+        Get columns for Redshift using direct SQL query
+
+        Args:
+            schema: Database schema name
+            table_name: Table name
+
+        Returns:
+            Dictionary mapping column names to data types
+        """
+        columns = {}
+
+        try:
+            # Query Redshift's pg_table_def view which is more reliable
+            query = text("""
+                SELECT column_name, data_type
+                FROM pg_table_def
+                WHERE schemaname = :schema
+                AND tablename = :table_name
+                ORDER BY column_name
+            """)
+
+            with self.engine.connect() as conn:
+                result = conn.execute(query, {"schema": schema, "table_name": table_name})
+                for row in result:
+                    columns[row[0]] = row[1]
+
+        except Exception as e:
+            # Fallback to information_schema if pg_table_def fails
+            try:
+                query = text("""
+                    SELECT column_name, data_type
+                    FROM information_schema.columns
+                    WHERE table_schema = :schema
+                    AND table_name = :table_name
+                    ORDER BY ordinal_position
+                """)
+
+                with self.engine.connect() as conn:
+                    result = conn.execute(query, {"schema": schema, "table_name": table_name})
+                    for row in result:
+                        columns[row[0]] = row[1]
+            except Exception as fallback_error:
+                print(f"Warning: Could not query Redshift table {schema}.{table_name}: {fallback_error}")
+
+        return columns
+
+    def close(self):
+        """Close the database connection"""
+        self.engine.dispose()
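For readers wiring this class in outside the CLI, a minimal usage sketch follows; the connection URI, schema, and table name are placeholders, and the matching DBAPI driver (e.g. psycopg2 for PostgreSQL) must be installed:

```python
# Minimal sketch of using DatabaseInspector directly; URI, schema, and table
# are placeholders, and a driver such as psycopg2-binary must be installed.
from dbt_cube_sync.core.db_inspector import DatabaseInspector

inspector = DatabaseInspector("postgresql://user:pass@localhost:5432/mydb")
try:
    # Returns a dict mapping column names to data-type strings,
    # e.g. {"order_id": "INTEGER", "amount": "NUMERIC(10, 2)"}
    columns = inspector.get_table_columns(schema="analytics", table_name="orders")
    for name, dtype in columns.items():
        print(f"{name}: {dtype}")
finally:
    inspector.close()  # disposes the underlying SQLAlchemy engine
```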
dbt_cube_sync/core/dbt_parser.py CHANGED

@@ -3,27 +3,39 @@ dbt manifest parser - extracts models, metrics, and column information
 """
 import json
 import os
-from typing import Dict, List
+from typing import Dict, List, Optional
 from pathlib import Path

 from .models import DbtModel, DbtColumn, DbtMetric, DbtPreAggregation, DbtRefreshKey
+from .db_inspector import DatabaseInspector


 class DbtParser:
     """Parses dbt manifest.json to extract model and metric information"""
-
-    def __init__(…
+
+    def __init__(
+        self,
+        manifest_path: str,
+        catalog_path: Optional[str] = None,
+        sqlalchemy_uri: Optional[str] = None,
+        model_filter: Optional[List[str]] = None
+    ):
         """
         Initialize the parser
-
+
         Args:
             manifest_path: Path to dbt manifest.json file
             catalog_path: Optional path to dbt catalog.json for column types
+            sqlalchemy_uri: Optional SQLAlchemy URI to connect to database for column types
+            model_filter: Optional list of model names to process (if None, processes all models)
         """
         self.manifest_path = manifest_path
         self.catalog_path = catalog_path
+        self.sqlalchemy_uri = sqlalchemy_uri
+        self.model_filter = model_filter
         self.manifest = self._load_manifest()
         self.catalog = self._load_catalog() if catalog_path else None
+        self.db_inspector = DatabaseInspector(sqlalchemy_uri) if sqlalchemy_uri else None

     def _load_manifest(self) -> dict:
         """Load the dbt manifest.json file"""

@@ -48,23 +60,32 @@ class DbtParser:
     def parse_models(self) -> List[DbtModel]:
         """
         Extract models with metrics and columns from manifest
-
+
         Returns:
             List of DbtModel instances
         """
         models = []
         nodes = self.manifest.get('nodes', {})
-
+
         for node_id, node_data in nodes.items():
             # Only process models
             if node_data.get('resource_type') != 'model':
                 continue
-
+
+            # Apply model filter if specified
+            model_name = node_data.get('name', '')
+            if self.model_filter and model_name not in self.model_filter:
+                continue
+
             model = self._parse_model(node_id, node_data)
             # Include models that have columns AND metrics (measures are required for useful Cube.js schemas)
             if model and model.columns and model.metrics:
                 models.append(model)
-
+
+        # Close database inspector if it was used
+        if self.db_inspector:
+            self.db_inspector.close()
+
         return models

     def _parse_model(self, node_id: str, node_data: dict) -> DbtModel:

@@ -93,24 +114,35 @@
         )

     def _parse_columns(self, node_id: str, node_data: dict) -> Dict[str, DbtColumn]:
-        """Parse columns for a model, enhanced with catalog data if available"""
+        """Parse columns for a model, enhanced with catalog or database data if available"""
         columns = {}
         manifest_columns = node_data.get('columns', {})
-
-        # Get catalog columns for type information
+
+        # Get catalog columns for type information (if catalog is available)
         catalog_columns = {}
         if self.catalog and node_id in self.catalog.get('nodes', {}):
             catalog_columns = self.catalog['nodes'][node_id].get('columns', {})
-
-        # …
+
+        # Get database columns for type information (if db_inspector is available)
+        db_columns = {}
+        if self.db_inspector and not self.catalog:
+            schema = node_data.get('schema', '')
+            table_name = node_data.get('name', '')
+            if schema and table_name:
+                db_columns = self.db_inspector.get_table_columns(schema, table_name)
+
+        # If manifest has columns, use them with catalog or database type info
         if manifest_columns:
             for col_name, col_data in manifest_columns.items():
                 data_type = None
-
-                # Try to get data type from catalog
+
+                # Try to get data type from catalog first
                 if col_name in catalog_columns:
                     data_type = catalog_columns[col_name].get('type', '')
-
+                # Otherwise try database
+                elif col_name in db_columns:
+                    data_type = db_columns[col_name]
+
                 columns[col_name] = DbtColumn(
                     name=col_name,
                     data_type=data_type,

@@ -118,15 +150,24 @@
                     meta=col_data.get('meta', {})
                 )
         else:
-            # If no manifest columns, use …
-            …
+            # If no manifest columns, use catalog or database columns
+            source_columns = catalog_columns or db_columns
+            for col_name in source_columns:
+                if catalog_columns:
+                    col_data = catalog_columns[col_name]
+                    data_type = col_data.get('type', '')
+                    description = f"Column from catalog: {col_name}"
+                else:
+                    data_type = db_columns[col_name]
+                    description = f"Column from database: {col_name}"
+
                 columns[col_name] = DbtColumn(
                     name=col_name,
-                    data_type=…
-                    description=…
+                    data_type=data_type,
+                    description=description,
                     meta={}
                 )
-
+
         return columns

     def _parse_metrics(self, node_data: dict) -> Dict[str, DbtMetric]:
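Putting the new constructor arguments together, a usage sketch; the paths, URI, and model names are placeholders, and note that parse_models() closes the database inspector itself:

```python
# Sketch of the extended DbtParser API; all values shown are placeholders.
from dbt_cube_sync.core.dbt_parser import DbtParser

parser = DbtParser(
    manifest_path="./target/manifest.json",
    catalog_path=None,  # no catalog.json; fall back to the live database
    sqlalchemy_uri="postgresql://user:pass@localhost:5432/mydb",
    model_filter=["orders", "customers"],  # only process these models
)
models = parser.parse_models()  # also closes the DB inspector when done
for model in models:
    print(model)  # DbtModel instances with columns and metrics populated
```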
{dbt_cube_sync-0.1.0a5.dist-info → dbt_cube_sync-0.1.0a7.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dbt-cube-sync
-Version: 0.1.0a5
+Version: 0.1.0a7
 Summary: Synchronization tool for dbt models to Cube.js schemas and BI tools
 Author: Ponder
 Requires-Python: >=3.9,<4.0

@@ -16,6 +16,7 @@ Requires-Dist: jinja2 (>=3.1.2,<4.0.0)
 Requires-Dist: pydantic (>=2.5.0,<3.0.0)
 Requires-Dist: pyyaml (>=6.0,<7.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
+Requires-Dist: sqlalchemy (>=2.0.0,<3.0.0)
 Description-Content-Type: text/markdown

 # dbt-cube-sync

@@ -25,6 +26,8 @@ A powerful synchronization tool that creates a seamless pipeline from dbt models
 ## Features

 - 🔄 **dbt → Cube.js**: Auto-generate Cube.js schemas from dbt models with metrics
+- 🗃️ **Flexible Data Type Source**: Get column types from catalog OR directly from database via SQLAlchemy
+- 🎯 **Model Filtering**: Process specific models instead of all models
 - 📊 **Cube.js → BI Tools**: Sync schemas to multiple BI platforms
 - 🏗️ **Extensible Architecture**: Plugin-based connector system for easy BI tool integration
 - 🐳 **Docker Support**: Containerized execution with orchestration support

@@ -46,6 +49,27 @@ poetry install
 poetry run dbt-cube-sync --help
 ```

+### Database Drivers (for SQLAlchemy URI feature)
+
+If you want to use the `--sqlalchemy-uri` option to fetch column types directly from your database, you'll need to install the appropriate database driver:
+
+```bash
+# PostgreSQL
+poetry add psycopg2-binary
+
+# MySQL
+poetry add pymysql
+
+# Snowflake
+poetry add snowflake-sqlalchemy
+
+# BigQuery
+poetry add sqlalchemy-bigquery
+
+# Redshift
+poetry add sqlalchemy-redshift
+```
+
 ### Using Docker

 ```bash

@@ -55,42 +79,43 @@ docker run --rm dbt-cube-sync --help

 ## Quick Start

-### 1. …
+### 1. Generate Cube.js Schemas from dbt

+**Option A: Using catalog file (traditional method)**
 ```bash
-…
-…
-…
-…
+dbt-cube-sync dbt-to-cube \
+  --manifest ./target/manifest.json \
+  --catalog ./target/catalog.json \
+  --output ./cube_output
 ```

-…
-…
+**Option B: Using database connection (no catalog needed)**
 ```bash
-…
-…
---…
---output …
+dbt-cube-sync dbt-to-cube \
+  --manifest ./target/manifest.json \
+  --sqlalchemy-uri postgresql://user:password@localhost:5432/mydb \
+  --output ./cube_output
 ```

-…
-…
+**Option C: Filter specific models**
 ```bash
-…
-…
---…
---…
+dbt-cube-sync dbt-to-cube \
+  --manifest ./target/manifest.json \
+  --sqlalchemy-uri postgresql://user:password@localhost:5432/mydb \
+  --models orders,customers,products \
+  --output ./cube_output
 ```

-### …
+### 2. Sync to BI Tool (Optional)

 ```bash
-# …
-dbt-cube-sync …
---…
---…
---…
---…
+# Sync to Superset
+dbt-cube-sync cube-to-bi superset \
+  --cube-files ./cube_output \
+  --url http://localhost:8088 \
+  --username admin \
+  --password admin \
+  --cube-connection-name Cube
 ```

 ## Configuration

@@ -119,23 +144,50 @@ connectors:

 ## CLI Commands

-### `…
+### `dbt-to-cube`
 Generate Cube.js schema files from dbt models.

 **Options:**
-- `--…
-- `--…
-- `--…
+- `--manifest` / `-m`: Path to dbt manifest.json file (required)
+- `--catalog` / `-c`: Path to dbt catalog.json file (optional if --sqlalchemy-uri is provided)
+- `--sqlalchemy-uri` / `-s`: SQLAlchemy database URI for fetching column types (optional if --catalog is provided)
+  - Example: `postgresql://user:password@localhost:5432/database`
+  - Example: `mysql://user:password@localhost:3306/database`
+  - Example: `snowflake://user:password@account/database/schema`
+- `--models`: Comma-separated list of model names to process (optional, processes all if not specified)
+  - Example: `--models model1,model2,model3`
+- `--output` / `-o`: Output directory for Cube.js files (required)
+- `--template-dir` / `-t`: Directory containing Cube.js templates (default: ./cube/templates)
+
+**Examples:**
+```bash
+# Using catalog file
+dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/
+
+# Using database connection (no catalog needed)
+dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@localhost/db -o output/

-…
+# Filter specific models
+dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@localhost/db --models users,orders -o output/
+```
+
+### `cube-to-bi`
 Sync Cube.js schemas to BI tool datasets.

 **Arguments:**
-- `…
+- `bi_tool`: BI tool type (`superset`, `tableau`, `powerbi`)

 **Options:**
-- `--cube-…
-- `--…
+- `--cube-files` / `-c`: Directory containing Cube.js files (required)
+- `--url` / `-u`: BI tool URL (required)
+- `--username` / `-n`: BI tool username (required)
+- `--password` / `-p`: BI tool password (required)
+- `--cube-connection-name` / `-d`: Name of Cube database connection in BI tool (default: Cube)
+
+**Example:**
+```bash
+dbt-cube-sync cube-to-bi superset -c cube_output/ -u http://localhost:8088 -n admin -p admin -d Cube
+```

 ### `full-sync`
 Complete pipeline: dbt models → Cube.js schemas → BI tool datasets.

@@ -170,6 +222,7 @@ dbt-cube-sync/
 │   ├── config.py          # Configuration management
 │   ├── core/
 │   │   ├── dbt_parser.py  # dbt manifest parser
+│   │   ├── db_inspector.py # Database column type inspector (SQLAlchemy)
 │   │   ├── cube_generator.py # Cube.js generator
 │   │   └── models.py      # Pydantic data models
 │   └── connectors/
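Before running the README commands above with `--sqlalchemy-uri`, it can save a round of CLI errors to confirm the driver actually imports and connects. A small sanity check, with a placeholder URI:

```python
# Quick sanity check that the SQLAlchemy driver for your URI is installed
# and the database is reachable; the URI is a placeholder.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost:5432/mydb")
with engine.connect() as conn:
    print(conn.execute(text("SELECT 1")).scalar())  # prints 1 on success
engine.dispose()
```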
{dbt_cube_sync-0.1.0a5.dist-info → dbt_cube_sync-0.1.0a7.dist-info}/RECORD CHANGED

@@ -1,5 +1,5 @@
 dbt_cube_sync/__init__.py,sha256=aifkfgUDRPL5v0LZzceH2LXu66YDkJjdpvKwXsdikbI,113
-dbt_cube_sync/cli.py,sha256=…
+dbt_cube_sync/cli.py,sha256=lZT9vYosnr5NbrMPRAxP_AOSvomqjoFGnPuu9d-vcTM,6896
 dbt_cube_sync/config.py,sha256=qhGE7CxTmh0RhPizgd3x3Yj-3L2LoC00UQIDT0q9FlQ,3858
 dbt_cube_sync/connectors/__init__.py,sha256=NG6tYZ3CYD5bG_MfNLZrUM8YoBEKArG8-AOmJ8pwvQI,52
 dbt_cube_sync/connectors/base.py,sha256=JLzerxJdt34z0kWuyieL6UQhf5_dUYPGmwkiRWBuSPY,2802

@@ -8,9 +8,10 @@ dbt_cube_sync/connectors/superset.py,sha256=5YEqadVZRPFAJkgvhqkse3JuGJkQHfyvT88j…
 dbt_cube_sync/connectors/tableau.py,sha256=jKve1zErzTbgPOtmPB92ZwZl4I6uEySedM51JiwlGrE,1261
 dbt_cube_sync/core/__init__.py,sha256=kgsawtU5dqEvnHz6dU8qwJbH3rtIV7QlK2MhtYVDCaY,46
 dbt_cube_sync/core/cube_generator.py,sha256=o_-fa09F3RQADueIgou8EFhmxKd7PbQ-hCJmXvRuvWM,10839
-dbt_cube_sync/core/…
+dbt_cube_sync/core/db_inspector.py,sha256=HK7hpU56X5ED-i_vXGB9rVs79eAwgEXS_SMdk08PHs0,3850
+dbt_cube_sync/core/dbt_parser.py,sha256=vQEUO19WYdeFNnulU2_PD4hdHUtTO-Y9BXfHuH6ZVnM,10192
 dbt_cube_sync/core/models.py,sha256=JjiFAO0vbfVZkKOd6NcZb_JMGSVMTMfQiYjHcZbKtnI,2811
-dbt_cube_sync-0.1.…
-dbt_cube_sync-0.1.…
-dbt_cube_sync-0.1.…
-dbt_cube_sync-0.1.…
+dbt_cube_sync-0.1.0a7.dist-info/METADATA,sha256=2cn4bF41UTmo9nv817Xmd2I5LjGmzNYC9gNX_tlmz3Y,8274
+dbt_cube_sync-0.1.0a7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+dbt_cube_sync-0.1.0a7.dist-info/entry_points.txt,sha256=iEAB_nZ1AoSeFwSHPY2tr02xmTHLVFKp5CJeFh0AfCw,56
+dbt_cube_sync-0.1.0a7.dist-info/RECORD,,
{dbt_cube_sync-0.1.0a5.dist-info → dbt_cube_sync-0.1.0a7.dist-info}/WHEEL: file without changes
{dbt_cube_sync-0.1.0a5.dist-info → dbt_cube_sync-0.1.0a7.dist-info}/entry_points.txt: file without changes