dbt-cube-sync 0.1.0a5__tar.gz → 0.1.0a7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dbt-cube-sync
- Version: 0.1.0a5
+ Version: 0.1.0a7
  Summary: Synchronization tool for dbt models to Cube.js schemas and BI tools
  Author: Ponder
  Requires-Python: >=3.9,<4.0
@@ -16,6 +16,7 @@ Requires-Dist: jinja2 (>=3.1.2,<4.0.0)
  Requires-Dist: pydantic (>=2.5.0,<3.0.0)
  Requires-Dist: pyyaml (>=6.0,<7.0)
  Requires-Dist: requests (>=2.31.0,<3.0.0)
+ Requires-Dist: sqlalchemy (>=2.0.0,<3.0.0)
  Description-Content-Type: text/markdown

  # dbt-cube-sync
@@ -25,6 +26,8 @@ A powerful synchronization tool that creates a seamless pipeline from dbt models
  ## Features

  - 🔄 **dbt → Cube.js**: Auto-generate Cube.js schemas from dbt models with metrics
+ - 🗃️ **Flexible Data Type Source**: Get column types from catalog OR directly from database via SQLAlchemy
+ - 🎯 **Model Filtering**: Process specific models instead of all models
  - 📊 **Cube.js → BI Tools**: Sync schemas to multiple BI platforms
  - 🏗️ **Extensible Architecture**: Plugin-based connector system for easy BI tool integration
  - 🐳 **Docker Support**: Containerized execution with orchestration support
@@ -46,6 +49,27 @@ poetry install
  poetry run dbt-cube-sync --help
  ```

+ ### Database Drivers (for SQLAlchemy URI feature)
+
+ If you want to use the `--sqlalchemy-uri` option to fetch column types directly from your database, you'll need to install the appropriate database driver:
+
+ ```bash
+ # PostgreSQL
+ poetry add psycopg2-binary
+
+ # MySQL
+ poetry add pymysql
+
+ # Snowflake
+ poetry add snowflake-sqlalchemy
+
+ # BigQuery
+ poetry add sqlalchemy-bigquery
+
+ # Redshift
+ poetry add sqlalchemy-redshift
+ ```
+
  ### Using Docker

  ```bash
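The `--sqlalchemy-uri` path added above only works if SQLAlchemy can load a driver and reach the database. A minimal connectivity check, assuming PostgreSQL with `psycopg2-binary` installed and a placeholder URI:

```python
# Minimal connectivity check before passing --sqlalchemy-uri to dbt-cube-sync.
# Assumptions: PostgreSQL, psycopg2-binary installed, placeholder credentials.
from sqlalchemy import create_engine, inspect

uri = "postgresql://user:password@localhost:5432/mydb"  # placeholder URI
engine = create_engine(uri)

# Fails fast if the driver is missing or the database is unreachable.
inspector = inspect(engine)
print(inspector.get_table_names(schema="public"))

engine.dispose()
```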
@@ -55,42 +79,43 @@ docker run --rm dbt-cube-sync --help
  ```

  ## Quick Start

- ### 1. Create Configuration File
+ ### 1. Generate Cube.js Schemas from dbt

+ **Option A: Using catalog file (traditional method)**
  ```bash
- # Create sample config
- dbt-cube-sync create-config sync-config.yaml
-
- # Edit the config file with your BI tool credentials
+ dbt-cube-sync dbt-to-cube \
+   --manifest ./target/manifest.json \
+   --catalog ./target/catalog.json \
+   --output ./cube_output
  ```

- ### 2. Generate Cube.js Schemas
-
+ **Option B: Using database connection (no catalog needed)**
  ```bash
- # Generate from dbt manifest
- dbt-cube-sync generate-cubes \\
-   --dbt-manifest ./DbtEducationalDataProject/target/manifest.json \\
-   --output-dir ./cube/conf/cube_output
+ dbt-cube-sync dbt-to-cube \
+   --manifest ./target/manifest.json \
+   --sqlalchemy-uri postgresql://user:password@localhost:5432/mydb \
+   --output ./cube_output
  ```

- ### 3. Sync to BI Tool
-
+ **Option C: Filter specific models**
  ```bash
- # Sync to Superset
- dbt-cube-sync sync-bi superset \\
-   --cube-dir ./cube/conf/cube_output \\
-   --config-file ./sync-config.yaml
+ dbt-cube-sync dbt-to-cube \
+   --manifest ./target/manifest.json \
+   --sqlalchemy-uri postgresql://user:password@localhost:5432/mydb \
+   --models orders,customers,products \
+   --output ./cube_output
  ```

- ### 4. Full Pipeline
+ ### 2. Sync to BI Tool (Optional)

  ```bash
- # Complete dbt → Cube.js → Superset pipeline
- dbt-cube-sync full-sync \\
-   --dbt-manifest ./DbtEducationalDataProject/target/manifest.json \\
-   --cube-dir ./cube/conf/cube_output \\
-   --bi-connector superset \\
-   --config-file ./sync-config.yaml
+ # Sync to Superset
+ dbt-cube-sync cube-to-bi superset \
+   --cube-files ./cube_output \
+   --url http://localhost:8088 \
+   --username admin \
+   --password admin \
+   --cube-connection-name Cube
  ```

  ## Configuration
@@ -119,23 +144,50 @@ connectors:

  ## CLI Commands

- ### `generate-cubes`
+ ### `dbt-to-cube`
  Generate Cube.js schema files from dbt models.

  **Options:**
- - `--dbt-manifest` / `-m`: Path to dbt manifest.json file
- - `--output-dir` / `-o`: Output directory for Cube.js files
- - `--template-dir` / `-t`: Directory containing Cube.js templates
+ - `--manifest` / `-m`: Path to dbt manifest.json file (required)
+ - `--catalog` / `-c`: Path to dbt catalog.json file (optional if --sqlalchemy-uri is provided)
+ - `--sqlalchemy-uri` / `-s`: SQLAlchemy database URI for fetching column types (optional if --catalog is provided)
+   - Example: `postgresql://user:password@localhost:5432/database`
+   - Example: `mysql://user:password@localhost:3306/database`
+   - Example: `snowflake://user:password@account/database/schema`
+ - `--models`: Comma-separated list of model names to process (optional, processes all if not specified)
+   - Example: `--models model1,model2,model3`
+ - `--output` / `-o`: Output directory for Cube.js files (required)
+ - `--template-dir` / `-t`: Directory containing Cube.js templates (default: ./cube/templates)
+
+ **Examples:**
+ ```bash
+ # Using catalog file
+ dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/
+
+ # Using database connection (no catalog needed)
+ dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@localhost/db -o output/

- ### `sync-bi`
+ # Filter specific models
+ dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@localhost/db --models users,orders -o output/
+ ```
+
+ ### `cube-to-bi`
  Sync Cube.js schemas to BI tool datasets.

  **Arguments:**
- - `connector`: BI tool type (`superset`, `tableau`, `powerbi`)
+ - `bi_tool`: BI tool type (`superset`, `tableau`, `powerbi`)

  **Options:**
- - `--cube-dir` / `-c`: Directory containing Cube.js files
- - `--config-file` / `-f`: Configuration file for BI tool connection
+ - `--cube-files` / `-c`: Directory containing Cube.js files (required)
+ - `--url` / `-u`: BI tool URL (required)
+ - `--username` / `-n`: BI tool username (required)
+ - `--password` / `-p`: BI tool password (required)
+ - `--cube-connection-name` / `-d`: Name of Cube database connection in BI tool (default: Cube)
+
+ **Example:**
+ ```bash
+ dbt-cube-sync cube-to-bi superset -c cube_output/ -u http://localhost:8088 -n admin -p admin -d Cube
+ ```

  ### `full-sync`
  Complete pipeline: dbt models → Cube.js schemas → BI tool datasets.
@@ -170,6 +222,7 @@ dbt-cube-sync/
  │   ├── config.py             # Configuration management
  │   ├── core/
  │   │   ├── dbt_parser.py     # dbt manifest parser
+ │   │   ├── db_inspector.py   # Database column type inspector (SQLAlchemy)
  │   │   ├── cube_generator.py # Cube.js generator
  │   │   └── models.py         # Pydantic data models
  │   └── connectors/
@@ -5,6 +5,8 @@ A powerful synchronization tool that creates a seamless pipeline from dbt models
  ## Features

  - 🔄 **dbt → Cube.js**: Auto-generate Cube.js schemas from dbt models with metrics
+ - 🗃️ **Flexible Data Type Source**: Get column types from catalog OR directly from database via SQLAlchemy
+ - 🎯 **Model Filtering**: Process specific models instead of all models
  - 📊 **Cube.js → BI Tools**: Sync schemas to multiple BI platforms
  - 🏗️ **Extensible Architecture**: Plugin-based connector system for easy BI tool integration
  - 🐳 **Docker Support**: Containerized execution with orchestration support
@@ -26,6 +28,27 @@ poetry install
  poetry run dbt-cube-sync --help
  ```

+ ### Database Drivers (for SQLAlchemy URI feature)
+
+ If you want to use the `--sqlalchemy-uri` option to fetch column types directly from your database, you'll need to install the appropriate database driver:
+
+ ```bash
+ # PostgreSQL
+ poetry add psycopg2-binary
+
+ # MySQL
+ poetry add pymysql
+
+ # Snowflake
+ poetry add snowflake-sqlalchemy
+
+ # BigQuery
+ poetry add sqlalchemy-bigquery
+
+ # Redshift
+ poetry add sqlalchemy-redshift
+ ```
+
  ### Using Docker

  ```bash
@@ -35,42 +58,43 @@ docker run --rm dbt-cube-sync --help
  ```

  ## Quick Start

- ### 1. Create Configuration File
+ ### 1. Generate Cube.js Schemas from dbt

+ **Option A: Using catalog file (traditional method)**
  ```bash
- # Create sample config
- dbt-cube-sync create-config sync-config.yaml
-
- # Edit the config file with your BI tool credentials
+ dbt-cube-sync dbt-to-cube \
+   --manifest ./target/manifest.json \
+   --catalog ./target/catalog.json \
+   --output ./cube_output
  ```

- ### 2. Generate Cube.js Schemas
-
+ **Option B: Using database connection (no catalog needed)**
  ```bash
- # Generate from dbt manifest
- dbt-cube-sync generate-cubes \\
-   --dbt-manifest ./DbtEducationalDataProject/target/manifest.json \\
-   --output-dir ./cube/conf/cube_output
+ dbt-cube-sync dbt-to-cube \
+   --manifest ./target/manifest.json \
+   --sqlalchemy-uri postgresql://user:password@localhost:5432/mydb \
+   --output ./cube_output
  ```

- ### 3. Sync to BI Tool
-
+ **Option C: Filter specific models**
  ```bash
- # Sync to Superset
- dbt-cube-sync sync-bi superset \\
-   --cube-dir ./cube/conf/cube_output \\
-   --config-file ./sync-config.yaml
+ dbt-cube-sync dbt-to-cube \
+   --manifest ./target/manifest.json \
+   --sqlalchemy-uri postgresql://user:password@localhost:5432/mydb \
+   --models orders,customers,products \
+   --output ./cube_output
  ```

- ### 4. Full Pipeline
+ ### 2. Sync to BI Tool (Optional)

  ```bash
- # Complete dbt → Cube.js → Superset pipeline
- dbt-cube-sync full-sync \\
-   --dbt-manifest ./DbtEducationalDataProject/target/manifest.json \\
-   --cube-dir ./cube/conf/cube_output \\
-   --bi-connector superset \\
-   --config-file ./sync-config.yaml
+ # Sync to Superset
+ dbt-cube-sync cube-to-bi superset \
+   --cube-files ./cube_output \
+   --url http://localhost:8088 \
+   --username admin \
+   --password admin \
+   --cube-connection-name Cube
  ```

  ## Configuration
@@ -99,23 +123,50 @@ connectors:

  ## CLI Commands

- ### `generate-cubes`
+ ### `dbt-to-cube`
  Generate Cube.js schema files from dbt models.

  **Options:**
- - `--dbt-manifest` / `-m`: Path to dbt manifest.json file
- - `--output-dir` / `-o`: Output directory for Cube.js files
- - `--template-dir` / `-t`: Directory containing Cube.js templates
+ - `--manifest` / `-m`: Path to dbt manifest.json file (required)
+ - `--catalog` / `-c`: Path to dbt catalog.json file (optional if --sqlalchemy-uri is provided)
+ - `--sqlalchemy-uri` / `-s`: SQLAlchemy database URI for fetching column types (optional if --catalog is provided)
+   - Example: `postgresql://user:password@localhost:5432/database`
+   - Example: `mysql://user:password@localhost:3306/database`
+   - Example: `snowflake://user:password@account/database/schema`
+ - `--models`: Comma-separated list of model names to process (optional, processes all if not specified)
+   - Example: `--models model1,model2,model3`
+ - `--output` / `-o`: Output directory for Cube.js files (required)
+ - `--template-dir` / `-t`: Directory containing Cube.js templates (default: ./cube/templates)
+
+ **Examples:**
+ ```bash
+ # Using catalog file
+ dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/
+
+ # Using database connection (no catalog needed)
+ dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@localhost/db -o output/

- ### `sync-bi`
+ # Filter specific models
+ dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@localhost/db --models users,orders -o output/
+ ```
+
+ ### `cube-to-bi`
  Sync Cube.js schemas to BI tool datasets.

  **Arguments:**
- - `connector`: BI tool type (`superset`, `tableau`, `powerbi`)
+ - `bi_tool`: BI tool type (`superset`, `tableau`, `powerbi`)

  **Options:**
- - `--cube-dir` / `-c`: Directory containing Cube.js files
- - `--config-file` / `-f`: Configuration file for BI tool connection
+ - `--cube-files` / `-c`: Directory containing Cube.js files (required)
+ - `--url` / `-u`: BI tool URL (required)
+ - `--username` / `-n`: BI tool username (required)
+ - `--password` / `-p`: BI tool password (required)
+ - `--cube-connection-name` / `-d`: Name of Cube database connection in BI tool (default: Cube)
+
+ **Example:**
+ ```bash
+ dbt-cube-sync cube-to-bi superset -c cube_output/ -u http://localhost:8088 -n admin -p admin -d Cube
+ ```

  ### `full-sync`
  Complete pipeline: dbt models → Cube.js schemas → BI tool datasets.
@@ -150,6 +201,7 @@ dbt-cube-sync/
  │   ├── config.py             # Configuration management
  │   ├── core/
  │   │   ├── dbt_parser.py     # dbt manifest parser
+ │   │   ├── db_inspector.py   # Database column type inspector (SQLAlchemy)
  │   │   ├── cube_generator.py # Cube.js generator
  │   │   └── models.py         # Pydantic data models
  │   └── connectors/
@@ -24,10 +24,12 @@ class CustomGroup(click.Group):
              click.echo("\nAvailable commands:")
              click.echo(" dbt-cube-sync --help # Show help")
              click.echo(" dbt-cube-sync --version # Show version")
-             click.echo(" dbt-cube-sync dbt-to-cube -m manifest -c catalog -o output # Generate Cube.js schemas")
+             click.echo(" dbt-cube-sync dbt-to-cube -m manifest -c catalog -o output # Generate with catalog")
+             click.echo(" dbt-cube-sync dbt-to-cube -m manifest -s postgresql://user:pass@host/db -o output # Generate with database")
+             click.echo(" dbt-cube-sync dbt-to-cube -m manifest -s <uri> --models model1,model2 -o output # Filter specific models")
              click.echo(" dbt-cube-sync cube-to-bi superset -c cubes -u url -n user -p pass -d Cube # Sync to BI tool")
              ctx.exit(1)
-
+
          return super().get_command(ctx, cmd_name)


@@ -39,35 +41,66 @@ def main():


  @main.command()
- @click.option('--manifest', '-m',
+ @click.option('--manifest', '-m',
                required=True,
                help='Path to dbt manifest.json file')
  @click.option('--catalog', '-c',
-               required=True,
-               help='Path to dbt catalog.json file')
+               required=False,
+               default=None,
+               help='Path to dbt catalog.json file (optional if --sqlalchemy-uri is provided)')
+ @click.option('--sqlalchemy-uri', '-s',
+               required=False,
+               default=None,
+               help='SQLAlchemy database URI for fetching column types (e.g., postgresql://user:pass@host:port/db)')
+ @click.option('--models',
+               required=False,
+               default=None,
+               help='Comma-separated list of model names to process (e.g., model1,model2). If not specified, processes all models')
  @click.option('--output', '-o',
                required=True,
                help='Output directory for Cube.js files')
  @click.option('--template-dir', '-t',
                default='./cube/templates',
                help='Directory containing Cube.js templates')
- def dbt_to_cube(manifest: str, catalog: str, output: str, template_dir: str):
+ def dbt_to_cube(manifest: str, catalog: Optional[str], sqlalchemy_uri: Optional[str], models: Optional[str], output: str, template_dir: str):
      """Generate Cube.js schemas from dbt models"""
      try:
+         # Validate that at least one source of column types is provided
+         if not catalog and not sqlalchemy_uri:
+             click.echo("❌ Error: You must provide either --catalog or --sqlalchemy-uri to get column data types", err=True)
+             click.echo("💡 Example with catalog: dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/", err=True)
+             click.echo("💡 Example with database: dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@host:port/db -o output/", err=True)
+             sys.exit(1)
+
+         # Parse model filter if provided
+         model_filter = None
+         if models:
+             model_filter = [m.strip() for m in models.split(',')]
+             click.echo(f"🎯 Filtering models: {', '.join(model_filter)}")
+
          click.echo("🔄 Parsing dbt manifest...")
-         parser = DbtParser(manifest, catalog)
-         models = parser.parse_models()
-
-         click.echo(f"📊 Found {len(models)} dbt models")
-
+         parser = DbtParser(
+             manifest_path=manifest,
+             catalog_path=catalog,
+             sqlalchemy_uri=sqlalchemy_uri,
+             model_filter=model_filter
+         )
+         parsed_models = parser.parse_models()
+
+         click.echo(f"📊 Found {len(parsed_models)} dbt models")
+
+         if len(parsed_models) == 0:
+             click.echo("⚠️ No models found. Make sure your models have both columns and metrics defined.")
+             sys.exit(0)
+
          click.echo("🏗️ Generating Cube.js schemas...")
          generator = CubeGenerator(template_dir, output)
-         generated_files = generator.generate_cube_files(models)
-
+         generated_files = generator.generate_cube_files(parsed_models)
+
          click.echo(f"✅ Generated {len(generated_files)} Cube.js files:")
          for file_path in generated_files:
              click.echo(f" • {file_path}")
-
+
      except Exception as e:
          click.echo(f"❌ Error: {str(e)}", err=True)
          sys.exit(1)
@@ -0,0 +1,112 @@
+ """
+ Database inspector - fetches column types using SQLAlchemy
+ """
+ from typing import Dict, Optional
+ from sqlalchemy import create_engine, inspect, MetaData, Table, text
+ from sqlalchemy.engine import Engine
+
+
+ class DatabaseInspector:
+     """Inspects database schema to extract column type information"""
+
+     def __init__(self, sqlalchemy_uri: str):
+         """
+         Initialize the database inspector
+
+         Args:
+             sqlalchemy_uri: SQLAlchemy connection URI (e.g., postgresql://user:pass@host:port/db)
+         """
+         # Add connect_args for Redshift compatibility
+         if 'redshift' in sqlalchemy_uri:
+             self.engine: Engine = create_engine(
+                 sqlalchemy_uri,
+                 connect_args={'sslmode': 'prefer'}
+             )
+         else:
+             self.engine: Engine = create_engine(sqlalchemy_uri)
+
+         self.inspector = inspect(self.engine)
+         self.is_redshift = 'redshift' in sqlalchemy_uri.lower()
+
+     def get_table_columns(self, schema: str, table_name: str) -> Dict[str, str]:
+         """
+         Get column names and their data types for a specific table
+
+         Args:
+             schema: Database schema name
+             table_name: Table name
+
+         Returns:
+             Dictionary mapping column names to data types
+         """
+         columns = {}
+
+         try:
+             # For Redshift, use direct SQL query to avoid pg_catalog issues
+             if self.is_redshift:
+                 columns = self._get_redshift_columns(schema, table_name)
+             else:
+                 # Get columns from the database using inspector
+                 table_columns = self.inspector.get_columns(table_name, schema=schema)
+
+                 for column in table_columns:
+                     col_name = column['name']
+                     col_type = str(column['type'])
+                     columns[col_name] = col_type
+
+         except Exception as e:
+             print(f"Warning: Could not inspect table {schema}.{table_name}: {e}")
+
+         return columns
+
+     def _get_redshift_columns(self, schema: str, table_name: str) -> Dict[str, str]:
+         """
+         Get columns for Redshift using direct SQL query
+
+         Args:
+             schema: Database schema name
+             table_name: Table name
+
+         Returns:
+             Dictionary mapping column names to data types
+         """
+         columns = {}
+
+         try:
+             # Query Redshift's pg_table_def view which is more reliable
+             query = text("""
+                 SELECT column_name, data_type
+                 FROM pg_table_def
+                 WHERE schemaname = :schema
+                 AND tablename = :table_name
+                 ORDER BY column_name
+             """)
+
+             with self.engine.connect() as conn:
+                 result = conn.execute(query, {"schema": schema, "table_name": table_name})
+                 for row in result:
+                     columns[row[0]] = row[1]
+
+         except Exception as e:
+             # Fallback to information_schema if pg_table_def fails
+             try:
+                 query = text("""
+                     SELECT column_name, data_type
+                     FROM information_schema.columns
+                     WHERE table_schema = :schema
+                     AND table_name = :table_name
+                     ORDER BY ordinal_position
+                 """)
+
+                 with self.engine.connect() as conn:
+                     result = conn.execute(query, {"schema": schema, "table_name": table_name})
+                     for row in result:
+                         columns[row[0]] = row[1]
+             except Exception as fallback_error:
+                 print(f"Warning: Could not query Redshift table {schema}.{table_name}: {fallback_error}")
+
+         return columns
+
+     def close(self):
+         """Close the database connection"""
+         self.engine.dispose()
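The new database inspector module above can also be exercised on its own. A minimal sketch, assuming the package imports as `dbt_cube_sync`, a reachable PostgreSQL database, and a hypothetical `public.orders` table:

```python
# Hedged sketch driving DatabaseInspector directly.
# Assumptions: the package imports as `dbt_cube_sync`, the URI points at a
# reachable PostgreSQL database, and `public.orders` is a hypothetical table.
from dbt_cube_sync.core.db_inspector import DatabaseInspector

inspector = DatabaseInspector("postgresql://user:password@localhost:5432/mydb")
try:
    columns = inspector.get_table_columns(schema="public", table_name="orders")
    for name, data_type in columns.items():
        print(f"{name}: {data_type}")
finally:
    inspector.close()  # disposes the underlying SQLAlchemy engine
```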
@@ -3,27 +3,39 @@ dbt manifest parser - extracts models, metrics, and column information
  """
  import json
  import os
- from typing import Dict, List
+ from typing import Dict, List, Optional
  from pathlib import Path

  from .models import DbtModel, DbtColumn, DbtMetric, DbtPreAggregation, DbtRefreshKey
+ from .db_inspector import DatabaseInspector


  class DbtParser:
      """Parses dbt manifest.json to extract model and metric information"""
-
-     def __init__(self, manifest_path: str, catalog_path: str = None):
+
+     def __init__(
+         self,
+         manifest_path: str,
+         catalog_path: Optional[str] = None,
+         sqlalchemy_uri: Optional[str] = None,
+         model_filter: Optional[List[str]] = None
+     ):
          """
          Initialize the parser
-
+
          Args:
              manifest_path: Path to dbt manifest.json file
              catalog_path: Optional path to dbt catalog.json for column types
+             sqlalchemy_uri: Optional SQLAlchemy URI to connect to database for column types
+             model_filter: Optional list of model names to process (if None, processes all models)
          """
          self.manifest_path = manifest_path
          self.catalog_path = catalog_path
+         self.sqlalchemy_uri = sqlalchemy_uri
+         self.model_filter = model_filter
          self.manifest = self._load_manifest()
          self.catalog = self._load_catalog() if catalog_path else None
+         self.db_inspector = DatabaseInspector(sqlalchemy_uri) if sqlalchemy_uri else None

      def _load_manifest(self) -> dict:
          """Load the dbt manifest.json file"""
@@ -48,23 +60,32 @@ class DbtParser:
      def parse_models(self) -> List[DbtModel]:
          """
          Extract models with metrics and columns from manifest
-
+
          Returns:
              List of DbtModel instances
          """
          models = []
          nodes = self.manifest.get('nodes', {})
-
+
          for node_id, node_data in nodes.items():
              # Only process models
              if node_data.get('resource_type') != 'model':
                  continue
-
+
+             # Apply model filter if specified
+             model_name = node_data.get('name', '')
+             if self.model_filter and model_name not in self.model_filter:
+                 continue
+
              model = self._parse_model(node_id, node_data)
              # Include models that have columns AND metrics (measures are required for useful Cube.js schemas)
              if model and model.columns and model.metrics:
                  models.append(model)
-
+
+         # Close database inspector if it was used
+         if self.db_inspector:
+             self.db_inspector.close()
+
          return models

      def _parse_model(self, node_id: str, node_data: dict) -> DbtModel:
@@ -93,24 +114,35 @@ class DbtParser:
          )

      def _parse_columns(self, node_id: str, node_data: dict) -> Dict[str, DbtColumn]:
-         """Parse columns for a model, enhanced with catalog data if available"""
+         """Parse columns for a model, enhanced with catalog or database data if available"""
          columns = {}
          manifest_columns = node_data.get('columns', {})
-
-         # Get catalog columns for type information
+
+         # Get catalog columns for type information (if catalog is available)
          catalog_columns = {}
          if self.catalog and node_id in self.catalog.get('nodes', {}):
              catalog_columns = self.catalog['nodes'][node_id].get('columns', {})
-
-         # If manifest has columns, use them with catalog type info
+
+         # Get database columns for type information (if db_inspector is available)
+         db_columns = {}
+         if self.db_inspector and not self.catalog:
+             schema = node_data.get('schema', '')
+             table_name = node_data.get('name', '')
+             if schema and table_name:
+                 db_columns = self.db_inspector.get_table_columns(schema, table_name)
+
+         # If manifest has columns, use them with catalog or database type info
          if manifest_columns:
              for col_name, col_data in manifest_columns.items():
                  data_type = None
-
-                 # Try to get data type from catalog
+
+                 # Try to get data type from catalog first
                  if col_name in catalog_columns:
                      data_type = catalog_columns[col_name].get('type', '')
-
+                 # Otherwise try database
+                 elif col_name in db_columns:
+                     data_type = db_columns[col_name]
+
                  columns[col_name] = DbtColumn(
                      name=col_name,
                      data_type=data_type,
@@ -118,15 +150,24 @@ class DbtParser:
                      meta=col_data.get('meta', {})
                  )
          else:
-             # If no manifest columns, use all catalog columns
-             for col_name, col_data in catalog_columns.items():
+             # If no manifest columns, use catalog or database columns
+             source_columns = catalog_columns or db_columns
+             for col_name in source_columns:
+                 if catalog_columns:
+                     col_data = catalog_columns[col_name]
+                     data_type = col_data.get('type', '')
+                     description = f"Column from catalog: {col_name}"
+                 else:
+                     data_type = db_columns[col_name]
+                     description = f"Column from database: {col_name}"
+
                  columns[col_name] = DbtColumn(
                      name=col_name,
-                     data_type=col_data.get('type', ''),
-                     description=f"Column from catalog: {col_name}",
+                     data_type=data_type,
+                     description=description,
                      meta={}
                  )
-
+
          return columns

      def _parse_metrics(self, node_data: dict) -> Dict[str, DbtMetric]:
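Combined with the CLI change, the parser changes above amount to the following programmatic flow. A hedged sketch, assuming the modules import as `dbt_cube_sync.core.*` and using placeholder paths, URI, and model names:

```python
# Hedged sketch of the dbt-to-cube flow without a catalog.json, mirroring the
# CLI command above. Module paths, file paths, URI, and model names are placeholders.
from dbt_cube_sync.core.dbt_parser import DbtParser
from dbt_cube_sync.core.cube_generator import CubeGenerator

parser = DbtParser(
    manifest_path="./target/manifest.json",
    catalog_path=None,                      # no catalog.json needed
    sqlalchemy_uri="postgresql://user:password@localhost:5432/mydb",
    model_filter=["orders", "customers"],   # equivalent to --models orders,customers
)
models = parser.parse_models()              # column types come from DatabaseInspector

generator = CubeGenerator("./cube/templates", "./cube_output")
generated_files = generator.generate_cube_files(models)
print(f"Generated {len(generated_files)} Cube.js files")
```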
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "dbt-cube-sync"
- version = "0.1.0a5"
+ version = "0.1.0a7"
  description = "Synchronization tool for dbt models to Cube.js schemas and BI tools"
  authors = ["Ponder"]
  readme = "README.md"
@@ -13,6 +13,7 @@ pyyaml = "^6.0"
  click = "^8.1.7"
  pydantic = "^2.5.0"
  jinja2 = "^3.1.2"
+ sqlalchemy = "^2.0.0"

  [tool.poetry.group.dev.dependencies]
  pytest = "^7.4.0"