dbt-cube-sync 0.1.0a6__py3-none-any.whl → 0.1.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dbt-cube-sync might be problematic. See the registry page for details.
- dbt_cube_sync/cli.py +362 -14
- dbt_cube_sync/core/cube_generator.py +14 -10
- dbt_cube_sync/core/db_inspector.py +60 -14
- dbt_cube_sync/core/dbt_parser.py +74 -8
- dbt_cube_sync/core/models.py +17 -1
- dbt_cube_sync/core/state_manager.py +221 -0
- {dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/METADATA +100 -19
- dbt_cube_sync-0.1.0a8.dist-info/RECORD +18 -0
- {dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/WHEEL +1 -1
- dbt_cube_sync-0.1.0a6.dist-info/RECORD +0 -17
- {dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/entry_points.txt +0 -0
dbt_cube_sync/cli.py
CHANGED

@@ -2,12 +2,14 @@
 CLI interface for dbt-cube-sync tool
 """
 import click
+import os
 import sys
 from pathlib import Path
 from typing import Optional
 
 from .core.dbt_parser import DbtParser
 from .core.cube_generator import CubeGenerator
+from .core.state_manager import StateManager
 from .connectors.base import ConnectorRegistry
 from .config import Config
 
@@ -62,47 +64,142 @@ def main():
 @click.option('--template-dir', '-t',
               default='./cube/templates',
               help='Directory containing Cube.js templates')
-
+@click.option('--state-path',
+              required=False,
+              default='.dbt-cube-sync-state.json',
+              help='Path to state file for incremental sync (default: .dbt-cube-sync-state.json)')
+@click.option('--force-full-sync',
+              is_flag=True,
+              default=False,
+              help='Force full regeneration, ignore cached state')
+@click.option('--no-state',
+              is_flag=True,
+              default=False,
+              help='Disable state tracking (legacy behavior)')
+def dbt_to_cube(
+    manifest: str,
+    catalog: Optional[str],
+    sqlalchemy_uri: Optional[str],
+    models: Optional[str],
+    output: str,
+    template_dir: str,
+    state_path: str,
+    force_full_sync: bool,
+    no_state: bool
+):
     """Generate Cube.js schemas from dbt models"""
     try:
         # Validate that at least one source of column types is provided
         if not catalog and not sqlalchemy_uri:
-            click.echo("
-            click.echo("
-            click.echo("
+            click.echo("Error: You must provide either --catalog or --sqlalchemy-uri to get column data types", err=True)
+            click.echo("Example with catalog: dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/", err=True)
+            click.echo("Example with database: dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@host:port/db -o output/", err=True)
             sys.exit(1)
 
         # Parse model filter if provided
         model_filter = None
         if models:
             model_filter = [m.strip() for m in models.split(',')]
-            click.echo(f"
+            click.echo(f"Filtering models: {', '.join(model_filter)}")
+
+        # Initialize state manager (if enabled)
+        state_manager = None
+        previous_state = None
+        use_incremental = not no_state and not force_full_sync
+
+        if not no_state:
+            state_manager = StateManager(state_path)
+            if not force_full_sync:
+                previous_state = state_manager.load_state()
+                if previous_state:
+                    click.echo(f"Loaded previous state from {state_path}")
 
-        click.echo("
+        click.echo("Parsing dbt manifest...")
         parser = DbtParser(
             manifest_path=manifest,
             catalog_path=catalog,
             sqlalchemy_uri=sqlalchemy_uri,
             model_filter=model_filter
         )
-        parsed_models = parser.parse_models()
 
-
+        # Get all manifest nodes with metrics (for checksum comparison)
+        manifest_nodes = parser.get_manifest_nodes_with_metrics()
+        click.echo(f"Found {len(manifest_nodes)} models with metrics in manifest")
+
+        # Determine which models need regeneration
+        if use_incremental and previous_state:
+            added, modified, removed = state_manager.get_changed_models(
+                manifest_nodes, previous_state
+            )
+
+            if not added and not modified and not removed:
+                click.echo("No changes detected. All models are up to date.")
+                sys.exit(0)
+
+            click.echo(f"Incremental sync: {len(added)} added, {len(modified)} modified, {len(removed)} removed")
+
+            # Clean up files for removed models
+            if removed:
+                files_to_delete = state_manager.get_files_to_delete(previous_state, removed)
+                for file_path in files_to_delete:
+                    try:
+                        os.remove(file_path)
+                        click.echo(f"  Deleted: {Path(file_path).name}")
+                    except OSError as e:
+                        click.echo(f"  Warning: Could not delete {file_path}: {e}")
+
+            # Only parse changed models
+            node_ids_to_process = list(added | modified)
+            if not node_ids_to_process:
+                # Only removals, no models to regenerate
+                if state_manager:
+                    new_state = state_manager.merge_state(
+                        previous_state, manifest, manifest_nodes, {}, removed
+                    )
+                    state_manager.save_state(new_state)
+                    click.echo(f"State saved to {state_path}")
+                click.echo("Sync complete (only removals)")
+                sys.exit(0)
+
+            parsed_models = parser.parse_models(node_ids_filter=node_ids_to_process)
+        else:
+            # Full sync - parse all models
+            if force_full_sync:
+                click.echo("Forcing full sync...")
+            parsed_models = parser.parse_models()
+
+        click.echo(f"Processing {len(parsed_models)} dbt models")
 
         if len(parsed_models) == 0:
-            click.echo("
+            click.echo("No models found. Make sure your models have both columns and metrics defined.")
             sys.exit(0)
 
-        click.echo("
+        click.echo("Generating Cube.js schemas...")
         generator = CubeGenerator(template_dir, output)
         generated_files = generator.generate_cube_files(parsed_models)
 
-        click.echo(f"
-        for file_path in generated_files:
-            click.echo(f"
+        click.echo(f"Generated {len(generated_files)} Cube.js files:")
+        for node_id, file_path in generated_files.items():
+            click.echo(f"  {file_path}")
+
+        # Save state (if enabled)
+        if state_manager:
+            if use_incremental and previous_state:
+                # Merge with previous state
+                removed_ids = removed if 'removed' in dir() else set()
+                new_state = state_manager.merge_state(
+                    previous_state, manifest, manifest_nodes, generated_files, removed_ids
+                )
+            else:
+                # Create fresh state
+                new_state = state_manager.create_state_from_results(
+                    manifest, manifest_nodes, generated_files
+                )
            state_manager.save_state(new_state)
+            click.echo(f"State saved to {state_path}")
 
     except Exception as e:
-        click.echo(f"
+        click.echo(f"Error: {str(e)}", err=True)
         sys.exit(1)
 
 
@@ -164,5 +261,256 @@ def version():
     click.echo(f"dbt-cube-sync version {__version__}")
 
 
+@main.command()
+@click.option('--manifest', '-m',
+              required=True,
+              help='Path to dbt manifest.json file')
+@click.option('--catalog', '-c',
+              required=False,
+              default=None,
+              help='Path to dbt catalog.json file')
+@click.option('--sqlalchemy-uri', '-s',
+              required=False,
+              default=None,
+              help='SQLAlchemy database URI for fetching column types')
+@click.option('--output', '-o',
+              required=True,
+              help='Output directory for Cube.js files')
+@click.option('--state-path',
+              required=False,
+              default='.dbt-cube-sync-state.json',
+              help='Path to state file for incremental sync')
+@click.option('--force-full-sync',
+              is_flag=True,
+              default=False,
+              help='Force full regeneration, ignore cached state')
+@click.option('--superset-url',
+              required=False,
+              default=None,
+              help='Superset URL (e.g., http://localhost:8088)')
+@click.option('--superset-username',
+              required=False,
+              default=None,
+              help='Superset username')
+@click.option('--superset-password',
+              required=False,
+              default=None,
+              help='Superset password')
+@click.option('--cube-connection-name',
+              default='Cube',
+              help='Name of Cube database connection in Superset')
+@click.option('--rag-api-url',
+              required=False,
+              default=None,
+              help='RAG API URL for embedding updates (e.g., http://localhost:8000)')
+def sync_all(
+    manifest: str,
+    catalog: Optional[str],
+    sqlalchemy_uri: Optional[str],
+    output: str,
+    state_path: str,
+    force_full_sync: bool,
+    superset_url: Optional[str],
+    superset_username: Optional[str],
+    superset_password: Optional[str],
+    cube_connection_name: str,
+    rag_api_url: Optional[str]
+):
+    """
+    Ultimate sync command: dbt → Cube.js → BI tools → RAG embeddings.
+
+    Incrementally syncs everything based on state file. Only processes
+    models that have changed since last sync.
+
+    Examples:
+
+        # Basic incremental sync (Cube.js only)
+        dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output
+
+        # Full pipeline with Superset
+        dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output \\
+            --superset-url http://localhost:8088 --superset-username admin --superset-password admin
+
+        # Full pipeline with Superset + RAG embeddings
+        dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output \\
+            --superset-url http://localhost:8088 --superset-username admin --superset-password admin \\
+            --rag-api-url http://localhost:8000
+
+        # Force full rebuild
+        dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output --force-full-sync
+    """
+    import requests
+
+    try:
+        # Validate that at least one source of column types is provided
+        if not catalog and not sqlalchemy_uri:
+            click.echo("Error: You must provide either --catalog or --sqlalchemy-uri", err=True)
+            sys.exit(1)
+
+        click.echo("=" * 60)
+        click.echo("SYNC-ALL: Incremental Pipeline")
+        click.echo("=" * 60)
+
+        # Track what changed for downstream updates
+        changes_detected = False
+        added_models = set()
+        modified_models = set()
+        removed_models = set()
+
+        # ============================================================
+        # STEP 1: Incremental dbt → Cube.js sync
+        # ============================================================
+        click.echo("\n[1/3] dbt → Cube.js schemas")
+        click.echo("-" * 40)
+
+        # Initialize state manager
+        state_manager = StateManager(state_path)
+        previous_state = None
+
+        if not force_full_sync:
+            previous_state = state_manager.load_state()
+            if previous_state:
+                click.echo(f"  Loaded state from {state_path}")
+
+        # Parse manifest
+        parser = DbtParser(
+            manifest_path=manifest,
+            catalog_path=catalog,
+            sqlalchemy_uri=sqlalchemy_uri
+        )
+
+        manifest_nodes = parser.get_manifest_nodes_with_metrics()
+        click.echo(f"  Found {len(manifest_nodes)} models with metrics")
+
+        # Determine what changed
+        if not force_full_sync and previous_state:
+            added_models, modified_models, removed_models = state_manager.get_changed_models(
+                manifest_nodes, previous_state
+            )
+
+            if not added_models and not modified_models and not removed_models:
+                click.echo("  No changes detected - all models up to date")
+            else:
+                changes_detected = True
+                click.echo(f"  Changes: {len(added_models)} added, {len(modified_models)} modified, {len(removed_models)} removed")
+
+                # Clean up removed model files
+                if removed_models:
+                    files_to_delete = state_manager.get_files_to_delete(previous_state, removed_models)
+                    for file_path in files_to_delete:
+                        try:
+                            os.remove(file_path)
+                            click.echo(f"  Deleted: {Path(file_path).name}")
+                        except OSError:
+                            pass
+
+            node_ids_to_process = list(added_models | modified_models)
+        else:
+            # Force full sync
+            changes_detected = True
+            added_models = set(manifest_nodes.keys())
+            node_ids_to_process = list(manifest_nodes.keys())
+            click.echo(f"  Full sync: processing all {len(node_ids_to_process)} models")
+
+        # Generate Cube.js files for changed models
+        generated_files = {}
+        if node_ids_to_process:
+            parsed_models = parser.parse_models(node_ids_filter=node_ids_to_process)
+
+            if parsed_models:
+                generator = CubeGenerator('./cube/templates', output)
+                generated_files = generator.generate_cube_files(parsed_models)
+                click.echo(f"  Generated {len(generated_files)} Cube.js files")
+
+        # Save state
+        if changes_detected or force_full_sync:
+            if previous_state and not force_full_sync:
+                new_state = state_manager.merge_state(
+                    previous_state, manifest, manifest_nodes, generated_files, removed_models
+                )
+            else:
+                new_state = state_manager.create_state_from_results(
+                    manifest, manifest_nodes, generated_files
+                )
+            state_manager.save_state(new_state)
+            click.echo(f"  State saved to {state_path}")
+
+        # ============================================================
+        # STEP 2: Sync to Superset (if configured)
+        # ============================================================
+        if superset_url and superset_username and superset_password:
+            click.echo("\n[2/3] Cube.js → Superset")
+            click.echo("-" * 40)
+
+            if not changes_detected and not force_full_sync:
+                click.echo("  Skipped - no changes detected")
+            else:
+                connector_config = {
+                    'url': superset_url,
+                    'username': superset_username,
+                    'password': superset_password,
+                    'database_name': cube_connection_name
+                }
+
+                connector = ConnectorRegistry.get_connector('superset', **connector_config)
+                results = connector.sync_cube_schemas(output)
+
+                successful = sum(1 for r in results if r.status == 'success')
+                failed = sum(1 for r in results if r.status == 'failed')
+                click.echo(f"  Synced: {successful} successful, {failed} failed")
+        else:
+            click.echo("\n[2/3] Cube.js → Superset")
+            click.echo("-" * 40)
+            click.echo("  Skipped - no Superset credentials provided")
+
+        # ============================================================
+        # STEP 3: Update RAG embeddings (if configured)
+        # ============================================================
+        if rag_api_url:
+            click.echo("\n[3/3] Update RAG embeddings")
+            click.echo("-" * 40)
+
+            if not changes_detected and not force_full_sync:
+                click.echo("  Skipped - no changes detected")
+            else:
+                try:
+                    # Call the RAG API to re-ingest embeddings
+                    response = requests.post(
+                        f"{rag_api_url.rstrip('/')}/embeddings/ingest",
+                        json={"schema_dir": output},
+                        timeout=120
+                    )
+
+                    if response.status_code == 200:
+                        result = response.json()
+                        click.echo(f"  Ingested {result.get('schemas_ingested', 0)} schema documents")
+                    else:
+                        click.echo(f"  Warning: RAG API returned {response.status_code}", err=True)
+                except requests.RequestException as e:
+                    click.echo(f"  Warning: Could not reach RAG API: {e}", err=True)
+        else:
+            click.echo("\n[3/3] Update RAG embeddings")
+            click.echo("-" * 40)
+            click.echo("  Skipped - no RAG API URL provided")
+
+        # ============================================================
+        # Summary
+        # ============================================================
+        click.echo("\n" + "=" * 60)
+        click.echo("SYNC COMPLETE")
+        click.echo("=" * 60)
+
+        if changes_detected or force_full_sync:
+            click.echo(f"  Models processed: {len(added_models) + len(modified_models)}")
+            click.echo(f"  Models removed: {len(removed_models)}")
+            click.echo(f"  Cube.js files generated: {len(generated_files)}")
+        else:
+            click.echo("  No changes - everything is up to date")
+
+    except Exception as e:
+        click.echo(f"Error: {str(e)}", err=True)
+        sys.exit(1)
+
+
 if __name__ == '__main__':
     main()
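The new flags can be exercised without a console install by driving the CLI through click's test runner. A minimal sketch, assuming a dbt project whose `target/manifest.json` and `target/catalog.json` already exist (the paths and output directory are placeholders):

```python
# Minimal sketch: exercising the new incremental flags via click's CliRunner.
# The manifest/catalog paths and output directory are placeholders for a real
# dbt project; the flag names come from the diff above.
from click.testing import CliRunner

from dbt_cube_sync.cli import main

runner = CliRunner()

result = runner.invoke(main, [
    "dbt-to-cube",
    "-m", "target/manifest.json",
    "-c", "target/catalog.json",
    "-o", "./cube_output",
    "--state-path", ".dbt-cube-sync-state.json",  # default, shown for clarity
])
print(result.output)

# A second run against an unchanged manifest should exit early with
# "No changes detected. All models are up to date."
result = runner.invoke(main, [
    "dbt-to-cube",
    "-m", "target/manifest.json",
    "-c", "target/catalog.json",
    "-o", "./cube_output",
])
print(result.output)
```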
dbt_cube_sync/core/cube_generator.py
CHANGED

@@ -29,27 +29,31 @@ class CubeGenerator:
         # Initialize Jinja2 environment
         self.env = Environment(loader=FileSystemLoader(str(self.template_dir)))
 
-    def generate_cube_files(
+    def generate_cube_files(
+        self, models: List[DbtModel], return_node_mapping: bool = False
+    ) -> Dict[str, str]:
         """
         Generate Cube.js files for all models
-
+
         Args:
             models: List of DbtModel instances
-
+            return_node_mapping: If True, returns dict mapping node_id -> file_path
+                                 If False (legacy), returns list of file paths
+
         Returns:
-
+            Dict mapping node_id -> file_path (for incremental sync support)
         """
-        generated_files =
-
+        generated_files = {}
+
         for model in models:
             try:
                 cube_schema = self._convert_model_to_cube(model)
                 file_path = self._write_cube_file(cube_schema)
-                generated_files.
-                print(f"
+                generated_files[model.node_id] = str(file_path)
+                print(f"  Generated: {file_path.name}")
             except Exception as e:
-                print(f"
-
+                print(f"  Error generating cube for {model.name}: {str(e)}")
+
         return generated_files
 
     def _convert_model_to_cube(self, model: DbtModel) -> CubeSchema:
dbt_cube_sync/core/db_inspector.py
CHANGED

@@ -1,27 +1,42 @@
 """
-Database inspector - fetches column types using SQLAlchemy
+Database inspector - fetches column types using SQLAlchemy MetaData reflection.
+
+Uses SQLAlchemy's Table(..., autoload_with=engine) for portable, database-agnostic
+column type extraction. This approach works consistently across PostgreSQL, MySQL,
+Snowflake, BigQuery, Redshift, and other databases.
 """
 from typing import Dict, Optional
-from sqlalchemy import create_engine,
+from sqlalchemy import create_engine, MetaData, Table
 from sqlalchemy.engine import Engine
 
 
 class DatabaseInspector:
-    """Inspects database schema to extract column type information"""
+    """Inspects database schema to extract column type information using SQLAlchemy reflection."""
 
     def __init__(self, sqlalchemy_uri: str):
         """
-        Initialize the database inspector
+        Initialize the database inspector.
 
         Args:
             sqlalchemy_uri: SQLAlchemy connection URI (e.g., postgresql://user:pass@host:port/db)
         """
-
-
+        # Add connect_args for Redshift compatibility
+        if 'redshift' in sqlalchemy_uri.lower():
+            self.engine: Engine = create_engine(
+                sqlalchemy_uri,
+                connect_args={'sslmode': 'prefer'}
+            )
+        else:
+            self.engine: Engine = create_engine(sqlalchemy_uri)
+
+        self.metadata = MetaData()
+        self._table_cache: Dict[str, Table] = {}
 
     def get_table_columns(self, schema: str, table_name: str) -> Dict[str, str]:
         """
-        Get column names and their data types for a specific table
+        Get column names and their data types for a specific table.
+
+        Uses SQLAlchemy MetaData reflection for portable column extraction.
 
         Args:
             schema: Database schema name
@@ -31,21 +46,52 @@ class DatabaseInspector:
             Dictionary mapping column names to data types
         """
         columns = {}
+        cache_key = f"{schema}.{table_name}"
 
         try:
-            #
-
+            # Check cache first
+            if cache_key in self._table_cache:
+                table = self._table_cache[cache_key]
+            else:
+                # Reflect table using SQLAlchemy MetaData
+                table = Table(
+                    table_name,
+                    self.metadata,
+                    autoload_with=self.engine,
+                    schema=schema
+                )
+                self._table_cache[cache_key] = table
 
-
-
-
-                columns[col_name] = col_type
+            # Extract column types
+            for column in table.columns:
+                columns[column.name] = str(column.type)
 
         except Exception as e:
             print(f"Warning: Could not inspect table {schema}.{table_name}: {e}")
 
         return columns
 
+    def reflect_multiple_tables(
+        self, tables: list[tuple[str, str]]
+    ) -> Dict[str, Dict[str, str]]:
+        """
+        Reflect multiple tables in bulk for performance optimization.
+
+        Args:
+            tables: List of (schema, table_name) tuples
+
+        Returns:
+            Dict mapping "schema.table_name" -> {column_name: column_type}
+        """
+        results = {}
+
+        for schema, table_name in tables:
+            cache_key = f"{schema}.{table_name}"
+            results[cache_key] = self.get_table_columns(schema, table_name)
+
+        return results
+
     def close(self):
-        """Close the database connection"""
+        """Close the database connection and clear cache."""
+        self._table_cache.clear()
         self.engine.dispose()
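The reflection added above is plain SQLAlchemy; a minimal standalone sketch of the same technique, with a placeholder connection URI, schema, and table name:

```python
# Minimal sketch of SQLAlchemy table reflection, the technique DatabaseInspector
# now uses. The URI, schema, and table name below are placeholders.
from sqlalchemy import MetaData, Table, create_engine

engine = create_engine("postgresql://user:pass@localhost:5432/analytics")
metadata = MetaData()

# autoload_with triggers reflection: column names and types are read from the
# live database, independent of the specific backend.
orders = Table("orders", metadata, autoload_with=engine, schema="analytics")

column_types = {column.name: str(column.type) for column in orders.columns}
print(column_types)  # e.g. {'order_id': 'INTEGER', 'amount': 'NUMERIC(12, 2)', ...}

engine.dispose()
```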
dbt_cube_sync/core/dbt_parser.py
CHANGED

@@ -57,10 +57,14 @@ class DbtParser:
             print(f"Warning: Could not load catalog file {self.catalog_path}: {e}")
             return None
 
-    def parse_models(self) -> List[DbtModel]:
+    def parse_models(self, node_ids_filter: Optional[List[str]] = None) -> List[DbtModel]:
         """
         Extract models with metrics and columns from manifest
 
+        Args:
+            node_ids_filter: Optional list of node_ids to parse (for incremental sync).
+                             If provided, only these specific nodes are processed.
+
         Returns:
             List of DbtModel instances
         """
@@ -72,6 +76,10 @@ class DbtParser:
             if node_data.get('resource_type') != 'model':
                 continue
 
+            # Apply node_ids filter if specified (for incremental sync)
+            if node_ids_filter is not None and node_id not in node_ids_filter:
+                continue
+
             # Apply model filter if specified
             model_name = node_data.get('name', '')
             if self.model_filter and model_name not in self.model_filter:
@@ -87,6 +95,38 @@ class DbtParser:
             self.db_inspector.close()
 
         return models
+
+    def get_manifest_nodes_with_metrics(self) -> Dict[str, dict]:
+        """
+        Get all manifest nodes that have metrics defined.
+
+        This is used by the StateManager to compare checksums for incremental sync.
+
+        Returns:
+            Dict of node_id -> node_data for all models with metrics
+        """
+        nodes_with_metrics = {}
+        nodes = self.manifest.get('nodes', {})
+
+        for node_id, node_data in nodes.items():
+            # Only process models
+            if node_data.get('resource_type') != 'model':
+                continue
+
+            # Apply model filter if specified
+            model_name = node_data.get('name', '')
+            if self.model_filter and model_name not in self.model_filter:
+                continue
+
+            # Check if model has metrics defined
+            config = node_data.get('config', {})
+            meta = config.get('meta', {})
+            metrics = meta.get('metrics', {})
+
+            if metrics:
+                nodes_with_metrics[node_id] = node_data
+
+        return nodes_with_metrics
 
     def _parse_model(self, node_id: str, node_data: dict) -> DbtModel:
         """Parse a single model from the manifest"""
@@ -114,7 +154,14 @@ class DbtParser:
         )
 
     def _parse_columns(self, node_id: str, node_data: dict) -> Dict[str, DbtColumn]:
-        """
+        """
+        Parse columns for a model using hybrid metadata approach.
+
+        Priority order for column types:
+        1. Manifest `data_type` - When explicitly defined in dbt .yml files
+        2. Catalog `type` - When catalog.json is provided
+        3. SQLAlchemy Reflection - Fallback using database inspector
+        """
         columns = {}
         manifest_columns = node_data.get('columns', {})
 
@@ -123,23 +170,42 @@ class DbtParser:
         if self.catalog and node_id in self.catalog.get('nodes', {}):
             catalog_columns = self.catalog['nodes'][node_id].get('columns', {})
 
-        #
+        # Check if we need database lookup - only if we have columns missing types
+        need_db_lookup = False
+        if manifest_columns:
+            for col_name, col_data in manifest_columns.items():
+                # Check manifest data_type first
+                manifest_data_type = col_data.get('data_type')
+                if manifest_data_type:
+                    continue
+                # Check catalog
+                if col_name in catalog_columns and catalog_columns[col_name].get('type'):
+                    continue
+                # Need database lookup for this column
+                need_db_lookup = True
+                break
+
+        # Get database columns only if needed (lazy loading)
         db_columns = {}
-        if
+        if need_db_lookup and self.db_inspector:
             schema = node_data.get('schema', '')
             table_name = node_data.get('name', '')
             if schema and table_name:
                 db_columns = self.db_inspector.get_table_columns(schema, table_name)
 
-        # If manifest has columns, use them with
+        # If manifest has columns, use them with hybrid type resolution
         if manifest_columns:
             for col_name, col_data in manifest_columns.items():
                 data_type = None
 
-                #
-
+                # Priority 1: Manifest data_type (explicitly defined in dbt .yml)
+                manifest_data_type = col_data.get('data_type')
+                if manifest_data_type:
+                    data_type = manifest_data_type
+                # Priority 2: Catalog type
+                elif col_name in catalog_columns:
                     data_type = catalog_columns[col_name].get('type', '')
-                #
+                # Priority 3: Database reflection
                 elif col_name in db_columns:
                     data_type = db_columns[col_name]
 
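The priority order described in the new `_parse_columns` docstring can be illustrated as a small resolver. A sketch with simplified inputs (plain dicts standing in for manifest, catalog, and reflected columns; not the package's actual helper):

```python
# Sketch of the hybrid column-type resolution used by _parse_columns above:
# manifest data_type wins, then catalog type, then database reflection.
from typing import Dict, Optional


def resolve_column_type(
    col_name: str,
    manifest_col: dict,
    catalog_columns: Dict[str, dict],
    db_columns: Dict[str, str],
) -> Optional[str]:
    # Priority 1: data_type declared in the dbt .yml (manifest)
    if manifest_col.get("data_type"):
        return manifest_col["data_type"]
    # Priority 2: type recorded in catalog.json
    if col_name in catalog_columns and catalog_columns[col_name].get("type"):
        return catalog_columns[col_name]["type"]
    # Priority 3: type reflected from the database
    return db_columns.get(col_name)


print(resolve_column_type(
    "amount",
    {"name": "amount"},               # no data_type in the manifest
    {"amount": {"type": "NUMERIC"}},  # catalog has it, so it wins here
    {"amount": "NUMERIC(12, 2)"},     # reflection would be the last resort
))  # -> "NUMERIC"
```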
dbt_cube_sync/core/models.py
CHANGED

@@ -103,4 +103,20 @@ class SyncResult(BaseModel):
     file_or_dataset: str
     status: str  # 'success' or 'failed'
     message: Optional[str] = None
-    error: Optional[str] = None
+    error: Optional[str] = None
+
+
+class ModelState(BaseModel):
+    """Represents the state of a single model for incremental sync"""
+    checksum: str
+    has_metrics: bool
+    last_generated: str
+    output_file: str
+
+
+class SyncState(BaseModel):
+    """Represents the overall state for incremental sync"""
+    version: str = "1.0"
+    last_sync_timestamp: str
+    manifest_path: str
+    models: Dict[str, ModelState] = {}
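These two pydantic models define the state file's schema. A short sketch of the round-trip the StateManager relies on (pydantic v2 `model_dump()` on save, keyword construction on load), with illustrative values only:

```python
# Sketch: serialising and re-loading the new state models with pydantic v2.
# All values are illustrative.
import json

from dbt_cube_sync.core.models import ModelState, SyncState

state = SyncState(
    last_sync_timestamp="2024-01-15T10:30:00Z",
    manifest_path="/path/to/manifest.json",
    models={
        "model.project.users": ModelState(
            checksum="abc123",
            has_metrics=True,
            last_generated="2024-01-15T10:30:00Z",
            output_file="./cube_output/Users.js",
        )
    },
)

# Save (what StateManager.save_state does)
payload = json.dumps(state.model_dump(), indent=2)

# Load (what StateManager.load_state does)
restored = SyncState(**json.loads(payload))
assert restored.models["model.project.users"].checksum == "abc123"
```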
dbt_cube_sync/core/state_manager.py
ADDED

@@ -0,0 +1,221 @@
+"""
+State management for incremental sync functionality.
+
+Tracks model checksums to enable incremental sync - only regenerate
+Cube.js files for models that have actually changed.
+"""
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Tuple
+
+from .models import ModelState, SyncState
+
+
+class StateManager:
+    """Manages sync state for incremental model generation."""
+
+    def __init__(self, state_path: str = ".dbt-cube-sync-state.json"):
+        """
+        Initialize the StateManager.
+
+        Args:
+            state_path: Path to the state file (default: .dbt-cube-sync-state.json)
+        """
+        self.state_path = Path(state_path)
+        self._state: Optional[SyncState] = None
+
+    def load_state(self) -> Optional[SyncState]:
+        """
+        Load state from file.
+
+        Returns:
+            SyncState if file exists and is valid, None otherwise
+        """
+        if not self.state_path.exists():
+            return None
+
+        try:
+            with open(self.state_path, "r") as f:
+                data = json.load(f)
+            self._state = SyncState(**data)
+            return self._state
+        except (json.JSONDecodeError, Exception) as e:
+            print(f"Warning: Could not load state file: {e}")
+            return None
+
+    def save_state(self, state: SyncState) -> None:
+        """
+        Save state to file.
+
+        Args:
+            state: The SyncState to save
+        """
+        self._state = state
+        with open(self.state_path, "w") as f:
+            json.dump(state.model_dump(), f, indent=2)
+
+    def get_changed_models(
+        self,
+        manifest_nodes: Dict[str, dict],
+        previous_state: Optional[SyncState] = None,
+    ) -> Tuple[Set[str], Set[str], Set[str]]:
+        """
+        Compare manifest nodes against stored state to identify changes.
+
+        Args:
+            manifest_nodes: Dict of node_id -> node data from manifest
+            previous_state: Previous sync state (if None, all models are "added")
+
+        Returns:
+            Tuple of (added_node_ids, modified_node_ids, removed_node_ids)
+        """
+        if previous_state is None:
+            # First run - all models with metrics are "added"
+            added = set(manifest_nodes.keys())
+            return added, set(), set()
+
+        current_node_ids = set(manifest_nodes.keys())
+        previous_node_ids = set(previous_state.models.keys())
+
+        # Find added models (in current but not in previous)
+        added = current_node_ids - previous_node_ids
+
+        # Find removed models (in previous but not in current)
+        removed = previous_node_ids - current_node_ids
+
+        # Find modified models (in both, but checksum changed)
+        modified = set()
+        for node_id in current_node_ids & previous_node_ids:
+            current_checksum = manifest_nodes[node_id].get("checksum", {}).get(
+                "checksum", ""
+            )
+            previous_checksum = previous_state.models[node_id].checksum
+            if current_checksum != previous_checksum:
+                modified.add(node_id)
+
+        return added, modified, removed
+
+    def create_state_from_results(
+        self,
+        manifest_path: str,
+        manifest_nodes: Dict[str, dict],
+        generated_files: Dict[str, str],
+    ) -> SyncState:
+        """
+        Build a new state from sync results.
+
+        Args:
+            manifest_path: Path to the manifest file used
+            manifest_nodes: Dict of node_id -> node data from manifest
+            generated_files: Dict of node_id -> output_file_path
+
+        Returns:
+            New SyncState representing the current state
+        """
+        timestamp = datetime.utcnow().isoformat() + "Z"
+
+        models: Dict[str, ModelState] = {}
+        for node_id, node_data in manifest_nodes.items():
+            if node_id not in generated_files:
+                continue
+
+            checksum = node_data.get("checksum", {}).get("checksum", "")
+            has_metrics = bool(
+                node_data.get("config", {}).get("meta", {}).get("metrics")
+            )
+
+            models[node_id] = ModelState(
+                checksum=checksum,
+                has_metrics=has_metrics,
+                last_generated=timestamp,
+                output_file=generated_files[node_id],
+            )
+
+        return SyncState(
+            version="1.0",
+            last_sync_timestamp=timestamp,
+            manifest_path=str(manifest_path),
+            models=models,
+        )
+
+    def merge_state(
+        self,
+        previous_state: Optional[SyncState],
+        manifest_path: str,
+        manifest_nodes: Dict[str, dict],
+        generated_files: Dict[str, str],
+        removed_node_ids: Set[str],
+    ) -> SyncState:
+        """
+        Merge new sync results with previous state for incremental updates.
+
+        Args:
+            previous_state: Previous sync state (or None for first run)
+            manifest_path: Path to the manifest file used
+            manifest_nodes: Dict of node_id -> node data from manifest
+            generated_files: Dict of node_id -> output_file_path (only newly generated)
+            removed_node_ids: Set of node_ids that were removed
+
+        Returns:
+            Merged SyncState
+        """
+        timestamp = datetime.utcnow().isoformat() + "Z"
+
+        models: Dict[str, ModelState] = {}
+
+        # Start with previous models (excluding removed ones)
+        if previous_state:
+            for node_id, model_state in previous_state.models.items():
+                if node_id not in removed_node_ids:
+                    models[node_id] = model_state
+
+        # Update/add newly generated models
+        for node_id, output_file in generated_files.items():
+            node_data = manifest_nodes.get(node_id, {})
+            checksum = node_data.get("checksum", {}).get("checksum", "")
+            has_metrics = bool(
+                node_data.get("config", {}).get("meta", {}).get("metrics")
+            )
+
+            models[node_id] = ModelState(
+                checksum=checksum,
+                has_metrics=has_metrics,
+                last_generated=timestamp,
+                output_file=output_file,
+            )
+
+        return SyncState(
+            version="1.0",
+            last_sync_timestamp=timestamp,
+            manifest_path=str(manifest_path),
+            models=models,
+        )
+
+    def get_files_to_delete(
+        self,
+        previous_state: Optional[SyncState],
+        removed_node_ids: Set[str],
+    ) -> List[str]:
+        """
+        Get list of output files that should be deleted for removed models.
+
+        Args:
+            previous_state: Previous sync state
+            removed_node_ids: Set of node_ids that were removed
+
+        Returns:
+            List of file paths to delete
+        """
+        if not previous_state:
+            return []
+
+        files_to_delete = []
+        for node_id in removed_node_ids:
+            if node_id in previous_state.models:
+                output_file = previous_state.models[node_id].output_file
+                if os.path.exists(output_file):
+                    files_to_delete.append(output_file)
+
+        return files_to_delete
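Taken together, the new module supports a load, diff, regenerate, merge, save loop. A condensed, self-contained sketch of that flow using the API above; the manifest node and generated file paths are illustrative stand-ins for DbtParser and CubeGenerator output:

```python
# Condensed sketch of the incremental loop built on StateManager above.
from dbt_cube_sync.core.state_manager import StateManager

# Illustrative manifest nodes (node_id -> manifest node dict); in the CLI these
# come from DbtParser.get_manifest_nodes_with_metrics().
manifest_nodes = {
    "model.project.users": {
        "checksum": {"checksum": "abc123"},
        "config": {"meta": {"metrics": {"count_users": {"type": "count"}}}},
    }
}

state_manager = StateManager(".dbt-cube-sync-state.json")
previous_state = state_manager.load_state()  # None on the first run

added, modified, removed = state_manager.get_changed_models(manifest_nodes, previous_state)

# Stand-in for CubeGenerator.generate_cube_files(): node_id -> output file path.
generated_files = {
    node_id: f"./cube_output/{node_id.split('.')[-1]}.js"
    for node_id in (added | modified)
}

if previous_state:
    new_state = state_manager.merge_state(
        previous_state, "target/manifest.json", manifest_nodes, generated_files, removed
    )
else:
    new_state = state_manager.create_state_from_results(
        "target/manifest.json", manifest_nodes, generated_files
    )

state_manager.save_state(new_state)
```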
{dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dbt-cube-sync
-Version: 0.1.0a6
+Version: 0.1.0a8
 Summary: Synchronization tool for dbt models to Cube.js schemas and BI tools
 Author: Ponder
 Requires-Python: >=3.9,<4.0
@@ -144,34 +144,101 @@ connectors:
 
 ## CLI Commands
 
+### Quick Reference
+
+| Command | Description |
+|---------|-------------|
+| `sync-all` | **Ultimate command** - Incremental sync: dbt → Cube.js → Superset → RAG |
+| `dbt-to-cube` | Generate Cube.js schemas from dbt models (with incremental support) |
+| `cube-to-bi` | Sync Cube.js schemas to BI tools (Superset, Tableau, PowerBI) |
+
+---
+
+### `sync-all` (Recommended)
+
+**Ultimate incremental sync command** - handles the complete pipeline with state tracking.
+
+```bash
+# Basic incremental sync (Cube.js only)
+dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output
+
+# Full pipeline: dbt → Cube.js → Superset
+dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output \
+  --superset-url http://localhost:8088 \
+  --superset-username admin \
+  --superset-password admin
+
+# Full pipeline: dbt → Cube.js → Superset → RAG embeddings
+dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output \
+  --superset-url http://localhost:8088 \
+  --superset-username admin \
+  --superset-password admin \
+  --rag-api-url http://localhost:8000
+
+# Force full rebuild (ignore state)
+dbt-cube-sync sync-all -m manifest.json -c catalog.json -o ./cube_output --force-full-sync
+```
+
+**Options:**
+| Option | Required | Description |
+|--------|----------|-------------|
+| `--manifest, -m` | Yes | Path to dbt manifest.json |
+| `--catalog, -c` | No* | Path to dbt catalog.json |
+| `--sqlalchemy-uri, -s` | No* | Database URI for column types |
+| `--output, -o` | Yes | Output directory for Cube.js files |
+| `--state-path` | No | State file path (default: `.dbt-cube-sync-state.json`) |
+| `--force-full-sync` | No | Force full rebuild, ignore state |
+| `--superset-url` | No | Superset URL |
+| `--superset-username` | No | Superset username |
+| `--superset-password` | No | Superset password |
+| `--cube-connection-name` | No | Cube database name in Superset (default: `Cube`) |
+| `--rag-api-url` | No | RAG API URL for embedding updates |
+
+*Either `--catalog` or `--sqlalchemy-uri` is required.
+
+**How Incremental Sync Works:**
+1. Reads state file (`.dbt-cube-sync-state.json`) with model checksums
+2. Compares against current manifest to detect changes
+3. Only processes **added** or **modified** models
+4. Deletes Cube.js files for **removed** models
+5. Updates state file with new checksums
+
+---
+
 ### `dbt-to-cube`
-
+
+Generate Cube.js schema files from dbt models with incremental support.
 
 **Options:**
 - `--manifest` / `-m`: Path to dbt manifest.json file (required)
-- `--catalog` / `-c`: Path to dbt catalog.json file
-- `--sqlalchemy-uri` / `-s`: SQLAlchemy database URI for fetching column types
-
-  - Example: `mysql://user:password@localhost:3306/database`
-  - Example: `snowflake://user:password@account/database/schema`
-- `--models`: Comma-separated list of model names to process (optional, processes all if not specified)
-  - Example: `--models model1,model2,model3`
+- `--catalog` / `-c`: Path to dbt catalog.json file
+- `--sqlalchemy-uri` / `-s`: SQLAlchemy database URI for fetching column types
+- `--models`: Comma-separated list of model names to process
 - `--output` / `-o`: Output directory for Cube.js files (required)
 - `--template-dir` / `-t`: Directory containing Cube.js templates (default: ./cube/templates)
+- `--state-path`: State file for incremental sync (default: `.dbt-cube-sync-state.json`)
+- `--force-full-sync`: Force full regeneration, ignore cached state
+- `--no-state`: Disable state tracking (legacy behavior)
 
 **Examples:**
 ```bash
-#
+# Incremental sync (default)
 dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/
 
+# Force full rebuild
+dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/ --force-full-sync
+
 # Using database connection (no catalog needed)
 dbt-cube-sync dbt-to-cube -m manifest.json -s postgresql://user:pass@localhost/db -o output/
 
 # Filter specific models
-dbt-cube-sync dbt-to-cube -m manifest.json -
+dbt-cube-sync dbt-to-cube -m manifest.json -c catalog.json -o output/ --models users,orders
 ```
 
+---
+
 ### `cube-to-bi`
+
 Sync Cube.js schemas to BI tool datasets.
 
 **Arguments:**
@@ -189,15 +256,29 @@ Sync Cube.js schemas to BI tool datasets.
 dbt-cube-sync cube-to-bi superset -c cube_output/ -u http://localhost:8088 -n admin -p admin -d Cube
 ```
 
-
-
+---
+
+## State File
+
+The state file (`.dbt-cube-sync-state.json`) tracks:
+
+```json
+{
+  "version": "1.0",
+  "last_sync_timestamp": "2024-01-15T10:30:00Z",
+  "manifest_path": "/path/to/manifest.json",
+  "models": {
+    "model.project.users": {
+      "checksum": "abc123...",
+      "has_metrics": true,
+      "last_generated": "2024-01-15T10:30:00Z",
+      "output_file": "./cube_output/Users.js"
+    }
+  }
+}
+```
 
-
-- `--dbt-manifest` / `-m`: Path to dbt manifest.json file
-- `--cube-dir` / `-c`: Directory for Cube.js files
-- `--template-dir` / `-t`: Directory containing Cube.js templates
-- `--bi-connector` / `-b`: BI tool to sync to
-- `--config-file` / `-f`: Configuration file for BI tool connection
+Delete this file to force a full rebuild, or use `--force-full-sync`.
 
 ## Architecture
 
dbt_cube_sync-0.1.0a8.dist-info/RECORD
ADDED

@@ -0,0 +1,18 @@
+dbt_cube_sync/__init__.py,sha256=aifkfgUDRPL5v0LZzceH2LXu66YDkJjdpvKwXsdikbI,113
+dbt_cube_sync/cli.py,sha256=AxSVF3hJJqovk51mjA8Nyyte5NkfukSF3sAjk_VYJ6Y,20992
+dbt_cube_sync/config.py,sha256=qhGE7CxTmh0RhPizgd3x3Yj-3L2LoC00UQIDT0q9FlQ,3858
+dbt_cube_sync/connectors/__init__.py,sha256=NG6tYZ3CYD5bG_MfNLZrUM8YoBEKArG8-AOmJ8pwvQI,52
+dbt_cube_sync/connectors/base.py,sha256=JLzerxJdt34z0kWuyieL6UQhf5_dUYPGmwkiRWBuSPY,2802
+dbt_cube_sync/connectors/powerbi.py,sha256=2Y8fTfh_6Q_Myma1ymipPh1U3HsfQKcktVequXXnIXI,1275
+dbt_cube_sync/connectors/superset.py,sha256=5YEqadVZRPFAJkgvhqkse3JuGJkQHfyvT88jy52ow_0,21429
+dbt_cube_sync/connectors/tableau.py,sha256=jKve1zErzTbgPOtmPB92ZwZl4I6uEySedM51JiwlGrE,1261
+dbt_cube_sync/core/__init__.py,sha256=kgsawtU5dqEvnHz6dU8qwJbH3rtIV7QlK2MhtYVDCaY,46
+dbt_cube_sync/core/cube_generator.py,sha256=DtmaA_dtWmBVJnSWHVoQi-3KEsRc0axHZpCUEcKeYAk,11061
+dbt_cube_sync/core/db_inspector.py,sha256=Ccd9ieGNlwHDHdgMVDEOfjs7R9Mjj904OW1P-mDSsyo,3155
+dbt_cube_sync/core/dbt_parser.py,sha256=KbhDoB0ULP6JDUPZPDVbm9yCtRKrW17ptGoJvVLtueY,12763
+dbt_cube_sync/core/models.py,sha256=2s5iZ9MEBGfSzkB4HJB5vG0mZqNXNJSfAD3Byw1IVe4,3203
+dbt_cube_sync/core/state_manager.py,sha256=7uXJtlZBIWj6s6XgAhNlP6UHdfhH0y461iyQlfidqGI,7233
+dbt_cube_sync-0.1.0a8.dist-info/METADATA,sha256=fsb721DeeHXUeeeLIihijjIiM6x7Wl8fUTeBMzlyoZo,10680
+dbt_cube_sync-0.1.0a8.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
+dbt_cube_sync-0.1.0a8.dist-info/entry_points.txt,sha256=iEAB_nZ1AoSeFwSHPY2tr02xmTHLVFKp5CJeFh0AfCw,56
+dbt_cube_sync-0.1.0a8.dist-info/RECORD,,
@@ -1,17 +0,0 @@
|
|
|
1
|
-
dbt_cube_sync/__init__.py,sha256=aifkfgUDRPL5v0LZzceH2LXu66YDkJjdpvKwXsdikbI,113
|
|
2
|
-
dbt_cube_sync/cli.py,sha256=lZT9vYosnr5NbrMPRAxP_AOSvomqjoFGnPuu9d-vcTM,6896
|
|
3
|
-
dbt_cube_sync/config.py,sha256=qhGE7CxTmh0RhPizgd3x3Yj-3L2LoC00UQIDT0q9FlQ,3858
|
|
4
|
-
dbt_cube_sync/connectors/__init__.py,sha256=NG6tYZ3CYD5bG_MfNLZrUM8YoBEKArG8-AOmJ8pwvQI,52
|
|
5
|
-
dbt_cube_sync/connectors/base.py,sha256=JLzerxJdt34z0kWuyieL6UQhf5_dUYPGmwkiRWBuSPY,2802
|
|
6
|
-
dbt_cube_sync/connectors/powerbi.py,sha256=2Y8fTfh_6Q_Myma1ymipPh1U3HsfQKcktVequXXnIXI,1275
|
|
7
|
-
dbt_cube_sync/connectors/superset.py,sha256=5YEqadVZRPFAJkgvhqkse3JuGJkQHfyvT88jy52ow_0,21429
|
|
8
|
-
dbt_cube_sync/connectors/tableau.py,sha256=jKve1zErzTbgPOtmPB92ZwZl4I6uEySedM51JiwlGrE,1261
|
|
9
|
-
dbt_cube_sync/core/__init__.py,sha256=kgsawtU5dqEvnHz6dU8qwJbH3rtIV7QlK2MhtYVDCaY,46
|
|
10
|
-
dbt_cube_sync/core/cube_generator.py,sha256=o_-fa09F3RQADueIgou8EFhmxKd7PbQ-hCJmXvRuvWM,10839
|
|
11
|
-
dbt_cube_sync/core/db_inspector.py,sha256=eoJl7XG3dPcKg22SEX2dehC8Hvj5hgLR8sUgKiPCIGI,1540
|
|
12
|
-
dbt_cube_sync/core/dbt_parser.py,sha256=vQEUO19WYdeFNnulU2_PD4hdHUtTO-Y9BXfHuH6ZVnM,10192
|
|
13
|
-
dbt_cube_sync/core/models.py,sha256=JjiFAO0vbfVZkKOd6NcZb_JMGSVMTMfQiYjHcZbKtnI,2811
|
|
14
|
-
dbt_cube_sync-0.1.0a6.dist-info/METADATA,sha256=SgI2Sm6jQ748KKn3ZnFGz1qEoIaJmSfMm_Owv9hE5Hc,8274
|
|
15
|
-
dbt_cube_sync-0.1.0a6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
16
|
-
dbt_cube_sync-0.1.0a6.dist-info/entry_points.txt,sha256=iEAB_nZ1AoSeFwSHPY2tr02xmTHLVFKp5CJeFh0AfCw,56
|
|
17
|
-
dbt_cube_sync-0.1.0a6.dist-info/RECORD,,
|
|
{dbt_cube_sync-0.1.0a6.dist-info → dbt_cube_sync-0.1.0a8.dist-info}/entry_points.txt
File without changes