dbt-cube-sync 0.1.0a11__tar.gz → 0.1.0a13__tar.gz

This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.

Potentially problematic release: this version of dbt-cube-sync might be problematic.

Files changed (17)
  1. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/PKG-INFO +1 -1
  2. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/cli.py +91 -57
  3. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/superset.py +17 -7
  4. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/models.py +4 -12
  5. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/state_manager.py +97 -47
  6. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/pyproject.toml +1 -1
  7. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/README.md +0 -0
  8. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/__init__.py +0 -0
  9. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/config.py +0 -0
  10. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/__init__.py +0 -0
  11. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/base.py +0 -0
  12. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/powerbi.py +0 -0
  13. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/connectors/tableau.py +0 -0
  14. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/__init__.py +0 -0
  15. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/cube_generator.py +0 -0
  16. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/db_inspector.py +0 -0
  17. {dbt_cube_sync-0.1.0a11 → dbt_cube_sync-0.1.0a13}/dbt_cube_sync/core/dbt_parser.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dbt-cube-sync
-Version: 0.1.0a11
+Version: 0.1.0a13
 Summary: Synchronization tool for dbt models to Cube.js schemas and BI tools
 Author: Ponder
 Requires-Python: >=3.9,<4.0
dbt_cube_sync/cli.py
@@ -444,16 +444,21 @@ def sync_all(
     manifest, manifest_nodes, {}
 )
 
-# Update cube_sync step state
-current_state = state_manager.update_step_state(
-    current_state,
-    'cube_sync',
-    'failed' if cube_sync_error else 'success',
-    cube_sync_error
-)
+# Save cube sync state
 state_manager.save_state(current_state)
 click.echo(f" State saved to {state_path}")
 
+if cube_sync_error:
+    click.echo(f" Error during cube generation: {cube_sync_error}", err=True)
+
+# Build a mapping from model name (file stem) to node_id for status updates
+model_name_to_node_id = {}
+for node_id in current_state.models.keys():
+    # Extract model name from output file (e.g., "model/cubes/ModelName.js" -> "ModelName")
+    output_file = current_state.models[node_id].output_file
+    model_name = Path(output_file).stem
+    model_name_to_node_id[model_name] = node_id
+
 # ============================================================
 # STEP 2: Sync to Superset (if configured)
 # ============================================================
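
The new mapping leans on a convention: the stem of each generated Cube.js file is the model's name, so connector results (which report file names) can later be traced back to dbt node_ids. A minimal sketch of that lookup, with hypothetical node_ids and paths:

    from pathlib import Path

    # Hypothetical state entries: node_id -> generated Cube.js file
    models = {
        "model.analytics.orders": "model/cubes/Orders.js",
        "model.analytics.customers": "model/cubes/Customers.js",
    }

    # Reverse lookup built the same way as above: file stem -> node_id
    model_name_to_node_id = {Path(f).stem: node_id for node_id, f in models.items()}

    assert model_name_to_node_id["Orders"] == "model.analytics.orders"
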
@@ -462,17 +467,21 @@ def sync_all(
 
 if not superset_url or not superset_username or not superset_password:
     click.echo(" Skipped - no Superset credentials provided")
-    current_state = state_manager.update_step_state(current_state, 'superset_sync', 'skipped')
-    state_manager.save_state(current_state)
 else:
-    should_run_superset = state_manager.should_run_step(
-        'superset_sync', previous_state, changes_detected
-    ) or force_full_sync
+    # Get models that need Superset sync (status is None or 'failed')
+    models_to_sync_ids = state_manager.get_models_needing_sync(current_state, 'superset')
 
-    if not should_run_superset:
-        click.echo(" Skipped - no changes and previous sync succeeded")
+    if not models_to_sync_ids and not force_full_sync:
+        click.echo(" Skipped - all models already synced successfully")
     else:
-        superset_error = None
+        # Convert node_ids to model names for filtering
+        models_to_sync_names = set()
+        for node_id in models_to_sync_ids:
+            if node_id in current_state.models:
+                output_file = current_state.models[node_id].output_file
+                model_name = Path(output_file).stem
+                models_to_sync_names.add(model_name)
+
         try:
             connector_config = {
                 'url': superset_url,
@@ -482,24 +491,34 @@ def sync_all(
             }
 
             connector = ConnectorRegistry.get_connector('superset', **connector_config)
-            results = connector.sync_cube_schemas(output)
+
+            if force_full_sync:
+                results = connector.sync_cube_schemas(output)
+            else:
+                results = connector.sync_cube_schemas(output, models_to_sync_names)
+
+            # Update per-model status
+            for r in results:
+                model_name = r.file_or_dataset.replace('.js', '')
+                node_id = model_name_to_node_id.get(model_name)
+                if node_id:
+                    state_manager.update_model_sync_status(
+                        current_state, node_id, 'superset',
+                        'success' if r.status == 'success' else 'failed'
+                    )
 
             successful = sum(1 for r in results if r.status == 'success')
             failed = sum(1 for r in results if r.status == 'failed')
             click.echo(f" Synced: {successful} successful, {failed} failed")
 
-            if failed > 0:
-                superset_error = f"{failed} datasets failed to sync"
         except Exception as e:
-            superset_error = str(e)
-            click.echo(f" Error: {superset_error}", err=True)
-
-        current_state = state_manager.update_step_state(
-            current_state,
-            'superset_sync',
-            'failed' if superset_error else 'success',
-            superset_error
-        )
+            click.echo(f" Error: {str(e)}", err=True)
+            # Mark all models we tried to sync as failed
+            for node_id in models_to_sync_ids:
+                state_manager.update_model_sync_status(
+                    current_state, node_id, 'superset', 'failed'
+                )
+
 
 state_manager.save_state(current_state)
 
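
The per-model bookkeeping assumes each SyncResult names the file it processed. A sketch of the round trip with a stand-in SyncResult carrying only the two fields the loop above reads (file_or_dataset and status):

    from dataclasses import dataclass

    @dataclass
    class SyncResult:  # stand-in; the real class lives in the connector package
        file_or_dataset: str
        status: str

    model_name_to_node_id = {"Orders": "model.analytics.orders"}
    results = [SyncResult("Orders.js", "success")]

    statuses = {}
    for r in results:
        node_id = model_name_to_node_id.get(r.file_or_dataset.replace('.js', ''))
        if node_id:
            statuses[node_id] = 'success' if r.status == 'success' else 'failed'

    assert statuses == {"model.analytics.orders": "success"}

One subtlety: replace('.js', '') strips the substring anywhere in the name, so a file like my.js.model.js would not round-trip; Path(name).stem, as used when building the map, would be the stricter choice.
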
@@ -510,17 +529,16 @@ def sync_all(
 
 if not rag_api_url:
     click.echo(" Skipped - no RAG API URL provided")
-    current_state = state_manager.update_step_state(current_state, 'rag_sync', 'skipped')
-    state_manager.save_state(current_state)
 else:
-    should_run_rag = state_manager.should_run_step(
-        'rag_sync', previous_state, changes_detected
-    ) or force_full_sync
+    # Get models that need RAG sync (status is None or 'failed')
+    models_to_embed_ids = state_manager.get_models_needing_sync(current_state, 'rag')
 
-    if not should_run_rag:
-        click.echo(" Skipped - no changes and previous sync succeeded")
+    if not models_to_embed_ids and not force_full_sync:
+        click.echo(" Skipped - all models already synced successfully")
     else:
-        rag_error = None
+        if force_full_sync:
+            models_to_embed_ids = set(current_state.models.keys())
+
         try:
             # Call the RAG API to re-ingest embeddings
             response = requests.post(
@@ -532,19 +550,26 @@ def sync_all(
             if response.status_code == 200:
                 result = response.json()
                 click.echo(f" Ingested {result.get('schemas_ingested', 0)} schema documents")
+                # Mark all models as succeeded
+                for node_id in models_to_embed_ids:
+                    state_manager.update_model_sync_status(
+                        current_state, node_id, 'rag', 'success'
+                    )
             else:
-                rag_error = f"RAG API returned {response.status_code}"
-                click.echo(f" Error: {rag_error}", err=True)
+                click.echo(f" Error: RAG API returned {response.status_code}", err=True)
+                # Mark all models as failed
+                for node_id in models_to_embed_ids:
+                    state_manager.update_model_sync_status(
+                        current_state, node_id, 'rag', 'failed'
+                    )
         except requests.RequestException as e:
-            rag_error = str(e)
-            click.echo(f" Error: Could not reach RAG API: {rag_error}", err=True)
-
-        current_state = state_manager.update_step_state(
-            current_state,
-            'rag_sync',
-            'failed' if rag_error else 'success',
-            rag_error
-        )
+            click.echo(f" Error: Could not reach RAG API: {e}", err=True)
+            # Mark all models as failed
+            for node_id in models_to_embed_ids:
+                state_manager.update_model_sync_status(
+                    current_state, node_id, 'rag', 'failed'
+                )
+
 state_manager.save_state(current_state)
 
 # ============================================================
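
Note the contrast with the Superset step: the RAG sync is all-or-nothing, since a single POST re-ingests every schema document, so every pending model is stamped with the same outcome. The bookkeeping reduces to a sketch like:

    def rag_mark_all(state_manager, state, node_ids, status):
        # One API call covers every schema, so all pending models share one result
        for node_id in node_ids:
            state_manager.update_model_sync_status(state, node_id, 'rag', status)
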
@@ -554,10 +579,23 @@ def sync_all(
 click.echo("SYNC COMPLETE")
 click.echo("=" * 60)
 
-# Show step statuses
-click.echo(f" Cube sync: {current_state.cube_sync.status if current_state.cube_sync else 'unknown'}")
-click.echo(f" Superset sync: {current_state.superset_sync.status if current_state.superset_sync else 'unknown'}")
-click.echo(f" RAG sync: {current_state.rag_sync.status if current_state.rag_sync else 'unknown'}")
+# Get per-model sync summaries
+superset_summary = state_manager.get_sync_summary(current_state, 'superset')
+rag_summary = state_manager.get_sync_summary(current_state, 'rag')
+
+def format_summary(summary, step_configured):
+    if not step_configured:
+        return "skipped (not configured)"
+    if summary['failed'] > 0:
+        return f"{summary['success']} success, {summary['failed']} failed (will retry)"
+    elif summary['pending'] > 0:
+        return f"{summary['success']} success, {summary['pending']} pending"
+    else:
+        return f"{summary['success']} success"
+
+click.echo(f" Cube.js files: {len(current_state.models)} models")
+click.echo(f" Superset sync: {format_summary(superset_summary, superset_url)}")
+click.echo(f" RAG sync: {format_summary(rag_summary, rag_api_url)}")
 
 if changes_detected or force_full_sync:
     click.echo(f" Models processed: {len(added_models) + len(modified_models)}")
@@ -566,14 +604,10 @@ def sync_all(
 else:
     click.echo(" No model changes detected")
 
-# Exit with error if any step failed
-any_failed = (
-    (current_state.cube_sync and current_state.cube_sync.status == 'failed') or
-    (current_state.superset_sync and current_state.superset_sync.status == 'failed') or
-    (current_state.rag_sync and current_state.rag_sync.status == 'failed')
-)
+# Exit with error if any models failed
+any_failed = superset_summary['failed'] > 0 or rag_summary['failed'] > 0
 if any_failed:
-    click.echo("\n ⚠️ Some steps failed - they will be retried on next run")
+    click.echo("\n ⚠️ Some models failed - they will be retried on next run")
     sys.exit(1)
 
 except Exception as e:
dbt_cube_sync/connectors/superset.py
@@ -123,20 +123,30 @@ class SupersetConnector(BaseConnector):
         self.database_id = result[0]['id']
         print(f"✓ Found database '{database_name}' with ID: {self.database_id}")
 
-    def sync_cube_schemas(self, cube_dir: str) -> List[SyncResult]:
-        """Sync all Cube.js schemas from directory to Superset"""
+    def sync_cube_schemas(self, cube_dir: str, models_filter: set = None) -> List[SyncResult]:
+        """Sync Cube.js schemas from directory to Superset
+
+        Args:
+            cube_dir: Directory containing Cube.js schema files
+            models_filter: Optional set of model names to sync. If None, sync all.
+        """
         results = []
         cube_files = self._get_cube_files(cube_dir)
-
+
         if not cube_files:
             return [SyncResult(
                 file_or_dataset="No files",
-                status="failed",
+                status="failed",
                 message=f"No .js files found in {cube_dir}"
             )]
-
-        print(f"🔍 Found {len(cube_files)} Cube.js files")
-
+
+        # Filter files if models_filter is provided
+        if models_filter:
+            cube_files = [f for f in cube_files if f.stem in models_filter]
+            print(f"🔍 Syncing {len(cube_files)} Cube.js files (filtered from {len(self._get_cube_files(cube_dir))})")
+        else:
+            print(f"🔍 Found {len(cube_files)} Cube.js files")
+
         for cube_file in cube_files:
             try:
                 print(f"\n{'='*60}")
dbt_cube_sync/core/models.py
@@ -112,13 +112,9 @@ class ModelState(BaseModel):
     has_metrics: bool
     last_generated: str
     output_file: str
-
-
-class StepState(BaseModel):
-    """Represents the state of a pipeline step"""
-    status: str  # 'success', 'failed', 'skipped'
-    last_run: Optional[str] = None
-    error: Optional[str] = None
+    # Per-model sync status for each step
+    superset_sync_status: Optional[str] = None  # 'success', 'failed', or None (not attempted)
+    rag_sync_status: Optional[str] = None  # 'success', 'failed', or None (not attempted)
 
 
 class SyncState(BaseModel):
@@ -126,8 +122,4 @@ class SyncState(BaseModel):
     version: str = "1.1"
     last_sync_timestamp: str
     manifest_path: str
-    models: Dict[str, ModelState] = {}
-    # Step states for tracking pipeline progress
-    cube_sync: Optional[StepState] = None
-    superset_sync: Optional[StepState] = None
-    rag_sync: Optional[StepState] = None
+    models: Dict[str, ModelState] = {}
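
For illustration, a state file under the new schema might look like the following (hypothetical values); the step-level cube_sync/superset_sync/rag_sync blocks are gone, replaced by two per-model fields:

    # Hypothetical contents of the saved sync state (schema version 1.1)
    state = {
        "version": "1.1",
        "last_sync_timestamp": "2024-01-01T00:00:00Z",
        "manifest_path": "target/manifest.json",
        "models": {
            "model.analytics.orders": {
                "checksum": "3fa2...",  # combined SHA-256, see state_manager.py below
                "has_metrics": True,
                "last_generated": "2024-01-01T00:00:00Z",
                "output_file": "model/cubes/Orders.js",
                "superset_sync_status": "success",
                "rag_sync_status": None,  # pending: retried on the next run
            },
        },
    }
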
dbt_cube_sync/core/state_manager.py
@@ -4,13 +4,42 @@ State management for incremental sync functionality.
 Tracks model checksums to enable incremental sync - only regenerate
 Cube.js files for models that have actually changed.
 """
+import hashlib
 import json
 import os
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, List, Optional, Set, Tuple
 
-from .models import ModelState, StepState, SyncState
+from .models import ModelState, SyncState
+
+
+def compute_model_checksum(node_data: dict) -> str:
+    """
+    Compute a checksum that includes both the dbt model checksum
+    and the metrics/meta configuration.
+
+    This ensures that changes to metrics (which don't change the SQL)
+    are still detected as modifications.
+
+    Args:
+        node_data: The node data from the dbt manifest
+
+    Returns:
+        A combined SHA256 checksum string
+    """
+    # Get the base dbt checksum
+    base_checksum = node_data.get("checksum", {}).get("checksum", "")
+
+    # Get the meta configuration (where metrics are defined)
+    meta = node_data.get("config", {}).get("meta", {})
+
+    # Serialize meta to a stable JSON string (sorted keys for consistency)
+    meta_json = json.dumps(meta, sort_keys=True, default=str)
+
+    # Combine and hash
+    combined = f"{base_checksum}:{meta_json}"
+    return hashlib.sha256(combined.encode()).hexdigest()
 
 
 class StateManager:
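
The point of the combined checksum is that a metrics-only edit changes the hash even though dbt's own SQL checksum is untouched. A self-contained check of that property, using toy manifest nodes rather than real dbt output:

    import hashlib
    import json

    def compute_model_checksum(node_data: dict) -> str:
        base = node_data.get("checksum", {}).get("checksum", "")
        meta = node_data.get("config", {}).get("meta", {})
        combined = f"{base}:{json.dumps(meta, sort_keys=True, default=str)}"
        return hashlib.sha256(combined.encode()).hexdigest()

    same_sql = {"checksum": {"checksum": "abc123"}}
    node_v1 = {**same_sql, "config": {"meta": {"metrics": [{"name": "revenue"}]}}}
    node_v2 = {**same_sql, "config": {"meta": {"metrics": [{"name": "revenue"}, {"name": "margin"}]}}}

    # Identical dbt checksum, different metrics -> different combined checksum
    assert compute_model_checksum(node_v1) != compute_model_checksum(node_v2)
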
@@ -86,11 +115,11 @@ class StateManager:
         removed = previous_node_ids - current_node_ids
 
         # Find modified models (in both, but checksum changed)
+        # Note: We compute a combined checksum that includes metrics/meta config,
+        # not just the dbt SQL checksum. This ensures metric changes are detected.
         modified = set()
         for node_id in current_node_ids & previous_node_ids:
-            current_checksum = manifest_nodes[node_id].get("checksum", {}).get(
-                "checksum", ""
-            )
+            current_checksum = compute_model_checksum(manifest_nodes[node_id])
             previous_checksum = previous_state.models[node_id].checksum
             if current_checksum != previous_checksum:
                 modified.add(node_id)
@@ -121,7 +150,8 @@ class StateManager:
             if node_id not in generated_files:
                 continue
 
-            checksum = node_data.get("checksum", {}).get("checksum", "")
+            # Use combined checksum that includes metrics/meta config
+            checksum = compute_model_checksum(node_data)
             has_metrics = bool(
                 node_data.get("config", {}).get("meta", {}).get("metrics")
             )
@@ -174,20 +204,24 @@ class StateManager:
         # Update/add newly generated models
         for node_id, output_file in generated_files.items():
             node_data = manifest_nodes.get(node_id, {})
-            checksum = node_data.get("checksum", {}).get("checksum", "")
+            # Use combined checksum that includes metrics/meta config
+            checksum = compute_model_checksum(node_data)
             has_metrics = bool(
                 node_data.get("config", {}).get("meta", {}).get("metrics")
            )
 
+            # For newly generated/modified models, reset sync status (they need to be re-synced)
             models[node_id] = ModelState(
                 checksum=checksum,
                 has_metrics=has_metrics,
                 last_generated=timestamp,
                 output_file=output_file,
+                superset_sync_status=None,  # Reset - needs sync
+                rag_sync_status=None,  # Reset - needs sync
             )
 
         return SyncState(
-            version="1.0",
+            version="1.1",
             last_sync_timestamp=timestamp,
             manifest_path=str(manifest_path),
             models=models,
@@ -220,64 +254,80 @@ class StateManager:
 
         return files_to_delete
 
-    def should_run_step(
+    def get_models_needing_sync(
         self,
-        step_name: str,
-        previous_state: Optional[SyncState],
-        changes_detected: bool,
-    ) -> bool:
+        state: SyncState,
+        step: str,
+    ) -> Set[str]:
         """
-        Determine if a pipeline step should run.
+        Get node_ids of models that need to be synced for a step.
 
-        A step should run if:
-        - There are changes detected, OR
-        - The previous run of this step failed
+        A model needs sync if:
+        - Its sync status is None (never synced)
+        - Its sync status is 'failed' (needs retry)
 
         Args:
-            step_name: Name of the step ('cube_sync', 'superset_sync', 'rag_sync')
-            previous_state: Previous sync state
-            changes_detected: Whether model changes were detected
+            state: Current sync state
+            step: Step name ('superset' or 'rag')
 
         Returns:
-            True if the step should run
+            Set of node_ids that need syncing
         """
-        if changes_detected:
-            return True
+        models_to_sync = set()
+        status_field = f"{step}_sync_status"
 
-        if previous_state is None:
-            return True
-
-        step_state = getattr(previous_state, step_name, None)
-        if step_state is None:
-            return True
+        for node_id, model_state in state.models.items():
+            status = getattr(model_state, status_field, None)
+            if status is None or status == 'failed':
+                models_to_sync.add(node_id)
 
-        # Re-run if previous attempt failed
-        return step_state.status == 'failed'
+        return models_to_sync
 
-    def update_step_state(
+    def update_model_sync_status(
         self,
         state: SyncState,
-        step_name: str,
+        node_id: str,
+        step: str,
         status: str,
-        error: Optional[str] = None,
-    ) -> SyncState:
+    ) -> None:
         """
-        Update the state of a pipeline step.
+        Update the sync status of a model for a specific step.
 
         Args:
             state: Current sync state
-            step_name: Name of the step ('cube_sync', 'superset_sync', 'rag_sync')
-            status: Step status ('success', 'failed', 'skipped')
-            error: Error message if failed
+            node_id: The model's node_id
+            step: Step name ('superset' or 'rag')
+            status: Status to set ('success' or 'failed')
+        """
+        if node_id in state.models:
+            status_field = f"{step}_sync_status"
+            setattr(state.models[node_id], status_field, status)
+
+    def get_sync_summary(
+        self,
+        state: SyncState,
+        step: str,
+    ) -> Dict[str, int]:
+        """
+        Get a summary of sync status for a step.
+
+        Args:
+            state: Current sync state
+            step: Step name ('superset' or 'rag')
 
         Returns:
-            Updated SyncState
+            Dict with counts: {'success': N, 'failed': N, 'pending': N}
         """
-        timestamp = datetime.utcnow().isoformat() + "Z"
-        step_state = StepState(
-            status=status,
-            last_run=timestamp,
-            error=error,
-        )
-        setattr(state, step_name, step_state)
-        return state
+        status_field = f"{step}_sync_status"
+        summary = {'success': 0, 'failed': 0, 'pending': 0}
+
+        for model_state in state.models.values():
+            status = getattr(model_state, status_field, None)
+            if status == 'success':
+                summary['success'] += 1
+            elif status == 'failed':
+                summary['failed'] += 1
+            else:
+                summary['pending'] += 1
+
+        return summary
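
Taken together, the three methods give each downstream step a uniform retry loop. A sketch of the intended call pattern, where sync_one is a hypothetical per-model callable standing in for the actual connector work:

    def retry_step(state_manager, state, step, sync_one):
        # sync_one(node_id) -> bool: hypothetical stand-in for the real sync call
        for node_id in state_manager.get_models_needing_sync(state, step):
            ok = sync_one(node_id)
            state_manager.update_model_sync_status(state, node_id, step, 'success' if ok else 'failed')
        return state_manager.get_sync_summary(state, step)  # {'success': N, 'failed': N, 'pending': N}
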
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dbt-cube-sync"
-version = "0.1.0a11"
+version = "0.1.0a13"
 description = "Synchronization tool for dbt models to Cube.js schemas and BI tools"
 authors = ["Ponder"]
 readme = "README.md"