dbt-cube-sync 0.1.0a1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in the public registry.

Potentially problematic release: this version of dbt-cube-sync has been flagged as possibly problematic.

@@ -0,0 +1,556 @@
+ """
+ Superset connector for syncing Cube.js schemas
+ """
+ import os
+ import re
+ import json
+ import requests
+ import time
+ from typing import Dict, List, Any, Optional
+ from pathlib import Path
+
+ from .base import BaseConnector, ConnectorRegistry
+ from ..core.models import SyncResult
+
+
+ class SupersetConnector(BaseConnector):
+     """Connector for Apache Superset BI tool"""
+
+     def __init__(self, url: str, username: str, password: str, database_name: str = "Cube", **kwargs):
+         """
+         Initialize Superset connector
+
+         Args:
+             url: Superset base URL (e.g., 'http://localhost:8088')
+             username: Superset username
+             password: Superset password
+             database_name: Name of the database in Superset (default: "Cube")
+         """
+         super().__init__(
+             url=url,
+             username=username,
+             password=password,
+             database_name=database_name,
+             **kwargs
+         )
+
+         self.base_url = url.rstrip('/')
+         self.session = requests.Session()
+         self.access_token = None
+         self.csrf_token = None
+         self.database_id = None
+
+         self.connect()
+
+     def _validate_config(self) -> None:
+         """Validate the provided configuration"""
+         required_fields = ['url', 'username', 'password']
+         missing_fields = [field for field in required_fields if not self.config.get(field)]
+
+         if missing_fields:
+             raise ValueError(f"Missing required configuration fields: {missing_fields}")
+
+     def connect(self) -> None:
+         """Establish connection to Superset"""
+         self._login()
+         self._get_csrf_token()
+         self._get_database_id()
+
+     def _login(self):
+         """Authenticate and get JWT token"""
+         login_url = f"{self.base_url}/api/v1/security/login"
+         payload = {
+             "username": self.config['username'],
+             "password": self.config['password'],
+             "provider": "db",
+             "refresh": True
+         }
+
+         response = self.session.post(login_url, json=payload)
+         response.raise_for_status()
+
+         data = response.json()
+         self.access_token = data.get('access_token')
+
+         # Set authorization header for all future requests
+         self.session.headers.update({
+             'Authorization': f'Bearer {self.access_token}',
+             'Content-Type': 'application/json'
+         })
+
+         print("āœ“ Successfully logged in to Superset")
+
+     def _get_csrf_token(self):
+         """Get CSRF token for POST requests"""
+         csrf_url = f"{self.base_url}/api/v1/security/csrf_token/"
+         response = self.session.get(csrf_url)
+         response.raise_for_status()
+
+         self.csrf_token = response.json().get('result')
+         self.session.headers.update({'X-CSRFToken': self.csrf_token})
+
+         print("āœ“ Retrieved CSRF token")
+
+     def _get_database_id(self):
+         """Get database ID by name"""
+         database_name = self.config.get('database_name', 'Cube')
+         databases_url = f"{self.base_url}/api/v1/database/"
+         params = {
+             "q": json.dumps({
+                 "filters": [
+                     {
+                         "col": "database_name",
+                         "opr": "eq",
+                         "value": database_name
+                     }
+                 ]
+             })
+         }
+
+         response = self.session.get(databases_url, params=params)
+         response.raise_for_status()
+
+         result = response.json().get('result', [])
+         if not result:
+             raise ValueError(f"Database '{database_name}' not found")
+
+         self.database_id = result[0]['id']
+         print(f"āœ“ Found database '{database_name}' with ID: {self.database_id}")
+
+     def sync_cube_schemas(self, cube_dir: str) -> List[SyncResult]:
+         """Sync all Cube.js schemas from directory to Superset"""
+         results = []
+         cube_files = self._get_cube_files(cube_dir)
+
+         if not cube_files:
+             return [SyncResult(
+                 file_or_dataset="No files",
+                 status="failed",
+                 message=f"No .js files found in {cube_dir}"
+             )]
+
+         print(f"šŸ” Found {len(cube_files)} Cube.js files")
+
+         for cube_file in cube_files:
+             try:
+                 print(f"\n{'='*60}")
+                 print(f"Processing: {cube_file.name}")
+                 print(f"{'='*60}")
+
+                 result = self.sync_single_schema(str(cube_file))
+                 results.append(result)
+
+             except Exception as e:
+                 print(f"āœ— Error processing {cube_file.name}: {str(e)}")
+                 results.append(SyncResult(
+                     file_or_dataset=cube_file.name,
+                     status="failed",
+                     error=str(e)
+                 ))
+
+         # Print summary
+         successful = sum(1 for r in results if r.status == 'success')
+         failed = sum(1 for r in results if r.status == 'failed')
+         print(f"\n{'='*60}")
+         print("SYNC SUMMARY")
+         print(f"{'='*60}")
+         print(f"āœ“ Successful: {successful}")
+         print(f"āœ— Failed: {failed}")
+         print(f"Total: {len(results)}")
+
+         return results
+
+     def sync_single_schema(self, cube_file_path: str) -> SyncResult:
+         """Sync a single Cube.js schema file to Superset"""
+         try:
+             schema_info = self._parse_cube_file(cube_file_path)
+             dataset_id = self._create_or_update_dataset(schema_info)
+
+             return SyncResult(
+                 file_or_dataset=Path(cube_file_path).name,
+                 status="success",
+                 message=f"Dataset created/updated with ID: {dataset_id}"
+             )
+
+         except Exception as e:
+             return SyncResult(
+                 file_or_dataset=Path(cube_file_path).name,
+                 status="failed",
+                 error=str(e)
+             )
+
+     def _parse_cube_file(self, file_path: str) -> Dict[str, Any]:
+         """Parse Cube.js schema file and extract metadata"""
+         with open(file_path, 'r') as f:
+             content = f.read()
+
+         # Extract cube name
+         cube_name_match = re.search(r'cube\s*\(\s*[`"\']([^`"\']+)[`"\']', content)
+         if not cube_name_match:
+             raise ValueError(f"Could not find cube name in {file_path}")
+
+         cube_name = cube_name_match.group(1)
+
+         # Extract schema and table name from SQL query
+         sql_match = re.search(r'sql:\s*[`"\']\s*SELECT\s+.*FROM\s+(\w+\.\w+)', content, re.IGNORECASE)
+         if sql_match:
+             schema_table = sql_match.group(1)
+             schema_name, table_name = schema_table.split('.')
+         else:
+             # Fallback to cube name
+             schema_name = "public"
+             table_name = cube_name
+
+         print(f" Cube: {cube_name}")
+         print(f" Schema: {schema_name}")
+         print(f" Table: {table_name}")
+
+         # Parse dimensions
+         dimensions = self._parse_dimensions(content)
+
+         # Parse measures
+         measures = self._parse_measures(content)
+
+         return {
+             'cube_name': cube_name,
+             'schema': schema_name,
+             'table_name': table_name,
+             'dimensions': dimensions,
+             'measures': measures
+         }
+
+     def _parse_dimensions(self, content: str) -> List[Dict[str, Any]]:
+         """Extract dimensions from Cube.js file"""
+         dimensions = []
+
+         # Find dimensions block
+         dimensions_match = re.search(
+             r'dimensions:\s*\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*?)\}',
+             content,
+             re.DOTALL
+         )
+
+         if not dimensions_match:
+             print(" āš ļø No dimensions block found")
+             return dimensions
+
+         dimensions_block = dimensions_match.group(1)
+
+         # Parse individual dimensions
+         dimension_pattern = r'(\w+):\s*\{([^}]*(?:\{[^}]*\}[^}]*)*)\}(?=\s*,|\s*$)'
+         matches = list(re.finditer(dimension_pattern, dimensions_block))
+
+         print(f" Found {len(matches)} dimensions in Cube.js file")
+
+         for match in matches:
+             dim_name = match.group(1)
+             dim_content = match.group(2)
+
+             # Extract sql field (actual column name)
+             sql_match = re.search(r'sql:\s*`([^`]+)`', dim_content)
+             column_name = sql_match.group(1).strip() if sql_match else dim_name
+
+             # Extract type
+             type_match = re.search(r'type:\s*[`"\']([^`"\']+)[`"\']', dim_content)
+             dim_type = type_match.group(1) if type_match else 'string'
+
+             # Extract title/description
+             title_match = re.search(r'title:\s*[\'"]([^\'\"]+)[\'"]', dim_content)
+             description = title_match.group(1) if title_match else dim_name.replace('_', ' ').title()
+
+             verbose_name = dim_name.replace('_', ' ').title()
+
+             dimensions.append({
+                 'column_name': column_name,
+                 'type': self._map_cube_type_to_superset(dim_type),
+                 'verbose_name': verbose_name,
+                 'description': description,
+                 'is_dttm': dim_type == 'time',
+                 'groupby': True,
+                 'filterable': True
+             })
+
+             print(f" - {dim_name} ({column_name})")
+
+         return dimensions
+
+     def _parse_measures(self, content: str) -> List[Dict[str, Any]]:
+         """Extract measures from Cube.js file"""
+         measures = []
+
+         # Find measures block
+         measures_match = re.search(
+             r'measures:\s*\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*?)\}',
+             content,
+             re.DOTALL
+         )
+
+         if not measures_match:
+             print(" āš ļø No measures block found")
+             return measures
+
+         measures_block = measures_match.group(1)
+
+         # Parse individual measures
+         measure_pattern = r'(\w+):\s*\{([^}]*(?:\{[^}]*\}[^}]*)*)\}(?=\s*,|\s*$)'
+         matches = list(re.finditer(measure_pattern, measures_block))
+
+         print(f" Found {len(matches)} measures in Cube.js file")
+
+         for match in matches:
+             measure_name = match.group(1)
+             measure_content = match.group(2)
+
+             # Extract type
+             type_match = re.search(r'type:\s*[`"\']([^`"\']+)[`"\']', measure_content)
+             measure_type = type_match.group(1) if type_match else 'sum'
+
+             # Extract sql
+             sql_match = re.search(r'sql:\s*`([^`]+)`', measure_content)
+             sql_expression = sql_match.group(1).strip() if sql_match else measure_name
+
+             # Extract title
+             title_match = re.search(r'title:\s*[\'"]([^\'\"]+)[\'"]', measure_content)
+             metric_name = title_match.group(1) if title_match else measure_name.replace('_', ' ').title()
+
+             # Map Cube.js aggregation type to SQL aggregate
+             expression = self._create_metric_expression(measure_type, sql_expression)
+
+             measures.append({
+                 'metric_name': metric_name,
+                 'expression': expression,
+                 'description': metric_name,
+                 'verbose_name': metric_name,
+                 'metric_type': measure_type
+             })
+
+             print(f" - {metric_name}")
+
+         return measures
+
+     def _map_cube_type_to_superset(self, cube_type: str) -> str:
+         """Map Cube.js types to Superset/SQL types"""
+         type_mapping = {
+             'string': 'VARCHAR',
+             'number': 'NUMERIC',
+             'time': 'TIMESTAMP',
+             'boolean': 'BOOLEAN'
+         }
+         return type_mapping.get(cube_type, 'VARCHAR')
+
+     def _create_metric_expression(self, agg_type: str, sql_expression: str) -> str:
+         """Create SQL metric expression from Cube.js measure"""
+         agg_mapping = {
+             'sum': 'SUM',
+             'avg': 'AVG',
+             'count': 'COUNT',
+             'min': 'MIN',
+             'max': 'MAX',
+             'count_distinct': 'COUNT(DISTINCT'
+         }
+
+         agg_func = agg_mapping.get(agg_type, 'SUM')
+
+         if agg_type == 'count_distinct':
+             return f"{agg_func} {sql_expression})"
+         else:
+             return f"{agg_func}({sql_expression})"
+
+     def _create_or_update_dataset(self, schema_info: Dict[str, Any]) -> int:
+         """Create a new dataset or update existing one"""
+         # Check if dataset already exists
+         existing_id = self._find_existing_dataset(
+             schema_info['schema'],
+             schema_info['table_name']
+         )
+
+         if existing_id:
+             print(f"\nšŸ”„ Dataset already exists (ID: {existing_id}), updating...")
+             self._update_dataset_metadata(existing_id, schema_info)
+             return existing_id
+         else:
+             return self._create_new_dataset(schema_info)
+
+     def _find_existing_dataset(self, schema_name: str, table_name: str) -> Optional[int]:
+         """Find existing dataset by schema and table name"""
+         dataset_url = f"{self.base_url}/api/v1/dataset/"
+         params = {
+             "q": json.dumps({
+                 "filters": [
+                     {
+                         "col": "table_name",
+                         "opr": "eq",
+                         "value": table_name
+                     },
+                     {
+                         "col": "schema",
+                         "opr": "eq",
+                         "value": schema_name
+                     },
+                     {
+                         "col": "database",
+                         "opr": "rel_o_m",
+                         "value": self.database_id
+                     }
+                 ]
+             })
+         }
+
+         response = self.session.get(dataset_url, params=params)
+         if response.status_code == 200:
+             results = response.json().get('result', [])
+             if results:
+                 return results[0]['id']
+
+         return None
+
+     def _create_new_dataset(self, schema_info: Dict[str, Any]) -> int:
+         """Create a new dataset in Superset"""
+         dataset_url = f"{self.base_url}/api/v1/dataset/"
+
+         payload = {
+             "database": self.database_id,
+             "schema": schema_info['schema'],
+             "table_name": schema_info['table_name'],
+             "normalize_columns": False,
+             "always_filter_main_dttm": False
+         }
+
+         print(f"\nšŸ“Š Creating new dataset: {schema_info['table_name']}")
+         response = self.session.post(dataset_url, json=payload)
+
+         if response.status_code == 201:
+             dataset_id = response.json()['id']
+             print(f"āœ“ Dataset created with ID: {dataset_id}")
+
+             # Update dataset with columns and metrics
+             self._update_dataset_metadata(dataset_id, schema_info)
+
+             return dataset_id
+         else:
+             print(f"āœ— Failed to create dataset: {response.status_code}")
+             print(f"Response: {response.text}")
+             raise Exception(f"Failed to create dataset: {response.text}")
+
+     def _update_dataset_metadata(self, dataset_id: int, schema_info: Dict[str, Any]):
+         """Update dataset with column descriptions and metrics"""
+         dataset_url = f"{self.base_url}/api/v1/dataset/{dataset_id}"
+
+         # Refresh dataset to get all columns
+         print(f"\nšŸ”„ Step 1: Refreshing dataset to fetch columns...")
+         refresh_url = f"{self.base_url}/api/v1/dataset/{dataset_id}/refresh"
+         self.session.put(refresh_url)
+         time.sleep(2)  # Wait for refresh
+
+         # Get current dataset info
+         print(f"\nšŸ“„ Step 2: Fetching dataset details...")
+         response = self.session.get(dataset_url)
+         if response.status_code != 200:
+             print(f"āœ— Failed to get dataset info: {response.status_code}")
+             return
+
+         dataset_data = response.json()['result']
+         existing_columns = dataset_data.get('columns', [])
+         existing_metrics = dataset_data.get('metrics', [])
+
+         # Update columns
+         print(f"\nšŸ·ļø Step 3: Updating column metadata...")
+         updated_columns = self._update_columns(existing_columns, schema_info['dimensions'])
+
+         # Update metrics
+         print(f"\nšŸ“Š Step 4: Adding metrics to dataset...")
+         updated_metrics = self._update_metrics(existing_metrics, schema_info['measures'])
+
+         # Send updates
+         if updated_columns:
+             print(f"\nšŸ’¾ Step 5: Saving updates...")
+             update_payload = {
+                 'columns': updated_columns,
+                 'metrics': updated_metrics
+             }
+
+             response = self.session.put(dataset_url, json=update_payload)
+             if response.status_code == 200:
+                 print(f"āœ“ Dataset updated successfully")
+             else:
+                 print(f"āœ— Failed to update dataset: {response.status_code}")
+                 print(f"Response: {response.text}")
+
+     def _update_columns(self, existing_columns: List[dict], dimensions: List[dict]) -> List[dict]:
+         """Update columns with metadata from dimensions"""
+         updated_columns = []
+
+         for col in existing_columns:
+             col_name = col['column_name']
+
+             # Find matching dimension
+             matching_dim = next(
+                 (d for d in dimensions if d['column_name'].lower() == col_name.lower()),
+                 None
+             )
+
+             if matching_dim:
+                 # Clean and update column
+                 updated_col = {k: v for k, v in col.items()
+                                if k not in ['created_on', 'changed_on', 'type_generic', 'uuid', 'advanced_data_type']}
+
+                 updated_col.update({
+                     'verbose_name': matching_dim['verbose_name'],
+                     'description': matching_dim['description'],
+                     'is_dttm': matching_dim['is_dttm'],
+                     'groupby': matching_dim['groupby'],
+                     'filterable': matching_dim['filterable'],
+                     'is_active': True,
+                     'expression': col.get('expression', ''),
+                 })
+
+                 updated_columns.append(updated_col)
+                 print(f" āœ“ {col_name} → '{matching_dim['verbose_name']}'")
+             else:
+                 # Clean column but keep it
+                 clean_col = {k: v for k, v in col.items()
+                              if k not in ['created_on', 'changed_on', 'type_generic', 'uuid', 'advanced_data_type']}
+                 updated_columns.append(clean_col)
+                 print(f" ā—‹ {col_name} (no matching dimension)")
+
+         return updated_columns
+
+     def _update_metrics(self, existing_metrics: List[dict], measures: List[dict]) -> List[dict]:
+         """Update metrics with new measures"""
+         # Clean existing metrics
+         updated_metrics = []
+         for metric in existing_metrics:
+             clean_metric = {k: v for k, v in metric.items()
+                             if k not in ['created_on', 'changed_on', 'uuid']}
+             updated_metrics.append(clean_metric)
+
+         # Add new metrics
+         existing_metric_names = {m.get('metric_name') for m in existing_metrics}
+         added_count = 0
+
+         for measure in measures:
+             metric_name = measure['metric_name']
+
+             if metric_name not in existing_metric_names:
+                 new_metric = {
+                     'metric_name': metric_name,
+                     'verbose_name': measure['verbose_name'],
+                     'expression': measure['expression'],
+                     'description': measure['description'],
+                     'metric_type': 'simple',
+                     'currency': None,
+                     'd3format': None,
+                     'extra': None,
+                     'warning_text': None
+                 }
+                 updated_metrics.append(new_metric)
+                 print(f" āœ“ Prepared '{metric_name}': {measure['expression']}")
+                 added_count += 1
+             else:
+                 print(f" ⊘ Skipping '{metric_name}' (already exists)")
+
+         return updated_metrics
+
+
+ # Register the Superset connector
+ ConnectorRegistry.register('superset', SupersetConnector)
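
As a rough illustration only, the connector above could be driven as in the following minimal sketch; the import path, credentials, and schema directory are assumptions for illustration and are not confirmed by this diff.

    # Hypothetical usage sketch; the module path and all values are assumed.
    from dbt_cube_sync.connectors.superset import SupersetConnector

    # Constructing the connector logs in, fetches a CSRF token, and resolves
    # the target database ID (see __init__ / connect in the diff above).
    connector = SupersetConnector(
        url="http://localhost:8088",
        username="admin",
        password="admin",
        database_name="Cube",
    )

    # Sync every Cube.js schema file in a directory; each file yields a SyncResult.
    results = connector.sync_cube_schemas("./cube/schema")
    for result in results:
        print(result.file_or_dataset, result.status, result.message or result.error)
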
@@ -0,0 +1,34 @@
+ """
+ Tableau connector placeholder for future implementation
+ """
+ from typing import List
+ from .base import BaseConnector, ConnectorRegistry
+ from ..core.models import SyncResult
+
+
+ class TableauConnector(BaseConnector):
+     """Connector for Tableau (placeholder implementation)"""
+
+     def _validate_config(self) -> None:
+         """Validate the provided configuration"""
+         # TODO: Implement Tableau-specific validation
+         pass
+
+     def connect(self) -> None:
+         """Establish connection to Tableau"""
+         # TODO: Implement Tableau connection logic
+         raise NotImplementedError("Tableau connector not yet implemented")
+
+     def sync_cube_schemas(self, cube_dir: str) -> List[SyncResult]:
+         """Sync all Cube.js schemas from directory to Tableau"""
+         # TODO: Implement Tableau sync logic
+         raise NotImplementedError("Tableau connector not yet implemented")
+
+     def sync_single_schema(self, cube_file_path: str) -> SyncResult:
+         """Sync a single Cube.js schema file to Tableau"""
+         # TODO: Implement single schema sync for Tableau
+         raise NotImplementedError("Tableau connector not yet implemented")
+
+
+ # Register the Tableau connector
+ ConnectorRegistry.register('tableau', TableauConnector)
@@ -0,0 +1 @@
+ """Core modules for dbt and cube processing"""