datalex-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64):
  1. datalex_cli/__init__.py +1 -0
  2. datalex_cli/datalex_cli.py +658 -0
  3. datalex_cli/main.py +2925 -0
  4. datalex_cli-0.1.1.dist-info/METADATA +228 -0
  5. datalex_cli-0.1.1.dist-info/RECORD +64 -0
  6. datalex_cli-0.1.1.dist-info/WHEEL +5 -0
  7. datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
  8. datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  9. datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
  10. datalex_core/__init__.py +94 -0
  11. datalex_core/_schemas/datalex/common.schema.json +127 -0
  12. datalex_core/_schemas/datalex/domain.schema.json +24 -0
  13. datalex_core/_schemas/datalex/entity.schema.json +158 -0
  14. datalex_core/_schemas/datalex/model.schema.json +141 -0
  15. datalex_core/_schemas/datalex/policy.schema.json +70 -0
  16. datalex_core/_schemas/datalex/project.schema.json +82 -0
  17. datalex_core/_schemas/datalex/snippet.schema.json +24 -0
  18. datalex_core/_schemas/datalex/source.schema.json +104 -0
  19. datalex_core/_schemas/datalex/term.schema.json +30 -0
  20. datalex_core/canonical.py +166 -0
  21. datalex_core/completion.py +204 -0
  22. datalex_core/connectors/__init__.py +39 -0
  23. datalex_core/connectors/base.py +417 -0
  24. datalex_core/connectors/bigquery.py +229 -0
  25. datalex_core/connectors/databricks.py +262 -0
  26. datalex_core/connectors/mysql.py +266 -0
  27. datalex_core/connectors/postgres.py +309 -0
  28. datalex_core/connectors/redshift.py +298 -0
  29. datalex_core/connectors/snowflake.py +336 -0
  30. datalex_core/connectors/sqlserver.py +425 -0
  31. datalex_core/datalex/__init__.py +26 -0
  32. datalex_core/datalex/diff.py +188 -0
  33. datalex_core/datalex/errors.py +85 -0
  34. datalex_core/datalex/loader.py +512 -0
  35. datalex_core/datalex/migrate_layout.py +382 -0
  36. datalex_core/datalex/parse_cache.py +102 -0
  37. datalex_core/datalex/project.py +214 -0
  38. datalex_core/datalex/types.py +224 -0
  39. datalex_core/dbt/__init__.py +18 -0
  40. datalex_core/dbt/emit.py +344 -0
  41. datalex_core/dbt/manifest.py +329 -0
  42. datalex_core/dbt/profiles.py +185 -0
  43. datalex_core/dbt/sync.py +279 -0
  44. datalex_core/dbt/warehouse.py +215 -0
  45. datalex_core/dialects/__init__.py +15 -0
  46. datalex_core/dialects/_common.py +48 -0
  47. datalex_core/dialects/base.py +47 -0
  48. datalex_core/dialects/postgres.py +164 -0
  49. datalex_core/dialects/registry.py +36 -0
  50. datalex_core/dialects/snowflake.py +129 -0
  51. datalex_core/diffing.py +358 -0
  52. datalex_core/docs_generator.py +797 -0
  53. datalex_core/doctor.py +181 -0
  54. datalex_core/generators.py +478 -0
  55. datalex_core/importers.py +1176 -0
  56. datalex_core/issues.py +23 -0
  57. datalex_core/loader.py +21 -0
  58. datalex_core/migrate.py +316 -0
  59. datalex_core/modeling.py +679 -0
  60. datalex_core/packages.py +430 -0
  61. datalex_core/policy.py +1037 -0
  62. datalex_core/resolver.py +456 -0
  63. datalex_core/schema.py +54 -0
  64. datalex_core/semantic.py +1561 -0
datalex_cli/main.py ADDED
@@ -0,0 +1,2925 @@
1
+ import argparse
2
+ import glob
3
+ import json
4
+ import hashlib
5
+ import re
6
+ import os
7
+ import sys
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Any, Dict, List, Tuple
11
+ from urllib.parse import urlparse
12
+
13
+ import yaml
14
+
15
+ from datalex_core import (
16
+ apply_standards_fixes,
17
+ compile_model,
18
+ completeness_as_dict,
19
+ completeness_report,
20
+ diagnostics_as_json,
21
+ format_diagnostics,
22
+ generate_bash_completion,
23
+ generate_changelog,
24
+ generate_fish_completion,
25
+ generate_html_docs,
26
+ generate_markdown_docs,
27
+ generate_migration,
28
+ generate_sql_ddl,
29
+ generate_zsh_completion,
30
+ ConnectorConfig,
31
+ get_connector,
32
+ import_dbt_schema_yml,
33
+ import_dbml,
34
+ import_spark_schema,
35
+ import_sql_ddl,
36
+ sync_dbt_schema_yml,
37
+ list_connectors,
38
+ lint_issues,
39
+ load_policy_pack,
40
+ load_policy_pack_with_inheritance,
41
+ load_schema,
42
+ load_yaml_model,
43
+ merge_policy_packs,
44
+ merge_models_preserving_docs,
45
+ policy_issues,
46
+ project_diff,
47
+ resolve_model,
48
+ resolve_project,
49
+ run_diagnostics,
50
+ schema_issues,
51
+ semantic_diff,
52
+ standards_issues,
53
+ transform_model,
54
+ write_changelog,
55
+ write_dbt_scaffold,
56
+ write_html_docs,
57
+ write_markdown_docs,
58
+ write_migration,
59
+ )
60
+ from datalex_core.issues import Issue, has_errors, to_lines
61
+
62
+ STARTER_MODEL = """model:
63
+ name: starter_model
64
+ version: 1.0.0
65
+ domain: demo
66
+ owners:
67
+ - data-team@example.com
68
+ state: draft
69
+
70
+ entities:
71
+ - name: User
72
+ type: table
73
+ fields:
74
+ - name: user_id
75
+ type: integer
76
+ primary_key: true
77
+ nullable: false
78
+ - name: email
79
+ type: string
80
+ nullable: false
81
+ """
82
+
83
+ MULTI_MODEL_SHARED = """model:
84
+ name: shared_dimensions
85
+ spec_version: 2
86
+ version: 1.0.0
87
+ domain: shared
88
+ owners:
89
+ - data-team@example.com
90
+ state: draft
91
+ description: Shared dimension entities used across domain models
92
+
93
+ entities:
94
+ - name: Customer
95
+ type: table
96
+ description: Customer master record
97
+ schema: shared
98
+ subject_area: customer_domain
99
+ fields:
100
+ - name: customer_id
101
+ type: integer
102
+ primary_key: true
103
+ nullable: false
104
+ - name: email
105
+ type: string
106
+ nullable: false
107
+ unique: true
108
+ - name: full_name
109
+ type: string
110
+ nullable: false
111
+ - name: created_at
112
+ type: timestamp
113
+ nullable: false
114
+
115
+ indexes:
116
+ - name: idx_customer_email
117
+ entity: Customer
118
+ fields: [email]
119
+ unique: true
120
+ """
121
+
122
+ MULTI_MODEL_ORDERS = """model:
123
+ name: orders
124
+ spec_version: 2
125
+ version: 1.0.0
126
+ domain: sales
127
+ owners:
128
+ - data-team@example.com
129
+ state: draft
130
+ description: Order domain model
131
+ imports:
132
+ - model: shared_dimensions
133
+ alias: shared
134
+ entities: [Customer]
135
+
136
+ entities:
137
+ - name: Order
138
+ type: table
139
+ description: Customer orders
140
+ schema: sales
141
+ subject_area: order_domain
142
+ fields:
143
+ - name: order_id
144
+ type: integer
145
+ primary_key: true
146
+ nullable: false
147
+ - name: customer_id
148
+ type: integer
149
+ nullable: false
150
+ foreign_key: true
151
+ - name: total_amount
152
+ type: decimal(12,2)
153
+ nullable: false
154
+ - name: order_date
155
+ type: timestamp
156
+ nullable: false
157
+
158
+ relationships:
159
+ - name: order_customer
160
+ from: Order.customer_id
161
+ to: Customer.customer_id
162
+ cardinality: many_to_one
163
+ description: Order belongs to a customer (cross-model)
164
+ """
165
+
166
+ END_TO_END_SOURCE = """model:
167
+ name: source_sales_raw
168
+ spec_version: 2
169
+ version: 1.0.0
170
+ domain: sales
171
+ owners:
172
+ - data-platform@example.com
173
+ state: draft
174
+ layer: source
175
+ description: Source layer contract pulled from warehouse raw schemas.
176
+
177
+ entities:
178
+ - name: RawCustomers
179
+ type: table
180
+ description: Raw customer profile records from CRM.
181
+ tags: [BRONZE, SOURCE, CUSTOMER]
182
+ schema: raw
183
+ subject_area: customer_domain
184
+ owner: customer-data@example.com
185
+ grain: [customer_id]
186
+ sla:
187
+ freshness: 4h
188
+ quality_score: 98
189
+ fields:
190
+ - name: customer_id
191
+ type: string
192
+ primary_key: true
193
+ nullable: false
194
+ description: Stable customer identifier from CRM.
195
+ tags: [IDENTIFIER]
196
+ - name: email
197
+ type: string
198
+ nullable: false
199
+ description: Customer email from source system.
200
+ tags: [PII, CONTACT]
201
+ sensitivity: restricted
202
+ - name: created_at
203
+ type: timestamp
204
+ nullable: false
205
+ description: Customer creation timestamp from source.
206
+ tags: [AUDIT]
207
+
208
+ - name: RawOrders
209
+ type: table
210
+ description: Raw order transactions from commerce platform.
211
+ tags: [BRONZE, SOURCE, ORDER]
212
+ schema: raw
213
+ subject_area: order_domain
214
+ owner: order-data@example.com
215
+ grain: [order_id]
216
+ sla:
217
+ freshness: 2h
218
+ quality_score: 97
219
+ fields:
220
+ - name: order_id
221
+ type: string
222
+ primary_key: true
223
+ nullable: false
224
+ description: Unique order identifier.
225
+ tags: [IDENTIFIER]
226
+ - name: customer_id
227
+ type: string
228
+ nullable: false
229
+ foreign_key: true
230
+ description: Customer identifier attached to the order.
231
+ tags: [JOIN_KEY]
232
+ - name: order_ts
233
+ type: timestamp
234
+ nullable: false
235
+ description: Order creation timestamp.
236
+ tags: [EVENT_TIME]
237
+ - name: gross_amount
238
+ type: decimal(12,2)
239
+ nullable: false
240
+ description: Total order amount before discounts and tax allocations.
241
+ tags: [AMOUNT, FINANCE]
242
+ - name: status
243
+ type: string
244
+ nullable: false
245
+ description: Raw order lifecycle status.
246
+ tags: [STATUS]
247
+
248
+ relationships:
249
+ - name: raw_orders_customer
250
+ from: RawOrders.customer_id
251
+ to: RawCustomers.customer_id
252
+ cardinality: many_to_one
253
+ description: Raw order row belongs to a raw customer row.
254
+
255
+ governance:
256
+ classification:
257
+ RawCustomers.email: PII
258
+ stewards:
259
+ customer_domain: customer-data@example.com
260
+ order_domain: order-data@example.com
261
+ retention:
262
+ period: 3y
263
+ policy: source_contract_baseline
264
+
265
+ glossary:
266
+ - term: Raw Zone
267
+ definition: Ingested source-aligned data before business transformations.
268
+ owner: data-platform@example.com
269
+ tags: [INGESTION]
270
+
271
+ rules:
272
+ - name: raw_orders_amount_non_negative
273
+ target: RawOrders.gross_amount
274
+ expression: "value >= 0"
275
+ severity: error
276
+ """
277
+
278
+ END_TO_END_TRANSFORM = """model:
279
+ name: commerce_transform
280
+ spec_version: 2
281
+ version: 1.0.0
282
+ domain: sales
283
+ owners:
284
+ - analytics-engineering@example.com
285
+ state: draft
286
+ layer: transform
287
+ description: Transform layer business models derived from raw sources.
288
+ imports:
289
+ - model: source_sales_raw
290
+ alias: src
291
+ path: ../source/source_sales_raw.model.yaml
292
+
293
+ entities:
294
+ - name: CustomerDim
295
+ type: table
296
+ description: Conformed customer dimension for analytics.
297
+ tags: [SILVER, DIMENSION, CUSTOMER]
298
+ schema: analytics
299
+ subject_area: customer_domain
300
+ owner: analytics-engineering@example.com
301
+ grain: [customer_id]
302
+ sla:
303
+ freshness: 8h
304
+ quality_score: 99
305
+ fields:
306
+ - name: customer_id
307
+ type: string
308
+ primary_key: true
309
+ nullable: false
310
+ description: Conformed customer key.
311
+ tags: [IDENTIFIER]
312
+ - name: email
313
+ type: string
314
+ nullable: false
315
+ description: Customer email used by lifecycle reporting.
316
+ tags: [PII, CONTACT]
317
+ sensitivity: restricted
318
+ - name: customer_tier
319
+ type: string
320
+ nullable: false
321
+ description: Normalized customer segment derived from source events.
322
+ tags: [SEGMENT]
323
+
324
+ - name: OrderFact
325
+ type: table
326
+ description: Atomic order-level fact table for finance and growth analytics.
327
+ tags: [SILVER, FACT, ORDER]
328
+ schema: analytics
329
+ subject_area: order_domain
330
+ owner: analytics-engineering@example.com
331
+ grain: [order_id]
332
+ sla:
333
+ freshness: 4h
334
+ quality_score: 99
335
+ fields:
336
+ - name: order_id
337
+ type: string
338
+ primary_key: true
339
+ nullable: false
340
+ description: Unique order key.
341
+ tags: [IDENTIFIER]
342
+ - name: customer_id
343
+ type: string
344
+ nullable: false
345
+ foreign_key: true
346
+ description: Foreign key to customer dimension.
347
+ tags: [JOIN_KEY]
348
+ - name: order_date
349
+ type: date
350
+ nullable: false
351
+ description: Business order date used for reporting grain.
352
+ tags: [REPORTING_DATE]
353
+ - name: net_revenue
354
+ type: decimal(12,2)
355
+ nullable: false
356
+ description: Revenue after discount normalization.
357
+ tags: [AMOUNT, FINANCE]
358
+ - name: order_status
359
+ type: string
360
+ nullable: false
361
+ description: Standardized business order status.
362
+ tags: [STATUS]
363
+
364
+ relationships:
365
+ - name: order_fact_customer_dim
366
+ from: OrderFact.customer_id
367
+ to: CustomerDim.customer_id
368
+ cardinality: many_to_one
369
+ description: Fact row belongs to one customer.
370
+
371
+ indexes:
372
+ - name: idx_order_fact_order_date
373
+ entity: OrderFact
374
+ fields: [order_date]
375
+ - name: idx_order_fact_customer_id
376
+ entity: OrderFact
377
+ fields: [customer_id]
378
+
379
+ governance:
380
+ classification:
381
+ CustomerDim.email: PII
382
+ stewards:
383
+ customer_domain: analytics-engineering@example.com
384
+ order_domain: analytics-engineering@example.com
385
+ retention:
386
+ period: 5y
387
+ policy: transformed_contract
388
+
389
+ glossary:
390
+ - term: Order Fact
391
+ definition: One row per order after transformation and standardization.
392
+ owner: analytics-engineering@example.com
393
+ related_fields:
394
+ - OrderFact.order_id
395
+ - OrderFact.net_revenue
396
+ tags: [FACT]
397
+
398
+ rules:
399
+ - name: order_fact_revenue_non_negative
400
+ target: OrderFact.net_revenue
401
+ expression: "value >= 0"
402
+ severity: error
403
+ """
404
+
405
+ END_TO_END_REPORT = """model:
406
+ name: commerce_reporting
407
+ spec_version: 2
408
+ version: 1.0.0
409
+ domain: sales
410
+ owners:
411
+ - bi-team@example.com
412
+ state: draft
413
+ layer: report
414
+ description: Reporting layer metric contracts and dictionary-ready semantic views.
415
+ imports:
416
+ - model: commerce_transform
417
+ alias: tr
418
+ path: ../transform/commerce_transform.model.yaml
419
+
420
+ entities:
421
+ - name: DailyRevenueMetric
422
+ type: view
423
+ description: Daily revenue KPI contract used by executive dashboards.
424
+ tags: [GOLD, METRIC, KPI, REPORTING]
425
+ schema: reporting
426
+ subject_area: executive_kpis
427
+ owner: bi-team@example.com
428
+ grain: [metric_date]
429
+ sla:
430
+ freshness: 24h
431
+ quality_score: 99
432
+ fields:
433
+ - name: metric_date
434
+ type: date
435
+ nullable: false
436
+ description: Daily reporting grain for KPI trend lines.
437
+ tags: [GRAIN, REPORTING_DATE]
438
+ - name: gross_revenue
439
+ type: decimal(12,2)
440
+ nullable: false
441
+ computed: true
442
+ computed_expression: "SUM(OrderFact.net_revenue)"
443
+ description: Sum of net revenue at daily grain.
444
+ tags: [METRIC, FINANCE]
445
+ - name: order_count
446
+ type: integer
447
+ nullable: false
448
+ computed: true
449
+ computed_expression: "COUNT_DISTINCT(OrderFact.order_id)"
450
+ description: Distinct order count at daily grain.
451
+ tags: [METRIC, VOLUME]
452
+ - name: avg_order_value
453
+ type: decimal(12,2)
454
+ nullable: false
455
+ computed: true
456
+ computed_expression: "gross_revenue / NULLIF(order_count, 0)"
457
+ description: Average order value derived from daily metrics.
458
+ tags: [METRIC, FINANCE]
459
+
460
+ - name: CustomerRevenueMetric
461
+ type: view
462
+ description: Customer-level revenue KPI contract for retention analysis.
463
+ tags: [GOLD, METRIC, CUSTOMER]
464
+ schema: reporting
465
+ subject_area: customer_kpis
466
+ owner: bi-team@example.com
467
+ grain: [customer_id, report_month]
468
+ sla:
469
+ freshness: 24h
470
+ quality_score: 99
471
+ fields:
472
+ - name: customer_id
473
+ type: string
474
+ nullable: false
475
+ description: Customer identifier for customer KPI cuts.
476
+ tags: [DIMENSION, IDENTIFIER]
477
+ - name: report_month
478
+ type: date
479
+ nullable: false
480
+ description: Monthly reporting period for customer metrics.
481
+ tags: [GRAIN]
482
+ - name: customer_revenue
483
+ type: decimal(12,2)
484
+ nullable: false
485
+ computed: true
486
+ computed_expression: "SUM(OrderFact.net_revenue)"
487
+ description: Total monthly customer revenue.
488
+ tags: [METRIC, FINANCE]
489
+ - name: active_order_count
490
+ type: integer
491
+ nullable: false
492
+ computed: true
493
+ computed_expression: "COUNT_DISTINCT(OrderFact.order_id)"
494
+ description: Distinct active orders for the customer period.
495
+ tags: [METRIC]
496
+
497
+ indexes:
498
+ - name: idx_daily_revenue_metric_date
499
+ entity: DailyRevenueMetric
500
+ fields: [metric_date]
501
+ - name: idx_customer_revenue_metric_customer
502
+ entity: CustomerRevenueMetric
503
+ fields: [customer_id]
504
+
505
+ governance:
506
+ classification:
507
+ CustomerRevenueMetric.customer_id: INTERNAL
508
+ stewards:
509
+ executive_kpis: bi-team@example.com
510
+ customer_kpis: bi-team@example.com
511
+ retention:
512
+ period: 7y
513
+ policy: reporting_contract
514
+
515
+ glossary:
516
+ - term: Gross Revenue
517
+ abbreviation: GR
518
+ definition: Sum of net revenue values over the reporting grain.
519
+ owner: bi-team@example.com
520
+ related_fields:
521
+ - DailyRevenueMetric.gross_revenue
522
+ tags: [KPI, FINANCE]
523
+ - term: Average Order Value
524
+ abbreviation: AOV
525
+ definition: Gross revenue divided by distinct order count for the period.
526
+ owner: bi-team@example.com
527
+ related_fields:
528
+ - DailyRevenueMetric.avg_order_value
529
+ tags: [KPI, COMMERCE]
530
+ - term: Customer Revenue
531
+ definition: Total revenue attributed to a customer within report_month.
532
+ owner: bi-team@example.com
533
+ related_fields:
534
+ - CustomerRevenueMetric.customer_revenue
535
+ tags: [KPI, CUSTOMER]
536
+
537
+ rules:
538
+ - name: gross_revenue_non_negative
539
+ target: DailyRevenueMetric.gross_revenue
540
+ expression: "value >= 0"
541
+ severity: error
542
+ - name: order_count_non_negative
543
+ target: DailyRevenueMetric.order_count
544
+ expression: "value >= 0"
545
+ severity: error
546
+ - name: customer_revenue_non_negative
547
+ target: CustomerRevenueMetric.customer_revenue
548
+ expression: "value >= 0"
549
+ severity: error
550
+
551
+ metrics:
552
+ - name: daily_gross_revenue
553
+ entity: DailyRevenueMetric
554
+ description: Daily gross revenue KPI for executive reporting.
555
+ expression: gross_revenue
556
+ aggregation: sum
557
+ grain: [metric_date]
558
+ dimensions: [metric_date]
559
+ time_dimension: metric_date
560
+ owner: bi-team@example.com
561
+ tags: [KPI, METRIC, FINANCE]
562
+ - name: daily_order_count
563
+ entity: DailyRevenueMetric
564
+ description: Daily distinct order count.
565
+ expression: order_count
566
+ aggregation: count_distinct
567
+ grain: [metric_date]
568
+ dimensions: [metric_date]
569
+ time_dimension: metric_date
570
+ owner: bi-team@example.com
571
+ tags: [KPI, METRIC, VOLUME]
572
+ - name: monthly_customer_revenue
573
+ entity: CustomerRevenueMetric
574
+ description: Monthly revenue by customer.
575
+ expression: customer_revenue
576
+ aggregation: sum
577
+ grain: [customer_id, report_month]
578
+ dimensions: [customer_id]
579
+ time_dimension: report_month
580
+ owner: bi-team@example.com
581
+ tags: [KPI, METRIC, CUSTOMER]
582
+
583
+ display:
584
+ sections:
585
+ - name: Executive KPIs
586
+ entities: [DailyRevenueMetric]
587
+ - name: Customer KPIs
588
+ entities: [CustomerRevenueMetric]
589
+ """
590
+
591
+ END_TO_END_POLICY = """pack:
592
+ name: end_to_end_dictionary
593
+ version: 1.0.0
594
+ description: Strict policy profile for end-to-end modeling + dictionary-first projects.
595
+ extends: strict.policy.yaml
596
+
597
+ policies:
598
+ - id: REQUIRE_MODEL_GOVERNANCE
599
+ type: custom_expression
600
+ severity: error
601
+ params:
602
+ scope: model
603
+ expression: "has_governance"
604
+ message: "Model '{name}' must define governance metadata."
605
+
606
+ - id: REQUIRE_MODEL_GLOSSARY
607
+ type: custom_expression
608
+ severity: error
609
+ params:
610
+ scope: model
611
+ expression: "has_glossary"
612
+ message: "Model '{name}' must define glossary terms for dictionary coverage."
613
+
614
+ - id: REQUIRE_MODEL_RULES
615
+ type: custom_expression
616
+ severity: error
617
+ params:
618
+ scope: model
619
+ expression: "has_rules"
620
+ message: "Model '{name}' must define rules for business logic checks."
621
+
622
+ - id: REQUIRE_REPORT_LAYER_METRICS
623
+ type: custom_expression
624
+ severity: error
625
+ params:
626
+ scope: model
627
+ expression: "layer != 'report' or has_metrics"
628
+ message: "Report layer model '{name}' must define metrics."
629
+
630
+ - id: REQUIRE_ENTITY_SUBJECT_AREA
631
+ type: custom_expression
632
+ severity: error
633
+ params:
634
+ scope: entity
635
+ expression: "subject_area != ''"
636
+ message: "Entity '{name}' must define subject_area for dictionary organization."
637
+
638
+ - id: REQUIRE_ENTITY_DESCRIPTION
639
+ type: custom_expression
640
+ severity: error
641
+ params:
642
+ scope: entity
643
+ expression: "has_description"
644
+ message: "Entity '{name}' must include a description."
645
+
646
+ - id: REQUIRE_FIELD_DESCRIPTION
647
+ type: custom_expression
648
+ severity: error
649
+ params:
650
+ scope: field
651
+ expression: "primary_key or has_description"
652
+ message: "Field '{name}' must include a description unless it is a primary key."
653
+
654
+ - id: REQUIRE_FIELD_TAGS
655
+ type: custom_expression
656
+ severity: error
657
+ params:
658
+ scope: field
659
+ expression: "primary_key or tags != []"
660
+ message: "Field '{name}' must include at least one tag unless it is a primary key."
661
+ """
662
+
663
+ END_TO_END_DICTIONARY_README = """# End-to-End Dictionary Workflow
664
+
665
+ This project is scaffolded to keep architecture, transformation logic, reporting metrics,
666
+ and business dictionary metadata in one programmable YAML system.
667
+
668
+ ## Layers
669
+
670
+ 1. `models/source/`:
671
+ - Physical source contracts (warehouse/raw systems).
672
+ 2. `models/transform/`:
673
+ - Business-conformed entities and relationships.
674
+ 3. `models/report/`:
675
+ - Reporting semantic contracts and KPI-focused glossary terms.
676
+
677
+ ## Required Sections Per Model
678
+
679
+ 1. `model` metadata (`name`, `version`, `owners`, `state`, `description`).
680
+ 2. `entities` with field-level descriptions and tags.
681
+ 3. `grain` in transform/report entities.
682
+ 4. `governance` classification/stewardship metadata.
683
+ 5. `glossary` terms for dictionary clarity.
684
+ 6. `rules` for enforceable business logic.
685
+ 7. `metrics` in report models for KPI contracts.
686
+
687
+ ## Mandatory Validation Flow
688
+
689
+ ```bash
690
+ datalex validate-all --glob "models/**/*.model.yaml"
691
+ datalex policy-check models/source/source_sales_raw.model.yaml --policy policies/end_to_end_dictionary.policy.yaml --inherit
692
+ datalex policy-check models/transform/commerce_transform.model.yaml --policy policies/end_to_end_dictionary.policy.yaml --inherit
693
+ datalex policy-check models/report/commerce_reporting.model.yaml --policy policies/end_to_end_dictionary.policy.yaml --inherit
694
+ datalex resolve-project models
695
+ datalex generate docs models/report/commerce_reporting.model.yaml --format html --out docs/dictionary/reporting-dictionary.html
696
+ ```
697
+ """
698
+
699
+
700
def _default_schema_path() -> str:
    """Return ``<cwd>/schemas/model.schema.json`` as a string.

    The path is built from the current working directory at call time; the
    file is not checked for existence.
    """
    return str(Path.cwd().joinpath("schemas", "model.schema.json"))
702
+
703
+
704
def _default_policy_schema_path() -> str:
    """Return ``<cwd>/schemas/policy.schema.json`` as a string.

    The path is built from the current working directory at call time; the
    file is not checked for existence.
    """
    return str(Path.cwd().joinpath("schemas", "policy.schema.json"))
706
+
707
+
708
def _default_policy_path() -> str:
    """Return ``<cwd>/policies/default.policy.yaml`` as a string.

    The path is built from the current working directory at call time; the
    file is not checked for existence.
    """
    return str(Path.cwd().joinpath("policies", "default.policy.yaml"))
710
+
711
+
712
def _print_issues(issues: List[Issue]) -> None:
    """Print every issue on its own line, or a fixed notice when there are none.

    Args:
        issues: Issues to report; rendered to text lines by ``to_lines``.
    """
    if issues:
        for rendered in to_lines(issues):
            print(rendered)
    else:
        print("No issues found.")
718
+
719
+
720
def _combined_issues(model: Dict[str, Any], schema: Dict[str, Any]) -> List[Issue]:
    """Collect schema-validation issues followed by lint issues for a model.

    Args:
        model: Parsed model document.
        schema: Parsed schema the model is validated against.

    Returns:
        The list produced by ``schema_issues`` with the ``lint_issues``
        results appended to it in place.
    """
    findings = schema_issues(model, schema)
    # In-place extension, so schema findings always precede lint findings.
    findings += lint_issues(model)
    return findings
724
+
725
+
726
def _normalize_host_and_port(host: str, port: int) -> Tuple[str, int]:
    """Accept URL-ish host input and normalize it to hostname + port.

    ``host`` may be a bare hostname, a ``host:port`` pair, or a full URL with
    scheme, credentials and path. An explicit non-zero ``port`` always takes
    precedence over a port embedded in ``host``.

    Args:
        host: Raw host value; ``None`` or blank is treated as empty.
        port: Explicit port; ``None`` or ``0`` means "not provided".

    Returns:
        ``(hostname, port)``. The hostname is the one reported by
        ``urlparse`` (which lower-cases it); the port is ``0`` when neither
        the argument nor the host string supplies a usable one. Input that
        ``urlparse`` rejects outright is returned stripped but otherwise
        unchanged instead of raising.
    """
    clean_host = (host or "").strip()
    clean_port = port or 0
    if not clean_host:
        return "", clean_port

    # A leading "//" makes urlparse treat a scheme-less value as a netloc
    # rather than as a path.
    target = clean_host if "://" in clean_host else f"//{clean_host}"
    try:
        parsed = urlparse(target)
    except ValueError:
        # urlparse raises on malformed bracketed hosts (e.g. an unbalanced
        # "["). Hand the value back untouched so the caller can surface a
        # meaningful connection error instead of crashing here.
        return clean_host, clean_port

    normalized_host = parsed.hostname or clean_host.split("/", 1)[0].strip()

    try:
        # Accessing .port raises ValueError for non-numeric or out-of-range
        # ports; treat those as "no port given".
        parsed_port = parsed.port or 0
    except ValueError:
        parsed_port = 0

    if not clean_port and parsed_port:
        clean_port = parsed_port

    return normalized_host, clean_port
747
+
748
+
749
def _sanitize_model_file_stem(model_name: str) -> str:
    """Turn a model name into a file stem without path or separator characters.

    Args:
        model_name: Raw model name; ``None``, empty, or whitespace-only
            values fall back to ``"imported_model"``.

    Returns:
        The stripped name with each ``/``, ``\\``, space, ``:`` and ``;``
        replaced by an underscore.
    """
    fallback = "imported_model"
    candidate = (model_name or fallback).strip()
    if not candidate:
        candidate = fallback
    # One translation pass; equivalent to replacing each character in turn
    # because the replacement "_" is never itself a target.
    unsafe_to_underscore = str.maketrans({ch: "_" for ch in "/\\ :;"})
    return candidate.translate(unsafe_to_underscore)
754
+
755
+
756
def _should_create_directory(path: Path) -> bool:
    """Ask the user whether a missing project folder should be created.

    Args:
        path: The folder that does not exist yet; shown in the prompt.

    Returns:
        ``True`` only when stdin is a TTY and the trimmed, lower-cased reply
        is ``y`` or ``yes``. Without a TTY no prompt is shown and the result
        is ``False``.
    """
    if not sys.stdin.isatty():
        return False
    reply = input(f'Project folder "{path}" does not exist. Create it? [y/N]: ')
    return reply.strip().lower() in ("y", "yes")
761
+
762
+
763
def _resolve_pull_output_path(args: argparse.Namespace, model_name: str) -> Tuple[bool, str]:
    """Work out where a pulled model should be written.

    Reads ``project_dir``, ``out`` and ``create_project_dir`` from ``args``
    (each optional). Without a project directory, ``out`` is returned as-is.
    With one, the directory is created when allowed (flag or interactive
    confirmation) and the output is placed inside it, either at the relative
    ``out`` path or at ``<sanitized model name>.model.yaml``.

    Args:
        args: Parsed CLI arguments.
        model_name: Model name used to derive a default file name.

    Returns:
        ``(True, path)`` on success, or ``(False, message)`` describing why
        no output path could be determined.
    """
    requested_dir = getattr(args, "project_dir", "") or ""
    requested_out = getattr(args, "out", "") or ""
    auto_create = bool(getattr(args, "create_project_dir", False))

    if not requested_dir:
        return True, requested_out

    project_dir = Path(requested_dir).expanduser()
    if project_dir.exists():
        if not project_dir.is_dir():
            return False, f"Project folder is not a directory: {project_dir}"
    elif auto_create or _should_create_directory(project_dir):
        # The flag short-circuits the prompt; otherwise the user confirmed.
        project_dir.mkdir(parents=True, exist_ok=True)
    elif not sys.stdin.isatty():
        # No prompt was possible, so tell the user which flag to use.
        return False, (
            f"Project folder does not exist: {project_dir}. "
            f"Re-run with --create-project-dir to create it."
        )
    else:
        return False, f"Aborted: project folder not created: {project_dir}"

    if not requested_out:
        default_name = f"{_sanitize_model_file_stem(model_name)}.model.yaml"
        return True, str(project_dir / default_name)

    relative_out = Path(requested_out)
    if relative_out.is_absolute():
        return False, "--out must be a relative filename/path when used with --project-dir"
    return True, str(project_dir / relative_out)
793
+
794
+
795
def _validate_model_file(model_path: str, schema: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Issue]]:
    """Load a model file and run the combined schema + lint checks on it.

    Args:
        model_path: Path of the YAML model file, passed to ``load_yaml_model``.
        schema: Parsed schema to validate against.

    Returns:
        ``(model, issues)``: the loaded model and the issues reported by
        ``_combined_issues``.
    """
    loaded = load_yaml_model(model_path)
    return loaded, _combined_issues(loaded, schema)
799
+
800
+
801
def _print_issue_block(prefix: str, issues: List[Issue]) -> None:
    """Print issues as an indented block under a heading.

    Args:
        prefix: Heading text printed before the issues.
        issues: Issues to report; rendered to text lines by ``to_lines``.

    With no issues, a single ``"<prefix>: No issues found."`` line is printed.
    """
    if issues:
        print(f"{prefix}:")
        for rendered in to_lines(issues):
            print(f" {rendered}")
    else:
        print(f"{prefix}: No issues found.")
808
+
809
+
810
def _issues_as_json(issues: List[Issue]) -> List[Dict[str, str]]:
    """Convert issues into plain dictionaries suitable for JSON output.

    Args:
        issues: Objects exposing ``severity``, ``code``, ``message`` and
            ``path`` attributes.

    Returns:
        One dict per issue, in input order, with those four keys in that order.
    """
    exported_attrs = ("severity", "code", "message", "path")
    return [
        {attr: getattr(issue, attr) for attr in exported_attrs}
        for issue in issues
    ]
820
+
821
+
822
def _write_yaml(path: str, payload: Dict[str, Any]) -> None:
    """Serialize ``payload`` as YAML and write it to ``path`` as UTF-8.

    Key order of ``payload`` is preserved (``sort_keys=False``).
    """
    Path(path).write_text(yaml.safe_dump(payload, sort_keys=False), encoding="utf-8")
825
+
826
+
827
def _print_or_write_yaml(payload: Dict[str, Any], out: str = "") -> None:
    """Emit ``payload`` as block-style YAML, to a file or to stdout.

    Args:
        payload: Document to serialize; key order is preserved and
            non-ASCII characters are written unescaped.
        out: Destination file path. When empty, the YAML is printed instead.
    """
    rendered = yaml.safe_dump(
        payload,
        sort_keys=False,
        default_flow_style=False,
        allow_unicode=True,
    )
    if not out:
        print(rendered)
        return
    Path(out).write_text(rendered, encoding="utf-8")
    print(f"Wrote model: {out}")
834
+
835
+
836
def _seed_workspace_file(source: Path, target: Path, placeholder: str = "") -> bool:
    """Create ``target`` from ``source`` (or a placeholder) unless it already exists.

    Args:
        source: File to copy from when it exists.
        target: File to create; never overwritten.
        placeholder: Text written when ``source`` is missing. An empty string
            means "no placeholder": nothing is written in that case.

    Returns:
        ``True`` when ``target`` was written, ``False`` otherwise.
    """
    if target.exists():
        return False
    if source.exists():
        target.write_text(source.read_text(encoding="utf-8"), encoding="utf-8")
        return True
    if placeholder:
        target.write_text(placeholder, encoding="utf-8")
        return True
    return False


def _init_schemas_and_policies(root: Path) -> List[Path]:
    """Copy schema and policy files into the workspace.

    Ensures ``root/schemas`` and ``root/policies`` exist, then seeds each
    expected file from the same relative location under the current working
    directory. Schema files fall back to an empty JSON object (``{}``) when
    no source file is found; policy packs are only created when a source
    file exists. Files already present in the workspace are left untouched.

    Args:
        root: Workspace root directory.

    Returns:
        Paths of the files written by this call, in creation order.
    """
    seed_root = Path.cwd()
    # (folder, file name, placeholder written when no source file exists)
    seed_plan = (
        ("schemas", "model.schema.json", "{}"),
        ("schemas", "policy.schema.json", "{}"),
        ("policies", "default.policy.yaml", ""),
        ("policies", "strict.policy.yaml", ""),
    )

    (root / "schemas").mkdir(parents=True, exist_ok=True)
    (root / "policies").mkdir(parents=True, exist_ok=True)

    created: List[Path] = []
    for folder, file_name, placeholder in seed_plan:
        target = root / folder / file_name
        # Only report files that were really written by this call.
        if _seed_workspace_file(seed_root / folder / file_name, target, placeholder):
            created.append(target)
    return created
879
+
880
+
881
def cmd_init(args: argparse.Namespace) -> int:
    """Scaffold a new workspace under ``args.path``.

    The layout is selected by ``args.template`` ("multi-model", "end-to-end",
    or anything else for the single-model starter); ``args.multi_model``
    forces the multi-model layout. Files that already exist are left alone.

    Returns 0 on success, or 1 when ``--multi-model`` is combined with a
    template other than "single" / "multi-model".
    """
    template = args.template
    if args.multi_model:
        # Check the flag combination before anything is written, so a rejected
        # invocation does not leave a half-initialised workspace behind.
        if template not in {"single", "multi-model"}:
            print(
                "Init failed: --multi-model cannot be combined with --template end-to-end.",
                file=sys.stderr,
            )
            return 1
        template = "multi-model"

    root = Path(args.path).resolve()
    created = _init_schemas_and_policies(root)

    def _write_if_missing(dst: Path, text: str) -> None:
        """Write ``text`` to ``dst`` unless it exists; record newly created files."""
        if dst.exists():
            return
        # Make sure the parent exists even if no earlier step created it.
        dst.parent.mkdir(parents=True, exist_ok=True)
        dst.write_text(text, encoding="utf-8")
        created.append(dst)

    if template == "multi-model":
        # Multi-model project structure
        models_dir = root / "models"
        for sub in ("shared", "orders"):
            (models_dir / sub).mkdir(parents=True, exist_ok=True)

        _write_if_missing(
            models_dir / "shared" / "shared_dimensions.model.yaml", MULTI_MODEL_SHARED
        )
        _write_if_missing(models_dir / "orders" / "orders.model.yaml", MULTI_MODEL_ORDERS)
        _write_if_missing(
            root / "dm.config.yaml",
            "schema: schemas/model.schema.json\n"
            "policy_schema: schemas/policy.schema.json\n"
            "policy_pack: policies/default.policy.yaml\n"
            "model_glob: \"models/**/*.model.yaml\"\n"
            "multi_model: true\n"
            "search_dirs:\n"
            " - models/shared\n"
            " - models/orders\n",
        )

        print(f"Initialized multi-model workspace at {root}")
    elif template == "end-to-end":
        models_dir = root / "models"
        for sub in ("source", "transform", "report"):
            (models_dir / sub).mkdir(parents=True, exist_ok=True)
        (root / "docs" / "dictionary").mkdir(parents=True, exist_ok=True)

        _write_if_missing(
            models_dir / "source" / "source_sales_raw.model.yaml", END_TO_END_SOURCE
        )
        _write_if_missing(
            models_dir / "transform" / "commerce_transform.model.yaml", END_TO_END_TRANSFORM
        )
        _write_if_missing(
            models_dir / "report" / "commerce_reporting.model.yaml", END_TO_END_REPORT
        )
        _write_if_missing(
            root / "docs" / "dictionary" / "README.md", END_TO_END_DICTIONARY_README
        )
        _write_if_missing(
            root / "policies" / "end_to_end_dictionary.policy.yaml", END_TO_END_POLICY
        )
        _write_if_missing(
            root / "dm.config.yaml",
            "schema: schemas/model.schema.json\n"
            "policy_schema: schemas/policy.schema.json\n"
            "policy_pack: policies/end_to_end_dictionary.policy.yaml\n"
            "model_glob: \"models/**/*.model.yaml\"\n"
            "multi_model: true\n"
            "search_dirs:\n"
            " - models/source\n"
            " - models/transform\n"
            " - models/report\n",
        )

        print(f"Initialized end-to-end modeling workspace at {root}")
    else:
        # Single-model project structure
        (root / "model-examples").mkdir(parents=True, exist_ok=True)

        _write_if_missing(root / "model-examples" / "starter.model.yaml", STARTER_MODEL)
        _write_if_missing(
            root / "dm.config.yaml",
            "schema: schemas/model.schema.json\n"
            "policy_schema: schemas/policy.schema.json\n"
            "policy_pack: policies/default.policy.yaml\n"
            "model_glob: \"**/*.model.yaml\"\n",
        )

        print(f"Initialized workspace at {root}")

    for path in created:
        print(f"- {path}")
    return 0
1003
+
1004
+
1005
def cmd_validate(args: argparse.Namespace) -> int:
    """Validate ``args.model`` against the schema at ``args.schema``.

    Prints every issue found and returns 1 if any of them is an error,
    otherwise 0.
    """
    _unused_model, findings = _validate_model_file(args.model, load_schema(args.schema))
    _print_issues(findings)
    if has_errors(findings):
        return 1
    return 0
1010
+
1011
+
1012
def cmd_lint(args: argparse.Namespace) -> int:
    """Run the lint rules over ``args.model`` and print the findings.

    Returns 1 if any finding is an error, otherwise 0.
    """
    findings = lint_issues(load_yaml_model(args.model))
    _print_issues(findings)
    if has_errors(findings):
        return 1
    return 0
1017
+
1018
+
1019
def cmd_compile(args: argparse.Namespace) -> int:
    """Validate ``args.model`` and emit its compiled canonical form as JSON.

    The JSON goes to ``args.out`` when set, otherwise to stdout. Returns 1
    (after printing the issues) when validation reports errors, else 0.
    """
    model, findings = _validate_model_file(args.model, load_schema(args.schema))
    if has_errors(findings):
        _print_issues(findings)
        return 1

    rendered = json.dumps(compile_model(model), indent=2, sort_keys=False)

    if not args.out:
        print(rendered)
        return 0

    Path(args.out).write_text(rendered + "\n", encoding="utf-8")
    print(f"Wrote canonical model: {args.out}")
    return 0
1036
+
1037
+
1038
def cmd_diff(args: argparse.Namespace) -> int:
    """Print the semantic diff between ``args.old`` and ``args.new`` as JSON.

    Always returns 0.
    """
    before = load_yaml_model(args.old)
    after = load_yaml_model(args.new)
    print(json.dumps(semantic_diff(before, after), indent=2))
    return 0
1044
+
1045
+
1046
def cmd_validate_all(args: argparse.Namespace) -> int:
    """Validate every file matched by ``args.glob`` against ``args.schema``.

    Files whose path matches any pattern in ``args.exclude`` are skipped.
    Returns 0 when nothing matched the glob or every checked file is free of
    errors, and 1 when at least one file has errors.
    """
    schema = load_schema(args.schema)
    # A set removes duplicate glob hits; sorting makes the report order stable.
    paths = sorted(
        {
            Path(path)
            for path in glob.glob(args.glob, recursive=True)
            if Path(path).is_file()
        }
    )

    if not paths:
        print(f"No files matched glob: {args.glob}")
        return 0

    # An optional repeatable argparse flag is None when never supplied;
    # treat that as "no exclusions" instead of failing on iteration.
    exclude_patterns = args.exclude or ()

    failing_files = 0
    for path in paths:
        if any(path.match(pattern) for pattern in exclude_patterns):
            continue

        _, issues = _validate_model_file(str(path), schema)
        _print_issue_block(str(path), issues)
        if has_errors(issues):
            failing_files += 1

    if failing_files:
        print(f"Validation failed for {failing_files} file(s).")
        return 1

    print("All model files passed validation.")
    return 0
1076
+
1077
+
1078
def cmd_gate(args: argparse.Namespace) -> int:
    """Gate a model change: validate both versions, then check for breaking changes.

    Returns 1 when either model has validation errors, 2 when the semantic
    diff has breaking changes and ``args.allow_breaking`` is not set, and 0
    otherwise. The diff is printed as JSON when ``args.output_json`` is set,
    otherwise as a text summary.
    """
    schema = load_schema(args.schema)

    old_model, old_findings = _validate_model_file(args.old, schema)
    new_model, new_findings = _validate_model_file(args.new, schema)

    _print_issue_block(f"Old model ({args.old})", old_findings)
    _print_issue_block(f"New model ({args.new})", new_findings)

    if has_errors([*old_findings, *new_findings]):
        print("Gate failed: model validation errors detected.")
        return 1

    diff = semantic_diff(old_model, new_model)
    if args.output_json:
        print(json.dumps(diff, indent=2))
    else:
        totals = diff["summary"]
        print("Diff summary:")
        print(
            f" entities +{totals['added_entities']} -{totals['removed_entities']} "
            f"changed:{totals['changed_entities']}"
        )
        print(
            f" relationships +{totals['added_relationships']} -{totals['removed_relationships']}"
        )
        print(f" metrics +{totals['added_metrics']} -{totals['removed_metrics']} changed:{totals['changed_metrics']}")
        print(f" breaking changes: {totals['breaking_change_count']}")
        breaking = diff["breaking_changes"]
        if breaking:
            print("Breaking changes:")
            for entry in breaking:
                print(f" - {entry}")

    blocked = diff["has_breaking_changes"] and not args.allow_breaking
    if blocked:
        print("Gate failed: breaking changes detected. Use --allow-breaking to bypass.")
        return 2

    print("Gate passed.")
    return 0
1118
+
1119
+
1120
def cmd_policy_check(args: argparse.Namespace) -> int:
    """Evaluate a policy pack against a model.

    The model and the policy pack are schema-checked first; if either has
    errors the policies are not evaluated and 1 is returned. Otherwise the
    policy issues are printed (plus a JSON payload when ``args.output_json``
    is set) and the result is 1 if any of them is an error, else 0.
    """
    model_schema = load_schema(args.schema)
    pack_schema = load_schema(args.policy_schema)

    model, model_findings = _validate_model_file(args.model, model_schema)
    policy_pack = (
        load_policy_pack_with_inheritance(args.policy)
        if getattr(args, "inherit", False)
        else load_policy_pack(args.policy)
    )
    pack_findings = schema_issues(policy_pack, pack_schema)

    _print_issue_block(f"Model checks ({args.model})", model_findings)
    _print_issue_block(f"Policy pack checks ({args.policy})", pack_findings)

    if has_errors(model_findings) or has_errors(pack_findings):
        print("Policy check failed: validation errors detected before policy evaluation.")
        return 1

    evaluated = policy_issues(model, policy_pack)
    _print_issue_block("Policy evaluation", evaluated)

    if args.output_json:

        def _count(level: str) -> int:
            """Number of evaluated issues with the given severity."""
            return sum(1 for issue in evaluated if issue.severity == level)

        report = {
            "model": args.model,
            "policy": args.policy,
            "summary": {
                "error_count": _count("error"),
                "warning_count": _count("warn"),
                "info_count": _count("info"),
            },
            "issues": _issues_as_json(evaluated),
        }
        print(json.dumps(report, indent=2))

    if has_errors(evaluated):
        print("Policy check failed.")
        return 1

    print("Policy check passed.")
    return 0
1160
+
1161
+
1162
def cmd_generate_sql(args: argparse.Namespace) -> int:
    """Validate ``args.model`` and generate SQL DDL for ``args.dialect``.

    The DDL is written to ``args.out`` when set, otherwise printed. Returns 1
    (after printing the issues) when validation reports errors, else 0.
    """
    model, findings = _validate_model_file(args.model, load_schema(args.schema))
    if has_errors(findings):
        _print_issues(findings)
        return 1

    statements = generate_sql_ddl(model, dialect=args.dialect)
    if not args.out:
        print(statements)
        return 0

    Path(args.out).write_text(statements, encoding="utf-8")
    print(f"Wrote SQL DDL: {args.out}")
    return 0
1178
+
1179
+
1180
def cmd_generate_dbt(args: argparse.Namespace) -> int:
    """Validate ``args.model`` and write a dbt scaffold into ``args.out_dir``.

    Prints the list of files reported by ``write_dbt_scaffold``. Returns 1
    (after printing the issues) when validation reports errors, else 0.
    """
    model, findings = _validate_model_file(args.model, load_schema(args.schema))
    if has_errors(findings):
        _print_issues(findings)
        return 1

    scaffold_files = write_dbt_scaffold(
        model=model,
        out_dir=args.out_dir,
        source_name=args.source_name,
        project_name=args.project_name,
    )

    print(f"Created dbt scaffold files ({len(scaffold_files)}):")
    for scaffold_file in scaffold_files:
        print(f"- {scaffold_file}")

    return 0
1200
+
1201
+
1202
def cmd_generate_metadata(args: argparse.Namespace) -> int:
    """Validate ``args.model`` and export a JSON metadata document.

    The document holds the compiled model's sections plus a summary of their
    sizes. It is written to ``args.out`` when set, otherwise printed. Returns
    1 (after printing the issues) when validation reports errors, else 0.
    """
    model, findings = _validate_model_file(args.model, load_schema(args.schema))
    if has_errors(findings):
        _print_issues(findings)
        return 1

    canonical = compile_model(model)
    entities = canonical.get("entities", [])
    relationships = canonical.get("relationships", [])
    indexes = canonical.get("indexes", [])
    glossary = canonical.get("glossary", [])

    document = {
        "model": canonical.get("model", {}),
        "summary": {
            "entity_count": len(entities),
            "relationship_count": len(relationships),
            "index_count": len(indexes),
            "glossary_term_count": len(glossary),
            "rule_count": len(canonical.get("rules", [])),
        },
        "entities": entities,
        "relationships": relationships,
        "indexes": indexes,
        "glossary": glossary,
        "governance": canonical.get("governance", {}),
        "generated_by": "datalex generate metadata",
    }
    rendered = json.dumps(document, indent=2)

    if not args.out:
        print(rendered)
        return 0

    Path(args.out).write_text(rendered + "\n", encoding="utf-8")
    print(f"Wrote metadata export: {args.out}")
    return 0
1236
+
1237
+
1238
def cmd_import_sql(args: argparse.Namespace) -> int:
    """Import a SQL DDL file (``args.input``) as a YAML model.

    The imported model is checked against ``args.schema`` and is written to
    ``args.out`` (or printed) regardless of the outcome. Returns 1 when the
    checks report errors, otherwise 0.
    """
    source_text = Path(args.input).read_text(encoding="utf-8")
    imported = import_sql_ddl(
        ddl_text=source_text,
        model_name=args.model_name,
        domain=args.domain,
        owners=args.owner or ["data-team@example.com"],
    )

    findings = _combined_issues(imported, load_schema(args.schema))
    _print_issue_block("Imported model checks", findings)

    if args.out:
        _write_yaml(args.out, imported)
        print(f"Wrote imported YAML model: {args.out}")
    else:
        print(yaml.safe_dump(imported, sort_keys=False))

    if has_errors(findings):
        return 1
    return 0
1258
+
1259
+
1260
def cmd_import_dbml(args: argparse.Namespace) -> int:
    """Import a DBML file (``args.input``) as a YAML model.

    The imported model is checked against ``args.schema`` and is written to
    ``args.out`` (or printed) regardless of the outcome. Returns 1 when the
    checks report errors, otherwise 0.
    """
    source_text = Path(args.input).read_text(encoding="utf-8")
    imported = import_dbml(
        dbml_text=source_text,
        model_name=args.model_name,
        domain=args.domain,
        owners=args.owner or ["data-team@example.com"],
    )

    findings = _combined_issues(imported, load_schema(args.schema))
    _print_issue_block("Imported model checks", findings)

    if args.out:
        _write_yaml(args.out, imported)
        print(f"Wrote imported YAML model: {args.out}")
    else:
        print(yaml.safe_dump(imported, sort_keys=False))

    if has_errors(findings):
        return 1
    return 0
1280
+
1281
+
1282
def cmd_import_spark_schema(args: argparse.Namespace) -> int:
    """Import a Spark schema file (``args.input``) as a YAML model.

    ``args.table_name`` is passed through when present. The imported model is
    checked against ``args.schema`` and is written to ``args.out`` (or
    printed) regardless of the outcome. Returns 1 when the checks report
    errors, otherwise 0.
    """
    source_text = Path(args.input).read_text(encoding="utf-8")
    imported = import_spark_schema(
        schema_text=source_text,
        model_name=args.model_name,
        domain=args.domain,
        owners=args.owner or ["data-team@example.com"],
        table_name=getattr(args, "table_name", None),
    )

    findings = _combined_issues(imported, load_schema(args.schema))
    _print_issue_block("Imported model checks", findings)

    if args.out:
        _write_yaml(args.out, imported)
        print(f"Wrote imported YAML model: {args.out}")
    else:
        print(yaml.safe_dump(imported, sort_keys=False))

    if has_errors(findings):
        return 1
    return 0
1303
+
1304
+
1305
def cmd_import_dbt(args: argparse.Namespace) -> int:
    """Import a dbt schema.yml file (``args.input``) as a YAML model.

    The imported model is checked against ``args.schema`` and is written to
    ``args.out`` (or printed) regardless of the outcome. Returns 1 when the
    checks report errors, otherwise 0.
    """
    source_text = Path(args.input).read_text(encoding="utf-8")
    imported = import_dbt_schema_yml(
        schema_yml_text=source_text,
        model_name=args.model_name,
        domain=args.domain,
        owners=args.owner or ["data-team@example.com"],
    )

    findings = _combined_issues(imported, load_schema(args.schema))
    _print_issue_block("Imported model checks", findings)

    if args.out:
        _write_yaml(args.out, imported)
        print(f"Wrote imported YAML model: {args.out}")
    else:
        print(yaml.safe_dump(imported, sort_keys=False))

    if has_errors(findings):
        return 1
    return 0
1325
+
1326
+
1327
def cmd_dbt_sync(args: argparse.Namespace) -> int:
    """Merge DataLex model metadata into an existing dbt schema.yml (non-destructive).

    The merged YAML is written to ``args.out`` when given, otherwise back to
    ``args.dbt_schema``. Returns 1 if the dbt schema file does not exist,
    else 0.
    """
    model = load_yaml_model(args.model)
    schema_file = Path(args.dbt_schema)
    if not schema_file.exists():
        print(f"ERROR: dbt schema file not found: {schema_file}", file=sys.stderr)
        return 1

    merged = sync_dbt_schema_yml(model, schema_file.read_text(encoding="utf-8"))

    # Fall back to updating the dbt schema in place when no output path is set.
    target = schema_file
    if getattr(args, "out", None):
        target = Path(args.out)
    target.write_text(merged, encoding="utf-8")
    print(f"dbt schema synced: {target}")
    return 0
1340
+
1341
+
1342
def cmd_dbt_push(args: argparse.Namespace) -> int:
    """Push DataLex metadata into all schema.yml files found in a dbt project directory.

    Searches ``args.dbt_project`` recursively for ``schema.yml`` and
    ``schema.yaml``. A file that fails to sync is reported on stderr and
    skipped. Returns 1 when the directory is missing or contains no schema
    files, else 0.
    """
    model = load_yaml_model(args.model)
    project_root = Path(args.dbt_project)
    if not project_root.is_dir():
        print(f"ERROR: dbt project directory not found: {project_root}", file=sys.stderr)
        return 1

    schema_files = [*project_root.rglob("schema.yml"), *project_root.rglob("schema.yaml")]
    if not schema_files:
        print("No dbt schema.yml files found in project directory.", file=sys.stderr)
        return 1

    synced = 0
    for schema_file in sorted(schema_files):
        # Best effort: one unreadable or unparsable file must not stop the rest.
        try:
            merged = sync_dbt_schema_yml(model, schema_file.read_text(encoding="utf-8"))
            schema_file.write_text(merged, encoding="utf-8")
            print(f" synced: {schema_file}")
            synced += 1
        except Exception as exc:
            print(f" WARN: skipping {schema_file}: {exc}", file=sys.stderr)

    print(f"dbt push complete. Updated {synced} schema.yml file(s).")
    return 0
1365
+
1366
+
1367
def _build_connector_extra(args: argparse.Namespace) -> Dict[str, Any]:
    """Collect the optional connector settings that are set on ``args``.

    Only attributes that are present and truthy are copied; the result keeps
    the order odbc_driver, encrypt, trust_server_certificate, http_path.
    """
    option_names = ("odbc_driver", "encrypt", "trust_server_certificate", "http_path")
    collected: Dict[str, Any] = {}
    for option in option_names:
        value = getattr(args, option, "")
        if value:
            collected[option] = value
    return collected
1378
+
1379
+
1380
def cmd_pull(args: argparse.Namespace) -> int:
    """Pull a schema through a database connector and emit it as a YAML model.

    With ``args.test`` set, only the connection is tested and the result
    printed. Otherwise the pulled model is written to the resolved output
    path, or printed when no path was resolved. Returns 1 for an unknown
    connector, a failed driver check, a failed connection test, or an
    unresolvable output path; 0 otherwise.
    """
    connector_type = args.connector
    connector = get_connector(connector_type)
    if connector is None:
        print(f"Unknown connector: {connector_type}", file=sys.stderr)
        print(f"Available: {', '.join(c['type'] for c in list_connectors())}", file=sys.stderr)
        return 1

    driver_ok, driver_msg = connector.check_driver()
    if not driver_ok:
        print(f"Driver check failed: {driver_msg}", file=sys.stderr)
        return 1

    def _text(name: str) -> str:
        """Optional string attribute of ``args``; missing or falsy becomes ""."""
        return getattr(args, name, "") or ""

    host, port = _normalize_host_and_port(_text("host"), getattr(args, "port", 0) or 0)
    owner = getattr(args, "owner", None)

    config = ConnectorConfig(
        connector_type=connector_type,
        host=host,
        port=port,
        database=_text("database"),
        schema=_text("db_schema"),
        user=_text("user"),
        password=_text("password"),
        warehouse=_text("warehouse"),
        project=_text("project"),
        dataset=_text("dataset"),
        catalog=_text("catalog"),
        token=_text("token"),
        private_key_path=_text("private_key_path"),
        model_name=getattr(args, "model_name", "imported_model") or "imported_model",
        domain=getattr(args, "domain", "imported") or "imported",
        owners=[owner] if owner else None,
        tables=getattr(args, "tables", None),
        exclude_tables=getattr(args, "exclude_tables", None),
        extra=_build_connector_extra(args),
    )

    if getattr(args, "test", False):
        conn_ok, conn_msg = connector.test_connection(config)
        print(f"{'OK' if conn_ok else 'FAIL'}: {conn_msg}")
        return 0 if conn_ok else 1

    # The second element is the output path on success and the error text on failure.
    resolved_ok, output_path_or_error = _resolve_pull_output_path(args, config.model_name)
    if not resolved_ok:
        print(output_path_or_error, file=sys.stderr)
        return 1

    print(f"Pulling schema from {connector.display_name}...")
    result = connector.pull_schema(config)

    print(f"\n{result.summary()}")

    if result.warnings:
        for warning in result.warnings:
            print(f" [WARN] {warning}")

    if output_path_or_error:
        _write_yaml(output_path_or_error, result.model)
        print(f"\nWrote model: {output_path_or_error}")
    else:
        print("\n" + yaml.safe_dump(result.model, sort_keys=False))

    return 0
1446
+
1447
+
1448
def cmd_connectors(args: argparse.Namespace) -> int:
    """List the available database connectors.

    Prints JSON when ``args.output_json`` is set, otherwise a table followed
    by a usage hint. Always returns 0.
    """
    available = list_connectors()
    if getattr(args, "output_json", False):
        print(json.dumps(available, indent=2))
        return 0

    print("Available database connectors:\n")
    for entry in available:
        status = "installed" if entry["installed"] else "NOT INSTALLED"
        print(f" {entry['type']:12s} {entry['name']:30s} driver: {entry['driver']:25s} [{status}]")
    print(
        "\nUsage: datalex pull <connector> --host <host> --database <db> --user <user> "
        "--password <pass> [--out model.yaml | --project-dir ./models]"
    )
    return 0
1462
+
1463
+
1464
def _build_connector_config(args: argparse.Namespace) -> "ConnectorConfig":
    """Build a ``ConnectorConfig`` from the connection options on ``args``.

    Every optional string option falls back to "" when missing or falsy;
    host and port go through ``_normalize_host_and_port`` first.
    """

    def _text(name: str) -> str:
        """Optional string attribute of ``args``; missing or falsy becomes ""."""
        return getattr(args, name, "") or ""

    host, port = _normalize_host_and_port(_text("host"), getattr(args, "port", 0) or 0)
    extra = _build_connector_extra(args)

    return ConnectorConfig(
        connector_type=args.connector,
        host=host,
        port=port,
        database=_text("database"),
        schema=_text("db_schema"),
        user=_text("user"),
        password=_text("password"),
        warehouse=_text("warehouse"),
        project=_text("project"),
        dataset=_text("dataset"),
        catalog=_text("catalog"),
        token=_text("token"),
        private_key_path=_text("private_key_path"),
        extra=extra,
    )
1487
+
1488
+
1489
def cmd_schemas(args: argparse.Namespace) -> int:
    """List the schemas visible through ``args.connector``.

    Prints JSON when ``args.output_json`` is set, otherwise one line per
    schema with its table count. Returns 1 for an unknown connector or a
    failed driver check, else 0.
    """
    connector = get_connector(args.connector)
    if connector is None:
        print(f"Unknown connector: {args.connector}", file=sys.stderr)
        return 1

    driver_ok, driver_msg = connector.check_driver()
    if not driver_ok:
        print(f"Driver check failed: {driver_msg}", file=sys.stderr)
        return 1

    config = _build_connector_config(args)
    found = connector.list_schemas(config)

    if getattr(args, "output_json", False):
        print(json.dumps(found, indent=2))
        return 0

    print(f"Schemas in {connector.display_name} ({config.database or config.project or 'default'}):\n")
    for entry in found:
        print(f" {entry['name']:30s} {entry['table_count']:4d} tables")
    return 0
1509
+
1510
+
1511
def cmd_tables(args: argparse.Namespace) -> int:
    """List the tables visible through ``args.connector``.

    Prints JSON when ``args.output_json`` is set, otherwise a table with the
    name, type, column count and row count of each entry. Returns 1 for an
    unknown connector or a failed driver check, else 0.
    """
    connector = get_connector(args.connector)
    if connector is None:
        print(f"Unknown connector: {args.connector}", file=sys.stderr)
        return 1
    ok, msg = connector.check_driver()
    if not ok:
        print(f"Driver check failed: {msg}", file=sys.stderr)
        return 1

    config = _build_connector_config(args)
    tables = connector.list_tables(config)

    if getattr(args, "output_json", False):
        print(json.dumps(tables, indent=2))
    else:
        schema_label = config.schema or config.dataset or "default"
        print(f"Tables in {connector.display_name} / {schema_label}:\n")
        print(f" {'TABLE':30s} {'TYPE':8s} {'COLUMNS':>8s} {'ROWS':>12s}")
        print(f" {'-'*30} {'-'*8} {'-'*8} {'-'*12}")
        for t in tables:
            row_count = t.get("row_count")
            # Only a missing count becomes "-"; an empty table must show "0",
            # so the value is tested against None rather than for truthiness.
            rows = "-" if row_count is None else str(row_count)
            print(f" {t['name']:30s} {t['type']:8s} {t['column_count']:>8d} {rows:>12s}")
        print(f"\n Total: {len(tables)} tables")
    return 0
1536
+
1537
+
1538
def cmd_generate_docs(args: argparse.Namespace) -> int:
    """Render documentation for ``args.model`` in ``args.format``.

    Handles "html" and "markdown"; any other format produces no output. The
    result is written to ``args.out`` when set, otherwise printed. Always
    returns 0.
    """
    model = load_yaml_model(args.model)
    fmt = args.format
    to_file = bool(args.out)

    if fmt == "html" and to_file:
        write_html_docs(model, args.out, title=args.title)
        print(f"Wrote HTML docs: {args.out}")
    elif fmt == "html":
        print(generate_html_docs(model, title=args.title))
    elif fmt == "markdown" and to_file:
        write_markdown_docs(model, args.out, title=args.title)
        print(f"Wrote Markdown docs: {args.out}")
    elif fmt == "markdown":
        print(generate_markdown_docs(model, title=args.title))

    return 0
1556
+
1557
+
1558
def cmd_generate_changelog(args: argparse.Namespace) -> int:
    """Generate a changelog from the semantic diff of ``args.old`` and ``args.new``.

    Each model's ``model.version`` (or "" when absent) labels the two sides.
    The changelog is written to ``args.out`` when set, otherwise printed.
    Always returns 0.
    """
    before = load_yaml_model(args.old)
    after = load_yaml_model(args.new)
    diff = semantic_diff(before, after)

    versions = {
        "old_version": before.get("model", {}).get("version", ""),
        "new_version": after.get("model", {}).get("version", ""),
    }

    if not args.out:
        print(generate_changelog(diff, **versions))
        return 0

    write_changelog(diff, args.out, **versions)
    print(f"Wrote changelog: {args.out}")
    return 0
1573
+
1574
+
1575
def cmd_fmt(args: argparse.Namespace) -> int:
    """Re-emit ``args.model`` as YAML of its compiled canonical form.

    ``args.write`` overwrites the model file in place and takes precedence
    over ``args.out``; with neither set the YAML is printed. Always returns 0.
    """
    canonical = compile_model(load_yaml_model(args.model))
    formatted = yaml.safe_dump(canonical, sort_keys=False, default_flow_style=False, allow_unicode=True)

    if args.write:
        Path(args.model).write_text(formatted, encoding="utf-8")
        print(f"Formatted: {args.model}")
        return 0

    if args.out:
        Path(args.out).write_text(formatted, encoding="utf-8")
        print(f"Wrote formatted model: {args.out}")
        return 0

    print(formatted)
    return 0
1590
+
1591
+
1592
def cmd_stats(args: argparse.Namespace) -> int:
    """Print summary statistics for ``args.model``.

    Reports entity, field, relationship, index, glossary and rule counts,
    description coverage, deprecated fields, subject areas and tags. Output is
    JSON when ``args.output_json`` is set, otherwise text. Always returns 0.
    """
    model = load_yaml_model(args.model)
    entities = model.get("entities", [])
    relationships = model.get("relationships", [])
    indexes = model.get("indexes", [])
    glossary = model.get("glossary", [])
    rules = model.get("rules", [])

    # Flatten once; every field-level counter below walks the same list.
    all_fields = [field for entity in entities for field in entity.get("fields", [])]

    def _tally(predicate) -> int:
        """Number of fields for which ``predicate`` is truthy."""
        return sum(1 for field in all_fields if predicate(field))

    total_fields = len(all_fields)
    pk_count = _tally(lambda field: field.get("primary_key"))
    fk_count = _tally(lambda field: field.get("foreign_key"))
    # A field with no explicit "nullable" key is counted as nullable.
    nullable_count = _tally(lambda field: field.get("nullable", True))
    described_fields = _tally(lambda field: field.get("description"))
    deprecated_count = _tally(lambda field: field.get("deprecated"))

    entity_types = {}
    for entity in entities:
        kind = entity.get("type", "table")
        entity_types[kind] = entity_types.get(kind, 0) + 1

    subject_areas = {
        entity.get("subject_area") for entity in entities if entity.get("subject_area")
    }
    tags = {tag for entity in entities for tag in entity.get("tags", [])}

    if total_fields:
        desc_coverage = f"{described_fields}/{total_fields}"
        desc_pct = f"{described_fields / total_fields * 100:.0f}%"
    else:
        desc_coverage = "0/0"
        desc_pct = "0%"

    stats = {
        "model_name": model.get("model", {}).get("name", "unknown"),
        "version": model.get("model", {}).get("version", "unknown"),
        "entity_count": len(entities),
        "entity_types": entity_types,
        "total_fields": total_fields,
        "primary_keys": pk_count,
        "foreign_keys": fk_count,
        "nullable_fields": nullable_count,
        "relationship_count": len(relationships),
        "index_count": len(indexes),
        "glossary_terms": len(glossary),
        "rule_count": len(rules),
        "description_coverage": f"{desc_coverage} ({desc_pct})",
        "deprecated_fields": deprecated_count,
        "subject_areas": sorted(subject_areas),
        "tags": sorted(tags),
    }

    if args.output_json:
        print(json.dumps(stats, indent=2))
        return 0

    type_breakdown = ', '.join(f'{count} {kind}' for kind, count in entity_types.items())
    print(f"Model: {stats['model_name']} v{stats['version']}")
    print(f"Entities: {stats['entity_count']} ({type_breakdown})")
    print(f"Fields: {stats['total_fields']} (PK: {pk_count}, FK: {fk_count}, nullable: {nullable_count})")
    print(f"Relationships: {stats['relationship_count']}")
    print(f"Indexes: {stats['index_count']}")
    print(f"Glossary terms: {stats['glossary_terms']}")
    print(f"Rules: {stats['rule_count']}")
    print(f"Description coverage: {desc_coverage} ({desc_pct})")
    if deprecated_count:
        print(f"Deprecated fields: {deprecated_count}")
    if subject_areas:
        print(f"Subject areas: {', '.join(sorted(subject_areas))}")
    if tags:
        print(f"Tags: {', '.join(sorted(tags))}")

    return 0
1667
+
1668
+
1669
def cmd_completeness(args: argparse.Namespace) -> int:
    """Score every entity in a model against the single-source-of-truth dimensions.

    With ``args.output_json`` the report is printed as JSON and 0 is returned.
    Otherwise a text report is printed, listing missing items per entity
    unless ``args.summary`` is set. When ``args.min_score`` is given and any
    entity scores below it, those entities are listed and 1 is returned.
    """
    report = completeness_report(load_yaml_model(args.model))
    data = completeness_as_dict(report)

    if args.output_json:
        print(json.dumps(data, indent=2))
        return 0

    # Human-readable report
    bar_width = 20
    pass_mark = 80
    warn_mark = 60

    def _bar(score: int) -> str:
        """Fixed-width bar whose fill glyph depends on the score band."""
        filled = round(score / 100 * bar_width)
        if score >= pass_mark:
            glyph = "█"
        elif score >= warn_mark:
            glyph = "▓"
        else:
            glyph = "▒"
        return glyph * filled + "░" * (bar_width - filled)

    def _score_label(score: int) -> str:
        """Status word for a score."""
        if score == 100:
            return "COMPLETE"
        if score >= pass_mark:
            return "GOOD "
        return "PARTIAL " if score >= warn_mark else "GAPS "

    print(f"\nCompleteness report — {report.model_name}")
    print(f"Model score: {report.model_score}% "
          f"({report.fully_complete}/{report.total_entities} fully complete)\n")
    print(f" {'Entity':<30} {'Score':>5} {'':^{bar_width}} Status")
    print(f" {'-'*30} {'-----':>5} {'-'*bar_width} --------")

    show_missing = not args.summary
    for entity in report.entities:
        print(f" {entity.entity_name:<30} {entity.score:>4}% {_bar(entity.score)} {_score_label(entity.score)}")
        if entity.missing and show_missing:
            for gap in entity.missing:
                print(f" {'':30} ↳ missing: {gap}")

    if report.needs_attention:
        print(f"\n Needs attention (<60%): {', '.join(report.needs_attention)}")

    # Surface completeness as lint-style warnings when --min-score is set
    if args.min_score is not None:
        below = [entity for entity in report.entities if entity.score < args.min_score]
        if below:
            print(
                f"\n {len(below)} entity/entities below minimum score of {args.min_score}%:"
            )
            for entity in below:
                print(f" [{entity.score}%] {entity.entity_name}")
            return 1

    return 0
1732
+
1733
+
1734
def cmd_resolve(args: argparse.Namespace) -> int:
    """Resolve ``args.model`` and report the resulting model graph.

    Resolution issues are printed first. The graph summary follows as JSON
    when ``args.output_json`` is set, otherwise as text. Returns 1 if any
    issue has severity "error", else 0.
    """
    resolved = resolve_model(args.model, search_dirs=args.search_dir or [])

    if resolved.issues:
        for issue in resolved.issues:
            level = issue.severity.upper()
            print(f" [{level}] {issue.code}: {issue.message}")

    summary = resolved.to_graph_summary()

    if args.output_json:
        print(json.dumps(summary, indent=2))
    else:
        print(f"Root model: {summary['root_model']}")
        print(f"Models resolved: {summary['model_count']}")
        print(f"Total entities: {summary['total_entities']}")
        for entry in summary["models"]:
            # The root model is flagged with an asterisk.
            marker = "*" if entry["is_root"] else " "
            alias = ""
            if entry.get("alias"):
                alias = f" (alias: {entry.get('alias', '')})"
            print(f" {marker} {entry['name']}{alias}: {entry['entity_count']} entities [{', '.join(entry['entities'])}]")
        cross_links = summary["cross_model_relationships"]
        if cross_links:
            print(f"Cross-model relationships: {len(cross_links)}")
            for link in cross_links:
                print(f" {link['from_model']}.{link['from']} -> {link['to_model']}.{link['to']} ({link['cardinality']})")

    for issue in resolved.issues:
        if issue.severity == "error":
            return 1
    return 0
1763
+
1764
+
1765
def cmd_diff_all(args: argparse.Namespace) -> int:
    """Diff two project versions (``args.old`` vs ``args.new``) and report the result.

    Prints JSON when ``args.output_json`` is set, otherwise a text summary.
    Returns 2 when the diff has breaking changes and ``args.allow_breaking``
    is not set, else 0.
    """
    diff = project_diff(args.old, args.new)

    if args.output_json:
        print(json.dumps(diff, indent=2))
    else:
        totals = diff["summary"]
        print(f"Project diff: {args.old} -> {args.new}")
        print(f" Models: +{totals['added_models']} -{totals['removed_models']} changed:{totals['changed_models']} unchanged:{totals['unchanged_models']}")
        if diff["added_models"]:
            print(f" Added: {', '.join(diff['added_models'])}")
        if diff["removed_models"]:
            print(f" Removed: {', '.join(diff['removed_models'])}")
        if diff["changed_models"]:
            print(f" Changed: {', '.join(diff['changed_models'])}")
            for model_name, model_diff in diff["model_diffs"].items():
                per_model = model_diff["summary"]
                print(f" [{model_name}] entities +{per_model['added_entities']} -{per_model['removed_entities']} changed:{per_model['changed_entities']}")
                print(f" [{model_name}] metrics +{per_model['added_metrics']} -{per_model['removed_metrics']} changed:{per_model['changed_metrics']}")
        print(f" Breaking changes: {totals['breaking_change_count']}")
        if diff["breaking_changes"]:
            for breaking in diff["breaking_changes"]:
                print(f" - {breaking}")

    blocked = diff["has_breaking_changes"] and not args.allow_breaking
    if blocked:
        print("Project diff failed: breaking changes detected. Use --allow-breaking to bypass.")
        return 2

    return 0
1794
+
1795
+
1796
def cmd_transform(args: argparse.Namespace) -> int:
    """Transform ``args.model`` to a logical or physical model.

    The target is "logical" when ``args.transform_command`` is
    "conceptual-to-logical", otherwise "physical". Both the input and the
    transformed model are checked; the first to report errors has its issues
    printed and 1 is returned. On success the result is printed or written to
    ``args.out`` and 0 is returned.
    """
    schema = load_schema(args.schema)
    model, findings = _validate_model_file(args.model, schema)
    if has_errors(findings):
        _print_issues(findings)
        return 1

    if args.transform_command == "conceptual-to-logical":
        target_kind = "logical"
    else:
        target_kind = "physical"

    dialect = getattr(args, "dialect", "postgres")
    transformed = transform_model(model, target_kind=target_kind, dialect=dialect)

    post_findings = _combined_issues(transformed, schema)
    if has_errors(post_findings):
        _print_issues(post_findings)
        return 1

    _print_or_write_yaml(transformed, getattr(args, "out", "") or "")
    return 0
1812
+
1813
+
1814
def cmd_standards_check(args: argparse.Namespace) -> int:
    """Validate a model file and append standards findings to the issues.

    Issues are printed as JSON when ``args.output_json`` is set, otherwise
    through ``_print_issues``.

    Returns:
        1 if any issue counts as an error per ``has_errors``, otherwise 0.
    """
    schema = load_schema(args.schema)
    model, found = _validate_model_file(args.model, schema)
    found.extend(standards_issues(model))

    if not args.output_json:
        _print_issues(found)
    else:
        payload = {"issues": _issues_as_json(found)}
        print(json.dumps(payload, indent=2))

    if has_errors(found):
        return 1
    return 0
1824
+
1825
+
1826
def cmd_standards_fix(args: argparse.Namespace) -> int:
    """Apply the supported standards autofixes to a model.

    With ``args.write`` the fixed model is written back to ``args.model``;
    otherwise it goes to ``args.out`` or, when that is empty, is handed to
    ``_print_or_write_yaml`` with an empty path. When neither option is
    given, a commented list of the applied changes is printed first.

    Returns:
        Always 0.
    """
    model = load_yaml_model(args.model)
    fixed, changes = apply_standards_fixes(model)

    preview_only = not (args.write or args.out)
    if preview_only:
        print("# Applied supported standards autofixes")
        for change in changes:
            print(f"# - {change}")
        print("")

    if args.write:
        destination = args.model
    else:
        destination = args.out or ""
    _print_or_write_yaml(fixed, destination)
    return 0
1838
+
1839
+
1840
def cmd_sync_compare(args: argparse.Namespace) -> int:
    """Print the semantic diff between the current and candidate models as JSON.

    Returns:
        2 when the diff has breaking changes and ``args.allow_breaking`` is
        not set, otherwise 0.
    """
    current = load_yaml_model(args.current)
    candidate = load_yaml_model(args.candidate)
    diff = semantic_diff(current, candidate)
    print(json.dumps(diff, indent=2))

    if diff["has_breaking_changes"] and not args.allow_breaking:
        return 2
    return 0
1846
+
1847
+
1848
def cmd_sync_merge(args: argparse.Namespace) -> int:
    """Merge the candidate model into the current one and emit the result.

    The merge is delegated to ``merge_models_preserving_docs``; the merged
    model is printed or written to ``args.out`` when that is set.

    Returns:
        Always 0.
    """
    current = load_yaml_model(args.current)
    candidate = load_yaml_model(args.candidate)
    merged = merge_models_preserving_docs(current, candidate)

    destination = getattr(args, "out", "") or ""
    _print_or_write_yaml(merged, destination)
    return 0
1854
+
1855
+
1856
def cmd_sync_pull(args: argparse.Namespace) -> int:
    """Run ``cmd_pull`` with the same arguments and return its exit code."""
    exit_code = cmd_pull(args)
    return exit_code
1858
+
1859
+
1860
def cmd_resolve_project(args: argparse.Namespace) -> int:
    """Resolve every model under ``args.directory`` and report per-model issues.

    Output is JSON when ``args.output_json`` is set, otherwise a text listing.

    Returns:
        1 if any issue (of any severity) was reported, otherwise 0.
    """
    search_dirs = args.search_dir or []
    results = resolve_project(args.directory, search_dirs=search_dirs)

    all_models = []
    total_issues = 0

    for path, resolved in sorted(results.items()):
        model_name = resolved.root_model.get("model", {}).get("name", "unknown")
        imported = list(resolved.imported_models.keys())
        entity_names = [entity.get("name", "") for entity in resolved.unified_entities()]
        issue_count = len(resolved.issues)
        total_issues += issue_count

        issue_rows = [
            {"severity": issue.severity, "code": issue.code, "message": issue.message}
            for issue in resolved.issues
        ]
        all_models.append({
            "name": model_name,
            "file": path,
            "imports": imported,
            "entity_count": len(entity_names),
            "entities": entity_names,
            "issue_count": issue_count,
            "issues": issue_rows,
        })

    if args.output_json:
        payload = {"models": all_models, "total_issues": total_issues}
        print(json.dumps(payload, indent=2))
    else:
        print(f"Project: {args.directory}")
        print(f"Models found: {len(all_models)}")
        for m in all_models:
            if m["imports"]:
                imp_str = f" (imports: {', '.join(m['imports'])})"
            else:
                imp_str = ""
            if m["issue_count"] == 0:
                status = "OK"
            else:
                status = f"{m['issue_count']} issues"
            print(f" {m['name']}: {m['entity_count']} entities{imp_str} [{status}]")
            for iss in m["issues"]:
                print(f" [{iss['severity'].upper()}] {iss['code']}: {iss['message']}")
        print(f"Total issues: {total_issues}")

    return int(total_issues > 0)
1901
+
1902
+
1903
def cmd_schema(args: argparse.Namespace) -> int:
    """Print the schema loaded from ``args.schema`` as indented JSON.

    Returns:
        Always 0.
    """
    rendered = json.dumps(load_schema(args.schema), indent=2)
    print(rendered)
    return 0
1907
+
1908
+
1909
def cmd_policy_schema(args: argparse.Namespace) -> int:
    """Print the schema loaded from ``args.policy_schema`` as indented JSON.

    Returns:
        Always 0.
    """
    rendered = json.dumps(load_schema(args.policy_schema), indent=2)
    print(rendered)
    return 0
1913
+
1914
+
1915
def cmd_doctor(args: argparse.Namespace) -> int:
    """Run project diagnostics for ``args.path`` (default ``"."``) and print them.

    Output is JSON when ``args.output_json`` is truthy, otherwise the text
    produced by ``format_diagnostics``.

    Returns:
        1 if any diagnostic result has status ``"error"``, otherwise 0.
    """
    results = run_diagnostics(getattr(args, "path", "."))

    if getattr(args, "output_json", False):
        rendered = json.dumps(diagnostics_as_json(results), indent=2)
    else:
        rendered = format_diagnostics(results)
    print(rendered)

    has_error = any(result.status == "error" for result in results)
    return 1 if has_error else 0
1926
+
1927
+
1928
def cmd_migrate(args: argparse.Namespace) -> int:
    """Generate migration SQL from an old and a new model file.

    The SQL is written to ``args.out`` when given, otherwise printed.
    The dialect defaults to ``"postgres"`` when ``args`` has no ``dialect``.

    Returns:
        Always 0.
    """
    old_model = load_yaml_model(args.old)
    new_model = load_yaml_model(args.new)
    dialect = getattr(args, "dialect", "postgres")

    if not args.out:
        print(generate_migration(old_model, new_model, dialect=dialect))
        return 0

    write_migration(old_model, new_model, args.out, dialect=dialect)
    print(f"Wrote migration SQL: {args.out}")
    return 0
1941
+
1942
+
1943
+ def _split_sql_statements(sql_text: str) -> List[str]:
1944
+ statements: List[str] = []
1945
+ buf: List[str] = []
1946
+ in_single = False
1947
+ in_double = False
1948
+ in_line_comment = False
1949
+ in_block_comment = False
1950
+ i = 0
1951
+
1952
+ while i < len(sql_text):
1953
+ ch = sql_text[i]
1954
+ nxt = sql_text[i + 1] if i + 1 < len(sql_text) else ""
1955
+
1956
+ if in_line_comment:
1957
+ if ch == "\n":
1958
+ in_line_comment = False
1959
+ buf.append(ch)
1960
+ i += 1
1961
+ continue
1962
+
1963
+ if in_block_comment:
1964
+ if ch == "*" and nxt == "/":
1965
+ in_block_comment = False
1966
+ i += 2
1967
+ continue
1968
+ i += 1
1969
+ continue
1970
+
1971
+ if not in_single and not in_double and ch == "-" and nxt == "-":
1972
+ in_line_comment = True
1973
+ i += 2
1974
+ continue
1975
+
1976
+ if not in_single and not in_double and ch == "/" and nxt == "*":
1977
+ in_block_comment = True
1978
+ i += 2
1979
+ continue
1980
+
1981
+ if ch == "'" and not in_double:
1982
+ if in_single and nxt == "'":
1983
+ buf.append(ch)
1984
+ buf.append(nxt)
1985
+ i += 2
1986
+ continue
1987
+ in_single = not in_single
1988
+ buf.append(ch)
1989
+ i += 1
1990
+ continue
1991
+
1992
+ if ch == '"' and not in_single:
1993
+ in_double = not in_double
1994
+ buf.append(ch)
1995
+ i += 1
1996
+ continue
1997
+
1998
+ if ch == ";" and not in_single and not in_double:
1999
+ stmt = "".join(buf).strip()
2000
+ if stmt:
2001
+ statements.append(stmt)
2002
+ buf = []
2003
+ i += 1
2004
+ continue
2005
+
2006
+ buf.append(ch)
2007
+ i += 1
2008
+
2009
+ tail = "".join(buf).strip()
2010
+ if tail:
2011
+ statements.append(tail)
2012
+ return statements
2013
+
2014
+
2015
+ def _escape_sql_string(value: str) -> str:
2016
+ return value.replace("'", "''")
2017
+
2018
+
2019
+ def _sql_checksum(sql_text: str) -> str:
2020
+ return hashlib.sha256(sql_text.encode("utf-8")).hexdigest()
2021
+
2022
+
2023
+ def _default_migration_name() -> str:
2024
+ return f"migration_{time.strftime('%Y%m%d%H%M%S', time.gmtime())}"
2025
+
2026
+
2027
+ def _preview_sql(statement: str, max_len: int = 180) -> str:
2028
+ flat = " ".join(statement.strip().split())
2029
+ return flat if len(flat) <= max_len else f"{flat[: max_len - 3]}..."
2030
+
2031
+
2032
+ def _detect_destructive_statements(statements: List[str]) -> List[Dict[str, Any]]:
2033
+ checks = [
2034
+ ("DROP TABLE", re.compile(r"\bDROP\s+TABLE\b", re.IGNORECASE)),
2035
+ ("DROP VIEW", re.compile(r"\bDROP\s+VIEW\b", re.IGNORECASE)),
2036
+ ("DROP SCHEMA", re.compile(r"\bDROP\s+SCHEMA\b", re.IGNORECASE)),
2037
+ ("DROP DATABASE", re.compile(r"\bDROP\s+DATABASE\b", re.IGNORECASE)),
2038
+ ("TRUNCATE TABLE", re.compile(r"\bTRUNCATE\s+TABLE\b", re.IGNORECASE)),
2039
+ ("ALTER TABLE DROP COLUMN", re.compile(r"\bALTER\s+TABLE\b[\s\S]*\bDROP\s+COLUMN\b", re.IGNORECASE)),
2040
+ ]
2041
+ findings: List[Dict[str, Any]] = []
2042
+ for idx, statement in enumerate(statements, start=1):
2043
+ for check_name, pattern in checks:
2044
+ if pattern.search(statement):
2045
+ findings.append({
2046
+ "statement_index": idx,
2047
+ "kind": check_name,
2048
+ "preview": _preview_sql(statement),
2049
+ })
2050
+ break
2051
+ return findings
2052
+
2053
+
2054
+ def _write_apply_report(path: str, payload: Dict[str, Any]) -> None:
2055
+ Path(path).write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
2056
+
2057
+
2058
class ApplyExecutionError(RuntimeError):
    """Raised when one migration statement fails while being applied.

    The connector name, the 1-based statement index, the statement text and
    the underlying exception are kept as attributes; the exception message
    includes a shortened preview of the statement.
    """

    def __init__(self, connector: str, statement_index: int, statement: str, error: Exception):
        summary = _preview_sql(statement)
        super().__init__(
            f"{connector} apply failed at statement #{statement_index}: "
            f"{summary} ({error})"
        )
        self.connector = connector
        self.statement_index = statement_index
        self.statement = statement
        self.error = error
2069
+
2070
+
2071
def _apply_snowflake(config: ConnectorConfig, statements: List[str], migration_name: str, checksum: str, ledger_table: str, skip_ledger: bool) -> None:
    """Run *statements* on Snowflake and record the migration in the ledger.

    Authenticates with a private key when ``config.private_key_path`` is set
    (using ``config.password`` as the optional passphrase), otherwise with
    the password. Unless *skip_ledger* is true, the ledger table is created
    if missing and one ``success`` row is inserted after all statements ran.

    Raises:
        ApplyExecutionError: When a migration statement fails; carries the
            1-based index and text of that statement.
    """
    import snowflake.connector
    from datalex_core.connectors.snowflake import _load_private_key

    connect_args: Dict[str, Any] = {
        "account": config.host,
        "user": config.user,
        "warehouse": config.warehouse,
        "database": config.database,
        "schema": config.schema or "PUBLIC",
    }
    if not config.private_key_path:
        connect_args["password"] = config.password
    else:
        passphrase = config.password or None
        connect_args["private_key"] = _load_private_key(config.private_key_path, passphrase)

    connection = snowflake.connector.connect(**connect_args)
    try:
        cursor = connection.cursor()
        try:
            if config.warehouse:
                # Best effort: a failure to resume the warehouse is ignored.
                try:
                    cursor.execute(f"ALTER WAREHOUSE IF EXISTS {config.warehouse} RESUME IF SUSPENDED")
                except Exception:
                    pass

            for position, statement in enumerate(statements, start=1):
                try:
                    cursor.execute(statement)
                except Exception as exc:
                    raise ApplyExecutionError("snowflake", position, statement, exc) from exc

            if skip_ledger:
                return

            schema_name = (config.schema or "PUBLIC").upper()
            ledger_ref = f'"{schema_name}"."{ledger_table}"'
            cursor.execute(
                f"CREATE TABLE IF NOT EXISTS {ledger_ref} ("
                "migration_name VARCHAR, checksum VARCHAR, statement_count NUMBER, "
                "status VARCHAR, applied_at TIMESTAMP_NTZ)"
            )
            cursor.execute(
                f"INSERT INTO {ledger_ref} "
                "(migration_name, checksum, statement_count, status, applied_at) VALUES "
                f"('{_escape_sql_string(migration_name)}', '{checksum}', {len(statements)}, 'success', CURRENT_TIMESTAMP())"
            )
        finally:
            cursor.close()
    finally:
        connection.close()
2123
+
2124
+
2125
def _apply_databricks(config: ConnectorConfig, statements: List[str], migration_name: str, checksum: str, ledger_table: str, skip_ledger: bool) -> None:
    """Run *statements* on Databricks SQL and record the migration in the ledger.

    Connects with ``config.host``, the ``http_path`` entry of ``config.extra``
    and ``config.token``. Unless *skip_ledger* is true, the ledger table
    (catalog default ``main``, schema default ``default``) is created if
    missing and one ``success`` row is inserted after all statements ran.

    Raises:
        ApplyExecutionError: When a migration statement fails; carries the
            1-based index and text of that statement.
    """
    from databricks import sql

    connection = sql.connect(
        server_hostname=config.host,
        http_path=config.extra.get("http_path", ""),
        access_token=config.token,
    )
    try:
        cursor = connection.cursor()
        try:
            for position, statement in enumerate(statements, start=1):
                try:
                    cursor.execute(statement)
                except Exception as exc:
                    raise ApplyExecutionError("databricks", position, statement, exc) from exc

            if skip_ledger:
                return

            catalog = config.catalog or "main"
            schema_name = config.schema or "default"
            ledger_ref = f"`{catalog}`.`{schema_name}`.`{ledger_table}`"
            create_sql = (
                f"CREATE TABLE IF NOT EXISTS {ledger_ref} ("
                "migration_name STRING, checksum STRING, statement_count INT, status STRING, applied_at TIMESTAMP)"
            )
            cursor.execute(create_sql)
            insert_sql = (
                f"INSERT INTO {ledger_ref} (migration_name, checksum, statement_count, status, applied_at) VALUES ("
                f"'{_escape_sql_string(migration_name)}', '{checksum}', {len(statements)}, 'success', current_timestamp())"
            )
            cursor.execute(insert_sql)
        finally:
            cursor.close()
    finally:
        connection.close()
2158
+
2159
+
2160
+ def _apply_bigquery(config: ConnectorConfig, statements: List[str], migration_name: str, checksum: str, ledger_table: str, skip_ledger: bool) -> None:
2161
+ from google.cloud import bigquery
2162
+
2163
+ client = bigquery.Client(project=config.project)
2164
+ for idx, stmt in enumerate(statements, start=1):
2165
+ try:
2166
+ client.query(stmt).result()
2167
+ except Exception as e:
2168
+ raise ApplyExecutionError("bigquery", idx, stmt, e) from e
2169
+
2170
+ if not skip_ledger:
2171
+ dataset = config.dataset
2172
+ if not dataset:
2173
+ raise ValueError("--dataset is required for BigQuery migration ledger")
2174
+ qualified = f"`{config.project}.{dataset}.{ledger_table}`"
2175
+ client.query(
2176
+ f"CREATE TABLE IF NOT EXISTS {qualified} ("
2177
+ "migration_name STRING, checksum STRING, statement_count INT64, status STRING, applied_at TIMESTAMP)"
2178
+ ).result()
2179
+ client.query(
2180
+ f"INSERT INTO {qualified} (migration_name, checksum, statement_count, status, applied_at) VALUES ("
2181
+ f"'{_escape_sql_string(migration_name)}', '{checksum}', {len(statements)}, 'success', CURRENT_TIMESTAMP())"
2182
+ ).result()
2183
+
2184
+
2185
def _finish_apply_report(
    args: argparse.Namespace,
    report: Dict[str, Any],
    started_ts: float,
    status: str,
    **extra: Any,
) -> None:
    """Stamp the final status and timing onto *report* and persist it if requested.

    ``extra`` entries (for example ``error``) are added before the timing
    fields. The report is written with ``_write_apply_report`` only when
    ``args.report_json`` is set.
    """
    finished_ts = time.time()
    report["status"] = status
    report.update(extra)
    report["finished_at_epoch"] = finished_ts
    report["finished_at_utc"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(finished_ts))
    report["duration_ms"] = int((finished_ts - started_ts) * 1000)
    if getattr(args, "report_json", ""):
        _write_apply_report(args.report_json, report)


def cmd_apply(args: argparse.Namespace) -> int:
    """Apply a migration to Snowflake, Databricks, or BigQuery.

    The SQL comes either from ``args.sql_file`` or from a generated migration
    between ``args.old`` and ``args.new`` (both models are validated and,
    unless ``skip_policy_check`` is set, the new model is policy-checked).
    Destructive statements block the run unless ``allow_destructive`` is set.
    With ``dry_run`` nothing is executed and the plan is printed instead.
    A report dict is built for every run that reaches execution or dry-run
    and is written to ``args.report_json`` when given.

    Returns:
        0 on success or dry-run, 1 on any validation, configuration, driver,
        or execution failure.
    """
    connector_type = args.connector
    dialect = (getattr(args, "dialect", "") or connector_type).lower()
    started_ts = time.time()
    mode = "sql_file" if args.sql_file else "model_diff"
    policy_results: List[Dict[str, str]] = []
    output_json = getattr(args, "output_json", False)
    supported = {"snowflake", "databricks", "bigquery"}

    if connector_type not in supported:
        print("Apply currently supports only snowflake, databricks, and bigquery.", file=sys.stderr)
        return 1

    if dialect not in supported:
        print(f"Unsupported apply dialect: {dialect}", file=sys.stderr)
        return 1

    if args.sql_file and (args.old or args.new):
        print("Use either --sql-file or --old/--new, not both.", file=sys.stderr)
        return 1

    # Checked before the generic "provide input" message; in the opposite
    # order this more specific diagnostic could never be reached.
    if bool(args.old) != bool(args.new):
        print("Both --old and --new are required together.", file=sys.stderr)
        return 1

    if not args.sql_file and not (args.old and args.new):
        print("Provide --sql-file or both --old and --new.", file=sys.stderr)
        return 1

    if args.sql_file:
        sql_text = Path(args.sql_file).read_text(encoding="utf-8")
    else:
        schema = load_schema(args.model_schema)
        old_model, old_issues = _validate_model_file(args.old, schema)
        new_model, new_issues = _validate_model_file(args.new, schema)
        _print_issue_block(f"Old model ({args.old})", old_issues)
        _print_issue_block(f"New model ({args.new})", new_issues)
        combined_issues = list(old_issues) + list(new_issues)
        if has_errors(combined_issues):
            print("Apply failed: validation errors detected.", file=sys.stderr)
            return 1
        if not getattr(args, "skip_policy_check", False):
            policy_pack = load_policy_pack_with_inheritance(args.policy_pack)
            evaluated = policy_issues(new_model, policy_pack)
            _print_issue_block(f"Policy evaluation ({args.policy_pack})", evaluated)
            policy_results = _issues_as_json(evaluated)
            if has_errors(evaluated):
                print("Apply failed: policy check failed.", file=sys.stderr)
                return 1
        sql_text = generate_migration(old_model, new_model, dialect=dialect)

    statements = _split_sql_statements(sql_text)
    if not statements:
        print("No executable SQL statements found.", file=sys.stderr)
        return 1

    migration_name = args.migration_name or _default_migration_name()
    # The checksum covers the full script text, comments included.
    checksum = _sql_checksum(sql_text)
    destructive_findings = _detect_destructive_statements(statements)

    if destructive_findings and not getattr(args, "allow_destructive", False):
        print(
            "Apply blocked: destructive SQL detected. Re-run with --allow-destructive if this is intentional.",
            file=sys.stderr,
        )
        for finding in destructive_findings[:5]:
            print(
                f" - #{finding['statement_index']} {finding['kind']}: {finding['preview']}",
                file=sys.stderr,
            )
        if len(destructive_findings) > 5:
            print(f" ... and {len(destructive_findings) - 5} more statement(s).", file=sys.stderr)
        return 1

    if getattr(args, "write_sql", ""):
        Path(args.write_sql).write_text(sql_text.strip() + "\n", encoding="utf-8")

    report: Dict[str, Any] = {
        "connector": connector_type,
        "dialect": dialect,
        "mode": mode,
        "status": "pending",
        "migration_name": migration_name,
        "checksum": checksum,
        "statement_count": len(statements),
        "destructive_statement_count": len(destructive_findings),
        "destructive_statements": destructive_findings,
        "policy_checked": mode == "model_diff" and not getattr(args, "skip_policy_check", False),
        "policy_results": policy_results,
        "skip_ledger": bool(args.skip_ledger),
        "ledger_table": args.ledger_table,
        "started_at_epoch": started_ts,
        "started_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(started_ts)),
    }

    if getattr(args, "dry_run", False):
        _finish_apply_report(args, report, started_ts, "dry_run")
        if output_json:
            print(json.dumps(report, indent=2))
        else:
            print(f"DRY RUN: {len(statements)} statements for {connector_type}")
            print(f"Migration: {migration_name}")
            print(f"Checksum: {checksum}")
            if destructive_findings:
                print(f"Destructive statements: {len(destructive_findings)} (allowed)")
            print("\n" + sql_text.strip() + "\n")
        return 0

    # Connection settings are only required for a real run, not a dry run.
    if connector_type == "snowflake" and (not getattr(args, "host", "") or not getattr(args, "user", "") or not getattr(args, "database", "")):
        print("Snowflake apply requires --host, --user, and --database.", file=sys.stderr)
        return 1
    if connector_type == "databricks" and (not getattr(args, "host", "") or not getattr(args, "token", "") or not getattr(args, "http_path", "")):
        print("Databricks apply requires --host, --token, and --http-path.", file=sys.stderr)
        return 1
    if connector_type == "bigquery" and (not getattr(args, "project", "") or not getattr(args, "dataset", "")):
        print("BigQuery apply requires --project and --dataset.", file=sys.stderr)
        return 1

    connector = get_connector(connector_type)
    if connector is None:
        print(f"Unknown connector: {connector_type}", file=sys.stderr)
        return 1

    ok, msg = connector.check_driver()
    if not ok:
        print(f"Driver check failed: {msg}", file=sys.stderr)
        return 1

    config = _build_connector_config(args)
    try:
        if connector_type == "snowflake":
            _apply_snowflake(config, statements, migration_name, checksum, args.ledger_table, args.skip_ledger)
        elif connector_type == "databricks":
            _apply_databricks(config, statements, migration_name, checksum, args.ledger_table, args.skip_ledger)
        elif connector_type == "bigquery":
            _apply_bigquery(config, statements, migration_name, checksum, args.ledger_table, args.skip_ledger)
    except ApplyExecutionError as e:
        # A specific statement failed: record which one in the report.
        _finish_apply_report(
            args,
            report,
            started_ts,
            "failed",
            error=str(e),
            failed_statement_index=e.statement_index,
            failed_statement_preview=_preview_sql(e.statement),
        )
        if output_json:
            print(json.dumps(report, indent=2))
        else:
            print(str(e), file=sys.stderr)
        return 1
    except Exception as e:
        # Any other failure (connection, ledger write, ...) is reported
        # without statement details.
        _finish_apply_report(args, report, started_ts, "failed", error=str(e))
        if output_json:
            print(json.dumps(report, indent=2))
        else:
            print(f"Apply failed: {e}", file=sys.stderr)
        return 1

    _finish_apply_report(args, report, started_ts, "success")

    if output_json:
        print(json.dumps(report, indent=2))
    else:
        print(f"Applied migration '{migration_name}' ({len(statements)} statements) to {connector_type}.")
        if not args.skip_ledger:
            print(f"Ledger table: {args.ledger_table}")
    return 0
2371
+
2372
+
2373
def cmd_completion(args: argparse.Namespace) -> int:
    """Print the completion script for ``args.shell`` (bash, zsh, or fish).

    Returns:
        0 after printing the script, or 1 (with a message on stderr) when
        the shell is not supported.
    """
    shell = args.shell
    if shell not in ("bash", "zsh", "fish"):
        print(f"Unsupported shell: {shell}", file=sys.stderr)
        return 1

    if shell == "bash":
        script = generate_bash_completion()
    elif shell == "zsh":
        script = generate_zsh_completion()
    else:
        script = generate_fish_completion()
    print(script)
    return 0
2385
+
2386
+
2387
def cmd_watch(args: argparse.Namespace) -> int:
    """Poll model files under the current directory and re-validate on change.

    Every ``interval`` seconds (default 2) the files matching ``args.glob``
    (default ``**/*.model.yaml``) are scanned; a file whose modification time
    is new or different since the previous scan is loaded and checked with
    the schema and lint rules. On the first scan every matching file counts
    as changed. Files located inside a ``.git``, ``node_modules`` or
    ``.venv`` directory are ignored.

    Returns:
        0 once the loop is stopped with Ctrl+C.
    """
    schema_path = getattr(args, "schema", None) or _default_schema_path()
    schema = load_schema(schema_path)
    watch_glob = getattr(args, "glob", "**/*.model.yaml")
    interval = getattr(args, "interval", 2)
    root = Path(".").resolve()
    skipped_dirs = {".git", "node_modules", ".venv"}

    print(f"Watching for changes: {watch_glob} (every {interval}s)")
    print("Press Ctrl+C to stop.\n")

    mtimes: Dict[str, float] = {}

    try:
        while True:
            current_files: Dict[str, float] = {}
            for path in sorted(root.glob(watch_glob)):
                # Compare whole directory names; a substring test on the path
                # would also skip e.g. ".github/" or "x.gitops.model.yaml".
                parent_dirs = path.relative_to(root).parts[:-1]
                if any(part in skipped_dirs for part in parent_dirs):
                    continue
                try:
                    current_files[str(path)] = path.stat().st_mtime
                except OSError:
                    # File vanished or is unreadable between glob and stat.
                    continue

            changed: List[str] = [
                fpath
                for fpath, mtime in current_files.items()
                if mtimes.get(fpath) != mtime
            ]

            mtimes = current_files

            for fpath in changed:
                rel = str(Path(fpath).relative_to(root))
                print(f"\n--- Changed: {rel} ---")
                try:
                    model = load_yaml_model(fpath)
                    s_issues = schema_issues(model, schema)
                    l_issues = lint_issues(model)
                    all_issues = s_issues + l_issues

                    if all_issues:
                        for iss in all_issues:
                            sev = iss.severity.upper()
                            print(f" [{sev}] {iss.code}: {iss.message}")
                        error_count = sum(1 for i in all_issues if i.severity == "error")
                        warn_count = sum(1 for i in all_issues if i.severity == "warn")
                        print(f" Result: {error_count} error(s), {warn_count} warning(s)")
                    else:
                        print(" \u2713 Valid")
                except Exception as exc:
                    # Keep watching even if one file cannot be loaded/checked.
                    print(f" [ERROR] {exc}")

            time.sleep(interval)
    except KeyboardInterrupt:
        print("\nWatch stopped.")
    return 0
2445
+
2446
+
2447
+ def build_parser() -> argparse.ArgumentParser:
2448
+ parser = argparse.ArgumentParser(prog="datalex", description="DataLex CLI")
2449
+ sub = parser.add_subparsers(dest="command", required=True)
2450
+
2451
+ init_parser = sub.add_parser("init", help="Initialize a new workspace")
2452
+ init_parser.add_argument("--path", default=".", help="Workspace path")
2453
+ init_parser.add_argument(
2454
+ "--template",
2455
+ choices=["single", "multi-model", "end-to-end"],
2456
+ default="single",
2457
+ help="Starter template to scaffold (default: single).",
2458
+ )
2459
+ init_parser.add_argument(
2460
+ "--multi-model",
2461
+ action="store_true",
2462
+ help="Deprecated alias for --template multi-model.",
2463
+ )
2464
+ init_parser.set_defaults(func=cmd_init)
2465
+
2466
+ validate_parser = sub.add_parser("validate", help="Validate model with schema + semantic rules")
2467
+ validate_parser.add_argument("model", help="Path to model YAML")
2468
+ validate_parser.add_argument("--schema", default=_default_schema_path(), help="Path to JSON schema")
2469
+ validate_parser.set_defaults(func=cmd_validate)
2470
+
2471
+ lint_parser = sub.add_parser("lint", help="Run semantic lint checks")
2472
+ lint_parser.add_argument("model", help="Path to model YAML")
2473
+ lint_parser.set_defaults(func=cmd_lint)
2474
+
2475
+ compile_parser = sub.add_parser("compile", help="Compile model to canonical JSON")
2476
+ compile_parser.add_argument("model", help="Path to model YAML")
2477
+ compile_parser.add_argument("--schema", default=_default_schema_path(), help="Path to JSON schema")
2478
+ compile_parser.add_argument("--out", help="Output file for canonical JSON")
2479
+ compile_parser.set_defaults(func=cmd_compile)
2480
+
2481
+ diff_parser = sub.add_parser("diff", help="Semantic diff between two model files")
2482
+ diff_parser.add_argument("old", help="Old model YAML path")
2483
+ diff_parser.add_argument("new", help="New model YAML path")
2484
+ diff_parser.set_defaults(func=cmd_diff)
2485
+
2486
+ validate_all_parser = sub.add_parser(
2487
+ "validate-all", help="Validate all model files matching a glob"
2488
+ )
2489
+ validate_all_parser.add_argument(
2490
+ "--glob", default="**/*.model.yaml", help="Glob pattern for model files"
2491
+ )
2492
+ validate_all_parser.add_argument(
2493
+ "--exclude",
2494
+ nargs="*",
2495
+ default=["**/node_modules/**", "**/.git/**", "**/.venv/**"],
2496
+ help="Glob-style path patterns to exclude",
2497
+ )
2498
+ validate_all_parser.add_argument(
2499
+ "--schema", default=_default_schema_path(), help="Path to JSON schema"
2500
+ )
2501
+ validate_all_parser.set_defaults(func=cmd_validate_all)
2502
+
2503
+ gate_parser = sub.add_parser(
2504
+ "gate",
2505
+ help="PR gate: validate old/new models and fail on breaking changes by default",
2506
+ )
2507
+ gate_parser.add_argument("old", help="Old model YAML path")
2508
+ gate_parser.add_argument("new", help="New model YAML path")
2509
+ gate_parser.add_argument(
2510
+ "--schema", default=_default_schema_path(), help="Path to JSON schema"
2511
+ )
2512
+ gate_parser.add_argument(
2513
+ "--allow-breaking",
2514
+ action="store_true",
2515
+ help="Allow breaking changes (still fails on validation errors)",
2516
+ )
2517
+ gate_parser.add_argument(
2518
+ "--output-json", action="store_true", help="Print semantic diff as JSON"
2519
+ )
2520
+ gate_parser.set_defaults(func=cmd_gate)
2521
+
2522
+ policy_parser = sub.add_parser("policy-check", help="Evaluate a model against a policy pack")
2523
+ policy_parser.add_argument("model", help="Path to model YAML")
2524
+ policy_parser.add_argument(
2525
+ "--policy", default=_default_policy_path(), help="Path to policy pack YAML"
2526
+ )
2527
+ policy_parser.add_argument(
2528
+ "--schema", default=_default_schema_path(), help="Path to model schema JSON"
2529
+ )
2530
+ policy_parser.add_argument(
2531
+ "--policy-schema",
2532
+ default=_default_policy_schema_path(),
2533
+ help="Path to policy schema JSON",
2534
+ )
2535
+ policy_parser.add_argument("--output-json", action="store_true", help="Print policy output as JSON")
2536
+ policy_parser.add_argument("--inherit", action="store_true", help="Resolve pack.extends inheritance chain before evaluation")
2537
+ policy_parser.set_defaults(func=cmd_policy_check)
2538
+
2539
+ generate_parser = sub.add_parser("generate", help="Generate artifacts from model YAML")
2540
+ generate_sub = generate_parser.add_subparsers(dest="generate_command", required=True)
2541
+
2542
+ gen_sql_parser = generate_sub.add_parser("sql", help="Generate SQL DDL")
2543
+ gen_sql_parser.add_argument("model", help="Path to model YAML")
2544
+ gen_sql_parser.add_argument("--dialect", default="postgres", choices=["postgres", "snowflake", "bigquery", "databricks"])
2545
+ gen_sql_parser.add_argument("--out", help="Output SQL file path")
2546
+ gen_sql_parser.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2547
+ gen_sql_parser.set_defaults(func=cmd_generate_sql)
2548
+
2549
+ gen_dbt_parser = generate_sub.add_parser("dbt", help="Generate dbt project scaffold")
2550
+ gen_dbt_parser.add_argument("model", help="Path to model YAML")
2551
+ gen_dbt_parser.add_argument("--out-dir", required=True, help="Target directory for scaffold files")
2552
+ gen_dbt_parser.add_argument("--source-name", default="raw", help="dbt source name")
2553
+ gen_dbt_parser.add_argument("--project-name", default="data_modeling_mvp", help="dbt project name")
2554
+ gen_dbt_parser.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2555
+ gen_dbt_parser.set_defaults(func=cmd_generate_dbt)
2556
+
2557
+ gen_metadata_parser = generate_sub.add_parser("metadata", help="Generate metadata JSON export")
2558
+ gen_metadata_parser.add_argument("model", help="Path to model YAML")
2559
+ gen_metadata_parser.add_argument("--out", help="Output metadata JSON path")
2560
+ gen_metadata_parser.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2561
+ gen_metadata_parser.set_defaults(func=cmd_generate_metadata)
2562
+
2563
+ gen_docs_parser = generate_sub.add_parser("docs", help="Generate data dictionary documentation")
2564
+ gen_docs_parser.add_argument("model", help="Path to model YAML")
2565
+ gen_docs_parser.add_argument("--format", default="html", choices=["html", "markdown"], help="Output format")
2566
+ gen_docs_parser.add_argument("--out", help="Output file path")
2567
+ gen_docs_parser.add_argument("--title", help="Custom page title")
2568
+ gen_docs_parser.set_defaults(func=cmd_generate_docs)
2569
+
2570
+ gen_changelog_parser = generate_sub.add_parser("changelog", help="Generate changelog from model diff")
2571
+ gen_changelog_parser.add_argument("old", help="Old model YAML path")
2572
+ gen_changelog_parser.add_argument("new", help="New model YAML path")
2573
+ gen_changelog_parser.add_argument("--out", help="Output changelog file path")
2574
+ gen_changelog_parser.set_defaults(func=cmd_generate_changelog)
2575
+
2576
+ import_parser = sub.add_parser("import", help="Import SQL/DBML/Spark/dbt schema into model YAML")
2577
+ import_sub = import_parser.add_subparsers(dest="import_command", required=True)
2578
+
2579
+ import_sql_parser = import_sub.add_parser("sql", help="Import SQL DDL file")
2580
+ import_sql_parser.add_argument("input", help="Path to SQL DDL file")
2581
+ import_sql_parser.add_argument("--out", help="Write output YAML model file")
2582
+ import_sql_parser.add_argument("--model-name", default="imported_sql_model", help="Model name")
2583
+ import_sql_parser.add_argument("--domain", default="imported", help="Domain value")
2584
+ import_sql_parser.add_argument(
2585
+ "--owner",
2586
+ action="append",
2587
+ default=[],
2588
+ help="Owner email (repeatable)",
2589
+ )
2590
+ import_sql_parser.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2591
+ import_sql_parser.set_defaults(func=cmd_import_sql)
2592
+
2593
+ import_dbml_parser = import_sub.add_parser("dbml", help="Import DBML file")
2594
+ import_dbml_parser.add_argument("input", help="Path to DBML file")
2595
+ import_dbml_parser.add_argument("--out", help="Write output YAML model file")
2596
+ import_dbml_parser.add_argument("--model-name", default="imported_dbml_model", help="Model name")
2597
+ import_dbml_parser.add_argument("--domain", default="imported", help="Domain value")
2598
+ import_dbml_parser.add_argument(
2599
+ "--owner",
2600
+ action="append",
2601
+ default=[],
2602
+ help="Owner email (repeatable)",
2603
+ )
2604
+ import_dbml_parser.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2605
+ import_dbml_parser.set_defaults(func=cmd_import_dbml)
2606
+
2607
+ import_spark_parser = import_sub.add_parser("spark-schema", help="Import Spark schema JSON file")
2608
+ import_spark_parser.add_argument("input", help="Path to Spark schema JSON file")
2609
+ import_spark_parser.add_argument("--out", help="Write output YAML model file")
2610
+ import_spark_parser.add_argument("--model-name", default="imported_spark_schema", help="Model name")
2611
+ import_spark_parser.add_argument("--table-name", help="Table name (for single StructType schemas)")
2612
+ import_spark_parser.add_argument("--domain", default="imported", help="Domain value")
2613
+ import_spark_parser.add_argument("--owner", action="append", default=[], help="Owner email (repeatable)")
2614
+ import_spark_parser.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2615
+ import_spark_parser.set_defaults(func=cmd_import_spark_schema)
2616
+
2617
+ import_dbt_parser = import_sub.add_parser("dbt", help="Import dbt schema.yml file")
2618
+ import_dbt_parser.add_argument("input", help="Path to dbt schema.yml file")
2619
+ import_dbt_parser.add_argument("--out", help="Write output YAML model file")
2620
+ import_dbt_parser.add_argument("--model-name", default="imported_dbt_model", help="Model name")
2621
+ import_dbt_parser.add_argument("--domain", default="imported", help="Domain value")
2622
+ import_dbt_parser.add_argument("--owner", action="append", default=[], help="Owner email (repeatable)")
2623
+ import_dbt_parser.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2624
+ import_dbt_parser.set_defaults(func=cmd_import_dbt)
2625
+
2626
+ # dbt round-trip subcommand group
2627
+ dbt_parser = sub.add_parser("dbt", help="dbt round-trip: sync DataLex metadata into dbt schema.yml files")
2628
+ dbt_sub = dbt_parser.add_subparsers(dest="dbt_command", required=True)
2629
+
2630
+ dbt_sync_parser = dbt_sub.add_parser("sync", help="Merge DataLex metadata into a single dbt schema.yml (non-destructive)")
2631
+ dbt_sync_parser.add_argument("model", help="Path to the DataLex .model.yaml file")
2632
+ dbt_sync_parser.add_argument("--dbt-schema", required=True, help="Path to the existing dbt schema.yml to update")
2633
+ dbt_sync_parser.add_argument("--out", default=None, help="Output path (default: overwrites --dbt-schema in-place)")
2634
+ dbt_sync_parser.set_defaults(func=cmd_dbt_sync)
2635
+
2636
+ dbt_push_parser = dbt_sub.add_parser("push", help="Push DataLex metadata into all schema.yml files found in a dbt project")
2637
+ dbt_push_parser.add_argument("model", help="Path to the DataLex .model.yaml file")
2638
+ dbt_push_parser.add_argument("--dbt-project", required=True, help="Root path of the dbt project to scan for schema.yml files")
2639
+ dbt_push_parser.set_defaults(func=cmd_dbt_push)
2640
+
2641
+ pull_parser = sub.add_parser("pull", help="Pull schema from a live database into a DataLex model")
2642
+ pull_parser.add_argument("connector", help="Connector type (postgres, mysql, snowflake, bigquery, databricks, sqlserver, azure_sql, azure_fabric, redshift)")
2643
+ pull_parser.add_argument("--host", help="Database host (or Snowflake account, Databricks server hostname)")
2644
+ pull_parser.add_argument("--port", type=int, help="Database port")
2645
+ pull_parser.add_argument("--database", help="Database name")
2646
+ pull_parser.add_argument("--db-schema", help="Schema name (default: public/PUBLIC/default)")
2647
+ pull_parser.add_argument("--user", help="Database user")
2648
+ pull_parser.add_argument("--password", help="Database password")
2649
+ pull_parser.add_argument("--warehouse", help="Snowflake warehouse")
2650
+ pull_parser.add_argument("--project", help="BigQuery project ID")
2651
+ pull_parser.add_argument("--dataset", help="BigQuery dataset")
2652
+ pull_parser.add_argument("--catalog", help="Databricks Unity Catalog name")
2653
+ pull_parser.add_argument("--token", help="Access token (Databricks)")
2654
+ pull_parser.add_argument("--http-path", help="Databricks SQL Warehouse/Cluster HTTP path")
2655
+ pull_parser.add_argument("--odbc-driver", help="ODBC driver for SQL Server-family connectors")
2656
+ pull_parser.add_argument("--encrypt", help="SQL Server encryption setting (yes/no)")
2657
+ pull_parser.add_argument("--trust-server-certificate", help="SQL Server TrustServerCertificate setting (yes/no)")
2658
+ pull_parser.add_argument("--private-key-path", help="Path to RSA private key PEM file (Snowflake key-pair auth)")
2659
+ pull_parser.add_argument("--tables", nargs="*", help="Only include these tables")
2660
+ pull_parser.add_argument("--exclude-tables", nargs="*", help="Exclude these tables")
2661
+ pull_parser.add_argument("--model-name", default="imported_model", help="Model name")
2662
+ pull_parser.add_argument("--domain", default="imported", help="Domain value")
2663
+ pull_parser.add_argument("--owner", help="Owner email")
2664
+ pull_parser.add_argument("--out", help="Output YAML model file path")
2665
+ pull_parser.add_argument("--project-dir", help="Project folder to write extracted model YAML")
2666
+ pull_parser.add_argument(
2667
+ "--create-project-dir",
2668
+ action="store_true",
2669
+ help="Create --project-dir if missing (otherwise prompt in interactive mode)",
2670
+ )
2671
+ pull_parser.add_argument("--test", action="store_true", help="Test connection only, do not pull schema")
2672
+ pull_parser.set_defaults(func=cmd_pull)
2673
+
2674
+ connectors_parser = sub.add_parser("connectors", help="List available database connectors and driver status")
2675
+ connectors_parser.add_argument("--output-json", action="store_true", help="Print as JSON")
2676
+ connectors_parser.set_defaults(func=cmd_connectors)
2677
+
2678
+ # Common connection args helper
2679
+ def _add_conn_args(p):
2680
+ p.add_argument("connector", help="Connector type (postgres, mysql, snowflake, bigquery, databricks, sqlserver, azure_sql, azure_fabric, redshift)")
2681
+ p.add_argument("--host", help="Database host")
2682
+ p.add_argument("--port", type=int, help="Database port")
2683
+ p.add_argument("--database", help="Database name")
2684
+ p.add_argument("--db-schema", help="Schema name")
2685
+ p.add_argument("--user", help="Database user")
2686
+ p.add_argument("--password", help="Database password")
2687
+ p.add_argument("--warehouse", help="Snowflake warehouse")
2688
+ p.add_argument("--project", help="BigQuery project ID")
2689
+ p.add_argument("--dataset", help="BigQuery dataset")
2690
+ p.add_argument("--catalog", help="Databricks catalog")
2691
+ p.add_argument("--token", help="Access token")
2692
+ p.add_argument("--http-path", help="Databricks SQL Warehouse/Cluster HTTP path")
2693
+ p.add_argument("--odbc-driver", help="ODBC driver for SQL Server-family connectors")
2694
+ p.add_argument("--encrypt", help="SQL Server encryption setting (yes/no)")
2695
+ p.add_argument("--trust-server-certificate", help="SQL Server TrustServerCertificate setting (yes/no)")
2696
+ p.add_argument("--private-key-path", help="Path to RSA private key PEM file (Snowflake key-pair auth)")
2697
+ p.add_argument("--output-json", action="store_true", help="Print as JSON")
2698
+
2699
+ schemas_parser = sub.add_parser("schemas", help="List schemas/datasets in a database")
2700
+ _add_conn_args(schemas_parser)
2701
+ schemas_parser.set_defaults(func=cmd_schemas)
2702
+
2703
+ tables_parser = sub.add_parser("tables", help="List tables in a database schema")
2704
+ _add_conn_args(tables_parser)
2705
+ tables_parser.set_defaults(func=cmd_tables)
2706
+
2707
+ resolve_parser = sub.add_parser("resolve", help="Resolve cross-model imports and show unified graph")
2708
+ resolve_parser.add_argument("model", help="Path to root model YAML")
2709
+ resolve_parser.add_argument(
2710
+ "--search-dir",
2711
+ action="append",
2712
+ default=[],
2713
+ help="Additional directories to search for imported models (repeatable)",
2714
+ )
2715
+ resolve_parser.add_argument("--output-json", action="store_true", help="Print graph as JSON")
2716
+ resolve_parser.set_defaults(func=cmd_resolve)
2717
+
2718
+ resolve_project_parser = sub.add_parser("resolve-project", help="Resolve all models in a project directory")
2719
+ resolve_project_parser.add_argument("directory", help="Project directory path")
2720
+ resolve_project_parser.add_argument(
2721
+ "--search-dir",
2722
+ action="append",
2723
+ default=[],
2724
+ help="Additional search directories (repeatable)",
2725
+ )
2726
+ resolve_project_parser.add_argument("--output-json", action="store_true", help="Print results as JSON")
2727
+ resolve_project_parser.set_defaults(func=cmd_resolve_project)
2728
+
2729
+ diff_all_parser = sub.add_parser("diff-all", help="Semantic diff between two model directories")
2730
+ diff_all_parser.add_argument("old", help="Old model directory")
2731
+ diff_all_parser.add_argument("new", help="New model directory")
2732
+ diff_all_parser.add_argument("--output-json", action="store_true", help="Print diff as JSON")
2733
+ diff_all_parser.add_argument(
2734
+ "--allow-breaking",
2735
+ action="store_true",
2736
+ help="Allow breaking changes (exit 0 even with breaking changes)",
2737
+ )
2738
+ diff_all_parser.set_defaults(func=cmd_diff_all)
2739
+
2740
+ transform_parser = sub.add_parser("transform", help="Transform a model between conceptual, logical, and physical forms")
2741
+ transform_sub = transform_parser.add_subparsers(dest="transform_command", required=True)
2742
+
2743
+ transform_to_logical = transform_sub.add_parser("conceptual-to-logical", help="Transform a conceptual model into a logical model")
2744
+ transform_to_logical.add_argument("model", help="Path to source model YAML")
2745
+ transform_to_logical.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2746
+ transform_to_logical.add_argument("--out", help="Write transformed model YAML")
2747
+ transform_to_logical.set_defaults(func=cmd_transform)
2748
+
2749
+ transform_to_physical = transform_sub.add_parser("logical-to-physical", help="Transform a logical model into a physical model")
2750
+ transform_to_physical.add_argument("model", help="Path to source model YAML")
2751
+ transform_to_physical.add_argument("--dialect", default="postgres", choices=["postgres", "snowflake", "bigquery", "databricks"])
2752
+ transform_to_physical.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2753
+ transform_to_physical.add_argument("--out", help="Write transformed model YAML")
2754
+ transform_to_physical.set_defaults(func=cmd_transform)
2755
+
2756
+ standards_parser = sub.add_parser("standards", help="Check or autofix model standards, naming rules, and shared libraries")
2757
+ standards_sub = standards_parser.add_subparsers(dest="standards_command", required=True)
2758
+
2759
+ standards_check = standards_sub.add_parser("check", help="Evaluate standards and naming rules")
2760
+ standards_check.add_argument("model", help="Path to model YAML")
2761
+ standards_check.add_argument("--schema", default=_default_schema_path(), help="Path to model schema JSON")
2762
+ standards_check.add_argument("--output-json", action="store_true", help="Print standards report as JSON")
2763
+ standards_check.set_defaults(func=cmd_standards_check)
2764
+
2765
+ standards_fix = standards_sub.add_parser("fix", help="Apply supported standards autofixes")
2766
+ standards_fix.add_argument("model", help="Path to model YAML")
2767
+ standards_fix.add_argument("--write", "-w", action="store_true", help="Overwrite the input model in-place")
2768
+ standards_fix.add_argument("--out", help="Write fixed YAML to a new path")
2769
+ standards_fix.set_defaults(func=cmd_standards_fix)
2770
+
2771
+ sync_parser = sub.add_parser("sync", help="Round-trip compare, merge, or pull workflows")
2772
+ sync_sub = sync_parser.add_subparsers(dest="sync_command", required=True)
2773
+
2774
+ sync_compare = sync_sub.add_parser("compare", help="Compare current and candidate models")
2775
+ sync_compare.add_argument("current", help="Current local model YAML")
2776
+ sync_compare.add_argument("candidate", help="Candidate/live model YAML")
2777
+ sync_compare.add_argument("--allow-breaking", action="store_true", help="Return 0 even when breaking changes are detected")
2778
+ sync_compare.set_defaults(func=cmd_sync_compare)
2779
+
2780
+ sync_merge = sync_sub.add_parser("merge", help="Merge documentation metadata from current into candidate model")
2781
+ sync_merge.add_argument("current", help="Current local model YAML")
2782
+ sync_merge.add_argument("candidate", help="Candidate/live model YAML")
2783
+ sync_merge.add_argument("--out", help="Write merged model YAML")
2784
+ sync_merge.set_defaults(func=cmd_sync_merge)
2785
+
2786
+ sync_pull = sync_sub.add_parser("pull", help="Alias of 'datalex pull' for round-trip workflows")
2787
+ sync_pull.add_argument("connector", help="Connector type (postgres, mysql, snowflake, bigquery, databricks, sqlserver, azure_sql, azure_fabric, redshift)")
2788
+ sync_pull.add_argument("--host", help="Database host (or Snowflake account, Databricks server hostname)")
2789
+ sync_pull.add_argument("--port", type=int, help="Database port")
2790
+ sync_pull.add_argument("--database", help="Database name")
2791
+ sync_pull.add_argument("--db-schema", help="Schema name (default: public/PUBLIC/default)")
2792
+ sync_pull.add_argument("--user", help="Database user")
2793
+ sync_pull.add_argument("--password", help="Database password")
2794
+ sync_pull.add_argument("--warehouse", help="Snowflake warehouse")
2795
+ sync_pull.add_argument("--project", help="BigQuery project ID")
2796
+ sync_pull.add_argument("--dataset", help="BigQuery dataset")
2797
+ sync_pull.add_argument("--catalog", help="Databricks Unity Catalog name")
2798
+ sync_pull.add_argument("--token", help="Access token (Databricks)")
2799
+ sync_pull.add_argument("--http-path", help="Databricks SQL Warehouse/Cluster HTTP path")
2800
+ sync_pull.add_argument("--odbc-driver", help="ODBC driver for SQL Server-family connectors")
2801
+ sync_pull.add_argument("--encrypt", help="SQL Server encryption setting (yes/no)")
2802
+ sync_pull.add_argument("--trust-server-certificate", help="SQL Server TrustServerCertificate setting (yes/no)")
2803
+ sync_pull.add_argument("--private-key-path", help="Path to RSA private key PEM file (Snowflake key-pair auth)")
2804
+ sync_pull.add_argument("--tables", nargs="*", help="Only include these tables")
2805
+ sync_pull.add_argument("--exclude-tables", nargs="*", help="Exclude these tables")
2806
+ sync_pull.add_argument("--model-name", default="imported_model", help="Model name")
2807
+ sync_pull.add_argument("--domain", default="imported", help="Domain value")
2808
+ sync_pull.add_argument("--owner", help="Owner email")
2809
+ sync_pull.add_argument("--out", help="Output YAML model file path")
2810
+ sync_pull.add_argument("--project-dir", help="Project folder to write extracted model YAML")
2811
+ sync_pull.add_argument("--create-project-dir", action="store_true", help="Create --project-dir if missing")
2812
+ sync_pull.add_argument("--test", action="store_true", help="Test connection only, do not pull schema")
2813
+ sync_pull.set_defaults(func=cmd_sync_pull)
2814
+
2815
+ fmt_parser = sub.add_parser("fmt", help="Auto-format YAML model to canonical style")
2816
+ fmt_parser.add_argument("model", help="Path to model YAML")
2817
+ fmt_parser.add_argument("--write", "-w", action="store_true", help="Overwrite the input file in-place")
2818
+ fmt_parser.add_argument("--out", help="Output file path (alternative to --write)")
2819
+ fmt_parser.set_defaults(func=cmd_fmt)
2820
+
2821
+ stats_parser = sub.add_parser("stats", help="Print model statistics")
2822
+ stats_parser.add_argument("model", help="Path to model YAML")
2823
+ stats_parser.add_argument("--output-json", action="store_true", help="Print stats as JSON")
2824
+ stats_parser.set_defaults(func=cmd_stats)
2825
+
2826
+ completeness_parser = sub.add_parser(
2827
+ "completeness",
2828
+ help="Score each entity against single-source-of-truth completeness dimensions",
2829
+ )
2830
+ completeness_parser.add_argument("model", help="Path to model YAML")
2831
+ completeness_parser.add_argument(
2832
+ "--output-json", action="store_true", help="Emit full report as JSON (for API/CI integration)"
2833
+ )
2834
+ completeness_parser.add_argument(
2835
+ "--summary", action="store_true", help="Show scores only, suppress per-entity missing detail"
2836
+ )
2837
+ completeness_parser.add_argument(
2838
+ "--min-score",
2839
+ type=int,
2840
+ default=None,
2841
+ metavar="N",
2842
+ help="Exit with code 1 if any entity scores below N%% (useful in CI gates)",
2843
+ )
2844
+ completeness_parser.set_defaults(func=cmd_completeness)
2845
+
2846
+ schema_parser = sub.add_parser("print-schema", help="Print active model schema JSON")
2847
+ schema_parser.add_argument("--schema", default=_default_schema_path(), help="Path to JSON schema")
2848
+ schema_parser.set_defaults(func=cmd_schema)
2849
+
2850
+ policy_schema_parser = sub.add_parser("print-policy-schema", help="Print policy schema JSON")
2851
+ policy_schema_parser.add_argument(
2852
+ "--policy-schema",
2853
+ default=_default_policy_schema_path(),
2854
+ help="Path to policy schema JSON",
2855
+ )
2856
+ policy_schema_parser.set_defaults(func=cmd_policy_schema)
2857
+
2858
+ doctor_parser = sub.add_parser("doctor", help="Diagnose project setup issues")
2859
+ doctor_parser.add_argument("--path", default=".", help="Project directory to diagnose")
2860
+ doctor_parser.add_argument("--output-json", action="store_true", help="Print diagnostics as JSON")
2861
+ doctor_parser.set_defaults(func=cmd_doctor)
2862
+
2863
+ migrate_parser = sub.add_parser("migrate", help="Generate SQL migration between two model versions")
2864
+ migrate_parser.add_argument("old", help="Old model YAML path")
2865
+ migrate_parser.add_argument("new", help="New model YAML path")
2866
+ migrate_parser.add_argument("--dialect", default="postgres", choices=["postgres", "snowflake", "bigquery", "databricks"])
2867
+ migrate_parser.add_argument("--out", help="Output SQL migration file path")
2868
+ migrate_parser.set_defaults(func=cmd_migrate)
2869
+
2870
+ apply_parser = sub.add_parser("apply", help="Apply SQL/migration to a live database")
2871
+ apply_parser.add_argument("connector", choices=["snowflake", "databricks", "bigquery"], help="Target connector")
2872
+ apply_parser.add_argument("--dialect", default=None, choices=["snowflake", "bigquery", "databricks"], help="SQL dialect (defaults to connector)")
2873
+ apply_parser.add_argument("--sql-file", help="Path to SQL file to apply")
2874
+ apply_parser.add_argument("--old", help="Old model YAML path (for generated migration)")
2875
+ apply_parser.add_argument("--new", help="New model YAML path (for generated migration)")
2876
+ apply_parser.add_argument("--model-schema", default=_default_schema_path(), help="Path to model schema JSON")
2877
+ apply_parser.add_argument("--host", help="Database host/account")
2878
+ apply_parser.add_argument("--port", type=int, help="Database port")
2879
+ apply_parser.add_argument("--database", help="Database name")
2880
+ apply_parser.add_argument("--db-schema", help="Schema name")
2881
+ apply_parser.add_argument("--user", help="Database user")
2882
+ apply_parser.add_argument("--password", help="Database password or key passphrase")
2883
+ apply_parser.add_argument("--warehouse", help="Snowflake warehouse")
2884
+ apply_parser.add_argument("--project", help="BigQuery project ID")
2885
+ apply_parser.add_argument("--dataset", help="BigQuery dataset")
2886
+ apply_parser.add_argument("--catalog", help="Databricks catalog")
2887
+ apply_parser.add_argument("--token", help="Databricks token")
2888
+ apply_parser.add_argument("--http-path", help="Databricks SQL Warehouse/Cluster HTTP path")
2889
+ apply_parser.add_argument("--private-key-path", help="Path to RSA private key PEM file (Snowflake key-pair auth)")
2890
+ apply_parser.add_argument("--migration-name", help="Migration name override")
2891
+ apply_parser.add_argument("--ledger-table", default="datalex_migrations", help="Migration ledger table name")
2892
+ apply_parser.add_argument("--skip-ledger", action="store_true", help="Skip writing migration ledger record")
2893
+ apply_parser.add_argument("--policy-pack", default=_default_policy_path(), help="Policy pack for model-diff preflight checks")
2894
+ apply_parser.add_argument("--skip-policy-check", action="store_true", help="Skip policy preflight checks for model-diff apply")
2895
+ apply_parser.add_argument("--allow-destructive", action="store_true", help="Allow destructive SQL statements (DROP/TRUNCATE)")
2896
+ apply_parser.add_argument("--write-sql", help="Write final SQL payload to file before execution")
2897
+ apply_parser.add_argument("--report-json", help="Write structured apply report JSON to file")
2898
+ apply_parser.add_argument("--output-json", action="store_true", help="Print structured apply report JSON")
2899
+ apply_parser.add_argument("--dry-run", action="store_true", help="Print SQL and exit without execution")
2900
+ apply_parser.set_defaults(func=cmd_apply)
2901
+
2902
+ completion_parser = sub.add_parser("completion", help="Generate shell completion script")
2903
+ completion_parser.add_argument("shell", choices=["bash", "zsh", "fish"], help="Shell type")
2904
+ completion_parser.set_defaults(func=cmd_completion)
2905
+
2906
+ watch_parser = sub.add_parser("watch", help="Watch model files and validate on change")
2907
+ watch_parser.add_argument("--glob", default="**/*.model.yaml", help="Glob pattern for model files")
2908
+ watch_parser.add_argument("--interval", type=int, default=2, help="Poll interval in seconds")
2909
+ watch_parser.add_argument("--schema", default=_default_schema_path(), help="Path to JSON schema")
2910
+ watch_parser.set_defaults(func=cmd_watch)
2911
+
2912
+ from datalex_cli.datalex_cli import register_datalex
2913
+ register_datalex(sub)
2914
+
2915
+ return parser
2916
+
2917
+
2918
def main(argv: "list[str] | None" = None) -> int:
    """Parse command-line arguments and run the selected subcommand.

    Args:
        argv: Argument strings to parse, without the program name. When left
            as ``None`` argparse reads ``sys.argv[1:]``, so existing
            zero-argument callers keep their behavior. Passing an explicit
            list allows the CLI to be driven programmatically.

    Returns:
        The value returned by the handler stored in ``args.func``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    # Each subparser binds its handler with set_defaults(func=...).
    return args.func(args)
2922
+
2923
+
2924
+ if __name__ == "__main__":
2925
+ raise SystemExit(main())