odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/introspect.py ADDED
@@ -0,0 +1,1214 @@
+"""Introspection tool for generating Configuration Manual."""
+
+import importlib
+import inspect
+import re
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Type, Union
+
+try:
+    from typing import Annotated, get_args, get_origin
+except ImportError:
+    from typing_extensions import Annotated, get_args, get_origin
+
+# Python 3.10+ has types.UnionType for X | Y syntax
+try:
+    from types import UnionType
+
+    HAS_UNION_TYPE = True
+except ImportError:
+    UnionType = None  # type: ignore
+    HAS_UNION_TYPE = False
+
+from pydantic import BaseModel
+
+# Try to import registry/transformers to get function metadata
+try:
+    from odibi.registry import FunctionRegistry
+    from odibi.transformers import register_standard_library
+
+    # Ensure registry is populated
+    register_standard_library()
+    HAS_REGISTRY = True
+except ImportError:
+    HAS_REGISTRY = False
+    print(
+        "Warning: Could not import FunctionRegistry/transformers. Function details will be missing."
+    )
+
+# --- Data Models ---
+
+
+class FieldDoc(BaseModel):
+    name: str
+    type_hint: str
+    required: bool
+    default: Optional[str] = None
+    description: Optional[str] = None
+
+
+class ModelDoc(BaseModel):
+    name: str
+    module: str
+    summary: Optional[str]
+    docstring: Optional[str]
+    fields: List[FieldDoc]
+    group: str  # "Core", "Connection", "Transformation", "Setting"
+    category: Optional[str] = None  # Sub-category for Transformations
+    function_name: Optional[str] = None
+    function_doc: Optional[str] = None
+    used_in: List[str] = []
+
+
+# --- Configuration ---
+
+GROUP_MAPPING = {
+    "ProjectConfig": "Core",
+    "PipelineConfig": "Core",
+    "NodeConfig": "Core",
+    "ReadConfig": "Operation",
+    "IncrementalConfig": "Operation",
+    "WriteConfig": "Operation",
+    "WriteMetadataConfig": "Operation",
+    "StreamingWriteConfig": "Operation",
+    "TriggerConfig": "Operation",
+    "AutoOptimizeConfig": "Operation",
+    "DeleteDetectionConfig": "Operation",
+    "TransformConfig": "Operation",
+    "TransformStep": "Operation",
+    "ValidationConfig": "Operation",
+    "PrivacyConfig": "Operation",
+    "ColumnMetadata": "Core",
+    "TimeTravelConfig": "Operation",
+    "LocalConnectionConfig": "Connection",
+    "AzureBlobConnectionConfig": "Connection",
+    "DeltaConnectionConfig": "Connection",
+    "SQLServerConnectionConfig": "Connection",
+    "HttpConnectionConfig": "Connection",
+    "StoryConfig": "Setting",
+    "RetryConfig": "Setting",
+    "LoggingConfig": "Setting",
+    "PerformanceConfig": "Setting",
+    "AlertConfig": "Setting",
+    "LineageConfig": "Setting",
+    "StateConfig": "Setting",
+    "SystemConfig": "Core",
+    "SyncFromConfig": "Core",
+    # Contract/Test types
+    "NotNullTest": "Contract",
+    "UniqueTest": "Contract",
+    "AcceptedValuesTest": "Contract",
+    "RowCountTest": "Contract",
+    "CustomSQLTest": "Contract",
+    "RangeTest": "Contract",
+    "RegexMatchTest": "Contract",
+    "VolumeDropTest": "Contract",
+    "SchemaContract": "Contract",
+    "DistributionContract": "Contract",
+    "FreshnessContract": "Contract",
+    # Quarantine & Quality Gates (Week 1)
+    "QuarantineConfig": "Operation",
+    "QuarantineColumnsConfig": "Operation",
+    "GateConfig": "Operation",
+    "GateThreshold": "Operation",
+    "RowCountGate": "Operation",
+    # Cross-Pipeline Dependencies
+    "ReferenceResolutionError": "Core",
+    # Semantic Layer
+    "MetricDefinition": "Semantic",
+    "DimensionDefinition": "Semantic",
+    "MaterializationConfig": "Semantic",
+    "SemanticLayerConfig": "Semantic",
+    # FK Validation
+    "RelationshipConfig": "Validation",
+    "RelationshipRegistry": "Validation",
+    # Patterns
+    "DimensionPattern": "Pattern",
+    "DateDimensionPattern": "Pattern",
+    "FactPattern": "Pattern",
+    "AggregationPattern": "Pattern",
+    "AuditConfig": "Pattern",
+    # SQL Server Merge (Phase 4)
+    "SqlServerMergeOptions": "Operation",
+    "SqlServerOverwriteOptions": "Operation",
+    "SqlServerAuditColsConfig": "Operation",
+    "SqlServerMergeValidationConfig": "Operation",
+    "SqlServerSchemaEvolutionConfig": "Operation",
+}
+
+# Map modules to readable Categories
+TRANSFORM_CATEGORY_MAP = {
+    "odibi.transformers.sql_core": "Common Operations",
+    "odibi.transformers.relational": "Relational Algebra",
+    "odibi.transformers.advanced": "Advanced & Feature Engineering",
+    "odibi.transformers.scd": "Warehousing Patterns",
+    "odibi.transformers.validation": "Data Quality",
+    "odibi.transformers.merge_transformer": "Warehousing Patterns",
+    "odibi.transformers.delete_detection": "Data Engineering Patterns",
+    "odibi.transformers.manufacturing": "Manufacturing & IoT",
+}
+
+CUSTOM_ORDER = [
+    # Core
+    "ProjectConfig",
+    "PipelineConfig",
+    "NodeConfig",
+    "ColumnMetadata",
+    "SystemConfig",
+    "StateConfig",
+    "LineageConfig",
+    # Operations (ETL flow)
+    "ReadConfig",
+    "IncrementalConfig",
+    "TimeTravelConfig",
+    "TransformConfig",
+    "DeleteDetectionConfig",
+    "ValidationConfig",
+    "QuarantineConfig",
+    "QuarantineColumnsConfig",
+    "GateConfig",
+    "GateThreshold",
+    "RowCountGate",
+    "WriteConfig",
+    "WriteMetadataConfig",
+    "StreamingWriteConfig",
+    "TriggerConfig",
+    "AutoOptimizeConfig",
+    # Connections (Common first)
+    "LocalConnectionConfig",
+    "DeltaConnectionConfig",
+    "AzureBlobConnectionConfig",
+    "SQLServerConnectionConfig",
+    "HttpConnectionConfig",
+]
+
+# Map type aliases to their documentation section anchors
+TYPE_ALIAS_LINKS = {
+    "TestConfig": "contracts-data-quality-gates",
+    "ConnectionConfig": "connections",
+}
+
+# Known Type Aliases to simplify display
+TYPE_ALIASES = {
+    "ConnectionConfig": [
+        "LocalConnectionConfig",
+        "AzureBlobConnectionConfig",
+        "DeltaConnectionConfig",
+        "SQLServerConnectionConfig",
+        "HttpConnectionConfig",
+    ],
+    "AzureBlobAuthConfig": [
+        "AzureBlobKeyVaultAuth",
+        "AzureBlobAccountKeyAuth",
+        "AzureBlobSasAuth",
+        "AzureBlobConnectionStringAuth",
+        "AzureBlobMsiAuth",
+    ],
+    "SQLServerAuthConfig": [
+        "SQLLoginAuth",
+        "SQLAadPasswordAuth",
+        "SQLMsiAuth",
+        "SQLConnectionStringAuth",
+    ],
+    "HttpAuthConfig": ["HttpNoAuth", "HttpBasicAuth", "HttpBearerAuth", "HttpApiKeyAuth"],
+    "TestConfig": [
+        "NotNullTest",
+        "UniqueTest",
+        "AcceptedValuesTest",
+        "RowCountTest",
+        "CustomSQLTest",
+        "RangeTest",
+        "RegexMatchTest",
+        "VolumeDropTest",
+        "SchemaContract",
+        "DistributionContract",
+        "FreshnessContract",
+    ],
+}
+
+SECTION_INTROS = {
+    "Contract": """
+### Contracts (Pre-Transform Checks)
+
+Contracts are **fail-fast data quality checks** that run on input data **before** transformation.
+They always halt execution on failure - use them to prevent bad data from entering the pipeline.
+
+**Contracts vs Validation vs Quality Gates:**
+
+| Feature | When it Runs | On Failure | Use Case |
+|---------|--------------|------------|----------|
+| **Contracts** | Before transform | Always fails | Input data quality (not-null, unique keys) |
+| **Validation** | After transform | Configurable (fail/warn/quarantine) | Output data quality (ranges, formats) |
+| **Quality Gates** | After validation | Configurable (abort/warn) | Pipeline-level thresholds (pass rate, row counts) |
+| **Quarantine** | With validation | Routes bad rows | Capture invalid records for review |
+
+**See Also:**
+- [Validation Guide](../features/quality_gates.md) - Full validation configuration
+- [Quarantine Guide](../features/quarantine.md) - Quarantine setup and review
+- [Getting Started: Validation](../tutorials/getting_started.md#add-data-validation)
+
+**Example:**
+```yaml
+- name: "process_orders"
+  contracts:
+    - type: not_null
+      columns: [order_id, customer_id]
+    - type: row_count
+      min: 100
+    - type: freshness
+      column: created_at
+      max_age: "24h"
+  read:
+    source: raw_orders
+```
+""",
+    "Semantic": """
+### Semantic Layer
+
+The semantic layer provides a unified interface for defining and querying business metrics.
+Define metrics once, query them by name across dimensions.
+
+**Core Components:**
+- **MetricDefinition**: Define aggregation expressions (SUM, COUNT, AVG)
+- **DimensionDefinition**: Define grouping attributes with hierarchies
+- **MaterializationConfig**: Pre-compute metrics at specific grain
+- **SemanticQuery**: Execute queries like "revenue BY region, month"
+- **Project**: Unified API that connects pipelines and semantic layer
+
+**Unified Project API (Recommended):**
+```python
+from odibi import Project
+
+project = Project.load("odibi.yaml")
+result = project.query("revenue BY region")
+print(result.df)
+```
+
+**YAML Configuration:**
+```yaml
+project: my_warehouse
+engine: pandas
+
+connections:
+  gold:
+    type: delta
+    path: /mnt/data/gold
+
+# Semantic layer at project level
+semantic:
+  metrics:
+    - name: revenue
+      expr: "SUM(total_amount)"
+      source: gold.fact_orders  # connection.table notation
+      filters:
+        - "status = 'completed'"
+
+  dimensions:
+    - name: region
+      source: gold.dim_customer
+      column: region
+
+  materializations:
+    - name: monthly_revenue
+      metrics: [revenue]
+      dimensions: [region, month]
+      output: gold/agg_monthly_revenue
+```
+
+The `source: gold.fact_orders` notation resolves paths automatically from connections.
+""",
+    "Validation": """
+### FK Validation
+
+Declare and validate referential integrity between fact and dimension tables.
+
+**Features:**
+- Declare relationships in YAML
+- Validate FK constraints on fact load
+- Detect orphan records
+- Generate lineage from relationships
+
+**Example:**
+```yaml
+relationships:
+  - name: orders_to_customers
+    fact: fact_orders
+    dimension: dim_customer
+    fact_key: customer_sk
+    dimension_key: customer_sk
+    on_violation: error
+```
+""",
+    "Pattern": """
+### Data Patterns
+
+Declarative patterns for common data warehouse building blocks. Patterns encapsulate
+best practices for dimensional modeling, ensuring consistent implementation across
+your data warehouse.
+
+---
+
+## DimensionPattern
+
+Build complete dimension tables with surrogate keys and SCD (Slowly Changing Dimension) support.
+
+**When to Use:**
+- Building dimension tables from source systems (customers, products, locations)
+- Need surrogate keys for star schema joins
+- Need to track historical changes (SCD Type 2)
+
+**Beginner Note:**
+Dimensions are the "who, what, where, when" of your data warehouse.
+A customer dimension has customer_id (natural key) and customer_sk (surrogate key).
+Fact tables join to dimensions via surrogate keys.
+
+**See Also:** [FactPattern](#factpattern), [DateDimensionPattern](#datedimensionpattern)
+
+**Features:**
+- Auto-generate integer surrogate keys (MAX(existing) + ROW_NUMBER)
+- SCD Type 0 (static), 1 (overwrite), 2 (history tracking)
+- Optional unknown member row (SK=0) for orphan FK handling
+- Audit columns (load_timestamp, source_system)
+
+**Params:**
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `natural_key` | str | Yes | Natural/business key column name |
+| `surrogate_key` | str | Yes | Surrogate key column name to generate |
+| `scd_type` | int | No | 0=static, 1=overwrite, 2=history (default: 1) |
+| `track_cols` | list | SCD1/2 | Columns to track for change detection |
+| `target` | str | SCD2 | Target table path to read existing history |
+| `unknown_member` | bool | No | Insert row with SK=0 for orphan handling |
+| `audit.load_timestamp` | bool | No | Add load_timestamp column |
+| `audit.source_system` | str | No | Add source_system column with value |
+
+**Supported Target Formats:**
+- Spark: catalog.table, Delta paths, .parquet, .csv, .json, .orc
+- Pandas: .parquet, .csv, .json, .xlsx, .feather, .pickle
+
+**Example:**
+```yaml
+pattern:
+  type: dimension
+  params:
+    natural_key: customer_id
+    surrogate_key: customer_sk
+    scd_type: 2
+    track_cols: [name, email, address, city]
+    target: warehouse.dim_customer
+    unknown_member: true
+    audit:
+      load_timestamp: true
+      source_system: "crm"
+```
+
+---
+
+## DateDimensionPattern
+
+Generate a complete date dimension table with pre-calculated attributes for BI/reporting.
+
+**When to Use:**
+- Every data warehouse needs a date dimension for time-based analytics
+- Enable date filtering, grouping by week/month/quarter, fiscal year reporting
+
+**Beginner Note:**
+The date dimension is foundational for any BI/reporting system.
+It lets you query "sales by month" or "orders in fiscal Q2" without complex date calculations.
+
+**See Also:** [DimensionPattern](#dimensionpattern)
+
+**Features:**
+- Generates all dates in a range with rich attributes
+- Calendar and fiscal year support
+- ISO week numbering
+- Weekend/month-end flags
+
+**Params:**
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `start_date` | str | Yes | Start date (YYYY-MM-DD) |
+| `end_date` | str | Yes | End date (YYYY-MM-DD) |
+| `date_key_format` | str | No | Format for date_sk (default: yyyyMMdd) |
+| `fiscal_year_start_month` | int | No | Month fiscal year starts (1-12, default: 1) |
+| `unknown_member` | bool | No | Add unknown date row with date_sk=0 |
+
+**Generated Columns:**
+`date_sk`, `full_date`, `day_of_week`, `day_of_week_num`, `day_of_month`,
+`day_of_year`, `is_weekend`, `week_of_year`, `month`, `month_name`, `quarter`,
+`quarter_name`, `year`, `fiscal_year`, `fiscal_quarter`, `is_month_start`,
+`is_month_end`, `is_year_start`, `is_year_end`
+
+**Example:**
+```yaml
+pattern:
+  type: date_dimension
+  params:
+    start_date: "2020-01-01"
+    end_date: "2030-12-31"
+    fiscal_year_start_month: 7
+    unknown_member: true
+```
+
+---
+
+## FactPattern
+
+Build fact tables with automatic surrogate key lookups from dimensions.
+
+**When to Use:**
+- Building fact tables from transactional data (orders, events, transactions)
+- Need to look up surrogate keys from dimension tables
+- Need to handle orphan records (missing dimension matches)
+
+**Beginner Note:**
+Facts are the "how much, how many" of your data warehouse.
+An orders fact has measures (quantity, revenue) and dimension keys (customer_sk, product_sk).
+The pattern automatically looks up SKs from dimensions.
+
+**See Also:** [DimensionPattern](#dimensionpattern), [QuarantineConfig](#quarantineconfig)
+
+**Features:**
+- Automatic SK lookups from dimension tables (with SCD2 current-record filtering)
+- Orphan handling: unknown (SK=0), reject (error), quarantine (route to table)
+- Grain validation (detect duplicates)
+- Calculated measures and column renaming
+- Audit columns
+
+**Params:**
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `grain` | list | No | Columns defining uniqueness (validates no duplicates) |
+| `dimensions` | list | No | Dimension lookup configurations (see below) |
+| `orphan_handling` | str | No | "unknown" \\| "reject" \\| "quarantine" (default: unknown) |
+| `quarantine` | dict | quarantine | Quarantine config (see below) |
+| `measures` | list | No | Measure definitions (passthrough, rename, or calculated) |
+| `deduplicate` | bool | No | Remove duplicates before processing |
+| `keys` | list | dedupe | Keys for deduplication |
+| `audit.load_timestamp` | bool | No | Add load_timestamp column |
+| `audit.source_system` | str | No | Add source_system column |
+
+**Dimension Lookup Config:**
+```yaml
+dimensions:
+  - source_column: customer_id     # Column in source fact
+    dimension_table: dim_customer  # Dimension in context
+    dimension_key: customer_id     # Natural key in dimension
+    surrogate_key: customer_sk     # SK to retrieve
+    scd2: true                     # Filter is_current=true
+```
+
+**Quarantine Config (for orphan_handling: quarantine):**
+```yaml
+quarantine:
+  connection: silver          # Required: connection name
+  path: fact_orders_orphans   # OR table: quarantine_table
+  add_columns:
+    _rejection_reason: true   # Add rejection reason
+    _rejected_at: true        # Add rejection timestamp
+    _source_dimension: true   # Add dimension name
+```
+
+**Example:**
+```yaml
+pattern:
+  type: fact
+  params:
+    grain: [order_id]
+    dimensions:
+      - source_column: customer_id
+        dimension_table: dim_customer
+        dimension_key: customer_id
+        surrogate_key: customer_sk
+        scd2: true
+      - source_column: product_id
+        dimension_table: dim_product
+        dimension_key: product_id
+        surrogate_key: product_sk
+    orphan_handling: unknown
+    measures:
+      - quantity
+      - revenue: "quantity * unit_price"
+    audit:
+      load_timestamp: true
+      source_system: "pos"
+```
+
+---
+
+## AggregationPattern
+
+Declarative aggregation with GROUP BY and optional incremental merge.
+
+**When to Use:**
+- Building summary/aggregate tables (daily sales, monthly metrics)
+- Need incremental aggregation (update existing aggregates)
+- Gold layer reporting tables
+
+**Beginner Note:**
+Aggregations summarize facts at a higher grain.
+Example: daily_sales aggregates orders by date with SUM(revenue).
+
+**See Also:** [FactPattern](#factpattern)
+
+**Features:**
+- Declare grain (GROUP BY columns)
+- Define measures with SQL aggregation expressions
+- Optional HAVING filter
+- Audit columns
+
+**Params:**
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `grain` | list | Yes | Columns to GROUP BY (defines uniqueness) |
+| `measures` | list | Yes | Measure definitions with name and expr |
+| `having` | str | No | HAVING clause for filtering aggregates |
+| `incremental.timestamp_column` | str | No | Column to identify new data |
+| `incremental.merge_strategy` | str | No | "replace", "sum", "min", or "max" |
+| `audit.load_timestamp` | bool | No | Add load_timestamp column |
+| `audit.source_system` | str | No | Add source_system column |
+
+**Example:**
+```yaml
+pattern:
+  type: aggregation
+  params:
+    grain: [date_sk, product_sk, region]
+    measures:
+      - name: total_revenue
+        expr: "SUM(total_amount)"
+      - name: order_count
+        expr: "COUNT(*)"
+      - name: avg_order_value
+        expr: "AVG(total_amount)"
+    having: "COUNT(*) > 0"
+    audit:
+      load_timestamp: true
+```
+""",
+    "Transformation": """
+### How to Use Transformers
+
+You can use any transformer in two ways:
+
+**1. As a Top-Level Transformer ("The App")**
+Use this for major operations that define the node's purpose (e.g. Merge, SCD2).
+```yaml
+- name: "my_node"
+  transformer: "<transformer_name>"
+  params:
+    <param_name>: <value>
+```
+
+**2. As a Step in a Chain ("The Script")**
+Use this for smaller operations within a `transform` block (e.g. clean_text, filter).
+```yaml
+- name: "my_node"
+  transform:
+    steps:
+      - function: "<transformer_name>"
+        params:
+          <param_name>: <value>
+```
+
+**Available Transformers:**
+The models below describe the `params` required for each transformer.
+""",
+}
+
+# --- Logic ---
+
+
+def discover_modules(root_dir: str = "odibi") -> List[str]:
+    """Recursively discover all Python modules in the package."""
+    modules = []
+    path = Path(root_dir)
+
+    # Handle running from inside odibi/ vs root
+    if not path.exists():
+        # Try finding it in current directory
+        if Path("odibi").exists():
+            path = Path("odibi")
+        else:
+            # If we are inside the package already?
+            # Assumption: Script is run from project root d:/odibi
+            # so odibi/ should exist.
+            print(f"Warning: Could not find root directory '{root_dir}'", file=sys.stderr)
+            return []
+
+    # Directories to skip (experimental/archived, or with optional deps like gradio)
+    skip_dirs = {"agents", "ui", "_archive"}
+
+    for file_path in path.rglob("*.py"):
+        if "introspect.py" in str(file_path):  # Avoid self
+            continue
+        if "tests" in file_path.parts or "test_" in file_path.name:  # Skip test files
+            continue
+        if skip_dirs.intersection(file_path.parts):  # Skip experimental directories
+            continue
+
+        # Convert path to module notation
+        # e.g. odibi\transformers\scd.py -> odibi.transformers.scd
+        try:
+            # If path is absolute or relative to cwd, we need to find the 'odibi' package root
+            # simpler: assume we are at project root, so 'odibi/...' maps to 'odibi.'
+            parts = list(file_path.parts)
+
+            # Find where 'odibi' starts in the path parts
+            if "odibi" in parts:
+                start_idx = parts.index("odibi")
+                rel_parts = parts[start_idx:]
+                module_name = ".".join(rel_parts).replace(".py", "")
+
+                # Fix __init__ (odibi.transformers.__init__ -> odibi.transformers)
+                if module_name.endswith(".__init__"):
+                    module_name = module_name[:-9]
+
+                modules.append(module_name)
+        except Exception as e:
+            print(f"Skipping {file_path}: {e}")
+
+    return sorted(list(set(modules)))
+
+
+def get_docstring(obj: Any) -> Optional[str]:
+    doc = inspect.getdoc(obj)
+    if doc is None:
+        return None
+    # Prevent inheriting Pydantic's internal docstring
+    if doc == inspect.getdoc(BaseModel):
+        return None
+    return doc
+
+
+def get_summary(obj: Any) -> Optional[str]:
+    doc = get_docstring(obj)
+    if not doc:
+        return None
+    return doc.split("\n")[0].strip()
+
+
+def clean_type_str(s: str) -> str:
+    """Clean up raw type string."""
+    s = s.replace("typing.", "")
+    s = s.replace("odibi.config.", "")
+    s = s.replace("odibi.enums.", "")
+    s = s.replace("pydantic.types.", "")
+    s = s.replace("NoneType", "None")
+    s = s.replace("False", "bool")  # Literal[False] often shows as False
+    s = s.replace("True", "bool")
+    return s
+
+
+def format_type_hint(annotation: Any) -> str:
+    """Robust type hint formatting."""
+    if annotation is inspect.Parameter.empty:
+        return "Any"
+
+    # Handle Annotated (strip metadata)
+    if get_origin(annotation) is Annotated:
+        args = get_args(annotation)
+        if args:
+            return format_type_hint(args[0])
+
+    # Handle Union / Optional (including Python 3.10+ X | Y syntax)
+    origin = get_origin(annotation)
+    is_union = origin is Union or (HAS_UNION_TYPE and isinstance(annotation, UnionType))
+    if is_union:
+        args = get_args(annotation)
+        # Check if it's Optional (Union[T, None])
+        # Filter out NoneType
+        non_none = [a for a in args if a is not type(None)]
+
+        # Check if this Union matches a known Alias
+        arg_names = set()
+        for a in non_none:
+            if hasattr(a, "__name__"):
+                arg_names.add(a.__name__)
+            else:
+                arg_names.add(str(a))
+
+        for alias, components in TYPE_ALIASES.items():
+            if arg_names == set(components):
+                return alias
+
+        formatted_args = [format_type_hint(a) for a in non_none]
+        if len(formatted_args) == 1:
+            return f"Optional[{formatted_args[0]}]"
+        return " | ".join(formatted_args)
+
+    # Handle Literal
+    s_annot = str(annotation)
+    if "Literal" in s_annot and (
+        "typing.Literal" in s_annot or "typing_extensions.Literal" in s_annot
+    ):
+        args = get_args(annotation)
+        clean_args = []
+        for a in args:
+            if hasattr(a, "value"):  # Enum member
+                clean_args.append(repr(a.value))
+            else:
+                clean_args.append(repr(a))
+        return f"Literal[{', '.join(clean_args)}]"
+
+    # Handle List/Dict
+    if origin is list or origin is List:
+        args = get_args(annotation)
+        inner = format_type_hint(args[0]) if args else "Any"
+        return f"List[{inner}]"
+
+    if origin is dict or origin is Dict:
+        args = get_args(annotation)
+        k = format_type_hint(args[0]) if args else "Any"
+        v = format_type_hint(args[1]) if args else "Any"
+        return f"Dict[{k}, {v}]"
+
+    # Handle Classes / Strings
+    if isinstance(annotation, type):
+        return clean_type_str(annotation.__name__)
+
+    return clean_type_str(str(annotation))
+
+
+def get_pydantic_fields(cls: Type[BaseModel]) -> List[FieldDoc]:
+    """Extract fields from a Pydantic model."""
+    fields = []
+
+    # Pydantic V2
+    if hasattr(cls, "model_fields"):
+        for name, field in cls.model_fields.items():
+            # Get type annotation
+            type_hint = "Any"
+            if field.annotation is not None:
+                type_hint = format_type_hint(field.annotation)
+
+            # Check required/default
+            required = field.is_required()
+            default = None
+            if not required:
+                if field.default is not None:
+                    default = str(field.default)
+
+            fields.append(
+                FieldDoc(
+                    name=name,
+                    type_hint=type_hint,
+                    required=required,
+                    default=default,
+                    description=field.description,
+                )
+            )
+    # Pydantic V1 fallback
+    elif hasattr(cls, "__fields__"):
+        for name, field in cls.__fields__.items():
+            type_hint = format_type_hint(field.outer_type_)
+            required = field.required
+            default = str(field.default) if not required else None
+            fields.append(
+                FieldDoc(
+                    name=name,
+                    type_hint=type_hint,
+                    required=required,
+                    default=default,
+                    description=field.field_info.description,
+                )
+            )
+
+    return fields
+
+
+def get_registry_info(model_cls: Type[BaseModel]) -> Dict[str, Optional[str]]:
+    """Lookup function info from registry using the model class."""
+    if not HAS_REGISTRY:
+        return {}
+
+    # Iterate registry to find matching model
+    # FunctionRegistry._param_models: Dict[str, BaseModel]
+    # Accessing protected member is necessary here
+    for name, model in FunctionRegistry._param_models.items():
+        if model is model_cls:
+            # Found it!
+            try:
+                func_info = FunctionRegistry.get_function_info(name)
+                return {
+                    "function_name": name,
+                    "function_doc": func_info.get("docstring"),
+                }
+            except ValueError:
+                pass
+    return {}
+
+
+def scan_module_for_models(module_name: str, group_map: Dict[str, str]) -> List[ModelDoc]:
+    """Scan a module for Pydantic models."""
+    try:
+        module = importlib.import_module(module_name)
+    except ImportError as e:
+        print(f"Warning: Could not import {module_name}: {e}", file=sys.stderr)
+        return []
+
+    models = []
+    for name, obj in inspect.getmembers(module):
+        if not inspect.isclass(obj):
+            continue
+
+        # Allow including models from sub-modules if they are part of the target package
+        # But generally prefer defining module
+        if not hasattr(obj, "__module__"):
+            continue
+
+        # Filter out imported pydantic base
+        if obj is BaseModel:
+            continue
+
+        if issubclass(obj, BaseModel):
+            # Only document models that are either in the module or are explicitly desired
+            # For odibi.config, we want everything defined there
+            if obj.__module__ == module_name:
+                # Determine Group and Category
+                group = "Other"
+                category = None
+
+                if name in group_map:
+                    group = group_map[name]
+                elif module_name.startswith("odibi.transformers"):
+                    group = "Transformation"
+                    category = TRANSFORM_CATEGORY_MAP.get(module_name, "Other Transformers")
+
+                # Get Registry Info
+                reg_info = get_registry_info(obj)
+
+                fields = get_pydantic_fields(obj)
+
+                models.append(
+                    ModelDoc(
+                        name=name,
+                        module=module_name,
+                        summary=get_summary(obj),
+                        docstring=get_docstring(obj),
+                        fields=fields,
+                        group=group,
+                        category=category,
+                        function_name=reg_info.get("function_name"),
+                        function_doc=reg_info.get("function_doc"),
+                    )
+                )
+    return models
+
+
+def build_usage_map(models: List[ModelDoc]) -> Dict[str, Set[str]]:
+    """Build a reverse index of where models are used."""
+    usage = {m.name: set() for m in models}
+    model_names = set(m.name for m in models)
+
+    for m in models:
+        for f in m.fields:
+            # Naive check: if model name appears in type hint
+            # This handles List[NodeConfig], Optional[ReadConfig], etc.
+            for target in model_names:
+                if target == m.name:
+                    continue
+                # Check for whole word match to avoid partials
+                if re.search(r"\b" + re.escape(target) + r"\b", f.type_hint):
+                    usage[target].add(m.name)
+
+            # Expand type aliases - if field uses an alias, mark all components as used
+            for alias, components in TYPE_ALIASES.items():
+                if alias in f.type_hint:
+                    for component in components:
+                        if component in usage and component != m.name:
+                            usage[component].add(m.name)
+
+    return usage
+
+
+def generate_docs(output_path: str = "docs/reference/yaml_schema.md"):
+    """Run introspection and save to file."""
+    print("Scanning configuration models...")
+
+    modules = discover_modules()
+    print(f"Discovered {len(modules)} modules.")
+
+    all_models = []
+    for mod in modules:
+        all_models.extend(scan_module_for_models(mod, GROUP_MAPPING))
+
+    # Build Reverse Index
+    usage_map = build_usage_map(all_models)
+    for m in all_models:
+        if m.name in usage_map:
+            m.used_in = sorted(list(usage_map[m.name]))
+
+    # Organize by group
+    grouped = {
+        "Core": [],
+        "Connection": [],
+        "Operation": [],
+        "Contract": [],
+        "Setting": [],
+        "Transformation": [],
+        "Semantic": [],
+        "Validation": [],
+        "Pattern": [],
+        "Other": [],
+    }
+
+    for m in all_models:
+        if m.group in grouped:
+            grouped[m.group].append(m)
+        else:
+            grouped["Other"].append(m)
+
+    # Render
+    lines = [
+        "# Odibi Configuration Reference",
+        "",
+        "This manual details the YAML configuration schema for Odibi projects.",
+        "*Auto-generated from Pydantic models.*",
+        "",
+    ]
+
+    # Define Section Order
+    sections = [
+        ("Core", "Project Structure"),
+        ("Connection", "Connections"),
+        ("Operation", "Node Operations"),
+        ("Contract", "Contracts (Data Quality Gates)"),
+        ("Setting", "Global Settings"),
+        ("Transformation", "Transformation Reference"),
+        ("Semantic", "Semantic Layer"),
+        ("Validation", "FK Validation"),
+        ("Pattern", "Data Patterns"),
+    ]
+
+    model_names = {m.name for m in all_models}
+
+    for group_key, title in sections:
+        models = grouped[group_key]
+        if not models:
+            continue
+
+        lines.append(f"## {title}")
+        lines.append("")
+
+        if group_key in SECTION_INTROS:
+            lines.append(SECTION_INTROS[group_key].strip())
+            lines.append("")
+
+        # Special handling for Transformation Grouping
+        if group_key == "Transformation":
+            lines.append("---")
+            lines.append("")
+
+            # Sort models by category, then name
+            def transform_sort_key(m):
+                # Defined order of categories
+                cat_order = [
+                    "Common Operations",
+                    "Relational Algebra",
+                    "Data Quality",
+                    "Warehousing Patterns",
+                    "Data Engineering Patterns",
+                    "Manufacturing & IoT",
+                    "Advanced & Feature Engineering",
+                    "Other Transformers",
+                ]
+                cat = m.category or "Other Transformers"
+                try:
+                    cat_idx = cat_order.index(cat)
+                except ValueError:
+                    cat_idx = 999
+
+                return (cat_idx, m.function_name or m.name)
+
+            models.sort(key=transform_sort_key)
+
+            current_category = None
+
+            for model in models:
+                # Category Header
+                if model.category != current_category:
+                    current_category = model.category
+                    lines.append(f"### 📂 {current_category}")
+                    lines.append("")
+
+                # Header with Function Name if available (preferred for transformers)
+                header_name = model.name
+                if model.function_name:
+                    header_name = f"`{model.function_name}` ({model.name})"
+
+                lines.append(f"#### {header_name}")
+
+                # Function Docstring (Design/Impl details)
+                if model.function_doc:
+                    lines.append(f"{model.function_doc}")
+                    lines.append("")
+
+                # Model Docstring (Configuration details)
+                # If function doc is present, we might want to skip model doc if it's redundant,
+                # but usually model doc has the YAML examples which are critical.
+                if model.docstring:
+                    if (
+                        model.function_doc
+                        and model.docstring.strip() == model.function_doc.strip()
+                    ):
+                        pass  # Skip duplicate
+                    else:
+                        lines.append(f"{model.docstring}\n")
+
+                lines.append("[Back to Catalog](#nodeconfig)")
+                lines.append("")
+
+                if model.fields:
+                    lines.append("| Field | Type | Required | Default | Description |")
+                    lines.append("| --- | --- | --- | --- | --- |")
+                    for field in model.fields:
+                        req = "Yes" if field.required else "No"
+                        default = f"`{field.default}`" if field.default else "-"
+                        desc = field.description or "-"
+                        desc = desc.replace("|", "\\|").replace("\n", " ")
+
+                        # Cross Linking
+                        th_display = field.type_hint.replace("|", "\\|")
+                        for target in sorted(list(model_names), key=len, reverse=True):
+                            pattern = r"\b" + re.escape(target) + r"\b"
+                            if re.search(pattern, th_display):
+                                th_display = re.sub(
+                                    pattern, f"[{target}](#{target.lower()})", th_display
+                                )
+
+                        # Link type aliases to their sections
+                        for alias, anchor in TYPE_ALIAS_LINKS.items():
+                            pattern = r"\b" + re.escape(alias) + r"\b"
+                            if re.search(pattern, th_display):
+                                th_display = re.sub(
+                                    pattern, f"[{alias}](#{anchor})", th_display
+                                )
+
+                        # Auto-expand aliases in description
+                        for alias, components in TYPE_ALIASES.items():
+                            if alias in field.type_hint:
+                                links_list = []
+                                for c in components:
+                                    if c in model_names:
+                                        links_list.append(f"[{c}](#{c.lower()})")
+                                    else:
+                                        links_list.append(c)
+
+                                if links_list:
+                                    prefix = (
+                                        "<br>**Options:** " if desc != "-" else "**Options:** "
+                                    )
+                                    if desc == "-":
+                                        desc = ""
+                                    desc += f"{prefix}{', '.join(links_list)}"
+
+                        lines.append(
+                            f"| **{field.name}** | {th_display} | {req} | {default} | {desc} |"
+                        )
+                    lines.append("")
+
+                lines.append("---\n")
+
+            # Skip standard processing for this group since we did custom rendering
+            continue
+
+        lines.append("---")
+        lines.append("")
+
+        # Sort models (Standard)
+        def sort_key(m):
+            try:
+                return (0, CUSTOM_ORDER.index(m.name))
+            except ValueError:
+                return (1, m.name)
+
+        models.sort(key=sort_key)
+
+        for model in models:
+            lines.append(f"### `{model.name}`")
+
+            # Reverse Index
+            if model.used_in:
+                links = [f"[{u}](#{u.lower()})" for u in model.used_in]
+                lines.append(f"> *Used in: {', '.join(links)}*")
+                lines.append("")
+
+            if model.docstring:
+                lines.append(f"{model.docstring}\n")
+            if model.group == "Transformation":
+                lines.append("[Back to Catalog](#nodeconfig)")
+                lines.append("")
+
+            if model.fields:
+                lines.append("| Field | Type | Required | Default | Description |")
+                lines.append("| --- | --- | --- | --- | --- |")
+                for field in model.fields:
+                    req = "Yes" if field.required else "No"
+                    default = f"`{field.default}`" if field.default else "-"
+                    desc = field.description or "-"
+                    desc = desc.replace("|", "\\|").replace("\n", " ")
+
+                    # Cross Linking
+                    th_display = field.type_hint.replace("|", "\\|")
+                    # Find all model names in the type hint and link them
+                    # Sort by length desc to replace longest first (avoid replacing substring)
+                    for target in sorted(list(model_names), key=len, reverse=True):
+                        pattern = r"\b" + re.escape(target) + r"\b"
+                        if re.search(pattern, th_display):
+                            th_display = re.sub(
+                                pattern, f"[{target}](#{target.lower()})", th_display
+                            )
+
+                    # Link type aliases to their sections
+                    for alias, anchor in TYPE_ALIAS_LINKS.items():
+                        pattern = r"\b" + re.escape(alias) + r"\b"
+                        if re.search(pattern, th_display):
+                            th_display = re.sub(pattern, f"[{alias}](#{anchor})", th_display)
+
+                    # Auto-expand aliases in description (to provide navigation)
+                    for alias, components in TYPE_ALIASES.items():
+                        if alias in field.type_hint:
+                            links_list = []
+                            for c in components:
+                                if c in model_names:
+                                    links_list.append(f"[{c}](#{c.lower()})")
+                                else:
+                                    links_list.append(c)
+
+                            if links_list:
+                                prefix = "<br>**Options:** " if desc != "-" else "**Options:** "
+                                if desc == "-":
+                                    desc = ""
+                                desc += f"{prefix}{', '.join(links_list)}"
+
+                    lines.append(
+                        f"| **{field.name}** | {th_display} | {req} | {default} | {desc} |"
+                    )
+                lines.append("")
+
+            lines.append("---\n")
+
+    path = Path(output_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # Strip trailing whitespace from each line to match pre-commit hook behavior
+    cleaned_lines = [line.rstrip() for line in lines]
+    # Ensure file ends with newline (pre-commit end-of-file-fixer requirement)
+    content = "\n".join(cleaned_lines)
+    if not content.endswith("\n"):
+        content += "\n"
+    path.write_text(content, encoding="utf-8")
+    print(f"Configuration Manual saved to {output_path}")
+
+
+if __name__ == "__main__":
+    # Ensure current directory is in path for imports
+    if str(Path.cwd()) not in sys.path:
+        sys.path.insert(0, str(Path.cwd()))
+
+    generate_docs()
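
For orientation, a minimal sketch of how the added module is exercised, based only on the entry points visible in this diff (`generate_docs` and `format_type_hint`) and assuming, as the `__main__` guard above does, that it runs from the project root so `odibi/` is importable:

```python
# Sketch: regenerate the Configuration Manual, then spot-check the
# type-hint formatter. Assumes the working directory is the project root.
import sys
from pathlib import Path
from typing import List, Optional

sys.path.insert(0, str(Path.cwd()))

from odibi.introspect import format_type_hint, generate_docs

# Unions with None collapse to Optional[...]; containers recurse.
assert format_type_hint(Optional[str]) == "Optional[str]"
assert format_type_hint(List[int]) == "List[int]"

# Writes markdown to docs/reference/yaml_schema.md (the default output_path).
generate_docs()
```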