odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,423 @@
1
+ import time
2
+ from datetime import date, datetime
3
+ from typing import Any, Optional
4
+
5
+ import pandas as pd
6
+
7
+ from odibi.context import EngineContext
8
+ from odibi.enums import EngineType
9
+ from odibi.patterns.base import Pattern
10
+ from odibi.utils.logging_context import get_logging_context
11
+
12
+
13
+ class DateDimensionPattern(Pattern):
14
+ """
15
+ Date Dimension Pattern: Generates a complete date dimension table.
16
+
17
+ Creates a date dimension with pre-calculated attributes useful for
18
+ BI/reporting including day of week, quarter, fiscal year, etc.
19
+
20
+ Configuration Options (via params dict):
21
+ - **start_date** (str): Start date in YYYY-MM-DD format
22
+ - **end_date** (str): End date in YYYY-MM-DD format
23
+ - **date_key_format** (str): Format for date_sk (default: "yyyyMMdd" -> 20240115). Currently not read by the generators: date_sk is always built as yyyyMMdd.
24
+ - **fiscal_year_start_month** (int): Month when fiscal year starts (1-12, default: 1)
25
+ - **include_time** (bool): If true, generate time dimension (not implemented yet)
26
+ - **unknown_member** (bool): If true, add unknown date row with date_sk=0
27
+
28
+ Generated Columns:
29
+ - date_sk: Integer surrogate key (YYYYMMDD format)
30
+ - full_date: The actual date
31
+ - day_of_week: Day name (Monday, Tuesday, etc.)
32
+ - day_of_week_num: Day number (1=Monday, 7=Sunday)
33
+ - day_of_month: Day of month (1-31)
34
+ - day_of_year: Day of year (1-366)
35
+ - is_weekend: Boolean flag
36
+ - week_of_year: ISO week number (1-53)
37
+ - month: Month number (1-12)
38
+ - month_name: Month name (January, February, etc.)
39
+ - quarter: Calendar quarter (1-4)
40
+ - quarter_name: Q1, Q2, Q3, Q4
41
+ - year: Calendar year
42
+ - fiscal_year: Fiscal year
43
+ - fiscal_quarter: Fiscal quarter (1-4)
44
+ - is_month_start: First day of month
45
+ - is_month_end: Last day of month
46
+ - is_year_start: First day of year
47
+ - is_year_end: Last day of year
48
+ """
49
+
50
def validate(self) -> None:
    """Check the pattern parameters and raise ``ValueError`` on the first problem.

    Checks, in order:
      1. ``start_date`` and ``end_date`` are present and truthy in ``self.params``.
      2. Both values are accepted by ``_parse_date`` and start <= end.
      3. ``fiscal_year_start_month`` (default 1) is an ``int`` between 1 and 12.

    Every failure is logged through the logging context before raising.

    Raises:
        ValueError: If any check fails. Parsing and ordering failures are
            re-raised as one "Invalid date parameters" error with the
            original exception attached as its ``__cause__``.
    """
    ctx = get_logging_context()
    ctx.debug(
        "DateDimensionPattern validation starting",
        pattern="DateDimensionPattern",
        params=self.params,
    )

    # Both bounds are mandatory; the messages differ only in the parameter
    # name and the example value shown to the user.
    for name, example in (("start_date", "2024-01-01"), ("end_date", "2024-12-31")):
        if not self.params.get(name):
            ctx.error(
                f"DateDimensionPattern validation failed: '{name}' is required",
                pattern="DateDimensionPattern",
            )
            raise ValueError(
                f"DateDimensionPattern: '{name}' parameter is required. "
                f"Expected format: 'YYYY-MM-DD' (e.g., '{example}'). "
                f"Provide a valid {name} in params."
            )

    try:
        start = self._parse_date(self.params["start_date"])
        end = self._parse_date(self.params["end_date"])
        if start > end:
            # Deliberately raised inside the try: the handler below wraps it
            # in the same "Invalid date parameters" error as a parse failure.
            raise ValueError(
                f"start_date must be before or equal to end_date. "
                f"Provided: start_date='{self.params['start_date']}', "
                f"end_date='{self.params['end_date']}'. "
                f"Swap the values or adjust the date range."
            )
    except Exception as e:
        ctx.error(
            f"DateDimensionPattern validation failed: {e}",
            pattern="DateDimensionPattern",
        )
        # Chain explicitly so the root cause is reported as the direct cause
        # rather than as an incidental "during handling" context.
        raise ValueError(
            f"DateDimensionPattern: Invalid date parameters. {e} "
            f"Provided: start_date='{self.params.get('start_date')}', "
            f"end_date='{self.params.get('end_date')}'. "
            f"Expected format: 'YYYY-MM-DD'."
        ) from e

    fiscal_month = self.params.get("fiscal_year_start_month", 1)
    if not isinstance(fiscal_month, int) or fiscal_month < 1 or fiscal_month > 12:
        ctx.error(
            "DateDimensionPattern validation failed: invalid fiscal_year_start_month",
            pattern="DateDimensionPattern",
        )
        raise ValueError(
            f"DateDimensionPattern: 'fiscal_year_start_month' must be an integer 1-12. "
            f"Provided: {fiscal_month!r} (type: {type(fiscal_month).__name__}). "
            f"Use an integer like 1 for January or 7 for July."
        )

    ctx.debug(
        "DateDimensionPattern validation passed",
        pattern="DateDimensionPattern",
    )
118
+
119
+ def _parse_date(self, date_str: str) -> date:
120
+ """Parse a date string in YYYY-MM-DD format."""
121
+ if isinstance(date_str, (date, datetime)):
122
+ return date_str if isinstance(date_str, date) else date_str.date()
123
+ return datetime.strptime(date_str, "%Y-%m-%d").date()
124
+
125
def execute(self, context: EngineContext) -> Any:
    """Generate the date dimension for the configured range.

    Reads ``start_date``, ``end_date``, ``fiscal_year_start_month`` (default 1)
    and ``unknown_member`` (default False) from ``self.params``, builds the
    table with the Spark generator when the context engine is Spark and with
    the pandas generator otherwise, and optionally prepends the unknown row.

    Args:
        context: Engine context; ``engine_type`` selects the generator.

    Returns:
        The generated dimension DataFrame.

    Raises:
        Exception: Any failure during generation is logged with its type and
            the elapsed time, then re-raised unchanged.
    """
    log = get_logging_context()
    started_at = time.time()

    params = self.params
    start_date = self._parse_date(params["start_date"])
    end_date = self._parse_date(params["end_date"])
    fiscal_year_start_month = params.get("fiscal_year_start_month", 1)
    unknown_member = params.get("unknown_member", False)

    log.debug(
        "DateDimensionPattern starting",
        pattern="DateDimensionPattern",
        start_date=str(start_date),
        end_date=str(end_date),
        fiscal_year_start_month=fiscal_year_start_month,
    )

    try:
        result_df = (
            self._generate_spark(context, start_date, end_date, fiscal_year_start_month)
            if context.engine_type == EngineType.SPARK
            else self._generate_pandas(start_date, end_date, fiscal_year_start_month)
        )

        if unknown_member:
            result_df = self._add_unknown_member(context, result_df)

        # Counting and logging stay inside the try so that a failure here is
        # reported through the same error path as a generation failure.
        row_count = self._get_row_count(result_df, context.engine_type)
        elapsed_ms = (time.time() - started_at) * 1000
        log.info(
            "DateDimensionPattern completed",
            pattern="DateDimensionPattern",
            elapsed_ms=round(elapsed_ms, 2),
            rows_generated=row_count,
            start_date=str(start_date),
            end_date=str(end_date),
        )
        return result_df

    except Exception as e:
        elapsed_ms = (time.time() - started_at) * 1000
        log.error(
            f"DateDimensionPattern failed: {e}",
            pattern="DateDimensionPattern",
            error_type=type(e).__name__,
            elapsed_ms=round(elapsed_ms, 2),
        )
        raise
176
+
177
def _get_row_count(self, df, engine_type) -> Optional[int]:
    """Return the number of rows in ``df``, or ``None`` if counting fails.

    Uses ``df.count()`` for the Spark engine and ``len(df)`` for any other
    engine. The count is best-effort: any exception is swallowed and
    reported as ``None`` instead of propagating.
    """
    try:
        return df.count() if engine_type == EngineType.SPARK else len(df)
    except Exception:
        return None
185
+
186
+ def _generate_pandas(
187
+ self, start_date: date, end_date: date, fiscal_year_start_month: int
188
+ ) -> pd.DataFrame:
189
+ """Generate date dimension using Pandas."""
190
+ dates = pd.date_range(start=start_date, end=end_date, freq="D")
191
+
192
+ df = pd.DataFrame({"full_date": dates})
193
+
194
+ df["date_sk"] = df["full_date"].dt.strftime("%Y%m%d").astype(int)
195
+
196
+ df["day_of_week"] = df["full_date"].dt.day_name()
197
+ df["day_of_week_num"] = df["full_date"].dt.dayofweek + 1
198
+ df["day_of_month"] = df["full_date"].dt.day
199
+ df["day_of_year"] = df["full_date"].dt.dayofyear
200
+
201
+ df["is_weekend"] = df["day_of_week_num"].isin([6, 7])
202
+
203
+ df["week_of_year"] = df["full_date"].dt.isocalendar().week.astype(int)
204
+
205
+ df["month"] = df["full_date"].dt.month
206
+ df["month_name"] = df["full_date"].dt.month_name()
207
+
208
+ df["quarter"] = df["full_date"].dt.quarter
209
+ df["quarter_name"] = "Q" + df["quarter"].astype(str)
210
+
211
+ df["year"] = df["full_date"].dt.year
212
+
213
+ df["fiscal_year"] = df.apply(
214
+ lambda row: self._calc_fiscal_year(row["full_date"], fiscal_year_start_month),
215
+ axis=1,
216
+ )
217
+ df["fiscal_quarter"] = df.apply(
218
+ lambda row: self._calc_fiscal_quarter(row["full_date"], fiscal_year_start_month),
219
+ axis=1,
220
+ )
221
+
222
+ df["is_month_start"] = df["full_date"].dt.is_month_start
223
+ df["is_month_end"] = df["full_date"].dt.is_month_end
224
+ df["is_year_start"] = (df["month"] == 1) & (df["day_of_month"] == 1)
225
+ df["is_year_end"] = (df["month"] == 12) & (df["day_of_month"] == 31)
226
+
227
+ df["full_date"] = df["full_date"].dt.date
228
+
229
+ column_order = [
230
+ "date_sk",
231
+ "full_date",
232
+ "day_of_week",
233
+ "day_of_week_num",
234
+ "day_of_month",
235
+ "day_of_year",
236
+ "is_weekend",
237
+ "week_of_year",
238
+ "month",
239
+ "month_name",
240
+ "quarter",
241
+ "quarter_name",
242
+ "year",
243
+ "fiscal_year",
244
+ "fiscal_quarter",
245
+ "is_month_start",
246
+ "is_month_end",
247
+ "is_year_start",
248
+ "is_year_end",
249
+ ]
250
+ return df[column_order]
251
+
252
+ def _calc_fiscal_year(self, dt, fiscal_start_month: int) -> int:
253
+ """Calculate fiscal year based on fiscal start month."""
254
+ if isinstance(dt, pd.Timestamp):
255
+ month = dt.month
256
+ year = dt.year
257
+ else:
258
+ month = dt.month
259
+ year = dt.year
260
+
261
+ if fiscal_start_month == 1:
262
+ return year
263
+ if month >= fiscal_start_month:
264
+ return year + 1
265
+ return year
266
+
267
+ def _calc_fiscal_quarter(self, dt, fiscal_start_month: int) -> int:
268
+ """Calculate fiscal quarter based on fiscal start month."""
269
+ if isinstance(dt, pd.Timestamp):
270
+ month = dt.month
271
+ else:
272
+ month = dt.month
273
+
274
+ adjusted_month = (month - fiscal_start_month) % 12
275
+ return (adjusted_month // 3) + 1
276
+
277
def _generate_spark(
    self, context: EngineContext, start_date: date, end_date: date, fiscal_year_start_month: int
):
    """Generate the date dimension as a Spark DataFrame.

    Args:
        context: Engine context providing the ``spark`` session.
        start_date: First date of the range (inclusive).
        end_date: Last date of the range (inclusive).
        fiscal_year_start_month: Month number in which the fiscal year begins.

    Returns:
        A DataFrame with one row per day and the same column set and order
        as the pandas generator.
    """
    from pyspark.sql import functions as F
    from pyspark.sql.types import IntegerType

    day_count = (end_date - start_date).days + 1
    first_day = start_date.strftime("%Y-%m-%d")

    # One row per day: offset the start date by the generated row id.
    df = context.spark.range(day_count).select(
        F.date_add(F.lit(first_day), F.col("id").cast(IntegerType())).alias("full_date")
    )

    # Spark numbers Sunday as 1; remap so that 1=Monday ... 7=Sunday.
    spark_dow = F.dayofweek("full_date")
    iso_dow = F.when(spark_dow == 1, 7).otherwise(spark_dow - 1)

    if fiscal_year_start_month == 1:
        # Fiscal calendar coincides with the calendar year.
        fiscal_year = F.col("year")
        fiscal_quarter = F.col("quarter")
    else:
        fiscal_year = F.when(
            F.col("month") >= fiscal_year_start_month, F.col("year") + 1
        ).otherwise(F.col("year"))
        adjusted_month = (F.col("month") - fiscal_year_start_month + 12) % 12
        fiscal_quarter = (adjusted_month / 3).cast(IntegerType()) + 1

    # Applied strictly in this order: later expressions refer to columns
    # added by earlier entries (e.g. is_weekend reads day_of_week_num).
    derived = [
        ("date_sk", F.date_format("full_date", "yyyyMMdd").cast(IntegerType())),
        ("day_of_week", F.date_format("full_date", "EEEE")),
        ("day_of_week_num", iso_dow),
        ("day_of_month", F.dayofmonth("full_date")),
        ("day_of_year", F.dayofyear("full_date")),
        ("is_weekend", F.col("day_of_week_num").isin([6, 7])),
        ("week_of_year", F.weekofyear("full_date")),
        ("month", F.month("full_date")),
        ("month_name", F.date_format("full_date", "MMMM")),
        ("quarter", F.quarter("full_date")),
        ("quarter_name", F.concat(F.lit("Q"), F.col("quarter"))),
        ("year", F.year("full_date")),
        ("fiscal_year", fiscal_year),
        ("fiscal_quarter", fiscal_quarter),
        ("is_month_start", F.col("day_of_month") == 1),
        ("is_month_end", F.col("full_date") == F.last_day("full_date")),
        ("is_year_start", (F.col("month") == 1) & (F.col("day_of_month") == 1)),
        ("is_year_end", (F.col("month") == 12) & (F.col("day_of_month") == 31)),
    ]
    for column_name, expression in derived:
        df = df.withColumn(column_name, expression)

    # Output order: surrogate key, the date itself, then the remaining
    # attributes in the order they were derived.
    column_order = ["date_sk", "full_date"] + [name for name, _ in derived[1:]]
    return df.select(column_order)
368
+
369
def _add_unknown_member(self, context: EngineContext, df):
    """Prepend the "unknown" member row (``date_sk=0``) to the dimension.

    The row uses 1900-01-01 as its date, 0 for every numeric attribute,
    "Unknown" for every label and False for every flag.

    Args:
        context: Engine context; ``engine_type`` selects the Spark or pandas
            path, and ``spark`` is used to build the row on Spark.
        df: The generated date dimension.

    Returns:
        A DataFrame of the same kind as ``df`` with the unknown row first.
    """
    unknown_data = {
        "date_sk": 0,
        "full_date": date(1900, 1, 1),
        "day_of_week": "Unknown",
        "day_of_week_num": 0,
        "day_of_month": 0,
        "day_of_year": 0,
        "is_weekend": False,
        "week_of_year": 0,
        "month": 0,
        "month_name": "Unknown",
        "quarter": 0,
        "quarter_name": "Unknown",
        "year": 0,
        "fiscal_year": 0,
        "fiscal_quarter": 0,
        "is_month_start": False,
        "is_month_end": False,
        "is_year_start": False,
        "is_year_end": False,
    }

    if context.engine_type == EngineType.SPARK:
        from pyspark.sql import Row

        unknown_row = context.spark.createDataFrame([Row(**unknown_data)])
        # Union by name so the result does not depend on field ordering.
        return unknown_row.unionByName(df)

    unknown_row = pd.DataFrame([unknown_data])
    return pd.concat([unknown_row, df], ignore_index=True)