odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from datetime import date, datetime
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from odibi.context import EngineContext
|
|
8
|
+
from odibi.enums import EngineType
|
|
9
|
+
from odibi.patterns.base import Pattern
|
|
10
|
+
from odibi.utils.logging_context import get_logging_context
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DateDimensionPattern(Pattern):
|
|
14
|
+
"""
|
|
15
|
+
Date Dimension Pattern: Generates a complete date dimension table.
|
|
16
|
+
|
|
17
|
+
Creates a date dimension with pre-calculated attributes useful for
|
|
18
|
+
BI/reporting including day of week, quarter, fiscal year, etc.
|
|
19
|
+
|
|
20
|
+
Configuration Options (via params dict):
|
|
21
|
+
- **start_date** (str): Start date in YYYY-MM-DD format
|
|
22
|
+
- **end_date** (str): End date in YYYY-MM-DD format
|
|
23
|
+
- **date_key_format** (str): Format for date_sk (default: "yyyyMMdd" -> 20240115). Note: the generators in this class do not read this option; date_sk is always built as YYYYMMDD.
|
|
24
|
+
- **fiscal_year_start_month** (int): Month when fiscal year starts (1-12, default: 1)
|
|
25
|
+
- **include_time** (bool): If true, generate time dimension (not implemented yet)
|
|
26
|
+
- **unknown_member** (bool): If true, add unknown date row with date_sk=0
|
|
27
|
+
|
|
28
|
+
Generated Columns:
|
|
29
|
+
- date_sk: Integer surrogate key (YYYYMMDD format)
|
|
30
|
+
- full_date: The actual date
|
|
31
|
+
- day_of_week: Day name (Monday, Tuesday, etc.)
|
|
32
|
+
- day_of_week_num: Day number (1=Monday, 7=Sunday)
|
|
33
|
+
- day_of_month: Day of month (1-31)
|
|
34
|
+
- day_of_year: Day of year (1-366)
|
|
35
|
+
- is_weekend: Boolean flag
|
|
36
|
+
- week_of_year: ISO week number (1-53)
|
|
37
|
+
- month: Month number (1-12)
|
|
38
|
+
- month_name: Month name (January, February, etc.)
|
|
39
|
+
- quarter: Calendar quarter (1-4)
|
|
40
|
+
- quarter_name: Q1, Q2, Q3, Q4
|
|
41
|
+
- year: Calendar year
|
|
42
|
+
- fiscal_year: Fiscal year
|
|
43
|
+
- fiscal_quarter: Fiscal quarter (1-4)
|
|
44
|
+
- is_month_start: First day of month
|
|
45
|
+
- is_month_end: Last day of month
|
|
46
|
+
- is_year_start: First day of year
|
|
47
|
+
- is_year_end: Last day of year
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
def validate(self) -> None:
    """Check the pattern parameters before the dimension is generated.

    Verifies that ``start_date`` and ``end_date`` are present and parseable,
    that ``start_date`` is not after ``end_date``, and that
    ``fiscal_year_start_month`` (default 1) is an integer from 1 to 12.
    Every failure is reported through the logging context before raising.

    Raises:
        ValueError: If any of the checks above fails. When the failure comes
            from parsing or comparing the dates, the underlying exception is
            attached as ``__cause__``.
    """
    ctx = get_logging_context()
    ctx.debug(
        "DateDimensionPattern validation starting",
        pattern="DateDimensionPattern",
        params=self.params,
    )

    if not self.params.get("start_date"):
        ctx.error(
            "DateDimensionPattern validation failed: 'start_date' is required",
            pattern="DateDimensionPattern",
        )
        raise ValueError(
            "DateDimensionPattern: 'start_date' parameter is required. "
            "Expected format: 'YYYY-MM-DD' (e.g., '2024-01-01'). "
            "Provide a valid start_date in params."
        )

    if not self.params.get("end_date"):
        ctx.error(
            "DateDimensionPattern validation failed: 'end_date' is required",
            pattern="DateDimensionPattern",
        )
        raise ValueError(
            "DateDimensionPattern: 'end_date' parameter is required. "
            "Expected format: 'YYYY-MM-DD' (e.g., '2024-12-31'). "
            "Provide a valid end_date in params."
        )

    try:
        start = self._parse_date(self.params["start_date"])
        end = self._parse_date(self.params["end_date"])
        if start > end:
            raise ValueError(
                "start_date must be before or equal to end_date. "
                f"Provided: start_date='{self.params['start_date']}', "
                f"end_date='{self.params['end_date']}'. "
                "Swap the values or adjust the date range."
            )
    except Exception as e:
        # Parse errors and the ordering error above are both funnelled into
        # one uniform ValueError; chain the original so its traceback is kept.
        ctx.error(
            f"DateDimensionPattern validation failed: {e}",
            pattern="DateDimensionPattern",
        )
        raise ValueError(
            f"DateDimensionPattern: Invalid date parameters. {e} "
            f"Provided: start_date='{self.params.get('start_date')}', "
            f"end_date='{self.params.get('end_date')}'. "
            "Expected format: 'YYYY-MM-DD'."
        ) from e

    fiscal_month = self.params.get("fiscal_year_start_month", 1)
    if not isinstance(fiscal_month, int) or fiscal_month < 1 or fiscal_month > 12:
        ctx.error(
            "DateDimensionPattern validation failed: invalid fiscal_year_start_month",
            pattern="DateDimensionPattern",
        )
        raise ValueError(
            "DateDimensionPattern: 'fiscal_year_start_month' must be an integer 1-12. "
            f"Provided: {fiscal_month!r} (type: {type(fiscal_month).__name__}). "
            "Use an integer like 1 for January or 7 for July."
        )

    ctx.debug(
        "DateDimensionPattern validation passed",
        pattern="DateDimensionPattern",
    )
|
|
118
|
+
|
|
119
|
+
def _parse_date(self, date_str: str) -> date:
|
|
120
|
+
"""Parse a date string in YYYY-MM-DD format."""
|
|
121
|
+
if isinstance(date_str, (date, datetime)):
|
|
122
|
+
return date_str if isinstance(date_str, date) else date_str.date()
|
|
123
|
+
return datetime.strptime(date_str, "%Y-%m-%d").date()
|
|
124
|
+
|
|
125
|
+
def execute(self, context: EngineContext) -> Any:
    """Build the date dimension for the configured date range.

    Reads ``start_date``, ``end_date``, ``fiscal_year_start_month`` (default 1)
    and ``unknown_member`` (default False) from ``self.params``. The Spark
    generator is used when the context's engine type is Spark; every other
    engine type goes through the pandas generator.

    Args:
        context: Engine context supplying ``engine_type`` (and the Spark
            session for the Spark path).

    Returns:
        The generated dimension DataFrame, with the unknown-member row
        prepended when ``unknown_member`` is truthy.

    Raises:
        Exception: Any error raised during generation is logged with its
            elapsed time and re-raised unchanged.
    """
    log = get_logging_context()
    started_at = time.time()

    range_start = self._parse_date(self.params["start_date"])
    range_end = self._parse_date(self.params["end_date"])
    fiscal_start = self.params.get("fiscal_year_start_month", 1)
    wants_unknown_row = self.params.get("unknown_member", False)

    log.debug(
        "DateDimensionPattern starting",
        pattern="DateDimensionPattern",
        start_date=str(range_start),
        end_date=str(range_end),
        fiscal_year_start_month=fiscal_start,
    )

    try:
        on_spark = context.engine_type == EngineType.SPARK
        if on_spark:
            dimension = self._generate_spark(context, range_start, range_end, fiscal_start)
        else:
            dimension = self._generate_pandas(range_start, range_end, fiscal_start)

        if wants_unknown_row:
            dimension = self._add_unknown_member(context, dimension)

        total_rows = self._get_row_count(dimension, context.engine_type)
        duration_ms = (time.time() - started_at) * 1000

        log.info(
            "DateDimensionPattern completed",
            pattern="DateDimensionPattern",
            elapsed_ms=round(duration_ms, 2),
            rows_generated=total_rows,
            start_date=str(range_start),
            end_date=str(range_end),
        )
        return dimension

    except Exception as exc:
        # Record how long the failed attempt ran, then let the caller see the
        # original exception.
        duration_ms = (time.time() - started_at) * 1000
        log.error(
            f"DateDimensionPattern failed: {exc}",
            pattern="DateDimensionPattern",
            error_type=type(exc).__name__,
            elapsed_ms=round(duration_ms, 2),
        )
        raise
|
|
176
|
+
|
|
177
|
+
def _get_row_count(self, df, engine_type) -> Optional[int]:
    """Return the number of rows in ``df``, or None if counting fails.

    Uses ``df.count()`` for the Spark engine type and ``len(df)`` otherwise.
    The count is only used for logging, so any error is swallowed.
    """
    try:
        return df.count() if engine_type == EngineType.SPARK else len(df)
    except Exception:
        return None
|
|
185
|
+
|
|
186
|
+
def _generate_pandas(
|
|
187
|
+
self, start_date: date, end_date: date, fiscal_year_start_month: int
|
|
188
|
+
) -> pd.DataFrame:
|
|
189
|
+
"""Generate date dimension using Pandas."""
|
|
190
|
+
dates = pd.date_range(start=start_date, end=end_date, freq="D")
|
|
191
|
+
|
|
192
|
+
df = pd.DataFrame({"full_date": dates})
|
|
193
|
+
|
|
194
|
+
df["date_sk"] = df["full_date"].dt.strftime("%Y%m%d").astype(int)
|
|
195
|
+
|
|
196
|
+
df["day_of_week"] = df["full_date"].dt.day_name()
|
|
197
|
+
df["day_of_week_num"] = df["full_date"].dt.dayofweek + 1
|
|
198
|
+
df["day_of_month"] = df["full_date"].dt.day
|
|
199
|
+
df["day_of_year"] = df["full_date"].dt.dayofyear
|
|
200
|
+
|
|
201
|
+
df["is_weekend"] = df["day_of_week_num"].isin([6, 7])
|
|
202
|
+
|
|
203
|
+
df["week_of_year"] = df["full_date"].dt.isocalendar().week.astype(int)
|
|
204
|
+
|
|
205
|
+
df["month"] = df["full_date"].dt.month
|
|
206
|
+
df["month_name"] = df["full_date"].dt.month_name()
|
|
207
|
+
|
|
208
|
+
df["quarter"] = df["full_date"].dt.quarter
|
|
209
|
+
df["quarter_name"] = "Q" + df["quarter"].astype(str)
|
|
210
|
+
|
|
211
|
+
df["year"] = df["full_date"].dt.year
|
|
212
|
+
|
|
213
|
+
df["fiscal_year"] = df.apply(
|
|
214
|
+
lambda row: self._calc_fiscal_year(row["full_date"], fiscal_year_start_month),
|
|
215
|
+
axis=1,
|
|
216
|
+
)
|
|
217
|
+
df["fiscal_quarter"] = df.apply(
|
|
218
|
+
lambda row: self._calc_fiscal_quarter(row["full_date"], fiscal_year_start_month),
|
|
219
|
+
axis=1,
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
df["is_month_start"] = df["full_date"].dt.is_month_start
|
|
223
|
+
df["is_month_end"] = df["full_date"].dt.is_month_end
|
|
224
|
+
df["is_year_start"] = (df["month"] == 1) & (df["day_of_month"] == 1)
|
|
225
|
+
df["is_year_end"] = (df["month"] == 12) & (df["day_of_month"] == 31)
|
|
226
|
+
|
|
227
|
+
df["full_date"] = df["full_date"].dt.date
|
|
228
|
+
|
|
229
|
+
column_order = [
|
|
230
|
+
"date_sk",
|
|
231
|
+
"full_date",
|
|
232
|
+
"day_of_week",
|
|
233
|
+
"day_of_week_num",
|
|
234
|
+
"day_of_month",
|
|
235
|
+
"day_of_year",
|
|
236
|
+
"is_weekend",
|
|
237
|
+
"week_of_year",
|
|
238
|
+
"month",
|
|
239
|
+
"month_name",
|
|
240
|
+
"quarter",
|
|
241
|
+
"quarter_name",
|
|
242
|
+
"year",
|
|
243
|
+
"fiscal_year",
|
|
244
|
+
"fiscal_quarter",
|
|
245
|
+
"is_month_start",
|
|
246
|
+
"is_month_end",
|
|
247
|
+
"is_year_start",
|
|
248
|
+
"is_year_end",
|
|
249
|
+
]
|
|
250
|
+
return df[column_order]
|
|
251
|
+
|
|
252
|
+
def _calc_fiscal_year(self, dt, fiscal_start_month: int) -> int:
|
|
253
|
+
"""Calculate fiscal year based on fiscal start month."""
|
|
254
|
+
if isinstance(dt, pd.Timestamp):
|
|
255
|
+
month = dt.month
|
|
256
|
+
year = dt.year
|
|
257
|
+
else:
|
|
258
|
+
month = dt.month
|
|
259
|
+
year = dt.year
|
|
260
|
+
|
|
261
|
+
if fiscal_start_month == 1:
|
|
262
|
+
return year
|
|
263
|
+
if month >= fiscal_start_month:
|
|
264
|
+
return year + 1
|
|
265
|
+
return year
|
|
266
|
+
|
|
267
|
+
def _calc_fiscal_quarter(self, dt, fiscal_start_month: int) -> int:
|
|
268
|
+
"""Calculate fiscal quarter based on fiscal start month."""
|
|
269
|
+
if isinstance(dt, pd.Timestamp):
|
|
270
|
+
month = dt.month
|
|
271
|
+
else:
|
|
272
|
+
month = dt.month
|
|
273
|
+
|
|
274
|
+
adjusted_month = (month - fiscal_start_month) % 12
|
|
275
|
+
return (adjusted_month // 3) + 1
|
|
276
|
+
|
|
277
|
+
def _generate_spark(
    self, context: EngineContext, start_date: date, end_date: date, fiscal_year_start_month: int
):
    """Generate the date dimension with Spark.

    One row per day is produced from ``spark.range`` by adding the row id
    to ``start_date``; every attribute column is then derived from
    ``full_date``.

    Args:
        context: Engine context whose ``spark`` attribute is the session.
        start_date: First date of the range (inclusive).
        end_date: Last date of the range (inclusive).
        fiscal_year_start_month: Month in which the fiscal year starts.

    Returns:
        A Spark DataFrame with the columns listed in ``ordered_columns``.
    """
    from pyspark.sql import functions as F
    from pyspark.sql.types import IntegerType

    session = context.spark

    day_count = (end_date - start_date).days + 1
    first_day = start_date.strftime("%Y-%m-%d")

    # Spark's dayofweek is 1=Sunday..7=Saturday; remap to 1=Monday..7=Sunday.
    spark_dow = F.dayofweek("full_date")
    iso_dow = F.when(spark_dow == 1, 7).otherwise(spark_dow - 1)

    dim = (
        session.range(day_count)
        .select(F.date_add(F.lit(first_day), F.col("id").cast(IntegerType())).alias("full_date"))
        .withColumn("date_sk", F.date_format("full_date", "yyyyMMdd").cast(IntegerType()))
        .withColumn("day_of_week", F.date_format("full_date", "EEEE"))
        .withColumn("day_of_week_num", iso_dow)
        .withColumn("day_of_month", F.dayofmonth("full_date"))
        .withColumn("day_of_year", F.dayofyear("full_date"))
        .withColumn("is_weekend", F.col("day_of_week_num").isin([6, 7]))
        .withColumn("week_of_year", F.weekofyear("full_date"))
        .withColumn("month", F.month("full_date"))
        .withColumn("month_name", F.date_format("full_date", "MMMM"))
        .withColumn("quarter", F.quarter("full_date"))
        .withColumn("quarter_name", F.concat(F.lit("Q"), F.col("quarter")))
        .withColumn("year", F.year("full_date"))
    )

    if fiscal_year_start_month == 1:
        # Fiscal calendar coincides with the calendar year.
        dim = dim.withColumn("fiscal_year", F.col("year")).withColumn(
            "fiscal_quarter", F.col("quarter")
        )
    else:
        next_year_when_started = F.when(
            F.col("month") >= fiscal_year_start_month, F.col("year") + 1
        ).otherwise(F.col("year"))
        # Adding 12 keeps the left operand non-negative before the modulo.
        months_into_fiscal_year = (F.col("month") - fiscal_year_start_month + 12) % 12
        dim = dim.withColumn("fiscal_year", next_year_when_started).withColumn(
            "fiscal_quarter", (months_into_fiscal_year / 3).cast(IntegerType()) + 1
        )

    dim = (
        dim.withColumn("is_month_start", F.col("day_of_month") == 1)
        .withColumn("is_month_end", F.col("full_date") == F.last_day("full_date"))
        .withColumn("is_year_start", (F.col("month") == 1) & (F.col("day_of_month") == 1))
        .withColumn("is_year_end", (F.col("month") == 12) & (F.col("day_of_month") == 31))
    )

    ordered_columns = [
        "date_sk",
        "full_date",
        "day_of_week",
        "day_of_week_num",
        "day_of_month",
        "day_of_year",
        "is_weekend",
        "week_of_year",
        "month",
        "month_name",
        "quarter",
        "quarter_name",
        "year",
        "fiscal_year",
        "fiscal_quarter",
        "is_month_start",
        "is_month_end",
        "is_year_start",
        "is_year_end",
    ]
    return dim.select(ordered_columns)
|
|
368
|
+
|
|
369
|
+
def _add_unknown_member(self, context: EngineContext, df):
    """Prepend the unknown-member row (date_sk=0) to the dimension.

    The placeholder row uses 1900-01-01 as ``full_date``, "Unknown" for the
    text columns, 0 for the numeric columns and False for the flags.

    Args:
        context: Engine context; its ``engine_type`` selects the Spark or
            pandas path, and ``spark`` is used on the Spark path.
        df: The generated dimension DataFrame.

    Returns:
        A DataFrame of the same engine with the placeholder row first.
    """
    placeholder = {
        "date_sk": 0,
        "full_date": date(1900, 1, 1),
        "day_of_week": "Unknown",
        "day_of_week_num": 0,
        "day_of_month": 0,
        "day_of_year": 0,
        "is_weekend": False,
        "week_of_year": 0,
        "month": 0,
        "month_name": "Unknown",
        "quarter": 0,
        "quarter_name": "Unknown",
        "year": 0,
        "fiscal_year": 0,
        "fiscal_quarter": 0,
        "is_month_start": False,
        "is_month_end": False,
        "is_year_start": False,
        "is_year_end": False,
    }

    if context.engine_type != EngineType.SPARK:
        placeholder_frame = pd.DataFrame([placeholder])
        return pd.concat([placeholder_frame, df], ignore_index=True)

    # Imported lazily so the pandas path never needs pyspark installed.
    from pyspark.sql import Row

    # NOTE(review): the schema here is inferred from Python values, so the
    # integer columns may come out wider than the generated frame's — confirm
    # the resulting column types after the union are what downstream expects.
    placeholder_frame = context.spark.createDataFrame([Row(**placeholder)])
    return placeholder_frame.unionByName(df)
|