odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/transformers/sql_core.py
@@ -0,0 +1,1356 @@
import time
from enum import Enum
from typing import Dict, List, Literal, Optional, Union

from pydantic import BaseModel, Field

from odibi.context import EngineContext
from odibi.utils.logging_context import get_logging_context

# -------------------------------------------------------------------------
# 1. Filter Rows
# -------------------------------------------------------------------------


class FilterRowsParams(BaseModel):
    """
    Configuration for filtering rows.

    Example:
    ```yaml
    filter_rows:
      condition: "age > 18 AND status = 'active'"
    ```

    Example (Null Check):
    ```yaml
    filter_rows:
      condition: "email IS NOT NULL AND email != ''"
    ```
    """

    condition: str = Field(
        ..., description="SQL WHERE clause (e.g., 'age > 18 AND status = \"active\"')"
    )


def filter_rows(context: EngineContext, params: FilterRowsParams) -> EngineContext:
    """
    Filters rows using a standard SQL WHERE clause.

    Design:
    - SQL-First: Pushes filtering to the engine's optimizer.
    - Zero-Copy: No data movement to Python.
    """
    ctx = get_logging_context()
    start_time = time.time()

    ctx.debug(
        "FilterRows starting",
        condition=params.condition,
    )

    rows_before = None
    try:
        rows_before = context.df.shape[0] if hasattr(context.df, "shape") else None
        if rows_before is None and hasattr(context.df, "count"):
            rows_before = context.df.count()
    except Exception as e:
        ctx.debug(f"Could not get row count before transform: {type(e).__name__}")

    sql_query = f"SELECT * FROM df WHERE {params.condition}"
    result = context.sql(sql_query)

    rows_after = None
    try:
        rows_after = result.df.shape[0] if hasattr(result.df, "shape") else None
        if rows_after is None and hasattr(result.df, "count"):
            rows_after = result.df.count()
    except Exception as e:
        ctx.debug(f"Could not get row count after transform: {type(e).__name__}")

    elapsed_ms = (time.time() - start_time) * 1000
    rows_filtered = rows_before - rows_after if rows_before and rows_after else None
    ctx.debug(
        "FilterRows completed",
        rows_before=rows_before,
        rows_after=rows_after,
        rows_filtered=rows_filtered,
        elapsed_ms=round(elapsed_ms, 2),
    )

    return result


# -------------------------------------------------------------------------
# 2. Derive Columns
# -------------------------------------------------------------------------


class DeriveColumnsParams(BaseModel):
    """
    Configuration for derived columns.

    Example:
    ```yaml
    derive_columns:
      derivations:
        total_price: "quantity * unit_price"
        full_name: "concat(first_name, ' ', last_name)"
    ```

    Note: Engine will fail if expressions reference non-existent columns.
    """

    # key: new_column_name, value: sql_expression
    derivations: Dict[str, str] = Field(..., description="Map of column name to SQL expression")


def derive_columns(context: EngineContext, params: DeriveColumnsParams) -> EngineContext:
    """
    Appends new columns based on SQL expressions.

    Design:
    - Uses projection to add fields.
    - Keeps all existing columns via `*`.
    """
    ctx = get_logging_context()
    start_time = time.time()

    ctx.debug(
        "DeriveColumns starting",
        derivations=list(params.derivations.keys()),
    )

    columns_before = len(context.columns) if context.columns else 0

    expressions = [f"{expr} AS {col}" for col, expr in params.derivations.items()]
    select_clause = ", ".join(expressions)

    sql_query = f"SELECT *, {select_clause} FROM df"
    result = context.sql(sql_query)

    columns_after = len(result.columns) if result.columns else 0
    elapsed_ms = (time.time() - start_time) * 1000
    ctx.debug(
        "DeriveColumns completed",
        columns_added=list(params.derivations.keys()),
        columns_before=columns_before,
        columns_after=columns_after,
        elapsed_ms=round(elapsed_ms, 2),
    )

    return result


# -------------------------------------------------------------------------
# 3. Cast Columns
# -------------------------------------------------------------------------


class SimpleType(str, Enum):
    INT = "int"
    INTEGER = "integer"
    STR = "str"
    STRING = "string"
    FLOAT = "float"
    DOUBLE = "double"
    BOOL = "bool"
    BOOLEAN = "boolean"
    DATE = "date"
    TIMESTAMP = "timestamp"


class CastColumnsParams(BaseModel):
    """
    Configuration for column type casting.

    Example:
    ```yaml
    cast_columns:
      casts:
        age: "int"
        salary: "DOUBLE"
        created_at: "TIMESTAMP"
        tags: "ARRAY<STRING>"  # Raw SQL types allowed
    ```
    """

    # key: column_name, value: target_type
    casts: Dict[str, Union[SimpleType, str]] = Field(
        ..., description="Map of column to target SQL type"
    )


def cast_columns(context: EngineContext, params: CastColumnsParams) -> EngineContext:
    """
    Casts specific columns to new types while keeping others intact.
    """
    current_cols = context.columns
    projection = []

    # Standardized type map for "Simple over Clever"
    type_map = {
        "int": "INTEGER",
        "integer": "INTEGER",
        "str": "STRING",
        "string": "STRING",
        "float": "DOUBLE",
        "double": "DOUBLE",
        "bool": "BOOLEAN",
        "boolean": "BOOLEAN",
        "date": "DATE",
        "timestamp": "TIMESTAMP",
    }

    for col in current_cols:
        if col in params.casts:
            raw_type = params.casts[col]
            # Handle Enum or str
            if isinstance(raw_type, Enum):
                raw_type_str = raw_type.value
            else:
                raw_type_str = str(raw_type)

            target_type = type_map.get(raw_type_str.lower(), raw_type_str)
            projection.append(f"CAST({col} AS {target_type}) AS {col}")
        else:
            projection.append(col)

    sql_query = f"SELECT {', '.join(projection)} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 4. Clean Text
# -------------------------------------------------------------------------


class CleanTextParams(BaseModel):
    """
    Configuration for text cleaning.

    Example:
    ```yaml
    clean_text:
      columns: ["email", "username"]
      trim: true
      case: "lower"
    ```
    """

    columns: List[str] = Field(..., description="List of columns to clean")
    trim: bool = Field(True, description="Apply TRIM()")
    case: Literal["lower", "upper", "preserve"] = Field("preserve", description="Case conversion")


def clean_text(context: EngineContext, params: CleanTextParams) -> EngineContext:
    """
    Applies string cleaning operations (Trim/Case) via SQL.
    """
    current_cols = context.columns
    projection = []

    for col in current_cols:
        if col in params.columns:
            expr = col
            if params.trim:
                expr = f"TRIM({expr})"
            if params.case == "lower":
                expr = f"LOWER({expr})"
            elif params.case == "upper":
                expr = f"UPPER({expr})"
            projection.append(f"{expr} AS {col}")
        else:
            projection.append(col)

    sql_query = f"SELECT {', '.join(projection)} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 5. Extract Date Parts
# -------------------------------------------------------------------------


class ExtractDateParams(BaseModel):
    """
    Configuration for extracting date parts.

    Example:
    ```yaml
    extract_date_parts:
      source_col: "created_at"
      prefix: "created"
      parts: ["year", "month"]
    ```
    """

    source_col: str
    prefix: Optional[str] = None
    parts: List[Literal["year", "month", "day", "hour"]] = ["year", "month", "day"]


def extract_date_parts(context: EngineContext, params: ExtractDateParams) -> EngineContext:
    """
    Extracts date parts using ANSI SQL extract/functions.
    """
    prefix = params.prefix or params.source_col
    expressions = []

    for part in params.parts:
        # Standard SQL compatible syntax
        # Note: Using YEAR(col) syntax which is supported by Spark and DuckDB
        if part == "year":
            expressions.append(f"YEAR({params.source_col}) AS {prefix}_year")
        elif part == "month":
            expressions.append(f"MONTH({params.source_col}) AS {prefix}_month")
        elif part == "day":
            expressions.append(f"DAY({params.source_col}) AS {prefix}_day")
        elif part == "hour":
            expressions.append(f"HOUR({params.source_col}) AS {prefix}_hour")

    select_clause = ", ".join(expressions)
    sql_query = f"SELECT *, {select_clause} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 6. Normalize Schema
# -------------------------------------------------------------------------


class NormalizeSchemaParams(BaseModel):
    """
    Configuration for schema normalization.

    Example:
    ```yaml
    normalize_schema:
      rename:
        old_col: "new_col"
      drop: ["unused_col"]
      select_order: ["id", "new_col", "created_at"]
    ```
    """

    rename: Optional[Dict[str, str]] = Field(
        default_factory=dict, description="old_name -> new_name"
    )
    drop: Optional[List[str]] = Field(
        default_factory=list, description="Columns to remove; ignored if not present"
    )
    select_order: Optional[List[str]] = Field(
        None, description="Final column order; any missing columns appended after"
    )


def normalize_schema(context: EngineContext, params: NormalizeSchemaParams) -> EngineContext:
    """
    Structural transformation to rename, drop, and reorder columns.

    Note: This is one of the few that might behave better with native API in some cases,
    but SQL projection handles it perfectly and is consistent.
    """
    current_cols = context.columns

    # 1. Determine columns to keep (exclude dropped)
    cols_to_keep = [c for c in current_cols if c not in (params.drop or [])]

    # 2. Prepare projection with renames
    projection = []

    # Helper to get SQL expr for a column
    def get_col_expr(col_name: str) -> str:
        if params.rename and col_name in params.rename:
            return f"{col_name} AS {params.rename[col_name]}"
        return col_name

    def get_final_name(col_name: str) -> str:
        if params.rename and col_name in params.rename:
            return params.rename[col_name]
        return col_name

    # 3. Reordering logic
    if params.select_order:
        # Use the user's strict order
        for target_col in params.select_order:
            # Find which source column maps to this target
            # This inverse lookup is a bit complex if we renamed
            # Simplification: We assume select_order uses the FINAL names

            found = False
            # Check if it's a renamed column
            if params.rename:
                for old, new in params.rename.items():
                    if new == target_col:
                        projection.append(f"{old} AS {new}")
                        found = True
                        break

            if not found:
                # Must be an original column
                projection.append(target_col)
    else:
        # Use existing order of kept columns
        for col in cols_to_keep:
            projection.append(get_col_expr(col))

    sql_query = f"SELECT {', '.join(projection)} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 7. Sort
# -------------------------------------------------------------------------


class SortParams(BaseModel):
    """
    Configuration for sorting.

    Example:
    ```yaml
    sort:
      by: ["created_at", "id"]
      ascending: false
    ```
    """

    by: Union[str, List[str]] = Field(..., description="Column(s) to sort by")
    ascending: bool = Field(True, description="Sort order")


def sort(context: EngineContext, params: SortParams) -> EngineContext:
    """
    Sorts the dataset.
    """
    cols = [params.by] if isinstance(params.by, str) else params.by
    direction = "ASC" if params.ascending else "DESC"
    # Apply direction to all columns for simplicity
    order_clause = ", ".join([f"{col} {direction}" for col in cols])

    return context.sql(f"SELECT * FROM df ORDER BY {order_clause}")


# -------------------------------------------------------------------------
# 8. Limit / Sample
# -------------------------------------------------------------------------


class LimitParams(BaseModel):
    """
    Configuration for result limiting.

    Example:
    ```yaml
    limit:
      n: 100
      offset: 0
    ```
    """

    n: int = Field(..., description="Number of rows to return")
    offset: int = Field(0, description="Number of rows to skip")


def limit(context: EngineContext, params: LimitParams) -> EngineContext:
    """
    Limits result size.
    """
    return context.sql(f"SELECT * FROM df LIMIT {params.n} OFFSET {params.offset}")


class SampleParams(BaseModel):
    """
    Configuration for random sampling.

    Example:
    ```yaml
    sample:
      fraction: 0.1
      seed: 42
    ```
    """

    fraction: float = Field(..., description="Fraction of rows to return (0.0 to 1.0)")
    seed: Optional[int] = None


def sample(context: EngineContext, params: SampleParams) -> EngineContext:
    """
    Samples data using random filtering.
    """
    # Generic SQL sampling: WHERE rand() < fraction
    # Spark uses rand(), DuckDB (Pandas) uses random()

    func = "rand()"
    from odibi.enums import EngineType

    if context.engine_type == EngineType.PANDAS:
        func = "random()"

    sql_query = f"SELECT * FROM df WHERE {func} < {params.fraction}"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 9. Distinct
# -------------------------------------------------------------------------


class DistinctParams(BaseModel):
    """
    Configuration for distinct rows.

    Example:
    ```yaml
    distinct:
      columns: ["category", "status"]
    ```
    """

    columns: Optional[List[str]] = Field(
        None, description="Columns to project (if None, keeps all columns unique)"
    )


def distinct(context: EngineContext, params: DistinctParams) -> EngineContext:
    """
    Returns unique rows (SELECT DISTINCT).
    """
    if params.columns:
        cols = ", ".join(params.columns)
        return context.sql(f"SELECT DISTINCT {cols} FROM df")
    else:
        return context.sql("SELECT DISTINCT * FROM df")


# -------------------------------------------------------------------------
# 10. Fill Nulls
# -------------------------------------------------------------------------


class FillNullsParams(BaseModel):
    """
    Configuration for filling null values.

    Example:
    ```yaml
    fill_nulls:
      values:
        count: 0
        description: "N/A"
    ```
    """

    # key: column, value: fill value (str, int, float, bool)
    values: Dict[str, Union[str, int, float, bool]] = Field(
        ..., description="Map of column to fill value"
    )


def fill_nulls(context: EngineContext, params: FillNullsParams) -> EngineContext:
    """
    Replaces null values with specified defaults using COALESCE.
    """
    current_cols = context.columns
    projection = []

    for col in current_cols:
        if col in params.values:
            fill_val = params.values[col]
            # Quote string values
            if isinstance(fill_val, str):
                fill_val = f"'{fill_val}'"
            # Boolean to SQL
            elif isinstance(fill_val, bool):
                fill_val = "TRUE" if fill_val else "FALSE"

            projection.append(f"COALESCE({col}, {fill_val}) AS {col}")
        else:
            projection.append(col)

    sql_query = f"SELECT {', '.join(projection)} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 11. Split Part
# -------------------------------------------------------------------------


class SplitPartParams(BaseModel):
    """
    Configuration for splitting strings.

    Example:
    ```yaml
    split_part:
      col: "email"
      delimiter: "@"
      index: 2  # Extracts domain
    ```
    """

    col: str = Field(..., description="Column to split")
    delimiter: str = Field(..., description="Delimiter to split by")
    index: int = Field(..., description="1-based index of the token to extract")


def split_part(context: EngineContext, params: SplitPartParams) -> EngineContext:
    """
    Extracts the Nth part of a string after splitting by a delimiter.
    """
    import re

    from odibi.enums import EngineType

    if context.engine_type == EngineType.SPARK:
        # Spark: element_at(split(col, delimiter), index)
        # Note: Spark's split function uses Regex. We escape the delimiter to treat it as a literal.
        safe_delimiter = re.escape(params.delimiter).replace("\\", "\\\\")
        expr = f"element_at(split({params.col}, '{safe_delimiter}'), {params.index})"
    else:
        # DuckDB / Postgres / Standard: split_part(col, delimiter, index)
        expr = f"split_part({params.col}, '{params.delimiter}', {params.index})"

    sql_query = f"SELECT *, {expr} AS {params.col}_part_{params.index} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 12. Date Add
# -------------------------------------------------------------------------


class DateAddParams(BaseModel):
    """
    Configuration for date addition.

    Example:
    ```yaml
    date_add:
      col: "created_at"
      value: 1
      unit: "day"
    ```
    """

    col: str
    value: int
    unit: Literal["day", "month", "year", "hour", "minute", "second"]


def date_add(context: EngineContext, params: DateAddParams) -> EngineContext:
    """
    Adds an interval to a date/timestamp column.
    """
    # Standard SQL: col + INTERVAL 'value' unit
    # DuckDB supports this. Spark supports this.

    expr = f"{params.col} + INTERVAL {params.value} {params.unit}"
    target_col = f"{params.col}_future"

    sql_query = f"SELECT *, {expr} AS {target_col} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 13. Date Trunc
# -------------------------------------------------------------------------


class DateTruncParams(BaseModel):
    """
    Configuration for date truncation.

    Example:
    ```yaml
    date_trunc:
      col: "created_at"
      unit: "month"
    ```
    """

    col: str
    unit: Literal["year", "month", "day", "hour", "minute", "second"]


def date_trunc(context: EngineContext, params: DateTruncParams) -> EngineContext:
    """
    Truncates a date/timestamp to the specified precision.
    """
    # Standard SQL: date_trunc('unit', col)
    # DuckDB: date_trunc('unit', col)
    # Spark: date_trunc('unit', col)

    expr = f"date_trunc('{params.unit}', {params.col})"
    target_col = f"{params.col}_trunc"

    sql_query = f"SELECT *, {expr} AS {target_col} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 14. Date Diff
# -------------------------------------------------------------------------


class DateDiffParams(BaseModel):
    """
    Configuration for date difference.

    Example:
    ```yaml
    date_diff:
      start_col: "created_at"
      end_col: "updated_at"
      unit: "day"
    ```
    """

    start_col: str
    end_col: str
    unit: Literal["day", "hour", "minute", "second"] = "day"


def date_diff(context: EngineContext, params: DateDiffParams) -> EngineContext:
    """
    Calculates difference between two dates/timestamps.
    Returns the elapsed time in the specified unit (as float for sub-day units).
    """
    from odibi.enums import EngineType

    if context.engine_type == EngineType.SPARK:
        if params.unit == "day":
            # Spark datediff returns days (integer)
            expr = f"datediff({params.end_col}, {params.start_col})"
        else:
            # For hours/minutes, convert difference in seconds
            diff_sec = f"(unix_timestamp({params.end_col}) - unix_timestamp({params.start_col}))"
            if params.unit == "hour":
                expr = f"({diff_sec} / 3600.0)"
            elif params.unit == "minute":
                expr = f"({diff_sec} / 60.0)"
            else:
                expr = diff_sec
    else:
        # DuckDB
        if params.unit == "day":
            expr = f"date_diff('day', {params.start_col}, {params.end_col})"
        else:
            # For elapsed time semantics (consistent with Spark math), use seconds diff / factor
            diff_sec = f"date_diff('second', {params.start_col}, {params.end_col})"
            if params.unit == "hour":
                expr = f"({diff_sec} / 3600.0)"
            elif params.unit == "minute":
                expr = f"({diff_sec} / 60.0)"
            else:
                expr = diff_sec

    target_col = f"diff_{params.unit}"
    sql_query = f"SELECT *, {expr} AS {target_col} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 15. Case When
# -------------------------------------------------------------------------


class CaseWhenCase(BaseModel):
    condition: str
    value: str


class CaseWhenParams(BaseModel):
    """
    Configuration for conditional logic.

    Example:
    ```yaml
    case_when:
      output_col: "age_group"
      default: "'Adult'"
      cases:
        - condition: "age < 18"
          value: "'Minor'"
        - condition: "age > 65"
          value: "'Senior'"
    ```
    """

    # List of (condition, value) tuples
    cases: List[CaseWhenCase] = Field(..., description="List of conditional branches")
    default: str = Field("NULL", description="Default value if no condition met")
    output_col: str = Field(..., description="Name of the resulting column")


def case_when(context: EngineContext, params: CaseWhenParams) -> EngineContext:
    """
    Implements structured CASE WHEN logic.
    """
    when_clauses = []
    for case in params.cases:
        condition = case.condition
        value = case.value
        if condition and value:
            when_clauses.append(f"WHEN {condition} THEN {value}")

    full_case = f"CASE {' '.join(when_clauses)} ELSE {params.default} END"

    sql_query = f"SELECT *, {full_case} AS {params.output_col} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 16. Convert Timezone
# -------------------------------------------------------------------------


class ConvertTimezoneParams(BaseModel):
    """
    Configuration for timezone conversion.

    Example:
    ```yaml
    convert_timezone:
      col: "utc_time"
      source_tz: "UTC"
      target_tz: "America/New_York"
    ```
    """

    col: str = Field(..., description="Timestamp column to convert")
    source_tz: str = Field("UTC", description="Source timezone (e.g., 'UTC', 'America/New_York')")
    target_tz: str = Field(..., description="Target timezone (e.g., 'America/Los_Angeles')")
    output_col: Optional[str] = Field(
        None, description="Name of the result column (default: {col}_{target_tz})"
    )


def convert_timezone(context: EngineContext, params: ConvertTimezoneParams) -> EngineContext:
    """
    Converts a timestamp from one timezone to another.
    Assumes the input column is a naive timestamp representing time in source_tz,
    or a timestamp with timezone.
    """
    from odibi.enums import EngineType

    target = params.output_col or f"{params.col}_converted"

    if context.engine_type == EngineType.SPARK:
        # Spark: from_utc_timestamp(to_utc_timestamp(col, source_tz), target_tz)
        # Logic:
        # 1. Interpret 'col' as being in 'source_tz', convert to UTC instant -> to_utc_timestamp(col, source)
        # 2. Render that instant in 'target_tz' -> from_utc_timestamp(instant, target)

        expr = f"from_utc_timestamp(to_utc_timestamp({params.col}, '{params.source_tz}'), '{params.target_tz}')"

    else:
        # DuckDB / Postgres
        # Logic:
        # 1. Interpret 'col' as timestamp in source_tz -> col AT TIME ZONE source_tz (Creates TIMESTAMPTZ)
        # 2. Convert that TIMESTAMPTZ to local time in target_tz -> AT TIME ZONE target_tz (Creates TIMESTAMP)

        # Note: We assume the input is NOT already a TIMESTAMPTZ. If it is, the first cast might be redundant but usually safe.
        # We cast to TIMESTAMP first to ensure we start with "Naive" interpretation.

        expr = f"({params.col}::TIMESTAMP AT TIME ZONE '{params.source_tz}') AT TIME ZONE '{params.target_tz}'"

    sql_query = f"SELECT *, {expr} AS {target} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 17. Concat Columns
# -------------------------------------------------------------------------


class ConcatColumnsParams(BaseModel):
    """
    Configuration for string concatenation.

    Example:
    ```yaml
    concat_columns:
      columns: ["first_name", "last_name"]
      separator: " "
      output_col: "full_name"
    ```
    """

    columns: List[str] = Field(..., description="Columns to concatenate")
    separator: str = Field("", description="Separator string")
    output_col: str = Field(..., description="Resulting column name")


def concat_columns(context: EngineContext, params: ConcatColumnsParams) -> EngineContext:
    """
    Concatenates multiple columns into one string.
    NULLs are skipped (treated as empty string) using CONCAT_WS behavior.
    """
    # Logic: CONCAT_WS(separator, col1, col2...)
    # Both Spark and DuckDB support CONCAT_WS with skip-null behavior.

    cols_str = ", ".join(params.columns)

    # Note: Spark CONCAT_WS requires separator as first arg.
    # DuckDB CONCAT_WS requires separator as first arg.

    expr = f"concat_ws('{params.separator}', {cols_str})"

    sql_query = f"SELECT *, {expr} AS {params.output_col} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 18. Select Columns
# -------------------------------------------------------------------------


class SelectColumnsParams(BaseModel):
    """
    Configuration for selecting specific columns (whitelist).

    Example:
    ```yaml
    select_columns:
      columns: ["id", "name", "created_at"]
    ```
    """

    columns: List[str] = Field(..., description="List of column names to keep")


def select_columns(context: EngineContext, params: SelectColumnsParams) -> EngineContext:
    """
    Keeps only the specified columns, dropping all others.
    """
    cols_str = ", ".join(params.columns)
    sql_query = f"SELECT {cols_str} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 19. Drop Columns
# -------------------------------------------------------------------------


class DropColumnsParams(BaseModel):
    """
    Configuration for dropping specific columns (blacklist).

    Example:
    ```yaml
    drop_columns:
      columns: ["_internal_id", "_temp_flag", "_processing_date"]
    ```
    """

    columns: List[str] = Field(..., description="List of column names to drop")


def drop_columns(context: EngineContext, params: DropColumnsParams) -> EngineContext:
    """
    Removes the specified columns from the DataFrame.
    """
    # Use EXCEPT syntax (Spark) or EXCLUDE (DuckDB)
    from odibi.enums import EngineType

    drop_cols = ", ".join(params.columns)

    if context.engine_type == EngineType.PANDAS:
        # DuckDB uses EXCLUDE
        sql_query = f"SELECT * EXCLUDE ({drop_cols}) FROM df"
    else:
        # Spark uses EXCEPT
        sql_query = f"SELECT * EXCEPT ({drop_cols}) FROM df"

    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 20. Rename Columns
# -------------------------------------------------------------------------


class RenameColumnsParams(BaseModel):
    """
    Configuration for bulk column renaming.

    Example:
    ```yaml
    rename_columns:
      mapping:
        customer_id: cust_id
        order_date: date
        total_amount: amount
    ```
    """

    mapping: Dict[str, str] = Field(..., description="Map of old column name to new column name")


def rename_columns(context: EngineContext, params: RenameColumnsParams) -> EngineContext:
    """
    Renames columns according to the provided mapping.
    Columns not in the mapping are kept unchanged.
    """
    # Build SELECT with aliases for renamed columns
    current_cols = context.columns
    select_parts = []

    for col in current_cols:
        if col in params.mapping:
            select_parts.append(f"{col} AS {params.mapping[col]}")
        else:
            select_parts.append(col)

    cols_str = ", ".join(select_parts)
    sql_query = f"SELECT {cols_str} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 21. Add Prefix
# -------------------------------------------------------------------------


class AddPrefixParams(BaseModel):
    """
    Configuration for adding a prefix to column names.

    Example - All columns:
    ```yaml
    add_prefix:
      prefix: "src_"
    ```

    Example - Specific columns:
    ```yaml
    add_prefix:
      prefix: "raw_"
      columns: ["id", "name", "value"]
    ```
    """

    prefix: str = Field(..., description="Prefix to add to column names")
    columns: Optional[List[str]] = Field(
        None, description="Columns to prefix (default: all columns)"
    )
    exclude: Optional[List[str]] = Field(None, description="Columns to exclude from prefixing")


def add_prefix(context: EngineContext, params: AddPrefixParams) -> EngineContext:
    """
    Adds a prefix to column names.
    """
    current_cols = context.columns
    target_cols = params.columns or current_cols
    exclude_cols = set(params.exclude or [])

    select_parts = []
    for col in current_cols:
        if col in target_cols and col not in exclude_cols:
            select_parts.append(f"{col} AS {params.prefix}{col}")
        else:
            select_parts.append(col)

    cols_str = ", ".join(select_parts)
    sql_query = f"SELECT {cols_str} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 22. Add Suffix
# -------------------------------------------------------------------------


class AddSuffixParams(BaseModel):
    """
    Configuration for adding a suffix to column names.

    Example - All columns:
    ```yaml
    add_suffix:
      suffix: "_raw"
    ```

    Example - Specific columns:
    ```yaml
    add_suffix:
      suffix: "_v2"
      columns: ["id", "name", "value"]
    ```
    """

    suffix: str = Field(..., description="Suffix to add to column names")
    columns: Optional[List[str]] = Field(
        None, description="Columns to suffix (default: all columns)"
    )
    exclude: Optional[List[str]] = Field(None, description="Columns to exclude from suffixing")


def add_suffix(context: EngineContext, params: AddSuffixParams) -> EngineContext:
    """
    Adds a suffix to column names.
    """
    current_cols = context.columns
    target_cols = params.columns or current_cols
    exclude_cols = set(params.exclude or [])

    select_parts = []
    for col in current_cols:
        if col in target_cols and col not in exclude_cols:
            select_parts.append(f"{col} AS {col}{params.suffix}")
        else:
            select_parts.append(col)

    cols_str = ", ".join(select_parts)
    sql_query = f"SELECT {cols_str} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 23. Normalize Column Names
# -------------------------------------------------------------------------


class NormalizeColumnNamesParams(BaseModel):
    """
    Configuration for normalizing column names.

    Example:
    ```yaml
    normalize_column_names:
      style: "snake_case"
      lowercase: true
    ```
    """

    style: Literal["snake_case", "none"] = Field(
        "snake_case",
        description="Naming style: 'snake_case' converts spaces/special chars to underscores",
    )
    lowercase: bool = Field(True, description="Convert names to lowercase")
    remove_special: bool = Field(True, description="Remove special characters except underscores")


def normalize_column_names(
    context: EngineContext, params: NormalizeColumnNamesParams
) -> EngineContext:
    """
    Normalizes column names to a consistent style.
    Useful for cleaning up messy source data with spaces, mixed case, or special characters.
    """
    import re

    current_cols = context.columns
    select_parts = []

    for col in current_cols:
        new_name = col

        # Apply lowercase
        if params.lowercase:
            new_name = new_name.lower()

        # Apply snake_case (replace spaces and special chars with underscores)
        if params.style == "snake_case":
            # Replace spaces, dashes, dots with underscores
            new_name = re.sub(r"[\s\-\.]+", "_", new_name)
            # Insert underscore before uppercase letters (camelCase to snake_case)
            new_name = re.sub(r"([a-z])([A-Z])", r"\1_\2", new_name).lower()

        # Remove special characters
        if params.remove_special:
            new_name = re.sub(r"[^a-zA-Z0-9_]", "", new_name)
            # Remove consecutive underscores
            new_name = re.sub(r"_+", "_", new_name)
            # Remove leading/trailing underscores
            new_name = new_name.strip("_")

        if new_name != col:
            select_parts.append(f'"{col}" AS {new_name}')
        else:
            select_parts.append(f'"{col}"')

    cols_str = ", ".join(select_parts)
    sql_query = f"SELECT {cols_str} FROM df"
    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 24. Coalesce Columns
# -------------------------------------------------------------------------


class CoalesceColumnsParams(BaseModel):
    """
    Configuration for coalescing columns (first non-null value).

    Example - Phone number fallback:
    ```yaml
    coalesce_columns:
      columns: ["mobile_phone", "work_phone", "home_phone"]
      output_col: "primary_phone"
    ```

    Example - Timestamp fallback:
    ```yaml
    coalesce_columns:
      columns: ["updated_at", "modified_at", "created_at"]
      output_col: "last_change_at"
    ```
    """

    columns: List[str] = Field(..., description="List of columns to coalesce (in priority order)")
    output_col: str = Field(..., description="Name of the output column")
    drop_source: bool = Field(False, description="Drop the source columns after coalescing")


def coalesce_columns(context: EngineContext, params: CoalesceColumnsParams) -> EngineContext:
    """
    Returns the first non-null value from a list of columns.
    Useful for fallback/priority scenarios.
    """
    from odibi.enums import EngineType

    cols_str = ", ".join(params.columns)
    expr = f"COALESCE({cols_str}) AS {params.output_col}"

    if params.drop_source:
        drop_cols = ", ".join(params.columns)
        if context.engine_type == EngineType.PANDAS:
            sql_query = f"SELECT * EXCLUDE ({drop_cols}), {expr} FROM df"
        else:
            sql_query = f"SELECT * EXCEPT ({drop_cols}), {expr} FROM df"
    else:
        sql_query = f"SELECT *, {expr} FROM df"

    return context.sql(sql_query)


# -------------------------------------------------------------------------
# 25. Replace Values
# -------------------------------------------------------------------------


class ReplaceValuesParams(BaseModel):
    """
    Configuration for bulk value replacement.

    Example - Standardize nulls:
    ```yaml
    replace_values:
      columns: ["status", "category"]
      mapping:
        "N/A": null
        "": null
        "Unknown": null
    ```

    Example - Code replacement:
    ```yaml
    replace_values:
      columns: ["country_code"]
      mapping:
        "US": "USA"
        "UK": "GBR"
    ```
    """

    columns: List[str] = Field(..., description="Columns to apply replacements to")
    mapping: Dict[str, Optional[str]] = Field(
        ..., description="Map of old value to new value (use null for NULL)"
    )


def replace_values(context: EngineContext, params: ReplaceValuesParams) -> EngineContext:
    """
    Replaces values in specified columns according to the mapping.
    Supports replacing to NULL.
    """
    current_cols = context.columns
    select_parts = []

    for col in current_cols:
        if col in params.columns:
            # Build nested CASE WHEN for replacements
            case_parts = []
            for old_val, new_val in params.mapping.items():
                if old_val == "":
                    case_parts.append(f"WHEN {col} = '' THEN {_sql_value(new_val)}")
                else:
                    case_parts.append(f"WHEN {col} = '{old_val}' THEN {_sql_value(new_val)}")

            if case_parts:
                case_expr = f"CASE {' '.join(case_parts)} ELSE {col} END AS {col}"
                select_parts.append(case_expr)
            else:
                select_parts.append(col)
        else:
            select_parts.append(col)

    cols_str = ", ".join(select_parts)
    sql_query = f"SELECT {cols_str} FROM df"
    return context.sql(sql_query)


def _sql_value(val: Optional[str]) -> str:
    """Convert Python value to SQL literal."""
    if val is None:
        return "NULL"
    return f"'{val}'"


# -------------------------------------------------------------------------
# 26. Trim Whitespace
# -------------------------------------------------------------------------


class TrimWhitespaceParams(BaseModel):
    """
    Configuration for trimming whitespace from string columns.

    Example - All string columns:
    ```yaml
    trim_whitespace: {}
    ```

    Example - Specific columns:
    ```yaml
    trim_whitespace:
      columns: ["name", "address", "city"]
    ```
    """

    columns: Optional[List[str]] = Field(
        None,
        description="Columns to trim (default: all string columns detected at runtime)",
    )


def trim_whitespace(context: EngineContext, params: TrimWhitespaceParams) -> EngineContext:
    """
    Trims leading and trailing whitespace from string columns.
    """
    current_cols = context.columns
    target_cols = params.columns

    # If no columns specified, we trim all columns (SQL TRIM handles non-strings gracefully in most cases)
    if target_cols is None:
        target_cols = current_cols

    select_parts = []
    for col in current_cols:
        if col in target_cols:
            select_parts.append(f"TRIM({col}) AS {col}")
        else:
            select_parts.append(col)

    cols_str = ", ".join(select_parts)
    sql_query = f"SELECT {cols_str} FROM df"
    return context.sql(sql_query)
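Every transformer in this file follows the same shape: a pydantic params model validated from config, plus a function that takes an `EngineContext` and returns a new one produced by `context.sql("SELECT ... FROM df")`. The sketch below only exercises the params models with the same values used in the docstring examples; how an `EngineContext` is constructed and threaded through a pipeline is defined elsewhere in odibi and is assumed here, not shown.

```python
# Minimal sketch (not part of the diffed file): the params models are plain
# pydantic models, so they can be validated without any engine attached.
from odibi.transformers.sql_core import CastColumnsParams, FilterRowsParams

# Values mirror the YAML examples in the docstrings.
filter_params = FilterRowsParams(condition="age > 18 AND status = 'active'")
cast_params = CastColumnsParams(casts={"age": "int", "created_at": "TIMESTAMP"})

# With a real EngineContext `ctx` in hand (assumed, constructed by the pipeline),
# each transformer would be applied as:
#     ctx = filter_rows(ctx, filter_params)
#     ctx = cast_columns(ctx, cast_params)
# where each call issues a generated "SELECT ... FROM df" through ctx.sql().
```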