odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/transformers/sql_core.py
@@ -0,0 +1,1356 @@
+ import time
+ from enum import Enum
+ from typing import Dict, List, Literal, Optional, Union
+
+ from pydantic import BaseModel, Field
+
+ from odibi.context import EngineContext
+ from odibi.utils.logging_context import get_logging_context
+
+ # -------------------------------------------------------------------------
+ # 1. Filter Rows
+ # -------------------------------------------------------------------------
+
+
+ class FilterRowsParams(BaseModel):
+     """
+     Configuration for filtering rows.
+
+     Example:
+         ```yaml
+         filter_rows:
+           condition: "age > 18 AND status = 'active'"
+         ```
+
+     Example (Null Check):
+         ```yaml
+         filter_rows:
+           condition: "email IS NOT NULL AND email != ''"
+         ```
+     """
+
+     condition: str = Field(
+         ..., description="SQL WHERE clause (e.g., age > 18 AND status = 'active')"
+     )
+
+
+ def filter_rows(context: EngineContext, params: FilterRowsParams) -> EngineContext:
+     """
+     Filters rows using a standard SQL WHERE clause.
+
+     Design:
+         - SQL-First: Pushes filtering to the engine's optimizer.
+         - Zero-Copy: No data movement to Python.
+     """
+     ctx = get_logging_context()
+     start_time = time.time()
+
+     ctx.debug(
+         "FilterRows starting",
+         condition=params.condition,
+     )
+
+     rows_before = None
+     try:
+         rows_before = context.df.shape[0] if hasattr(context.df, "shape") else None
+         if rows_before is None and hasattr(context.df, "count"):
+             rows_before = context.df.count()
+     except Exception as e:
+         ctx.debug(f"Could not get row count before transform: {type(e).__name__}")
+
+     sql_query = f"SELECT * FROM df WHERE {params.condition}"
+     result = context.sql(sql_query)
+
+     rows_after = None
+     try:
+         rows_after = result.df.shape[0] if hasattr(result.df, "shape") else None
+         if rows_after is None and hasattr(result.df, "count"):
+             rows_after = result.df.count()
+     except Exception as e:
+         ctx.debug(f"Could not get row count after transform: {type(e).__name__}")
+
+     elapsed_ms = (time.time() - start_time) * 1000
+     # Explicit None checks so a legitimate count of 0 is not treated as "unknown"
+     rows_filtered = (
+         rows_before - rows_after
+         if rows_before is not None and rows_after is not None
+         else None
+     )
+     ctx.debug(
+         "FilterRows completed",
+         rows_before=rows_before,
+         rows_after=rows_after,
+         rows_filtered=rows_filtered,
+         elapsed_ms=round(elapsed_ms, 2),
+     )
+
+     return result
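A usage sketch with hypothetical values; it only constructs the params object, since running the transform requires an `EngineContext` (called `context` below) whose DataFrame is registered as `df`:

```python
from odibi.transformers.sql_core import FilterRowsParams, filter_rows

# The condition is spliced verbatim into the WHERE clause:
params = FilterRowsParams(condition="age > 18 AND status = 'active'")
# Given an EngineContext `context`, filter_rows(context, params) runs:
#   SELECT * FROM df WHERE age > 18 AND status = 'active'
```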
+
+
+ # -------------------------------------------------------------------------
+ # 2. Derive Columns
+ # -------------------------------------------------------------------------
+
+
+ class DeriveColumnsParams(BaseModel):
+     """
+     Configuration for derived columns.
+
+     Example:
+         ```yaml
+         derive_columns:
+           derivations:
+             total_price: "quantity * unit_price"
+             full_name: "concat(first_name, ' ', last_name)"
+         ```
+
+     Note: Engine will fail if expressions reference non-existent columns.
+     """
+
+     # key: new_column_name, value: sql_expression
+     derivations: Dict[str, str] = Field(..., description="Map of column name to SQL expression")
+
+
+ def derive_columns(context: EngineContext, params: DeriveColumnsParams) -> EngineContext:
+     """
+     Appends new columns based on SQL expressions.
+
+     Design:
+         - Uses projection to add fields.
+         - Keeps all existing columns via `*`.
+     """
+     ctx = get_logging_context()
+     start_time = time.time()
+
+     ctx.debug(
+         "DeriveColumns starting",
+         derivations=list(params.derivations.keys()),
+     )
+
+     columns_before = len(context.columns) if context.columns else 0
+
+     expressions = [f"{expr} AS {col}" for col, expr in params.derivations.items()]
+     select_clause = ", ".join(expressions)
+
+     sql_query = f"SELECT *, {select_clause} FROM df"
+     result = context.sql(sql_query)
+
+     columns_after = len(result.columns) if result.columns else 0
+     elapsed_ms = (time.time() - start_time) * 1000
+     ctx.debug(
+         "DeriveColumns completed",
+         columns_added=list(params.derivations.keys()),
+         columns_before=columns_before,
+         columns_after=columns_after,
+         elapsed_ms=round(elapsed_ms, 2),
+     )
+
+     return result
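A sketch of the query this produces for the docstring's example (hypothetical column names):

```python
from odibi.transformers.sql_core import DeriveColumnsParams

params = DeriveColumnsParams(
    derivations={
        "total_price": "quantity * unit_price",
        "full_name": "concat(first_name, ' ', last_name)",
    }
)
# derive_columns(context, params) keeps existing columns and appends the new ones:
#   SELECT *, quantity * unit_price AS total_price,
#             concat(first_name, ' ', last_name) AS full_name FROM df
```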
+
+
+ # -------------------------------------------------------------------------
+ # 3. Cast Columns
+ # -------------------------------------------------------------------------
+
+
+ class SimpleType(str, Enum):
+     INT = "int"
+     INTEGER = "integer"
+     STR = "str"
+     STRING = "string"
+     FLOAT = "float"
+     DOUBLE = "double"
+     BOOL = "bool"
+     BOOLEAN = "boolean"
+     DATE = "date"
+     TIMESTAMP = "timestamp"
+
+
+ class CastColumnsParams(BaseModel):
+     """
+     Configuration for column type casting.
+
+     Example:
+         ```yaml
+         cast_columns:
+           casts:
+             age: "int"
+             salary: "DOUBLE"
+             created_at: "TIMESTAMP"
+             tags: "ARRAY<STRING>"  # Raw SQL types allowed
+         ```
+     """
+
+     # key: column_name, value: target_type
+     casts: Dict[str, Union[SimpleType, str]] = Field(
+         ..., description="Map of column to target SQL type"
+     )
+
+
+ def cast_columns(context: EngineContext, params: CastColumnsParams) -> EngineContext:
+     """
+     Casts specific columns to new types while keeping others intact.
+     """
+     current_cols = context.columns
+     projection = []
+
+     # Standardized type map for "Simple over Clever"
+     type_map = {
+         "int": "INTEGER",
+         "integer": "INTEGER",
+         "str": "STRING",
+         "string": "STRING",
+         "float": "DOUBLE",
+         "double": "DOUBLE",
+         "bool": "BOOLEAN",
+         "boolean": "BOOLEAN",
+         "date": "DATE",
+         "timestamp": "TIMESTAMP",
+     }
+
+     for col in current_cols:
+         if col in params.casts:
+             raw_type = params.casts[col]
+             # Handle Enum or str
+             if isinstance(raw_type, Enum):
+                 raw_type_str = raw_type.value
+             else:
+                 raw_type_str = str(raw_type)
+
+             target_type = type_map.get(raw_type_str.lower(), raw_type_str)
+             projection.append(f"CAST({col} AS {target_type}) AS {col}")
+         else:
+             projection.append(col)
+
+     sql_query = f"SELECT {', '.join(projection)} FROM df"
+     return context.sql(sql_query)
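A sketch of the projection for a hypothetical frame with columns `id`, `age`, `tags`; unmapped type strings fall through as raw SQL:

```python
from odibi.transformers.sql_core import CastColumnsParams

params = CastColumnsParams(casts={"age": "int", "tags": "ARRAY<STRING>"})
# "int" is normalized via type_map; "ARRAY<STRING>" is not in the map, so it
# passes through verbatim. cast_columns(context, params) then runs:
#   SELECT id, CAST(age AS INTEGER) AS age, CAST(tags AS ARRAY<STRING>) AS tags FROM df
```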
+
+
+ # -------------------------------------------------------------------------
+ # 4. Clean Text
+ # -------------------------------------------------------------------------
+
+
+ class CleanTextParams(BaseModel):
+     """
+     Configuration for text cleaning.
+
+     Example:
+         ```yaml
+         clean_text:
+           columns: ["email", "username"]
+           trim: true
+           case: "lower"
+         ```
+     """
+
+     columns: List[str] = Field(..., description="List of columns to clean")
+     trim: bool = Field(True, description="Apply TRIM()")
+     case: Literal["lower", "upper", "preserve"] = Field("preserve", description="Case conversion")
+
+
+ def clean_text(context: EngineContext, params: CleanTextParams) -> EngineContext:
+     """
+     Applies string cleaning operations (Trim/Case) via SQL.
+     """
+     current_cols = context.columns
+     projection = []
+
+     for col in current_cols:
+         if col in params.columns:
+             expr = col
+             if params.trim:
+                 expr = f"TRIM({expr})"
+             if params.case == "lower":
+                 expr = f"LOWER({expr})"
+             elif params.case == "upper":
+                 expr = f"UPPER({expr})"
+             projection.append(f"{expr} AS {col}")
+         else:
+             projection.append(col)
+
+     sql_query = f"SELECT {', '.join(projection)} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 5. Extract Date Parts
+ # -------------------------------------------------------------------------
+
+
+ class ExtractDateParams(BaseModel):
+     """
+     Configuration for extracting date parts.
+
+     Example:
+         ```yaml
+         extract_date_parts:
+           source_col: "created_at"
+           prefix: "created"
+           parts: ["year", "month"]
+         ```
+     """
+
+     source_col: str
+     prefix: Optional[str] = None
+     parts: List[Literal["year", "month", "day", "hour"]] = ["year", "month", "day"]
+
+
+ def extract_date_parts(context: EngineContext, params: ExtractDateParams) -> EngineContext:
+     """
+     Extracts date parts using ANSI SQL extract/functions.
+     """
+     prefix = params.prefix or params.source_col
+     expressions = []
+
+     for part in params.parts:
+         # Standard SQL compatible syntax
+         # Note: Using YEAR(col) syntax which is supported by Spark and DuckDB
+         if part == "year":
+             expressions.append(f"YEAR({params.source_col}) AS {prefix}_year")
+         elif part == "month":
+             expressions.append(f"MONTH({params.source_col}) AS {prefix}_month")
+         elif part == "day":
+             expressions.append(f"DAY({params.source_col}) AS {prefix}_day")
+         elif part == "hour":
+             expressions.append(f"HOUR({params.source_col}) AS {prefix}_hour")
+
+     select_clause = ", ".join(expressions)
+     sql_query = f"SELECT *, {select_clause} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 6. Normalize Schema
+ # -------------------------------------------------------------------------
+
+
+ class NormalizeSchemaParams(BaseModel):
+     """
+     Configuration for schema normalization.
+
+     Example:
+         ```yaml
+         normalize_schema:
+           rename:
+             old_col: "new_col"
+           drop: ["unused_col"]
+           select_order: ["id", "new_col", "created_at"]
+         ```
+     """
+
+     rename: Optional[Dict[str, str]] = Field(
+         default_factory=dict, description="old_name -> new_name"
+     )
+     drop: Optional[List[str]] = Field(
+         default_factory=list, description="Columns to remove; ignored if not present"
+     )
+     select_order: Optional[List[str]] = Field(
+         None, description="Final column order (post-rename names); only listed columns are kept"
+     )
+
+
+ def normalize_schema(context: EngineContext, params: NormalizeSchemaParams) -> EngineContext:
+     """
+     Structural transformation to rename, drop, and reorder columns.
+
+     Note: This is one of the few transformers that might read more naturally with a
+     native DataFrame API, but a SQL projection handles it and keeps engines consistent.
+     """
+     current_cols = context.columns
+
+     # 1. Determine columns to keep (exclude dropped)
+     cols_to_keep = [c for c in current_cols if c not in (params.drop or [])]
+
+     # 2. Prepare projection with renames
+     projection = []
+
+     # Helper to get the SQL expression for a column
+     def get_col_expr(col_name: str) -> str:
+         if params.rename and col_name in params.rename:
+             return f"{col_name} AS {params.rename[col_name]}"
+         return col_name
+
+     def get_final_name(col_name: str) -> str:
+         if params.rename and col_name in params.rename:
+             return params.rename[col_name]
+         return col_name
+
+     # 3. Reordering logic
+     if params.select_order:
+         # Strict user-specified order; select_order is assumed to use the FINAL
+         # (post-rename) names, so renamed columns need an inverse lookup.
+         for target_col in params.select_order:
+             found = False
+             # Check if it's a renamed column
+             if params.rename:
+                 for old, new in params.rename.items():
+                     if new == target_col:
+                         projection.append(f"{old} AS {new}")
+                         found = True
+                         break
+
+             if not found:
+                 # Must be an original column
+                 projection.append(target_col)
+     else:
+         # Use existing order of kept columns
+         for col in cols_to_keep:
+             projection.append(get_col_expr(col))
+
+     sql_query = f"SELECT {', '.join(projection)} FROM df"
+     return context.sql(sql_query)
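A sketch under the assumption that the frame has columns `id`, `old_col`, `unused_col`:

```python
from odibi.transformers.sql_core import NormalizeSchemaParams

params = NormalizeSchemaParams(rename={"old_col": "new_col"}, drop=["unused_col"])
# Without select_order, kept columns stay in their existing order:
#   SELECT id, old_col AS new_col FROM df
# With select_order=["new_col", "id"], the listed (post-rename) names drive the projection:
#   SELECT old_col AS new_col, id FROM df
```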
+
+
+ # -------------------------------------------------------------------------
+ # 7. Sort
+ # -------------------------------------------------------------------------
+
+
+ class SortParams(BaseModel):
+     """
+     Configuration for sorting.
+
+     Example:
+         ```yaml
+         sort:
+           by: ["created_at", "id"]
+           ascending: false
+         ```
+     """
+
+     by: Union[str, List[str]] = Field(..., description="Column(s) to sort by")
+     ascending: bool = Field(True, description="Sort order")
+
+
+ def sort(context: EngineContext, params: SortParams) -> EngineContext:
+     """
+     Sorts the dataset.
+     """
+     cols = [params.by] if isinstance(params.by, str) else params.by
+     direction = "ASC" if params.ascending else "DESC"
+     # Apply direction to all columns for simplicity
+     order_clause = ", ".join([f"{col} {direction}" for col in cols])
+
+     return context.sql(f"SELECT * FROM df ORDER BY {order_clause}")
+
+
+ # -------------------------------------------------------------------------
+ # 8. Limit / Sample
+ # -------------------------------------------------------------------------
+
+
+ class LimitParams(BaseModel):
+     """
+     Configuration for result limiting.
+
+     Example:
+         ```yaml
+         limit:
+           n: 100
+           offset: 0
+         ```
+     """
+
+     n: int = Field(..., description="Number of rows to return")
+     offset: int = Field(0, description="Number of rows to skip")
+
+
+ def limit(context: EngineContext, params: LimitParams) -> EngineContext:
+     """
+     Limits result size.
+     """
+     return context.sql(f"SELECT * FROM df LIMIT {params.n} OFFSET {params.offset}")
+
+
+ class SampleParams(BaseModel):
+     """
+     Configuration for random sampling.
+
+     Example:
+         ```yaml
+         sample:
+           fraction: 0.1
+           seed: 42
+         ```
+     """
+
+     fraction: float = Field(..., description="Fraction of rows to return (0.0 to 1.0)")
+     seed: Optional[int] = None
+
+
+ def sample(context: EngineContext, params: SampleParams) -> EngineContext:
+     """
+     Samples data using random filtering.
+     """
+     # Generic SQL sampling: WHERE rand() < fraction
+     # Spark uses rand(), DuckDB (Pandas) uses random()
+     from odibi.enums import EngineType
+
+     if context.engine_type == EngineType.PANDAS:
+         # DuckDB's random() takes no seed argument, so the seed is not applied here
+         func = "random()"
+     else:
+         # Spark's rand() accepts an optional seed for reproducible samples
+         func = f"rand({params.seed})" if params.seed is not None else "rand()"
+
+     sql_query = f"SELECT * FROM df WHERE {func} < {params.fraction}"
+     return context.sql(sql_query)
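The filter keeps each row independently with probability `fraction`, so the result is an approximately `fraction`-sized sample rather than an exact row count. A sketch:

```python
from odibi.transformers.sql_core import SampleParams

params = SampleParams(fraction=0.1, seed=42)
# Spark:           SELECT * FROM df WHERE rand(42) < 0.1
# Pandas (DuckDB): SELECT * FROM df WHERE random() < 0.1   (seed not applied)
```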
+
+
+ # -------------------------------------------------------------------------
+ # 9. Distinct
+ # -------------------------------------------------------------------------
+
+
+ class DistinctParams(BaseModel):
+     """
+     Configuration for distinct rows.
+
+     Example:
+         ```yaml
+         distinct:
+           columns: ["category", "status"]
+         ```
+     """
+
+     columns: Optional[List[str]] = Field(
+         None, description="Columns to project (if None, keeps all columns unique)"
+     )
+
+
+ def distinct(context: EngineContext, params: DistinctParams) -> EngineContext:
+     """
+     Returns unique rows (SELECT DISTINCT).
+     """
+     if params.columns:
+         cols = ", ".join(params.columns)
+         return context.sql(f"SELECT DISTINCT {cols} FROM df")
+     else:
+         return context.sql("SELECT DISTINCT * FROM df")
+
+
+ # -------------------------------------------------------------------------
+ # 10. Fill Nulls
+ # -------------------------------------------------------------------------
+
+
+ class FillNullsParams(BaseModel):
+     """
+     Configuration for filling null values.
+
+     Example:
+         ```yaml
+         fill_nulls:
+           values:
+             count: 0
+             description: "N/A"
+         ```
+     """
+
+     # key: column, value: fill value (str, int, float, bool)
+     values: Dict[str, Union[str, int, float, bool]] = Field(
+         ..., description="Map of column to fill value"
+     )
+
+
+ def fill_nulls(context: EngineContext, params: FillNullsParams) -> EngineContext:
+     """
+     Replaces null values with specified defaults using COALESCE.
+     """
+     current_cols = context.columns
+     projection = []
+
+     for col in current_cols:
+         if col in params.values:
+             fill_val = params.values[col]
+             # Boolean to SQL (checked before str/int; bool is an int subclass)
+             if isinstance(fill_val, bool):
+                 fill_val = "TRUE" if fill_val else "FALSE"
+             # Quote string values, doubling embedded single quotes to keep the literal valid
+             elif isinstance(fill_val, str):
+                 fill_val = "'" + fill_val.replace("'", "''") + "'"
+
+             projection.append(f"COALESCE({col}, {fill_val}) AS {col}")
+         else:
+             projection.append(col)
+
+     sql_query = f"SELECT {', '.join(projection)} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 11. Split Part
+ # -------------------------------------------------------------------------
+
+
+ class SplitPartParams(BaseModel):
+     """
+     Configuration for splitting strings.
+
+     Example:
+         ```yaml
+         split_part:
+           col: "email"
+           delimiter: "@"
+           index: 2  # Extracts domain
+         ```
+     """
+
+     col: str = Field(..., description="Column to split")
+     delimiter: str = Field(..., description="Delimiter to split by")
+     index: int = Field(..., description="1-based index of the token to extract")
+
+
+ def split_part(context: EngineContext, params: SplitPartParams) -> EngineContext:
+     """
+     Extracts the Nth part of a string after splitting by a delimiter.
+     """
+     import re
+
+     from odibi.enums import EngineType
+
+     if context.engine_type == EngineType.SPARK:
+         # Spark: element_at(split(col, delimiter), index)
+         # Note: Spark's split function uses Regex. We escape the delimiter to
+         # treat it as a literal.
+         safe_delimiter = re.escape(params.delimiter).replace("\\", "\\\\")
+         expr = f"element_at(split({params.col}, '{safe_delimiter}'), {params.index})"
+     else:
+         # DuckDB / Postgres / Standard: split_part(col, delimiter, index)
+         expr = f"split_part({params.col}, '{params.delimiter}', {params.index})"
+
+     sql_query = f"SELECT *, {expr} AS {params.col}_part_{params.index} FROM df"
+     return context.sql(sql_query)
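A sketch for a hypothetical `email` column; the extracted token lands in a new column and the source column is untouched:

```python
from odibi.transformers.sql_core import SplitPartParams

params = SplitPartParams(col="email", delimiter="@", index=2)
# Spark:  SELECT *, element_at(split(email, '@'), 2) AS email_part_2 FROM df
# Others: SELECT *, split_part(email, '@', 2) AS email_part_2 FROM df
```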
+
+
+ # -------------------------------------------------------------------------
+ # 12. Date Add
+ # -------------------------------------------------------------------------
+
+
+ class DateAddParams(BaseModel):
+     """
+     Configuration for date addition.
+
+     Example:
+         ```yaml
+         date_add:
+           col: "created_at"
+           value: 1
+           unit: "day"
+         ```
+     """
+
+     col: str
+     value: int
+     unit: Literal["day", "month", "year", "hour", "minute", "second"]
+
+
+ def date_add(context: EngineContext, params: DateAddParams) -> EngineContext:
+     """
+     Adds an interval to a date/timestamp column.
+     """
+     # Standard SQL: col + INTERVAL <value> <unit> (supported by both Spark and DuckDB)
+     expr = f"{params.col} + INTERVAL {params.value} {params.unit}"
+     target_col = f"{params.col}_future"
+
+     sql_query = f"SELECT *, {expr} AS {target_col} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 13. Date Trunc
+ # -------------------------------------------------------------------------
+
+
+ class DateTruncParams(BaseModel):
+     """
+     Configuration for date truncation.
+
+     Example:
+         ```yaml
+         date_trunc:
+           col: "created_at"
+           unit: "month"
+         ```
+     """
+
+     col: str
+     unit: Literal["year", "month", "day", "hour", "minute", "second"]
+
+
+ def date_trunc(context: EngineContext, params: DateTruncParams) -> EngineContext:
+     """
+     Truncates a date/timestamp to the specified precision.
+     """
+     # date_trunc('unit', col) is shared syntax across standard SQL, DuckDB, and Spark
+     expr = f"date_trunc('{params.unit}', {params.col})"
+     target_col = f"{params.col}_trunc"
+
+     sql_query = f"SELECT *, {expr} AS {target_col} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 14. Date Diff
+ # -------------------------------------------------------------------------
+
+
+ class DateDiffParams(BaseModel):
+     """
+     Configuration for date difference.
+
+     Example:
+         ```yaml
+         date_diff:
+           start_col: "created_at"
+           end_col: "updated_at"
+           unit: "day"
+         ```
+     """
+
+     start_col: str
+     end_col: str
+     unit: Literal["day", "hour", "minute", "second"] = "day"
+
+
+ def date_diff(context: EngineContext, params: DateDiffParams) -> EngineContext:
+     """
+     Calculates difference between two dates/timestamps.
+     Returns the elapsed time in the specified unit (as float for sub-day units).
+     """
+     from odibi.enums import EngineType
+
+     if context.engine_type == EngineType.SPARK:
+         if params.unit == "day":
+             # Spark datediff returns days (integer)
+             expr = f"datediff({params.end_col}, {params.start_col})"
+         else:
+             # For hours/minutes, convert difference in seconds
+             diff_sec = f"(unix_timestamp({params.end_col}) - unix_timestamp({params.start_col}))"
+             if params.unit == "hour":
+                 expr = f"({diff_sec} / 3600.0)"
+             elif params.unit == "minute":
+                 expr = f"({diff_sec} / 60.0)"
+             else:
+                 expr = diff_sec
+     else:
+         # DuckDB
+         if params.unit == "day":
+             expr = f"date_diff('day', {params.start_col}, {params.end_col})"
+         else:
+             # For elapsed time semantics (consistent with Spark math), use seconds diff / factor
+             diff_sec = f"date_diff('second', {params.start_col}, {params.end_col})"
+             if params.unit == "hour":
+                 expr = f"({diff_sec} / 3600.0)"
+             elif params.unit == "minute":
+                 expr = f"({diff_sec} / 60.0)"
+             else:
+                 expr = diff_sec
+
+     target_col = f"diff_{params.unit}"
+     sql_query = f"SELECT *, {expr} AS {target_col} FROM df"
+     return context.sql(sql_query)
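A sketch of the engine branching for `unit: "hour"` (hypothetical columns):

```python
from odibi.transformers.sql_core import DateDiffParams

params = DateDiffParams(start_col="created_at", end_col="updated_at", unit="hour")
# Spark:
#   SELECT *, ((unix_timestamp(updated_at) - unix_timestamp(created_at)) / 3600.0)
#          AS diff_hour FROM df
# DuckDB:
#   SELECT *, (date_diff('second', created_at, updated_at) / 3600.0) AS diff_hour FROM df
```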
+
+
+ # -------------------------------------------------------------------------
+ # 15. Case When
+ # -------------------------------------------------------------------------
+
+
+ class CaseWhenCase(BaseModel):
+     condition: str
+     value: str
+
+
+ class CaseWhenParams(BaseModel):
+     """
+     Configuration for conditional logic.
+
+     Example:
+         ```yaml
+         case_when:
+           output_col: "age_group"
+           default: "'Adult'"
+           cases:
+             - condition: "age < 18"
+               value: "'Minor'"
+             - condition: "age > 65"
+               value: "'Senior'"
+         ```
+     """
+
+     # List of (condition, value) branches
+     cases: List[CaseWhenCase] = Field(..., description="List of conditional branches")
+     default: str = Field("NULL", description="Default value if no condition met")
+     output_col: str = Field(..., description="Name of the resulting column")
+
+
+ def case_when(context: EngineContext, params: CaseWhenParams) -> EngineContext:
+     """
+     Implements structured CASE WHEN logic.
+     """
+     when_clauses = []
+     for case in params.cases:
+         condition = case.condition
+         value = case.value
+         if condition and value:
+             when_clauses.append(f"WHEN {condition} THEN {value}")
+
+     full_case = f"CASE {' '.join(when_clauses)} ELSE {params.default} END"
+
+     sql_query = f"SELECT *, {full_case} AS {params.output_col} FROM df"
+     return context.sql(sql_query)
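Mirroring the YAML example above in Python; values are raw SQL, hence the inner quotes:

```python
from odibi.transformers.sql_core import CaseWhenCase, CaseWhenParams

params = CaseWhenParams(
    output_col="age_group",
    default="'Adult'",
    cases=[
        CaseWhenCase(condition="age < 18", value="'Minor'"),
        CaseWhenCase(condition="age > 65", value="'Senior'"),
    ],
)
# case_when(context, params) appends one derived column:
#   SELECT *, CASE WHEN age < 18 THEN 'Minor' WHEN age > 65 THEN 'Senior'
#             ELSE 'Adult' END AS age_group FROM df
```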
+
+
+ # -------------------------------------------------------------------------
+ # 16. Convert Timezone
+ # -------------------------------------------------------------------------
+
+
+ class ConvertTimezoneParams(BaseModel):
+     """
+     Configuration for timezone conversion.
+
+     Example:
+         ```yaml
+         convert_timezone:
+           col: "utc_time"
+           source_tz: "UTC"
+           target_tz: "America/New_York"
+         ```
+     """
+
+     col: str = Field(..., description="Timestamp column to convert")
+     source_tz: str = Field("UTC", description="Source timezone (e.g., 'UTC', 'America/New_York')")
+     target_tz: str = Field(..., description="Target timezone (e.g., 'America/Los_Angeles')")
+     output_col: Optional[str] = Field(
+         None, description="Name of the result column (default: {col}_converted)"
+     )
+
+
+ def convert_timezone(context: EngineContext, params: ConvertTimezoneParams) -> EngineContext:
+     """
+     Converts a timestamp from one timezone to another.
+     Assumes the input column is a naive timestamp representing time in source_tz,
+     or a timestamp with timezone.
+     """
+     from odibi.enums import EngineType
+
+     target = params.output_col or f"{params.col}_converted"
+
+     if context.engine_type == EngineType.SPARK:
+         # Spark: from_utc_timestamp(to_utc_timestamp(col, source_tz), target_tz)
+         # Logic:
+         # 1. Interpret 'col' as being in 'source_tz'; convert to a UTC instant -> to_utc_timestamp(col, source)
+         # 2. Render that instant in 'target_tz' -> from_utc_timestamp(instant, target)
+         expr = f"from_utc_timestamp(to_utc_timestamp({params.col}, '{params.source_tz}'), '{params.target_tz}')"
+     else:
+         # DuckDB / Postgres
+         # Logic:
+         # 1. Interpret 'col' as a timestamp in source_tz -> col AT TIME ZONE source_tz (creates TIMESTAMPTZ)
+         # 2. Convert that TIMESTAMPTZ to local time in target_tz -> AT TIME ZONE target_tz (creates TIMESTAMP)
+         #
+         # Note: We assume the input is NOT already a TIMESTAMPTZ; if it is, the first
+         # cast may be redundant but is usually safe. We cast to TIMESTAMP first to
+         # ensure we start from the "naive" interpretation.
+         expr = f"({params.col}::TIMESTAMP AT TIME ZONE '{params.source_tz}') AT TIME ZONE '{params.target_tz}'"
+
+     sql_query = f"SELECT *, {expr} AS {target} FROM df"
+     return context.sql(sql_query)
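A sketch of the two dialect paths for the docstring's example; both interpret a naive timestamp as being in `source_tz` and render it in `target_tz`:

```python
from odibi.transformers.sql_core import ConvertTimezoneParams

params = ConvertTimezoneParams(col="utc_time", target_tz="America/New_York")
# source_tz defaults to 'UTC'; the result lands in utc_time_converted.
# Spark:
#   from_utc_timestamp(to_utc_timestamp(utc_time, 'UTC'), 'America/New_York')
# DuckDB / Postgres:
#   (utc_time::TIMESTAMP AT TIME ZONE 'UTC') AT TIME ZONE 'America/New_York'
```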
+
+
+ # -------------------------------------------------------------------------
+ # 17. Concat Columns
+ # -------------------------------------------------------------------------
+
+
+ class ConcatColumnsParams(BaseModel):
+     """
+     Configuration for string concatenation.
+
+     Example:
+         ```yaml
+         concat_columns:
+           columns: ["first_name", "last_name"]
+           separator: " "
+           output_col: "full_name"
+         ```
+     """
+
+     columns: List[str] = Field(..., description="Columns to concatenate")
+     separator: str = Field("", description="Separator string")
+     output_col: str = Field(..., description="Resulting column name")
+
+
+ def concat_columns(context: EngineContext, params: ConcatColumnsParams) -> EngineContext:
+     """
+     Concatenates multiple columns into one string.
+     NULLs are skipped (treated as empty string) using CONCAT_WS behavior.
+     """
+     # Logic: CONCAT_WS(separator, col1, col2, ...)
+     # Both Spark and DuckDB support CONCAT_WS with skip-null behavior, and both
+     # take the separator as the first argument.
+     cols_str = ", ".join(params.columns)
+     expr = f"concat_ws('{params.separator}', {cols_str})"
+
+     sql_query = f"SELECT *, {expr} AS {params.output_col} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 18. Select Columns
+ # -------------------------------------------------------------------------
+
+
+ class SelectColumnsParams(BaseModel):
+     """
+     Configuration for selecting specific columns (whitelist).
+
+     Example:
+         ```yaml
+         select_columns:
+           columns: ["id", "name", "created_at"]
+         ```
+     """
+
+     columns: List[str] = Field(..., description="List of column names to keep")
+
+
+ def select_columns(context: EngineContext, params: SelectColumnsParams) -> EngineContext:
+     """
+     Keeps only the specified columns, dropping all others.
+     """
+     cols_str = ", ".join(params.columns)
+     sql_query = f"SELECT {cols_str} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 19. Drop Columns
+ # -------------------------------------------------------------------------
+
+
+ class DropColumnsParams(BaseModel):
+     """
+     Configuration for dropping specific columns (blacklist).
+
+     Example:
+         ```yaml
+         drop_columns:
+           columns: ["_internal_id", "_temp_flag", "_processing_date"]
+         ```
+     """
+
+     columns: List[str] = Field(..., description="List of column names to drop")
+
+
+ def drop_columns(context: EngineContext, params: DropColumnsParams) -> EngineContext:
+     """
+     Removes the specified columns from the DataFrame.
+     """
+     # Use EXCEPT syntax (Spark) or EXCLUDE (DuckDB)
+     from odibi.enums import EngineType
+
+     drop_cols = ", ".join(params.columns)
+
+     if context.engine_type == EngineType.PANDAS:
+         # DuckDB uses EXCLUDE
+         sql_query = f"SELECT * EXCLUDE ({drop_cols}) FROM df"
+     else:
+         # Spark uses EXCEPT
+         sql_query = f"SELECT * EXCEPT ({drop_cols}) FROM df"
+
+     return context.sql(sql_query)
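A sketch of the dialect split (hypothetical columns):

```python
from odibi.transformers.sql_core import DropColumnsParams

params = DropColumnsParams(columns=["_internal_id", "_temp_flag"])
# Pandas (DuckDB SQL): SELECT * EXCLUDE (_internal_id, _temp_flag) FROM df
# Spark:               SELECT * EXCEPT (_internal_id, _temp_flag) FROM df
```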
+
+
+ # -------------------------------------------------------------------------
+ # 20. Rename Columns
+ # -------------------------------------------------------------------------
+
+
+ class RenameColumnsParams(BaseModel):
+     """
+     Configuration for bulk column renaming.
+
+     Example:
+         ```yaml
+         rename_columns:
+           mapping:
+             customer_id: cust_id
+             order_date: date
+             total_amount: amount
+         ```
+     """
+
+     mapping: Dict[str, str] = Field(..., description="Map of old column name to new column name")
+
+
+ def rename_columns(context: EngineContext, params: RenameColumnsParams) -> EngineContext:
+     """
+     Renames columns according to the provided mapping.
+     Columns not in the mapping are kept unchanged.
+     """
+     # Build SELECT with aliases for renamed columns
+     current_cols = context.columns
+     select_parts = []
+
+     for col in current_cols:
+         if col in params.mapping:
+             select_parts.append(f"{col} AS {params.mapping[col]}")
+         else:
+             select_parts.append(col)
+
+     cols_str = ", ".join(select_parts)
+     sql_query = f"SELECT {cols_str} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 21. Add Prefix
+ # -------------------------------------------------------------------------
+
+
+ class AddPrefixParams(BaseModel):
+     """
+     Configuration for adding a prefix to column names.
+
+     Example - All columns:
+         ```yaml
+         add_prefix:
+           prefix: "src_"
+         ```
+
+     Example - Specific columns:
+         ```yaml
+         add_prefix:
+           prefix: "raw_"
+           columns: ["id", "name", "value"]
+         ```
+     """
+
+     prefix: str = Field(..., description="Prefix to add to column names")
+     columns: Optional[List[str]] = Field(
+         None, description="Columns to prefix (default: all columns)"
+     )
+     exclude: Optional[List[str]] = Field(None, description="Columns to exclude from prefixing")
+
+
+ def add_prefix(context: EngineContext, params: AddPrefixParams) -> EngineContext:
+     """
+     Adds a prefix to column names.
+     """
+     current_cols = context.columns
+     target_cols = params.columns or current_cols
+     exclude_cols = set(params.exclude or [])
+
+     select_parts = []
+     for col in current_cols:
+         if col in target_cols and col not in exclude_cols:
+             select_parts.append(f"{col} AS {params.prefix}{col}")
+         else:
+             select_parts.append(col)
+
+     cols_str = ", ".join(select_parts)
+     sql_query = f"SELECT {cols_str} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 22. Add Suffix
+ # -------------------------------------------------------------------------
+
+
+ class AddSuffixParams(BaseModel):
+     """
+     Configuration for adding a suffix to column names.
+
+     Example - All columns:
+         ```yaml
+         add_suffix:
+           suffix: "_raw"
+         ```
+
+     Example - Specific columns:
+         ```yaml
+         add_suffix:
+           suffix: "_v2"
+           columns: ["id", "name", "value"]
+         ```
+     """
+
+     suffix: str = Field(..., description="Suffix to add to column names")
+     columns: Optional[List[str]] = Field(
+         None, description="Columns to suffix (default: all columns)"
+     )
+     exclude: Optional[List[str]] = Field(None, description="Columns to exclude from suffixing")
+
+
+ def add_suffix(context: EngineContext, params: AddSuffixParams) -> EngineContext:
+     """
+     Adds a suffix to column names.
+     """
+     current_cols = context.columns
+     target_cols = params.columns or current_cols
+     exclude_cols = set(params.exclude or [])
+
+     select_parts = []
+     for col in current_cols:
+         if col in target_cols and col not in exclude_cols:
+             select_parts.append(f"{col} AS {col}{params.suffix}")
+         else:
+             select_parts.append(col)
+
+     cols_str = ", ".join(select_parts)
+     sql_query = f"SELECT {cols_str} FROM df"
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 23. Normalize Column Names
+ # -------------------------------------------------------------------------
+
+
+ class NormalizeColumnNamesParams(BaseModel):
+     """
+     Configuration for normalizing column names.
+
+     Example:
+         ```yaml
+         normalize_column_names:
+           style: "snake_case"
+           lowercase: true
+         ```
+     """
+
+     style: Literal["snake_case", "none"] = Field(
+         "snake_case",
+         description="Naming style: 'snake_case' converts spaces/special chars to underscores",
+     )
+     lowercase: bool = Field(True, description="Convert names to lowercase")
+     remove_special: bool = Field(True, description="Remove special characters except underscores")
+
+
+ def normalize_column_names(
+     context: EngineContext, params: NormalizeColumnNamesParams
+ ) -> EngineContext:
+     """
+     Normalizes column names to a consistent style.
+     Useful for cleaning up messy source data with spaces, mixed case, or special characters.
+     """
+     import re
+
+     from odibi.enums import EngineType
+
+     current_cols = context.columns
+     select_parts = []
+
+     # Spark SQL treats double-quoted tokens as string literals, so identifiers
+     # are quoted with backticks there; DuckDB/ANSI use double quotes.
+     quote = "`" if context.engine_type == EngineType.SPARK else '"'
+
+     for col in current_cols:
+         new_name = col
+
+         # Apply snake_case before lowercasing so camelCase boundaries survive
+         if params.style == "snake_case":
+             # Replace spaces, dashes, dots with underscores
+             new_name = re.sub(r"[\s\-\.]+", "_", new_name)
+             # Insert underscore before uppercase letters (camelCase to snake_case)
+             new_name = re.sub(r"([a-z])([A-Z])", r"\1_\2", new_name).lower()
+
+         # Apply lowercase
+         if params.lowercase:
+             new_name = new_name.lower()
+
+         # Remove special characters
+         if params.remove_special:
+             new_name = re.sub(r"[^a-zA-Z0-9_]", "", new_name)
+             # Collapse consecutive underscores
+             new_name = re.sub(r"_+", "_", new_name)
+             # Remove leading/trailing underscores
+             new_name = new_name.strip("_")
+
+         if new_name != col:
+             select_parts.append(f"{quote}{col}{quote} AS {new_name}")
+         else:
+             select_parts.append(f"{quote}{col}{quote}")
+
+     cols_str = ", ".join(select_parts)
+     sql_query = f"SELECT {cols_str} FROM df"
+     return context.sql(sql_query)
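The renaming rules are plain Python, so they can be sketched standalone (same regexes and ordering as above, with the default flags):

```python
import re

def to_snake(name: str) -> str:
    # Spaces, dashes, and dots become underscores
    name = re.sub(r"[\s\-\.]+", "_", name)
    # camelCase boundaries get an underscore, then everything is lowercased
    name = re.sub(r"([a-z])([A-Z])", r"\1_\2", name).lower()
    # Drop remaining special characters, collapse and trim underscores
    name = re.sub(r"[^a-zA-Z0-9_]", "", name)
    return re.sub(r"_+", "_", name).strip("_")

print(to_snake("Order Date"))   # order_date
print(to_snake("customerID"))   # customer_id
print(to_snake("total ($)"))    # total
```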
+
+
+ # -------------------------------------------------------------------------
+ # 24. Coalesce Columns
+ # -------------------------------------------------------------------------
+
+
+ class CoalesceColumnsParams(BaseModel):
+     """
+     Configuration for coalescing columns (first non-null value).
+
+     Example - Phone number fallback:
+         ```yaml
+         coalesce_columns:
+           columns: ["mobile_phone", "work_phone", "home_phone"]
+           output_col: "primary_phone"
+         ```
+
+     Example - Timestamp fallback:
+         ```yaml
+         coalesce_columns:
+           columns: ["updated_at", "modified_at", "created_at"]
+           output_col: "last_change_at"
+         ```
+     """
+
+     columns: List[str] = Field(..., description="List of columns to coalesce (in priority order)")
+     output_col: str = Field(..., description="Name of the output column")
+     drop_source: bool = Field(False, description="Drop the source columns after coalescing")
+
+
+ def coalesce_columns(context: EngineContext, params: CoalesceColumnsParams) -> EngineContext:
+     """
+     Returns the first non-null value from a list of columns.
+     Useful for fallback/priority scenarios.
+     """
+     from odibi.enums import EngineType
+
+     cols_str = ", ".join(params.columns)
+     expr = f"COALESCE({cols_str}) AS {params.output_col}"
+
+     if params.drop_source:
+         drop_cols = ", ".join(params.columns)
+         if context.engine_type == EngineType.PANDAS:
+             sql_query = f"SELECT * EXCLUDE ({drop_cols}), {expr} FROM df"
+         else:
+             sql_query = f"SELECT * EXCEPT ({drop_cols}), {expr} FROM df"
+     else:
+         sql_query = f"SELECT *, {expr} FROM df"
+
+     return context.sql(sql_query)
+
+
+ # -------------------------------------------------------------------------
+ # 25. Replace Values
+ # -------------------------------------------------------------------------
+
+
+ class ReplaceValuesParams(BaseModel):
+     """
+     Configuration for bulk value replacement.
+
+     Example - Standardize nulls:
+         ```yaml
+         replace_values:
+           columns: ["status", "category"]
+           mapping:
+             "N/A": null
+             "": null
+             "Unknown": null
+         ```
+
+     Example - Code replacement:
+         ```yaml
+         replace_values:
+           columns: ["country_code"]
+           mapping:
+             "US": "USA"
+             "UK": "GBR"
+         ```
+     """
+
+     columns: List[str] = Field(..., description="Columns to apply replacements to")
+     mapping: Dict[str, Optional[str]] = Field(
+         ..., description="Map of old value to new value (use null for NULL)"
+     )
+
+
+ def replace_values(context: EngineContext, params: ReplaceValuesParams) -> EngineContext:
+     """
+     Replaces values in specified columns according to the mapping.
+     Supports replacing to NULL.
+     """
+     current_cols = context.columns
+     select_parts = []
+
+     for col in current_cols:
+         if col in params.columns:
+             # Build a CASE WHEN chain for the replacements; escaping the old
+             # value keeps the literal valid and also covers the empty string
+             case_parts = []
+             for old_val, new_val in params.mapping.items():
+                 safe_old = old_val.replace("'", "''")
+                 case_parts.append(f"WHEN {col} = '{safe_old}' THEN {_sql_value(new_val)}")
+
+             if case_parts:
+                 case_expr = f"CASE {' '.join(case_parts)} ELSE {col} END AS {col}"
+                 select_parts.append(case_expr)
+             else:
+                 select_parts.append(col)
+         else:
+             select_parts.append(col)
+
+     cols_str = ", ".join(select_parts)
+     sql_query = f"SELECT {cols_str} FROM df"
+     return context.sql(sql_query)
+
+
+ def _sql_value(val: Optional[str]) -> str:
+     """Convert a Python value to a SQL literal, escaping embedded single quotes."""
+     if val is None:
+         return "NULL"
+     return "'" + val.replace("'", "''") + "'"
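A sketch for the docstring's null-standardization example (hypothetical `status` column):

```python
from odibi.transformers.sql_core import ReplaceValuesParams

params = ReplaceValuesParams(
    columns=["status"],
    mapping={"N/A": None, "Unknown": None},
)
# replace_values(context, params) rewrites only the listed columns; all other
# columns appear unchanged in the same SELECT list:
#   SELECT CASE WHEN status = 'N/A' THEN NULL
#               WHEN status = 'Unknown' THEN NULL
#               ELSE status END AS status, ... FROM df
```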
+
+
+ # -------------------------------------------------------------------------
+ # 26. Trim Whitespace
+ # -------------------------------------------------------------------------
+
+
+ class TrimWhitespaceParams(BaseModel):
+     """
+     Configuration for trimming whitespace from string columns.
+
+     Example - All string columns:
+         ```yaml
+         trim_whitespace: {}
+         ```
+
+     Example - Specific columns:
+         ```yaml
+         trim_whitespace:
+           columns: ["name", "address", "city"]
+         ```
+     """
+
+     columns: Optional[List[str]] = Field(
+         None,
+         description="Columns to trim (default: all string columns detected at runtime)",
+     )
+
+
+ def trim_whitespace(context: EngineContext, params: TrimWhitespaceParams) -> EngineContext:
+     """
+     Trims leading and trailing whitespace from string columns.
+     """
+     current_cols = context.columns
+     target_cols = params.columns
+
+     # If no columns are specified, trim all columns (SQL TRIM handles
+     # non-strings gracefully in most cases)
+     if target_cols is None:
+         target_cols = current_cols
+
+     select_parts = []
+     for col in current_cols:
+         if col in target_cols:
+             select_parts.append(f"TRIM({col}) AS {col}")
+         else:
+             select_parts.append(col)
+
+     cols_str = ", ".join(select_parts)
+     sql_query = f"SELECT {cols_str} FROM df"
+     return context.sql(sql_query)