odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/engine/polars_engine.py
@@ -0,0 +1,1114 @@
"""Polars engine implementation."""

import hashlib
import os
from typing import Any, Dict, List, Optional

try:
    import polars as pl
except ImportError:
    pl = None

try:
    import pyarrow as pa
except ImportError:
    pa = None

from odibi.context import Context
from odibi.engine.base import Engine
from odibi.enums import EngineType


class PolarsEngine(Engine):
    """Polars-based execution engine (High Performance)."""

    name = "polars"
    engine_type = EngineType.POLARS

    def __init__(
        self,
        connections: Optional[Dict[str, Any]] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
        """Initialize Polars engine.

        Args:
            connections: Dictionary of connection objects
            config: Engine configuration (optional)
        """
        if pl is None:
            raise ImportError("Polars not installed. Run 'pip install polars'.")

        self.connections = connections or {}
        self.config = config or {}

    def materialize(self, df: Any) -> Any:
        """Materialize lazy dataset into memory (DataFrame).

        Args:
            df: LazyFrame or DataFrame

        Returns:
            Materialized DataFrame (pl.DataFrame)
        """
        if isinstance(df, pl.LazyFrame):
            return df.collect()
        return df

    def read(
        self,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        streaming: bool = False,
        schema: Optional[str] = None,
        options: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Any:
        """Read data using Polars (Lazy by default).

        Returns:
            pl.LazyFrame or pl.DataFrame
        """
        options = options or {}

        # Resolve the full path, using the connection when one is provided.
        if path:
            if connection:
                full_path = connection.get_path(path)
            else:
                full_path = path
        elif table:
            if connection:
                full_path = connection.get_path(table)
            else:
                raise ValueError(
                    f"Cannot read table '{table}': connection is required when using 'table' parameter. "
                    "Provide a valid connection object or use 'path' for file-based reads."
                )
        else:
            raise ValueError(
                "Read operation failed: neither 'path' nor 'table' was provided. "
                "Specify a file path or table name in your configuration."
            )

        # Polars scan_* readers accept glob patterns directly, so multi-file
        # reads need no extra handling here.
        try:
            if format == "csv":
                return pl.scan_csv(full_path, **options)

            elif format == "parquet":
                return pl.scan_parquet(full_path, **options)

            elif format == "json":
                # scan_ndjson handles newline-delimited JSON (the common case for
                # large data); standard JSON falls back to an eager read_json.
                # The odibi-specific 'json_lines' flag is stripped before the
                # remaining options are passed to Polars.
                json_options = {k: v for k, v in options.items() if k != "json_lines"}
                if options.get("json_lines", True):
                    return pl.scan_ndjson(full_path, **json_options)
                else:
                    # Standard JSON has no reliable lazy scan, so read eagerly
                    # and convert to a LazyFrame.
                    return pl.read_json(full_path, **json_options).lazy()

            elif format == "delta":
                # scan_delta requires the 'deltalake' extra.
                storage_options = options.get("storage_options", None)
                version = options.get("versionAsOf", None)

                delta_opts = {}
                if storage_options:
                    delta_opts["storage_options"] = storage_options
                if version is not None:
                    delta_opts["version"] = version

                return pl.scan_delta(full_path, **delta_opts)

            else:
                raise ValueError(
                    f"Unsupported format for Polars engine: '{format}'. "
                    "Supported formats: csv, parquet, json, delta."
                )

        except Exception as e:
            raise ValueError(
                f"Failed to read {format} from '{full_path}': {e}. "
                "Check that the file exists, the format is correct, and you have read permissions."
            ) from e
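
    @staticmethod
    def _sketch_lazy_read_usage() -> "pl.DataFrame":
        """Editor's illustrative sketch, not part of the released package.

        Shows the plain-Polars calls that read() builds on: the scan_* readers
        accept glob patterns and stay lazy until .collect(). The path and
        column names here are hypothetical.
        """
        lf = pl.scan_parquet("data/sales/*.parquet")  # lazy, glob-capable
        return lf.filter(pl.col("qty") > 0).select("sku", "qty").collect()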

    def write(
        self,
        df: Any,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        mode: str = "overwrite",
        options: Optional[Dict[str, Any]] = None,
        streaming_config: Optional[Any] = None,
    ) -> Optional[Dict[str, Any]]:
        """Write data using Polars."""
        options = options or {}

        if format in ["sql", "sql_server", "azure_sql"]:
            return self._write_sql(df, connection, table, mode, options)

        if path:
            if connection:
                full_path = connection.get_path(path)
            else:
                full_path = path
        elif table:
            if connection:
                full_path = connection.get_path(table)
            else:
                raise ValueError(
                    f"Cannot write to table '{table}': connection is required when using 'table' parameter. "
                    "Provide a valid connection object or use 'path' for file-based writes."
                )
        else:
            raise ValueError(
                "Write operation failed: neither 'path' nor 'table' was provided. "
                "Specify a file path or table name in your configuration."
            )

        is_lazy = isinstance(df, pl.LazyFrame)

        parent_dir = os.path.dirname(full_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if format == "parquet":
            if is_lazy:
                df.sink_parquet(full_path, **options)
            else:
                df.write_parquet(full_path, **options)

        elif format == "csv":
            if is_lazy:
                df.sink_csv(full_path, **options)
            else:
                df.write_csv(full_path, **options)

        elif format == "json":
            if is_lazy:
                df.sink_ndjson(full_path, **options)
            else:
                df.write_ndjson(full_path, **options)

        elif format == "delta":
            if is_lazy:
                df = df.collect()

            storage_options = options.get("storage_options", None)
            delta_write_options = options.copy()
            if "storage_options" in delta_write_options:
                del delta_write_options["storage_options"]

            df.write_delta(
                full_path, mode=mode, storage_options=storage_options, **delta_write_options
            )

        else:
            raise ValueError(
                f"Unsupported write format for Polars engine: '{format}'. "
                "Supported formats: csv, parquet, json, delta."
            )

        return None
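
    @staticmethod
    def _sketch_lazy_write_usage(lf: "pl.LazyFrame") -> None:
        """Editor's illustrative sketch, not part of the released package.

        Mirrors the branching in write() above: LazyFrames stream to disk via
        sink_*, while Delta output is collected first because write_delta()
        exists only on DataFrame. The paths are hypothetical and the Delta
        write assumes the 'deltalake' extra is installed.
        """
        lf.sink_parquet("events.parquet")  # streams without a full collect
        lf.collect().write_delta("events_delta", mode="overwrite")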

    def _write_sql(
        self,
        df: Any,
        connection: Any,
        table: Optional[str],
        mode: str,
        options: Dict[str, Any],
    ) -> Optional[Dict[str, Any]]:
        """Handle SQL writing including merge and enhanced overwrite for Polars (Phase 4)."""
        from odibi.utils.logging_context import get_logging_context

        ctx = get_logging_context().with_context(engine="polars")

        if not hasattr(connection, "write_table"):
            raise ValueError(
                f"Connection type '{type(connection).__name__}' does not support SQL operations"
            )

        if not table:
            raise ValueError(
                "SQL write operation failed: 'table' parameter is required but was not provided. "
                "Specify the target table name in your configuration."
            )

        if mode == "merge":
            merge_keys = options.get("merge_keys")
            merge_options = options.get("merge_options")

            if not merge_keys:
                raise ValueError(
                    "MERGE mode requires 'merge_keys' in options. "
                    "Specify the key columns for the MERGE ON clause."
                )

            from odibi.writers.sql_server_writer import SqlServerMergeWriter

            writer = SqlServerMergeWriter(connection)
            ctx.debug(
                "Executing SQL Server MERGE (Polars)",
                target=table,
                merge_keys=merge_keys,
            )

            result = writer.merge_polars(
                df=df,
                target_table=table,
                merge_keys=merge_keys,
                options=merge_options,
            )

            ctx.info(
                "SQL Server MERGE completed (Polars)",
                target=table,
                inserted=result.inserted,
                updated=result.updated,
                deleted=result.deleted,
            )

            return {
                "mode": "merge",
                "inserted": result.inserted,
                "updated": result.updated,
                "deleted": result.deleted,
                "total_affected": result.total_affected,
            }

        if mode == "overwrite" and options.get("overwrite_options"):
            from odibi.writers.sql_server_writer import SqlServerMergeWriter

            overwrite_options = options.get("overwrite_options")
            writer = SqlServerMergeWriter(connection)

            ctx.debug(
                "Executing SQL Server enhanced overwrite (Polars)",
                target=table,
                strategy=(
                    overwrite_options.strategy.value
                    if hasattr(overwrite_options, "strategy")
                    else "truncate_insert"
                ),
            )

            result = writer.overwrite_polars(
                df=df,
                target_table=table,
                options=overwrite_options,
            )

            ctx.info(
                "SQL Server enhanced overwrite completed (Polars)",
                target=table,
                strategy=result.strategy,
                rows_written=result.rows_written,
            )

            return {
                "mode": "overwrite",
                "strategy": result.strategy,
                "rows_written": result.rows_written,
            }

        if isinstance(df, pl.LazyFrame):
            df = df.collect()

        if "." in table:
            schema, table_name = table.split(".", 1)
        else:
            schema, table_name = "dbo", table

        if_exists = "replace"
        if mode == "append":
            if_exists = "append"
        elif mode == "fail":
            if_exists = "fail"

        df_pandas = df.to_pandas()
        chunksize = options.get("chunksize", 1000)

        connection.write_table(
            df=df_pandas,
            table_name=table_name,
            schema=schema,
            if_exists=if_exists,
            chunksize=chunksize,
        )
        return None

    def execute_sql(self, sql: str, context: Context) -> Any:
        """Execute SQL query using Polars SQLContext.

        Args:
            sql: SQL query string
            context: Execution context with registered DataFrames

        Returns:
            pl.LazyFrame
        """
        ctx = pl.SQLContext()

        # Register every dataset from the execution context. SQLContext accepts
        # both LazyFrames and DataFrames; this engine assumes the context holds
        # Polars objects.
        try:
            names = context.list_names()
            for name in names:
                df = context.get(name)
                ctx.register(name, df)
        except Exception:
            # If the context cannot list or fetch datasets, fall back to an
            # empty SQLContext rather than failing outright.
            pass

        return ctx.execute(sql, eager=False)
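
    @staticmethod
    def _sketch_sql_context_usage() -> "pl.LazyFrame":
        """Editor's illustrative sketch, not part of the released package.

        Shows the SQLContext pattern execute_sql() relies on: frames are
        registered under names and queried with eager=False so the result
        stays lazy. The table name and data are made up.
        """
        orders = pl.DataFrame({"id": [1, 2], "amount": [10.0, 20.0]}).lazy()
        ctx = pl.SQLContext()
        ctx.register("orders", orders)
        return ctx.execute("SELECT id, amount FROM orders WHERE amount > 15", eager=False)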

    def execute_operation(self, operation: str, params: Dict[str, Any], df: Any) -> Any:
        """Execute built-in operation."""
        # Most operations work on both LazyFrame and DataFrame; materialize
        # only where Polars requires it (pivot, sample).

        if operation == "pivot":
            # Lazy pivot support in Polars is limited, so materialize first.
            # params: pivot_column, value_column, group_by, agg_func
            if isinstance(df, pl.LazyFrame):
                df = df.collect()

            return df.pivot(
                index=params.get("group_by"),
                on=params["pivot_column"],
                values=params["value_column"],
                aggregate_function=params.get("agg_func", "first"),
            )  # Returns a DataFrame

        elif operation == "drop_duplicates":
            subset = params.get("subset")
            return df.unique(subset=subset)

        elif operation == "fillna":
            value = params.get("value")
            # Polars uses fill_null
            if isinstance(value, dict):
                # Per-column fill, e.g. {'col1': 0, 'col2': 'unknown'}
                exprs = []
                for col, val in value.items():
                    exprs.append(pl.col(col).fill_null(val))
                return df.with_columns(exprs)
            else:
                # Scalar value: fill across all columns
                return df.fill_null(value)

        elif operation == "drop":
            columns = params.get("columns") or params.get("labels")
            return df.drop(columns)

        elif operation == "rename":
            columns = params.get("columns") or params.get("mapper")
            return df.rename(columns)

        elif operation == "sort":
            by = params.get("by")
            descending = not params.get("ascending", True)
            return df.sort(by, descending=descending)

        elif operation == "sample":
            n = params.get("n")
            frac = params.get("frac")
            seed = params.get("random_state")

            # LazyFrame has no sample(); materialize, sample, then restore
            # laziness so lazy inputs stay lazy on the way out.
            was_lazy = isinstance(df, pl.LazyFrame)
            if was_lazy:
                df = df.collect()

            if n is not None:
                result = df.sample(n=n, seed=seed)
            elif frac is not None:
                result = df.sample(fraction=frac, seed=seed)
            else:
                result = df
            return result.lazy() if was_lazy else result

        elif operation == "filter":
            # Legacy/simple filter: passed through unchanged
            pass

        else:
            # Fallback: check if operation is a registered transformer
            from odibi.context import EngineContext, PandasContext
            from odibi.registry import FunctionRegistry

            if FunctionRegistry.has_function(operation):
                func = FunctionRegistry.get_function(operation)
                param_model = FunctionRegistry.get_param_model(operation)

                # Create EngineContext from current df (use PandasContext as placeholder)
                engine_ctx = EngineContext(
                    context=PandasContext(),
                    df=df,
                    engine=self,
                    engine_type=self.engine_type,
                )

                # Validate and instantiate params
                if param_model:
                    validated_params = param_model(**params)
                    result_ctx = func(engine_ctx, validated_params)
                else:
                    result_ctx = func(engine_ctx, **params)

                return result_ctx.df

        return df

    def get_schema(self, df: Any) -> Any:
        """Get DataFrame schema."""
        # Polars exposes the schema as a {name: DataType} mapping; return the
        # dtypes as strings for cross-engine compatibility.
        schema = df.collect_schema() if isinstance(df, pl.LazyFrame) else df.schema
        return {name: str(dtype) for name, dtype in schema.items()}

    def get_shape(self, df: Any) -> tuple:
        """Get DataFrame shape."""
        if isinstance(df, pl.LazyFrame):
            # Column count is cheap to resolve from the schema; an exact row
            # count requires executing a pl.len() aggregation.
            cols = len(df.collect_schema().names())
            rows = df.select(pl.len()).collect().item()
            return (rows, cols)
        return df.shape

    def count_rows(self, df: Any) -> int:
        """Count rows in DataFrame."""
        if isinstance(df, pl.LazyFrame):
            return df.select(pl.len()).collect().item()
        return len(df)

    def count_nulls(self, df: Any, columns: List[str]) -> Dict[str, int]:
        """Count nulls in specified columns."""
        if isinstance(df, pl.LazyFrame):
            # Efficient null count without materializing the full frame
            return df.select([pl.col(c).null_count() for c in columns]).collect().to_dicts()[0]

        return df.select([pl.col(c).null_count() for c in columns]).to_dicts()[0]
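
    @staticmethod
    def _sketch_lazy_counts() -> Dict[str, Any]:
        """Editor's illustrative sketch, not part of the released package.

        Demonstrates the pl.len() and null_count() expressions that get_shape,
        count_rows and count_nulls above rely on, using a tiny made-up frame.
        """
        lf = pl.DataFrame({"a": [1, None, 3], "b": ["x", "y", None]}).lazy()
        rows = lf.select(pl.len()).collect().item()
        nulls = lf.select([pl.col(c).null_count() for c in ["a", "b"]]).collect().to_dicts()[0]
        return {"rows": rows, **nulls}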

    def validate_schema(self, df: Any, schema_rules: Dict[str, Any]) -> List[str]:
        """Validate DataFrame schema."""
        failures = []

        # Schema is dict-like in Polars
        current_schema = df.collect_schema() if isinstance(df, pl.LazyFrame) else df.schema
        current_cols = current_schema.keys()

        if "required_columns" in schema_rules:
            required = schema_rules["required_columns"]
            missing = set(required) - set(current_cols)
            if missing:
                failures.append(f"Missing required columns: {', '.join(missing)}")

        if "types" in schema_rules:
            for col, expected_type in schema_rules["types"].items():
                if col not in current_cols:
                    failures.append(f"Column '{col}' not found for type validation")
                    continue

                actual_type = str(current_schema[col])
                # Basic type check via case-insensitive substring matching
                if expected_type.lower() not in actual_type.lower():
                    failures.append(
                        f"Column '{col}' has type '{actual_type}', expected '{expected_type}'"
                    )

        return failures

    def validate_data(self, df: Any, validation_config: Any) -> List[str]:
        """Validate data against rules.

        Args:
            df: DataFrame or LazyFrame
            validation_config: ValidationConfig object

        Returns:
            List of validation failure messages
        """
        failures = []

        if isinstance(df, pl.LazyFrame):
            schema = df.collect_schema()
            columns = schema.names()
        else:
            columns = df.columns

        if getattr(validation_config, "not_empty", False):
            count = self.count_rows(df)
            if count == 0:
                failures.append("DataFrame is empty")

        if getattr(validation_config, "no_nulls", None):
            cols = validation_config.no_nulls
            null_counts = self.count_nulls(df, cols)
            for col, count in null_counts.items():
                if count > 0:
                    failures.append(f"Column '{col}' has {count} null values")

        if getattr(validation_config, "schema_validation", None):
            schema_failures = self.validate_schema(df, validation_config.schema_validation)
            failures.extend(schema_failures)

        if getattr(validation_config, "ranges", None):
            for col, bounds in validation_config.ranges.items():
                if col in columns:
                    min_val = bounds.get("min")
                    max_val = bounds.get("max")

                    if min_val is not None:
                        if isinstance(df, pl.LazyFrame):
                            min_violations = (
                                df.filter(pl.col(col) < min_val).select(pl.len()).collect().item()
                            )
                        else:
                            min_violations = len(df.filter(pl.col(col) < min_val))
                        if min_violations > 0:
                            failures.append(f"Column '{col}' has values < {min_val}")

                    if max_val is not None:
                        if isinstance(df, pl.LazyFrame):
                            max_violations = (
                                df.filter(pl.col(col) > max_val).select(pl.len()).collect().item()
                            )
                        else:
                            max_violations = len(df.filter(pl.col(col) > max_val))
                        if max_violations > 0:
                            failures.append(f"Column '{col}' has values > {max_val}")
                else:
                    failures.append(f"Column '{col}' not found for range validation")

        if getattr(validation_config, "allowed_values", None):
            for col, allowed in validation_config.allowed_values.items():
                if col in columns:
                    if isinstance(df, pl.LazyFrame):
                        invalid_count = (
                            df.filter(~pl.col(col).is_in(allowed)).select(pl.len()).collect().item()
                        )
                    else:
                        invalid_count = len(df.filter(~pl.col(col).is_in(allowed)))
                    if invalid_count > 0:
                        failures.append(f"Column '{col}' has invalid values")
                else:
                    failures.append(f"Column '{col}' not found for allowed values validation")

        return failures

    def get_sample(self, df: Any, n: int = 10) -> List[Dict[str, Any]]:
        """Get sample rows as list of dictionaries."""
        if isinstance(df, pl.LazyFrame):
            return df.limit(n).collect().to_dicts()
        return df.head(n).to_dicts()

    def profile_nulls(self, df: Any) -> Dict[str, float]:
        """Calculate null percentage for each column."""
        if isinstance(df, pl.LazyFrame):
            # One null_count() expression per column, normalised by the total
            # row count.
            total_count = df.select(pl.len()).collect().item()
            if total_count == 0:
                return {col: 0.0 for col in df.collect_schema().names()}

            cols = df.collect_schema().names()
            null_counts = df.select([pl.col(c).null_count().alias(c) for c in cols]).collect()
            return {col: null_counts[col][0] / total_count for col in cols}

        total_count = len(df)
        if total_count == 0:
            return {col: 0.0 for col in df.columns}

        null_counts = df.null_count()
        return {col: null_counts[col][0] / total_count for col in df.columns}

    def table_exists(
        self, connection: Any, table: Optional[str] = None, path: Optional[str] = None
    ) -> bool:
        """Check if table or location exists."""
        if path:
            full_path = connection.get_path(path)
            return os.path.exists(full_path)
        return False

    def harmonize_schema(self, df: Any, target_schema: Dict[str, str], policy: Any) -> Any:
        """Harmonize DataFrame schema."""
        # policy: SchemaPolicyConfig
        from odibi.config import OnMissingColumns, OnNewColumns, SchemaMode

        if isinstance(df, pl.LazyFrame):
            current_schema = df.collect_schema()
        else:
            current_schema = df.schema

        current_cols = current_schema.names()
        target_cols = list(target_schema.keys())

        missing = set(target_cols) - set(current_cols)
        new_cols = set(current_cols) - set(target_cols)

        # 1. Validation
        if missing and getattr(policy, "on_missing_columns", None) == OnMissingColumns.FAIL:
            raise ValueError(
                f"Schema Policy Violation: DataFrame is missing required columns {missing}. "
                f"Available columns: {current_cols}. Add missing columns or set on_missing_columns policy."
            )

        if new_cols and getattr(policy, "on_new_columns", None) == OnNewColumns.FAIL:
            raise ValueError(
                f"Schema Policy Violation: DataFrame contains unexpected columns {new_cols}. "
                f"Expected columns: {target_cols}. Remove extra columns or set on_new_columns policy."
            )

        # 2. Transformations
        mode = getattr(policy, "mode", SchemaMode.ENFORCE)

        # Add target columns that are missing from the DataFrame as nulls when
        # the policy requests FILL_NULL; this applies regardless of mode.
        exprs = []
        if missing and getattr(policy, "on_missing_columns", None) == OnMissingColumns.FILL_NULL:
            for col in missing:
                exprs.append(pl.lit(None).alias(col))

        if exprs:
            df = df.with_columns(exprs)

        if mode == SchemaMode.ENFORCE:
            # Select only the target columns: extras are dropped and the target
            # column order is enforced.
            df = df.select([pl.col(col) for col in target_cols])
        elif mode == SchemaMode.EVOLVE:
            # Keep new columns; missing target columns were added above when
            # the policy requested FILL_NULL.
            pass

        return df
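
    @staticmethod
    def _sketch_schema_harmonization() -> "pl.DataFrame":
        """Editor's illustrative sketch, not part of the released package.

        Shows the two moves harmonize_schema() combines in ENFORCE mode: add
        missing target columns as nulls, then select the target columns in
        order, dropping extras. The target schema and data are made up.
        """
        target_cols = ["id", "name", "created_at"]
        df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"], "extra": [9, 9]})
        missing = [c for c in target_cols if c not in df.columns]
        df = df.with_columns([pl.lit(None).alias(c) for c in missing])
        return df.select(target_cols)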

    def anonymize(
        self, df: Any, columns: List[str], method: str, salt: Optional[str] = None
    ) -> Any:
        """Anonymize specified columns."""
        if method == "mask":
            # Mask all but the last four characters, matching the Pandas engine's
            # regex r".(?=.{4})": "12345" becomes "*2345", while values of four
            # characters or fewer are left unchanged.
            return df.with_columns(
                [
                    pl.when(pl.col(c).cast(pl.Utf8).str.len_chars() > 4)
                    .then(
                        pl.concat_str(
                            [
                                pl.lit("*")
                                .repeat_by(pl.col(c).cast(pl.Utf8).str.len_chars() - 4)
                                .list.join(""),
                                pl.col(c).cast(pl.Utf8).str.slice(-4),
                            ]
                        )
                    )
                    .otherwise(pl.col(c).cast(pl.Utf8))
                    .alias(c)
                    for c in columns
                ]
            )

        elif method == "hash":
            # Polars' built-in hash() is a fast non-cryptographic hash, so a
            # salted SHA-256 is applied through map_elements() for parity with
            # the Pandas engine. This is a Python UDF and therefore the slow path.
            def _hash_val(val):
                if val is None:
                    return None
                to_hash = str(val)
                if salt:
                    to_hash += salt
                return hashlib.sha256(to_hash.encode("utf-8")).hexdigest()

            return df.with_columns(
                [pl.col(c).map_elements(_hash_val, return_dtype=pl.Utf8).alias(c) for c in columns]
            )

        elif method == "redact":
            return df.with_columns([pl.lit("[REDACTED]").alias(c) for c in columns])

        return df
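
    @staticmethod
    def _sketch_salted_hash(values: List[str], salt: str = "pepper") -> "pl.DataFrame":
        """Editor's illustrative sketch, not part of the released package.

        Shows the salted SHA-256 fallback used by anonymize(method="hash"):
        Polars' native hash() is non-cryptographic, so a Python UDF via
        map_elements() provides a stable digest at the cost of speed. The
        column name and default salt are made up.
        """
        df = pl.DataFrame({"email": values})
        return df.with_columns(
            pl.col("email")
            .map_elements(
                lambda v: hashlib.sha256((v + salt).encode("utf-8")).hexdigest(),
                return_dtype=pl.Utf8,
            )
            .alias("email")
        )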

    def get_table_schema(
        self,
        connection: Any,
        table: Optional[str] = None,
        path: Optional[str] = None,
        format: Optional[str] = None,
    ) -> Optional[Dict[str, str]]:
        """Get schema of an existing table/file.

        Args:
            connection: Connection object
            table: Table name
            path: File path
            format: Data format (optional, helps with file-based sources)

        Returns:
            Schema dict or None if table doesn't exist or schema fetch fails.
        """
        from odibi.utils.logging_context import get_logging_context

        ctx = get_logging_context().with_context(engine="polars")

        try:
            if table and format in ["sql", "sql_server", "azure_sql"]:
                query = f"SELECT TOP 0 * FROM {table}"
                df = connection.read_sql(query)
                return {col: str(dtype) for col, dtype in zip(df.columns, df.dtypes)}

            if path:
                full_path = connection.get_path(path) if connection else path
                if not os.path.exists(full_path):
                    return None

                if format == "delta":
                    try:
                        from deltalake import DeltaTable

                        dt = DeltaTable(full_path)
                        arrow_schema = dt.schema().to_pyarrow()
                        return {field.name: str(field.type) for field in arrow_schema}
                    except ImportError:
                        ctx.warning(
                            "deltalake library not installed for schema introspection",
                            path=full_path,
                        )
                        return None

                elif format == "parquet":
                    try:
                        import pyarrow.parquet as pq
                        import glob as glob_mod

                        target_path = full_path
                        if os.path.isdir(full_path):
                            files = glob_mod.glob(os.path.join(full_path, "*.parquet"))
                            if not files:
                                return None
                            target_path = files[0]

                        schema = pq.read_schema(target_path)
                        return {field.name: str(field.type) for field in schema}
                    except ImportError:
                        lf = pl.scan_parquet(full_path)
                        schema = lf.collect_schema()
                        return {name: str(dtype) for name, dtype in schema.items()}

                elif format == "csv":
                    lf = pl.scan_csv(full_path)
                    schema = lf.collect_schema()
                    return {name: str(dtype) for name, dtype in schema.items()}

        except (FileNotFoundError, PermissionError):
            return None
        except Exception as e:
            ctx.warning(f"Failed to infer schema for {table or path}: {e}")
            return None

        return None

    def maintain_table(
        self,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        config: Optional[Any] = None,
    ) -> None:
        """Run table maintenance operations (optimize, vacuum) for Delta tables.

        Args:
            connection: Connection object
            format: Table format
            table: Table name
            path: Table path
            config: AutoOptimizeConfig object
        """
        from odibi.utils.logging_context import get_logging_context

        ctx = get_logging_context().with_context(engine="polars")

        if format != "delta" or not config or not getattr(config, "enabled", False):
            return

        if not path and not table:
            return

        full_path = connection.get_path(path if path else table) if connection else (path or table)

        ctx.info("Starting table maintenance", path=str(full_path))

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.warning(
                "Auto-optimize skipped: 'deltalake' library not installed",
                path=str(full_path),
            )
            return

        try:
            import time

            start = time.time()

            storage_opts = {}
            if hasattr(connection, "pandas_storage_options"):
                storage_opts = connection.pandas_storage_options()

            dt = DeltaTable(full_path, storage_options=storage_opts)

            ctx.info("Running Delta OPTIMIZE (compaction)", path=str(full_path))
            dt.optimize.compact()

            retention = getattr(config, "vacuum_retention_hours", None)
            if retention is not None and retention > 0:
                ctx.info(
                    "Running Delta VACUUM",
                    path=str(full_path),
                    retention_hours=retention,
                )
                dt.vacuum(
                    retention_hours=retention,
                    enforce_retention_duration=True,
                    dry_run=False,
                )

            elapsed = (time.time() - start) * 1000
            ctx.info(
                "Table maintenance completed",
                path=str(full_path),
                elapsed_ms=round(elapsed, 2),
            )

        except Exception as e:
            ctx.warning(
                "Auto-optimize failed",
                path=str(full_path),
                error=str(e),
            )

    def get_source_files(self, df: Any) -> List[str]:
        """Get list of source files that generated this DataFrame.

        For Polars, this checks if source file info was stored
        in the DataFrame's metadata during read.

        Args:
            df: DataFrame or LazyFrame

        Returns:
            List of file paths (or empty list if not applicable/supported)
        """
        if isinstance(df, pl.LazyFrame):
            return []

        if hasattr(df, "attrs"):
            return df.attrs.get("odibi_source_files", [])

        return []

    def vacuum_delta(
        self,
        connection: Any,
        path: str,
        retention_hours: int = 168,
        dry_run: bool = False,
        enforce_retention_duration: bool = True,
    ) -> Dict[str, Any]:
        """VACUUM a Delta table to remove old files.

        Args:
            connection: Connection object
            path: Delta table path
            retention_hours: Retention period (default 168 = 7 days)
            dry_run: If True, only show files to be deleted
            enforce_retention_duration: If False, allows retention < 168 hours (testing only)

        Returns:
            Dictionary with files_deleted count
        """
        from odibi.utils.logging_context import get_logging_context
        import time

        ctx = get_logging_context().with_context(engine="polars")
        start = time.time()

        ctx.debug(
            "Starting Delta VACUUM",
            path=path,
            retention_hours=retention_hours,
            dry_run=dry_run,
        )

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.error("Delta Lake library not installed", path=path)
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[polars]' "
                "or 'pip install deltalake'. See README.md for installation instructions."
            )

        full_path = connection.get_path(path) if connection else path

        storage_opts = {}
        if hasattr(connection, "pandas_storage_options"):
            storage_opts = connection.pandas_storage_options()

        dt = DeltaTable(full_path, storage_options=storage_opts)
        deleted_files = dt.vacuum(
            retention_hours=retention_hours,
            dry_run=dry_run,
            enforce_retention_duration=enforce_retention_duration,
        )

        elapsed = (time.time() - start) * 1000
        ctx.info(
            "Delta VACUUM completed",
            path=str(full_path),
            files_deleted=len(deleted_files),
            dry_run=dry_run,
            elapsed_ms=round(elapsed, 2),
        )

        return {"files_deleted": len(deleted_files)}

    def get_delta_history(
        self, connection: Any, path: str, limit: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """Get Delta table history.

        Args:
            connection: Connection object
            path: Delta table path
            limit: Maximum number of versions to return

        Returns:
            List of version metadata dictionaries
        """
        from odibi.utils.logging_context import get_logging_context
        import time

        ctx = get_logging_context().with_context(engine="polars")
        start = time.time()

        ctx.debug("Getting Delta table history", path=path, limit=limit)

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.error("Delta Lake library not installed", path=path)
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[polars]' "
                "or 'pip install deltalake'. See README.md for installation instructions."
            )

        full_path = connection.get_path(path) if connection else path

        storage_opts = {}
        if hasattr(connection, "pandas_storage_options"):
            storage_opts = connection.pandas_storage_options()

        dt = DeltaTable(full_path, storage_options=storage_opts)
        history = dt.history(limit=limit)

        elapsed = (time.time() - start) * 1000
        ctx.info(
            "Delta history retrieved",
            path=str(full_path),
            versions_returned=len(history) if history else 0,
            elapsed_ms=round(elapsed, 2),
        )

        return history
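
The snippet below is an editor's addition, not part of the published wheel: a minimal end-to-end sketch of the file-based path through PolarsEngine, assuming only that a connection object exposes get_path(). The stub connection, the file names, and the "event_id" column are hypothetical.

from odibi.engine.polars_engine import PolarsEngine


class LocalStubConnection:
    """Hypothetical stand-in for an odibi connection; returns paths unchanged."""

    def get_path(self, path: str) -> str:
        return path


conn = LocalStubConnection()
engine = PolarsEngine(connections={"local": conn})

lf = engine.read(connection=conn, format="csv", path="events.csv")              # lazy scan_csv
lf = engine.execute_operation("drop_duplicates", {"subset": ["event_id"]}, lf)  # LazyFrame.unique
engine.write(df=lf, connection=conn, format="parquet", path="events.parquet")   # lazy sink_parquet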