odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1114 @@
"""Polars engine implementation."""

import hashlib
import os
from typing import Any, Dict, List, Optional

try:
    import polars as pl
except ImportError:
    pl = None

try:
    import pyarrow as pa
except ImportError:
    pa = None

from odibi.context import Context
from odibi.engine.base import Engine
from odibi.enums import EngineType


class PolarsEngine(Engine):
    """Polars-based execution engine (High Performance)."""

    name = "polars"
    engine_type = EngineType.POLARS

    def __init__(
        self,
        connections: Optional[Dict[str, Any]] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
        """Initialize Polars engine.

        Args:
            connections: Dictionary of connection objects
            config: Engine configuration (optional)
        """
        if pl is None:
            raise ImportError("Polars not installed. Run 'pip install polars'.")

        self.connections = connections or {}
        self.config = config or {}

    def materialize(self, df: Any) -> Any:
        """Materialize lazy dataset into memory (DataFrame).

        Args:
            df: LazyFrame or DataFrame

        Returns:
            Materialized DataFrame (pl.DataFrame)
        """
        if isinstance(df, pl.LazyFrame):
            return df.collect()
        return df

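    # --- Illustrative usage (editor's sketch, not part of the package) ---
    # A minimal sketch of constructing the engine directly and materializing a
    # lazy result. Engine instances are normally created by odibi itself, so
    # standalone construction like this is an assumption for illustration only:
    #
    #   import polars as pl
    #   from odibi.engine.polars_engine import PolarsEngine
    #
    #   engine = PolarsEngine(connections={}, config={})
    #   lazy = pl.DataFrame({"x": [1, 2, 3]}).lazy()
    #   df = engine.materialize(lazy)  # -> pl.DataFrame with 3 rows
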
    def read(
        self,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        streaming: bool = False,
        schema: Optional[str] = None,
        options: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Any:
        """Read data using Polars (Lazy by default).

        Returns:
            pl.LazyFrame or pl.DataFrame
        """
        options = options or {}

        # Get full path
        if path:
            if connection:
                full_path = connection.get_path(path)
            else:
                full_path = path
        elif table:
            if connection:
                full_path = connection.get_path(table)
            else:
                raise ValueError(
                    f"Cannot read table '{table}': connection is required when using 'table' parameter. "
                    "Provide a valid connection object or use 'path' for file-based reads."
                )
        else:
            raise ValueError(
                "Read operation failed: neither 'path' nor 'table' was provided. "
                "Specify a file path or table name in your configuration."
            )

        # Handle glob patterns/lists
        # Polars scan methods often support glob strings directly.

        try:
            if format == "csv":
                # scan_csv supports glob patterns
                return pl.scan_csv(full_path, **options)

            elif format == "parquet":
                return pl.scan_parquet(full_path, **options)

            elif format == "json":
                # scan_ndjson for newline delimited json, read_json for standard
                # Assuming ndjson/jsonl for big data usually
                if options.get("json_lines", True):  # Default to ndjson scan
                    return pl.scan_ndjson(full_path, **options)
                else:
                    # Standard JSON doesn't support lazy scan well in all versions, fallback to read
                    return pl.read_json(full_path, **options).lazy()

            elif format == "delta":
                # scan_delta requires the 'deltalake' extra
                storage_options = options.get("storage_options", None)
                version = options.get("versionAsOf", None)

                # scan_delta is available in recent polars and accepts
                # storage_options in recent versions
                delta_opts = {}
                if storage_options:
                    delta_opts["storage_options"] = storage_options
                if version is not None:
                    delta_opts["version"] = version

                return pl.scan_delta(full_path, **delta_opts)

            else:
                raise ValueError(
                    f"Unsupported format for Polars engine: '{format}'. "
                    "Supported formats: csv, parquet, json, delta."
                )

        except Exception as e:
            raise ValueError(
                f"Failed to read {format} from '{full_path}': {e}. "
                "Check that the file exists, the format is correct, and you have read permissions."
            )

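    # --- Illustrative usage (editor's sketch) ---
    # Reading a local CSV without a connection object: the 'path' branch above
    # falls back to the raw path when connection is None, and the result stays a
    # pl.LazyFrame until materialize() is called. The file name is hypothetical.
    #
    #   lazy = engine.read(connection=None, format="csv", path="data/events.csv")
    #   df = engine.materialize(lazy)
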
    def write(
        self,
        df: Any,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        mode: str = "overwrite",
        options: Optional[Dict[str, Any]] = None,
        streaming_config: Optional[Any] = None,
    ) -> Optional[Dict[str, Any]]:
        """Write data using Polars."""
        options = options or {}

        if format in ["sql", "sql_server", "azure_sql"]:
            return self._write_sql(df, connection, table, mode, options)

        if path:
            if connection:
                full_path = connection.get_path(path)
            else:
                full_path = path
        elif table:
            if connection:
                full_path = connection.get_path(table)
            else:
                raise ValueError(
                    f"Cannot write to table '{table}': connection is required when using 'table' parameter. "
                    "Provide a valid connection object or use 'path' for file-based writes."
                )
        else:
            raise ValueError(
                "Write operation failed: neither 'path' nor 'table' was provided. "
                "Specify a file path or table name in your configuration."
            )

        is_lazy = isinstance(df, pl.LazyFrame)

        parent_dir = os.path.dirname(full_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if format == "parquet":
            if is_lazy:
                df.sink_parquet(full_path, **options)
            else:
                df.write_parquet(full_path, **options)

        elif format == "csv":
            if is_lazy:
                df.sink_csv(full_path, **options)
            else:
                df.write_csv(full_path, **options)

        elif format == "json":
            if is_lazy:
                df.sink_ndjson(full_path, **options)
            else:
                df.write_ndjson(full_path, **options)

        elif format == "delta":
            if is_lazy:
                df = df.collect()

            storage_options = options.get("storage_options", None)
            delta_write_options = options.copy()
            if "storage_options" in delta_write_options:
                del delta_write_options["storage_options"]

            df.write_delta(
                full_path, mode=mode, storage_options=storage_options, **delta_write_options
            )

        else:
            raise ValueError(
                f"Unsupported write format for Polars engine: '{format}'. "
                "Supported formats: csv, parquet, json, delta."
            )

        return None

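    # --- Illustrative usage (editor's sketch) ---
    # Writing a frame back out as Parquet; a LazyFrame takes the sink_parquet()
    # streaming path, a DataFrame uses write_parquet(). Note that 'mode' is only
    # consulted for delta and SQL targets above. The output path is hypothetical.
    #
    #   engine.write(df, connection=None, format="parquet", path="out/events.parquet")
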
    def _write_sql(
        self,
        df: Any,
        connection: Any,
        table: Optional[str],
        mode: str,
        options: Dict[str, Any],
    ) -> Optional[Dict[str, Any]]:
        """Handle SQL writing including merge and enhanced overwrite for Polars (Phase 4)."""
        from odibi.utils.logging_context import get_logging_context

        ctx = get_logging_context().with_context(engine="polars")

        if not hasattr(connection, "write_table"):
            raise ValueError(
                f"Connection type '{type(connection).__name__}' does not support SQL operations"
            )

        if not table:
            raise ValueError(
                "SQL write operation failed: 'table' parameter is required but was not provided. "
                "Specify the target table name in your configuration."
            )

        if mode == "merge":
            merge_keys = options.get("merge_keys")
            merge_options = options.get("merge_options")

            if not merge_keys:
                raise ValueError(
                    "MERGE mode requires 'merge_keys' in options. "
                    "Specify the key columns for the MERGE ON clause."
                )

            from odibi.writers.sql_server_writer import SqlServerMergeWriter

            writer = SqlServerMergeWriter(connection)
            ctx.debug(
                "Executing SQL Server MERGE (Polars)",
                target=table,
                merge_keys=merge_keys,
            )

            result = writer.merge_polars(
                df=df,
                target_table=table,
                merge_keys=merge_keys,
                options=merge_options,
            )

            ctx.info(
                "SQL Server MERGE completed (Polars)",
                target=table,
                inserted=result.inserted,
                updated=result.updated,
                deleted=result.deleted,
            )

            return {
                "mode": "merge",
                "inserted": result.inserted,
                "updated": result.updated,
                "deleted": result.deleted,
                "total_affected": result.total_affected,
            }

        if mode == "overwrite" and options.get("overwrite_options"):
            from odibi.writers.sql_server_writer import SqlServerMergeWriter

            overwrite_options = options.get("overwrite_options")
            writer = SqlServerMergeWriter(connection)

            ctx.debug(
                "Executing SQL Server enhanced overwrite (Polars)",
                target=table,
                strategy=(
                    overwrite_options.strategy.value
                    if hasattr(overwrite_options, "strategy")
                    else "truncate_insert"
                ),
            )

            result = writer.overwrite_polars(
                df=df,
                target_table=table,
                options=overwrite_options,
            )

            ctx.info(
                "SQL Server enhanced overwrite completed (Polars)",
                target=table,
                strategy=result.strategy,
                rows_written=result.rows_written,
            )

            return {
                "mode": "overwrite",
                "strategy": result.strategy,
                "rows_written": result.rows_written,
            }

        if isinstance(df, pl.LazyFrame):
            df = df.collect()

        if "." in table:
            schema, table_name = table.split(".", 1)
        else:
            schema, table_name = "dbo", table

        if_exists = "replace"
        if mode == "append":
            if_exists = "append"
        elif mode == "fail":
            if_exists = "fail"

        df_pandas = df.to_pandas()
        chunksize = options.get("chunksize", 1000)

        connection.write_table(
            df=df_pandas,
            table_name=table_name,
            schema=schema,
            if_exists=if_exists,
            chunksize=chunksize,
        )
        return None

    def execute_sql(self, sql: str, context: Context) -> Any:
        """Execute SQL query using Polars SQLContext.

        Args:
            sql: SQL query string
            context: Execution context with registered DataFrames

        Returns:
            pl.LazyFrame
        """
        ctx = pl.SQLContext()

        # Register datasets from context
        # We iterate over all registered names in the context
        try:
            names = context.list_names()
            for name in names:
                df = context.get(name)
                # Register LazyFrame or DataFrame.
                # Polars SQLContext supports registering LazyFrame, DataFrame, and some others;
                # we assume the Polars engine only holds Polars objects here.
                ctx.register(name, df)
        except Exception:
            # If context doesn't support listing or getting, we proceed with empty context
            # (e.g. if context is not fully compatible or empty)
            pass

        return ctx.execute(sql, eager=False)

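    # --- Illustrative usage (editor's sketch) ---
    # execute_sql() only needs a context exposing list_names()/get(); a duck-typed
    # stand-in is enough for illustration (odibi's real Context lives in
    # odibi.context and is an assumption here):
    #
    #   class _Ctx:
    #       def __init__(self, frames): self.frames = frames
    #       def list_names(self): return list(self.frames)
    #       def get(self, name): return self.frames[name]
    #
    #   lazy = engine.execute_sql(
    #       "SELECT x, COUNT(*) AS n FROM events GROUP BY x",
    #       _Ctx({"events": pl.DataFrame({"x": [1, 1, 2]}).lazy()}),
    #   )  # -> pl.LazyFrame
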
    def execute_operation(self, operation: str, params: Dict[str, Any], df: Any) -> Any:
        """Execute built-in operation."""
        # Most operations work on both DataFrame and LazyFrame; materialize only
        # where Polars requires an eager frame (e.g. pivot).

        if operation == "pivot":
            # params: pivot_column, value_column, group_by, agg_func
            # Pivot reshapes the frame and is only available eagerly, so collect
            # lazy input before pivoting.
            if isinstance(df, pl.LazyFrame):
                df = df.collect()

            return df.pivot(
                index=params.get("group_by"),
                on=params["pivot_column"],
                values=params["value_column"],
                aggregate_function=params.get("agg_func", "first"),
            )  # Returns DataFrame

        elif operation == "drop_duplicates":
            # unique() has the same signature on DataFrame and LazyFrame
            return df.unique(subset=params.get("subset"))

        elif operation == "fillna":
            value = params.get("value")
            # Polars uses fill_null
            if isinstance(value, dict):
                # Fill specific columns, e.g. value = {'col1': 0, 'col2': 'unknown'}
                exprs = []
                for col, val in value.items():
                    exprs.append(pl.col(col).fill_null(val))
                return df.with_columns(exprs)
            else:
                # Fill all columns with the same value
                return df.fill_null(value)

        elif operation == "drop":
            columns = params.get("columns") or params.get("labels")
            return df.drop(columns)

        elif operation == "rename":
            columns = params.get("columns") or params.get("mapper")
            return df.rename(columns)

        elif operation == "sort":
            by = params.get("by")
            descending = not params.get("ascending", True)
            return df.sort(by, descending=descending)

        elif operation == "sample":
            # Sample by row count (n) or fraction (frac)
            n = params.get("n")
            frac = params.get("frac")
            seed = params.get("random_state")

            if n is not None:
                # Exact-n sampling needs an eager frame; materialize lazy input and
                # hand it back lazily again.
                if isinstance(df, pl.LazyFrame):
                    return df.collect().sample(n=n, seed=seed).lazy()
                return df.sample(n=n, seed=seed)
            elif frac is not None:
                return df.sample(fraction=frac, seed=seed)

        elif operation == "filter":
            # Legacy or simple filter
            pass

        else:
            # Fallback: check if operation is a registered transformer
            from odibi.context import EngineContext, PandasContext
            from odibi.registry import FunctionRegistry

            if FunctionRegistry.has_function(operation):
                func = FunctionRegistry.get_function(operation)
                param_model = FunctionRegistry.get_param_model(operation)

                # Create EngineContext from current df (use PandasContext as placeholder)
                engine_ctx = EngineContext(
                    context=PandasContext(),
                    df=df,
                    engine=self,
                    engine_type=self.engine_type,
                )

                # Validate and instantiate params
                if param_model:
                    validated_params = param_model(**params)
                    result_ctx = func(engine_ctx, validated_params)
                else:
                    result_ctx = func(engine_ctx, **params)

                return result_ctx.df

        return df

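    # --- Illustrative parameter shapes (editor's sketch) ---
    # The parameter names consumed above, shown with hypothetical values:
    #
    #   engine.execute_operation("fillna", {"value": {"qty": 0}}, df)
    #   engine.execute_operation("sort", {"by": "ts", "ascending": False}, df)
    #   engine.execute_operation(
    #       "pivot",
    #       {"group_by": "region", "pivot_column": "year",
    #        "value_column": "sales", "agg_func": "sum"},
    #       df,
    #   )
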
    def get_schema(self, df: Any) -> Any:
        """Get DataFrame schema."""
        # Polars schema is a dict {name: DataType}
        # We can return a dict of strings for compatibility
        schema = df.collect_schema() if isinstance(df, pl.LazyFrame) else df.schema
        return {name: str(dtype) for name, dtype in schema.items()}

    def get_shape(self, df: Any) -> tuple:
        """Get DataFrame shape."""
        if isinstance(df, pl.LazyFrame):
            # Column names are cheap to read from the lazy schema; an exact row
            # count requires executing the plan via select(pl.len()).
            cols = len(df.collect_schema().names())
            rows = df.select(pl.len()).collect().item()
            return (rows, cols)
        return df.shape

    def count_rows(self, df: Any) -> int:
        """Count rows in DataFrame."""
        if isinstance(df, pl.LazyFrame):
            return df.select(pl.len()).collect().item()
        return len(df)

    def count_nulls(self, df: Any, columns: List[str]) -> Dict[str, int]:
        """Count nulls in specified columns."""
        if isinstance(df, pl.LazyFrame):
            # efficient null count
            return df.select([pl.col(c).null_count() for c in columns]).collect().to_dicts()[0]

        return df.select([pl.col(c).null_count() for c in columns]).to_dicts()[0]

    def validate_schema(self, df: Any, schema_rules: Dict[str, Any]) -> List[str]:
        """Validate DataFrame schema."""
        failures = []

        # Schema is dict-like in Polars
        current_schema = df.collect_schema() if isinstance(df, pl.LazyFrame) else df.schema
        current_cols = current_schema.keys()

        if "required_columns" in schema_rules:
            required = schema_rules["required_columns"]
            missing = set(required) - set(current_cols)
            if missing:
                failures.append(f"Missing required columns: {', '.join(missing)}")

        if "types" in schema_rules:
            for col, expected_type in schema_rules["types"].items():
                if col not in current_cols:
                    failures.append(f"Column '{col}' not found for type validation")
                    continue

                actual_type = str(current_schema[col])
                # Basic type check - simplistic string matching
                if expected_type.lower() not in actual_type.lower():
                    failures.append(
                        f"Column '{col}' has type '{actual_type}', expected '{expected_type}'"
                    )

        return failures

    def validate_data(self, df: Any, validation_config: Any) -> List[str]:
        """Validate data against rules.

        Args:
            df: DataFrame or LazyFrame
            validation_config: ValidationConfig object

        Returns:
            List of validation failure messages
        """
        failures = []

        if isinstance(df, pl.LazyFrame):
            schema = df.collect_schema()
            columns = schema.names()
        else:
            columns = df.columns

        if getattr(validation_config, "not_empty", False):
            count = self.count_rows(df)
            if count == 0:
                failures.append("DataFrame is empty")

        if getattr(validation_config, "no_nulls", None):
            cols = validation_config.no_nulls
            null_counts = self.count_nulls(df, cols)
            for col, count in null_counts.items():
                if count > 0:
                    failures.append(f"Column '{col}' has {count} null values")

        if getattr(validation_config, "schema_validation", None):
            schema_failures = self.validate_schema(df, validation_config.schema_validation)
            failures.extend(schema_failures)

        if getattr(validation_config, "ranges", None):
            for col, bounds in validation_config.ranges.items():
                if col in columns:
                    min_val = bounds.get("min")
                    max_val = bounds.get("max")

                    if min_val is not None:
                        if isinstance(df, pl.LazyFrame):
                            min_violations = (
                                df.filter(pl.col(col) < min_val).select(pl.len()).collect().item()
                            )
                        else:
                            min_violations = len(df.filter(pl.col(col) < min_val))
                        if min_violations > 0:
                            failures.append(f"Column '{col}' has values < {min_val}")

                    if max_val is not None:
                        if isinstance(df, pl.LazyFrame):
                            max_violations = (
                                df.filter(pl.col(col) > max_val).select(pl.len()).collect().item()
                            )
                        else:
                            max_violations = len(df.filter(pl.col(col) > max_val))
                        if max_violations > 0:
                            failures.append(f"Column '{col}' has values > {max_val}")
                else:
                    failures.append(f"Column '{col}' not found for range validation")

        if getattr(validation_config, "allowed_values", None):
            for col, allowed in validation_config.allowed_values.items():
                if col in columns:
                    if isinstance(df, pl.LazyFrame):
                        invalid_count = (
                            df.filter(~pl.col(col).is_in(allowed)).select(pl.len()).collect().item()
                        )
                    else:
                        invalid_count = len(df.filter(~pl.col(col).is_in(allowed)))
                    if invalid_count > 0:
                        failures.append(f"Column '{col}' has invalid values")
                else:
                    failures.append(f"Column '{col}' not found for allowed values validation")

        return failures

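    # --- Illustrative usage (editor's sketch) ---
    # validate_data() only reads its config via getattr(), so any object carrying
    # these attributes works for illustration (odibi's real ValidationConfig is
    # defined elsewhere in the package and is an assumption here):
    #
    #   from types import SimpleNamespace
    #   cfg = SimpleNamespace(
    #       not_empty=True,
    #       no_nulls=["id"],
    #       schema_validation={"required_columns": ["id", "amount"]},
    #       ranges={"amount": {"min": 0}},
    #       allowed_values={"status": ["open", "closed"]},
    #   )
    #   failures = engine.validate_data(df, cfg)  # -> list of failure messages
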
    def get_sample(self, df: Any, n: int = 10) -> List[Dict[str, Any]]:
        """Get sample rows as list of dictionaries."""
        if isinstance(df, pl.LazyFrame):
            return df.limit(n).collect().to_dicts()
        return df.head(n).to_dicts()

    def profile_nulls(self, df: Any) -> Dict[str, float]:
        """Calculate null percentage for each column."""
        if isinstance(df, pl.LazyFrame):
            # null_count() / count(), computed per column
            total_count = df.select(pl.len()).collect().item()
            if total_count == 0:
                return {col: 0.0 for col in df.collect_schema().names()}

            cols = df.collect_schema().names()
            null_counts = df.select([pl.col(c).null_count().alias(c) for c in cols]).collect()
            return {col: null_counts[col][0] / total_count for col in cols}

        total_count = len(df)
        if total_count == 0:
            return {col: 0.0 for col in df.columns}

        null_counts = df.null_count()
        return {col: null_counts[col][0] / total_count for col in df.columns}

    def table_exists(
        self, connection: Any, table: Optional[str] = None, path: Optional[str] = None
    ) -> bool:
        """Check if table or location exists."""
        if path:
            full_path = connection.get_path(path)
            return os.path.exists(full_path)
        return False

    def harmonize_schema(self, df: Any, target_schema: Dict[str, str], policy: Any) -> Any:
        """Harmonize DataFrame schema."""
        # policy: SchemaPolicyConfig
        from odibi.config import OnMissingColumns, OnNewColumns, SchemaMode

        # Helper to get current columns/schema
        if isinstance(df, pl.LazyFrame):
            current_schema = df.collect_schema()
        else:
            current_schema = df.schema

        current_cols = current_schema.names()
        target_cols = list(target_schema.keys())

        missing = set(target_cols) - set(current_cols)
        new_cols = set(current_cols) - set(target_cols)

        # 1. Validation
        if missing and getattr(policy, "on_missing_columns", None) == OnMissingColumns.FAIL:
            raise ValueError(
                f"Schema Policy Violation: DataFrame is missing required columns {missing}. "
                f"Available columns: {current_cols}. Add missing columns or set on_missing_columns policy."
            )

        if new_cols and getattr(policy, "on_new_columns", None) == OnNewColumns.FAIL:
            raise ValueError(
                f"Schema Policy Violation: DataFrame contains unexpected columns {new_cols}. "
                f"Expected columns: {target_cols}. Remove extra columns or set on_new_columns policy."
            )

        # 2. Transformations
        # ENFORCE selects only the target columns; EVOLVE keeps any new columns.
        mode = getattr(policy, "mode", SchemaMode.ENFORCE)

        # Target columns absent from the DataFrame are added as nulls when
        # on_missing_columns == FILL_NULL, regardless of mode.
        exprs = []
        if missing and getattr(policy, "on_missing_columns", None) == OnMissingColumns.FILL_NULL:
            for col in missing:
                exprs.append(pl.lit(None).alias(col))

        if exprs:
            df = df.with_columns(exprs)

        if mode == SchemaMode.ENFORCE:
            # Select only target columns: new columns (not in target) are dropped
            # implicitly, and missing columns were added above if configured.
            df = df.select([pl.col(col) for col in target_cols])

        elif mode == SchemaMode.EVOLVE:
            # Keep new columns; missing target columns were added above (if FILL_NULL).
            pass

        return df

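    # --- Illustrative usage (editor's sketch) ---
    # harmonize_schema() also reads its policy via getattr(), so a duck-typed
    # policy object is enough to sketch ENFORCE + FILL_NULL behaviour (odibi's
    # real SchemaPolicyConfig is an assumption here):
    #
    #   from types import SimpleNamespace
    #   from odibi.config import OnMissingColumns, OnNewColumns, SchemaMode
    #
    #   policy = SimpleNamespace(
    #       mode=SchemaMode.ENFORCE,
    #       on_missing_columns=OnMissingColumns.FILL_NULL,
    #       on_new_columns=OnNewColumns.ADD_NULLABLE,
    #   )
    #   # Missing target columns come back as nulls, extras are dropped by the
    #   # final select over the target schema.
    #   out = engine.harmonize_schema(df, {"id": "Int64", "amount": "Float64"}, policy)
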
    def anonymize(
        self, df: Any, columns: List[str], method: str, salt: Optional[str] = None
    ) -> Any:
        """Anonymize specified columns."""
        if method == "mask":
            # Mask all but the last 4 characters, e.g. "12345" -> "*2345".
            # Mirrors the Pandas engine's .str.replace(r".(?=.{4})", "*"): strings of
            # 4 characters or fewer are left unchanged.
            return df.with_columns(
                [
                    pl.when(pl.col(c).cast(pl.Utf8).str.len_chars() > 4)
                    .then(
                        pl.concat_str(
                            [
                                pl.lit("*").repeat_by(pl.col(c).str.len_chars() - 4).list.join(""),
                                pl.col(c).str.slice(-4),
                            ]
                        )
                    )
                    .otherwise(pl.col(c).cast(pl.Utf8))
                    .alias(c)
                    for c in columns
                ]
            )

        elif method == "hash":
            # Polars' native hash() is a fast non-cryptographic hash. To stay
            # compatible with the Pandas engine (salted SHA-256), fall back to a
            # Python UDF via map_elements; this is the slow path, but there is no
            # native SHA-256 expression without a plugin.
            def _hash_val(val):
                if val is None:
                    return None
                to_hash = str(val)
                if salt:
                    to_hash += salt
                return hashlib.sha256(to_hash.encode("utf-8")).hexdigest()

            return df.with_columns(
                [pl.col(c).map_elements(_hash_val, return_dtype=pl.Utf8).alias(c) for c in columns]
            )

        elif method == "redact":
            return df.with_columns([pl.lit("[REDACTED]").alias(c) for c in columns])

        return df

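    # --- Illustrative behaviour (editor's sketch) ---
    # With the masking rule above, only strings longer than four characters are
    # touched; hashing applies salted SHA-256 per value:
    #
    #   df = pl.DataFrame({"card": ["12345", "123"]})
    #   engine.anonymize(df, ["card"], method="mask")            # -> ["*2345", "123"]
    #   engine.anonymize(df, ["card"], method="hash", salt="s")  # -> 64-char hex digests
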
    def get_table_schema(
        self,
        connection: Any,
        table: Optional[str] = None,
        path: Optional[str] = None,
        format: Optional[str] = None,
    ) -> Optional[Dict[str, str]]:
        """Get schema of an existing table/file.

        Args:
            connection: Connection object
            table: Table name
            path: File path
            format: Data format (optional, helps with file-based sources)

        Returns:
            Schema dict or None if table doesn't exist or schema fetch fails.
        """
        from odibi.utils.logging_context import get_logging_context

        ctx = get_logging_context().with_context(engine="polars")

        try:
            if table and format in ["sql", "sql_server", "azure_sql"]:
                query = f"SELECT TOP 0 * FROM {table}"
                df = connection.read_sql(query)
                return {col: str(dtype) for col, dtype in zip(df.columns, df.dtypes)}

            if path:
                full_path = connection.get_path(path) if connection else path
                if not os.path.exists(full_path):
                    return None

                if format == "delta":
                    try:
                        from deltalake import DeltaTable

                        dt = DeltaTable(full_path)
                        arrow_schema = dt.schema().to_pyarrow()
                        return {field.name: str(field.type) for field in arrow_schema}
                    except ImportError:
                        ctx.warning(
                            "deltalake library not installed for schema introspection",
                            path=full_path,
                        )
                        return None

                elif format == "parquet":
                    try:
                        import pyarrow.parquet as pq
                        import glob as glob_mod

                        target_path = full_path
                        if os.path.isdir(full_path):
                            files = glob_mod.glob(os.path.join(full_path, "*.parquet"))
                            if not files:
                                return None
                            target_path = files[0]

                        schema = pq.read_schema(target_path)
                        return {field.name: str(field.type) for field in schema}
                    except ImportError:
                        lf = pl.scan_parquet(full_path)
                        schema = lf.collect_schema()
                        return {name: str(dtype) for name, dtype in schema.items()}

                elif format == "csv":
                    lf = pl.scan_csv(full_path)
                    schema = lf.collect_schema()
                    return {name: str(dtype) for name, dtype in schema.items()}

        except (FileNotFoundError, PermissionError):
            return None
        except Exception as e:
            ctx.warning(f"Failed to infer schema for {table or path}: {e}")
            return None

        return None

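    # --- Illustrative usage (editor's sketch) ---
    # Schema introspection for a local Parquet dataset; with connection=None the
    # raw path is used directly. The path is hypothetical.
    #
    #   schema = engine.get_table_schema(
    #       connection=None, path="out/events.parquet", format="parquet"
    #   )
    #   # -> {"x": "int64", ...} or None if the file does not exist
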
    def maintain_table(
        self,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        config: Optional[Any] = None,
    ) -> None:
        """Run table maintenance operations (optimize, vacuum) for Delta tables.

        Args:
            connection: Connection object
            format: Table format
            table: Table name
            path: Table path
            config: AutoOptimizeConfig object
        """
        from odibi.utils.logging_context import get_logging_context

        ctx = get_logging_context().with_context(engine="polars")

        if format != "delta" or not config or not getattr(config, "enabled", False):
            return

        if not path and not table:
            return

        full_path = connection.get_path(path if path else table) if connection else (path or table)

        ctx.info("Starting table maintenance", path=str(full_path))

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.warning(
                "Auto-optimize skipped: 'deltalake' library not installed",
                path=str(full_path),
            )
            return

        try:
            import time

            start = time.time()

            storage_opts = {}
            if hasattr(connection, "pandas_storage_options"):
                storage_opts = connection.pandas_storage_options()

            dt = DeltaTable(full_path, storage_options=storage_opts)

            ctx.info("Running Delta OPTIMIZE (compaction)", path=str(full_path))
            dt.optimize.compact()

            retention = getattr(config, "vacuum_retention_hours", None)
            if retention is not None and retention > 0:
                ctx.info(
                    "Running Delta VACUUM",
                    path=str(full_path),
                    retention_hours=retention,
                )
                dt.vacuum(
                    retention_hours=retention,
                    enforce_retention_duration=True,
                    dry_run=False,
                )

            elapsed = (time.time() - start) * 1000
            ctx.info(
                "Table maintenance completed",
                path=str(full_path),
                elapsed_ms=round(elapsed, 2),
            )

        except Exception as e:
            ctx.warning(
                "Auto-optimize failed",
                path=str(full_path),
                error=str(e),
            )

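    # --- Illustrative usage (editor's sketch) ---
    # maintain_table() only reads 'enabled' and 'vacuum_retention_hours' from the
    # config via getattr(), so a duck-typed config is enough for illustration
    # (odibi's real AutoOptimizeConfig is an assumption here; the path is hypothetical):
    #
    #   from types import SimpleNamespace
    #   engine.maintain_table(
    #       connection=None,
    #       format="delta",
    #       path="out/events_delta",
    #       config=SimpleNamespace(enabled=True, vacuum_retention_hours=168),
    #   )
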
    def get_source_files(self, df: Any) -> List[str]:
        """Get list of source files that generated this DataFrame.

        For Polars, this checks if source file info was stored
        in the DataFrame's metadata during read.

        Args:
            df: DataFrame or LazyFrame

        Returns:
            List of file paths (or empty list if not applicable/supported)
        """
        if isinstance(df, pl.LazyFrame):
            return []

        if hasattr(df, "attrs"):
            return df.attrs.get("odibi_source_files", [])

        return []

    def vacuum_delta(
        self,
        connection: Any,
        path: str,
        retention_hours: int = 168,
        dry_run: bool = False,
        enforce_retention_duration: bool = True,
    ) -> Dict[str, Any]:
        """VACUUM a Delta table to remove old files.

        Args:
            connection: Connection object
            path: Delta table path
            retention_hours: Retention period (default 168 = 7 days)
            dry_run: If True, only show files to be deleted
            enforce_retention_duration: If False, allows retention < 168 hours (testing only)

        Returns:
            Dictionary with files_deleted count
        """
        from odibi.utils.logging_context import get_logging_context
        import time

        ctx = get_logging_context().with_context(engine="polars")
        start = time.time()

        ctx.debug(
            "Starting Delta VACUUM",
            path=path,
            retention_hours=retention_hours,
            dry_run=dry_run,
        )

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.error("Delta Lake library not installed", path=path)
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[polars]' "
                "or 'pip install deltalake'. See README.md for installation instructions."
            )

        full_path = connection.get_path(path) if connection else path

        storage_opts = {}
        if hasattr(connection, "pandas_storage_options"):
            storage_opts = connection.pandas_storage_options()

        dt = DeltaTable(full_path, storage_options=storage_opts)
        deleted_files = dt.vacuum(
            retention_hours=retention_hours,
            dry_run=dry_run,
            enforce_retention_duration=enforce_retention_duration,
        )

        elapsed = (time.time() - start) * 1000
        ctx.info(
            "Delta VACUUM completed",
            path=str(full_path),
            files_deleted=len(deleted_files),
            dry_run=dry_run,
            elapsed_ms=round(elapsed, 2),
        )

        return {"files_deleted": len(deleted_files)}

    def get_delta_history(
        self, connection: Any, path: str, limit: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """Get Delta table history.

        Args:
            connection: Connection object
            path: Delta table path
            limit: Maximum number of versions to return

        Returns:
            List of version metadata dictionaries
        """
        from odibi.utils.logging_context import get_logging_context
        import time

        ctx = get_logging_context().with_context(engine="polars")
        start = time.time()

        ctx.debug("Getting Delta table history", path=path, limit=limit)

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.error("Delta Lake library not installed", path=path)
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[polars]' "
                "or 'pip install deltalake'. See README.md for installation instructions."
            )

        full_path = connection.get_path(path) if connection else path

        storage_opts = {}
        if hasattr(connection, "pandas_storage_options"):
            storage_opts = connection.pandas_storage_options()

        dt = DeltaTable(full_path, storage_options=storage_opts)
        history = dt.history(limit=limit)

        elapsed = (time.time() - start) * 1000
        ctx.info(
            "Delta history retrieved",
            path=str(full_path),
            versions_returned=len(history) if history else 0,
            elapsed_ms=round(elapsed, 2),
        )

        return history