odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
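
The largest source file in this release is the pandas execution engine, whose full added content is reproduced below. As a quick orientation before the file body, here is a minimal usage sketch; it is not taken from the package's documentation. The constructor signature and the `read`/`write` methods come from the source that follows, while the `LocalConnection` class, the root directory, and the file names are hypothetical stand-ins (the real connection classes live under `odibi/connections/` and are not shown in this section).

```python
# Minimal sketch (assumptions noted above): exercising PandasEngine from this wheel
# against plain local files. The dummy connection only provides get_path(), which is
# the hook the engine probes when resolving relative paths.
import pandas as pd

from odibi.engine.pandas_engine import PandasEngine


class LocalConnection:  # hypothetical stand-in for odibi's local connection class
    def __init__(self, root: str):
        self.root = root

    def get_path(self, path: str) -> str:
        # Resolve a relative dataset path against a local root directory.
        return f"{self.root}/{path}"


conn = LocalConnection("/tmp/odibi-demo")
engine = PandasEngine(connections={"local": conn})

# Write a small frame as CSV, then read it back through the same engine.
df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})
engine.write(df, connection=conn, format="csv", path="sample.csv", mode="overwrite")
result = engine.read(connection=conn, format="csv", path="sample.csv")
print(result.head())
```
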
|
@@ -0,0 +1,2178 @@
|
|
|
1
|
+
"""Pandas engine implementation."""
|
|
2
|
+
|
|
3
|
+
import glob
|
|
4
|
+
import hashlib
|
|
5
|
+
import os
|
|
6
|
+
import random
|
|
7
|
+
import time
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Dict, Iterator, List, Optional, Union
|
|
13
|
+
from urllib.parse import urlparse
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from odibi.context import Context, PandasContext
|
|
18
|
+
from odibi.engine.base import Engine
|
|
19
|
+
from odibi.enums import EngineType
|
|
20
|
+
from odibi.exceptions import TransformError
|
|
21
|
+
from odibi.utils.logging_context import get_logging_context
|
|
22
|
+
|
|
23
|
+
__all__ = ["PandasEngine", "LazyDataset"]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
|
|
27
|
+
class LazyDataset:
|
|
28
|
+
"""Lazy representation of a dataset (file) for out-of-core processing."""
|
|
29
|
+
|
|
30
|
+
path: Union[str, List[str]]
|
|
31
|
+
format: str
|
|
32
|
+
options: Dict[str, Any]
|
|
33
|
+
connection: Optional[Any] = None # To resolve path/credentials if needed
|
|
34
|
+
|
|
35
|
+
def __repr__(self):
|
|
36
|
+
return f"LazyDataset(path={self.path}, format={self.format})"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class PandasEngine(Engine):
|
|
40
|
+
"""Pandas-based execution engine."""
|
|
41
|
+
|
|
42
|
+
name = "pandas"
|
|
43
|
+
engine_type = EngineType.PANDAS
|
|
44
|
+
|
|
45
|
+
def __init__(
|
|
46
|
+
self,
|
|
47
|
+
connections: Optional[Dict[str, Any]] = None,
|
|
48
|
+
config: Optional[Dict[str, Any]] = None,
|
|
49
|
+
):
|
|
50
|
+
"""Initialize Pandas engine.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
connections: Dictionary of connection objects
|
|
54
|
+
config: Engine configuration (optional)
|
|
55
|
+
"""
|
|
56
|
+
self.connections = connections or {}
|
|
57
|
+
self.config = config or {}
|
|
58
|
+
|
|
59
|
+
# Suppress noisy delta-rs transaction conflict warnings (handled by retry)
|
|
60
|
+
if "RUST_LOG" not in os.environ:
|
|
61
|
+
os.environ["RUST_LOG"] = "deltalake_core::kernel::transaction=error"
|
|
62
|
+
|
|
63
|
+
# Check for performance flags
|
|
64
|
+
performance = self.config.get("performance", {})
|
|
65
|
+
|
|
66
|
+
# Determine desired state
|
|
67
|
+
if hasattr(performance, "use_arrow"):
|
|
68
|
+
desired_use_arrow = performance.use_arrow
|
|
69
|
+
elif isinstance(performance, dict):
|
|
70
|
+
desired_use_arrow = performance.get("use_arrow", True)
|
|
71
|
+
else:
|
|
72
|
+
desired_use_arrow = True
|
|
73
|
+
|
|
74
|
+
# Verify availability
|
|
75
|
+
if desired_use_arrow:
|
|
76
|
+
try:
|
|
77
|
+
import pyarrow # noqa: F401
|
|
78
|
+
|
|
79
|
+
self.use_arrow = True
|
|
80
|
+
except ImportError:
|
|
81
|
+
import logging
|
|
82
|
+
|
|
83
|
+
logger = logging.getLogger(__name__)
|
|
84
|
+
logger.warning(
|
|
85
|
+
"Apache Arrow not found. Disabling Arrow optimizations. "
|
|
86
|
+
"Install 'pyarrow' to enable."
|
|
87
|
+
)
|
|
88
|
+
self.use_arrow = False
|
|
89
|
+
else:
|
|
90
|
+
self.use_arrow = False
|
|
91
|
+
|
|
92
|
+
# Check for DuckDB
|
|
93
|
+
self.use_duckdb = False
|
|
94
|
+
# Default to False to ensure stability with existing tests (Lazy Loading is opt-in)
|
|
95
|
+
if self.config.get("performance", {}).get("use_duckdb", False):
|
|
96
|
+
try:
|
|
97
|
+
import duckdb # noqa: F401
|
|
98
|
+
|
|
99
|
+
self.use_duckdb = True
|
|
100
|
+
except ImportError:
|
|
101
|
+
pass
|
|
102
|
+
|
|
103
|
+
def materialize(self, df: Any) -> Any:
|
|
104
|
+
"""Materialize lazy dataset."""
|
|
105
|
+
if isinstance(df, LazyDataset):
|
|
106
|
+
# Re-invoke read but force materialization (by bypassing Lazy check)
|
|
107
|
+
# We pass the resolved path directly
|
|
108
|
+
# Note: We need to handle the case where path was resolved.
|
|
109
|
+
# LazyDataset.path should be the FULL path.
|
|
110
|
+
return self._read_file(
|
|
111
|
+
full_path=df.path, format=df.format, options=df.options, connection=df.connection
|
|
112
|
+
)
|
|
113
|
+
return df
|
|
114
|
+
|
|
115
|
+
def _process_df(
|
|
116
|
+
self, df: Union[pd.DataFrame, Iterator[pd.DataFrame]], query: Optional[str]
|
|
117
|
+
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
|
|
118
|
+
"""Apply post-read processing (filtering)."""
|
|
119
|
+
if query and df is not None:
|
|
120
|
+
# Handle Iterator
|
|
121
|
+
from collections.abc import Iterator
|
|
122
|
+
|
|
123
|
+
if isinstance(df, Iterator):
|
|
124
|
+
# Filter each chunk
|
|
125
|
+
return (chunk.query(query) for chunk in df)
|
|
126
|
+
|
|
127
|
+
if not df.empty:
|
|
128
|
+
try:
|
|
129
|
+
return df.query(query)
|
|
130
|
+
except Exception as e:
|
|
131
|
+
import logging
|
|
132
|
+
|
|
133
|
+
logger = logging.getLogger(__name__)
|
|
134
|
+
logger.warning(f"Failed to apply query '{query}': {e}")
|
|
135
|
+
return df
|
|
136
|
+
|
|
137
|
+
_CLOUD_URI_PREFIXES = ("abfss://", "s3://", "gs://", "az://", "https://")
|
|
138
|
+
|
|
139
|
+
def _retry_delta_operation(self, func, max_retries: int = 5, base_delay: float = 0.2):
|
|
140
|
+
"""Retry Delta operations with exponential backoff for concurrent conflicts."""
|
|
141
|
+
for attempt in range(max_retries):
|
|
142
|
+
try:
|
|
143
|
+
return func()
|
|
144
|
+
except Exception as e:
|
|
145
|
+
error_str = str(e).lower()
|
|
146
|
+
is_conflict = "conflict" in error_str or "concurrent" in error_str
|
|
147
|
+
if attempt == max_retries - 1 or not is_conflict:
|
|
148
|
+
raise
|
|
149
|
+
delay = base_delay * (2**attempt) + random.uniform(0, 0.1)
|
|
150
|
+
time.sleep(delay)
|
|
151
|
+
|
|
152
|
+
def _resolve_path(self, path: Optional[str], connection: Any) -> str:
|
|
153
|
+
"""Resolve path to full URI, avoiding double-prefixing for cloud URIs.
|
|
154
|
+
|
|
155
|
+
Args:
|
|
156
|
+
path: Relative or absolute path
|
|
157
|
+
connection: Connection object (may have get_path method)
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
Full resolved path
|
|
161
|
+
"""
|
|
162
|
+
if not path:
|
|
163
|
+
raise ValueError(
|
|
164
|
+
"Failed to resolve path: path argument is required but was empty or None. "
|
|
165
|
+
"Provide a valid file path or use 'table' parameter with a connection."
|
|
166
|
+
)
|
|
167
|
+
if path.startswith(self._CLOUD_URI_PREFIXES):
|
|
168
|
+
return path
|
|
169
|
+
if connection:
|
|
170
|
+
return connection.get_path(path)
|
|
171
|
+
return path
|
|
172
|
+
|
|
173
|
+
def _merge_storage_options(
|
|
174
|
+
self, connection: Any, options: Optional[Dict[str, Any]] = None
|
|
175
|
+
) -> Dict[str, Any]:
|
|
176
|
+
"""Merge connection storage options with user options.
|
|
177
|
+
|
|
178
|
+
Args:
|
|
179
|
+
connection: Connection object (may have pandas_storage_options method)
|
|
180
|
+
options: User-provided options
|
|
181
|
+
|
|
182
|
+
Returns:
|
|
183
|
+
Merged options dictionary
|
|
184
|
+
"""
|
|
185
|
+
options = options or {}
|
|
186
|
+
|
|
187
|
+
# If connection provides storage_options (e.g., AzureADLS), merge them
|
|
188
|
+
if hasattr(connection, "pandas_storage_options"):
|
|
189
|
+
conn_storage_opts = connection.pandas_storage_options()
|
|
190
|
+
user_storage_opts = options.get("storage_options", {})
|
|
191
|
+
|
|
192
|
+
# User options override connection options
|
|
193
|
+
merged_storage_opts = {**conn_storage_opts, **user_storage_opts}
|
|
194
|
+
|
|
195
|
+
# Return options with merged storage_options
|
|
196
|
+
return {**options, "storage_options": merged_storage_opts}
|
|
197
|
+
|
|
198
|
+
return options
|
|
199
|
+
|
|
200
|
+
def _read_parallel(self, read_func: Any, paths: List[str], **kwargs) -> pd.DataFrame:
|
|
201
|
+
"""Read multiple files in parallel using threads.
|
|
202
|
+
|
|
203
|
+
Args:
|
|
204
|
+
read_func: Pandas read function (e.g. pd.read_csv)
|
|
205
|
+
paths: List of file paths
|
|
206
|
+
kwargs: Arguments to pass to read_func
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
Concatenated DataFrame
|
|
210
|
+
"""
|
|
211
|
+
# Conservative worker count to avoid OOM on large files
|
|
212
|
+
max_workers = min(8, os.cpu_count() or 4)
|
|
213
|
+
|
|
214
|
+
dfs = []
|
|
215
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
216
|
+
# map preserves order
|
|
217
|
+
results = executor.map(lambda p: read_func(p, **kwargs), paths)
|
|
218
|
+
dfs = list(results)
|
|
219
|
+
|
|
220
|
+
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
|
221
|
+
|
|
222
|
+
def read(
|
|
223
|
+
self,
|
|
224
|
+
connection: Any,
|
|
225
|
+
format: str,
|
|
226
|
+
table: Optional[str] = None,
|
|
227
|
+
path: Optional[str] = None,
|
|
228
|
+
streaming: bool = False,
|
|
229
|
+
schema: Optional[str] = None,
|
|
230
|
+
options: Optional[Dict[str, Any]] = None,
|
|
231
|
+
as_of_version: Optional[int] = None,
|
|
232
|
+
as_of_timestamp: Optional[str] = None,
|
|
233
|
+
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
|
|
234
|
+
"""Read data using Pandas (or LazyDataset)."""
|
|
235
|
+
ctx = get_logging_context().with_context(engine="pandas")
|
|
236
|
+
start = time.time()
|
|
237
|
+
|
|
238
|
+
source = path or table
|
|
239
|
+
ctx.debug(
|
|
240
|
+
"Starting read operation",
|
|
241
|
+
format=format,
|
|
242
|
+
path=source,
|
|
243
|
+
streaming=streaming,
|
|
244
|
+
use_arrow=self.use_arrow,
|
|
245
|
+
)
|
|
246
|
+
|
|
247
|
+
if streaming:
|
|
248
|
+
ctx.error(
|
|
249
|
+
"Streaming not supported in Pandas engine",
|
|
250
|
+
format=format,
|
|
251
|
+
path=source,
|
|
252
|
+
)
|
|
253
|
+
raise ValueError(
|
|
254
|
+
"Streaming is not supported in the Pandas engine. "
|
|
255
|
+
"Please use 'engine: spark' for streaming pipelines."
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
options = options or {}
|
|
259
|
+
|
|
260
|
+
# Resolve full path from connection
|
|
261
|
+
try:
|
|
262
|
+
full_path = self._resolve_path(path or table, connection)
|
|
263
|
+
except ValueError:
|
|
264
|
+
if table and not connection:
|
|
265
|
+
ctx.error("Connection required when specifying 'table'", table=table)
|
|
266
|
+
raise ValueError(
|
|
267
|
+
f"Cannot read table '{table}': connection is required when using 'table' parameter. "
|
|
268
|
+
"Provide a valid connection object or use 'path' for file-based reads."
|
|
269
|
+
)
|
|
270
|
+
ctx.error("Neither path nor table provided for read operation")
|
|
271
|
+
raise ValueError(
|
|
272
|
+
"Read operation failed: neither 'path' nor 'table' was provided. "
|
|
273
|
+
"Specify a file path or table name in your configuration."
|
|
274
|
+
)
|
|
275
|
+
|
|
276
|
+
# Merge storage options for cloud connections
|
|
277
|
+
merged_options = self._merge_storage_options(connection, options)
|
|
278
|
+
|
|
279
|
+
# Sanitize options for pandas compatibility
|
|
280
|
+
if "header" in merged_options:
|
|
281
|
+
if merged_options["header"] is True:
|
|
282
|
+
merged_options["header"] = 0
|
|
283
|
+
elif merged_options["header"] is False:
|
|
284
|
+
merged_options["header"] = None
|
|
285
|
+
|
|
286
|
+
# Handle Time Travel options
|
|
287
|
+
if as_of_version is not None:
|
|
288
|
+
merged_options["versionAsOf"] = as_of_version
|
|
289
|
+
ctx.debug("Time travel enabled", version=as_of_version)
|
|
290
|
+
if as_of_timestamp is not None:
|
|
291
|
+
merged_options["timestampAsOf"] = as_of_timestamp
|
|
292
|
+
ctx.debug("Time travel enabled", timestamp=as_of_timestamp)
|
|
293
|
+
|
|
294
|
+
# Check for Lazy/DuckDB optimization
|
|
295
|
+
can_lazy_load = False
|
|
296
|
+
|
|
297
|
+
if can_lazy_load:
|
|
298
|
+
ctx.debug("Using lazy loading via DuckDB", path=str(full_path))
|
|
299
|
+
if isinstance(full_path, (str, Path)):
|
|
300
|
+
return LazyDataset(
|
|
301
|
+
path=str(full_path),
|
|
302
|
+
format=format,
|
|
303
|
+
options=merged_options,
|
|
304
|
+
connection=connection,
|
|
305
|
+
)
|
|
306
|
+
elif isinstance(full_path, list):
|
|
307
|
+
return LazyDataset(
|
|
308
|
+
path=full_path, format=format, options=merged_options, connection=connection
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
result = self._read_file(full_path, format, merged_options, connection)
|
|
312
|
+
|
|
313
|
+
# Log metrics for materialized DataFrames
|
|
314
|
+
elapsed = (time.time() - start) * 1000
|
|
315
|
+
if isinstance(result, pd.DataFrame):
|
|
316
|
+
row_count = len(result)
|
|
317
|
+
memory_mb = result.memory_usage(deep=True).sum() / (1024 * 1024)
|
|
318
|
+
|
|
319
|
+
ctx.log_file_io(
|
|
320
|
+
path=str(full_path) if not isinstance(full_path, list) else str(full_path[0]),
|
|
321
|
+
format=format,
|
|
322
|
+
mode="read",
|
|
323
|
+
rows=row_count,
|
|
324
|
+
)
|
|
325
|
+
ctx.log_pandas_metrics(
|
|
326
|
+
memory_mb=memory_mb,
|
|
327
|
+
dtypes={col: str(dtype) for col, dtype in result.dtypes.items()},
|
|
328
|
+
)
|
|
329
|
+
ctx.info(
|
|
330
|
+
"Read completed",
|
|
331
|
+
format=format,
|
|
332
|
+
rows=row_count,
|
|
333
|
+
elapsed_ms=round(elapsed, 2),
|
|
334
|
+
memory_mb=round(memory_mb, 2),
|
|
335
|
+
)
|
|
336
|
+
|
|
337
|
+
return result
|
|
338
|
+
|
|
339
|
+
def _read_file(
|
|
340
|
+
self,
|
|
341
|
+
full_path: Union[str, List[str], Any],
|
|
342
|
+
format: str,
|
|
343
|
+
options: Dict[str, Any],
|
|
344
|
+
connection: Any = None,
|
|
345
|
+
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
|
|
346
|
+
"""Internal file reading logic."""
|
|
347
|
+
ctx = get_logging_context().with_context(engine="pandas")
|
|
348
|
+
|
|
349
|
+
ctx.debug(
|
|
350
|
+
"Reading file",
|
|
351
|
+
path=str(full_path) if not isinstance(full_path, list) else f"{len(full_path)} files",
|
|
352
|
+
format=format,
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
# Custom Readers
|
|
356
|
+
if format in self._custom_readers:
|
|
357
|
+
ctx.debug(f"Using custom reader for format: {format}")
|
|
358
|
+
return self._custom_readers[format](full_path, **options)
|
|
359
|
+
|
|
360
|
+
# Handle glob patterns for local files
|
|
361
|
+
is_glob = False
|
|
362
|
+
if isinstance(full_path, (str, Path)) and (
|
|
363
|
+
"*" in str(full_path) or "?" in str(full_path) or "[" in str(full_path)
|
|
364
|
+
):
|
|
365
|
+
parsed = urlparse(str(full_path))
|
|
366
|
+
# Only expand for local files (no scheme, file://, or drive letter)
|
|
367
|
+
is_local = (
|
|
368
|
+
not parsed.scheme
|
|
369
|
+
or parsed.scheme == "file"
|
|
370
|
+
or (len(parsed.scheme) == 1 and parsed.scheme.isalpha())
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
if is_local:
|
|
374
|
+
glob_path = str(full_path)
|
|
375
|
+
if glob_path.startswith("file:///"):
|
|
376
|
+
glob_path = glob_path[8:]
|
|
377
|
+
elif glob_path.startswith("file://"):
|
|
378
|
+
glob_path = glob_path[7:]
|
|
379
|
+
|
|
380
|
+
matched_files = glob.glob(glob_path)
|
|
381
|
+
if not matched_files:
|
|
382
|
+
ctx.error(
|
|
383
|
+
"No files matched glob pattern",
|
|
384
|
+
pattern=glob_path,
|
|
385
|
+
)
|
|
386
|
+
raise FileNotFoundError(f"No files matched pattern: {glob_path}")
|
|
387
|
+
|
|
388
|
+
ctx.info(
|
|
389
|
+
"Glob pattern expanded",
|
|
390
|
+
pattern=glob_path,
|
|
391
|
+
matched_files=len(matched_files),
|
|
392
|
+
)
|
|
393
|
+
full_path = matched_files
|
|
394
|
+
is_glob = True
|
|
395
|
+
|
|
396
|
+
# Prepare read options (options already includes storage_options from caller)
|
|
397
|
+
read_kwargs = options.copy()
|
|
398
|
+
|
|
399
|
+
# Extract 'query' or 'filter' option for post-read filtering
|
|
400
|
+
post_read_query = read_kwargs.pop("query", None) or read_kwargs.pop("filter", None)
|
|
401
|
+
|
|
402
|
+
if self.use_arrow:
|
|
403
|
+
read_kwargs["dtype_backend"] = "pyarrow"
|
|
404
|
+
|
|
405
|
+
# Read based on format
|
|
406
|
+
if format == "csv":
|
|
407
|
+
try:
|
|
408
|
+
if is_glob and isinstance(full_path, list):
|
|
409
|
+
ctx.debug(
|
|
410
|
+
"Parallel CSV read",
|
|
411
|
+
file_count=len(full_path),
|
|
412
|
+
)
|
|
413
|
+
df = self._read_parallel(pd.read_csv, full_path, **read_kwargs)
|
|
414
|
+
df.attrs["odibi_source_files"] = full_path
|
|
415
|
+
return self._process_df(df, post_read_query)
|
|
416
|
+
|
|
417
|
+
df = pd.read_csv(full_path, **read_kwargs)
|
|
418
|
+
if hasattr(df, "attrs"):
|
|
419
|
+
df.attrs["odibi_source_files"] = [str(full_path)]
|
|
420
|
+
return self._process_df(df, post_read_query)
|
|
421
|
+
except UnicodeDecodeError:
|
|
422
|
+
ctx.warning(
|
|
423
|
+
"UnicodeDecodeError, retrying with latin1 encoding",
|
|
424
|
+
path=str(full_path),
|
|
425
|
+
)
|
|
426
|
+
read_kwargs["encoding"] = "latin1"
|
|
427
|
+
if is_glob and isinstance(full_path, list):
|
|
428
|
+
df = self._read_parallel(pd.read_csv, full_path, **read_kwargs)
|
|
429
|
+
df.attrs["odibi_source_files"] = full_path
|
|
430
|
+
return self._process_df(df, post_read_query)
|
|
431
|
+
|
|
432
|
+
df = pd.read_csv(full_path, **read_kwargs)
|
|
433
|
+
if hasattr(df, "attrs"):
|
|
434
|
+
df.attrs["odibi_source_files"] = [str(full_path)]
|
|
435
|
+
return self._process_df(df, post_read_query)
|
|
436
|
+
except pd.errors.ParserError:
|
|
437
|
+
ctx.warning(
|
|
438
|
+
"ParserError, retrying with on_bad_lines='skip'",
|
|
439
|
+
path=str(full_path),
|
|
440
|
+
)
|
|
441
|
+
read_kwargs["on_bad_lines"] = "skip"
|
|
442
|
+
if is_glob and isinstance(full_path, list):
|
|
443
|
+
df = self._read_parallel(pd.read_csv, full_path, **read_kwargs)
|
|
444
|
+
df.attrs["odibi_source_files"] = full_path
|
|
445
|
+
return self._process_df(df, post_read_query)
|
|
446
|
+
|
|
447
|
+
df = pd.read_csv(full_path, **read_kwargs)
|
|
448
|
+
if hasattr(df, "attrs"):
|
|
449
|
+
df.attrs["odibi_source_files"] = [str(full_path)]
|
|
450
|
+
return self._process_df(df, post_read_query)
|
|
451
|
+
elif format == "parquet":
|
|
452
|
+
ctx.debug("Reading parquet", path=str(full_path))
|
|
453
|
+
df = pd.read_parquet(full_path, **read_kwargs)
|
|
454
|
+
if isinstance(full_path, list):
|
|
455
|
+
df.attrs["odibi_source_files"] = full_path
|
|
456
|
+
else:
|
|
457
|
+
df.attrs["odibi_source_files"] = [str(full_path)]
|
|
458
|
+
return self._process_df(df, post_read_query)
|
|
459
|
+
elif format == "json":
|
|
460
|
+
if is_glob and isinstance(full_path, list):
|
|
461
|
+
ctx.debug(
|
|
462
|
+
"Parallel JSON read",
|
|
463
|
+
file_count=len(full_path),
|
|
464
|
+
)
|
|
465
|
+
df = self._read_parallel(pd.read_json, full_path, **read_kwargs)
|
|
466
|
+
df.attrs["odibi_source_files"] = full_path
|
|
467
|
+
return self._process_df(df, post_read_query)
|
|
468
|
+
|
|
469
|
+
df = pd.read_json(full_path, **read_kwargs)
|
|
470
|
+
if hasattr(df, "attrs"):
|
|
471
|
+
df.attrs["odibi_source_files"] = [str(full_path)]
|
|
472
|
+
return self._process_df(df, post_read_query)
|
|
473
|
+
elif format == "excel":
|
|
474
|
+
ctx.debug("Reading Excel file", path=str(full_path))
|
|
475
|
+
read_kwargs.pop("dtype_backend", None)
|
|
476
|
+
return self._process_df(pd.read_excel(full_path, **read_kwargs), post_read_query)
|
|
477
|
+
elif format == "delta":
|
|
478
|
+
ctx.debug("Reading Delta table", path=str(full_path))
|
|
479
|
+
try:
|
|
480
|
+
from deltalake import DeltaTable
|
|
481
|
+
except ImportError:
|
|
482
|
+
ctx.error(
|
|
483
|
+
"Delta Lake library not installed",
|
|
484
|
+
path=str(full_path),
|
|
485
|
+
)
|
|
486
|
+
raise ImportError(
|
|
487
|
+
"Delta Lake support requires 'pip install odibi[pandas]' "
|
|
488
|
+
"or 'pip install deltalake'. See README.md for installation instructions."
|
|
489
|
+
)
|
|
490
|
+
|
|
491
|
+
storage_opts = options.get("storage_options", {})
|
|
492
|
+
version = options.get("versionAsOf")
|
|
493
|
+
timestamp = options.get("timestampAsOf")
|
|
494
|
+
|
|
495
|
+
if timestamp is not None:
|
|
496
|
+
from datetime import datetime as dt_module
|
|
497
|
+
|
|
498
|
+
if isinstance(timestamp, str):
|
|
499
|
+
ts = dt_module.fromisoformat(timestamp.replace("Z", "+00:00"))
|
|
500
|
+
else:
|
|
501
|
+
ts = timestamp
|
|
502
|
+
dt = DeltaTable(full_path, storage_options=storage_opts)
|
|
503
|
+
dt.load_with_datetime(ts)
|
|
504
|
+
ctx.debug("Delta table loaded with timestamp", timestamp=str(ts))
|
|
505
|
+
elif version is not None:
|
|
506
|
+
dt = DeltaTable(full_path, storage_options=storage_opts, version=version)
|
|
507
|
+
ctx.debug("Delta table loaded with version", version=version)
|
|
508
|
+
else:
|
|
509
|
+
dt = DeltaTable(full_path, storage_options=storage_opts)
|
|
510
|
+
ctx.debug("Delta table loaded (latest version)")
|
|
511
|
+
|
|
512
|
+
if self.use_arrow:
|
|
513
|
+
import inspect
|
|
514
|
+
|
|
515
|
+
sig = inspect.signature(dt.to_pandas)
|
|
516
|
+
|
|
517
|
+
if "arrow_options" in sig.parameters:
|
|
518
|
+
return self._process_df(
|
|
519
|
+
dt.to_pandas(
|
|
520
|
+
partitions=None, arrow_options={"types_mapper": pd.ArrowDtype}
|
|
521
|
+
),
|
|
522
|
+
post_read_query,
|
|
523
|
+
)
|
|
524
|
+
else:
|
|
525
|
+
return self._process_df(
|
|
526
|
+
dt.to_pyarrow_table().to_pandas(types_mapper=pd.ArrowDtype),
|
|
527
|
+
post_read_query,
|
|
528
|
+
)
|
|
529
|
+
else:
|
|
530
|
+
return self._process_df(dt.to_pandas(), post_read_query)
|
|
531
|
+
elif format == "avro":
|
|
532
|
+
ctx.debug("Reading Avro file", path=str(full_path))
|
|
533
|
+
try:
|
|
534
|
+
import fastavro
|
|
535
|
+
except ImportError:
|
|
536
|
+
ctx.error(
|
|
537
|
+
"fastavro library not installed",
|
|
538
|
+
path=str(full_path),
|
|
539
|
+
)
|
|
540
|
+
raise ImportError(
|
|
541
|
+
"Avro support requires 'pip install odibi[pandas]' "
|
|
542
|
+
"or 'pip install fastavro'. See README.md for installation instructions."
|
|
543
|
+
)
|
|
544
|
+
|
|
545
|
+
parsed = urlparse(full_path)
|
|
546
|
+
if parsed.scheme and parsed.scheme not in ["file", ""]:
|
|
547
|
+
import fsspec
|
|
548
|
+
|
|
549
|
+
storage_opts = options.get("storage_options", {})
|
|
550
|
+
with fsspec.open(full_path, "rb", **storage_opts) as f:
|
|
551
|
+
reader = fastavro.reader(f)
|
|
552
|
+
records = [record for record in reader]
|
|
553
|
+
return pd.DataFrame(records)
|
|
554
|
+
else:
|
|
555
|
+
with open(full_path, "rb") as f:
|
|
556
|
+
reader = fastavro.reader(f)
|
|
557
|
+
records = [record for record in reader]
|
|
558
|
+
return self._process_df(pd.DataFrame(records), post_read_query)
|
|
559
|
+
elif format in ["sql", "sql_server", "azure_sql"]:
|
|
560
|
+
ctx.debug("Reading SQL table", table=str(full_path), format=format)
|
|
561
|
+
if not hasattr(connection, "read_table"):
|
|
562
|
+
ctx.error(
|
|
563
|
+
"Connection does not support SQL operations",
|
|
564
|
+
connection_type=type(connection).__name__,
|
|
565
|
+
)
|
|
566
|
+
raise ValueError(
|
|
567
|
+
f"Cannot read SQL table '{full_path}': connection type '{type(connection).__name__}' "
|
|
568
|
+
"does not support SQL operations. Use a SQL-compatible connection "
|
|
569
|
+
"(e.g., SqlServerConnection, AzureSqlConnection)."
|
|
570
|
+
)
|
|
571
|
+
|
|
572
|
+
table_name = str(full_path)
|
|
573
|
+
if "." in table_name:
|
|
574
|
+
schema, tbl = table_name.split(".", 1)
|
|
575
|
+
else:
|
|
576
|
+
schema, tbl = "dbo", table_name
|
|
577
|
+
|
|
578
|
+
ctx.debug("Executing SQL read", schema=schema, table=tbl)
|
|
579
|
+
return connection.read_table(table_name=tbl, schema=schema)
|
|
580
|
+
else:
|
|
581
|
+
ctx.error("Unsupported format", format=format)
|
|
582
|
+
raise ValueError(
|
|
583
|
+
f"Unsupported format for Pandas engine: '{format}'. "
|
|
584
|
+
"Supported formats: csv, parquet, json, excel, delta, sql, sql_server, azure_sql."
|
|
585
|
+
)
|
|
586
|
+
|
|
587
|
+
def write(
|
|
588
|
+
self,
|
|
589
|
+
df: Union[pd.DataFrame, Iterator[pd.DataFrame]],
|
|
590
|
+
connection: Any,
|
|
591
|
+
format: str,
|
|
592
|
+
table: Optional[str] = None,
|
|
593
|
+
path: Optional[str] = None,
|
|
594
|
+
register_table: Optional[str] = None,
|
|
595
|
+
mode: str = "overwrite",
|
|
596
|
+
options: Optional[Dict[str, Any]] = None,
|
|
597
|
+
streaming_config: Optional[Any] = None,
|
|
598
|
+
) -> Optional[Dict[str, Any]]:
|
|
599
|
+
"""Write data using Pandas."""
|
|
600
|
+
ctx = get_logging_context().with_context(engine="pandas")
|
|
601
|
+
start = time.time()
|
|
602
|
+
|
|
603
|
+
destination = path or table
|
|
604
|
+
ctx.debug(
|
|
605
|
+
"Starting write operation",
|
|
606
|
+
format=format,
|
|
607
|
+
destination=destination,
|
|
608
|
+
mode=mode,
|
|
609
|
+
)
|
|
610
|
+
|
|
611
|
+
# Ensure materialization if LazyDataset
|
|
612
|
+
df = self.materialize(df)
|
|
613
|
+
|
|
614
|
+
options = options or {}
|
|
615
|
+
|
|
616
|
+
# Handle iterator/generator input
|
|
617
|
+
from collections.abc import Iterator
|
|
618
|
+
|
|
619
|
+
if isinstance(df, Iterator):
|
|
620
|
+
ctx.debug("Writing iterator/generator input")
|
|
621
|
+
return self._write_iterator(df, connection, format, table, path, mode, options)
|
|
622
|
+
|
|
623
|
+
row_count = len(df)
|
|
624
|
+
memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
|
|
625
|
+
|
|
626
|
+
ctx.log_pandas_metrics(
|
|
627
|
+
memory_mb=memory_mb,
|
|
628
|
+
dtypes={col: str(dtype) for col, dtype in df.dtypes.items()},
|
|
629
|
+
)
|
|
630
|
+
|
|
631
|
+
# SQL Server / Azure SQL Support
|
|
632
|
+
if format in ["sql", "sql_server", "azure_sql"]:
|
|
633
|
+
ctx.debug("Writing to SQL", table=table, mode=mode)
|
|
634
|
+
return self._write_sql(df, connection, table, mode, options)
|
|
635
|
+
|
|
636
|
+
# Resolve full path from connection
|
|
637
|
+
try:
|
|
638
|
+
full_path = self._resolve_path(path or table, connection)
|
|
639
|
+
except ValueError:
|
|
640
|
+
if table and not connection:
|
|
641
|
+
ctx.error("Connection required when specifying 'table'", table=table)
|
|
642
|
+
raise ValueError("Connection is required when specifying 'table'.")
|
|
643
|
+
ctx.error("Neither path nor table provided for write operation")
|
|
644
|
+
raise ValueError("Either path or table must be provided")
|
|
645
|
+
|
|
646
|
+
# Merge storage options for cloud connections
|
|
647
|
+
merged_options = self._merge_storage_options(connection, options)
|
|
648
|
+
|
|
649
|
+
# Custom Writers
|
|
650
|
+
if format in self._custom_writers:
|
|
651
|
+
ctx.debug(f"Using custom writer for format: {format}")
|
|
652
|
+
writer_options = merged_options.copy()
|
|
653
|
+
writer_options.pop("keys", None)
|
|
654
|
+
self._custom_writers[format](df, full_path, mode=mode, **writer_options)
|
|
655
|
+
return None
|
|
656
|
+
|
|
657
|
+
# Ensure directory exists (local only)
|
|
658
|
+
self._ensure_directory(full_path)
|
|
659
|
+
|
|
660
|
+
# Warn about partitioning
|
|
661
|
+
self._check_partitioning(merged_options)
|
|
662
|
+
|
|
663
|
+
# Delta Lake Write
|
|
664
|
+
if format == "delta":
|
|
665
|
+
ctx.debug("Writing Delta table", path=str(full_path), mode=mode)
|
|
666
|
+
result = self._write_delta(df, full_path, mode, merged_options)
|
|
667
|
+
elapsed = (time.time() - start) * 1000
|
|
668
|
+
ctx.log_file_io(
|
|
669
|
+
path=str(full_path),
|
|
670
|
+
format=format,
|
|
671
|
+
mode=mode,
|
|
672
|
+
rows=row_count,
|
|
673
|
+
)
|
|
674
|
+
ctx.info(
|
|
675
|
+
"Write completed",
|
|
676
|
+
format=format,
|
|
677
|
+
rows=row_count,
|
|
678
|
+
elapsed_ms=round(elapsed, 2),
|
|
679
|
+
)
|
|
680
|
+
return result
|
|
681
|
+
|
|
682
|
+
# Handle Generic Upsert/Append-Once for non-Delta
|
|
683
|
+
if mode in ["upsert", "append_once"]:
|
|
684
|
+
ctx.debug(f"Handling {mode} mode for non-Delta format")
|
|
685
|
+
df, mode = self._handle_generic_upsert(df, full_path, format, mode, merged_options)
|
|
686
|
+
row_count = len(df)
|
|
687
|
+
|
|
688
|
+
# Standard File Write
|
|
689
|
+
result = self._write_file(df, full_path, format, mode, merged_options)
|
|
690
|
+
|
|
691
|
+
elapsed = (time.time() - start) * 1000
|
|
692
|
+
ctx.log_file_io(
|
|
693
|
+
path=str(full_path),
|
|
694
|
+
format=format,
|
|
695
|
+
mode=mode,
|
|
696
|
+
rows=row_count,
|
|
697
|
+
)
|
|
698
|
+
ctx.info(
|
|
699
|
+
"Write completed",
|
|
700
|
+
format=format,
|
|
701
|
+
rows=row_count,
|
|
702
|
+
elapsed_ms=round(elapsed, 2),
|
|
703
|
+
)
|
|
704
|
+
|
|
705
|
+
return result
|
|
706
|
+
|
|
707
|
+
def _write_iterator(
|
|
708
|
+
self,
|
|
709
|
+
df_iter: Iterator[pd.DataFrame],
|
|
710
|
+
connection: Any,
|
|
711
|
+
format: str,
|
|
712
|
+
table: Optional[str],
|
|
713
|
+
path: Optional[str],
|
|
714
|
+
mode: str,
|
|
715
|
+
options: Dict[str, Any],
|
|
716
|
+
) -> None:
|
|
717
|
+
"""Handle writing of iterator/generator."""
|
|
718
|
+
first_chunk = True
|
|
719
|
+
for chunk in df_iter:
|
|
720
|
+
# Determine mode for this chunk
|
|
721
|
+
current_mode = mode if first_chunk else "append"
|
|
722
|
+
current_options = options.copy()
|
|
723
|
+
|
|
724
|
+
# Handle CSV header for chunks
|
|
725
|
+
if not first_chunk and format == "csv":
|
|
726
|
+
if current_options.get("header") is not False:
|
|
727
|
+
current_options["header"] = False
|
|
728
|
+
|
|
729
|
+
self.write(
|
|
730
|
+
chunk,
|
|
731
|
+
connection,
|
|
732
|
+
format,
|
|
733
|
+
table,
|
|
734
|
+
path,
|
|
735
|
+
mode=current_mode,
|
|
736
|
+
options=current_options,
|
|
737
|
+
)
|
|
738
|
+
first_chunk = False
|
|
739
|
+
return None
|
|
740
|
+
|
|
741
|
+
def _write_sql(
|
|
742
|
+
self,
|
|
743
|
+
df: pd.DataFrame,
|
|
744
|
+
connection: Any,
|
|
745
|
+
table: Optional[str],
|
|
746
|
+
mode: str,
|
|
747
|
+
options: Dict[str, Any],
|
|
748
|
+
) -> Optional[Dict[str, Any]]:
|
|
749
|
+
"""Handle SQL writing including merge and enhanced overwrite."""
|
|
750
|
+
ctx = get_logging_context().with_context(engine="pandas")
|
|
751
|
+
|
|
752
|
+
if not hasattr(connection, "write_table"):
|
|
753
|
+
raise ValueError(
|
|
754
|
+
f"Connection type '{type(connection).__name__}' does not support SQL operations"
|
|
755
|
+
)
|
|
756
|
+
|
|
757
|
+
if not table:
|
|
758
|
+
raise ValueError("SQL format requires 'table' config")
|
|
759
|
+
|
|
760
|
+
# Handle MERGE mode for SQL Server
|
|
761
|
+
if mode == "merge":
|
|
762
|
+
merge_keys = options.get("merge_keys")
|
|
763
|
+
merge_options = options.get("merge_options")
|
|
764
|
+
|
|
765
|
+
if not merge_keys:
|
|
766
|
+
raise ValueError(
|
|
767
|
+
"MERGE mode requires 'merge_keys' in options. "
|
|
768
|
+
"Specify the key columns for the MERGE ON clause."
|
|
769
|
+
)
|
|
770
|
+
|
|
771
|
+
from odibi.writers.sql_server_writer import SqlServerMergeWriter
|
|
772
|
+
|
|
773
|
+
writer = SqlServerMergeWriter(connection)
|
|
774
|
+
ctx.debug(
|
|
775
|
+
"Executing SQL Server MERGE (Pandas)",
|
|
776
|
+
target=table,
|
|
777
|
+
merge_keys=merge_keys,
|
|
778
|
+
)
|
|
779
|
+
|
|
780
|
+
result = writer.merge_pandas(
|
|
781
|
+
df=df,
|
|
782
|
+
target_table=table,
|
|
783
|
+
merge_keys=merge_keys,
|
|
784
|
+
options=merge_options,
|
|
785
|
+
)
|
|
786
|
+
|
|
787
|
+
ctx.info(
|
|
788
|
+
"SQL Server MERGE completed (Pandas)",
|
|
789
|
+
target=table,
|
|
790
|
+
inserted=result.inserted,
|
|
791
|
+
updated=result.updated,
|
|
792
|
+
deleted=result.deleted,
|
|
793
|
+
)
|
|
794
|
+
|
|
795
|
+
return {
|
|
796
|
+
"mode": "merge",
|
|
797
|
+
"inserted": result.inserted,
|
|
798
|
+
"updated": result.updated,
|
|
799
|
+
"deleted": result.deleted,
|
|
800
|
+
"total_affected": result.total_affected,
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
# Handle enhanced overwrite with strategies
|
|
804
|
+
if mode == "overwrite" and options.get("overwrite_options"):
|
|
805
|
+
from odibi.writers.sql_server_writer import SqlServerMergeWriter
|
|
806
|
+
|
|
807
|
+
overwrite_options = options.get("overwrite_options")
|
|
808
|
+
writer = SqlServerMergeWriter(connection)
|
|
809
|
+
|
|
810
|
+
ctx.debug(
|
|
811
|
+
"Executing SQL Server enhanced overwrite (Pandas)",
|
|
812
|
+
target=table,
|
|
813
|
+
strategy=(
|
|
814
|
+
overwrite_options.strategy.value
|
|
815
|
+
if hasattr(overwrite_options, "strategy")
|
|
816
|
+
else "truncate_insert"
|
|
817
|
+
),
|
|
818
|
+
)
|
|
819
|
+
|
|
820
|
+
result = writer.overwrite_pandas(
|
|
821
|
+
df=df,
|
|
822
|
+
target_table=table,
|
|
823
|
+
options=overwrite_options,
|
|
824
|
+
)
|
|
825
|
+
|
|
826
|
+
ctx.info(
|
|
827
|
+
"SQL Server enhanced overwrite completed (Pandas)",
|
|
828
|
+
target=table,
|
|
829
|
+
strategy=result.strategy,
|
|
830
|
+
rows_written=result.rows_written,
|
|
831
|
+
)
|
|
832
|
+
|
|
833
|
+
return {
|
|
834
|
+
"mode": "overwrite",
|
|
835
|
+
"strategy": result.strategy,
|
|
836
|
+
"rows_written": result.rows_written,
|
|
837
|
+
}
|
|
838
|
+
|
|
839
|
+
# Extract schema from table name if present
|
|
840
|
+
if "." in table:
|
|
841
|
+
schema, table_name = table.split(".", 1)
|
|
842
|
+
else:
|
|
843
|
+
schema, table_name = "dbo", table
|
|
844
|
+
|
|
845
|
+
# Map mode to if_exists
|
|
846
|
+
if_exists = "replace" # overwrite
|
|
847
|
+
if mode == "append":
|
|
848
|
+
if_exists = "append"
|
|
849
|
+
elif mode == "fail":
|
|
850
|
+
if_exists = "fail"
|
|
851
|
+
|
|
852
|
+
chunksize = options.get("chunksize", 1000)
|
|
853
|
+
|
|
854
|
+
connection.write_table(
|
|
855
|
+
df=df,
|
|
856
|
+
table_name=table_name,
|
|
857
|
+
schema=schema,
|
|
858
|
+
if_exists=if_exists,
|
|
859
|
+
chunksize=chunksize,
|
|
860
|
+
)
|
|
861
|
+
return None
|
|
862
|
+
|
|
863
|
+
def _ensure_directory(self, full_path: str) -> None:
|
|
864
|
+
"""Ensure parent directory exists for local files."""
|
|
865
|
+
parsed = urlparse(str(full_path))
|
|
866
|
+
is_windows_drive = (
|
|
867
|
+
len(parsed.scheme) == 1 and parsed.scheme.isalpha() if parsed.scheme else False
|
|
868
|
+
)
|
|
869
|
+
|
|
870
|
+
if not parsed.scheme or parsed.scheme == "file" or is_windows_drive:
|
|
871
|
+
Path(full_path).parent.mkdir(parents=True, exist_ok=True)
|
|
872
|
+
|
|
873
|
+
def _check_partitioning(self, options: Dict[str, Any]) -> None:
|
|
874
|
+
"""Warn about potential partitioning issues."""
|
|
875
|
+
partition_by = options.get("partition_by") or options.get("partitionBy")
|
|
876
|
+
if partition_by:
|
|
877
|
+
import warnings
|
|
878
|
+
|
|
879
|
+
warnings.warn(
|
|
880
|
+
"⚠️ Partitioning can cause performance issues if misused. "
|
|
881
|
+
"Only partition on low-cardinality columns (< 1000 unique values) "
|
|
882
|
+
"and ensure each partition has > 1000 rows.",
|
|
883
|
+
UserWarning,
|
|
884
|
+
)
|
|
885
|
+
|
|
886
|
+
def _write_delta(
|
|
887
|
+
self,
|
|
888
|
+
df: pd.DataFrame,
|
|
889
|
+
full_path: str,
|
|
890
|
+
mode: str,
|
|
891
|
+
merged_options: Dict[str, Any],
|
|
892
|
+
) -> Dict[str, Any]:
|
|
893
|
+
"""Handle Delta Lake writing."""
|
|
894
|
+
try:
|
|
895
|
+
from deltalake import DeltaTable, write_deltalake
|
|
896
|
+
except ImportError:
|
|
897
|
+
raise ImportError(
|
|
898
|
+
"Delta Lake support requires 'pip install odibi[pandas]' or 'pip install deltalake'. "
|
|
899
|
+
"See README.md for installation instructions."
|
|
900
|
+
)
|
|
901
|
+
|
|
902
|
+
storage_opts = merged_options.get("storage_options", {})
|
|
903
|
+
|
|
904
|
+
# Handle null-only columns: Delta Lake doesn't support Null dtype
|
|
905
|
+
# Cast columns with all-null values to string to avoid schema errors
|
|
906
|
+
for col in df.columns:
|
|
907
|
+
if df[col].isna().all():
|
|
908
|
+
df[col] = df[col].astype("string")
|
|
909
|
+
|
|
910
|
+
# Map modes
|
|
911
|
+
delta_mode = "overwrite"
|
|
912
|
+
if mode == "append":
|
|
913
|
+
delta_mode = "append"
|
|
914
|
+
elif mode == "error" or mode == "fail":
|
|
915
|
+
delta_mode = "error"
|
|
916
|
+
elif mode == "ignore":
|
|
917
|
+
delta_mode = "ignore"
|
|
918
|
+
|
|
919
|
+
# Handle upsert/append_once logic
|
|
920
|
+
if mode == "upsert":
|
|
921
|
+
keys = merged_options.get("keys")
|
|
922
|
+
if not keys:
|
|
923
|
+
raise ValueError("Upsert requires 'keys' in options")
|
|
924
|
+
|
|
925
|
+
if isinstance(keys, str):
|
|
926
|
+
keys = [keys]
|
|
927
|
+
|
|
928
|
+
def do_upsert():
|
|
929
|
+
dt = DeltaTable(full_path, storage_options=storage_opts)
|
|
930
|
+
(
|
|
931
|
+
dt.merge(
|
|
932
|
+
source=df,
|
|
933
|
+
predicate=" AND ".join([f"s.{k} = t.{k}" for k in keys]),
|
|
934
|
+
source_alias="s",
|
|
935
|
+
target_alias="t",
|
|
936
|
+
)
|
|
937
|
+
.when_matched_update_all()
|
|
938
|
+
.when_not_matched_insert_all()
|
|
939
|
+
.execute()
|
|
940
|
+
)
|
|
941
|
+
|
|
942
|
+
self._retry_delta_operation(do_upsert)
|
|
943
|
+
elif mode == "append_once":
|
|
944
|
+
keys = merged_options.get("keys")
|
|
945
|
+
if not keys:
|
|
946
|
+
raise ValueError("Append_once requires 'keys' in options")
|
|
947
|
+
|
|
948
|
+
if isinstance(keys, str):
|
|
949
|
+
keys = [keys]
|
|
950
|
+
|
|
951
|
+
def do_append_once():
|
|
952
|
+
dt = DeltaTable(full_path, storage_options=storage_opts)
|
|
953
|
+
(
|
|
954
|
+
dt.merge(
|
|
955
|
+
source=df,
|
|
956
|
+
predicate=" AND ".join([f"s.{k} = t.{k}" for k in keys]),
|
|
957
|
+
source_alias="s",
|
|
958
|
+
target_alias="t",
|
|
959
|
+
)
|
|
960
|
+
.when_not_matched_insert_all()
|
|
961
|
+
.execute()
|
|
962
|
+
)
|
|
963
|
+
|
|
964
|
+
self._retry_delta_operation(do_append_once)
|
|
965
|
+
else:
|
|
966
|
+
# Filter options supported by write_deltalake
|
|
967
|
+
write_kwargs = {
|
|
968
|
+
k: v
|
|
969
|
+
for k, v in merged_options.items()
|
|
970
|
+
if k
|
|
971
|
+
in [
|
|
972
|
+
"partition_by",
|
|
973
|
+
"mode",
|
|
974
|
+
"overwrite_schema",
|
|
975
|
+
"schema_mode",
|
|
976
|
+
"name",
|
|
977
|
+
"description",
|
|
978
|
+
"configuration",
|
|
979
|
+
]
|
|
980
|
+
}
|
|
981
|
+
|
|
982
|
+
def do_write():
|
|
983
|
+
write_deltalake(
|
|
984
|
+
full_path, df, mode=delta_mode, storage_options=storage_opts, **write_kwargs
|
|
985
|
+
)
|
|
986
|
+
|
|
987
|
+
self._retry_delta_operation(do_write)
|
|
988
|
+
|
|
989
|
+
# Return commit info
|
|
990
|
+
dt = DeltaTable(full_path, storage_options=storage_opts)
|
|
991
|
+
history = dt.history(limit=1)
|
|
992
|
+
latest = history[0]
|
|
993
|
+
|
|
994
|
+
return {
|
|
995
|
+
"version": dt.version(),
|
|
996
|
+
"timestamp": datetime.fromtimestamp(latest.get("timestamp", 0) / 1000),
|
|
997
|
+
"operation": latest.get("operation"),
|
|
998
|
+
"operation_metrics": latest.get("operationMetrics", {}),
|
|
999
|
+
"read_version": latest.get("readVersion"),
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
def _handle_generic_upsert(
|
|
1003
|
+
self,
|
|
1004
|
+
df: pd.DataFrame,
|
|
1005
|
+
full_path: str,
|
|
1006
|
+
format: str,
|
|
1007
|
+
mode: str,
|
|
1008
|
+
options: Dict[str, Any],
|
|
1009
|
+
) -> tuple[pd.DataFrame, str]:
|
|
1010
|
+
"""Handle upsert/append_once for standard files by merging with existing data."""
|
|
1011
|
+
if "keys" not in options:
|
|
1012
|
+
raise ValueError(f"Mode '{mode}' requires 'keys' list in options")
|
|
1013
|
+
|
|
1014
|
+
keys = options["keys"]
|
|
1015
|
+
if isinstance(keys, str):
|
|
1016
|
+
keys = [keys]
|
|
1017
|
+
|
|
1018
|
+
# Try to read existing file
|
|
1019
|
+
existing_df = None
|
|
1020
|
+
try:
|
|
1021
|
+
read_opts = options.copy()
|
|
1022
|
+
read_opts.pop("keys", None)
|
|
1023
|
+
|
|
1024
|
+
if format == "csv":
|
|
1025
|
+
existing_df = pd.read_csv(full_path, **read_opts)
|
|
1026
|
+
elif format == "parquet":
|
|
1027
|
+
existing_df = pd.read_parquet(full_path, **read_opts)
|
|
1028
|
+
elif format == "json":
|
|
1029
|
+
existing_df = pd.read_json(full_path, **read_opts)
|
|
1030
|
+
elif format == "excel":
|
|
1031
|
+
existing_df = pd.read_excel(full_path, **read_opts)
|
|
1032
|
+
except Exception:
|
|
1033
|
+
# File doesn't exist or can't be read
|
|
1034
|
+
return df, "overwrite" # Treat as new write
|
|
1035
|
+
|
|
1036
|
+
if existing_df is None:
|
|
1037
|
+
return df, "overwrite"
|
|
1038
|
+
|
|
1039
|
+
if mode == "append_once":
|
|
1040
|
+
# Check if keys exist
|
|
1041
|
+
missing_keys = set(keys) - set(df.columns)
|
|
1042
|
+
if missing_keys:
|
|
1043
|
+
raise KeyError(f"Keys {missing_keys} not found in input data")
|
|
1044
|
+
|
|
1045
|
+
# Identify new rows
|
|
1046
|
+
merged = df.merge(existing_df[keys], on=keys, how="left", indicator=True)
|
|
1047
|
+
new_rows = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
|
|
1048
|
+
|
|
1049
|
+
if format in ["csv", "json"]:
|
|
1050
|
+
return new_rows, "append"
|
|
1051
|
+
else:
|
|
1052
|
+
# Rewrite everything
|
|
1053
|
+
return pd.concat([existing_df, new_rows], ignore_index=True), "overwrite"
|
|
1054
|
+
|
|
1055
|
+
elif mode == "upsert":
|
|
1056
|
+
# Check if keys exist
|
|
1057
|
+
missing_keys = set(keys) - set(df.columns)
|
|
1058
|
+
if missing_keys:
|
|
1059
|
+
raise KeyError(f"Keys {missing_keys} not found in input data")
|
|
1060
|
+
|
|
1061
|
+
# 1. Remove rows from existing that are in input
|
|
1062
|
+
merged_indicator = existing_df.merge(df[keys], on=keys, how="left", indicator=True)
|
|
1063
|
+
rows_to_keep = existing_df[merged_indicator["_merge"] == "left_only"]
|
|
1064
|
+
|
|
1065
|
+
# 2. Concat rows_to_keep + input df
|
|
1066
|
+
# 3. Write mode becomes overwrite
|
|
1067
|
+
return pd.concat([rows_to_keep, df], ignore_index=True), "overwrite"
|
|
1068
|
+
|
|
1069
|
+
return df, mode
|
|
1070
|
+
|
|
1071
|
+
def _write_file(
|
|
1072
|
+
self,
|
|
1073
|
+
df: pd.DataFrame,
|
|
1074
|
+
full_path: str,
|
|
1075
|
+
format: str,
|
|
1076
|
+
mode: str,
|
|
1077
|
+
merged_options: Dict[str, Any],
|
|
1078
|
+
) -> None:
|
|
1079
|
+
"""Handle standard file writing (CSV, Parquet, etc.)."""
|
|
1080
|
+
writer_options = merged_options.copy()
|
|
1081
|
+
writer_options.pop("keys", None)
|
|
1082
|
+
|
|
1083
|
+
# Remove storage_options for local pandas writers usually?
|
|
1084
|
+
# Some pandas writers accept storage_options (parquet, csv with fsspec)
|
|
1085
|
+
|
|
1086
|
+
if format == "csv":
|
|
1087
|
+
mode_param = "w"
|
|
1088
|
+
if mode == "append":
|
|
1089
|
+
mode_param = "a"
|
|
1090
|
+
if not os.path.exists(full_path):
|
|
1091
|
+
# If file doesn't exist, include header
|
|
1092
|
+
writer_options["header"] = True
|
|
1093
|
+
else:
|
|
1094
|
+
# If appending, don't write header unless explicit
|
|
1095
|
+
if "header" not in writer_options:
|
|
1096
|
+
writer_options["header"] = False
|
|
1097
|
+
|
|
1098
|
+
df.to_csv(full_path, index=False, mode=mode_param, **writer_options)
|
|
1099
|
+
|
|
1100
|
+
elif format == "parquet":
|
|
1101
|
+
if mode == "append":
|
|
1102
|
+
# Pandas read_parquet doesn't support append directly usually.
|
|
1103
|
+
# We implement simple read-concat-write for local files
|
|
1104
|
+
if os.path.exists(full_path):
|
|
1105
|
+
existing = pd.read_parquet(full_path, **merged_options)
|
|
1106
|
+
df = pd.concat([existing, df], ignore_index=True)
|
|
1107
|
+
|
|
1108
|
+
df.to_parquet(full_path, index=False, **writer_options)
|
|
1109
|
+
|
|
1110
|
+
elif format == "json":
|
|
1111
|
+
if mode == "append":
|
|
1112
|
+
writer_options["mode"] = "a"
|
|
1113
|
+
|
|
1114
|
+
# Default to records if not specified
|
|
1115
|
+
if "orient" not in writer_options:
|
|
1116
|
+
writer_options["orient"] = "records"
|
|
1117
|
+
|
|
1118
|
+
# Include storage_options for cloud storage (ADLS, S3, GCS)
|
|
1119
|
+
if "storage_options" in merged_options:
|
|
1120
|
+
writer_options["storage_options"] = merged_options["storage_options"]
|
|
1121
|
+
|
|
1122
|
+
df.to_json(full_path, **writer_options)
|
|
1123
|
+
|
|
1124
|
+
elif format == "excel":
|
|
1125
|
+
if mode == "append":
|
|
1126
|
+
# Simple append for excel
|
|
1127
|
+
if os.path.exists(full_path):
|
|
1128
|
+
with pd.ExcelWriter(full_path, mode="a", if_sheet_exists="overlay") as writer:
|
|
1129
|
+
df.to_excel(writer, index=False, **writer_options)
|
|
1130
|
+
return
|
|
1131
|
+
|
|
1132
|
+
df.to_excel(full_path, index=False, **writer_options)
|
|
1133
|
+
|
|
1134
|
+
elif format == "avro":
|
|
1135
|
+
try:
|
|
1136
|
+
import fastavro
|
|
1137
|
+
except ImportError:
|
|
1138
|
+
raise ImportError("Avro support requires 'pip install fastavro'")
|
|
1139
|
+
|
|
1140
|
+
# Convert datetime columns to microseconds for Avro timestamp-micros
|
|
1141
|
+
df_avro = df.copy()
|
|
1142
|
+
for col in df_avro.columns:
|
|
1143
|
+
if pd.api.types.is_datetime64_any_dtype(df_avro[col].dtype):
|
|
1144
|
+
df_avro[col] = df_avro[col].apply(
|
|
1145
|
+
lambda x: int(x.timestamp() * 1_000_000) if pd.notna(x) else None
|
|
1146
|
+
)
|
|
1147
|
+
|
|
1148
|
+
records = df_avro.to_dict("records")
|
|
1149
|
+
schema = self._infer_avro_schema(df)
|
|
1150
|
+
|
|
1151
|
+
# Use fsspec for remote URIs (abfss://, s3://, etc.)
|
|
1152
|
+
parsed = urlparse(full_path)
|
|
1153
|
+
if parsed.scheme and parsed.scheme not in ["file", ""]:
|
|
1154
|
+
# Remote file - use fsspec
|
|
1155
|
+
import fsspec
|
|
1156
|
+
|
|
1157
|
+
storage_opts = merged_options.get("storage_options", {})
|
|
1158
|
+
write_mode = "wb" if mode == "overwrite" else "ab"
|
|
1159
|
+
with fsspec.open(full_path, write_mode, **storage_opts) as f:
|
|
1160
|
+
fastavro.writer(f, schema, records)
|
|
1161
|
+
else:
|
|
1162
|
+
# Local file - use standard open
|
|
1163
|
+
open_mode = "wb"
|
|
1164
|
+
if mode == "append" and os.path.exists(full_path):
|
|
1165
|
+
open_mode = "a+b"
|
|
1166
|
+
|
|
1167
|
+
with open(full_path, open_mode) as f:
|
|
1168
|
+
fastavro.writer(f, schema, records)
|
|
1169
|
+
else:
|
|
1170
|
+
raise ValueError(f"Unsupported format for Pandas engine: {format}")
|
|
1171
|
+
|
|
1172
|
+
def add_write_metadata(
|
|
1173
|
+
self,
|
|
1174
|
+
df: pd.DataFrame,
|
|
1175
|
+
metadata_config: Any,
|
|
1176
|
+
source_connection: Optional[str] = None,
|
|
1177
|
+
source_table: Optional[str] = None,
|
|
1178
|
+
source_path: Optional[str] = None,
|
|
1179
|
+
is_file_source: bool = False,
|
|
1180
|
+
) -> pd.DataFrame:
|
|
1181
|
+
"""Add metadata columns to DataFrame before writing (Bronze layer lineage).
|
|
1182
|
+
|
|
1183
|
+
Args:
|
|
1184
|
+
df: Pandas DataFrame
|
|
1185
|
+
metadata_config: WriteMetadataConfig or True (for all defaults)
|
|
1186
|
+
source_connection: Name of the source connection
|
|
1187
|
+
source_table: Name of the source table (SQL sources)
|
|
1188
|
+
source_path: Path of the source file (file sources)
|
|
1189
|
+
is_file_source: True if source is a file-based read
|
|
1190
|
+
|
|
1191
|
+
Returns:
|
|
1192
|
+
DataFrame with metadata columns added
|
|
1193
|
+
"""
|
|
1194
|
+
from odibi.config import WriteMetadataConfig
|
|
1195
|
+
|
|
1196
|
+
# Normalize config: True -> all defaults
|
|
1197
|
+
if metadata_config is True:
|
|
1198
|
+
config = WriteMetadataConfig()
|
|
1199
|
+
elif isinstance(metadata_config, WriteMetadataConfig):
|
|
1200
|
+
config = metadata_config
|
|
1201
|
+
else:
|
|
1202
|
+
return df # None or invalid -> no metadata
|
|
1203
|
+
|
|
1204
|
+
# Work on a copy to avoid modifying original
|
|
1205
|
+
df = df.copy()
|
|
1206
|
+
|
|
1207
|
+
# _extracted_at: always applicable
|
|
1208
|
+
if config.extracted_at:
|
|
1209
|
+
df["_extracted_at"] = pd.Timestamp.now()
|
|
1210
|
+
|
|
1211
|
+
# _source_file: only for file sources
|
|
1212
|
+
if config.source_file and is_file_source and source_path:
|
|
1213
|
+
df["_source_file"] = source_path
|
|
1214
|
+
|
|
1215
|
+
# _source_connection: all sources
|
|
1216
|
+
if config.source_connection and source_connection:
|
|
1217
|
+
df["_source_connection"] = source_connection
|
|
1218
|
+
|
|
1219
|
+
# _source_table: SQL sources only
|
|
1220
|
+
if config.source_table and source_table:
|
|
1221
|
+
df["_source_table"] = source_table
|
|
1222
|
+
|
|
1223
|
+
return df
|
|
1224
|
+
|
|
1225
|
+
def _register_lazy_view_unused(self, conn, name: str, df: Any) -> None:
|
|
1226
|
+
"""Register a LazyDataset as a DuckDB view."""
|
|
1227
|
+
duck_fmt = df.format
|
|
1228
|
+
if duck_fmt == "json":
|
|
1229
|
+
duck_fmt = "json_auto"
|
|
1230
|
+
|
|
1231
|
+
if isinstance(df.path, list):
|
|
1232
|
+
paths = ", ".join([f"'{p}'" for p in df.path])
|
|
1233
|
+
conn.execute(
|
|
1234
|
+
f"CREATE OR REPLACE VIEW {name} AS SELECT * FROM read_{duck_fmt}([{paths}])"
|
|
1235
|
+
)
|
|
1236
|
+
else:
|
|
1237
|
+
conn.execute(
|
|
1238
|
+
f"CREATE OR REPLACE VIEW {name} AS SELECT * FROM read_{duck_fmt}('{df.path}')"
|
|
1239
|
+
)
|
|
1240
|
+
|
|
1241
|
+
def execute_sql(self, sql: str, context: Context) -> pd.DataFrame:
|
|
1242
|
+
"""Execute SQL query using DuckDB (if available) or pandasql.
|
|
1243
|
+
|
|
1244
|
+
Args:
|
|
1245
|
+
sql: SQL query string
|
|
1246
|
+
context: Execution context
|
|
1247
|
+
|
|
1248
|
+
Returns:
|
|
1249
|
+
Result DataFrame
|
|
1250
|
+
"""
|
|
1251
|
+
if not isinstance(context, PandasContext):
|
|
1252
|
+
raise TypeError("PandasEngine requires PandasContext")
|
|
1253
|
+
|
|
1254
|
+
# Try to use DuckDB for SQL
|
|
1255
|
+
try:
|
|
1256
|
+
import duckdb
|
|
1257
|
+
|
|
1258
|
+
# Create in-memory database
|
|
1259
|
+
conn = duckdb.connect(":memory:")
|
|
1260
|
+
|
|
1261
|
+
# Register all DataFrames from context
|
|
1262
|
+
for name in context.list_names():
|
|
1263
|
+
dataset_obj = context.get(name)
|
|
1264
|
+
|
|
1265
|
+
# Debug check
|
|
1266
|
+
# print(f"DEBUG: Registering {name} type={type(dataset_obj)} LazyDataset={LazyDataset}")
|
|
1267
|
+
|
|
1268
|
+
# Handle LazyDataset (DuckDB optimization)
|
|
1269
|
+
# if isinstance(dataset_obj, LazyDataset):
|
|
1270
|
+
# self._register_lazy_view(conn, name, dataset_obj)
|
|
1271
|
+
# # Log that we used DuckDB on file
|
|
1272
|
+
# # logger.info(f"Executing SQL via DuckDB on lazy file: {dataset_obj.path}")
|
|
1273
|
+
# continue
|
|
1274
|
+
|
|
1275
|
+
# Handle chunked data (Iterator)
|
|
1276
|
+
from collections.abc import Iterator
|
|
1277
|
+
|
|
1278
|
+
if isinstance(dataset_obj, Iterator):
|
|
1279
|
+
# Warning: Materializing iterator for SQL execution
|
|
1280
|
+
# Note: DuckDB doesn't support streaming from iterator yet
|
|
1281
|
+
dataset_obj = pd.concat(dataset_obj, ignore_index=True)
|
|
1282
|
+
|
|
1283
|
+
conn.register(name, dataset_obj)
|
|
1284
|
+
|
|
1285
|
+
# Execute query
|
|
1286
|
+
result = conn.execute(sql).df()
|
|
1287
|
+
conn.close()
|
|
1288
|
+
|
|
1289
|
+
return result
|
|
1290
|
+
|
|
1291
|
+
except ImportError:
|
|
1292
|
+
# Fallback: try pandasql
|
|
1293
|
+
try:
|
|
1294
|
+
from pandasql import sqldf
|
|
1295
|
+
|
|
1296
|
+
# Build local namespace with DataFrames
|
|
1297
|
+
locals_dict = {}
|
|
1298
|
+
for name in context.list_names():
|
|
1299
|
+
df = context.get(name)
|
|
1300
|
+
|
|
1301
|
+
# Handle chunked data (Iterator)
|
|
1302
|
+
from collections.abc import Iterator
|
|
1303
|
+
|
|
1304
|
+
if isinstance(df, Iterator):
|
|
1305
|
+
df = pd.concat(df, ignore_index=True)
|
|
1306
|
+
|
|
1307
|
+
locals_dict[name] = df
|
|
1308
|
+
|
|
1309
|
+
return sqldf(sql, locals_dict)
|
|
1310
|
+
|
|
1311
|
+
except ImportError:
|
|
1312
|
+
raise TransformError(
|
|
1313
|
+
"SQL execution requires 'duckdb' or 'pandasql'. "
|
|
1314
|
+
"Install with: pip install duckdb"
|
|
1315
|
+
)
|
|
1316
|
+
|
|
    def execute_operation(
        self,
        operation: str,
        params: Dict[str, Any],
        df: Union[pd.DataFrame, Iterator[pd.DataFrame]],
    ) -> pd.DataFrame:
        """Execute built-in operation.

        Args:
            operation: Operation name
            params: Operation parameters
            df: Input DataFrame or Iterator

        Returns:
            Result DataFrame
        """
        # Materialize LazyDataset
        df = self.materialize(df)

        # Handle chunked data (Iterator)
        from collections.abc import Iterator

        if isinstance(df, Iterator):
            # Warning: Materializing iterator for operation execution
            df = pd.concat(df, ignore_index=True)

        if operation == "pivot":
            return self._pivot(df, params)
        elif operation == "drop_duplicates":
            return df.drop_duplicates(**params)
        elif operation == "fillna":
            return df.fillna(**params)
        elif operation == "drop":
            return df.drop(**params)
        elif operation == "rename":
            return df.rename(**params)
        elif operation == "sort":
            return df.sort_values(**params)
        elif operation == "sample":
            return df.sample(**params)
        else:
            # Fallback: check if operation is a registered transformer
            from odibi.context import EngineContext, PandasContext
            from odibi.registry import FunctionRegistry

            if FunctionRegistry.has_function(operation):
                func = FunctionRegistry.get_function(operation)
                param_model = FunctionRegistry.get_param_model(operation)

                # Create EngineContext from current df
                engine_ctx = EngineContext(
                    context=PandasContext(),
                    df=df,
                    engine=self,
                    engine_type=self.engine_type,
                )

                # Validate and instantiate params
                if param_model:
                    validated_params = param_model(**params)
                    result_ctx = func(engine_ctx, validated_params)
                else:
                    result_ctx = func(engine_ctx, **params)

                return result_ctx.df

            raise ValueError(f"Unsupported operation: {operation}")

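    # Illustrative usage sketch (hypothetical `engine` instance, not from the source):
    # built-in operations forward `params` as keyword arguments to the matching pandas
    # method, e.g.
    #
    #   deduped = engine.execute_operation(
    #       "drop_duplicates", {"subset": ["id"], "keep": "last"}, df
    #   )
    #   renamed = engine.execute_operation(
    #       "rename", {"columns": {"old_name": "new_name"}}, df
    #   )
    #
    # Any other operation name falls through to the FunctionRegistry lookup shown above.
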
    def _pivot(self, df: pd.DataFrame, params: Dict[str, Any]) -> pd.DataFrame:
        """Execute pivot operation.

        Args:
            df: Input DataFrame
            params: Pivot parameters

        Returns:
            Pivoted DataFrame
        """
        group_by = params.get("group_by", [])
        pivot_column = params["pivot_column"]
        value_column = params["value_column"]
        agg_func = params.get("agg_func", "first")

        # Validate columns exist
        required_columns = set()
        if isinstance(group_by, list):
            required_columns.update(group_by)
        elif isinstance(group_by, str):
            required_columns.add(group_by)
            group_by = [group_by]

        required_columns.add(pivot_column)
        required_columns.add(value_column)

        missing = required_columns - set(df.columns)
        if missing:
            raise KeyError(
                f"Columns not found in DataFrame for pivot operation: {missing}. "
                f"Available: {list(df.columns)}"
            )

        result = df.pivot_table(
            index=group_by, columns=pivot_column, values=value_column, aggfunc=agg_func
        ).reset_index()

        # Flatten column names if multi-level
        if isinstance(result.columns, pd.MultiIndex):
            result.columns = ["_".join(col).strip("_") for col in result.columns.values]

        return result

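    # Illustrative sketch of the pivot parameters (hypothetical data, not from the
    # source): given long-format rows like (plant, metric, value), the following
    # params produce one column per distinct metric:
    #
    #   params = {
    #       "group_by": ["plant"],
    #       "pivot_column": "metric",
    #       "value_column": "value",
    #       "agg_func": "mean",
    #   }
    #   wide = engine.execute_operation("pivot", params, df)
    #   # columns: plant, <one column per metric value>
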
    def harmonize_schema(
        self, df: pd.DataFrame, target_schema: Dict[str, str], policy: Any
    ) -> pd.DataFrame:
        """Harmonize DataFrame schema with target schema according to policy."""
        # Ensure materialization
        df = self.materialize(df)

        from odibi.config import OnMissingColumns, OnNewColumns, SchemaMode

        target_cols = list(target_schema.keys())
        current_cols = df.columns.tolist()

        missing = set(target_cols) - set(current_cols)
        new_cols = set(current_cols) - set(target_cols)

        # 1. Check Validations
        if missing and policy.on_missing_columns == OnMissingColumns.FAIL:
            raise ValueError(f"Schema Policy Violation: Missing columns {missing}")

        if new_cols and policy.on_new_columns == OnNewColumns.FAIL:
            raise ValueError(f"Schema Policy Violation: New columns {new_cols}")

        # 2. Apply Transformations
        if policy.mode == SchemaMode.EVOLVE and policy.on_new_columns == OnNewColumns.ADD_NULLABLE:
            # Evolve: Add missing columns, Keep new columns
            for col in missing:
                df[col] = None
        else:
            # Enforce / Ignore New: Project to target schema (Drops new, Adds missing)
            # Note: reindex adds NaN for missing columns
            df = df.reindex(columns=target_cols)

        return df

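    # Minimal sketch, assuming a policy object exposing the attributes read above
    # (mode, on_missing_columns, on_new_columns); the names below are hypothetical:
    #
    #   target_schema = {"id": "int64", "name": "object", "created_at": "datetime64[ns]"}
    #   harmonized = engine.harmonize_schema(df, target_schema, policy)
    #
    # With an EVOLVE / ADD_NULLABLE policy, missing target columns are added as nulls
    # and extra columns are kept; otherwise the frame is reindexed to exactly the
    # target columns.
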
    def anonymize(
        self, df: Any, columns: List[str], method: str, salt: Optional[str] = None
    ) -> pd.DataFrame:
        """Anonymize specified columns."""
        # Ensure materialization
        df = self.materialize(df)

        res = df.copy()

        for col in columns:
            if col not in res.columns:
                continue

            if method == "hash":
                # Vectorized Hashing (via map/apply)
                # Note: True vectorization requires C-level support (e.g. pyarrow.compute)
                # Standard Pandas apply is the fallback but we can optimize string handling

                # Convert to string, handling nulls
                # s_col = res[col].astype(str)
                # Nulls become 'nan'/'None' strings; typically nulls should remain null,
                # so hash only the non-null values below.

                mask_nulls = res[col].isna()

                def _hash_val(val):
                    to_hash = val
                    if salt:
                        to_hash += salt
                    return hashlib.sha256(to_hash.encode("utf-8")).hexdigest()

                # Apply only to non-nulls
                res.loc[~mask_nulls, col] = res.loc[~mask_nulls, col].astype(str).apply(_hash_val)

            elif method == "mask":
                # Vectorized Masking
                # Mask all but the last 4 characters

                mask_nulls = res[col].isna()
                s_valid = res.loc[~mask_nulls, col].astype(str)

                # Use vectorized regex replacement
                # Replace every character that is followed by at least 4 characters with '*'
                res.loc[~mask_nulls, col] = s_valid.str.replace(r".(?=.{4})", "*", regex=True)

            elif method == "redact":
                res[col] = "[REDACTED]"

        return res

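    # Illustrative sketch (hypothetical values, not from the source): with
    # method="mask", every character except the last four is replaced, so
    # "4111111111111111" becomes "************1111"; with method="hash", each
    # non-null value is SHA-256 hashed (optionally salted):
    #
    #   masked = engine.anonymize(df, columns=["card_number"], method="mask")
    #   hashed = engine.anonymize(df, columns=["email"], method="hash", salt="s3cr3t")
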
    def get_schema(self, df: Any) -> Dict[str, str]:
        """Get DataFrame schema with types.

        Args:
            df: DataFrame or LazyDataset

        Returns:
            Dict[str, str]: Column name -> Type string
        """
        if isinstance(df, LazyDataset):
            if self.use_duckdb:
                try:
                    import duckdb

                    conn = duckdb.connect(":memory:")
                    self._register_lazy_view(conn, "df", df)
                    res = conn.execute("DESCRIBE SELECT * FROM df").df()
                    return dict(zip(res["column_name"], res["column_type"]))
                except Exception:
                    pass
            df = self.materialize(df)

        return {col: str(df[col].dtype) for col in df.columns}

    def get_shape(self, df: Any) -> tuple:
        """Get DataFrame shape.

        Args:
            df: DataFrame or LazyDataset

        Returns:
            (rows, columns)
        """
        if isinstance(df, LazyDataset):
            cols = len(self.get_schema(df))
            rows = self.count_rows(df)
            return (rows, cols)
        return df.shape

    def count_rows(self, df: Any) -> int:
        """Count rows in DataFrame.

        Args:
            df: DataFrame or LazyDataset

        Returns:
            Row count
        """
        if isinstance(df, LazyDataset):
            if self.use_duckdb:
                try:
                    import duckdb

                    conn = duckdb.connect(":memory:")
                    self._register_lazy_view(conn, "df", df)
                    res = conn.execute("SELECT count(*) FROM df").fetchone()
                    return res[0] if res else 0
                except Exception:
                    pass
            df = self.materialize(df)

        return len(df)

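    # Illustrative note (hypothetical `lazy_ds` name): for a LazyDataset backed by a
    # file, schema and row counts are resolved through DuckDB without materializing
    # the data ("DESCRIBE SELECT * FROM df" for types, "SELECT count(*) FROM df" for
    # rows); only if DuckDB is unavailable or fails does the code fall back to
    # materializing the full DataFrame:
    #
    #   schema = engine.get_schema(lazy_ds)    # e.g. {"id": "BIGINT", ...}
    #   rows, cols = engine.get_shape(lazy_ds)
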
    def count_nulls(self, df: pd.DataFrame, columns: List[str]) -> Dict[str, int]:
        """Count nulls in specified columns.

        Args:
            df: DataFrame
            columns: Columns to check

        Returns:
            Dictionary of column -> null count
        """
        null_counts = {}
        for col in columns:
            if col in df.columns:
                null_counts[col] = int(df[col].isna().sum())
            else:
                raise ValueError(
                    f"Column '{col}' not found in DataFrame. "
                    f"Available columns: {list(df.columns)}"
                )
        return null_counts

    def validate_schema(self, df: pd.DataFrame, schema_rules: Dict[str, Any]) -> List[str]:
        """Validate DataFrame schema.

        Args:
            df: DataFrame
            schema_rules: Validation rules

        Returns:
            List of validation failures
        """
        # Ensure materialization
        df = self.materialize(df)

        failures = []

        # Check required columns
        if "required_columns" in schema_rules:
            required = schema_rules["required_columns"]
            missing = set(required) - set(df.columns)
            if missing:
                failures.append(f"Missing required columns: {', '.join(missing)}")

        # Check column types
        if "types" in schema_rules:
            type_map = {
                "int": ["int64", "int32", "int16", "int8"],
                "float": ["float64", "float32"],
                "str": ["object", "string"],
                "bool": ["bool"],
            }

            for col, expected_type in schema_rules["types"].items():
                if col not in df.columns:
                    failures.append(f"Column '{col}' not found for type validation")
                    continue

                actual_type = str(df[col].dtype)
                # Handle pyarrow types (e.g. int64[pyarrow])
                if "[" in actual_type and "pyarrow" in actual_type:
                    actual_type = actual_type.split("[")[0]

                expected_dtypes = type_map.get(expected_type, [expected_type])

                if actual_type not in expected_dtypes:
                    failures.append(
                        f"Column '{col}' has type '{actual_type}', expected '{expected_type}'"
                    )

        return failures

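    # Illustrative rules sketch (hypothetical values, not from the source):
    #
    #   rules = {
    #       "required_columns": ["id", "amount"],
    #       "types": {"id": "int", "amount": "float", "status": "str"},
    #   }
    #   failures = engine.validate_schema(df, rules)
    #   # -> e.g. ["Column 'status' has type 'int64', expected 'str'"]
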
    def _infer_avro_schema(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Infer Avro schema from pandas DataFrame.

        Args:
            df: DataFrame to infer schema from

        Returns:
            Avro schema dictionary
        """
        type_mapping = {
            "int64": "long",
            "int32": "int",
            "float64": "double",
            "float32": "float",
            "bool": "boolean",
            "object": "string",
            "string": "string",
        }

        fields = []
        for col in df.columns:
            dtype = df[col].dtype
            dtype_str = str(dtype)

            # Handle datetime types with Avro logical types
            if pd.api.types.is_datetime64_any_dtype(dtype):
                avro_type = {
                    "type": "long",
                    "logicalType": "timestamp-micros",
                }
            elif dtype_str == "date" or (hasattr(dtype, "name") and "date" in dtype.name.lower()):
                avro_type = {
                    "type": "int",
                    "logicalType": "date",
                }
            elif pd.api.types.is_timedelta64_dtype(dtype):
                avro_type = {
                    "type": "long",
                    "logicalType": "time-micros",
                }
            else:
                avro_type = type_mapping.get(dtype_str, "string")

            # Handle nullable columns
            if df[col].isnull().any():
                avro_type = ["null", avro_type]

            fields.append({"name": col, "type": avro_type})

        return {"type": "record", "name": "DataFrame", "fields": fields}

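    # Illustrative output sketch (hypothetical DataFrame with an int id, a nullable
    # string column, and a timestamp column):
    #
    #   {
    #       "type": "record",
    #       "name": "DataFrame",
    #       "fields": [
    #           {"name": "id", "type": "long"},
    #           {"name": "name", "type": ["null", "string"]},
    #           {"name": "created_at",
    #            "type": {"type": "long", "logicalType": "timestamp-micros"}},
    #       ],
    #   }
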
    def validate_data(self, df: pd.DataFrame, validation_config: Any) -> List[str]:
        """Validate DataFrame against rules.

        Args:
            df: DataFrame
            validation_config: ValidationConfig object

        Returns:
            List of validation failure messages
        """
        # Ensure materialization
        df = self.materialize(df)

        failures = []

        # Check not empty
        if validation_config.not_empty:
            if len(df) == 0:
                failures.append("DataFrame is empty")

        # Check for nulls in specified columns
        if validation_config.no_nulls:
            null_counts = self.count_nulls(df, validation_config.no_nulls)
            for col, count in null_counts.items():
                if count > 0:
                    failures.append(f"Column '{col}' has {count} null values")

        # Schema validation
        if validation_config.schema_validation:
            schema_failures = self.validate_schema(df, validation_config.schema_validation)
            failures.extend(schema_failures)

        # Range validation
        if validation_config.ranges:
            for col, bounds in validation_config.ranges.items():
                if col in df.columns:
                    min_val = bounds.get("min")
                    max_val = bounds.get("max")

                    if min_val is not None:
                        min_violations = df[df[col] < min_val]
                        if len(min_violations) > 0:
                            failures.append(f"Column '{col}' has values < {min_val}")

                    if max_val is not None:
                        max_violations = df[df[col] > max_val]
                        if len(max_violations) > 0:
                            failures.append(f"Column '{col}' has values > {max_val}")
                else:
                    failures.append(f"Column '{col}' not found for range validation")

        # Allowed values validation
        if validation_config.allowed_values:
            for col, allowed in validation_config.allowed_values.items():
                if col in df.columns:
                    # Check for values not in allowed list
                    invalid = df[~df[col].isin(allowed)]
                    if len(invalid) > 0:
                        failures.append(f"Column '{col}' has invalid values")
                else:
                    failures.append(f"Column '{col}' not found for allowed values validation")

        return failures

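    # Minimal sketch, assuming a ValidationConfig exposing the attributes used above
    # (not_empty, no_nulls, schema_validation, ranges, allowed_values); values here
    # are hypothetical:
    #
    #   failures = engine.validate_data(df, validation_config)
    #   # where validation_config might declare:
    #   #   not_empty: true
    #   #   no_nulls: ["id"]
    #   #   ranges: {"amount": {"min": 0}}
    #   #   allowed_values: {"status": ["open", "closed"]}
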
    def get_sample(self, df: Any, n: int = 10) -> List[Dict[str, Any]]:
        """Get sample rows as list of dictionaries.

        Args:
            df: DataFrame or LazyDataset
            n: Number of rows to return

        Returns:
            List of row dictionaries
        """
        if isinstance(df, LazyDataset):
            if self.use_duckdb:
                try:
                    import duckdb

                    conn = duckdb.connect(":memory:")
                    self._register_lazy_view(conn, "df", df)
                    res_df = conn.execute(f"SELECT * FROM df LIMIT {n}").df()
                    return res_df.to_dict("records")
                except Exception:
                    pass
            df = self.materialize(df)

        return df.head(n).to_dict("records")

    def table_exists(
        self, connection: Any, table: Optional[str] = None, path: Optional[str] = None
    ) -> bool:
        """Check if table or location exists.

        Args:
            connection: Connection object
            table: Table name (not used in Pandas—no catalog)
            path: File path

        Returns:
            True if file/directory exists, False otherwise
        """
        if path:
            full_path = connection.get_path(path)
            return os.path.exists(full_path)
        return False

    def get_table_schema(
        self,
        connection: Any,
        table: Optional[str] = None,
        path: Optional[str] = None,
        format: Optional[str] = None,
    ) -> Optional[Dict[str, str]]:
        """Get schema of an existing table/file."""
        try:
            if table and format in ["sql", "sql_server", "azure_sql"]:
                # SQL Server: Read empty result
                query = f"SELECT TOP 0 * FROM {table}"
                df = connection.read_sql(query)
                return self.get_schema(df)

            if path:
                full_path = connection.get_path(path)
                if not os.path.exists(full_path):
                    return None

                if format == "delta":
                    from deltalake import DeltaTable

                    dt = DeltaTable(full_path)
                    # Use pyarrow schema to pandas schema to avoid reading data
                    arrow_schema = dt.schema().to_pyarrow()
                    empty_df = arrow_schema.empty_table().to_pandas()
                    return self.get_schema(empty_df)

                elif format == "parquet":
                    import pyarrow.parquet as pq

                    target_path = full_path
                    if os.path.isdir(full_path):
                        # Find first parquet file
                        files = glob.glob(os.path.join(full_path, "*.parquet"))
                        if not files:
                            return None
                        target_path = files[0]

                    schema = pq.read_schema(target_path)
                    empty_df = schema.empty_table().to_pandas()
                    return self.get_schema(empty_df)

                elif format == "csv":
                    df = pd.read_csv(full_path, nrows=0)
                    return self.get_schema(df)

        except (FileNotFoundError, PermissionError):
            return None
        except ImportError as e:
            # Log missing optional dependency
            import logging

            logging.getLogger(__name__).warning(
                f"Could not infer schema due to missing dependency: {e}"
            )
            return None
        except Exception as e:
            import logging

            logging.getLogger(__name__).warning(f"Failed to infer schema for {table or path}: {e}")
            return None
        return None

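    # Illustrative note (hypothetical connection and path): schema inference avoids
    # reading data by using metadata-only reads, e.g. "SELECT TOP 0 ..." for SQL
    # Server tables, DeltaTable(...).schema().to_pyarrow() for Delta,
    # pyarrow.parquet.read_schema() for Parquet, and pd.read_csv(..., nrows=0) for CSV:
    #
    #   schema = engine.get_table_schema(conn, path="curated/orders", format="parquet")
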
    def vacuum_delta(
        self,
        connection: Any,
        path: str,
        retention_hours: int = 168,
        dry_run: bool = False,
        enforce_retention_duration: bool = True,
    ) -> Dict[str, Any]:
        """VACUUM a Delta table to remove old files.

        Args:
            connection: Connection object
            path: Delta table path
            retention_hours: Retention period (default 168 = 7 days)
            dry_run: If True, only show files to be deleted
            enforce_retention_duration: If False, allows retention < 168 hours (testing only)

        Returns:
            Dictionary with files_deleted count
        """
        ctx = get_logging_context().with_context(engine="pandas")
        start = time.time()

        ctx.debug(
            "Starting Delta VACUUM",
            path=path,
            retention_hours=retention_hours,
            dry_run=dry_run,
        )

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.error("Delta Lake library not installed", path=path)
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[pandas]' "
                "or 'pip install deltalake'. See README.md for installation instructions."
            )

        full_path = connection.get_path(path)

        storage_opts = {}
        if hasattr(connection, "pandas_storage_options"):
            storage_opts = connection.pandas_storage_options()

        dt = DeltaTable(full_path, storage_options=storage_opts)
        deleted_files = dt.vacuum(
            retention_hours=retention_hours,
            dry_run=dry_run,
            enforce_retention_duration=enforce_retention_duration,
        )

        elapsed = (time.time() - start) * 1000
        ctx.info(
            "Delta VACUUM completed",
            path=str(full_path),
            files_deleted=len(deleted_files),
            dry_run=dry_run,
            elapsed_ms=round(elapsed, 2),
        )

        return {"files_deleted": len(deleted_files)}

    def get_delta_history(
        self, connection: Any, path: str, limit: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """Get Delta table history.

        Args:
            connection: Connection object
            path: Delta table path
            limit: Maximum number of versions to return

        Returns:
            List of version metadata dictionaries
        """
        ctx = get_logging_context().with_context(engine="pandas")
        start = time.time()

        ctx.debug("Getting Delta table history", path=path, limit=limit)

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.error("Delta Lake library not installed", path=path)
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[pandas]' "
                "or 'pip install deltalake'. See README.md for installation instructions."
            )

        full_path = connection.get_path(path)

        storage_opts = {}
        if hasattr(connection, "pandas_storage_options"):
            storage_opts = connection.pandas_storage_options()

        dt = DeltaTable(full_path, storage_options=storage_opts)
        history = dt.history(limit=limit)

        elapsed = (time.time() - start) * 1000
        ctx.info(
            "Delta history retrieved",
            path=str(full_path),
            versions_returned=len(history) if history else 0,
            elapsed_ms=round(elapsed, 2),
        )

        return history

    def restore_delta(self, connection: Any, path: str, version: int) -> None:
        """Restore Delta table to a specific version.

        Args:
            connection: Connection object
            path: Delta table path
            version: Version number to restore to
        """
        ctx = get_logging_context().with_context(engine="pandas")
        start = time.time()

        ctx.info("Starting Delta table restore", path=path, target_version=version)

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.error("Delta Lake library not installed", path=path)
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[pandas]' "
                "or 'pip install deltalake'. See README.md for installation instructions."
            )

        full_path = connection.get_path(path)

        storage_opts = {}
        if hasattr(connection, "pandas_storage_options"):
            storage_opts = connection.pandas_storage_options()

        dt = DeltaTable(full_path, storage_options=storage_opts)
        dt.restore(version)

        elapsed = (time.time() - start) * 1000
        ctx.info(
            "Delta table restored",
            path=str(full_path),
            restored_to_version=version,
            elapsed_ms=round(elapsed, 2),
        )

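    # Illustrative usage sketch (hypothetical connection and path names):
    #
    #   report = engine.vacuum_delta(conn, "curated/orders", retention_hours=168, dry_run=True)
    #   history = engine.get_delta_history(conn, "curated/orders", limit=5)
    #   engine.restore_delta(conn, "curated/orders", version=3)
    #
    # All three resolve the physical location via connection.get_path() and pass
    # connection.pandas_storage_options() (when available) through to deltalake.
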
    def maintain_table(
        self,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        config: Optional[Any] = None,
    ) -> None:
        """Run table maintenance operations (optimize, vacuum)."""
        ctx = get_logging_context().with_context(engine="pandas")

        if format != "delta" or not config or not config.enabled:
            return

        if not path and not table:
            return

        full_path = connection.get_path(path if path else table)
        start = time.time()

        ctx.info("Starting table maintenance", path=str(full_path))

        try:
            from deltalake import DeltaTable
        except ImportError:
            ctx.warning(
                "Auto-optimize skipped: 'deltalake' library not installed",
                path=str(full_path),
            )
            return

        try:
            storage_opts = {}
            if hasattr(connection, "pandas_storage_options"):
                storage_opts = connection.pandas_storage_options()

            dt = DeltaTable(full_path, storage_options=storage_opts)

            ctx.info("Running Delta OPTIMIZE (compaction)", path=str(full_path))
            dt.optimize.compact()

            retention = config.vacuum_retention_hours
            if retention is not None and retention > 0:
                ctx.info(
                    "Running Delta VACUUM",
                    path=str(full_path),
                    retention_hours=retention,
                )
                dt.vacuum(
                    retention_hours=retention,
                    enforce_retention_duration=True,
                    dry_run=False,
                )

            elapsed = (time.time() - start) * 1000
            ctx.info(
                "Table maintenance completed",
                path=str(full_path),
                elapsed_ms=round(elapsed, 2),
            )

        except Exception as e:
            ctx.warning(
                "Auto-optimize failed",
                path=str(full_path),
                error=str(e),
            )

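    # Minimal sketch, assuming a maintenance config object with the attributes read
    # above (enabled, vacuum_retention_hours); the names below are hypothetical:
    #
    #   engine.maintain_table(conn, format="delta", path="curated/orders", config=maintenance_cfg)
    #
    # When enabled, this compacts small files via dt.optimize.compact() and, if a
    # positive retention is configured, follows up with dt.vacuum().
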
|
2085
|
+
def get_source_files(self, df: Any) -> List[str]:
|
|
2086
|
+
"""Get list of source files that generated this DataFrame.
|
|
2087
|
+
|
|
2088
|
+
Args:
|
|
2089
|
+
df: DataFrame or LazyDataset
|
|
2090
|
+
|
|
2091
|
+
Returns:
|
|
2092
|
+
List of file paths
|
|
2093
|
+
"""
|
|
2094
|
+
if isinstance(df, LazyDataset):
|
|
2095
|
+
if isinstance(df.path, list):
|
|
2096
|
+
return df.path
|
|
2097
|
+
return [str(df.path)]
|
|
2098
|
+
|
|
2099
|
+
if hasattr(df, "attrs"):
|
|
2100
|
+
return df.attrs.get("odibi_source_files", [])
|
|
2101
|
+
return []
|
|
2102
|
+
|
|
2103
|
+
def profile_nulls(self, df: pd.DataFrame) -> Dict[str, float]:
|
|
2104
|
+
"""Calculate null percentage for each column.
|
|
2105
|
+
|
|
2106
|
+
Args:
|
|
2107
|
+
df: DataFrame
|
|
2108
|
+
|
|
2109
|
+
Returns:
|
|
2110
|
+
Dictionary of {column_name: null_percentage}
|
|
2111
|
+
"""
|
|
2112
|
+
# Ensure materialization
|
|
2113
|
+
df = self.materialize(df)
|
|
2114
|
+
|
|
2115
|
+
# mean() of boolean DataFrame gives the percentage of True values
|
|
2116
|
+
return df.isna().mean().to_dict()
|
|
2117
|
+
|
|
2118
|
+
def filter_greater_than(self, df: pd.DataFrame, column: str, value: Any) -> pd.DataFrame:
|
|
2119
|
+
"""Filter DataFrame where column > value.
|
|
2120
|
+
|
|
2121
|
+
Automatically casts string columns to datetime for proper comparison.
|
|
2122
|
+
"""
|
|
2123
|
+
if column not in df.columns:
|
|
2124
|
+
raise ValueError(f"Column '{column}' not found in DataFrame")
|
|
2125
|
+
|
|
2126
|
+
try:
|
|
2127
|
+
col_series = df[column]
|
|
2128
|
+
|
|
2129
|
+
if pd.api.types.is_string_dtype(col_series):
|
|
2130
|
+
col_series = pd.to_datetime(col_series, errors="coerce")
|
|
2131
|
+
elif pd.api.types.is_datetime64_any_dtype(col_series) and isinstance(value, str):
|
|
2132
|
+
value = pd.to_datetime(value)
|
|
2133
|
+
|
|
2134
|
+
return df[col_series > value]
|
|
2135
|
+
except Exception as e:
|
|
2136
|
+
raise ValueError(f"Failed to filter {column} > {value}: {e}")
|
|
2137
|
+
|
|
2138
|
+
def filter_coalesce(
|
|
2139
|
+
self, df: pd.DataFrame, col1: str, col2: str, op: str, value: Any
|
|
2140
|
+
) -> pd.DataFrame:
|
|
2141
|
+
"""Filter using COALESCE(col1, col2) op value.
|
|
2142
|
+
|
|
2143
|
+
Automatically casts string columns to datetime for proper comparison.
|
|
2144
|
+
"""
|
|
2145
|
+
if col1 not in df.columns:
|
|
2146
|
+
raise ValueError(f"Column '{col1}' not found")
|
|
2147
|
+
|
|
2148
|
+
def _to_datetime_if_string(series: pd.Series) -> pd.Series:
|
|
2149
|
+
if pd.api.types.is_string_dtype(series):
|
|
2150
|
+
return pd.to_datetime(series, errors="coerce")
|
|
2151
|
+
return series
|
|
2152
|
+
|
|
2153
|
+
s1 = _to_datetime_if_string(df[col1])
|
|
2154
|
+
|
|
2155
|
+
if col2 not in df.columns:
|
|
2156
|
+
s = s1
|
|
2157
|
+
else:
|
|
2158
|
+
s2 = _to_datetime_if_string(df[col2])
|
|
2159
|
+
s = s1.combine_first(s2)
|
|
2160
|
+
|
|
2161
|
+
try:
|
|
2162
|
+
if pd.api.types.is_datetime64_any_dtype(s) and isinstance(value, str):
|
|
2163
|
+
value = pd.to_datetime(value)
|
|
2164
|
+
|
|
2165
|
+
if op == ">=":
|
|
2166
|
+
return df[s >= value]
|
|
2167
|
+
elif op == ">":
|
|
2168
|
+
return df[s > value]
|
|
2169
|
+
elif op == "<=":
|
|
2170
|
+
return df[s <= value]
|
|
2171
|
+
elif op == "<":
|
|
2172
|
+
return df[s < value]
|
|
2173
|
+
elif op == "==" or op == "=":
|
|
2174
|
+
return df[s == value]
|
|
2175
|
+
else:
|
|
2176
|
+
raise ValueError(f"Unsupported operator: {op}")
|
|
2177
|
+
except Exception as e:
|
|
2178
|
+
raise ValueError(f"Failed to filter COALESCE({col1}, {col2}) {op} {value}: {e}")
|
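
    # Illustrative sketch (hypothetical column names, not from the source):
    # COALESCE-style filtering prefers col1 and falls back to col2 per row, with
    # string columns coerced to datetimes before the comparison:
    #
    #   recent = engine.filter_coalesce(
    #       df, "updated_at", "created_at", ">=", "2024-01-01"
    #   )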