odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/engine/pandas_engine.py
@@ -0,0 +1,2178 @@
1
+ """Pandas engine implementation."""
2
+
3
+ import glob
4
+ import hashlib
5
+ import os
6
+ import random
7
+ import time
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from dataclasses import dataclass
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Any, Dict, Iterator, List, Optional, Union
13
+ from urllib.parse import urlparse
14
+
15
+ import pandas as pd
16
+
17
+ from odibi.context import Context, PandasContext
18
+ from odibi.engine.base import Engine
19
+ from odibi.enums import EngineType
20
+ from odibi.exceptions import TransformError
21
+ from odibi.utils.logging_context import get_logging_context
22
+
23
+ __all__ = ["PandasEngine", "LazyDataset"]
24
+
25
+
26
+ @dataclass
27
+ class LazyDataset:
28
+ """Lazy representation of a dataset (file) for out-of-core processing."""
29
+
30
+ path: Union[str, List[str]]
31
+ format: str
32
+ options: Dict[str, Any]
33
+ connection: Optional[Any] = None # To resolve path/credentials if needed
34
+
35
+ def __repr__(self):
36
+ return f"LazyDataset(path={self.path}, format={self.format})"
37
+
38
+
39
+ class PandasEngine(Engine):
40
+ """Pandas-based execution engine."""
41
+
42
+ name = "pandas"
43
+ engine_type = EngineType.PANDAS
44
+
45
+ def __init__(
46
+ self,
47
+ connections: Optional[Dict[str, Any]] = None,
48
+ config: Optional[Dict[str, Any]] = None,
49
+ ):
50
+ """Initialize Pandas engine.
51
+
52
+ Args:
53
+ connections: Dictionary of connection objects
54
+ config: Engine configuration (optional)
55
+ """
56
+ self.connections = connections or {}
57
+ self.config = config or {}
58
+
59
+ # Suppress noisy delta-rs transaction conflict warnings (handled by retry)
60
+ if "RUST_LOG" not in os.environ:
61
+ os.environ["RUST_LOG"] = "deltalake_core::kernel::transaction=error"
62
+
63
+ # Check for performance flags
64
+ performance = self.config.get("performance", {})
65
+
66
+ # Determine desired state
67
+ if hasattr(performance, "use_arrow"):
68
+ desired_use_arrow = performance.use_arrow
69
+ elif isinstance(performance, dict):
70
+ desired_use_arrow = performance.get("use_arrow", True)
71
+ else:
72
+ desired_use_arrow = True
73
+
74
+ # Verify availability
75
+ if desired_use_arrow:
76
+ try:
77
+ import pyarrow # noqa: F401
78
+
79
+ self.use_arrow = True
80
+ except ImportError:
81
+ import logging
82
+
83
+ logger = logging.getLogger(__name__)
84
+ logger.warning(
85
+ "Apache Arrow not found. Disabling Arrow optimizations. "
86
+ "Install 'pyarrow' to enable."
87
+ )
88
+ self.use_arrow = False
89
+ else:
90
+ self.use_arrow = False
91
+
92
+ # Check for DuckDB
93
+ self.use_duckdb = False
94
+ # Default to False to ensure stability with existing tests (Lazy Loading is opt-in)
95
+ if self.config.get("performance", {}).get("use_duckdb", False):
96
+ try:
97
+ import duckdb # noqa: F401
98
+
99
+ self.use_duckdb = True
100
+ except ImportError:
101
+ pass
102
+
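The performance flags above degrade gracefully: asking for Arrow or DuckDB without the backing package installed simply disables the optimization. A minimal sketch of how the plain-dict config shape is picked up (pyarrow/duckdb availability decides the final flags):

```python
from odibi.engine.pandas_engine import PandasEngine

engine = PandasEngine(
    connections={},  # no named connections needed for local file work
    config={"performance": {"use_arrow": True, "use_duckdb": False}},
)

# use_arrow ends up False if pyarrow is not importable, despite the flag;
# use_duckdb stays False unless the flag is set AND duckdb imports cleanly.
print(engine.use_arrow, engine.use_duckdb)
```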
103
+ def materialize(self, df: Any) -> Any:
104
+ """Materialize lazy dataset."""
105
+ if isinstance(df, LazyDataset):
106
+ # Re-invoke the read path but force materialization by bypassing the
107
+ # lazy check and passing the resolved path directly.
108
+ # Note: LazyDataset.path is expected to already hold the fully
109
+ # resolved path, so no further path resolution happens here.
110
+ return self._read_file(
111
+ full_path=df.path, format=df.format, options=df.options, connection=df.connection
112
+ )
113
+ return df
114
+
115
+ def _process_df(
116
+ self, df: Union[pd.DataFrame, Iterator[pd.DataFrame]], query: Optional[str]
117
+ ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
118
+ """Apply post-read processing (filtering)."""
119
+ if query and df is not None:
120
+ # Handle Iterator
121
+ from collections.abc import Iterator
122
+
123
+ if isinstance(df, Iterator):
124
+ # Filter each chunk
125
+ return (chunk.query(query) for chunk in df)
126
+
127
+ if not df.empty:
128
+ try:
129
+ return df.query(query)
130
+ except Exception as e:
131
+ import logging
132
+
133
+ logger = logging.getLogger(__name__)
134
+ logger.warning(f"Failed to apply query '{query}': {e}")
135
+ return df
136
+
137
+ _CLOUD_URI_PREFIXES = ("abfss://", "s3://", "gs://", "az://", "https://")
138
+
139
+ def _retry_delta_operation(self, func, max_retries: int = 5, base_delay: float = 0.2):
140
+ """Retry Delta operations with exponential backoff for concurrent conflicts."""
141
+ for attempt in range(max_retries):
142
+ try:
143
+ return func()
144
+ except Exception as e:
145
+ error_str = str(e).lower()
146
+ is_conflict = "conflict" in error_str or "concurrent" in error_str
147
+ if attempt == max_retries - 1 or not is_conflict:
148
+ raise
149
+ delay = base_delay * (2**attempt) + random.uniform(0, 0.1)
150
+ time.sleep(delay)
151
+
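Illustrative usage of the retry helper: wrap any delta-rs commit in a zero-argument callable so that conflict-looking failures are retried with exponential backoff plus jitter (roughly 0.2 s, 0.4 s, 0.8 s, 1.6 s before the final attempt), while other errors re-raise immediately. The local path below is hypothetical.

```python
import pandas as pd
from deltalake import write_deltalake
from odibi.engine.pandas_engine import PandasEngine

engine = PandasEngine()
df = pd.DataFrame({"id": [1, 2], "qty": [5, 7]})

def commit():
    # Append a small batch; concurrent writers may trigger transaction conflicts.
    write_deltalake("/tmp/events_delta", df, mode="append")

engine._retry_delta_operation(commit)  # retries only "conflict"/"concurrent" errors
```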
152
+ def _resolve_path(self, path: Optional[str], connection: Any) -> str:
153
+ """Resolve path to full URI, avoiding double-prefixing for cloud URIs.
154
+
155
+ Args:
156
+ path: Relative or absolute path
157
+ connection: Connection object (may have get_path method)
158
+
159
+ Returns:
160
+ Full resolved path
161
+ """
162
+ if not path:
163
+ raise ValueError(
164
+ "Failed to resolve path: path argument is required but was empty or None. "
165
+ "Provide a valid file path or use 'table' parameter with a connection."
166
+ )
167
+ if path.startswith(self._CLOUD_URI_PREFIXES):
168
+ return path
169
+ if connection:
170
+ return connection.get_path(path)
171
+ return path
172
+
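Behaviour sketch for `_resolve_path`: cloud URIs pass through untouched, other paths are delegated to the connection's `get_path`, and plain paths without a connection are returned as-is. `StubConnection` is a hypothetical stand-in, not an odibi class.

```python
from odibi.engine.pandas_engine import PandasEngine

class StubConnection:
    def get_path(self, path: str) -> str:
        return f"/data/landing/{path}"

engine = PandasEngine()
print(engine._resolve_path("abfss://raw@acct.dfs.core.windows.net/x.parquet", StubConnection()))
# -> unchanged: already a cloud URI
print(engine._resolve_path("orders.csv", StubConnection()))   # -> /data/landing/orders.csv
print(engine._resolve_path("orders.csv", None))               # -> orders.csv
```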
173
+ def _merge_storage_options(
174
+ self, connection: Any, options: Optional[Dict[str, Any]] = None
175
+ ) -> Dict[str, Any]:
176
+ """Merge connection storage options with user options.
177
+
178
+ Args:
179
+ connection: Connection object (may have pandas_storage_options method)
180
+ options: User-provided options
181
+
182
+ Returns:
183
+ Merged options dictionary
184
+ """
185
+ options = options or {}
186
+
187
+ # If connection provides storage_options (e.g., AzureADLS), merge them
188
+ if hasattr(connection, "pandas_storage_options"):
189
+ conn_storage_opts = connection.pandas_storage_options()
190
+ user_storage_opts = options.get("storage_options", {})
191
+
192
+ # User options override connection options
193
+ merged_storage_opts = {**conn_storage_opts, **user_storage_opts}
194
+
195
+ # Return options with merged storage_options
196
+ return {**options, "storage_options": merged_storage_opts}
197
+
198
+ return options
199
+
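Precedence sketch: user-supplied `storage_options` win over whatever the connection contributes, and all other user options pass through untouched. `StubAdls` is a hypothetical stand-in for a connection exposing `pandas_storage_options()` (as the Azure ADLS connection does, per the docstring above).

```python
from odibi.engine.pandas_engine import PandasEngine

class StubAdls:
    def pandas_storage_options(self):
        return {"account_name": "acct", "account_key": "connection-key"}

merged = PandasEngine()._merge_storage_options(
    StubAdls(),
    {"storage_options": {"account_key": "user-key"}, "sep": ";"},
)
# {'sep': ';', 'storage_options': {'account_name': 'acct', 'account_key': 'user-key'}}
print(merged)
```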
200
+ def _read_parallel(self, read_func: Any, paths: List[str], **kwargs) -> pd.DataFrame:
201
+ """Read multiple files in parallel using threads.
202
+
203
+ Args:
204
+ read_func: Pandas read function (e.g. pd.read_csv)
205
+ paths: List of file paths
206
+ kwargs: Arguments to pass to read_func
207
+
208
+ Returns:
209
+ Concatenated DataFrame
210
+ """
211
+ # Conservative worker count to avoid OOM on large files
212
+ max_workers = min(8, os.cpu_count() or 4)
213
+
214
+ dfs = []
215
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
216
+ # map preserves order
217
+ results = executor.map(lambda p: read_func(p, **kwargs), paths)
218
+ dfs = list(results)
219
+
220
+ return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
221
+
222
+ def read(
223
+ self,
224
+ connection: Any,
225
+ format: str,
226
+ table: Optional[str] = None,
227
+ path: Optional[str] = None,
228
+ streaming: bool = False,
229
+ schema: Optional[str] = None,
230
+ options: Optional[Dict[str, Any]] = None,
231
+ as_of_version: Optional[int] = None,
232
+ as_of_timestamp: Optional[str] = None,
233
+ ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
234
+ """Read data using Pandas (or LazyDataset)."""
235
+ ctx = get_logging_context().with_context(engine="pandas")
236
+ start = time.time()
237
+
238
+ source = path or table
239
+ ctx.debug(
240
+ "Starting read operation",
241
+ format=format,
242
+ path=source,
243
+ streaming=streaming,
244
+ use_arrow=self.use_arrow,
245
+ )
246
+
247
+ if streaming:
248
+ ctx.error(
249
+ "Streaming not supported in Pandas engine",
250
+ format=format,
251
+ path=source,
252
+ )
253
+ raise ValueError(
254
+ "Streaming is not supported in the Pandas engine. "
255
+ "Please use 'engine: spark' for streaming pipelines."
256
+ )
257
+
258
+ options = options or {}
259
+
260
+ # Resolve full path from connection
261
+ try:
262
+ full_path = self._resolve_path(path or table, connection)
263
+ except ValueError:
264
+ if table and not connection:
265
+ ctx.error("Connection required when specifying 'table'", table=table)
266
+ raise ValueError(
267
+ f"Cannot read table '{table}': connection is required when using 'table' parameter. "
268
+ "Provide a valid connection object or use 'path' for file-based reads."
269
+ )
270
+ ctx.error("Neither path nor table provided for read operation")
271
+ raise ValueError(
272
+ "Read operation failed: neither 'path' nor 'table' was provided. "
273
+ "Specify a file path or table name in your configuration."
274
+ )
275
+
276
+ # Merge storage options for cloud connections
277
+ merged_options = self._merge_storage_options(connection, options)
278
+
279
+ # Sanitize options for pandas compatibility
280
+ if "header" in merged_options:
281
+ if merged_options["header"] is True:
282
+ merged_options["header"] = 0
283
+ elif merged_options["header"] is False:
284
+ merged_options["header"] = None
285
+
286
+ # Handle Time Travel options
287
+ if as_of_version is not None:
288
+ merged_options["versionAsOf"] = as_of_version
289
+ ctx.debug("Time travel enabled", version=as_of_version)
290
+ if as_of_timestamp is not None:
291
+ merged_options["timestampAsOf"] = as_of_timestamp
292
+ ctx.debug("Time travel enabled", timestamp=as_of_timestamp)
293
+
294
+ # Lazy/DuckDB loading is currently disabled: can_lazy_load is hard-coded to False below, so the LazyDataset branch never triggers and reads always materialize.
295
+ can_lazy_load = False
296
+
297
+ if can_lazy_load:
298
+ ctx.debug("Using lazy loading via DuckDB", path=str(full_path))
299
+ if isinstance(full_path, (str, Path)):
300
+ return LazyDataset(
301
+ path=str(full_path),
302
+ format=format,
303
+ options=merged_options,
304
+ connection=connection,
305
+ )
306
+ elif isinstance(full_path, list):
307
+ return LazyDataset(
308
+ path=full_path, format=format, options=merged_options, connection=connection
309
+ )
310
+
311
+ result = self._read_file(full_path, format, merged_options, connection)
312
+
313
+ # Log metrics for materialized DataFrames
314
+ elapsed = (time.time() - start) * 1000
315
+ if isinstance(result, pd.DataFrame):
316
+ row_count = len(result)
317
+ memory_mb = result.memory_usage(deep=True).sum() / (1024 * 1024)
318
+
319
+ ctx.log_file_io(
320
+ path=str(full_path) if not isinstance(full_path, list) else str(full_path[0]),
321
+ format=format,
322
+ mode="read",
323
+ rows=row_count,
324
+ )
325
+ ctx.log_pandas_metrics(
326
+ memory_mb=memory_mb,
327
+ dtypes={col: str(dtype) for col, dtype in result.dtypes.items()},
328
+ )
329
+ ctx.info(
330
+ "Read completed",
331
+ format=format,
332
+ rows=row_count,
333
+ elapsed_ms=round(elapsed, 2),
334
+ memory_mb=round(memory_mb, 2),
335
+ )
336
+
337
+ return result
338
+
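End-to-end usage sketch for a local CSV read (hypothetical path): the `query` option is popped from the pandas kwargs and applied after loading via `DataFrame.query`, and streaming is rejected outright for this engine.

```python
import pandas as pd
from odibi.engine.pandas_engine import PandasEngine

pd.DataFrame({"plant": ["A", "B"], "qty": [10, 3]}).to_csv("/tmp/orders.csv", index=False)

engine = PandasEngine()
df = engine.read(
    connection=None,               # plain local path, no connection needed
    format="csv",
    path="/tmp/orders.csv",
    options={"query": "qty > 5"},  # applied post-read, not passed to pd.read_csv
)
print(df)  # only the plant A row survives the filter
```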
339
+ def _read_file(
340
+ self,
341
+ full_path: Union[str, List[str], Any],
342
+ format: str,
343
+ options: Dict[str, Any],
344
+ connection: Any = None,
345
+ ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
346
+ """Internal file reading logic."""
347
+ ctx = get_logging_context().with_context(engine="pandas")
348
+
349
+ ctx.debug(
350
+ "Reading file",
351
+ path=str(full_path) if not isinstance(full_path, list) else f"{len(full_path)} files",
352
+ format=format,
353
+ )
354
+
355
+ # Custom Readers
356
+ if format in self._custom_readers:
357
+ ctx.debug(f"Using custom reader for format: {format}")
358
+ return self._custom_readers[format](full_path, **options)
359
+
360
+ # Handle glob patterns for local files
361
+ is_glob = False
362
+ if isinstance(full_path, (str, Path)) and (
363
+ "*" in str(full_path) or "?" in str(full_path) or "[" in str(full_path)
364
+ ):
365
+ parsed = urlparse(str(full_path))
366
+ # Only expand for local files (no scheme, file://, or drive letter)
367
+ is_local = (
368
+ not parsed.scheme
369
+ or parsed.scheme == "file"
370
+ or (len(parsed.scheme) == 1 and parsed.scheme.isalpha())
371
+ )
372
+
373
+ if is_local:
374
+ glob_path = str(full_path)
375
+ if glob_path.startswith("file:///"):
376
+ glob_path = glob_path[8:]
377
+ elif glob_path.startswith("file://"):
378
+ glob_path = glob_path[7:]
379
+
380
+ matched_files = glob.glob(glob_path)
381
+ if not matched_files:
382
+ ctx.error(
383
+ "No files matched glob pattern",
384
+ pattern=glob_path,
385
+ )
386
+ raise FileNotFoundError(f"No files matched pattern: {glob_path}")
387
+
388
+ ctx.info(
389
+ "Glob pattern expanded",
390
+ pattern=glob_path,
391
+ matched_files=len(matched_files),
392
+ )
393
+ full_path = matched_files
394
+ is_glob = True
395
+
396
+ # Prepare read options (options already includes storage_options from caller)
397
+ read_kwargs = options.copy()
398
+
399
+ # Extract 'query' or 'filter' option for post-read filtering
400
+ post_read_query = read_kwargs.pop("query", None) or read_kwargs.pop("filter", None)
401
+
402
+ if self.use_arrow:
403
+ read_kwargs["dtype_backend"] = "pyarrow"
404
+
405
+ # Read based on format
406
+ if format == "csv":
407
+ try:
408
+ if is_glob and isinstance(full_path, list):
409
+ ctx.debug(
410
+ "Parallel CSV read",
411
+ file_count=len(full_path),
412
+ )
413
+ df = self._read_parallel(pd.read_csv, full_path, **read_kwargs)
414
+ df.attrs["odibi_source_files"] = full_path
415
+ return self._process_df(df, post_read_query)
416
+
417
+ df = pd.read_csv(full_path, **read_kwargs)
418
+ if hasattr(df, "attrs"):
419
+ df.attrs["odibi_source_files"] = [str(full_path)]
420
+ return self._process_df(df, post_read_query)
421
+ except UnicodeDecodeError:
422
+ ctx.warning(
423
+ "UnicodeDecodeError, retrying with latin1 encoding",
424
+ path=str(full_path),
425
+ )
426
+ read_kwargs["encoding"] = "latin1"
427
+ if is_glob and isinstance(full_path, list):
428
+ df = self._read_parallel(pd.read_csv, full_path, **read_kwargs)
429
+ df.attrs["odibi_source_files"] = full_path
430
+ return self._process_df(df, post_read_query)
431
+
432
+ df = pd.read_csv(full_path, **read_kwargs)
433
+ if hasattr(df, "attrs"):
434
+ df.attrs["odibi_source_files"] = [str(full_path)]
435
+ return self._process_df(df, post_read_query)
436
+ except pd.errors.ParserError:
437
+ ctx.warning(
438
+ "ParserError, retrying with on_bad_lines='skip'",
439
+ path=str(full_path),
440
+ )
441
+ read_kwargs["on_bad_lines"] = "skip"
442
+ if is_glob and isinstance(full_path, list):
443
+ df = self._read_parallel(pd.read_csv, full_path, **read_kwargs)
444
+ df.attrs["odibi_source_files"] = full_path
445
+ return self._process_df(df, post_read_query)
446
+
447
+ df = pd.read_csv(full_path, **read_kwargs)
448
+ if hasattr(df, "attrs"):
449
+ df.attrs["odibi_source_files"] = [str(full_path)]
450
+ return self._process_df(df, post_read_query)
451
+ elif format == "parquet":
452
+ ctx.debug("Reading parquet", path=str(full_path))
453
+ df = pd.read_parquet(full_path, **read_kwargs)
454
+ if isinstance(full_path, list):
455
+ df.attrs["odibi_source_files"] = full_path
456
+ else:
457
+ df.attrs["odibi_source_files"] = [str(full_path)]
458
+ return self._process_df(df, post_read_query)
459
+ elif format == "json":
460
+ if is_glob and isinstance(full_path, list):
461
+ ctx.debug(
462
+ "Parallel JSON read",
463
+ file_count=len(full_path),
464
+ )
465
+ df = self._read_parallel(pd.read_json, full_path, **read_kwargs)
466
+ df.attrs["odibi_source_files"] = full_path
467
+ return self._process_df(df, post_read_query)
468
+
469
+ df = pd.read_json(full_path, **read_kwargs)
470
+ if hasattr(df, "attrs"):
471
+ df.attrs["odibi_source_files"] = [str(full_path)]
472
+ return self._process_df(df, post_read_query)
473
+ elif format == "excel":
474
+ ctx.debug("Reading Excel file", path=str(full_path))
475
+ read_kwargs.pop("dtype_backend", None)
476
+ return self._process_df(pd.read_excel(full_path, **read_kwargs), post_read_query)
477
+ elif format == "delta":
478
+ ctx.debug("Reading Delta table", path=str(full_path))
479
+ try:
480
+ from deltalake import DeltaTable
481
+ except ImportError:
482
+ ctx.error(
483
+ "Delta Lake library not installed",
484
+ path=str(full_path),
485
+ )
486
+ raise ImportError(
487
+ "Delta Lake support requires 'pip install odibi[pandas]' "
488
+ "or 'pip install deltalake'. See README.md for installation instructions."
489
+ )
490
+
491
+ storage_opts = options.get("storage_options", {})
492
+ version = options.get("versionAsOf")
493
+ timestamp = options.get("timestampAsOf")
494
+
495
+ if timestamp is not None:
496
+ from datetime import datetime as dt_module
497
+
498
+ if isinstance(timestamp, str):
499
+ ts = dt_module.fromisoformat(timestamp.replace("Z", "+00:00"))
500
+ else:
501
+ ts = timestamp
502
+ dt = DeltaTable(full_path, storage_options=storage_opts)
503
+ dt.load_with_datetime(ts)
504
+ ctx.debug("Delta table loaded with timestamp", timestamp=str(ts))
505
+ elif version is not None:
506
+ dt = DeltaTable(full_path, storage_options=storage_opts, version=version)
507
+ ctx.debug("Delta table loaded with version", version=version)
508
+ else:
509
+ dt = DeltaTable(full_path, storage_options=storage_opts)
510
+ ctx.debug("Delta table loaded (latest version)")
511
+
512
+ if self.use_arrow:
513
+ import inspect
514
+
515
+ sig = inspect.signature(dt.to_pandas)
516
+
517
+ if "arrow_options" in sig.parameters:
518
+ return self._process_df(
519
+ dt.to_pandas(
520
+ partitions=None, arrow_options={"types_mapper": pd.ArrowDtype}
521
+ ),
522
+ post_read_query,
523
+ )
524
+ else:
525
+ return self._process_df(
526
+ dt.to_pyarrow_table().to_pandas(types_mapper=pd.ArrowDtype),
527
+ post_read_query,
528
+ )
529
+ else:
530
+ return self._process_df(dt.to_pandas(), post_read_query)
531
+ elif format == "avro":
532
+ ctx.debug("Reading Avro file", path=str(full_path))
533
+ try:
534
+ import fastavro
535
+ except ImportError:
536
+ ctx.error(
537
+ "fastavro library not installed",
538
+ path=str(full_path),
539
+ )
540
+ raise ImportError(
541
+ "Avro support requires 'pip install odibi[pandas]' "
542
+ "or 'pip install fastavro'. See README.md for installation instructions."
543
+ )
544
+
545
+ parsed = urlparse(full_path)
546
+ if parsed.scheme and parsed.scheme not in ["file", ""]:
547
+ import fsspec
548
+
549
+ storage_opts = options.get("storage_options", {})
550
+ with fsspec.open(full_path, "rb", **storage_opts) as f:
551
+ reader = fastavro.reader(f)
552
+ records = [record for record in reader]
553
+ return pd.DataFrame(records)
554
+ else:
555
+ with open(full_path, "rb") as f:
556
+ reader = fastavro.reader(f)
557
+ records = [record for record in reader]
558
+ return self._process_df(pd.DataFrame(records), post_read_query)
559
+ elif format in ["sql", "sql_server", "azure_sql"]:
560
+ ctx.debug("Reading SQL table", table=str(full_path), format=format)
561
+ if not hasattr(connection, "read_table"):
562
+ ctx.error(
563
+ "Connection does not support SQL operations",
564
+ connection_type=type(connection).__name__,
565
+ )
566
+ raise ValueError(
567
+ f"Cannot read SQL table '{full_path}': connection type '{type(connection).__name__}' "
568
+ "does not support SQL operations. Use a SQL-compatible connection "
569
+ "(e.g., SqlServerConnection, AzureSqlConnection)."
570
+ )
571
+
572
+ table_name = str(full_path)
573
+ if "." in table_name:
574
+ schema, tbl = table_name.split(".", 1)
575
+ else:
576
+ schema, tbl = "dbo", table_name
577
+
578
+ ctx.debug("Executing SQL read", schema=schema, table=tbl)
579
+ return connection.read_table(table_name=tbl, schema=schema)
580
+ else:
581
+ ctx.error("Unsupported format", format=format)
582
+ raise ValueError(
583
+ f"Unsupported format for Pandas engine: '{format}'. "
584
+ "Supported formats: csv, parquet, json, excel, delta, sql, sql_server, azure_sql."
585
+ )
586
+
587
+ def write(
588
+ self,
589
+ df: Union[pd.DataFrame, Iterator[pd.DataFrame]],
590
+ connection: Any,
591
+ format: str,
592
+ table: Optional[str] = None,
593
+ path: Optional[str] = None,
594
+ register_table: Optional[str] = None,
595
+ mode: str = "overwrite",
596
+ options: Optional[Dict[str, Any]] = None,
597
+ streaming_config: Optional[Any] = None,
598
+ ) -> Optional[Dict[str, Any]]:
599
+ """Write data using Pandas."""
600
+ ctx = get_logging_context().with_context(engine="pandas")
601
+ start = time.time()
602
+
603
+ destination = path or table
604
+ ctx.debug(
605
+ "Starting write operation",
606
+ format=format,
607
+ destination=destination,
608
+ mode=mode,
609
+ )
610
+
611
+ # Ensure materialization if LazyDataset
612
+ df = self.materialize(df)
613
+
614
+ options = options or {}
615
+
616
+ # Handle iterator/generator input
617
+ from collections.abc import Iterator
618
+
619
+ if isinstance(df, Iterator):
620
+ ctx.debug("Writing iterator/generator input")
621
+ return self._write_iterator(df, connection, format, table, path, mode, options)
622
+
623
+ row_count = len(df)
624
+ memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
625
+
626
+ ctx.log_pandas_metrics(
627
+ memory_mb=memory_mb,
628
+ dtypes={col: str(dtype) for col, dtype in df.dtypes.items()},
629
+ )
630
+
631
+ # SQL Server / Azure SQL Support
632
+ if format in ["sql", "sql_server", "azure_sql"]:
633
+ ctx.debug("Writing to SQL", table=table, mode=mode)
634
+ return self._write_sql(df, connection, table, mode, options)
635
+
636
+ # Resolve full path from connection
637
+ try:
638
+ full_path = self._resolve_path(path or table, connection)
639
+ except ValueError:
640
+ if table and not connection:
641
+ ctx.error("Connection required when specifying 'table'", table=table)
642
+ raise ValueError("Connection is required when specifying 'table'.")
643
+ ctx.error("Neither path nor table provided for write operation")
644
+ raise ValueError("Either path or table must be provided")
645
+
646
+ # Merge storage options for cloud connections
647
+ merged_options = self._merge_storage_options(connection, options)
648
+
649
+ # Custom Writers
650
+ if format in self._custom_writers:
651
+ ctx.debug(f"Using custom writer for format: {format}")
652
+ writer_options = merged_options.copy()
653
+ writer_options.pop("keys", None)
654
+ self._custom_writers[format](df, full_path, mode=mode, **writer_options)
655
+ return None
656
+
657
+ # Ensure directory exists (local only)
658
+ self._ensure_directory(full_path)
659
+
660
+ # Warn about partitioning
661
+ self._check_partitioning(merged_options)
662
+
663
+ # Delta Lake Write
664
+ if format == "delta":
665
+ ctx.debug("Writing Delta table", path=str(full_path), mode=mode)
666
+ result = self._write_delta(df, full_path, mode, merged_options)
667
+ elapsed = (time.time() - start) * 1000
668
+ ctx.log_file_io(
669
+ path=str(full_path),
670
+ format=format,
671
+ mode=mode,
672
+ rows=row_count,
673
+ )
674
+ ctx.info(
675
+ "Write completed",
676
+ format=format,
677
+ rows=row_count,
678
+ elapsed_ms=round(elapsed, 2),
679
+ )
680
+ return result
681
+
682
+ # Handle Generic Upsert/Append-Once for non-Delta
683
+ if mode in ["upsert", "append_once"]:
684
+ ctx.debug(f"Handling {mode} mode for non-Delta format")
685
+ df, mode = self._handle_generic_upsert(df, full_path, format, mode, merged_options)
686
+ row_count = len(df)
687
+
688
+ # Standard File Write
689
+ result = self._write_file(df, full_path, format, mode, merged_options)
690
+
691
+ elapsed = (time.time() - start) * 1000
692
+ ctx.log_file_io(
693
+ path=str(full_path),
694
+ format=format,
695
+ mode=mode,
696
+ rows=row_count,
697
+ )
698
+ ctx.info(
699
+ "Write completed",
700
+ format=format,
701
+ rows=row_count,
702
+ elapsed_ms=round(elapsed, 2),
703
+ )
704
+
705
+ return result
706
+
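Sketch of a local Delta write followed by an upsert (requires the `deltalake` package; paths are hypothetical). For `mode="upsert"` the engine needs `keys` in options and issues a delta-rs MERGE (`when_matched_update_all` / `when_not_matched_insert_all`) wrapped in the conflict-retry helper; the returned dict carries the latest commit's version and operation metrics.

```python
import pandas as pd
from odibi.engine.pandas_engine import PandasEngine

engine = PandasEngine()
orders = pd.DataFrame({"id": [1, 2], "status": ["open", "open"]})
engine.write(orders, connection=None, format="delta", path="/tmp/orders_delta", mode="overwrite")

updates = pd.DataFrame({"id": [2, 3], "status": ["closed", "open"]})
commit = engine.write(
    updates, connection=None, format="delta",
    path="/tmp/orders_delta", mode="upsert", options={"keys": ["id"]},
)
print(commit["version"], commit["operation"])
```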
707
+ def _write_iterator(
708
+ self,
709
+ df_iter: Iterator[pd.DataFrame],
710
+ connection: Any,
711
+ format: str,
712
+ table: Optional[str],
713
+ path: Optional[str],
714
+ mode: str,
715
+ options: Dict[str, Any],
716
+ ) -> None:
717
+ """Handle writing of iterator/generator."""
718
+ first_chunk = True
719
+ for chunk in df_iter:
720
+ # Determine mode for this chunk
721
+ current_mode = mode if first_chunk else "append"
722
+ current_options = options.copy()
723
+
724
+ # Handle CSV header for chunks
725
+ if not first_chunk and format == "csv":
726
+ if current_options.get("header") is not False:
727
+ current_options["header"] = False
728
+
729
+ self.write(
730
+ chunk,
731
+ connection,
732
+ format,
733
+ table,
734
+ path,
735
+ mode=current_mode,
736
+ options=current_options,
737
+ )
738
+ first_chunk = False
739
+ return None
740
+
741
+ def _write_sql(
742
+ self,
743
+ df: pd.DataFrame,
744
+ connection: Any,
745
+ table: Optional[str],
746
+ mode: str,
747
+ options: Dict[str, Any],
748
+ ) -> Optional[Dict[str, Any]]:
749
+ """Handle SQL writing including merge and enhanced overwrite."""
750
+ ctx = get_logging_context().with_context(engine="pandas")
751
+
752
+ if not hasattr(connection, "write_table"):
753
+ raise ValueError(
754
+ f"Connection type '{type(connection).__name__}' does not support SQL operations"
755
+ )
756
+
757
+ if not table:
758
+ raise ValueError("SQL format requires 'table' config")
759
+
760
+ # Handle MERGE mode for SQL Server
761
+ if mode == "merge":
762
+ merge_keys = options.get("merge_keys")
763
+ merge_options = options.get("merge_options")
764
+
765
+ if not merge_keys:
766
+ raise ValueError(
767
+ "MERGE mode requires 'merge_keys' in options. "
768
+ "Specify the key columns for the MERGE ON clause."
769
+ )
770
+
771
+ from odibi.writers.sql_server_writer import SqlServerMergeWriter
772
+
773
+ writer = SqlServerMergeWriter(connection)
774
+ ctx.debug(
775
+ "Executing SQL Server MERGE (Pandas)",
776
+ target=table,
777
+ merge_keys=merge_keys,
778
+ )
779
+
780
+ result = writer.merge_pandas(
781
+ df=df,
782
+ target_table=table,
783
+ merge_keys=merge_keys,
784
+ options=merge_options,
785
+ )
786
+
787
+ ctx.info(
788
+ "SQL Server MERGE completed (Pandas)",
789
+ target=table,
790
+ inserted=result.inserted,
791
+ updated=result.updated,
792
+ deleted=result.deleted,
793
+ )
794
+
795
+ return {
796
+ "mode": "merge",
797
+ "inserted": result.inserted,
798
+ "updated": result.updated,
799
+ "deleted": result.deleted,
800
+ "total_affected": result.total_affected,
801
+ }
802
+
803
+ # Handle enhanced overwrite with strategies
804
+ if mode == "overwrite" and options.get("overwrite_options"):
805
+ from odibi.writers.sql_server_writer import SqlServerMergeWriter
806
+
807
+ overwrite_options = options.get("overwrite_options")
808
+ writer = SqlServerMergeWriter(connection)
809
+
810
+ ctx.debug(
811
+ "Executing SQL Server enhanced overwrite (Pandas)",
812
+ target=table,
813
+ strategy=(
814
+ overwrite_options.strategy.value
815
+ if hasattr(overwrite_options, "strategy")
816
+ else "truncate_insert"
817
+ ),
818
+ )
819
+
820
+ result = writer.overwrite_pandas(
821
+ df=df,
822
+ target_table=table,
823
+ options=overwrite_options,
824
+ )
825
+
826
+ ctx.info(
827
+ "SQL Server enhanced overwrite completed (Pandas)",
828
+ target=table,
829
+ strategy=result.strategy,
830
+ rows_written=result.rows_written,
831
+ )
832
+
833
+ return {
834
+ "mode": "overwrite",
835
+ "strategy": result.strategy,
836
+ "rows_written": result.rows_written,
837
+ }
838
+
839
+ # Extract schema from table name if present
840
+ if "." in table:
841
+ schema, table_name = table.split(".", 1)
842
+ else:
843
+ schema, table_name = "dbo", table
844
+
845
+ # Map mode to if_exists
846
+ if_exists = "replace" # overwrite
847
+ if mode == "append":
848
+ if_exists = "append"
849
+ elif mode == "fail":
850
+ if_exists = "fail"
851
+
852
+ chunksize = options.get("chunksize", 1000)
853
+
854
+ connection.write_table(
855
+ df=df,
856
+ table_name=table_name,
857
+ schema=schema,
858
+ if_exists=if_exists,
859
+ chunksize=chunksize,
860
+ )
861
+ return None
862
+
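A minimal stand-in connection showing how non-MERGE SQL writes are dispatched: the engine splits `schema.table`, maps the write mode onto a pandas-style `if_exists`, and calls `connection.write_table` with a default chunksize of 1000. `RecordingSqlConnection` is hypothetical; a real run would use an odibi SQL Server / Azure SQL connection, and `mode="merge"` additionally requires `merge_keys` in options.

```python
import pandas as pd
from odibi.engine.pandas_engine import PandasEngine

class RecordingSqlConnection:
    def write_table(self, df, table_name, schema, if_exists, chunksize):
        print(f"write_table {schema}.{table_name} if_exists={if_exists} "
              f"chunksize={chunksize} rows={len(df)}")

PandasEngine().write(
    pd.DataFrame({"id": [1, 2]}),
    connection=RecordingSqlConnection(),
    format="sql",
    table="sales.orders",
    mode="append",
)
# -> write_table sales.orders if_exists=append chunksize=1000 rows=2
```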
863
+ def _ensure_directory(self, full_path: str) -> None:
864
+ """Ensure parent directory exists for local files."""
865
+ parsed = urlparse(str(full_path))
866
+ is_windows_drive = (
867
+ len(parsed.scheme) == 1 and parsed.scheme.isalpha() if parsed.scheme else False
868
+ )
869
+
870
+ if not parsed.scheme or parsed.scheme == "file" or is_windows_drive:
871
+ Path(full_path).parent.mkdir(parents=True, exist_ok=True)
872
+
873
+ def _check_partitioning(self, options: Dict[str, Any]) -> None:
874
+ """Warn about potential partitioning issues."""
875
+ partition_by = options.get("partition_by") or options.get("partitionBy")
876
+ if partition_by:
877
+ import warnings
878
+
879
+ warnings.warn(
880
+ "⚠️ Partitioning can cause performance issues if misused. "
881
+ "Only partition on low-cardinality columns (< 1000 unique values) "
882
+ "and ensure each partition has > 1000 rows.",
883
+ UserWarning,
884
+ )
885
+
886
+ def _write_delta(
887
+ self,
888
+ df: pd.DataFrame,
889
+ full_path: str,
890
+ mode: str,
891
+ merged_options: Dict[str, Any],
892
+ ) -> Dict[str, Any]:
893
+ """Handle Delta Lake writing."""
894
+ try:
895
+ from deltalake import DeltaTable, write_deltalake
896
+ except ImportError:
897
+ raise ImportError(
898
+ "Delta Lake support requires 'pip install odibi[pandas]' or 'pip install deltalake'. "
899
+ "See README.md for installation instructions."
900
+ )
901
+
902
+ storage_opts = merged_options.get("storage_options", {})
903
+
904
+ # Handle null-only columns: Delta Lake doesn't support Null dtype
905
+ # Cast columns with all-null values to string to avoid schema errors
906
+ for col in df.columns:
907
+ if df[col].isna().all():
908
+ df[col] = df[col].astype("string")
909
+
910
+ # Map modes
911
+ delta_mode = "overwrite"
912
+ if mode == "append":
913
+ delta_mode = "append"
914
+ elif mode == "error" or mode == "fail":
915
+ delta_mode = "error"
916
+ elif mode == "ignore":
917
+ delta_mode = "ignore"
918
+
919
+ # Handle upsert/append_once logic
920
+ if mode == "upsert":
921
+ keys = merged_options.get("keys")
922
+ if not keys:
923
+ raise ValueError("Upsert requires 'keys' in options")
924
+
925
+ if isinstance(keys, str):
926
+ keys = [keys]
927
+
928
+ def do_upsert():
929
+ dt = DeltaTable(full_path, storage_options=storage_opts)
930
+ (
931
+ dt.merge(
932
+ source=df,
933
+ predicate=" AND ".join([f"s.{k} = t.{k}" for k in keys]),
934
+ source_alias="s",
935
+ target_alias="t",
936
+ )
937
+ .when_matched_update_all()
938
+ .when_not_matched_insert_all()
939
+ .execute()
940
+ )
941
+
942
+ self._retry_delta_operation(do_upsert)
943
+ elif mode == "append_once":
944
+ keys = merged_options.get("keys")
945
+ if not keys:
946
+ raise ValueError("Append_once requires 'keys' in options")
947
+
948
+ if isinstance(keys, str):
949
+ keys = [keys]
950
+
951
+ def do_append_once():
952
+ dt = DeltaTable(full_path, storage_options=storage_opts)
953
+ (
954
+ dt.merge(
955
+ source=df,
956
+ predicate=" AND ".join([f"s.{k} = t.{k}" for k in keys]),
957
+ source_alias="s",
958
+ target_alias="t",
959
+ )
960
+ .when_not_matched_insert_all()
961
+ .execute()
962
+ )
963
+
964
+ self._retry_delta_operation(do_append_once)
965
+ else:
966
+ # Filter options supported by write_deltalake
967
+ write_kwargs = {
968
+ k: v
969
+ for k, v in merged_options.items()
970
+ if k
971
+ in [
972
+ "partition_by",
973
+ "mode",
974
+ "overwrite_schema",
975
+ "schema_mode",
976
+ "name",
977
+ "description",
978
+ "configuration",
979
+ ]
980
+ }
981
+
982
+ def do_write():
983
+ write_deltalake(
984
+ full_path, df, mode=delta_mode, storage_options=storage_opts, **write_kwargs
985
+ )
986
+
987
+ self._retry_delta_operation(do_write)
988
+
989
+ # Return commit info
990
+ dt = DeltaTable(full_path, storage_options=storage_opts)
991
+ history = dt.history(limit=1)
992
+ latest = history[0]
993
+
994
+ return {
995
+ "version": dt.version(),
996
+ "timestamp": datetime.fromtimestamp(latest.get("timestamp", 0) / 1000),
997
+ "operation": latest.get("operation"),
998
+ "operation_metrics": latest.get("operationMetrics", {}),
999
+ "read_version": latest.get("readVersion"),
1000
+ }
1001
+
1002
+ def _handle_generic_upsert(
1003
+ self,
1004
+ df: pd.DataFrame,
1005
+ full_path: str,
1006
+ format: str,
1007
+ mode: str,
1008
+ options: Dict[str, Any],
1009
+ ) -> tuple[pd.DataFrame, str]:
1010
+ """Handle upsert/append_once for standard files by merging with existing data."""
1011
+ if "keys" not in options:
1012
+ raise ValueError(f"Mode '{mode}' requires 'keys' list in options")
1013
+
1014
+ keys = options["keys"]
1015
+ if isinstance(keys, str):
1016
+ keys = [keys]
1017
+
1018
+ # Try to read existing file
1019
+ existing_df = None
1020
+ try:
1021
+ read_opts = options.copy()
1022
+ read_opts.pop("keys", None)
1023
+
1024
+ if format == "csv":
1025
+ existing_df = pd.read_csv(full_path, **read_opts)
1026
+ elif format == "parquet":
1027
+ existing_df = pd.read_parquet(full_path, **read_opts)
1028
+ elif format == "json":
1029
+ existing_df = pd.read_json(full_path, **read_opts)
1030
+ elif format == "excel":
1031
+ existing_df = pd.read_excel(full_path, **read_opts)
1032
+ except Exception:
1033
+ # File doesn't exist or can't be read
1034
+ return df, "overwrite" # Treat as new write
1035
+
1036
+ if existing_df is None:
1037
+ return df, "overwrite"
1038
+
1039
+ if mode == "append_once":
1040
+ # Check if keys exist
1041
+ missing_keys = set(keys) - set(df.columns)
1042
+ if missing_keys:
1043
+ raise KeyError(f"Keys {missing_keys} not found in input data")
1044
+
1045
+ # Identify new rows
1046
+ merged = df.merge(existing_df[keys], on=keys, how="left", indicator=True)
1047
+ new_rows = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
1048
+
1049
+ if format in ["csv", "json"]:
1050
+ return new_rows, "append"
1051
+ else:
1052
+ # Rewrite everything
1053
+ return pd.concat([existing_df, new_rows], ignore_index=True), "overwrite"
1054
+
1055
+ elif mode == "upsert":
1056
+ # Check if keys exist
1057
+ missing_keys = set(keys) - set(df.columns)
1058
+ if missing_keys:
1059
+ raise KeyError(f"Keys {missing_keys} not found in input data")
1060
+
1061
+ # 1. Remove rows from existing that are in input
1062
+ merged_indicator = existing_df.merge(df[keys], on=keys, how="left", indicator=True)
1063
+ rows_to_keep = existing_df[merged_indicator["_merge"] == "left_only"]
1064
+
1065
+ # 2. Concat rows_to_keep + input df
1066
+ # 3. Write mode becomes overwrite
1067
+ return pd.concat([rows_to_keep, df], ignore_index=True), "overwrite"
1068
+
1069
+ return df, mode
1070
+
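Behaviour sketch for the file-based upsert above (hypothetical local path): existing rows sharing the key are replaced by incoming ones, everything else is kept, and the effective write mode becomes overwrite.

```python
import pandas as pd
from odibi.engine.pandas_engine import PandasEngine

engine = PandasEngine()
pd.DataFrame({"id": [1, 2], "qty": [5, 7]}).to_csv("/tmp/stock.csv", index=False)

incoming = pd.DataFrame({"id": [2, 3], "qty": [70, 9]})
merged, mode = engine._handle_generic_upsert(
    incoming, "/tmp/stock.csv", "csv", "upsert", {"keys": ["id"]}
)
print(mode)    # "overwrite"
print(merged)  # id 1 kept from the file, id 2 replaced, id 3 added
```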
1071
+ def _write_file(
1072
+ self,
1073
+ df: pd.DataFrame,
1074
+ full_path: str,
1075
+ format: str,
1076
+ mode: str,
1077
+ merged_options: Dict[str, Any],
1078
+ ) -> None:
1079
+ """Handle standard file writing (CSV, Parquet, etc.)."""
1080
+ writer_options = merged_options.copy()
1081
+ writer_options.pop("keys", None)
1082
+
1083
+ # storage_options stays in the kwargs: the pandas writers used below
1084
+ # (csv, parquet, json) pass it through fsspec for remote targets.
1085
+
1086
+ if format == "csv":
1087
+ mode_param = "w"
1088
+ if mode == "append":
1089
+ mode_param = "a"
1090
+ if not os.path.exists(full_path):
1091
+ # If file doesn't exist, include header
1092
+ writer_options["header"] = True
1093
+ else:
1094
+ # If appending, don't write header unless explicit
1095
+ if "header" not in writer_options:
1096
+ writer_options["header"] = False
1097
+
1098
+ df.to_csv(full_path, index=False, mode=mode_param, **writer_options)
1099
+
1100
+ elif format == "parquet":
1101
+ if mode == "append":
1102
+ # pandas to_parquet does not support appending to an existing file,
1103
+ # so we fall back to a simple read-concat-write for local files.
1104
+ if os.path.exists(full_path):
1105
+ existing = pd.read_parquet(full_path, **merged_options)
1106
+ df = pd.concat([existing, df], ignore_index=True)
1107
+
1108
+ df.to_parquet(full_path, index=False, **writer_options)
1109
+
1110
+ elif format == "json":
1111
+ if mode == "append":
1112
+ writer_options["mode"] = "a"
1113
+
1114
+ # Default to records if not specified
1115
+ if "orient" not in writer_options:
1116
+ writer_options["orient"] = "records"
1117
+
1118
+ # Include storage_options for cloud storage (ADLS, S3, GCS)
1119
+ if "storage_options" in merged_options:
1120
+ writer_options["storage_options"] = merged_options["storage_options"]
1121
+
1122
+ df.to_json(full_path, **writer_options)
1123
+
1124
+ elif format == "excel":
1125
+ if mode == "append":
1126
+ # Simple append for excel
1127
+ if os.path.exists(full_path):
1128
+ with pd.ExcelWriter(full_path, mode="a", if_sheet_exists="overlay") as writer:
1129
+ df.to_excel(writer, index=False, **writer_options)
1130
+ return
1131
+
1132
+ df.to_excel(full_path, index=False, **writer_options)
1133
+
1134
+ elif format == "avro":
1135
+ try:
1136
+ import fastavro
1137
+ except ImportError:
1138
+ raise ImportError("Avro support requires 'pip install fastavro'")
1139
+
1140
+ # Convert datetime columns to microseconds for Avro timestamp-micros
1141
+ df_avro = df.copy()
1142
+ for col in df_avro.columns:
1143
+ if pd.api.types.is_datetime64_any_dtype(df_avro[col].dtype):
1144
+ df_avro[col] = df_avro[col].apply(
1145
+ lambda x: int(x.timestamp() * 1_000_000) if pd.notna(x) else None
1146
+ )
1147
+
1148
+ records = df_avro.to_dict("records")
1149
+ schema = self._infer_avro_schema(df)
1150
+
1151
+ # Use fsspec for remote URIs (abfss://, s3://, etc.)
1152
+ parsed = urlparse(full_path)
1153
+ if parsed.scheme and parsed.scheme not in ["file", ""]:
1154
+ # Remote file - use fsspec
1155
+ import fsspec
1156
+
1157
+ storage_opts = merged_options.get("storage_options", {})
1158
+ write_mode = "wb" if mode == "overwrite" else "ab"
1159
+ with fsspec.open(full_path, write_mode, **storage_opts) as f:
1160
+ fastavro.writer(f, schema, records)
1161
+ else:
1162
+ # Local file - use standard open
1163
+ open_mode = "wb"
1164
+ if mode == "append" and os.path.exists(full_path):
1165
+ open_mode = "a+b"
1166
+
1167
+ with open(full_path, open_mode) as f:
1168
+ fastavro.writer(f, schema, records)
1169
+ else:
1170
+ raise ValueError(f"Unsupported format for Pandas engine: {format}")
1171
+
1172
+ def add_write_metadata(
1173
+ self,
1174
+ df: pd.DataFrame,
1175
+ metadata_config: Any,
1176
+ source_connection: Optional[str] = None,
1177
+ source_table: Optional[str] = None,
1178
+ source_path: Optional[str] = None,
1179
+ is_file_source: bool = False,
1180
+ ) -> pd.DataFrame:
1181
+ """Add metadata columns to DataFrame before writing (Bronze layer lineage).
1182
+
1183
+ Args:
1184
+ df: Pandas DataFrame
1185
+ metadata_config: WriteMetadataConfig or True (for all defaults)
1186
+ source_connection: Name of the source connection
1187
+ source_table: Name of the source table (SQL sources)
1188
+ source_path: Path of the source file (file sources)
1189
+ is_file_source: True if source is a file-based read
1190
+
1191
+ Returns:
1192
+ DataFrame with metadata columns added
1193
+ """
1194
+ from odibi.config import WriteMetadataConfig
1195
+
1196
+ # Normalize config: True -> all defaults
1197
+ if metadata_config is True:
1198
+ config = WriteMetadataConfig()
1199
+ elif isinstance(metadata_config, WriteMetadataConfig):
1200
+ config = metadata_config
1201
+ else:
1202
+ return df # None or invalid -> no metadata
1203
+
1204
+ # Work on a copy to avoid modifying original
1205
+ df = df.copy()
1206
+
1207
+ # _extracted_at: always applicable
1208
+ if config.extracted_at:
1209
+ df["_extracted_at"] = pd.Timestamp.now()
1210
+
1211
+ # _source_file: only for file sources
1212
+ if config.source_file and is_file_source and source_path:
1213
+ df["_source_file"] = source_path
1214
+
1215
+ # _source_connection: all sources
1216
+ if config.source_connection and source_connection:
1217
+ df["_source_connection"] = source_connection
1218
+
1219
+ # _source_table: SQL sources only
1220
+ if config.source_table and source_table:
1221
+ df["_source_table"] = source_table
1222
+
1223
+ return df
1224
+
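Sketch of the lineage columns, assuming the `WriteMetadataConfig` defaults enable all of them (which is how `metadata_config=True` is described above): file sources get `_extracted_at`, `_source_file`, and `_source_connection`, while `_source_table` only appears for SQL sources. The connection name and path below are hypothetical.

```python
import pandas as pd
from odibi.engine.pandas_engine import PandasEngine

out = PandasEngine().add_write_metadata(
    pd.DataFrame({"id": [1]}),
    metadata_config=True,
    source_connection="landing_adls",          # hypothetical connection name
    source_path="raw/orders/2024-01-01.csv",   # hypothetical file path
    is_file_source=True,
)
print(out.columns.tolist())
# ['id', '_extracted_at', '_source_file', '_source_connection']
```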
1225
+ def _register_lazy_view_unused(self, conn, name: str, df: Any) -> None:
1226
+ """Register a LazyDataset as a DuckDB view."""
1227
+ duck_fmt = df.format
1228
+ if duck_fmt == "json":
1229
+ duck_fmt = "json_auto"
1230
+
1231
+ if isinstance(df.path, list):
1232
+ paths = ", ".join([f"'{p}'" for p in df.path])
1233
+ conn.execute(
1234
+ f"CREATE OR REPLACE VIEW {name} AS SELECT * FROM read_{duck_fmt}([{paths}])"
1235
+ )
1236
+ else:
1237
+ conn.execute(
1238
+ f"CREATE OR REPLACE VIEW {name} AS SELECT * FROM read_{duck_fmt}('{df.path}')"
1239
+ )
1240
+
1241
+ def execute_sql(self, sql: str, context: Context) -> pd.DataFrame:
1242
+ """Execute SQL query using DuckDB (if available) or pandasql.
1243
+
1244
+ Args:
1245
+ sql: SQL query string
1246
+ context: Execution context
1247
+
1248
+ Returns:
1249
+ Result DataFrame
1250
+ """
1251
+ if not isinstance(context, PandasContext):
1252
+ raise TypeError("PandasEngine requires PandasContext")
1253
+
1254
+ # Try to use DuckDB for SQL
1255
+ try:
1256
+ import duckdb
1257
+
1258
+ # Create in-memory database
1259
+ conn = duckdb.connect(":memory:")
1260
+
1261
+ # Register all DataFrames from context
1262
+ for name in context.list_names():
1263
+ dataset_obj = context.get(name)
1264
+
1265
+ # Debug check
1266
+ # print(f"DEBUG: Registering {name} type={type(dataset_obj)} LazyDataset={LazyDataset}")
1267
+
1268
+ # Handle LazyDataset (DuckDB optimization)
1269
+ # if isinstance(dataset_obj, LazyDataset):
1270
+ # self._register_lazy_view(conn, name, dataset_obj)
1271
+ # # Log that we used DuckDB on file
1272
+ # # logger.info(f"Executing SQL via DuckDB on lazy file: {dataset_obj.path}")
1273
+ # continue
1274
+
1275
+ # Handle chunked data (Iterator)
1276
+ from collections.abc import Iterator
1277
+
1278
+ if isinstance(dataset_obj, Iterator):
1279
+ # Warning: Materializing iterator for SQL execution
1280
+ # Note: DuckDB doesn't support streaming from iterator yet
1281
+ dataset_obj = pd.concat(dataset_obj, ignore_index=True)
1282
+
1283
+ conn.register(name, dataset_obj)
1284
+
1285
+ # Execute query
1286
+ result = conn.execute(sql).df()
1287
+ conn.close()
1288
+
1289
+ return result
1290
+
1291
+ except ImportError:
1292
+ # Fallback: try pandasql
1293
+ try:
1294
+ from pandasql import sqldf
1295
+
1296
+ # Build local namespace with DataFrames
1297
+ locals_dict = {}
1298
+ for name in context.list_names():
1299
+ df = context.get(name)
1300
+
1301
+ # Handle chunked data (Iterator)
1302
+ from collections.abc import Iterator
1303
+
1304
+ if isinstance(df, Iterator):
1305
+ df = pd.concat(df, ignore_index=True)
1306
+
1307
+ locals_dict[name] = df
1308
+
1309
+ return sqldf(sql, locals_dict)
1310
+
1311
+ except ImportError:
1312
+ raise TransformError(
1313
+ "SQL execution requires 'duckdb' or 'pandasql'. "
1314
+ "Install with: pip install duckdb"
1315
+ )
1316
+
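The DuckDB branch above reduces to the following standalone sketch: each DataFrame in the context is registered as a view under its dataset name, the SQL runs against an in-memory connection, and the result comes back via `.df()` (with `pandasql` only as a fallback when `duckdb` is missing).

```python
import duckdb
import pandas as pd

orders = pd.DataFrame({"id": [1, 2], "qty": [5, 70]})

conn = duckdb.connect(":memory:")
conn.register("orders", orders)   # name -> view, as in the registration loop above
result = conn.execute("SELECT SUM(qty) AS total FROM orders").df()
conn.close()
print(result)  # total = 75
```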
1317
+ def execute_operation(
1318
+ self,
1319
+ operation: str,
1320
+ params: Dict[str, Any],
1321
+ df: Union[pd.DataFrame, Iterator[pd.DataFrame]],
1322
+ ) -> pd.DataFrame:
1323
+ """Execute built-in operation.
1324
+
1325
+ Args:
1326
+ operation: Operation name
1327
+ params: Operation parameters
1328
+ df: Input DataFrame or Iterator
1329
+
1330
+ Returns:
1331
+ Result DataFrame
1332
+ """
1333
+ # Materialize LazyDataset
1334
+ df = self.materialize(df)
1335
+
1336
+ # Handle chunked data (Iterator)
1337
+ from collections.abc import Iterator
1338
+
1339
+ if isinstance(df, Iterator):
1340
+ # Warning: Materializing iterator for operation execution
1341
+ df = pd.concat(df, ignore_index=True)
1342
+
1343
+ if operation == "pivot":
1344
+ return self._pivot(df, params)
1345
+ elif operation == "drop_duplicates":
1346
+ return df.drop_duplicates(**params)
1347
+ elif operation == "fillna":
1348
+ return df.fillna(**params)
1349
+ elif operation == "drop":
1350
+ return df.drop(**params)
1351
+ elif operation == "rename":
1352
+ return df.rename(**params)
1353
+ elif operation == "sort":
1354
+ return df.sort_values(**params)
1355
+ elif operation == "sample":
1356
+ return df.sample(**params)
1357
+ else:
1358
+ # Fallback: check if operation is a registered transformer
1359
+ from odibi.context import EngineContext, PandasContext
1360
+ from odibi.registry import FunctionRegistry
1361
+
1362
+ if FunctionRegistry.has_function(operation):
1363
+ func = FunctionRegistry.get_function(operation)
1364
+ param_model = FunctionRegistry.get_param_model(operation)
1365
+
1366
+ # Create EngineContext from current df
1367
+ engine_ctx = EngineContext(
1368
+ context=PandasContext(),
1369
+ df=df,
1370
+ engine=self,
1371
+ engine_type=self.engine_type,
1372
+ )
1373
+
1374
+ # Validate and instantiate params
1375
+ if param_model:
1376
+ validated_params = param_model(**params)
1377
+ result_ctx = func(engine_ctx, validated_params)
1378
+ else:
1379
+ result_ctx = func(engine_ctx, **params)
1380
+
1381
+ return result_ctx.df
1382
+
1383
+ raise ValueError(f"Unsupported operation: {operation}")
1384
+
1385
+ def _pivot(self, df: pd.DataFrame, params: Dict[str, Any]) -> pd.DataFrame:
1386
+ """Execute pivot operation.
1387
+
1388
+ Args:
1389
+ df: Input DataFrame
1390
+ params: Pivot parameters
1391
+
1392
+ Returns:
1393
+ Pivoted DataFrame
1394
+ """
1395
+ group_by = params.get("group_by", [])
1396
+ pivot_column = params["pivot_column"]
1397
+ value_column = params["value_column"]
1398
+ agg_func = params.get("agg_func", "first")
1399
+
1400
+ # Validate columns exist
1401
+ required_columns = set()
1402
+ if isinstance(group_by, list):
1403
+ required_columns.update(group_by)
1404
+ elif isinstance(group_by, str):
1405
+ required_columns.add(group_by)
1406
+ group_by = [group_by]
1407
+
1408
+ required_columns.add(pivot_column)
1409
+ required_columns.add(value_column)
1410
+
1411
+ missing = required_columns - set(df.columns)
1412
+ if missing:
1413
+ raise KeyError(
1414
+ f"Columns not found in DataFrame for pivot operation: {missing}. "
1415
+ f"Available: {list(df.columns)}"
1416
+ )
1417
+
1418
+ result = df.pivot_table(
1419
+ index=group_by, columns=pivot_column, values=value_column, aggfunc=agg_func
1420
+ ).reset_index()
1421
+
1422
+ # Flatten column names if multi-level
1423
+ if isinstance(result.columns, pd.MultiIndex):
1424
+ result.columns = ["_".join(col).strip("_") for col in result.columns.values]
1425
+
1426
+ return result
1427
+
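Usage sketch for the built-in pivot operation: `group_by`, `pivot_column`, and `value_column` must exist in the frame, `agg_func` defaults to `"first"`, and a MultiIndex result would be flattened with underscore joins.

```python
import pandas as pd
from odibi.engine.pandas_engine import PandasEngine

long_df = pd.DataFrame({
    "plant": ["A", "A", "B"],
    "metric": ["temp", "pressure", "temp"],
    "value": [20.0, 1.2, 25.0],
})
wide = PandasEngine().execute_operation(
    "pivot",
    {"group_by": ["plant"], "pivot_column": "metric", "value_column": "value"},
    long_df,
)
print(wide.columns.tolist())  # ['plant', 'pressure', 'temp']
```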
1428
+ def harmonize_schema(
1429
+ self, df: pd.DataFrame, target_schema: Dict[str, str], policy: Any
1430
+ ) -> pd.DataFrame:
1431
+ """Harmonize DataFrame schema with target schema according to policy."""
1432
+ # Ensure materialization
1433
+ df = self.materialize(df)
1434
+
1435
+ from odibi.config import OnMissingColumns, OnNewColumns, SchemaMode
1436
+
1437
+ target_cols = list(target_schema.keys())
1438
+ current_cols = df.columns.tolist()
1439
+
1440
+ missing = set(target_cols) - set(current_cols)
1441
+ new_cols = set(current_cols) - set(target_cols)
1442
+
1443
+ # 1. Check Validations
1444
+ if missing and policy.on_missing_columns == OnMissingColumns.FAIL:
1445
+ raise ValueError(f"Schema Policy Violation: Missing columns {missing}")
1446
+
1447
+ if new_cols and policy.on_new_columns == OnNewColumns.FAIL:
1448
+ raise ValueError(f"Schema Policy Violation: New columns {new_cols}")
1449
+
1450
+ # 2. Apply Transformations
1451
+ if policy.mode == SchemaMode.EVOLVE and policy.on_new_columns == OnNewColumns.ADD_NULLABLE:
1452
+ # Evolve: Add missing columns, Keep new columns
1453
+ for col in missing:
1454
+ df[col] = None
1455
+ else:
1456
+ # Enforce / Ignore New: Project to target schema (Drops new, Adds missing)
1457
+ # Note: reindex adds NaN for missing columns
1458
+ df = df.reindex(columns=target_cols)
1459
+
1460
+ return df
1461
+
1462
+ def anonymize(
1463
+ self, df: Any, columns: List[str], method: str, salt: Optional[str] = None
1464
+ ) -> pd.DataFrame:
1465
+ """Anonymize specified columns."""
1466
+ # Ensure materialization
1467
+ df = self.materialize(df)
1468
+
1469
+ res = df.copy()
1470
+
1471
+ for col in columns:
1472
+ if col not in res.columns:
1473
+ continue
1474
+
1475
+ if method == "hash":
1476
+ # Hash via apply over string values. True vectorization would need
1477
+ # C-level support (e.g. pyarrow.compute); plain pandas apply is the
1478
+ # portable fallback, so only string conversion and nulls are handled here.
1479
+
1480
+ # Convert values to strings before hashing, but skip nulls: a blanket
1481
+ # astype(str) would turn them into the literal strings 'nan'/'None',
1482
+ # whereas they should stay null in the output. Nulls are masked out and
1483
+ # preserved; only the non-null values are hashed below.
1484
+
1485
+ mask_nulls = res[col].isna()
1486
+
1487
+ def _hash_val(val):
1488
+ to_hash = val
1489
+ if salt:
1490
+ to_hash += salt
1491
+ return hashlib.sha256(to_hash.encode("utf-8")).hexdigest()
1492
+
1493
+ # Apply only to non-nulls
1494
+ res.loc[~mask_nulls, col] = res.loc[~mask_nulls, col].astype(str).apply(_hash_val)
1495
+
1496
+ elif method == "mask":
1497
+ # Vectorized Masking
1498
+ # Mask all but last 4 characters
1499
+
1500
+ mask_nulls = res[col].isna()
1501
+ s_valid = res.loc[~mask_nulls, col].astype(str)
1502
+
1503
+ # Use vectorized regex replacement
1504
+ # Replace any character that is followed by 4 characters with '*'
1505
+ res.loc[~mask_nulls, col] = s_valid.str.replace(r".(?=.{4})", "*", regex=True)
1506
+
1507
+ elif method == "redact":
1508
+ res[col] = "[REDACTED]"
1509
+
1510
+ return res
1511
+
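Sketch of the three anonymization methods: `hash` applies (optionally salted) SHA-256 to non-null values only, `mask` keeps just the last four characters, and `redact` replaces the whole column; nulls stay null for `hash` and `mask`.

```python
import pandas as pd
from odibi.engine.pandas_engine import PandasEngine

engine = PandasEngine()
df = pd.DataFrame({"email": ["a@example.com", None], "phone": ["5551234567", "5559876543"]})

hashed = engine.anonymize(df, columns=["email"], method="hash", salt="s3cr3t")
masked = engine.anonymize(df, columns=["phone"], method="mask")

print(hashed["email"].iloc[1])   # null preserved, not hashed
print(masked["phone"].iloc[0])   # ******4567
```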
1512
+ def get_schema(self, df: Any) -> Dict[str, str]:
1513
+ """Get DataFrame schema with types.
1514
+
1515
+ Args:
1516
+ df: DataFrame or LazyDataset
1517
+
1518
+ Returns:
1519
+ Dict[str, str]: Column name -> Type string
1520
+ """
1521
+ if isinstance(df, LazyDataset):
1522
+ if self.use_duckdb:
1523
+ try:
1524
+ import duckdb
1525
+
1526
+ conn = duckdb.connect(":memory:")
1527
+ self._register_lazy_view(conn, "df", df)
1528
+ res = conn.execute("DESCRIBE SELECT * FROM df").df()
1529
+ return dict(zip(res["column_name"], res["column_type"]))
1530
+ except Exception:
1531
+ pass
1532
+ df = self.materialize(df)
1533
+
1534
+ return {col: str(df[col].dtype) for col in df.columns}
1535
+
1536
+ def get_shape(self, df: Any) -> tuple:
1537
+ """Get DataFrame shape.
1538
+
1539
+ Args:
1540
+ df: DataFrame or LazyDataset
1541
+
1542
+ Returns:
1543
+ (rows, columns)
1544
+ """
1545
+ if isinstance(df, LazyDataset):
1546
+ cols = len(self.get_schema(df))
1547
+ rows = self.count_rows(df)
1548
+ return (rows, cols)
1549
+ return df.shape
1550
+
1551
+ def count_rows(self, df: Any) -> int:
1552
+ """Count rows in DataFrame.
1553
+
1554
+ Args:
1555
+ df: DataFrame or LazyDataset
1556
+
1557
+ Returns:
1558
+ Row count
1559
+ """
1560
+ if isinstance(df, LazyDataset):
1561
+ if self.use_duckdb:
1562
+ try:
1563
+ import duckdb
1564
+
1565
+ conn = duckdb.connect(":memory:")
1566
+ self._register_lazy_view(conn, "df", df)
1567
+ res = conn.execute("SELECT count(*) FROM df").fetchone()
1568
+ return res[0] if res else 0
1569
+ except Exception:
1570
+ pass
1571
+ df = self.materialize(df)
1572
+
1573
+ return len(df)
1574
+
1575
+ def count_nulls(self, df: pd.DataFrame, columns: List[str]) -> Dict[str, int]:
1576
+ """Count nulls in specified columns.
1577
+
1578
+ Args:
1579
+ df: DataFrame
1580
+ columns: Columns to check
1581
+
1582
+ Returns:
1583
+ Dictionary of column -> null count
1584
+ """
1585
+ null_counts = {}
1586
+ for col in columns:
1587
+ if col in df.columns:
1588
+ null_counts[col] = int(df[col].isna().sum())
1589
+ else:
1590
+ raise ValueError(
1591
+ f"Column '{col}' not found in DataFrame. "
1592
+ f"Available columns: {list(df.columns)}"
1593
+ )
1594
+ return null_counts
1595
+
1596
+ def validate_schema(self, df: pd.DataFrame, schema_rules: Dict[str, Any]) -> List[str]:
1597
+ """Validate DataFrame schema.
1598
+
1599
+ Args:
1600
+ df: DataFrame
1601
+ schema_rules: Validation rules
1602
+
1603
+ Returns:
1604
+ List of validation failures
1605
+ """
1606
+ # Ensure materialization
1607
+ df = self.materialize(df)
1608
+
1609
+ failures = []
1610
+
1611
+ # Check required columns
1612
+ if "required_columns" in schema_rules:
1613
+ required = schema_rules["required_columns"]
1614
+ missing = set(required) - set(df.columns)
1615
+ if missing:
1616
+ failures.append(f"Missing required columns: {', '.join(missing)}")
1617
+
1618
+ # Check column types
1619
+ if "types" in schema_rules:
1620
+ type_map = {
1621
+ "int": ["int64", "int32", "int16", "int8"],
1622
+ "float": ["float64", "float32"],
1623
+ "str": ["object", "string"],
1624
+ "bool": ["bool"],
1625
+ }
1626
+
1627
+ for col, expected_type in schema_rules["types"].items():
1628
+ if col not in df.columns:
1629
+ failures.append(f"Column '{col}' not found for type validation")
1630
+ continue
1631
+
1632
+ actual_type = str(df[col].dtype)
1633
+ # Handle pyarrow types (e.g. int64[pyarrow])
1634
+ if "[" in actual_type and "pyarrow" in actual_type:
1635
+ actual_type = actual_type.split("[")[0]
1636
+
1637
+ expected_dtypes = type_map.get(expected_type, [expected_type])
1638
+
1639
+ if actual_type not in expected_dtypes:
1640
+ failures.append(
1641
+ f"Column '{col}' has type '{actual_type}', expected '{expected_type}'"
1642
+ )
1643
+
1644
+ return failures
1645
+
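The expected shape of schema_rules follows directly from the checks above: a required_columns list and a types mapping using the short names int/float/str/bool. A hedged example of a rules dict and the failures it would produce (column names are illustrative):

    import pandas as pd

    df = pd.DataFrame({"id": [1, 2, 3], "amount": ["10.5", "20.0", "oops"]})

    schema_rules = {
        "required_columns": ["id", "amount", "customer_id"],
        "types": {"id": "int", "amount": "float"},
    }

    # Passed to validate_schema, this would report roughly:
    #   "Missing required columns: customer_id"
    #   "Column 'amount' has type 'object', expected 'float'"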
1646
+ def _infer_avro_schema(self, df: pd.DataFrame) -> Dict[str, Any]:
1647
+ """Infer Avro schema from pandas DataFrame.
1648
+
1649
+ Args:
1650
+ df: DataFrame to infer schema from
1651
+
1652
+ Returns:
1653
+ Avro schema dictionary
1654
+ """
1655
+ type_mapping = {
1656
+ "int64": "long",
1657
+ "int32": "int",
1658
+ "float64": "double",
1659
+ "float32": "float",
1660
+ "bool": "boolean",
1661
+ "object": "string",
1662
+ "string": "string",
1663
+ }
1664
+
1665
+ fields = []
1666
+ for col in df.columns:
1667
+ dtype = df[col].dtype
1668
+ dtype_str = str(dtype)
1669
+
1670
+ # Handle datetime types with Avro logical types
1671
+ if pd.api.types.is_datetime64_any_dtype(dtype):
1672
+ avro_type = {
1673
+ "type": "long",
1674
+ "logicalType": "timestamp-micros",
1675
+ }
1676
+ elif dtype_str == "date" or (hasattr(dtype, "name") and "date" in dtype.name.lower()):
1677
+ avro_type = {
1678
+ "type": "int",
1679
+ "logicalType": "date",
1680
+ }
1681
+ elif pd.api.types.is_timedelta64_dtype(dtype):
1682
+ avro_type = {
1683
+ "type": "long",
1684
+ "logicalType": "time-micros",
1685
+ }
1686
+ else:
1687
+ avro_type = type_mapping.get(dtype_str, "string")
1688
+
1689
+ # Handle nullable columns
1690
+ if df[col].isnull().any():
1691
+ avro_type = ["null", avro_type]
1692
+
1693
+ fields.append({"name": col, "type": avro_type})
1694
+
1695
+ return {"type": "record", "name": "DataFrame", "fields": fields}
1696
+
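For reference, a small frame pushed through the mapping above produces an Avro record like the one sketched in the comments below: int64 maps to long, a float column containing nulls becomes a union with null, and datetimes become long with the timestamp-micros logical type.

    import pandas as pd

    df = pd.DataFrame({
        "id": [1, 2],
        "score": [0.5, None],
        "created": pd.to_datetime(["2024-01-01", "2024-01-02"]),
    })

    # Expected result of _infer_avro_schema(df):
    # {"type": "record", "name": "DataFrame", "fields": [
    #     {"name": "id", "type": "long"},
    #     {"name": "score", "type": ["null", "double"]},
    #     {"name": "created", "type": {"type": "long", "logicalType": "timestamp-micros"}},
    # ]}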
1697
+ def validate_data(self, df: pd.DataFrame, validation_config: Any) -> List[str]:
1698
+ """Validate DataFrame against rules.
1699
+
1700
+ Args:
1701
+ df: DataFrame
1702
+ validation_config: ValidationConfig object
1703
+
1704
+ Returns:
1705
+ List of validation failure messages
1706
+ """
1707
+ # Ensure materialization
1708
+ df = self.materialize(df)
1709
+
1710
+ failures = []
1711
+
1712
+ # Check not empty
1713
+ if validation_config.not_empty:
1714
+ if len(df) == 0:
1715
+ failures.append("DataFrame is empty")
1716
+
1717
+ # Check for nulls in specified columns
1718
+ if validation_config.no_nulls:
1719
+ null_counts = self.count_nulls(df, validation_config.no_nulls)
1720
+ for col, count in null_counts.items():
1721
+ if count > 0:
1722
+ failures.append(f"Column '{col}' has {count} null values")
1723
+
1724
+ # Schema validation
1725
+ if validation_config.schema_validation:
1726
+ schema_failures = self.validate_schema(df, validation_config.schema_validation)
1727
+ failures.extend(schema_failures)
1728
+
1729
+ # Range validation
1730
+ if validation_config.ranges:
1731
+ for col, bounds in validation_config.ranges.items():
1732
+ if col in df.columns:
1733
+ min_val = bounds.get("min")
1734
+ max_val = bounds.get("max")
1735
+
1736
+ if min_val is not None:
1737
+ min_violations = df[df[col] < min_val]
1738
+ if len(min_violations) > 0:
1739
+ failures.append(f"Column '{col}' has values < {min_val}")
1740
+
1741
+ if max_val is not None:
1742
+ max_violations = df[df[col] > max_val]
1743
+ if len(max_violations) > 0:
1744
+ failures.append(f"Column '{col}' has values > {max_val}")
1745
+ else:
1746
+ failures.append(f"Column '{col}' not found for range validation")
1747
+
1748
+ # Allowed values validation
1749
+ if validation_config.allowed_values:
1750
+ for col, allowed in validation_config.allowed_values.items():
1751
+ if col in df.columns:
1752
+ # Check for values not in allowed list
1753
+ invalid = df[~df[col].isin(allowed)]
1754
+ if len(invalid) > 0:
1755
+ failures.append(f"Column '{col}' has invalid values")
1756
+ else:
1757
+ failures.append(f"Column '{col}' not found for allowed values validation")
1758
+
1759
+ return failures
1760
+
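The ValidationConfig class itself is defined elsewhere in the package; the sketch below only stands in for the five attributes this method reads (not_empty, no_nulls, schema_validation, ranges, allowed_values) using a SimpleNamespace, purely to illustrate the expected shape:

    from types import SimpleNamespace
    import pandas as pd

    df = pd.DataFrame({"id": [1, 2, None], "status": ["open", "closed", "archived"]})

    cfg = SimpleNamespace(
        not_empty=True,
        no_nulls=["id"],
        schema_validation=None,
        ranges={"id": {"min": 1}},
        allowed_values={"status": ["open", "closed"]},
    )

    # validate_data(df, cfg) would report roughly:
    #   "Column 'id' has 1 null values"
    #   "Column 'status' has invalid values"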
1761
+ def get_sample(self, df: Any, n: int = 10) -> List[Dict[str, Any]]:
1762
+ """Get sample rows as list of dictionaries.
1763
+
1764
+ Args:
1765
+ df: DataFrame or LazyDataset
1766
+ n: Number of rows to return
1767
+
1768
+ Returns:
1769
+ List of row dictionaries
1770
+ """
1771
+ if isinstance(df, LazyDataset):
1772
+ if self.use_duckdb:
1773
+ try:
1774
+ import duckdb
1775
+
1776
+ conn = duckdb.connect(":memory:")
1777
+ self._register_lazy_view(conn, "df", df)
1778
+ res_df = conn.execute(f"SELECT * FROM df LIMIT {n}").df()
1779
+ return res_df.to_dict("records")
1780
+ except Exception:
1781
+ pass
1782
+ df = self.materialize(df)
1783
+
1784
+ return df.head(n).to_dict("records")
1785
+
1786
+ def table_exists(
1787
+ self, connection: Any, table: Optional[str] = None, path: Optional[str] = None
1788
+ ) -> bool:
1789
+ """Check if table or location exists.
1790
+
1791
+ Args:
1792
+ connection: Connection object
1793
+ table: Table name (not used in Pandas—no catalog)
1794
+ path: File path
1795
+
1796
+ Returns:
1797
+ True if file/directory exists, False otherwise
1798
+ """
1799
+ if path:
1800
+ full_path = connection.get_path(path)
1801
+ return os.path.exists(full_path)
1802
+ return False
1803
+
1804
+ def get_table_schema(
1805
+ self,
1806
+ connection: Any,
1807
+ table: Optional[str] = None,
1808
+ path: Optional[str] = None,
1809
+ format: Optional[str] = None,
1810
+ ) -> Optional[Dict[str, str]]:
1811
+ """Get schema of an existing table/file."""
1812
+ try:
1813
+ if table and format in ["sql", "sql_server", "azure_sql"]:
1814
+ # SQL Server: Read empty result
1815
+ query = f"SELECT TOP 0 * FROM {table}"
1816
+ df = connection.read_sql(query)
1817
+ return self.get_schema(df)
1818
+
1819
+ if path:
1820
+ full_path = connection.get_path(path)
1821
+ if not os.path.exists(full_path):
1822
+ return None
1823
+
1824
+ if format == "delta":
1825
+ from deltalake import DeltaTable
1826
+
1827
+ dt = DeltaTable(full_path)
1828
+ # Use pyarrow schema to pandas schema to avoid reading data
1829
+ arrow_schema = dt.schema().to_pyarrow()
1830
+ empty_df = arrow_schema.empty_table().to_pandas()
1831
+ return self.get_schema(empty_df)
1832
+
1833
+ elif format == "parquet":
1834
+ import pyarrow.parquet as pq
1835
+
1836
+ target_path = full_path
1837
+ if os.path.isdir(full_path):
1838
+ # Find first parquet file
1839
+ files = glob.glob(os.path.join(full_path, "*.parquet"))
1840
+ if not files:
1841
+ return None
1842
+ target_path = files[0]
1843
+
1844
+ schema = pq.read_schema(target_path)
1845
+ empty_df = schema.empty_table().to_pandas()
1846
+ return self.get_schema(empty_df)
1847
+
1848
+ elif format == "csv":
1849
+ df = pd.read_csv(full_path, nrows=0)
1850
+ return self.get_schema(df)
1851
+
1852
+ except (FileNotFoundError, PermissionError):
1853
+ return None
1854
+ except ImportError as e:
1855
+ # Log missing optional dependency
1856
+ import logging
1857
+
1858
+ logging.getLogger(__name__).warning(
1859
+ f"Could not infer schema due to missing dependency: {e}"
1860
+ )
1861
+ return None
1862
+ except Exception as e:
1863
+ import logging
1864
+
1865
+ logging.getLogger(__name__).warning(f"Failed to infer schema for {table or path}: {e}")
1866
+ return None
1867
+ return None
1868
+
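The Parquet branch above reads only footer metadata rather than any data; a standalone sketch of that trick (the path is illustrative):

    import pyarrow.parquet as pq

    schema = pq.read_schema("data/orders.parquet")   # footer metadata only, no row groups read
    empty_df = schema.empty_table().to_pandas()      # zero-row frame with the right dtypes
    print({col: str(empty_df[col].dtype) for col in empty_df.columns})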
1869
+ def vacuum_delta(
1870
+ self,
1871
+ connection: Any,
1872
+ path: str,
1873
+ retention_hours: int = 168,
1874
+ dry_run: bool = False,
1875
+ enforce_retention_duration: bool = True,
1876
+ ) -> Dict[str, Any]:
1877
+ """VACUUM a Delta table to remove old files.
1878
+
1879
+ Args:
1880
+ connection: Connection object
1881
+ path: Delta table path
1882
+ retention_hours: Retention period (default 168 = 7 days)
1883
+ dry_run: If True, only show files to be deleted
1884
+ enforce_retention_duration: If False, allows retention < 168 hours (testing only)
1885
+
1886
+ Returns:
1887
+ Dictionary with files_deleted count
1888
+ """
1889
+ ctx = get_logging_context().with_context(engine="pandas")
1890
+ start = time.time()
1891
+
1892
+ ctx.debug(
1893
+ "Starting Delta VACUUM",
1894
+ path=path,
1895
+ retention_hours=retention_hours,
1896
+ dry_run=dry_run,
1897
+ )
1898
+
1899
+ try:
1900
+ from deltalake import DeltaTable
1901
+ except ImportError:
1902
+ ctx.error("Delta Lake library not installed", path=path)
1903
+ raise ImportError(
1904
+ "Delta Lake support requires 'pip install odibi[pandas]' "
1905
+ "or 'pip install deltalake'. See README.md for installation instructions."
1906
+ )
1907
+
1908
+ full_path = connection.get_path(path)
1909
+
1910
+ storage_opts = {}
1911
+ if hasattr(connection, "pandas_storage_options"):
1912
+ storage_opts = connection.pandas_storage_options()
1913
+
1914
+ dt = DeltaTable(full_path, storage_options=storage_opts)
1915
+ deleted_files = dt.vacuum(
1916
+ retention_hours=retention_hours,
1917
+ dry_run=dry_run,
1918
+ enforce_retention_duration=enforce_retention_duration,
1919
+ )
1920
+
1921
+ elapsed = (time.time() - start) * 1000
1922
+ ctx.info(
1923
+ "Delta VACUUM completed",
1924
+ path=str(full_path),
1925
+ files_deleted=len(deleted_files),
1926
+ dry_run=dry_run,
1927
+ elapsed_ms=round(elapsed, 2),
1928
+ )
1929
+
1930
+ return {"files_deleted": len(deleted_files)}
1931
+
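The same VACUUM can be previewed directly with deltalake before letting the engine delete anything; dry_run=True only lists candidate files (the table path is illustrative):

    from deltalake import DeltaTable

    dt = DeltaTable("data/events_delta")
    candidates = dt.vacuum(retention_hours=168, dry_run=True, enforce_retention_duration=True)
    print(f"{len(candidates)} files would be deleted")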
1932
+ def get_delta_history(
1933
+ self, connection: Any, path: str, limit: Optional[int] = None
1934
+ ) -> List[Dict[str, Any]]:
1935
+ """Get Delta table history.
1936
+
1937
+ Args:
1938
+ connection: Connection object
1939
+ path: Delta table path
1940
+ limit: Maximum number of versions to return
1941
+
1942
+ Returns:
1943
+ List of version metadata dictionaries
1944
+ """
1945
+ ctx = get_logging_context().with_context(engine="pandas")
1946
+ start = time.time()
1947
+
1948
+ ctx.debug("Getting Delta table history", path=path, limit=limit)
1949
+
1950
+ try:
1951
+ from deltalake import DeltaTable
1952
+ except ImportError:
1953
+ ctx.error("Delta Lake library not installed", path=path)
1954
+ raise ImportError(
1955
+ "Delta Lake support requires 'pip install odibi[pandas]' "
1956
+ "or 'pip install deltalake'. See README.md for installation instructions."
1957
+ )
1958
+
1959
+ full_path = connection.get_path(path)
1960
+
1961
+ storage_opts = {}
1962
+ if hasattr(connection, "pandas_storage_options"):
1963
+ storage_opts = connection.pandas_storage_options()
1964
+
1965
+ dt = DeltaTable(full_path, storage_options=storage_opts)
1966
+ history = dt.history(limit=limit)
1967
+
1968
+ elapsed = (time.time() - start) * 1000
1969
+ ctx.info(
1970
+ "Delta history retrieved",
1971
+ path=str(full_path),
1972
+ versions_returned=len(history) if history else 0,
1973
+ elapsed_ms=round(elapsed, 2),
1974
+ )
1975
+
1976
+ return history
1977
+
1978
+ def restore_delta(self, connection: Any, path: str, version: int) -> None:
1979
+ """Restore Delta table to a specific version.
1980
+
1981
+ Args:
1982
+ connection: Connection object
1983
+ path: Delta table path
1984
+ version: Version number to restore to
1985
+ """
1986
+ ctx = get_logging_context().with_context(engine="pandas")
1987
+ start = time.time()
1988
+
1989
+ ctx.info("Starting Delta table restore", path=path, target_version=version)
1990
+
1991
+ try:
1992
+ from deltalake import DeltaTable
1993
+ except ImportError:
1994
+ ctx.error("Delta Lake library not installed", path=path)
1995
+ raise ImportError(
1996
+ "Delta Lake support requires 'pip install odibi[pandas]' "
1997
+ "or 'pip install deltalake'. See README.md for installation instructions."
1998
+ )
1999
+
2000
+ full_path = connection.get_path(path)
2001
+
2002
+ storage_opts = {}
2003
+ if hasattr(connection, "pandas_storage_options"):
2004
+ storage_opts = connection.pandas_storage_options()
2005
+
2006
+ dt = DeltaTable(full_path, storage_options=storage_opts)
2007
+ dt.restore(version)
2008
+
2009
+ elapsed = (time.time() - start) * 1000
2010
+ ctx.info(
2011
+ "Delta table restored",
2012
+ path=str(full_path),
2013
+ restored_to_version=version,
2014
+ elapsed_ms=round(elapsed, 2),
2015
+ )
2016
+
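get_delta_history and restore_delta are thin wrappers over deltalake's time-travel API; a minimal sketch (path and target version are illustrative):

    from deltalake import DeltaTable

    dt = DeltaTable("data/events_delta")
    for entry in dt.history(limit=5):
        # each entry is a dict with keys such as "version", "timestamp" and "operation"
        print(entry.get("version"), entry.get("operation"))

    dt.restore(3)   # roll the table back to version 3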
2017
+ def maintain_table(
2018
+ self,
2019
+ connection: Any,
2020
+ format: str,
2021
+ table: Optional[str] = None,
2022
+ path: Optional[str] = None,
2023
+ config: Optional[Any] = None,
2024
+ ) -> None:
2025
+ """Run table maintenance operations (optimize, vacuum)."""
2026
+ ctx = get_logging_context().with_context(engine="pandas")
2027
+
2028
+ if format != "delta" or not config or not config.enabled:
2029
+ return
2030
+
2031
+ if not path and not table:
2032
+ return
2033
+
2034
+ full_path = connection.get_path(path if path else table)
2035
+ start = time.time()
2036
+
2037
+ ctx.info("Starting table maintenance", path=str(full_path))
2038
+
2039
+ try:
2040
+ from deltalake import DeltaTable
2041
+ except ImportError:
2042
+ ctx.warning(
2043
+ "Auto-optimize skipped: 'deltalake' library not installed",
2044
+ path=str(full_path),
2045
+ )
2046
+ return
2047
+
2048
+ try:
2049
+ storage_opts = {}
2050
+ if hasattr(connection, "pandas_storage_options"):
2051
+ storage_opts = connection.pandas_storage_options()
2052
+
2053
+ dt = DeltaTable(full_path, storage_options=storage_opts)
2054
+
2055
+ ctx.info("Running Delta OPTIMIZE (compaction)", path=str(full_path))
2056
+ dt.optimize.compact()
2057
+
2058
+ retention = config.vacuum_retention_hours
2059
+ if retention is not None and retention > 0:
2060
+ ctx.info(
2061
+ "Running Delta VACUUM",
2062
+ path=str(full_path),
2063
+ retention_hours=retention,
2064
+ )
2065
+ dt.vacuum(
2066
+ retention_hours=retention,
2067
+ enforce_retention_duration=True,
2068
+ dry_run=False,
2069
+ )
2070
+
2071
+ elapsed = (time.time() - start) * 1000
2072
+ ctx.info(
2073
+ "Table maintenance completed",
2074
+ path=str(full_path),
2075
+ elapsed_ms=round(elapsed, 2),
2076
+ )
2077
+
2078
+ except Exception as e:
2079
+ ctx.warning(
2080
+ "Auto-optimize failed",
2081
+ path=str(full_path),
2082
+ error=str(e),
2083
+ )
2084
+
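The compaction step in maintain_table maps to deltalake's optimize API; a short sketch of that call on its own (path illustrative), with VACUUM omitted since it is shown above:

    from deltalake import DeltaTable

    dt = DeltaTable("data/events_delta")
    metrics = dt.optimize.compact()   # small-file compaction; metrics report files added/removed
    print(metrics)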
2085
+ def get_source_files(self, df: Any) -> List[str]:
2086
+ """Get list of source files that generated this DataFrame.
2087
+
2088
+ Args:
2089
+ df: DataFrame or LazyDataset
2090
+
2091
+ Returns:
2092
+ List of file paths
2093
+ """
2094
+ if isinstance(df, LazyDataset):
2095
+ if isinstance(df.path, list):
2096
+ return df.path
2097
+ return [str(df.path)]
2098
+
2099
+ if hasattr(df, "attrs"):
2100
+ return df.attrs.get("odibi_source_files", [])
2101
+ return []
2102
+
2103
+ def profile_nulls(self, df: pd.DataFrame) -> Dict[str, float]:
2104
+ """Calculate null percentage for each column.
2105
+
2106
+ Args:
2107
+ df: DataFrame
2108
+
2109
+ Returns:
2110
+ Dictionary of {column_name: null_fraction} with values in [0.0, 1.0]
2111
+ """
2112
+ # Ensure materialization
2113
+ df = self.materialize(df)
2114
+
2115
+ # mean() of a boolean DataFrame gives the per-column fraction of True values (0.0-1.0)
2116
+ return df.isna().mean().to_dict()
2117
+
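The one-liner above works because the mean of a boolean mask is the fraction of True values, for example:

    import pandas as pd

    df = pd.DataFrame({"a": [1, None, 3, None], "b": ["x", "y", None, "z"]})
    print(df.isna().mean().to_dict())
    # {'a': 0.5, 'b': 0.25}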
2118
+ def filter_greater_than(self, df: pd.DataFrame, column: str, value: Any) -> pd.DataFrame:
2119
+ """Filter DataFrame where column > value.
2120
+
2121
+ Automatically casts string columns to datetime for proper comparison.
2122
+ """
2123
+ if column not in df.columns:
2124
+ raise ValueError(f"Column '{column}' not found in DataFrame")
2125
+
2126
+ try:
2127
+ col_series = df[column]
2128
+
2129
+ if pd.api.types.is_string_dtype(col_series):
2130
+ col_series = pd.to_datetime(col_series, errors="coerce")
2131
+ elif pd.api.types.is_datetime64_any_dtype(col_series) and isinstance(value, str):
2132
+ value = pd.to_datetime(value)
2133
+
2134
+ return df[col_series > value]
2135
+ except Exception as e:
2136
+ raise ValueError(f"Failed to filter {column} > {value}: {e}")
2137
+
2138
+ def filter_coalesce(
2139
+ self, df: pd.DataFrame, col1: str, col2: str, op: str, value: Any
2140
+ ) -> pd.DataFrame:
2141
+ """Filter using COALESCE(col1, col2) op value.
2142
+
2143
+ Automatically casts string columns to datetime for proper comparison.
2144
+ """
2145
+ if col1 not in df.columns:
2146
+ raise ValueError(f"Column '{col1}' not found")
2147
+
2148
+ def _to_datetime_if_string(series: pd.Series) -> pd.Series:
2149
+ if pd.api.types.is_string_dtype(series):
2150
+ return pd.to_datetime(series, errors="coerce")
2151
+ return series
2152
+
2153
+ s1 = _to_datetime_if_string(df[col1])
2154
+
2155
+ if col2 not in df.columns:
2156
+ s = s1
2157
+ else:
2158
+ s2 = _to_datetime_if_string(df[col2])
2159
+ s = s1.combine_first(s2)
2160
+
2161
+ try:
2162
+ if pd.api.types.is_datetime64_any_dtype(s) and isinstance(value, str):
2163
+ value = pd.to_datetime(value)
2164
+
2165
+ if op == ">=":
2166
+ return df[s >= value]
2167
+ elif op == ">":
2168
+ return df[s > value]
2169
+ elif op == "<=":
2170
+ return df[s <= value]
2171
+ elif op == "<":
2172
+ return df[s < value]
2173
+ elif op == "==" or op == "=":
2174
+ return df[s == value]
2175
+ else:
2176
+ raise ValueError(f"Unsupported operator: {op}")
2177
+ except Exception as e:
2178
+ raise ValueError(f"Failed to filter COALESCE({col1}, {col2}) {op} {value}: {e}")