additory-0.1.0a3-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/core/backends/arrow_bridge.py
@@ -1,483 +0,0 @@
-# enhanced_arrow_bridge.py
-# Universal Arrow bridge for cross-backend dataframe compatibility
-
-import os
-import gc
-import psutil
-from typing import Any, Dict, Optional, Tuple, Union
-from dataclasses import dataclass
-from datetime import datetime
-
-try:
-    import pyarrow as pa
-    import polars as pl
-    import pandas as pd
-    ARROW_AVAILABLE = True
-except ImportError as e:
-    ARROW_AVAILABLE = False
-    IMPORT_ERROR = str(e)
-    # Create dummy classes for type annotations
-    class pa:
-        Table = Any
-    class pl:
-        DataFrame = Any
-    class pd:
-        DataFrame = Any
-
-from ..logging import log_info, log_warning
-from .cudf_bridge import get_cudf_bridge
-
-
-@dataclass
-class ConversionStats:
-    """Statistics for Arrow bridge operations"""
-    conversions: int = 0
-    memory_used_mb: float = 0.0
-    total_rows_processed: int = 0
-    total_columns_processed: int = 0
-    conversion_time_ms: float = 0.0
-    cleanup_count: int = 0
-
-
-class ArrowBridgeError(Exception):
-    """Raised when Arrow bridge operations fail"""
-    pass
-
-
-class EnhancedArrowBridge:
-    """Universal data bridge using Apache Arrow for cross-backend compatibility"""
-
-    def __init__(self):
-        if not ARROW_AVAILABLE:
-            raise ArrowBridgeError(f"Arrow dependencies not available: {IMPORT_ERROR}")
-
-        # Memory management
-        self.memory_pool = pa.default_memory_pool()
-        self.memory_threshold_mb = 100  # Cleanup threshold
-
-        # Statistics tracking
-        self.stats = ConversionStats()
-
-        # Supported backends (order matters - check cuDF before pandas)
-        self.supported_backends = {
-            "cudf": self._detect_cudf,
-            "polars": self._detect_polars,
-            "pandas": self._detect_pandas
-        }
-
-        log_info("[arrow_bridge] Enhanced Arrow Bridge initialized")
-
-    def detect_backend(self, df: Any) -> str:
-        """
-        Automatically detect dataframe backend
-
-        Args:
-            df: Input dataframe of unknown type
-
-        Returns:
-            Backend name ("pandas", "polars", "cudf")
-
-        Raises:
-            ArrowBridgeError: If backend cannot be detected
-        """
-        for backend_name, detector in self.supported_backends.items():
-            if detector(df):
-                log_info(f"[arrow_bridge] Detected backend: {backend_name}")
-                return backend_name
-
-        # Try to get type information for better error message
-        df_type = type(df).__name__
-        df_module = getattr(type(df), '__module__', 'unknown')
-
-        raise ArrowBridgeError(
-            f"Unsupported dataframe type: {df_type} from module {df_module}. "
-            f"Supported backends: {list(self.supported_backends.keys())}"
-        )
-
-    def _detect_pandas(self, df: Any) -> bool:
-        """Detect if dataframe is pandas (but not cuDF)"""
-        # First check if it's cuDF (which also has pandas-like attributes)
-        if self._detect_cudf(df):
-            return False
-        # Then check for pandas attributes
-        return hasattr(df, 'iloc') and hasattr(df, 'loc') and hasattr(df, 'dtypes')
-
-    def _detect_polars(self, df: Any) -> bool:
-        """Detect if dataframe is Polars"""
-        return (hasattr(df, 'lazy') and hasattr(df, 'collect') and hasattr(df, 'schema')) or \
-               str(type(df)).find('polars') != -1
-
-    def _detect_cudf(self, df: Any) -> bool:
-        """Detect if dataframe is cuDF using enhanced detection"""
-        cudf_bridge = get_cudf_bridge()
-        return cudf_bridge.detect_cudf(df)
-
-    def to_arrow(self, df: Any, backend_type: Optional[str] = None) -> pa.Table:
-        """
-        Convert any dataframe to Arrow table
-
-        Args:
-            df: Input dataframe
-            backend_type: Backend type (auto-detected if None)
-
-        Returns:
-            PyArrow Table
-
-        Raises:
-            ArrowBridgeError: If conversion fails
-        """
-        start_time = datetime.now()
-
-        try:
-            # Auto-detect backend if not specified
-            if backend_type is None:
-                backend_type = self.detect_backend(df)
-
-            # Get memory usage before conversion
-            memory_before = self._get_memory_usage_mb()
-
-            # Convert based on backend
-            if backend_type == "pandas":
-                arrow_table = self._pandas_to_arrow(df)
-            elif backend_type == "polars":
-                arrow_table = self._polars_to_arrow(df)
-            elif backend_type == "cudf":
-                arrow_table = self._cudf_to_arrow(df)
-            else:
-                raise ArrowBridgeError(f"Unsupported backend for conversion: {backend_type}")
-
-            # Update statistics
-            memory_after = self._get_memory_usage_mb()
-            conversion_time = (datetime.now() - start_time).total_seconds() * 1000
-
-            self.stats.conversions += 1
-            self.stats.memory_used_mb += max(0, memory_after - memory_before)
-            self.stats.total_rows_processed += arrow_table.num_rows
-            self.stats.total_columns_processed += arrow_table.num_columns
-            self.stats.conversion_time_ms += conversion_time
-
-            log_info(f"[arrow_bridge] Converted {backend_type} to Arrow: "
-                     f"{arrow_table.num_rows} rows, {arrow_table.num_columns} columns "
-                     f"({conversion_time:.1f}ms)")
-
-            return arrow_table
-
-        except Exception as e:
-            raise ArrowBridgeError(f"Failed to convert {backend_type} to Arrow: {e}")
-
-    def from_arrow(self, arrow_table: pa.Table, target_backend: str) -> Any:
-        """
-        Convert Arrow table back to target dataframe format
-
-        Args:
-            arrow_table: PyArrow Table
-            target_backend: Target backend ("pandas", "polars", "cudf")
-
-        Returns:
-            Dataframe in target format
-
-        Raises:
-            ArrowBridgeError: If conversion fails
-        """
-        start_time = datetime.now()
-
-        try:
-            # Convert based on target backend
-            if target_backend == "pandas":
-                result_df = self._arrow_to_pandas(arrow_table)
-            elif target_backend == "polars":
-                result_df = self._arrow_to_polars(arrow_table)
-            elif target_backend == "cudf":
-                result_df = self._arrow_to_cudf(arrow_table)
-            else:
-                raise ArrowBridgeError(f"Unsupported target backend: {target_backend}")
-
-            conversion_time = (datetime.now() - start_time).total_seconds() * 1000
-
-            log_info(f"[arrow_bridge] Converted Arrow to {target_backend}: "
-                     f"{arrow_table.num_rows} rows, {arrow_table.num_columns} columns "
-                     f"({conversion_time:.1f}ms)")
-
-            return result_df
-
-        except Exception as e:
-            raise ArrowBridgeError(f"Failed to convert Arrow to {target_backend}: {e}")
-
-    def _pandas_to_arrow(self, df: pd.DataFrame) -> pa.Table:
-        """Convert pandas DataFrame to Arrow table"""
-        try:
-            # Use zero-copy conversion when possible
-            return pa.Table.from_pandas(df, preserve_index=False)
-        except Exception as e:
-            # Fallback: try with schema inference
-            try:
-                schema = pa.Schema.from_pandas(df)
-                return pa.Table.from_pandas(df, schema=schema, preserve_index=False)
-            except Exception as e2:
-                raise ArrowBridgeError(f"Pandas to Arrow conversion failed: {e2}")
-
-    def _polars_to_arrow(self, df: pl.DataFrame) -> pa.Table:
-        """Convert Polars DataFrame to Arrow table"""
-        try:
-            return df.to_arrow()
-        except Exception as e:
-            raise ArrowBridgeError(f"Polars to Arrow conversion failed: {e}")
-
-    def _cudf_to_arrow(self, df: Any) -> pa.Table:
-        """Convert cuDF DataFrame to Arrow table using enhanced bridge"""
-        cudf_bridge = get_cudf_bridge()
-        return cudf_bridge.cudf_to_arrow(df)
-
-    def _arrow_to_pandas(self, arrow_table: pa.Table) -> pd.DataFrame:
-        """Convert Arrow table to pandas DataFrame"""
-        try:
-            return arrow_table.to_pandas()
-        except Exception as e:
-            raise ArrowBridgeError(f"Arrow to pandas conversion failed: {e}")
-
-    def _arrow_to_polars(self, arrow_table: pa.Table) -> pl.DataFrame:
-        """Convert Arrow table to Polars DataFrame"""
-        try:
-            return pl.from_arrow(arrow_table)
-        except Exception as e:
-            raise ArrowBridgeError(f"Arrow to Polars conversion failed: {e}")
-
-    def _arrow_to_cudf(self, arrow_table: pa.Table) -> Any:
-        """Convert Arrow table to cuDF DataFrame using enhanced bridge"""
-        cudf_bridge = get_cudf_bridge()
-        return cudf_bridge.arrow_to_cudf(arrow_table)
-
-    def cleanup_arrow_memory(self) -> Dict[str, float]:
-        """
-        Clean up Arrow heap memory
-
-        Returns:
-            Dictionary with cleanup statistics
-        """
-        # Get memory usage before cleanup
-        memory_before = self._get_memory_usage_mb()
-        arrow_allocated_before = self.memory_pool.bytes_allocated()
-
-        # Force garbage collection
-        gc.collect()
-
-        # Additional Arrow-specific cleanup
-        try:
-            # Clear any cached Arrow data
-            if hasattr(pa, 'jemalloc_memory_pool'):
-                # Use jemalloc pool if available for better memory management
-                pass
-        except Exception:
-            pass
-
-        # Get memory usage after cleanup
-        memory_after = self._get_memory_usage_mb()
-        arrow_allocated_after = self.memory_pool.bytes_allocated()
-
-        # Calculate cleanup statistics
-        memory_freed_mb = max(0, memory_before - memory_after)
-        arrow_freed_bytes = max(0, arrow_allocated_before - arrow_allocated_after)
-
-        self.stats.cleanup_count += 1
-
-        cleanup_stats = {
-            "memory_freed_mb": memory_freed_mb,
-            "arrow_freed_bytes": arrow_freed_bytes,
-            "memory_before_mb": memory_before,
-            "memory_after_mb": memory_after,
-            "cleanup_count": self.stats.cleanup_count
-        }
-
-        if memory_freed_mb > 0 or arrow_freed_bytes > 0:
-            log_info(f"[arrow_bridge] Memory cleanup: {memory_freed_mb:.1f}MB system, "
-                     f"{arrow_freed_bytes} bytes Arrow heap freed")
-
-        return cleanup_stats
-
-    def _get_memory_usage_mb(self) -> float:
-        """Get current memory usage in MB"""
-        try:
-            process = psutil.Process(os.getpid())
-            return process.memory_info().rss / 1024 / 1024
-        except Exception:
-            return 0.0
-
-    def should_cleanup(self) -> bool:
-        """Check if memory cleanup is needed"""
-        current_memory = self._get_memory_usage_mb()
-        return current_memory > self.memory_threshold_mb
-
-    def get_memory_stats(self) -> Dict[str, Any]:
-        """
-        Get memory usage statistics
-
-        Returns:
-            Dictionary with memory statistics
-        """
-        return {
-            "current_memory_mb": self._get_memory_usage_mb(),
-            "arrow_allocated_bytes": self.memory_pool.bytes_allocated(),
-            "memory_threshold_mb": self.memory_threshold_mb,
-            "total_memory_used_mb": self.stats.memory_used_mb,
-            "cleanup_needed": self.should_cleanup()
-        }
-
-    def get_conversion_stats(self) -> ConversionStats:
-        """Get conversion statistics"""
-        return self.stats
-
-    def reset_stats(self):
-        """Reset conversion statistics"""
-        self.stats = ConversionStats()
-        log_info("[arrow_bridge] Statistics reset")
-
-    def set_memory_threshold(self, threshold_mb: float):
-        """Set memory cleanup threshold"""
-        self.memory_threshold_mb = threshold_mb
-        log_info(f"[arrow_bridge] Memory threshold set to {threshold_mb}MB")
-
-    def convert_with_cleanup(self, df: Any, target_backend: str,
-                             backend_type: Optional[str] = None) -> Any:
-        """
-        Convert dataframe with automatic memory cleanup
-
-        Args:
-            df: Input dataframe
-            target_backend: Target backend for output
-            backend_type: Source backend (auto-detected if None)
-
-        Returns:
-            Converted dataframe
-        """
-        try:
-            # Convert to Arrow
-            arrow_table = self.to_arrow(df, backend_type)
-
-            # Convert to target format
-            result_df = self.from_arrow(arrow_table, target_backend)
-
-            # Cleanup if needed
-            if self.should_cleanup():
-                self.cleanup_arrow_memory()
-
-            return result_df
-
-        except Exception as e:
-            # Always try to cleanup on error
-            self.cleanup_arrow_memory()
-            raise e
-
-    def validate_arrow_table(self, arrow_table: pa.Table) -> bool:
-        """
-        Validate Arrow table structure
-
-        Args:
-            arrow_table: PyArrow table to validate
-
-        Returns:
-            True if table is valid
-        """
-        try:
-            # Basic validation
-            if arrow_table is None:
-                return False
-
-            if not isinstance(arrow_table, pa.Table):
-                return False
-
-            # Check if table has data
-            if arrow_table.num_rows == 0 and arrow_table.num_columns == 0:
-                log_warning("[arrow_bridge] Arrow table is empty")
-                return True  # Empty table is valid
-
-            # Validate schema
-            schema = arrow_table.schema
-            if schema is None:
-                return False
-
-            # Check for null schema fields
-            for field in schema:
-                if field is None:
-                    return False
-
-            log_info(f"[arrow_bridge] Arrow table validation passed: "
-                     f"{arrow_table.num_rows} rows, {arrow_table.num_columns} columns")
-
-            return True
-
-        except Exception as e:
-            log_warning(f"[arrow_bridge] Arrow table validation failed: {e}")
-            return False
-
-    def get_supported_backends(self) -> Dict[str, bool]:
-        """
-        Get list of supported backends and their availability
-
-        Returns:
-            Dictionary mapping backend names to availability status
-        """
-        availability = {}
-
-        # Check pandas
-        try:
-            import pandas
-            availability["pandas"] = True
-        except ImportError:
-            availability["pandas"] = False
-
-        # Check Polars
-        try:
-            import polars
-            availability["polars"] = True
-        except ImportError:
-            availability["polars"] = False
-
-        # Check cuDF
-        try:
-            import cudf
-            availability["cudf"] = True
-        except ImportError:
-            availability["cudf"] = False
-
-        return availability
-
-    def benchmark_conversion(self, df: Any, target_backend: str,
-                             iterations: int = 3) -> Dict[str, float]:
-        """
-        Benchmark conversion performance
-
-        Args:
-            df: Input dataframe
-            target_backend: Target backend
-            iterations: Number of iterations for benchmarking
-
-        Returns:
-            Dictionary with benchmark results
-        """
-        import time
-
-        times = []
-        source_backend = self.detect_backend(df)
-
-        for i in range(iterations):
-            start_time = time.time()
-
-            # Perform conversion
-            arrow_table = self.to_arrow(df)
-            result_df = self.from_arrow(arrow_table, target_backend)
-
-            end_time = time.time()
-            times.append((end_time - start_time) * 1000)  # Convert to ms
-
-            # Cleanup between iterations
-            self.cleanup_arrow_memory()
-
-        return {
-            "source_backend": source_backend,
-            "target_backend": target_backend,
-            "iterations": iterations,
-            "min_time_ms": min(times),
-            "max_time_ms": max(times),
-            "avg_time_ms": sum(times) / len(times),
-            "total_time_ms": sum(times)
-        }
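For context, the deletion above removes the `EnhancedArrowBridge` round-trip API (`detect_backend` → `to_arrow` → `from_arrow`, plus the `convert_with_cleanup` convenience wrapper). Below is a minimal usage sketch reconstructed only from the deleted source; it is illustrative, not an example shipped with either release, and it no longer works against 0.1.1a1, where this module is gone (apparently superseded by the new `additory/core/backend.py`).

import pandas as pd
# Hypothetical import path, as it existed in 0.1.0a3
from additory.core.backends.arrow_bridge import EnhancedArrowBridge

bridge = EnhancedArrowBridge()
df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# Duck-typed detection; cuDF is checked before pandas to avoid false positives
assert bridge.detect_backend(df) == "pandas"

# Round trip through Arrow with threshold-based memory cleanup
pl_df = bridge.convert_with_cleanup(df, target_backend="polars")

print(bridge.get_conversion_stats())  # ConversionStats(conversions=1, ...)
print(bridge.get_memory_stats())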