additory 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. additory/__init__.py +15 -0
  2. additory/analysis/__init__.py +48 -0
  3. additory/analysis/cardinality.py +126 -0
  4. additory/analysis/correlations.py +124 -0
  5. additory/analysis/distributions.py +376 -0
  6. additory/analysis/quality.py +158 -0
  7. additory/analysis/scan.py +400 -0
  8. additory/augment/__init__.py +24 -0
  9. additory/augment/augmentor.py +653 -0
  10. additory/augment/builtin_lists.py +430 -0
  11. additory/augment/distributions.py +22 -0
  12. additory/augment/forecast.py +1132 -0
  13. additory/augment/list_registry.py +177 -0
  14. additory/augment/smote.py +320 -0
  15. additory/augment/strategies.py +883 -0
  16. additory/common/__init__.py +157 -0
  17. additory/common/backend.py +355 -0
  18. additory/common/column_utils.py +191 -0
  19. additory/common/distributions.py +737 -0
  20. additory/common/exceptions.py +62 -0
  21. additory/common/lists.py +229 -0
  22. additory/common/patterns.py +240 -0
  23. additory/common/resolver.py +567 -0
  24. additory/common/sample_data.py +182 -0
  25. additory/common/validation.py +197 -0
  26. additory/core/__init__.py +27 -0
  27. additory/core/ast_builder.py +165 -0
  28. additory/core/backends/__init__.py +23 -0
  29. additory/core/backends/arrow_bridge.py +476 -0
  30. additory/core/backends/cudf_bridge.py +355 -0
  31. additory/core/column_positioning.py +358 -0
  32. additory/core/compiler_polars.py +166 -0
  33. additory/core/config.py +342 -0
  34. additory/core/enhanced_cache_manager.py +1119 -0
  35. additory/core/enhanced_matchers.py +473 -0
  36. additory/core/enhanced_version_manager.py +325 -0
  37. additory/core/executor.py +59 -0
  38. additory/core/integrity_manager.py +477 -0
  39. additory/core/loader.py +190 -0
  40. additory/core/logging.py +24 -0
  41. additory/core/memory_manager.py +547 -0
  42. additory/core/namespace_manager.py +657 -0
  43. additory/core/parser.py +176 -0
  44. additory/core/polars_expression_engine.py +551 -0
  45. additory/core/registry.py +176 -0
  46. additory/core/sample_data_manager.py +492 -0
  47. additory/core/user_namespace.py +751 -0
  48. additory/core/validator.py +27 -0
  49. additory/dynamic_api.py +308 -0
  50. additory/expressions/__init__.py +26 -0
  51. additory/expressions/engine.py +551 -0
  52. additory/expressions/parser.py +176 -0
  53. additory/expressions/proxy.py +546 -0
  54. additory/expressions/registry.py +313 -0
  55. additory/expressions/samples.py +492 -0
  56. additory/synthetic/__init__.py +101 -0
  57. additory/synthetic/api.py +220 -0
  58. additory/synthetic/common_integration.py +314 -0
  59. additory/synthetic/config.py +262 -0
  60. additory/synthetic/engines.py +529 -0
  61. additory/synthetic/exceptions.py +180 -0
  62. additory/synthetic/file_managers.py +518 -0
  63. additory/synthetic/generator.py +702 -0
  64. additory/synthetic/generator_parser.py +68 -0
  65. additory/synthetic/integration.py +319 -0
  66. additory/synthetic/models.py +241 -0
  67. additory/synthetic/pattern_resolver.py +573 -0
  68. additory/synthetic/performance.py +469 -0
  69. additory/synthetic/polars_integration.py +464 -0
  70. additory/synthetic/proxy.py +60 -0
  71. additory/synthetic/schema_parser.py +685 -0
  72. additory/synthetic/validator.py +553 -0
  73. additory/utilities/__init__.py +53 -0
  74. additory/utilities/encoding.py +600 -0
  75. additory/utilities/games.py +300 -0
  76. additory/utilities/keys.py +8 -0
  77. additory/utilities/lookup.py +103 -0
  78. additory/utilities/matchers.py +216 -0
  79. additory/utilities/resolvers.py +286 -0
  80. additory/utilities/settings.py +167 -0
  81. additory/utilities/units.py +746 -0
  82. additory/utilities/validators.py +153 -0
  83. additory-0.1.0a1.dist-info/METADATA +293 -0
  84. additory-0.1.0a1.dist-info/RECORD +87 -0
  85. additory-0.1.0a1.dist-info/WHEEL +5 -0
  86. additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
  87. additory-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,476 @@
1
+ # enhanced_arrow_bridge.py
2
+ # Universal Arrow bridge for cross-backend dataframe compatibility
3
+
4
+ import os
5
+ import gc
6
+ import psutil
7
+ from typing import Any, Dict, Optional, Tuple, Union
8
+ from dataclasses import dataclass
9
+ from datetime import datetime
10
+
11
+ try:
12
+ import pyarrow as pa
13
+ import polars as pl
14
+ import pandas as pd
15
+ ARROW_AVAILABLE = True
16
+ except ImportError as e:
17
+ ARROW_AVAILABLE = False
18
+ IMPORT_ERROR = str(e)
19
+
20
+ from ..logging import log_info, log_warning
21
+ from .cudf_bridge import get_cudf_bridge
22
+
23
+
24
@dataclass
class ConversionStats:
    """Statistics for Arrow bridge operations"""
    # Total number of dataframe -> Arrow conversions performed
    conversions: int = 0
    # Cumulative process-memory growth observed across conversions (MB)
    memory_used_mb: float = 0.0
    # Total rows seen across all converted Arrow tables
    total_rows_processed: int = 0
    # Total columns seen across all converted Arrow tables
    total_columns_processed: int = 0
    # Cumulative conversion time in milliseconds
    conversion_time_ms: float = 0.0
    # Number of times cleanup_arrow_memory() has run
    cleanup_count: int = 0
33
+
34
+
35
class ArrowBridgeError(Exception):
    """Raised when Arrow bridge operations fail"""
    pass
38
+
39
+
40
class EnhancedArrowBridge:
    """Universal data bridge using Apache Arrow for cross-backend compatibility.

    Routes conversions between pandas, Polars and cuDF dataframes through
    PyArrow tables, tracking conversion statistics and cleaning up Arrow
    heap memory when process memory crosses a configurable threshold.
    """

    def __init__(self):
        """Set up the Arrow memory pool, statistics, and backend detectors.

        Raises:
            ArrowBridgeError: If pyarrow/polars/pandas failed to import.
        """
        if not ARROW_AVAILABLE:
            raise ArrowBridgeError(f"Arrow dependencies not available: {IMPORT_ERROR}")

        # Memory management
        self.memory_pool = pa.default_memory_pool()
        self.memory_threshold_mb = 100  # cleanup threshold (MB of process RSS)

        # Statistics tracking
        self.stats = ConversionStats()

        # Supported backends (order matters - cuDF must be checked before
        # pandas because cuDF frames also expose pandas-like attributes)
        self.supported_backends = {
            "cudf": self._detect_cudf,
            "polars": self._detect_polars,
            "pandas": self._detect_pandas
        }

        log_info("[arrow_bridge] Enhanced Arrow Bridge initialized")

    def detect_backend(self, df: Any) -> str:
        """
        Automatically detect dataframe backend

        Args:
            df: Input dataframe of unknown type

        Returns:
            Backend name ("pandas", "polars", "cudf")

        Raises:
            ArrowBridgeError: If backend cannot be detected
        """
        for backend_name, detector in self.supported_backends.items():
            if detector(df):
                log_info(f"[arrow_bridge] Detected backend: {backend_name}")
                return backend_name

        # Include type information for a better error message
        df_type = type(df).__name__
        df_module = getattr(type(df), '__module__', 'unknown')

        raise ArrowBridgeError(
            f"Unsupported dataframe type: {df_type} from module {df_module}. "
            f"Supported backends: {list(self.supported_backends.keys())}"
        )

    def _detect_pandas(self, df: Any) -> bool:
        """Detect if dataframe is pandas (but not cuDF)"""
        # cuDF frames also expose iloc/loc/dtypes, so rule them out first
        if self._detect_cudf(df):
            return False
        # Then check for pandas-style attributes
        return hasattr(df, 'iloc') and hasattr(df, 'loc') and hasattr(df, 'dtypes')

    def _detect_polars(self, df: Any) -> bool:
        """Detect if dataframe is Polars"""
        # Duck-typed check first, then fall back to inspecting the type repr
        return (hasattr(df, 'lazy') and hasattr(df, 'collect') and hasattr(df, 'schema')) or \
               'polars' in str(type(df))

    def _detect_cudf(self, df: Any) -> bool:
        """Detect if dataframe is cuDF using enhanced detection"""
        cudf_bridge = get_cudf_bridge()
        return cudf_bridge.detect_cudf(df)

    def to_arrow(self, df: Any, backend_type: Optional[str] = None) -> pa.Table:
        """
        Convert any dataframe to Arrow table

        Args:
            df: Input dataframe
            backend_type: Backend type (auto-detected if None)

        Returns:
            PyArrow Table

        Raises:
            ArrowBridgeError: If detection or conversion fails
        """
        # Detect before entering the try block: a detection failure should be
        # reported as such, not re-wrapped as "Failed to convert None to Arrow".
        if backend_type is None:
            backend_type = self.detect_backend(df)

        start_time = datetime.now()

        try:
            # Memory usage before conversion, to estimate conversion cost
            memory_before = self._get_memory_usage_mb()

            # Convert based on backend
            if backend_type == "pandas":
                arrow_table = self._pandas_to_arrow(df)
            elif backend_type == "polars":
                arrow_table = self._polars_to_arrow(df)
            elif backend_type == "cudf":
                arrow_table = self._cudf_to_arrow(df)
            else:
                raise ArrowBridgeError(f"Unsupported backend for conversion: {backend_type}")

            # Update statistics
            memory_after = self._get_memory_usage_mb()
            conversion_time = (datetime.now() - start_time).total_seconds() * 1000

            self.stats.conversions += 1
            self.stats.memory_used_mb += max(0, memory_after - memory_before)
            self.stats.total_rows_processed += arrow_table.num_rows
            self.stats.total_columns_processed += arrow_table.num_columns
            self.stats.conversion_time_ms += conversion_time

            log_info(f"[arrow_bridge] Converted {backend_type} to Arrow: "
                     f"{arrow_table.num_rows} rows, {arrow_table.num_columns} columns "
                     f"({conversion_time:.1f}ms)")

            return arrow_table

        except ArrowBridgeError:
            # Already descriptive -- don't double-wrap our own errors
            raise
        except Exception as e:
            # Chain the cause so the underlying failure stays visible
            raise ArrowBridgeError(f"Failed to convert {backend_type} to Arrow: {e}") from e

    def from_arrow(self, arrow_table: pa.Table, target_backend: str) -> Any:
        """
        Convert Arrow table back to target dataframe format

        Args:
            arrow_table: PyArrow Table
            target_backend: Target backend ("pandas", "polars", "cudf")

        Returns:
            Dataframe in target format

        Raises:
            ArrowBridgeError: If conversion fails
        """
        start_time = datetime.now()

        try:
            # Convert based on target backend
            if target_backend == "pandas":
                result_df = self._arrow_to_pandas(arrow_table)
            elif target_backend == "polars":
                result_df = self._arrow_to_polars(arrow_table)
            elif target_backend == "cudf":
                result_df = self._arrow_to_cudf(arrow_table)
            else:
                raise ArrowBridgeError(f"Unsupported target backend: {target_backend}")

            conversion_time = (datetime.now() - start_time).total_seconds() * 1000

            log_info(f"[arrow_bridge] Converted Arrow to {target_backend}: "
                     f"{arrow_table.num_rows} rows, {arrow_table.num_columns} columns "
                     f"({conversion_time:.1f}ms)")

            return result_df

        except ArrowBridgeError:
            # Already descriptive -- don't double-wrap our own errors
            raise
        except Exception as e:
            raise ArrowBridgeError(f"Failed to convert Arrow to {target_backend}: {e}") from e

    def _pandas_to_arrow(self, df: pd.DataFrame) -> pa.Table:
        """Convert pandas DataFrame to Arrow table"""
        try:
            # Use zero-copy conversion when possible
            return pa.Table.from_pandas(df, preserve_index=False)
        except Exception:
            # Fallback: retry with explicit schema inference
            try:
                schema = pa.Schema.from_pandas(df)
                return pa.Table.from_pandas(df, schema=schema, preserve_index=False)
            except Exception as e2:
                raise ArrowBridgeError(f"Pandas to Arrow conversion failed: {e2}") from e2

    def _polars_to_arrow(self, df: pl.DataFrame) -> pa.Table:
        """Convert Polars DataFrame to Arrow table"""
        try:
            return df.to_arrow()
        except Exception as e:
            raise ArrowBridgeError(f"Polars to Arrow conversion failed: {e}") from e

    def _cudf_to_arrow(self, df: Any) -> pa.Table:
        """Convert cuDF DataFrame to Arrow table using enhanced bridge"""
        cudf_bridge = get_cudf_bridge()
        return cudf_bridge.cudf_to_arrow(df)

    def _arrow_to_pandas(self, arrow_table: pa.Table) -> pd.DataFrame:
        """Convert Arrow table to pandas DataFrame"""
        try:
            return arrow_table.to_pandas()
        except Exception as e:
            raise ArrowBridgeError(f"Arrow to pandas conversion failed: {e}") from e

    def _arrow_to_polars(self, arrow_table: pa.Table) -> pl.DataFrame:
        """Convert Arrow table to Polars DataFrame"""
        try:
            return pl.from_arrow(arrow_table)
        except Exception as e:
            raise ArrowBridgeError(f"Arrow to Polars conversion failed: {e}") from e

    def _arrow_to_cudf(self, arrow_table: pa.Table) -> Any:
        """Convert Arrow table to cuDF DataFrame using enhanced bridge"""
        cudf_bridge = get_cudf_bridge()
        return cudf_bridge.arrow_to_cudf(arrow_table)

    def cleanup_arrow_memory(self) -> Dict[str, float]:
        """
        Clean up Arrow heap memory

        Returns:
            Dictionary with cleanup statistics
        """
        # Snapshot memory usage before cleanup
        memory_before = self._get_memory_usage_mb()
        arrow_allocated_before = self.memory_pool.bytes_allocated()

        # Force garbage collection; Arrow buffers held only by collectable
        # Python objects are released here
        gc.collect()

        # Snapshot memory usage after cleanup
        memory_after = self._get_memory_usage_mb()
        arrow_allocated_after = self.memory_pool.bytes_allocated()

        # Calculate cleanup statistics (clamped at 0 -- usage can grow)
        memory_freed_mb = max(0, memory_before - memory_after)
        arrow_freed_bytes = max(0, arrow_allocated_before - arrow_allocated_after)

        self.stats.cleanup_count += 1

        cleanup_stats = {
            "memory_freed_mb": memory_freed_mb,
            "arrow_freed_bytes": arrow_freed_bytes,
            "memory_before_mb": memory_before,
            "memory_after_mb": memory_after,
            "cleanup_count": self.stats.cleanup_count
        }

        if memory_freed_mb > 0 or arrow_freed_bytes > 0:
            log_info(f"[arrow_bridge] Memory cleanup: {memory_freed_mb:.1f}MB system, "
                     f"{arrow_freed_bytes} bytes Arrow heap freed")

        return cleanup_stats

    def _get_memory_usage_mb(self) -> float:
        """Get current process memory usage (RSS) in MB; 0.0 if unavailable"""
        try:
            process = psutil.Process(os.getpid())
            return process.memory_info().rss / 1024 / 1024
        except Exception:
            # psutil can fail in restricted environments; treat as unknown
            return 0.0

    def should_cleanup(self) -> bool:
        """Check if memory cleanup is needed"""
        current_memory = self._get_memory_usage_mb()
        return current_memory > self.memory_threshold_mb

    def get_memory_stats(self) -> Dict[str, Any]:
        """
        Get memory usage statistics

        Returns:
            Dictionary with memory statistics
        """
        return {
            "current_memory_mb": self._get_memory_usage_mb(),
            "arrow_allocated_bytes": self.memory_pool.bytes_allocated(),
            "memory_threshold_mb": self.memory_threshold_mb,
            "total_memory_used_mb": self.stats.memory_used_mb,
            "cleanup_needed": self.should_cleanup()
        }

    def get_conversion_stats(self) -> ConversionStats:
        """Get conversion statistics"""
        return self.stats

    def reset_stats(self):
        """Reset conversion statistics"""
        self.stats = ConversionStats()
        log_info("[arrow_bridge] Statistics reset")

    def set_memory_threshold(self, threshold_mb: float):
        """Set memory cleanup threshold"""
        self.memory_threshold_mb = threshold_mb
        log_info(f"[arrow_bridge] Memory threshold set to {threshold_mb}MB")

    def convert_with_cleanup(self, df: Any, target_backend: str,
                             backend_type: Optional[str] = None) -> Any:
        """
        Convert dataframe with automatic memory cleanup

        Args:
            df: Input dataframe
            target_backend: Target backend for output
            backend_type: Source backend (auto-detected if None)

        Returns:
            Converted dataframe
        """
        try:
            # Convert to Arrow, then to the target format
            arrow_table = self.to_arrow(df, backend_type)
            result_df = self.from_arrow(arrow_table, target_backend)

            # Cleanup if memory usage crossed the threshold
            if self.should_cleanup():
                self.cleanup_arrow_memory()

            return result_df

        except Exception:
            # Always try to cleanup on error; bare `raise` preserves the
            # original traceback (unlike `raise e`)
            self.cleanup_arrow_memory()
            raise

    def validate_arrow_table(self, arrow_table: pa.Table) -> bool:
        """
        Validate Arrow table structure

        Args:
            arrow_table: PyArrow table to validate

        Returns:
            True if table is valid
        """
        try:
            # Basic validation
            if arrow_table is None:
                return False

            if not isinstance(arrow_table, pa.Table):
                return False

            # Check if table has data
            if arrow_table.num_rows == 0 and arrow_table.num_columns == 0:
                log_warning("[arrow_bridge] Arrow table is empty")
                return True  # Empty table is valid

            # Validate schema
            schema = arrow_table.schema
            if schema is None:
                return False

            # Check for null schema fields
            for field in schema:
                if field is None:
                    return False

            log_info(f"[arrow_bridge] Arrow table validation passed: "
                     f"{arrow_table.num_rows} rows, {arrow_table.num_columns} columns")

            return True

        except Exception as e:
            log_warning(f"[arrow_bridge] Arrow table validation failed: {e}")
            return False

    def get_supported_backends(self) -> Dict[str, bool]:
        """
        Get list of supported backends and their availability

        Returns:
            Dictionary mapping backend names to availability status
        """
        availability: Dict[str, bool] = {}

        # Probe each backend with a plain import attempt
        for module_name in ("pandas", "polars", "cudf"):
            try:
                __import__(module_name)
                availability[module_name] = True
            except ImportError:
                availability[module_name] = False

        return availability

    def benchmark_conversion(self, df: Any, target_backend: str,
                             iterations: int = 3) -> Dict[str, float]:
        """
        Benchmark conversion performance

        Args:
            df: Input dataframe
            target_backend: Target backend
            iterations: Number of iterations for benchmarking

        Returns:
            Dictionary with benchmark results
        """
        import time

        times = []
        # Detect once; re-detecting each iteration would skew timings
        source_backend = self.detect_backend(df)

        for _ in range(iterations):
            # perf_counter is monotonic and high-resolution, unlike time.time
            start = time.perf_counter()

            arrow_table = self.to_arrow(df, source_backend)
            self.from_arrow(arrow_table, target_backend)

            times.append((time.perf_counter() - start) * 1000)  # ms

            # Cleanup between iterations so runs don't interfere
            self.cleanup_arrow_memory()

        return {
            "source_backend": source_backend,
            "target_backend": target_backend,
            "iterations": iterations,
            "min_time_ms": min(times),
            "max_time_ms": max(times),
            "avg_time_ms": sum(times) / len(times),
            "total_time_ms": sum(times)
        }