additory 0.1.0a4-py3-none-any.whl → 0.1.1a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/core/backends/arrow_bridge.py (deleted)
@@ -1,483 +0,0 @@
- # enhanced_arrow_bridge.py
- # Universal Arrow bridge for cross-backend dataframe compatibility
-
- import os
- import gc
- import psutil
- from typing import Any, Dict, Optional, Tuple, Union
- from dataclasses import dataclass
- from datetime import datetime
-
- try:
-     import pyarrow as pa
-     import polars as pl
-     import pandas as pd
-     ARROW_AVAILABLE = True
- except ImportError as e:
-     ARROW_AVAILABLE = False
-     IMPORT_ERROR = str(e)
-     # Create dummy classes for type annotations
-     class pa:
-         Table = Any
-     class pl:
-         DataFrame = Any
-     class pd:
-         DataFrame = Any
-
- from ..logging import log_info, log_warning
- from .cudf_bridge import get_cudf_bridge
-
-
- @dataclass
- class ConversionStats:
-     """Statistics for Arrow bridge operations"""
-     conversions: int = 0
-     memory_used_mb: float = 0.0
-     total_rows_processed: int = 0
-     total_columns_processed: int = 0
-     conversion_time_ms: float = 0.0
-     cleanup_count: int = 0
-
-
- class ArrowBridgeError(Exception):
-     """Raised when Arrow bridge operations fail"""
-     pass
-
-
- class EnhancedArrowBridge:
-     """Universal data bridge using Apache Arrow for cross-backend compatibility"""
-
-     def __init__(self):
-         if not ARROW_AVAILABLE:
-             raise ArrowBridgeError(f"Arrow dependencies not available: {IMPORT_ERROR}")
-
-         # Memory management
-         self.memory_pool = pa.default_memory_pool()
-         self.memory_threshold_mb = 100  # Cleanup threshold
-
-         # Statistics tracking
-         self.stats = ConversionStats()
-
-         # Supported backends (order matters - check cuDF before pandas)
-         self.supported_backends = {
-             "cudf": self._detect_cudf,
-             "polars": self._detect_polars,
-             "pandas": self._detect_pandas
-         }
-
-         log_info("[arrow_bridge] Enhanced Arrow Bridge initialized")
-
-     def detect_backend(self, df: Any) -> str:
-         """
-         Automatically detect dataframe backend
-
-         Args:
-             df: Input dataframe of unknown type
-
-         Returns:
-             Backend name ("pandas", "polars", "cudf")
-
-         Raises:
-             ArrowBridgeError: If backend cannot be detected
-         """
-         for backend_name, detector in self.supported_backends.items():
-             if detector(df):
-                 log_info(f"[arrow_bridge] Detected backend: {backend_name}")
-                 return backend_name
-
-         # Try to get type information for better error message
-         df_type = type(df).__name__
-         df_module = getattr(type(df), '__module__', 'unknown')
-
-         raise ArrowBridgeError(
-             f"Unsupported dataframe type: {df_type} from module {df_module}. "
-             f"Supported backends: {list(self.supported_backends.keys())}"
-         )
-
-     def _detect_pandas(self, df: Any) -> bool:
-         """Detect if dataframe is pandas (but not cuDF)"""
-         # First check if it's cuDF (which also has pandas-like attributes)
-         if self._detect_cudf(df):
-             return False
-         # Then check for pandas attributes
-         return hasattr(df, 'iloc') and hasattr(df, 'loc') and hasattr(df, 'dtypes')
-
-     def _detect_polars(self, df: Any) -> bool:
-         """Detect if dataframe is Polars"""
-         return (hasattr(df, 'lazy') and hasattr(df, 'collect') and hasattr(df, 'schema')) or \
-             str(type(df)).find('polars') != -1
-
-     def _detect_cudf(self, df: Any) -> bool:
-         """Detect if dataframe is cuDF using enhanced detection"""
-         cudf_bridge = get_cudf_bridge()
-         return cudf_bridge.detect_cudf(df)
-
-     def to_arrow(self, df: Any, backend_type: Optional[str] = None) -> pa.Table:
-         """
-         Convert any dataframe to Arrow table
-
-         Args:
-             df: Input dataframe
-             backend_type: Backend type (auto-detected if None)
-
-         Returns:
-             PyArrow Table
-
-         Raises:
-             ArrowBridgeError: If conversion fails
-         """
-         start_time = datetime.now()
-
-         try:
-             # Auto-detect backend if not specified
-             if backend_type is None:
-                 backend_type = self.detect_backend(df)
-
-             # Get memory usage before conversion
-             memory_before = self._get_memory_usage_mb()
-
-             # Convert based on backend
-             if backend_type == "pandas":
-                 arrow_table = self._pandas_to_arrow(df)
-             elif backend_type == "polars":
-                 arrow_table = self._polars_to_arrow(df)
-             elif backend_type == "cudf":
-                 arrow_table = self._cudf_to_arrow(df)
-             else:
-                 raise ArrowBridgeError(f"Unsupported backend for conversion: {backend_type}")
-
-             # Update statistics
-             memory_after = self._get_memory_usage_mb()
-             conversion_time = (datetime.now() - start_time).total_seconds() * 1000
-
-             self.stats.conversions += 1
-             self.stats.memory_used_mb += max(0, memory_after - memory_before)
-             self.stats.total_rows_processed += arrow_table.num_rows
-             self.stats.total_columns_processed += arrow_table.num_columns
-             self.stats.conversion_time_ms += conversion_time
-
-             log_info(f"[arrow_bridge] Converted {backend_type} to Arrow: "
-                      f"{arrow_table.num_rows} rows, {arrow_table.num_columns} columns "
-                      f"({conversion_time:.1f}ms)")
-
-             return arrow_table
-
-         except Exception as e:
-             raise ArrowBridgeError(f"Failed to convert {backend_type} to Arrow: {e}")
-
-     def from_arrow(self, arrow_table: pa.Table, target_backend: str) -> Any:
-         """
-         Convert Arrow table back to target dataframe format
-
-         Args:
-             arrow_table: PyArrow Table
-             target_backend: Target backend ("pandas", "polars", "cudf")
-
-         Returns:
-             Dataframe in target format
-
-         Raises:
-             ArrowBridgeError: If conversion fails
-         """
-         start_time = datetime.now()
-
-         try:
-             # Convert based on target backend
-             if target_backend == "pandas":
-                 result_df = self._arrow_to_pandas(arrow_table)
-             elif target_backend == "polars":
-                 result_df = self._arrow_to_polars(arrow_table)
-             elif target_backend == "cudf":
-                 result_df = self._arrow_to_cudf(arrow_table)
-             else:
-                 raise ArrowBridgeError(f"Unsupported target backend: {target_backend}")
-
-             conversion_time = (datetime.now() - start_time).total_seconds() * 1000
-
-             log_info(f"[arrow_bridge] Converted Arrow to {target_backend}: "
-                      f"{arrow_table.num_rows} rows, {arrow_table.num_columns} columns "
-                      f"({conversion_time:.1f}ms)")
-
-             return result_df
-
-         except Exception as e:
-             raise ArrowBridgeError(f"Failed to convert Arrow to {target_backend}: {e}")
-
-     def _pandas_to_arrow(self, df: pd.DataFrame) -> pa.Table:
-         """Convert pandas DataFrame to Arrow table"""
-         try:
-             # Use zero-copy conversion when possible
-             return pa.Table.from_pandas(df, preserve_index=False)
-         except Exception as e:
-             # Fallback: try with schema inference
-             try:
-                 schema = pa.Schema.from_pandas(df)
-                 return pa.Table.from_pandas(df, schema=schema, preserve_index=False)
-             except Exception as e2:
-                 raise ArrowBridgeError(f"Pandas to Arrow conversion failed: {e2}")
-
-     def _polars_to_arrow(self, df: pl.DataFrame) -> pa.Table:
-         """Convert Polars DataFrame to Arrow table"""
-         try:
-             return df.to_arrow()
-         except Exception as e:
-             raise ArrowBridgeError(f"Polars to Arrow conversion failed: {e}")
-
-     def _cudf_to_arrow(self, df: Any) -> pa.Table:
-         """Convert cuDF DataFrame to Arrow table using enhanced bridge"""
-         cudf_bridge = get_cudf_bridge()
-         return cudf_bridge.cudf_to_arrow(df)
-
-     def _arrow_to_pandas(self, arrow_table: pa.Table) -> pd.DataFrame:
-         """Convert Arrow table to pandas DataFrame"""
-         try:
-             return arrow_table.to_pandas()
-         except Exception as e:
-             raise ArrowBridgeError(f"Arrow to pandas conversion failed: {e}")
-
-     def _arrow_to_polars(self, arrow_table: pa.Table) -> pl.DataFrame:
-         """Convert Arrow table to Polars DataFrame"""
-         try:
-             return pl.from_arrow(arrow_table)
-         except Exception as e:
-             raise ArrowBridgeError(f"Arrow to Polars conversion failed: {e}")
-
-     def _arrow_to_cudf(self, arrow_table: pa.Table) -> Any:
-         """Convert Arrow table to cuDF DataFrame using enhanced bridge"""
-         cudf_bridge = get_cudf_bridge()
-         return cudf_bridge.arrow_to_cudf(arrow_table)
-
-     def cleanup_arrow_memory(self) -> Dict[str, float]:
-         """
-         Clean up Arrow heap memory
-
-         Returns:
-             Dictionary with cleanup statistics
-         """
-         # Get memory usage before cleanup
-         memory_before = self._get_memory_usage_mb()
-         arrow_allocated_before = self.memory_pool.bytes_allocated()
-
-         # Force garbage collection
-         gc.collect()
-
-         # Additional Arrow-specific cleanup
-         try:
-             # Clear any cached Arrow data
-             if hasattr(pa, 'jemalloc_memory_pool'):
-                 # Use jemalloc pool if available for better memory management
-                 pass
-         except Exception:
-             pass
-
-         # Get memory usage after cleanup
-         memory_after = self._get_memory_usage_mb()
-         arrow_allocated_after = self.memory_pool.bytes_allocated()
-
-         # Calculate cleanup statistics
-         memory_freed_mb = max(0, memory_before - memory_after)
-         arrow_freed_bytes = max(0, arrow_allocated_before - arrow_allocated_after)
-
-         self.stats.cleanup_count += 1
-
-         cleanup_stats = {
-             "memory_freed_mb": memory_freed_mb,
-             "arrow_freed_bytes": arrow_freed_bytes,
-             "memory_before_mb": memory_before,
-             "memory_after_mb": memory_after,
-             "cleanup_count": self.stats.cleanup_count
-         }
-
-         if memory_freed_mb > 0 or arrow_freed_bytes > 0:
-             log_info(f"[arrow_bridge] Memory cleanup: {memory_freed_mb:.1f}MB system, "
-                      f"{arrow_freed_bytes} bytes Arrow heap freed")
-
-         return cleanup_stats
-
-     def _get_memory_usage_mb(self) -> float:
-         """Get current memory usage in MB"""
-         try:
-             process = psutil.Process(os.getpid())
-             return process.memory_info().rss / 1024 / 1024
-         except Exception:
-             return 0.0
-
-     def should_cleanup(self) -> bool:
-         """Check if memory cleanup is needed"""
-         current_memory = self._get_memory_usage_mb()
-         return current_memory > self.memory_threshold_mb
-
-     def get_memory_stats(self) -> Dict[str, Any]:
-         """
-         Get memory usage statistics
-
-         Returns:
-             Dictionary with memory statistics
-         """
-         return {
-             "current_memory_mb": self._get_memory_usage_mb(),
-             "arrow_allocated_bytes": self.memory_pool.bytes_allocated(),
-             "memory_threshold_mb": self.memory_threshold_mb,
-             "total_memory_used_mb": self.stats.memory_used_mb,
-             "cleanup_needed": self.should_cleanup()
-         }
-
-     def get_conversion_stats(self) -> ConversionStats:
-         """Get conversion statistics"""
-         return self.stats
-
-     def reset_stats(self):
-         """Reset conversion statistics"""
-         self.stats = ConversionStats()
-         log_info("[arrow_bridge] Statistics reset")
-
-     def set_memory_threshold(self, threshold_mb: float):
-         """Set memory cleanup threshold"""
-         self.memory_threshold_mb = threshold_mb
-         log_info(f"[arrow_bridge] Memory threshold set to {threshold_mb}MB")
-
-     def convert_with_cleanup(self, df: Any, target_backend: str,
-                              backend_type: Optional[str] = None) -> Any:
-         """
-         Convert dataframe with automatic memory cleanup
-
-         Args:
-             df: Input dataframe
-             target_backend: Target backend for output
-             backend_type: Source backend (auto-detected if None)
-
-         Returns:
-             Converted dataframe
-         """
-         try:
-             # Convert to Arrow
-             arrow_table = self.to_arrow(df, backend_type)
-
-             # Convert to target format
-             result_df = self.from_arrow(arrow_table, target_backend)
-
-             # Cleanup if needed
-             if self.should_cleanup():
-                 self.cleanup_arrow_memory()
-
-             return result_df
-
-         except Exception as e:
-             # Always try to cleanup on error
-             self.cleanup_arrow_memory()
-             raise e
-
-     def validate_arrow_table(self, arrow_table: pa.Table) -> bool:
-         """
-         Validate Arrow table structure
-
-         Args:
-             arrow_table: PyArrow table to validate
-
-         Returns:
-             True if table is valid
-         """
-         try:
-             # Basic validation
-             if arrow_table is None:
-                 return False
-
-             if not isinstance(arrow_table, pa.Table):
-                 return False
-
-             # Check if table has data
-             if arrow_table.num_rows == 0 and arrow_table.num_columns == 0:
-                 log_warning("[arrow_bridge] Arrow table is empty")
-                 return True  # Empty table is valid
-
-             # Validate schema
-             schema = arrow_table.schema
-             if schema is None:
-                 return False
-
-             # Check for null schema fields
-             for field in schema:
-                 if field is None:
-                     return False
-
-             log_info(f"[arrow_bridge] Arrow table validation passed: "
-                      f"{arrow_table.num_rows} rows, {arrow_table.num_columns} columns")
-
-             return True
-
-         except Exception as e:
-             log_warning(f"[arrow_bridge] Arrow table validation failed: {e}")
-             return False
-
-     def get_supported_backends(self) -> Dict[str, bool]:
-         """
-         Get list of supported backends and their availability
-
-         Returns:
-             Dictionary mapping backend names to availability status
-         """
-         availability = {}
-
-         # Check pandas
-         try:
-             import pandas
-             availability["pandas"] = True
-         except ImportError:
-             availability["pandas"] = False
-
-         # Check Polars
-         try:
-             import polars
-             availability["polars"] = True
-         except ImportError:
-             availability["polars"] = False
-
-         # Check cuDF
-         try:
-             import cudf
-             availability["cudf"] = True
-         except ImportError:
-             availability["cudf"] = False
-
-         return availability
-
-     def benchmark_conversion(self, df: Any, target_backend: str,
-                              iterations: int = 3) -> Dict[str, float]:
-         """
-         Benchmark conversion performance
-
-         Args:
-             df: Input dataframe
-             target_backend: Target backend
-             iterations: Number of iterations for benchmarking
-
-         Returns:
-             Dictionary with benchmark results
-         """
-         import time
-
-         times = []
-         source_backend = self.detect_backend(df)
-
-         for i in range(iterations):
-             start_time = time.time()
-
-             # Perform conversion
-             arrow_table = self.to_arrow(df)
-             result_df = self.from_arrow(arrow_table, target_backend)
-
-             end_time = time.time()
-             times.append((end_time - start_time) * 1000)  # Convert to ms
-
-             # Cleanup between iterations
-             self.cleanup_arrow_memory()
-
-         return {
-             "source_backend": source_backend,
-             "target_backend": target_backend,
-             "iterations": iterations,
-             "min_time_ms": min(times),
-             "max_time_ms": max(times),
-             "avg_time_ms": sum(times) / len(times),
-             "total_time_ms": sum(times)
-         }
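
For orientation, a minimal usage sketch of the bridge deleted above, reconstructed only from its docstrings; the import path follows the 0.1.0a4 layout in the file list, and pandas, polars, and pyarrow are assumed installed (cuDF optional). This is illustrative, not the package's documented API.

# Hypothetical usage of the removed EnhancedArrowBridge (additory 0.1.0a4).
import pandas as pd
from additory.core.backends.arrow_bridge import EnhancedArrowBridge

bridge = EnhancedArrowBridge()
df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

print(bridge.detect_backend(df))            # expected: "pandas"
table = bridge.to_arrow(df)                 # pandas -> pyarrow.Table
pl_df = bridge.from_arrow(table, "polars")  # pyarrow.Table -> polars.DataFrame

# One-shot round trip; cleanup_arrow_memory() runs automatically once the
# process exceeds the bridge's memory_threshold_mb (default 100 MB).
pl_df2 = bridge.convert_with_cleanup(df, target_backend="polars")
print(bridge.get_conversion_stats())

In 0.1.1a1 this module is gone; the file list shows a new additory/core/backend.py, whose API is not visible in this hunk.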