additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,355 +0,0 @@
1
- # cudf_enhanced_bridge.py
2
- # Enhanced cuDF support for Arrow bridge with proper dtype handling
3
-
4
- import gc
5
- from typing import Any, Dict, Optional, List
6
- from datetime import datetime
7
-
8
# Optional GPU stack. cuDF can fail at import time with errors other than
# ImportError (for example when the package is installed but cannot
# initialise), so any Exception is treated as "cuDF not available".
# ImportError is itself an Exception subclass, so one clause covers both.
try:
    import cudf
    import pyarrow as pa
    import pandas as pd
    CUDF_AVAILABLE = True
except Exception:
    CUDF_AVAILABLE = False
    cudf = None
    pa = None
    # Bind pd as well so every optional name exists on the failure path.
    pd = None


# Create dummy classes to avoid AttributeError
class DummyPA:
    """Minimal stand-in for the ``pyarrow`` module when it was not imported.

    It only provides a ``Table`` attribute so that ``pa.Table`` references
    elsewhere in this module still resolve.
    """

    class Table:
        """Placeholder for ``pyarrow.Table``; carries no behaviour."""


if pa is None:
    pa = DummyPA()
23
-
24
- from ..logging import log_info, log_warning
25
-
26
-
27
class CuDFBridgeError(Exception):
    """Signals that a cuDF bridge operation could not be completed."""
30
-
31
-
32
class EnhancedCuDFBridge:
    """Enhanced cuDF support with proper dtype handling and GPU memory management.

    Converts between cuDF DataFrames and PyArrow tables using a chain of
    fallback strategies, counts which strategy succeeded in
    ``conversion_stats``, and reports GPU memory figures on a best-effort
    basis when ``rmm`` can be imported.
    """

    def __init__(self):
        # Copy the module-level flag so each instance carries its own view of
        # whether the cuDF stack was importable.
        self.cudf_available = CUDF_AVAILABLE
        self.conversion_stats = {
            "direct_success": 0,
            "dtype_conversion_success": 0,
            "pandas_fallback": 0,
            "total_failures": 0,
        }

        if self.cudf_available:
            log_info("[cudf_bridge] Enhanced cuDF Bridge initialized")
        else:
            log_warning("[cudf_bridge] cuDF not available")

    @staticmethod
    def _gpu_memory_info():
        """Return ``rmm``'s current-device memory info, or None if unavailable.

        Any failure (``rmm`` missing, no such method, driver error) yields
        None; callers treat GPU memory figures as optional.
        """
        try:
            import rmm
            return rmm.mr.get_current_device_resource().get_memory_info()
        except Exception:
            return None

    def _gpu_allocated_bytes(self):
        """Return element 0 of the rmm memory info, or 0 when it cannot be read."""
        try:
            return self._gpu_memory_info()[0]
        except Exception:
            return 0

    def detect_cudf(self, df: Any) -> bool:
        """
        Enhanced cuDF detection with multiple methods

        The checks run in order and each one is isolated, so an error in one
        check never prevents the next from running.

        Args:
            df: Input dataframe

        Returns:
            True if dataframe is cuDF
        """
        if not self.cudf_available:
            return False

        # Method 1: Check module name (most reliable)
        try:
            if hasattr(df, '__module__') and 'cudf' in str(df.__module__):
                return True
        except Exception:
            pass  # best-effort probe; fall through to the next method

        # Method 2: Check class name
        try:
            if 'cudf' in str(type(df)):
                return True
        except Exception:
            pass

        # Method 3: isinstance check.  Only a positive result is final; a
        # negative one must fall through so Method 4 stays reachable.
        try:
            if isinstance(df, cudf.DataFrame):
                return True
        except Exception:
            pass

        # Method 4: Check for cuDF-specific attributes
        try:
            duck_typed_attrs = (
                '_data', '_index', 'to_arrow', 'to_pandas',
                'memory_usage', 'hash_values',
            )
            if all(hasattr(df, attr) for attr in duck_typed_attrs):
                return True
        except Exception:
            pass

        return False

    def cudf_to_arrow(self, df: Any) -> "pa.Table":
        """
        Enhanced cuDF to Arrow conversion with multiple fallback strategies

        Strategies are tried in order: direct ``to_arrow()``, dtype
        preprocessing followed by ``to_arrow()``, and finally a round trip
        through pandas.  The first one that succeeds wins and bumps its
        counter in ``conversion_stats``.

        Args:
            df: cuDF DataFrame

        Returns:
            PyArrow Table

        Raises:
            CuDFBridgeError: If all conversion methods fail
        """
        if not self.cudf_available:
            raise CuDFBridgeError("cuDF not available")

        # (error label, stats key, conversion, success log, failure log prefix)
        strategies = (
            (
                "Direct conversion",
                "direct_success",
                lambda: df.to_arrow(),
                "[cudf_bridge] Direct cuDF→Arrow conversion successful",
                "[cudf_bridge] Direct cuDF→Arrow failed",
            ),
            (
                "Dtype conversion",
                "dtype_conversion_success",
                lambda: self._cudf_to_arrow_with_dtype_conversion(df),
                "[cudf_bridge] cuDF→Arrow with dtype conversion successful",
                "[cudf_bridge] cuDF dtype conversion failed",
            ),
            (
                "Pandas fallback",
                "pandas_fallback",
                lambda: pa.Table.from_pandas(df.to_pandas(), preserve_index=False),
                "[cudf_bridge] cuDF→pandas→Arrow fallback successful",
                "[cudf_bridge] Pandas fallback failed",
            ),
        )

        conversion_errors = []
        last_error = None

        for label, stat_key, convert, ok_message, fail_prefix in strategies:
            try:
                arrow_table = convert()
                self.conversion_stats[stat_key] += 1
                log_info(ok_message)
                return arrow_table
            except Exception as e:
                conversion_errors.append(f"{label}: {e}")
                log_warning(f"{fail_prefix}: {e}")
                # `e` is unbound once the except block ends; keep a reference
                # so the final error can be chained to it.
                last_error = e

        # All strategies failed
        self.conversion_stats["total_failures"] += 1
        error_summary = "; ".join(conversion_errors)
        raise CuDFBridgeError(
            f"All cuDF→Arrow conversion strategies failed: {error_summary}"
        ) from last_error

    def _cudf_to_arrow_with_dtype_conversion(self, df: Any) -> "pa.Table":
        """
        Convert cuDF to Arrow with dtype preprocessing

        Works on a copy of ``df``.  Object columns are cast to ``str``
        (directly, then via pandas if that fails); datetime columns whose
        dtype string lacks ``ns`` are cast to ``datetime64[ns]``.

        Args:
            df: cuDF DataFrame

        Returns:
            PyArrow Table

        Raises:
            CuDFBridgeError: If an object column cannot be converted by
                either route.  Errors from the final ``to_arrow()`` call
                propagate unchanged.
        """
        # Create a copy to avoid modifying original
        df_processed = df.copy()

        # Record what was changed, for the summary log line below
        dtype_conversions = {}

        for col in df_processed.columns:
            col_dtype = df_processed[col].dtype

            # Handle object dtypes (main source of cupy errors)
            if col_dtype == 'object':
                try:
                    # Try to convert to string
                    df_processed[col] = df_processed[col].astype('str')
                    dtype_conversions[col] = 'object→str'
                except Exception as e:
                    log_warning(f"[cudf_bridge] Failed to convert column '{col}' from object to string: {e}")
                    # Try alternative: convert via pandas
                    try:
                        pandas_series = df_processed[col].to_pandas().astype('str')
                        df_processed[col] = cudf.from_pandas(pandas_series)
                        dtype_conversions[col] = 'object→pandas→str'
                    except Exception as e2:
                        log_warning(f"[cudf_bridge] Failed pandas conversion for column '{col}': {e2}")
                        raise CuDFBridgeError(f"Cannot convert object column '{col}': {e2}") from e2

            # Handle other problematic dtypes
            elif 'datetime' in str(col_dtype).lower() and 'ns' not in str(col_dtype):
                try:
                    # Ensure datetime has nanosecond precision for Arrow compatibility
                    df_processed[col] = df_processed[col].astype('datetime64[ns]')
                    dtype_conversions[col] = f'{col_dtype}→datetime64[ns]'
                except Exception as e:
                    # Non-fatal: leave the column as-is and let to_arrow() decide.
                    log_warning(f"[cudf_bridge] Failed to convert datetime column '{col}': {e}")

        if dtype_conversions:
            log_info(f"[cudf_bridge] Applied dtype conversions: {dtype_conversions}")

        # Try Arrow conversion with processed dtypes
        return df_processed.to_arrow()

    def arrow_to_cudf(self, arrow_table: "pa.Table") -> Any:
        """
        Convert Arrow table to cuDF DataFrame

        Tries ``cudf.DataFrame.from_arrow`` first and falls back to an
        Arrow → pandas → cuDF round trip.

        Args:
            arrow_table: PyArrow Table

        Returns:
            cuDF DataFrame

        Raises:
            CuDFBridgeError: If conversion fails
        """
        if not self.cudf_available:
            raise CuDFBridgeError("cuDF not available")

        try:
            # Direct Arrow to cuDF conversion
            return cudf.DataFrame.from_arrow(arrow_table)
        except Exception as e:
            direct_error = e
            log_warning(f"[cudf_bridge] Direct Arrow→cuDF failed: {e}")

        # Fallback: Arrow → pandas → cuDF
        try:
            return cudf.from_pandas(arrow_table.to_pandas())
        except Exception as e2:
            raise CuDFBridgeError(
                f"Arrow→cuDF conversion failed. Direct: {direct_error}, Pandas fallback: {e2}"
            ) from e2

    def get_cudf_info(self, df: Any) -> Dict[str, Any]:
        """
        Get detailed information about cuDF DataFrame

        Args:
            df: cuDF DataFrame

        Returns:
            Dictionary with cuDF information.  ``{"is_cudf": False}`` when
            ``df`` is not detected as cuDF; ``{"is_cudf": True, "error": ...}``
            when gathering the details raises.
        """
        if not self.detect_cudf(df):
            return {"is_cudf": False}

        try:
            dtype_items = list(df.dtypes.items())
            object_columns = [col for col, dtype in dtype_items if dtype == 'object']

            info = {
                "is_cudf": True,
                "shape": df.shape,
                "dtypes": {col: str(dtype) for col, dtype in dtype_items},
                "memory_usage_bytes": df.memory_usage(deep=True).sum(),
                "has_object_columns": bool(object_columns),
                "object_columns": object_columns,
                "cudf_version": cudf.__version__ if self.cudf_available else "unknown",
            }

            # GPU memory info if available.
            # NOTE(review): element 0 is reported as allocated bytes and
            # element 1 as total bytes, following the existing key names;
            # confirm against the rmm version in use.
            gpu_info = self._gpu_memory_info()
            try:
                allocated, total = gpu_info[0], gpu_info[1]
                info["gpu_memory"] = {
                    "allocated_bytes": allocated,
                    "total_bytes": total,
                    "usage_percent": (allocated / total) * 100 if total > 0 else 0,
                }
            except Exception:
                # Covers gpu_info being None as well as unexpected shapes.
                info["gpu_memory"] = {"available": False}

            return info

        except Exception as e:
            return {"is_cudf": True, "error": str(e)}

    def cleanup_gpu_memory(self) -> Dict[str, Any]:
        """
        Clean up GPU memory used by cuDF operations

        Runs a Python garbage collection pass and reports the rmm memory
        figure before and after it.  The rmm pool is deliberately not
        reinitialised: that is aggressive and may affect other GPU work.

        Returns:
            Dictionary with cleanup statistics
        """
        if not self.cudf_available:
            return {"cudf_available": False}

        try:
            # Readings default to 0 when rmm cannot be queried.
            memory_before = self._gpu_allocated_bytes()

            # Force garbage collection
            gc.collect()

            memory_after = self._gpu_allocated_bytes()

            # Clamp at zero: the figure can grow between the two readings.
            memory_freed = max(0, memory_before - memory_after)

            cleanup_stats = {
                "cudf_available": True,
                "memory_before_bytes": memory_before,
                "memory_after_bytes": memory_after,
                "memory_freed_bytes": memory_freed,
                "memory_freed_mb": memory_freed / (1024 * 1024),
            }

            if memory_freed > 0:
                log_info(f"[cudf_bridge] GPU memory cleanup: {memory_freed / (1024 * 1024):.1f}MB freed")

            return cleanup_stats

        except Exception as e:
            return {"cudf_available": True, "error": str(e)}

    def get_conversion_stats(self) -> Dict[str, Any]:
        """Get conversion statistics

        Returns:
            ``{"no_conversions": True}`` when every counter is zero;
            otherwise the total number of recorded outcomes, each counter as
            a percentage of that total, and a copy of the raw counters.
        """
        counters = self.conversion_stats
        total_attempts = sum(counters.values())

        if total_attempts == 0:
            return {"no_conversions": True}

        def as_percent(count):
            return (count / total_attempts) * 100

        return {
            "total_attempts": total_attempts,
            "success_rate": as_percent(total_attempts - counters["total_failures"]),
            "direct_success_rate": as_percent(counters["direct_success"]),
            "dtype_conversion_rate": as_percent(counters["dtype_conversion_success"]),
            "pandas_fallback_rate": as_percent(counters["pandas_fallback"]),
            "failure_rate": as_percent(counters["total_failures"]),
            "stats": counters.copy(),
        }
345
-
346
-
347
- # Global instance
348
# Lazily created module-wide bridge; stays None until first requested.
_cudf_bridge = None


def get_cudf_bridge() -> EnhancedCuDFBridge:
    """Return the shared ``EnhancedCuDFBridge``, creating it on first call."""
    global _cudf_bridge
    if _cudf_bridge is not None:
        return _cudf_bridge
    _cudf_bridge = EnhancedCuDFBridge()
    return _cudf_bridge