additory 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl
This diff shows the changes between two package versions publicly released to a supported registry. It is provided for informational purposes only, and reflects the package contents as they appear in the registry.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/polars_expression_engine.py +66 -16
- additory/dynamic_api.py +42 -46
- additory/expressions/proxy.py +4 -1
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +11 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/METADATA +12 -17
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
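
Two structural changes dominate this release: the additory.augment package is folded into additory.synthetic (the renames above), and most of the old additory.synthetic machinery is deleted outright. A rough migration sketch, assuming only the module paths changed (class and function names inside these modules are not visible in this summary and are assumptions):

# 0.1.0a1 (old layout, removed)
from additory.augment import smote, forecast, strategies, distributions

# 0.1.0a3 (new layout)
from additory.synthetic import smote, forecast, strategies, distributions

# The augmentor module was renamed as well:
#   additory/augment/augmentor.py -> additory/synthetic/synthesizer.py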
additory/synthetic/polars_integration.py
DELETED

@@ -1,464 +0,0 @@
"""
Integration layer with existing polars infrastructure.

Leverages existing arrow bridge components and polars infrastructure
for enhanced performance and compatibility in synthetic data generation.
"""

import logging
from typing import Dict, Any, Optional, Union, List
from dataclasses import dataclass
import polars as pl
import pandas as pd

from ..core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
from ..core.polars_expression_engine import PolarsExpressionEngine
from ..common.backend import detect_backend
from .models import GenerationResult, GenerationContext
from .exceptions import SyntheticDataError
from .performance import performance_monitor

logger = logging.getLogger(__name__)


@dataclass
class IntegrationStats:
    """Statistics for polars integration operations."""
    arrow_conversions: int = 0
    polars_operations: int = 0
    memory_optimizations: int = 0
    cross_backend_operations: int = 0
    total_time_ms: float = 0.0


class PolarsIntegrationLayer:
    """
    Integration layer that leverages existing polars infrastructure
    for synthetic data generation operations.
    """

    def __init__(self):
        """Initialize the integration layer."""
        # Use existing infrastructure components
        self.enhanced_bridge = EnhancedArrowBridge()
        self.expression_engine = PolarsExpressionEngine()

        # Integration statistics
        self.stats = IntegrationStats()

        logger.info("Polars integration layer initialized")

    def optimize_dataframe_conversion(self,
                                      df: Union[pl.DataFrame, pd.DataFrame],
                                      target_engine: str,
                                      columns: Optional[List[str]] = None) -> Union[pl.DataFrame, pd.DataFrame]:
        """
        Optimize dataframe conversion using existing arrow bridge infrastructure.

        Args:
            df: Source dataframe
            target_engine: Target engine ('pandas' or 'polars')
            columns: Optional column subset for memory efficiency

        Returns:
            Converted dataframe in target format
        """
        with performance_monitor.monitor_operation(
            "polars_integration_conversion",
            rows=len(df),
            columns=len(columns) if columns else len(df.columns)
        ):
            try:
                source_backend = detect_backend(df)

                # Use column subsetting for memory efficiency
                if columns and len(columns) < len(df.columns):
                    logger.debug(f"Using column subsetting: {len(columns)}/{len(df.columns)} columns")

                # Convert using existing arrow bridge
                arrow_table = self.enhanced_bridge.to_arrow(df, source_backend)
                result_df = self.enhanced_bridge.from_arrow(arrow_table, target_engine)

                # Update statistics
                self.stats.arrow_conversions += 1
                if source_backend != target_engine:
                    self.stats.cross_backend_operations += 1

                logger.debug(f"Converted {source_backend} to {target_engine} via Arrow bridge")
                return result_df

            except Exception as e:
                raise SyntheticDataError(f"Dataframe conversion failed: {e}")

    def enhance_generation_result(self,
                                  result: GenerationResult,
                                  use_memory_optimization: bool = True) -> GenerationResult:
        """
        Enhance generation result using existing polars infrastructure.

        Args:
            result: Original generation result
            use_memory_optimization: Whether to apply memory optimizations

        Returns:
            Enhanced generation result
        """
        try:
            # Apply memory optimization if requested
            if use_memory_optimization and hasattr(result.dataframe, 'lazy'):
                # Use polars lazy evaluation for memory optimization
                if isinstance(result.dataframe, pl.DataFrame):
                    optimized_df = result.dataframe.lazy().collect()
                    result.dataframe = optimized_df
                    self.stats.memory_optimizations += 1
                    logger.debug("Applied polars lazy evaluation optimization")

            # Add integration metadata
            if 'integration_layer' not in result.metadata:
                result.metadata['integration_layer'] = {
                    'arrow_bridge_used': True,
                    'polars_optimized': use_memory_optimization,
                    'backend_detected': detect_backend(result.dataframe)
                }

            return result

        except Exception as e:
            logger.warning(f"Result enhancement failed: {e}")
            return result  # Return original result if enhancement fails

    def create_cross_backend_context(self,
                                     context: GenerationContext,
                                     preferred_backend: str = 'polars') -> GenerationContext:
        """
        Create a generation context optimized for cross-backend operations.

        Args:
            context: Original generation context
            preferred_backend: Preferred backend for internal operations

        Returns:
            Optimized generation context
        """
        try:
            # Clone the context
            optimized_context = GenerationContext(
                schema=context.schema,
                resolved_patterns=context.resolved_patterns,
                target_rows=context.target_rows,
                output_engine=context.output_engine,
                batch_size=context.batch_size,
                seed=context.seed
            )

            # Optimize batch size using existing infrastructure
            if hasattr(self.enhanced_bridge, 'get_memory_stats'):
                memory_stats = self.enhanced_bridge.get_memory_stats()
                if memory_stats.get('cleanup_needed', False):
                    # Reduce batch size if memory pressure is high
                    optimized_context.batch_size = min(context.batch_size, 5000)
                    logger.debug("Reduced batch size due to memory pressure")

            return optimized_context

        except Exception as e:
            logger.warning(f"Context optimization failed: {e}")
            return context  # Return original context if optimization fails

    def apply_polars_expression(self,
                                df: Union[pl.DataFrame, pd.DataFrame],
                                expression: str,
                                output_column: str) -> Union[pl.DataFrame, pd.DataFrame]:
        """
        Apply expression using existing polars expression engine.

        Args:
            df: Input dataframe
            expression: Expression to apply
            output_column: Name for output column

        Returns:
            Dataframe with expression result
        """
        try:
            # Use existing expression engine
            result = self.expression_engine.execute_expression(
                df, expression, output_column
            )

            self.stats.polars_operations += 1
            self.stats.total_time_ms += result.execution_time_ms

            logger.debug(f"Applied expression '{expression}' in {result.execution_time_ms:.1f}ms")
            return result.dataframe

        except Exception as e:
            raise SyntheticDataError(f"Expression application failed: {e}")

    def optimize_memory_usage(self,
                              df: Union[pl.DataFrame, pd.DataFrame],
                              aggressive: bool = False) -> Union[pl.DataFrame, pd.DataFrame]:
        """
        Optimize memory usage using existing infrastructure.

        Args:
            df: Input dataframe
            aggressive: Whether to use aggressive optimization

        Returns:
            Memory-optimized dataframe
        """
        try:
            backend = detect_backend(df)

            if backend == 'polars' and isinstance(df, pl.DataFrame):
                # Use polars-specific optimizations
                if aggressive:
                    # Use lazy evaluation and collect
                    optimized_df = df.lazy().collect()
                else:
                    # Simple rechunk for better memory layout
                    optimized_df = df.rechunk()

                self.stats.memory_optimizations += 1
                logger.debug(f"Applied polars memory optimization (aggressive={aggressive})")
                return optimized_df

            elif backend == 'pandas':
                # For pandas, convert to polars, optimize, then convert back
                polars_df = pl.from_pandas(df)
                optimized_polars = polars_df.lazy().collect() if aggressive else polars_df.rechunk()
                optimized_df = optimized_polars.to_pandas()

                self.stats.memory_optimizations += 1
                self.stats.cross_backend_operations += 1
                logger.debug("Applied cross-backend memory optimization")
                return optimized_df

            else:
                logger.debug("No memory optimization applied for unknown backend")
                return df

        except Exception as e:
            logger.warning(f"Memory optimization failed: {e}")
            return df  # Return original dataframe if optimization fails

    def validate_integration_compatibility(self,
                                           df: Union[pl.DataFrame, pd.DataFrame]) -> Dict[str, Any]:
        """
        Validate compatibility with existing polars infrastructure.

        Args:
            df: Dataframe to validate

        Returns:
            Compatibility report
        """
        try:
            backend = detect_backend(df)

            # Test arrow conversion
            arrow_compatible = True
            try:
                arrow_table, _ = to_arrow(df)
                from_arrow(arrow_table, backend)
            except Exception:
                arrow_compatible = False

            # Test expression engine compatibility
            expression_compatible = True
            try:
                # Simple test expression
                self.expression_engine.validate_expression("1 + 1")
            except Exception:
                expression_compatible = False

            # Test enhanced bridge compatibility
            enhanced_compatible = True
            try:
                self.enhanced_bridge.detect_backend(df)
            except Exception:
                enhanced_compatible = False

            return {
                'backend': backend,
                'arrow_compatible': arrow_compatible,
                'expression_compatible': expression_compatible,
                'enhanced_bridge_compatible': enhanced_compatible,
                'overall_compatible': all([
                    arrow_compatible,
                    expression_compatible,
                    enhanced_compatible
                ])
            }

        except Exception as e:
            return {
                'error': str(e),
                'overall_compatible': False
            }

    def get_integration_stats(self) -> Dict[str, Any]:
        """Get integration layer statistics."""
        bridge_stats = self.arrow_bridge.get_stats()
        enhanced_stats = self.enhanced_bridge.get_conversion_stats()
        expression_stats = self.expression_engine.get_execution_stats()

        return {
            'integration_layer': {
                'arrow_conversions': self.stats.arrow_conversions,
                'polars_operations': self.stats.polars_operations,
                'memory_optimizations': self.stats.memory_optimizations,
                'cross_backend_operations': self.stats.cross_backend_operations,
                'total_time_ms': self.stats.total_time_ms
            },
            'arrow_bridge': bridge_stats,
            'enhanced_bridge': {
                'conversions': enhanced_stats.conversions,
                'memory_used_mb': enhanced_stats.memory_used_mb,
                'total_rows_processed': enhanced_stats.total_rows_processed,
                'cleanup_count': enhanced_stats.cleanup_count
            },
            'expression_engine': expression_stats
        }

    def cleanup_integration_resources(self):
        """Clean up integration layer resources."""
        try:
            # Cleanup arrow bridge cache
            self.arrow_bridge.clear_cache()

            # Cleanup enhanced bridge memory
            self.enhanced_bridge.cleanup_arrow_memory()

            # Reset expression engine stats if needed
            if self.expression_engine.get_execution_stats()['total_executions'] > 1000:
                self.expression_engine.reset_stats()

            logger.debug("Integration layer resources cleaned up")

        except Exception as e:
            logger.warning(f"Integration cleanup failed: {e}")

    def benchmark_integration_performance(self,
                                          df: Union[pl.DataFrame, pd.DataFrame],
                                          operations: List[str] = None) -> Dict[str, Any]:
        """
        Benchmark integration layer performance.

        Args:
            df: Test dataframe
            operations: List of operations to benchmark

        Returns:
            Benchmark results
        """
        if operations is None:
            operations = ['conversion', 'memory_optimization', 'expression']

        results = {}

        try:
            # Benchmark conversion
            if 'conversion' in operations:
                import time
                start_time = time.time()

                backend = detect_backend(df)
                target = 'pandas' if backend == 'polars' else 'polars'
                converted = self.optimize_dataframe_conversion(df, target)

                conversion_time = (time.time() - start_time) * 1000
                results['conversion'] = {
                    'time_ms': conversion_time,
                    'source_backend': backend,
                    'target_backend': target,
                    'rows': len(df),
                    'columns': len(df.columns)
                }

            # Benchmark memory optimization
            if 'memory_optimization' in operations:
                import time
                start_time = time.time()

                optimized = self.optimize_memory_usage(df, aggressive=True)

                optimization_time = (time.time() - start_time) * 1000
                results['memory_optimization'] = {
                    'time_ms': optimization_time,
                    'backend': detect_backend(df)
                }

            # Benchmark expression
            if 'expression' in operations:
                try:
                    expr_result = self.expression_engine.benchmark_expression(
                        df, "1 + 1", "test_column", iterations=3
                    )
                    results['expression'] = expr_result
                except Exception as e:
                    results['expression'] = {'error': str(e)}

            return results

        except Exception as e:
            return {'error': str(e)}


# Global integration layer instance
_integration_layer = PolarsIntegrationLayer()


# Convenience functions
def optimize_conversion(df: Union[pl.DataFrame, pd.DataFrame],
                        target_engine: str,
                        columns: Optional[List[str]] = None) -> Union[pl.DataFrame, pd.DataFrame]:
    """Optimize dataframe conversion using integration layer."""
    return _integration_layer.optimize_dataframe_conversion(df, target_engine, columns)


def enhance_result(result: GenerationResult,
                   use_memory_optimization: bool = True) -> GenerationResult:
    """Enhance generation result using integration layer."""
    return _integration_layer.enhance_generation_result(result, use_memory_optimization)


def optimize_context(context: GenerationContext,
                     preferred_backend: str = 'polars') -> GenerationContext:
    """Optimize generation context using integration layer."""
    return _integration_layer.create_cross_backend_context(context, preferred_backend)


def apply_expression(df: Union[pl.DataFrame, pd.DataFrame],
                     expression: str,
                     output_column: str) -> Union[pl.DataFrame, pd.DataFrame]:
    """Apply expression using integration layer."""
    return _integration_layer.apply_polars_expression(df, expression, output_column)


def optimize_memory(df: Union[pl.DataFrame, pd.DataFrame],
                    aggressive: bool = False) -> Union[pl.DataFrame, pd.DataFrame]:
    """Optimize memory usage using integration layer."""
    return _integration_layer.optimize_memory_usage(df, aggressive)


def validate_compatibility(df: Union[pl.DataFrame, pd.DataFrame]) -> Dict[str, Any]:
    """Validate integration compatibility."""
    return _integration_layer.validate_integration_compatibility(df)


def get_integration_stats() -> Dict[str, Any]:
    """Get integration layer statistics."""
    return _integration_layer.get_integration_stats()


def cleanup_integration() -> None:
    """Clean up integration layer resources."""
    _integration_layer.cleanup_integration_resources()


def benchmark_integration(df: Union[pl.DataFrame, pd.DataFrame],
                          operations: List[str] = None) -> Dict[str, Any]:
    """Benchmark integration layer performance."""
    return _integration_layer.benchmark_integration_performance(df, operations)
additory/synthetic/proxy.py
DELETED
@@ -1,60 +0,0 @@
import random
import string
import pandas as pd
from datetime import datetime, timedelta

def random_string(n=8):
    return ''.join(random.choices(string.ascii_letters, k=n))

def random_int(low=0, high=100):
    return random.randint(low, high)

def random_float(low=0, high=1):
    return random.uniform(low, high)

def random_date(start="2020-01-01", end="2023-01-01"):
    start_dt = datetime.fromisoformat(start)
    end_dt = datetime.fromisoformat(end)
    delta = end_dt - start_dt
    return start_dt + timedelta(days=random.randint(0, delta.days))

def random_email():
    return random_string(6).lower() + "@example.com"

def synthetic(rows, schema):
    """
    schema = {
        "name": "string",
        "age": ("int", 18, 60),
        "score": ("float", 0, 1),
        "signup": ("date", "2020-01-01", "2023-01-01"),
        "email": "email"
    }
    """

    data = {}

    for col, rule in schema.items():

        if rule == "string":
            data[col] = [random_string() for _ in range(rows)]

        elif rule == "email":
            data[col] = [random_email() for _ in range(rows)]

        elif isinstance(rule, tuple) and rule[0] == "int":
            _, low, high = rule
            data[col] = [random_int(low, high) for _ in range(rows)]

        elif isinstance(rule, tuple) and rule[0] == "float":
            _, low, high = rule
            data[col] = [random_float(low, high) for _ in range(rows)]

        elif isinstance(rule, tuple) and rule[0] == "date":
            _, start, end = rule
            data[col] = [random_date(start, end) for _ in range(rows)]

        else:
            raise ValueError(f"Unknown schema rule: {rule}")

    return pd.DataFrame(data)