additory 0.1.0a2__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/polars_expression_engine.py +66 -16
  8. additory/dynamic_api.py +42 -46
  9. additory/expressions/proxy.py +4 -1
  10. additory/synthetic/__init__.py +7 -95
  11. additory/synthetic/column_name_resolver.py +149 -0
  12. additory/{augment → synthetic}/distributions.py +2 -2
  13. additory/{augment → synthetic}/forecast.py +1 -1
  14. additory/synthetic/linked_list_parser.py +415 -0
  15. additory/synthetic/namespace_lookup.py +129 -0
  16. additory/{augment → synthetic}/smote.py +1 -1
  17. additory/{augment → synthetic}/strategies.py +11 -44
  18. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  19. additory/utilities/units.py +4 -1
  20. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/METADATA +10 -17
  21. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
  22. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
  23. additory/augment/__init__.py +0 -24
  24. additory/augment/builtin_lists.py +0 -430
  25. additory/augment/list_registry.py +0 -177
  26. additory/synthetic/api.py +0 -220
  27. additory/synthetic/common_integration.py +0 -314
  28. additory/synthetic/config.py +0 -262
  29. additory/synthetic/engines.py +0 -529
  30. additory/synthetic/exceptions.py +0 -180
  31. additory/synthetic/file_managers.py +0 -518
  32. additory/synthetic/generator.py +0 -702
  33. additory/synthetic/generator_parser.py +0 -68
  34. additory/synthetic/integration.py +0 -319
  35. additory/synthetic/models.py +0 -241
  36. additory/synthetic/pattern_resolver.py +0 -573
  37. additory/synthetic/performance.py +0 -469
  38. additory/synthetic/polars_integration.py +0 -464
  39. additory/synthetic/proxy.py +0 -60
  40. additory/synthetic/schema_parser.py +0 -685
  41. additory/synthetic/validator.py +0 -553
  42. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
  43. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
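The moves above fold the old additory.augment package into additory.synthetic, renaming augmentor.py to synthesizer.py along the way. For downstream code, a minimal import shim can bridge the rename; this is a sketch under the assumption that the moved modules keep their public names (the diff shows the file moves, not the 0.1.0a3 exports):

```python
# Hedged compatibility shim across the 0.1.0a2 -> 0.1.0a3 package move.
# Module names are inferred from the file renames listed above; verify
# against the installed wheel before relying on them.
try:
    # 0.1.0a3 layout: augment modules folded into additory.synthetic
    from additory.synthetic import distributions, forecast, smote
except ImportError:
    # 0.1.0a2 layout
    from additory.augment import distributions, forecast, smote
```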
additory/synthetic/polars_integration.py (deleted)
@@ -1,464 +0,0 @@
- """
- Integration layer with existing polars infrastructure.
-
- Leverages existing arrow bridge components and polars infrastructure
- for enhanced performance and compatibility in synthetic data generation.
- """
-
- import logging
- from typing import Dict, Any, Optional, Union, List
- from dataclasses import dataclass
- import polars as pl
- import pandas as pd
-
- from ..core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
- from ..core.polars_expression_engine import PolarsExpressionEngine
- from ..common.backend import detect_backend
- from .models import GenerationResult, GenerationContext
- from .exceptions import SyntheticDataError
- from .performance import performance_monitor
-
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class IntegrationStats:
-     """Statistics for polars integration operations."""
-     arrow_conversions: int = 0
-     polars_operations: int = 0
-     memory_optimizations: int = 0
-     cross_backend_operations: int = 0
-     total_time_ms: float = 0.0
-
-
- class PolarsIntegrationLayer:
-     """
-     Integration layer that leverages existing polars infrastructure
-     for synthetic data generation operations.
-     """
-
-     def __init__(self):
-         """Initialize the integration layer."""
-         # Use existing infrastructure components
-         self.enhanced_bridge = EnhancedArrowBridge()
-         self.expression_engine = PolarsExpressionEngine()
-
-         # Integration statistics
-         self.stats = IntegrationStats()
-
-         logger.info("Polars integration layer initialized")
-
-     def optimize_dataframe_conversion(self,
-                                       df: Union[pl.DataFrame, pd.DataFrame],
-                                       target_engine: str,
-                                       columns: Optional[List[str]] = None) -> Union[pl.DataFrame, pd.DataFrame]:
-         """
-         Optimize dataframe conversion using existing arrow bridge infrastructure.
-
-         Args:
-             df: Source dataframe
-             target_engine: Target engine ('pandas' or 'polars')
-             columns: Optional column subset for memory efficiency
-
-         Returns:
-             Converted dataframe in target format
-         """
-         with performance_monitor.monitor_operation(
-             "polars_integration_conversion",
-             rows=len(df),
-             columns=len(columns) if columns else len(df.columns)
-         ):
-             try:
-                 source_backend = detect_backend(df)
-
-                 # Use column subsetting for memory efficiency
-                 if columns and len(columns) < len(df.columns):
-                     logger.debug(f"Using column subsetting: {len(columns)}/{len(df.columns)} columns")
-
-                 # Convert using existing arrow bridge
-                 arrow_table = self.enhanced_bridge.to_arrow(df, source_backend)
-                 result_df = self.enhanced_bridge.from_arrow(arrow_table, target_engine)
-
-                 # Update statistics
-                 self.stats.arrow_conversions += 1
-                 if source_backend != target_engine:
-                     self.stats.cross_backend_operations += 1
-
-                 logger.debug(f"Converted {source_backend} to {target_engine} via Arrow bridge")
-                 return result_df
-
-             except Exception as e:
-                 raise SyntheticDataError(f"Dataframe conversion failed: {e}")
-
-     def enhance_generation_result(self,
-                                   result: GenerationResult,
-                                   use_memory_optimization: bool = True) -> GenerationResult:
-         """
-         Enhance generation result using existing polars infrastructure.
-
-         Args:
-             result: Original generation result
-             use_memory_optimization: Whether to apply memory optimizations
-
-         Returns:
-             Enhanced generation result
-         """
-         try:
-             # Apply memory optimization if requested
-             if use_memory_optimization and hasattr(result.dataframe, 'lazy'):
-                 # Use polars lazy evaluation for memory optimization
-                 if isinstance(result.dataframe, pl.DataFrame):
-                     optimized_df = result.dataframe.lazy().collect()
-                     result.dataframe = optimized_df
-                     self.stats.memory_optimizations += 1
-                     logger.debug("Applied polars lazy evaluation optimization")
-
-             # Add integration metadata
-             if 'integration_layer' not in result.metadata:
-                 result.metadata['integration_layer'] = {
-                     'arrow_bridge_used': True,
-                     'polars_optimized': use_memory_optimization,
-                     'backend_detected': detect_backend(result.dataframe)
-                 }
-
-             return result
-
-         except Exception as e:
-             logger.warning(f"Result enhancement failed: {e}")
-             return result  # Return original result if enhancement fails
-
-     def create_cross_backend_context(self,
-                                      context: GenerationContext,
-                                      preferred_backend: str = 'polars') -> GenerationContext:
-         """
-         Create a generation context optimized for cross-backend operations.
-
-         Args:
-             context: Original generation context
-             preferred_backend: Preferred backend for internal operations
-
-         Returns:
-             Optimized generation context
-         """
-         try:
-             # Clone the context
-             optimized_context = GenerationContext(
-                 schema=context.schema,
-                 resolved_patterns=context.resolved_patterns,
-                 target_rows=context.target_rows,
-                 output_engine=context.output_engine,
-                 batch_size=context.batch_size,
-                 seed=context.seed
-             )
-
-             # Optimize batch size using existing infrastructure
-             if hasattr(self.enhanced_bridge, 'get_memory_stats'):
-                 memory_stats = self.enhanced_bridge.get_memory_stats()
-                 if memory_stats.get('cleanup_needed', False):
-                     # Reduce batch size if memory pressure is high
-                     optimized_context.batch_size = min(context.batch_size, 5000)
-                     logger.debug("Reduced batch size due to memory pressure")
-
-             return optimized_context
-
-         except Exception as e:
-             logger.warning(f"Context optimization failed: {e}")
-             return context  # Return original context if optimization fails
-
-     def apply_polars_expression(self,
-                                 df: Union[pl.DataFrame, pd.DataFrame],
-                                 expression: str,
-                                 output_column: str) -> Union[pl.DataFrame, pd.DataFrame]:
-         """
-         Apply expression using existing polars expression engine.
-
-         Args:
-             df: Input dataframe
-             expression: Expression to apply
-             output_column: Name for output column
-
-         Returns:
-             Dataframe with expression result
-         """
-         try:
-             # Use existing expression engine
-             result = self.expression_engine.execute_expression(
-                 df, expression, output_column
-             )
-
-             self.stats.polars_operations += 1
-             self.stats.total_time_ms += result.execution_time_ms
-
-             logger.debug(f"Applied expression '{expression}' in {result.execution_time_ms:.1f}ms")
-             return result.dataframe
-
-         except Exception as e:
-             raise SyntheticDataError(f"Expression application failed: {e}")
-
-     def optimize_memory_usage(self,
-                               df: Union[pl.DataFrame, pd.DataFrame],
-                               aggressive: bool = False) -> Union[pl.DataFrame, pd.DataFrame]:
-         """
-         Optimize memory usage using existing infrastructure.
-
-         Args:
-             df: Input dataframe
-             aggressive: Whether to use aggressive optimization
-
-         Returns:
-             Memory-optimized dataframe
-         """
-         try:
-             backend = detect_backend(df)
-
-             if backend == 'polars' and isinstance(df, pl.DataFrame):
-                 # Use polars-specific optimizations
-                 if aggressive:
-                     # Use lazy evaluation and collect
-                     optimized_df = df.lazy().collect()
-                 else:
-                     # Simple rechunk for better memory layout
-                     optimized_df = df.rechunk()
-
-                 self.stats.memory_optimizations += 1
-                 logger.debug(f"Applied polars memory optimization (aggressive={aggressive})")
-                 return optimized_df
-
-             elif backend == 'pandas':
-                 # For pandas, convert to polars, optimize, then convert back
-                 polars_df = pl.from_pandas(df)
-                 optimized_polars = polars_df.lazy().collect() if aggressive else polars_df.rechunk()
-                 optimized_df = optimized_polars.to_pandas()
-
-                 self.stats.memory_optimizations += 1
-                 self.stats.cross_backend_operations += 1
-                 logger.debug("Applied cross-backend memory optimization")
-                 return optimized_df
-
-             else:
-                 logger.debug("No memory optimization applied for unknown backend")
-                 return df
-
-         except Exception as e:
-             logger.warning(f"Memory optimization failed: {e}")
-             return df  # Return original dataframe if optimization fails
-
-     def validate_integration_compatibility(self,
-                                            df: Union[pl.DataFrame, pd.DataFrame]) -> Dict[str, Any]:
-         """
-         Validate compatibility with existing polars infrastructure.
-
-         Args:
-             df: Dataframe to validate
-
-         Returns:
-             Compatibility report
-         """
-         try:
-             backend = detect_backend(df)
-
-             # Test arrow conversion
-             arrow_compatible = True
-             try:
-                 arrow_table, _ = to_arrow(df)
-                 from_arrow(arrow_table, backend)
-             except Exception:
-                 arrow_compatible = False
-
-             # Test expression engine compatibility
-             expression_compatible = True
-             try:
-                 # Simple test expression
-                 self.expression_engine.validate_expression("1 + 1")
-             except Exception:
-                 expression_compatible = False
-
-             # Test enhanced bridge compatibility
-             enhanced_compatible = True
-             try:
-                 self.enhanced_bridge.detect_backend(df)
-             except Exception:
-                 enhanced_compatible = False
-
-             return {
-                 'backend': backend,
-                 'arrow_compatible': arrow_compatible,
-                 'expression_compatible': expression_compatible,
-                 'enhanced_bridge_compatible': enhanced_compatible,
-                 'overall_compatible': all([
-                     arrow_compatible,
-                     expression_compatible,
-                     enhanced_compatible
-                 ])
-             }
-
-         except Exception as e:
-             return {
-                 'error': str(e),
-                 'overall_compatible': False
-             }
-
-     def get_integration_stats(self) -> Dict[str, Any]:
-         """Get integration layer statistics."""
-         bridge_stats = self.arrow_bridge.get_stats()
-         enhanced_stats = self.enhanced_bridge.get_conversion_stats()
-         expression_stats = self.expression_engine.get_execution_stats()
-
-         return {
-             'integration_layer': {
-                 'arrow_conversions': self.stats.arrow_conversions,
-                 'polars_operations': self.stats.polars_operations,
-                 'memory_optimizations': self.stats.memory_optimizations,
-                 'cross_backend_operations': self.stats.cross_backend_operations,
-                 'total_time_ms': self.stats.total_time_ms
-             },
-             'arrow_bridge': bridge_stats,
-             'enhanced_bridge': {
-                 'conversions': enhanced_stats.conversions,
-                 'memory_used_mb': enhanced_stats.memory_used_mb,
-                 'total_rows_processed': enhanced_stats.total_rows_processed,
-                 'cleanup_count': enhanced_stats.cleanup_count
-             },
-             'expression_engine': expression_stats
-         }
-
-     def cleanup_integration_resources(self):
-         """Clean up integration layer resources."""
-         try:
-             # Cleanup arrow bridge cache
-             self.arrow_bridge.clear_cache()
-
-             # Cleanup enhanced bridge memory
-             self.enhanced_bridge.cleanup_arrow_memory()
-
-             # Reset expression engine stats if needed
-             if self.expression_engine.get_execution_stats()['total_executions'] > 1000:
-                 self.expression_engine.reset_stats()
-
-             logger.debug("Integration layer resources cleaned up")
-
-         except Exception as e:
-             logger.warning(f"Integration cleanup failed: {e}")
-
-     def benchmark_integration_performance(self,
-                                           df: Union[pl.DataFrame, pd.DataFrame],
-                                           operations: List[str] = None) -> Dict[str, Any]:
-         """
-         Benchmark integration layer performance.
-
-         Args:
-             df: Test dataframe
-             operations: List of operations to benchmark
-
-         Returns:
-             Benchmark results
-         """
-         if operations is None:
-             operations = ['conversion', 'memory_optimization', 'expression']
-
-         results = {}
-
-         try:
-             # Benchmark conversion
-             if 'conversion' in operations:
-                 import time
-                 start_time = time.time()
-
-                 backend = detect_backend(df)
-                 target = 'pandas' if backend == 'polars' else 'polars'
-                 converted = self.optimize_dataframe_conversion(df, target)
-
-                 conversion_time = (time.time() - start_time) * 1000
-                 results['conversion'] = {
-                     'time_ms': conversion_time,
-                     'source_backend': backend,
-                     'target_backend': target,
-                     'rows': len(df),
-                     'columns': len(df.columns)
-                 }
-
-             # Benchmark memory optimization
-             if 'memory_optimization' in operations:
-                 import time
-                 start_time = time.time()
-
-                 optimized = self.optimize_memory_usage(df, aggressive=True)
-
-                 optimization_time = (time.time() - start_time) * 1000
-                 results['memory_optimization'] = {
-                     'time_ms': optimization_time,
-                     'backend': detect_backend(df)
-                 }
-
-             # Benchmark expression
-             if 'expression' in operations:
-                 try:
-                     expr_result = self.expression_engine.benchmark_expression(
-                         df, "1 + 1", "test_column", iterations=3
-                     )
-                     results['expression'] = expr_result
-                 except Exception as e:
-                     results['expression'] = {'error': str(e)}
-
-             return results
-
-         except Exception as e:
-             return {'error': str(e)}
-
-
- # Global integration layer instance
- _integration_layer = PolarsIntegrationLayer()
-
-
- # Convenience functions
- def optimize_conversion(df: Union[pl.DataFrame, pd.DataFrame],
-                         target_engine: str,
-                         columns: Optional[List[str]] = None) -> Union[pl.DataFrame, pd.DataFrame]:
-     """Optimize dataframe conversion using integration layer."""
-     return _integration_layer.optimize_dataframe_conversion(df, target_engine, columns)
-
-
- def enhance_result(result: GenerationResult,
-                    use_memory_optimization: bool = True) -> GenerationResult:
-     """Enhance generation result using integration layer."""
-     return _integration_layer.enhance_generation_result(result, use_memory_optimization)
-
-
- def optimize_context(context: GenerationContext,
-                      preferred_backend: str = 'polars') -> GenerationContext:
-     """Optimize generation context using integration layer."""
-     return _integration_layer.create_cross_backend_context(context, preferred_backend)
-
-
- def apply_expression(df: Union[pl.DataFrame, pd.DataFrame],
-                      expression: str,
-                      output_column: str) -> Union[pl.DataFrame, pd.DataFrame]:
-     """Apply expression using integration layer."""
-     return _integration_layer.apply_polars_expression(df, expression, output_column)
-
-
- def optimize_memory(df: Union[pl.DataFrame, pd.DataFrame],
-                     aggressive: bool = False) -> Union[pl.DataFrame, pd.DataFrame]:
-     """Optimize memory usage using integration layer."""
-     return _integration_layer.optimize_memory_usage(df, aggressive)
-
-
- def validate_compatibility(df: Union[pl.DataFrame, pd.DataFrame]) -> Dict[str, Any]:
-     """Validate integration compatibility."""
-     return _integration_layer.validate_integration_compatibility(df)
-
-
- def get_integration_stats() -> Dict[str, Any]:
-     """Get integration layer statistics."""
-     return _integration_layer.get_integration_stats()
-
-
- def cleanup_integration() -> None:
-     """Clean up integration layer resources."""
-     _integration_layer.cleanup_integration_resources()
-
-
- def benchmark_integration(df: Union[pl.DataFrame, pd.DataFrame],
-                           operations: List[str] = None) -> Dict[str, Any]:
-     """Benchmark integration layer performance."""
-     return _integration_layer.benchmark_integration_performance(df, operations)
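For readers losing the optimize_conversion helper above: the Arrow round-trip it wrapped can be reproduced with public polars and pyarrow APIs alone. A minimal sketch, assuming only pandas and polars inputs; additory's internal EnhancedArrowBridge adds caching and memory accounting that this sketch omits:

```python
# Standalone sketch of the pandas <-> polars round-trip via Arrow that the
# removed optimize_conversion() helper wrapped. Uses only public APIs.
import pandas as pd
import polars as pl
import pyarrow as pa

def convert_via_arrow(df, target_engine: str):
    """Convert a pandas or polars frame to target_engine through Arrow."""
    # Normalize the source to an Arrow table (zero-copy where possible).
    table = df.to_arrow() if isinstance(df, pl.DataFrame) else pa.Table.from_pandas(df)
    if target_engine == "polars":
        return pl.from_arrow(table)
    if target_engine == "pandas":
        return table.to_pandas()
    raise ValueError(f"Unknown target engine: {target_engine}")
```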
additory/synthetic/proxy.py (deleted)
@@ -1,60 +0,0 @@
- import random
- import string
- import pandas as pd
- from datetime import datetime, timedelta
-
- def random_string(n=8):
-     return ''.join(random.choices(string.ascii_letters, k=n))
-
- def random_int(low=0, high=100):
-     return random.randint(low, high)
-
- def random_float(low=0, high=1):
-     return random.uniform(low, high)
-
- def random_date(start="2020-01-01", end="2023-01-01"):
-     start_dt = datetime.fromisoformat(start)
-     end_dt = datetime.fromisoformat(end)
-     delta = end_dt - start_dt
-     return start_dt + timedelta(days=random.randint(0, delta.days))
-
- def random_email():
-     return random_string(6).lower() + "@example.com"
-
- def synthetic(rows, schema):
-     """
-     schema = {
-         "name": "string",
-         "age": ("int", 18, 60),
-         "score": ("float", 0, 1),
-         "signup": ("date", "2020-01-01", "2023-01-01"),
-         "email": "email"
-     }
-     """
-
-     data = {}
-
-     for col, rule in schema.items():
-
-         if rule == "string":
-             data[col] = [random_string() for _ in range(rows)]
-
-         elif rule == "email":
-             data[col] = [random_email() for _ in range(rows)]
-
-         elif isinstance(rule, tuple) and rule[0] == "int":
-             _, low, high = rule
-             data[col] = [random_int(low, high) for _ in range(rows)]
-
-         elif isinstance(rule, tuple) and rule[0] == "float":
-             _, low, high = rule
-             data[col] = [random_float(low, high) for _ in range(rows)]
-
-         elif isinstance(rule, tuple) and rule[0] == "date":
-             _, start, end = rule
-             data[col] = [random_date(start, end) for _ in range(rows)]
-
-         else:
-             raise ValueError(f"Unknown schema rule: {rule}")
-
-     return pd.DataFrame(data)
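The removed proxy.py above carried its own schema grammar in the synthetic() docstring. A usage sketch built directly from that docstring (the column names and ranges are the docstring's own examples):

```python
# Usage of the removed synthetic() helper, per its docstring.
df = synthetic(100, {
    "name": "string",                                # random 8-char string
    "age": ("int", 18, 60),                          # uniform int in [18, 60]
    "score": ("float", 0, 1),                        # uniform float in [0, 1]
    "signup": ("date", "2020-01-01", "2023-01-01"),  # random date in range
    "email": "email",                                # random @example.com address
})
print(df.head())
```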