additory 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/polars_expression_engine.py +66 -16
  8. additory/dynamic_api.py +42 -46
  9. additory/expressions/proxy.py +4 -1
  10. additory/synthetic/__init__.py +7 -95
  11. additory/synthetic/column_name_resolver.py +149 -0
  12. additory/{augment → synthetic}/distributions.py +2 -2
  13. additory/{augment → synthetic}/forecast.py +1 -1
  14. additory/synthetic/linked_list_parser.py +415 -0
  15. additory/synthetic/namespace_lookup.py +129 -0
  16. additory/{augment → synthetic}/smote.py +1 -1
  17. additory/{augment → synthetic}/strategies.py +11 -44
  18. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  19. additory/utilities/units.py +4 -1
  20. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/METADATA +12 -17
  21. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
  22. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
  23. additory/augment/__init__.py +0 -24
  24. additory/augment/builtin_lists.py +0 -430
  25. additory/augment/list_registry.py +0 -177
  26. additory/synthetic/api.py +0 -220
  27. additory/synthetic/common_integration.py +0 -314
  28. additory/synthetic/config.py +0 -262
  29. additory/synthetic/engines.py +0 -529
  30. additory/synthetic/exceptions.py +0 -180
  31. additory/synthetic/file_managers.py +0 -518
  32. additory/synthetic/generator.py +0 -702
  33. additory/synthetic/generator_parser.py +0 -68
  34. additory/synthetic/integration.py +0 -319
  35. additory/synthetic/models.py +0 -241
  36. additory/synthetic/pattern_resolver.py +0 -573
  37. additory/synthetic/performance.py +0 -469
  38. additory/synthetic/polars_integration.py +0 -464
  39. additory/synthetic/proxy.py +0 -60
  40. additory/synthetic/schema_parser.py +0 -685
  41. additory/synthetic/validator.py +0 -553
  42. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
  43. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
@@ -1,469 +0,0 @@
1
- """
2
- Performance monitoring and optimization for synthetic data generation.
3
-
4
- Provides comprehensive performance tracking, memory management, and optimization
5
- features for the polars-native synthetic data generation system.
6
- """
7
-
8
- import time
9
- import psutil
10
- import gc
11
- from typing import Dict, List, Optional, Any, Union
12
- from dataclasses import dataclass, field
13
- from contextlib import contextmanager
14
- import polars as pl
15
- import pandas as pd
16
-
17
- from .exceptions import SyntheticDataError
18
-
19
-
20
@dataclass
class PerformanceMetrics:
    """Record describing a single monitored data generation operation.

    The first twelve fields are required and positional; the remaining four
    have defaults. Unit suffixes in the field names (``_ms``, ``_mb``,
    ``_kb``) indicate milliseconds, megabytes and kilobytes respectively.
    """

    # --- identity and timing -------------------------------------------
    operation_name: str
    start_time: float
    end_time: float
    duration_ms: float

    # --- memory readings -----------------------------------------------
    memory_before_mb: float
    memory_after_mb: float
    memory_peak_mb: float
    memory_delta_mb: float

    # --- output size and derived throughput -----------------------------
    rows_generated: int
    columns_generated: int
    rows_per_second: float
    memory_per_row_kb: float

    # --- optional counters ----------------------------------------------
    batch_count: int = 1
    polars_operations: int = 0
    conversion_time_ms: float = 0.0

    # Free-form extras; default_factory gives every instance its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
39
-
40
-
41
@dataclass
class PerformanceComparison:
    """Result of comparing a polars run against an optional pandas run.

    Only ``operation`` and ``polars_metrics`` are required. The advantage
    ratios stay ``None`` and ``recommendation`` stays empty until a caller
    fills them in.
    """

    operation: str
    polars_metrics: PerformanceMetrics

    # Absent when no pandas measurement exists for the operation.
    pandas_metrics: Optional[PerformanceMetrics] = None

    # How many times faster polars is
    polars_advantage_speed: Optional[float] = None

    # How much less memory polars uses
    polars_advantage_memory: Optional[float] = None

    # Human-readable advice derived from the comparison.
    recommendation: str = ""
50
-
51
-
52
class PerformanceMonitor:
    """
    Comprehensive performance monitoring system for synthetic data generation.

    Tracks generation speed, memory usage, and provides optimization recommendations.

    Completed operations are appended to ``metrics_history`` and engine
    comparisons to ``comparisons``. The per-operation counters live on the
    instance and are reset whenever ``monitor_operation`` starts or finishes,
    so only one operation can be tracked at a time (nesting is not supported).
    """

    # Formats accepted by export_metrics().
    _EXPORT_FORMATS = ("dict", "pandas", "polars")

    def __init__(self):
        """Initialize the performance monitor with empty history and no active operation."""
        self.metrics_history: List[PerformanceMetrics] = []
        self.comparisons: List[PerformanceComparison] = []
        self._current_operation: Optional[str] = None
        self._operation_start_time: Optional[float] = None
        self._operation_start_memory: Optional[float] = None
        self._peak_memory: float = 0.0
        self._polars_operations_count: int = 0
        self._conversion_time_ms: float = 0.0

    @contextmanager
    def monitor_operation(self, operation_name: str, rows: int = 0, columns: int = 0, **metadata):
        """
        Context manager for monitoring a data generation operation.

        A PerformanceMetrics record is appended to ``metrics_history`` when the
        block exits, including when it exits because of an exception.

        Args:
            operation_name: Name of the operation being monitored
            rows: Number of rows being generated
            columns: Number of columns being generated
            **metadata: Additional metadata to track

        Yields:
            This monitor, so the block can call update_peak_memory(),
            track_polars_operation() and track_conversion_time().
        """
        # Collect garbage BEFORE sampling the baseline so memory_before_mb (and
        # therefore memory_delta_mb) is not inflated by collectable objects.
        gc.collect()

        start_time = time.time()           # wall-clock timestamp, kept for reporting
        start_clock = time.perf_counter()  # monotonic clock, used for the duration
        start_memory = self._get_memory_usage_mb()

        self._current_operation = operation_name
        self._operation_start_time = start_time
        self._operation_start_memory = start_memory
        self._peak_memory = start_memory
        self._polars_operations_count = 0
        self._conversion_time_ms = 0.0

        try:
            yield self
        finally:
            # The monotonic clock cannot go backwards if the system time is adjusted.
            duration_ms = (time.perf_counter() - start_clock) * 1000
            end_time = time.time()
            end_memory = self._get_memory_usage_mb()

            memory_delta = end_memory - start_memory
            rows_per_second = rows / (duration_ms / 1000) if duration_ms > 0 and rows > 0 else 0.0
            memory_per_row_kb = (memory_delta * 1024) / rows if rows > 0 else 0.0

            metrics = PerformanceMetrics(
                operation_name=operation_name,
                start_time=start_time,
                end_time=end_time,
                duration_ms=duration_ms,
                memory_before_mb=start_memory,
                memory_after_mb=end_memory,
                # The final reading is a sample too: without it the reported
                # peak could be lower than memory_after_mb when the caller
                # never invoked update_peak_memory().
                memory_peak_mb=max(self._peak_memory, end_memory),
                memory_delta_mb=memory_delta,
                rows_generated=rows,
                columns_generated=columns,
                rows_per_second=rows_per_second,
                memory_per_row_kb=memory_per_row_kb,
                polars_operations=self._polars_operations_count,
                conversion_time_ms=self._conversion_time_ms,
                metadata=metadata
            )
            self.metrics_history.append(metrics)

            self._reset_operation_state()

    def _reset_operation_state(self) -> None:
        """Clear the per-operation tracking fields once an operation has finished."""
        self._current_operation = None
        self._operation_start_time = None
        self._operation_start_memory = None
        self._peak_memory = 0.0
        self._polars_operations_count = 0
        self._conversion_time_ms = 0.0

    def update_peak_memory(self):
        """Update peak memory usage during operation (no-op when nothing is being monitored)."""
        if self._current_operation:
            current_memory = self._get_memory_usage_mb()
            self._peak_memory = max(self._peak_memory, current_memory)

    def track_polars_operation(self):
        """Track a polars operation for performance counting (no-op when nothing is being monitored)."""
        if self._current_operation:
            self._polars_operations_count += 1

    def track_conversion_time(self, conversion_time_ms: float):
        """Track time spent on format conversion (no-op when nothing is being monitored)."""
        if self._current_operation:
            self._conversion_time_ms += conversion_time_ms

    def get_latest_metrics(self) -> Optional[PerformanceMetrics]:
        """Get the most recent performance metrics, or None when the history is empty."""
        return self.metrics_history[-1] if self.metrics_history else None

    def get_metrics_summary(self, operation_filter: Optional[str] = None) -> Dict[str, Any]:
        """
        Get a summary of performance metrics.

        Args:
            operation_filter: Filter metrics by operation name (substring match)

        Returns:
            Dictionary with performance summary statistics, or
            ``{"message": "No metrics available"}`` when nothing matches
        """
        filtered_metrics = self.metrics_history
        if operation_filter:
            filtered_metrics = [m for m in self.metrics_history if operation_filter in m.operation_name]

        if not filtered_metrics:
            return {"message": "No metrics available"}

        total_operations = len(filtered_metrics)
        total_duration_ms = sum(m.duration_ms for m in filtered_metrics)

        return {
            "total_operations": total_operations,
            "total_rows_generated": sum(m.rows_generated for m in filtered_metrics),
            "total_duration_ms": total_duration_ms,
            "total_memory_delta_mb": sum(m.memory_delta_mb for m in filtered_metrics),
            # Averages are per operation, not weighted by row count.
            "average_rows_per_second": sum(m.rows_per_second for m in filtered_metrics) / total_operations,
            "average_memory_per_row_kb": sum(m.memory_per_row_kb for m in filtered_metrics) / total_operations,
            "average_duration_ms": total_duration_ms / total_operations,
            "peak_memory_usage_mb": max(m.memory_peak_mb for m in filtered_metrics),
            "min_memory_usage_mb": min(m.memory_before_mb for m in filtered_metrics),
            "memory_efficiency_score": self._calculate_memory_efficiency_score(filtered_metrics)
        }

    def compare_engines(self, polars_metrics: PerformanceMetrics,
                        pandas_metrics: Optional[PerformanceMetrics] = None) -> PerformanceComparison:
        """
        Compare performance between polars and pandas engines.

        An advantage ratio is only computed when both measurements are
        positive; otherwise it is left as None. In particular a zero polars
        duration or memory delta no longer raises ZeroDivisionError.

        Args:
            polars_metrics: Performance metrics from polars engine
            pandas_metrics: Performance metrics from pandas engine (if available)

        Returns:
            PerformanceComparison with analysis and recommendations
        """
        comparison = PerformanceComparison(
            operation=polars_metrics.operation_name,
            polars_metrics=polars_metrics,
            pandas_metrics=pandas_metrics
        )

        if pandas_metrics:
            comparison.polars_advantage_speed = self._positive_ratio(
                pandas_metrics.duration_ms, polars_metrics.duration_ms
            )
            comparison.polars_advantage_memory = self._positive_ratio(
                pandas_metrics.memory_delta_mb, polars_metrics.memory_delta_mb
            )
            comparison.recommendation = self._generate_engine_recommendation(comparison)
        else:
            comparison.recommendation = "Use polars engine for optimal performance"

        self.comparisons.append(comparison)
        return comparison

    @staticmethod
    def _positive_ratio(numerator: float, denominator: float) -> Optional[float]:
        """Return numerator / denominator when both are positive, else None."""
        if numerator > 0 and denominator > 0:
            return numerator / denominator
        return None

    def get_optimization_recommendations(self) -> List[str]:
        """
        Get optimization recommendations based on performance history.

        Only the 10 most recent operations are analysed.

        Returns:
            List of actionable optimization recommendations
        """
        if not self.metrics_history:
            return ["No performance data available for recommendations"]

        recommendations = []
        recent_metrics = self.metrics_history[-10:]  # Last 10 operations
        recent_count = len(recent_metrics)

        # Memory usage recommendations
        avg_memory_per_row = sum(m.memory_per_row_kb for m in recent_metrics) / recent_count
        if avg_memory_per_row > 100:  # More than 100KB per row
            recommendations.append(
                f"High memory usage detected ({avg_memory_per_row:.1f}KB per row). "
                "Consider reducing batch size or using streaming generation."
            )

        # Speed recommendations
        avg_rows_per_second = sum(m.rows_per_second for m in recent_metrics) / recent_count
        if avg_rows_per_second < 1000:  # Less than 1000 rows per second
            recommendations.append(
                f"Low generation speed detected ({avg_rows_per_second:.0f} rows/sec). "
                "Consider optimizing regex patterns or increasing batch size."
            )

        # Batch size recommendations (only operations that recorded a batch_size)
        batch_sizes = [m.metadata['batch_size'] for m in recent_metrics if 'batch_size' in m.metadata]
        if batch_sizes:
            avg_batch_size = sum(batch_sizes) / len(batch_sizes)
            if avg_batch_size < 1000:
                recommendations.append(
                    f"Small batch size detected ({avg_batch_size:.0f}). "
                    "Consider increasing batch size for better performance."
                )
            elif avg_batch_size > 100000:
                recommendations.append(
                    f"Large batch size detected ({avg_batch_size:.0f}). "
                    "Consider reducing batch size to manage memory usage."
                )

        # Conversion overhead recommendations. Both totals cover the same set
        # of operations, so the ratio really is the share of time spent
        # converting and cannot be inflated by operations without conversions.
        total_conversion_ms = sum(m.conversion_time_ms for m in recent_metrics if m.conversion_time_ms > 0)
        if total_conversion_ms > 0:
            total_duration_ms = sum(m.duration_ms for m in recent_metrics)
            conversion_ratio = total_conversion_ms / total_duration_ms if total_duration_ms > 0 else 0

            if conversion_ratio > 0.3:  # More than 30% of time spent on conversion
                recommendations.append(
                    f"High conversion overhead detected ({conversion_ratio*100:.1f}% of total time). "
                    "Consider using polars output format for better performance."
                )

        return recommendations if recommendations else ["Performance looks good! No specific optimizations needed."]

    def _get_memory_usage_mb(self) -> float:
        """Get current memory usage (resident set size of this process) in MB."""
        try:
            process = psutil.Process()
            return process.memory_info().rss / 1024 / 1024
        except Exception:
            # Deliberately best-effort: monitoring must never break generation,
            # so any failure to read process memory is reported as 0.0.
            return 0.0

    def _calculate_memory_efficiency_score(self, metrics: List[PerformanceMetrics]) -> float:
        """
        Calculate a memory efficiency score (0-100).

        Higher scores indicate better memory efficiency.
        """
        if not metrics:
            return 0.0

        # Base score on memory per row (lower is better)
        avg_memory_per_row = sum(m.memory_per_row_kb for m in metrics) / len(metrics)

        # Score calculation: 100 - (memory_per_row_kb / 10)
        # This gives 100 for 0KB/row, 90 for 100KB/row and 0 from 1000KB/row up.
        score = 100.0 - (avg_memory_per_row / 10)

        # Clamp: a negative average (memory was released) would exceed 100.
        return min(100.0, max(0.0, score))

    def _generate_engine_recommendation(self, comparison: PerformanceComparison) -> str:
        """Generate engine recommendation based on performance comparison."""
        if not comparison.pandas_metrics:
            return "Use polars engine for optimal performance"

        # A missing ratio is treated as "no advantage either way".
        speed_advantage = comparison.polars_advantage_speed or 1.0
        memory_advantage = comparison.polars_advantage_memory or 1.0

        if speed_advantage > 2.0 and memory_advantage > 1.5:
            return f"Strong recommendation: Use polars engine ({speed_advantage:.1f}x faster, {memory_advantage:.1f}x more memory efficient)"
        elif speed_advantage > 1.5:
            return f"Recommendation: Use polars engine ({speed_advantage:.1f}x faster)"
        elif memory_advantage > 1.5:
            return f"Recommendation: Use polars engine ({memory_advantage:.1f}x more memory efficient)"
        else:
            return "Both engines perform similarly. Use polars for consistency."

    def clear_history(self):
        """Clear performance metrics history and recorded comparisons."""
        self.metrics_history.clear()
        self.comparisons.clear()

    def export_metrics(self, format: str = "dict") -> Union[List[Dict[str, Any]], pd.DataFrame, pl.DataFrame, None]:
        """
        Export performance metrics in various formats.

        Args:
            format: Export format ("dict", "pandas", "polars")

        Returns:
            For "dict", a list with one dictionary per recorded operation
            (an empty list when there is no history). For "pandas" and
            "polars", a DataFrame built from those dictionaries, or None when
            there is no history.

        Raises:
            ValueError: If format is not one of the supported values. This is
                checked first, so it is raised even when the history is empty.
        """
        if format not in self._EXPORT_FORMATS:
            raise ValueError(f"Unsupported export format: {format}")

        records = [
            {
                "operation_name": m.operation_name,
                "duration_ms": m.duration_ms,
                "memory_delta_mb": m.memory_delta_mb,
                "rows_generated": m.rows_generated,
                "rows_per_second": m.rows_per_second,
                "memory_per_row_kb": m.memory_per_row_kb,
                "polars_operations": m.polars_operations,
                "conversion_time_ms": m.conversion_time_ms
            }
            for m in self.metrics_history
        ]

        if format == "dict":
            return records

        if not records:
            # Kept for backward compatibility: no frame is built without data.
            return None

        if format == "pandas":
            return pd.DataFrame(records)
        return pl.DataFrame(records)
380
-
381
-
382
class PerformanceOptimizer:
    """
    Performance optimization utilities for synthetic data generation.

    Provides automatic optimization recommendations and configuration tuning,
    based on the metrics history of the monitor it is given.
    """

    # Per-row memory estimate used when the monitor has no history yet.
    _DEFAULT_MEMORY_PER_ROW_KB = 10
    # Batch size used when the historical per-row memory is not positive.
    _FALLBACK_BATCH_SIZE = 10000

    def __init__(self, monitor: PerformanceMonitor):
        """Initialize the performance optimizer.

        Args:
            monitor: Monitor whose ``metrics_history`` is consulted for tuning
        """
        self.monitor = monitor

    def optimize_batch_size(self, target_rows: int, available_memory_mb: Optional[float] = None) -> int:
        """
        Calculate optimal batch size based on target rows and available memory.

        The size aims at roughly 50% of the available memory, using the average
        memory per row of the 5 most recent operations. It is clamped to the
        range 1,000-100,000, then capped at ``target_rows``.

        Args:
            target_rows: Total number of rows to generate
            available_memory_mb: Available memory in MB (auto-detected if None)

        Returns:
            Recommended batch size, always at least 1
        """
        if available_memory_mb is None:
            available_memory_mb = self._get_available_memory_mb()

        # Get historical memory usage per row
        recent_metrics = self.monitor.metrics_history[-5:]
        if recent_metrics:
            avg_memory_per_row_kb = sum(m.memory_per_row_kb for m in recent_metrics) / len(recent_metrics)
        else:
            avg_memory_per_row_kb = self._DEFAULT_MEMORY_PER_ROW_KB

        # Calculate batch size to use ~50% of available memory
        target_memory_kb = available_memory_mb * 0.5 * 1024

        if avg_memory_per_row_kb > 0:
            optimal_batch_size = int(target_memory_kb / avg_memory_per_row_kb)
        else:
            # A zero or negative average carries no sizing information.
            optimal_batch_size = self._FALLBACK_BATCH_SIZE

        # Apply constraints
        optimal_batch_size = max(1000, min(optimal_batch_size, 100000))  # Between 1K and 100K
        optimal_batch_size = min(optimal_batch_size, target_rows)  # Don't exceed target rows

        # A non-positive target_rows would otherwise produce a zero or negative
        # batch size, which breaks callers that step or divide by it.
        return max(1, optimal_batch_size)

    def should_use_streaming(self, target_rows: int, columns: int) -> bool:
        """
        Determine if streaming generation should be used for large datasets.

        Args:
            target_rows: Number of rows to generate
            columns: Number of columns to generate

        Returns:
            True if streaming is recommended (more than 10M cells in total)
        """
        total_cells = target_rows * columns
        return total_cells > 10_000_000  # More than 10M cells

    def get_memory_optimization_config(self) -> Dict[str, Any]:
        """
        Get configuration recommendations for memory optimization.

        Returns:
            Dictionary with optimization configuration
        """
        available_memory = self._get_available_memory_mb()

        return {
            "batch_size": min(50000, max(1000, int(available_memory * 100))),  # Scale with memory
            "memory_limit_mb": available_memory * 0.8,  # Use 80% of available memory
            "lazy_evaluation": True,
            "garbage_collection_frequency": 5,  # GC every 5 batches
            "streaming_threshold": 1_000_000  # Use streaming for >1M rows
        }

    def _get_available_memory_mb(self) -> float:
        """Get available system memory in MB."""
        try:
            memory = psutil.virtual_memory()
            return memory.available / 1024 / 1024
        except Exception:
            # Best-effort by design: tuning must still work without a reading.
            return 1024.0  # Default to 1GB if detection fails
465
-
466
-
467
- # Global instances created at import time: one shared monitor, and an optimizer bound to that monitor
468
- performance_monitor = PerformanceMonitor()
469
- performance_optimizer = PerformanceOptimizer(performance_monitor)