additory-0.1.0a2-py3-none-any.whl → additory-0.1.0a3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/polars_expression_engine.py +66 -16
  8. additory/dynamic_api.py +42 -46
  9. additory/expressions/proxy.py +4 -1
  10. additory/synthetic/__init__.py +7 -95
  11. additory/synthetic/column_name_resolver.py +149 -0
  12. additory/{augment → synthetic}/distributions.py +2 -2
  13. additory/{augment → synthetic}/forecast.py +1 -1
  14. additory/synthetic/linked_list_parser.py +415 -0
  15. additory/synthetic/namespace_lookup.py +129 -0
  16. additory/{augment → synthetic}/smote.py +1 -1
  17. additory/{augment → synthetic}/strategies.py +11 -44
  18. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  19. additory/utilities/units.py +4 -1
  20. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/METADATA +10 -17
  21. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
  22. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
  23. additory/augment/__init__.py +0 -24
  24. additory/augment/builtin_lists.py +0 -430
  25. additory/augment/list_registry.py +0 -177
  26. additory/synthetic/api.py +0 -220
  27. additory/synthetic/common_integration.py +0 -314
  28. additory/synthetic/config.py +0 -262
  29. additory/synthetic/engines.py +0 -529
  30. additory/synthetic/exceptions.py +0 -180
  31. additory/synthetic/file_managers.py +0 -518
  32. additory/synthetic/generator.py +0 -702
  33. additory/synthetic/generator_parser.py +0 -68
  34. additory/synthetic/integration.py +0 -319
  35. additory/synthetic/models.py +0 -241
  36. additory/synthetic/pattern_resolver.py +0 -573
  37. additory/synthetic/performance.py +0 -469
  38. additory/synthetic/polars_integration.py +0 -464
  39. additory/synthetic/proxy.py +0 -60
  40. additory/synthetic/schema_parser.py +0 -685
  41. additory/synthetic/validator.py +0 -553
  42. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
  43. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
additory/synthetic/generator.py
@@ -1,702 +0,0 @@
- """
- Polars-native generation engine for synthetic data.
-
- Implements regex-based value generation using polars operations with
- distribution strategy integration and memory-efficient batch processing.
- """
-
- import re
- import random
- import string
- from typing import Dict, List, Optional, Union, Any
- from dataclasses import dataclass
- import polars as pl
- import pandas as pd
-
- from .models import (
-     GenerationContext,
-     GenerationResult,
-     ResolvedPattern,
-     DistributionStrategy,
-     ValidationResult,
-     DistributionType
- )
- from .engines import DistributionManager
- from .exceptions import SyntheticDataError, ValidationError
- from .validator import ValidationSystem
- from .performance import performance_monitor, performance_optimizer, PerformanceMetrics
- from .polars_integration import optimize_conversion, enhance_result, optimize_context, optimize_memory
-
-
- @dataclass
- class GenerationConfig:
-     """Configuration for data generation operations."""
-     batch_size: int = 10000
-     seed: Optional[int] = None
-     validate_patterns: bool = True
-     memory_limit_mb: Optional[int] = None
-     enable_performance_monitoring: bool = True
-     auto_optimize_batch_size: bool = True
-     lazy_evaluation: bool = True
-     garbage_collection_frequency: int = 5
-
-
- class RegexGenerator:
-     """Generates realistic values from regex patterns."""
-
-     def __init__(self, seed: Optional[int] = None):
-         """Initialize the regex generator with optional seed."""
-         self.seed = seed
-         self.random = random.Random(seed)
-
-         # Regex compilation cache for performance
-         self._regex_cache: Dict[str, re.Pattern] = {}
-         self._pattern_cache: Dict[str, List[str]] = {}
-
-     def _get_compiled_regex(self, pattern: str) -> re.Pattern:
-         """Get compiled regex from cache or compile and cache it."""
-         if pattern not in self._regex_cache:
-             try:
-                 self._regex_cache[pattern] = re.compile(pattern)
-             except re.error as e:
-                 raise SyntheticDataError(f"Invalid regex pattern '{pattern}': {e}")
-         return self._regex_cache[pattern]
-
-     def warm_cache(self, patterns: List[str]) -> None:
-         """Pre-compile and cache regex patterns for better performance."""
-         for pattern in patterns:
-             self._get_compiled_regex(pattern)
-
-     def clear_cache(self) -> None:
-         """Clear the regex compilation cache."""
-         self._regex_cache.clear()
-         self._pattern_cache.clear()
-
-     def get_cache_stats(self) -> Dict[str, int]:
-         """Get cache statistics."""
-         return {
-             'regex_cache_size': len(self._regex_cache),
-             'pattern_cache_size': len(self._pattern_cache)
-         }
-
-     def generate_values(self, pattern: str, count: int) -> List[str]:
-         """Generate realistic values from a regex pattern."""
-         try:
-             # For now, implement basic regex generation
-             # This is a simplified implementation - in production, you'd want
-             # a more sophisticated regex-to-value generator
-             return self._generate_from_pattern(pattern, count)
-         except Exception as e:
-             raise SyntheticDataError(f"Failed to generate values from pattern '{pattern}': {e}")
-
-     def _generate_from_pattern(self, pattern: str, count: int) -> List[str]:
-         """Generate values from regex pattern using pattern analysis."""
-         # Remove anchors if present
-         clean_pattern = pattern.strip('^$')
-
-         # Handle common patterns
-         if self._is_email_pattern(clean_pattern):
-             return self._generate_emails(count)
-         elif self._is_phone_pattern(clean_pattern):
-             return self._generate_phones(count)
-         elif self._is_uuid_pattern(clean_pattern):
-             return self._generate_uuids(count)
-         elif self._is_numeric_pattern(clean_pattern):
-             return self._generate_numbers(clean_pattern, count)
-         elif self._is_name_pattern(clean_pattern):
-             return self._generate_names(count)
-         else:
-             return self._generate_generic(clean_pattern, count)
-
-     def _is_email_pattern(self, pattern: str) -> bool:
-         """Check if pattern looks like an email regex."""
-         email_indicators = ['@', r'\@', r'[A-Za-z0-9._%+-]+', r'\.[A-Za-z]{2,}']
-         return any(indicator in pattern for indicator in email_indicators)
-
-     def _is_phone_pattern(self, pattern: str) -> bool:
-         """Check if pattern looks like a phone regex."""
-         phone_indicators = [r'\+?', r'[0-9\s\-\(\)]', r'\d{3}', r'\(\d{3}\)']
-         return any(indicator in pattern for indicator in phone_indicators)
-
-     def _is_uuid_pattern(self, pattern: str) -> bool:
-         """Check if pattern looks like a UUID regex."""
-         uuid_indicators = [r'[0-9a-f]{8}', r'[0-9A-F]{8}', r'[a-f0-9]{4}', r'\-[a-f0-9]{4}\-']
-         return any(indicator in pattern for indicator in uuid_indicators)
-
-     def _is_numeric_pattern(self, pattern: str) -> bool:
-         """Check if pattern is primarily numeric."""
-         numeric_indicators = [r'\d+', r'[0-9]+', r'\d{', r'[0-9]{']
-         return any(indicator in pattern for indicator in numeric_indicators)
-
-     def _is_name_pattern(self, pattern: str) -> bool:
-         """Check if pattern looks like a name regex."""
-         name_indicators = [r'[A-Z][a-z]+', r'[A-Za-z]+\s[A-Za-z]+']
-         return any(indicator in pattern for indicator in name_indicators)
-
-     def _generate_emails(self, count: int) -> List[str]:
-         """Generate realistic email addresses."""
-         domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'company.com', 'test.org', 'example.net']
-         prefixes = ['user', 'john', 'jane', 'test', 'admin', 'info', 'contact', 'support', 'sales', 'dev']
-
-         emails = []
-         for i in range(count):
-             prefix = self.random.choice(prefixes)
-             number = self.random.randint(1, 9999) if self.random.random() > 0.3 else ""
-             domain = self.random.choice(domains)
-             # Add some variation with dots and underscores
-             separator = self.random.choice(['', '.', '_']) if self.random.random() > 0.5 else ""
-             emails.append(f"{prefix}{separator}{number}@{domain}")
-
-         return emails
-
-     def _generate_phones(self, count: int) -> List[str]:
-         """Generate realistic phone numbers."""
-         formats = [
-             lambda: f"+1-{self.random.randint(200, 999)}-{self.random.randint(200, 999)}-{self.random.randint(1000, 9999)}",
-             lambda: f"({self.random.randint(200, 999)}) {self.random.randint(200, 999)}-{self.random.randint(1000, 9999)}",
-             lambda: f"{self.random.randint(200, 999)}-{self.random.randint(200, 999)}-{self.random.randint(1000, 9999)}",
-             lambda: f"{self.random.randint(200, 999)}.{self.random.randint(200, 999)}.{self.random.randint(1000, 9999)}"
-         ]
-
-         return [self.random.choice(formats)() for _ in range(count)]
-
-     def _generate_uuids(self, count: int) -> List[str]:
-         """Generate realistic UUID-like strings."""
-         import uuid
-         return [str(uuid.uuid4()) for _ in range(count)]
-
-     def _generate_numbers(self, pattern: str, count: int) -> List[str]:
-         """Generate numbers based on pattern analysis."""
-         # Extract number ranges from pattern
-         if r'\d{' in pattern:
-             # Extract digit count
-             match = re.search(r'\\d\{(\d+)\}', pattern)
-             if match:
-                 digit_count = int(match.group(1))
-                 min_val = 10**(digit_count-1) if digit_count > 1 else 0
-                 max_val = 10**digit_count - 1
-                 return [str(self.random.randint(min_val, max_val)) for _ in range(count)]
-
-         # Default numeric generation
-         return [str(self.random.randint(1, 99999)) for _ in range(count)]
-
-     def _generate_names(self, count: int) -> List[str]:
-         """Generate realistic names."""
-         first_names = ['John', 'Jane', 'Michael', 'Sarah', 'David', 'Lisa', 'Robert', 'Emily']
-         last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis']
-
-         return [f"{self.random.choice(first_names)} {self.random.choice(last_names)}" for _ in range(count)]
-
-     def _generate_generic(self, pattern: str, count: int) -> List[str]:
-         """Generate generic values for unrecognized patterns."""
-         # Simple fallback - generate alphanumeric strings
-         length = self._estimate_length_from_pattern(pattern)
-
-         values = []
-         for _ in range(count):
-             value = ''.join(self.random.choices(string.ascii_letters + string.digits, k=length))
-             values.append(value)
-
-         return values
-
-     def _estimate_length_from_pattern(self, pattern: str) -> int:
-         """Estimate appropriate length for generated values."""
-         # Look for explicit length specifications
-         length_match = re.search(r'\{(\d+)\}', pattern)
-         if length_match:
-             return int(length_match.group(1))
-
-         range_match = re.search(r'\{(\d+),(\d+)\}', pattern)
-         if range_match:
-             min_len, max_len = int(range_match.group(1)), int(range_match.group(2))
-             return self.random.randint(min_len, max_len)
-
-         # Default length based on pattern complexity
-         if len(pattern) > 50:
-             return self.random.randint(8, 15)
-         elif len(pattern) > 20:
-             return self.random.randint(5, 10)
-         else:
-             return self.random.randint(3, 8)
-
-
- class PolarsGeneratorCore:
-     """Core polars-based data generation engine."""
-
-     def __init__(self, config: Optional[GenerationConfig] = None):
-         """Initialize the polars generator core."""
-         self.config = config or GenerationConfig()
-         self.regex_generator = RegexGenerator(seed=self.config.seed)
-         self.distribution_manager = DistributionManager(seed=self.config.seed)
-         self.validation_system = ValidationSystem()
-
-     def generate_column(self,
-                         pattern: ResolvedPattern,
-                         distribution: DistributionStrategy,
-                         rows: int,
-                         column_name: str = "values") -> pl.Series:
-         """Generate a single column using polars operations."""
-         try:
-             # Step 1: Generate base values from regex pattern
-             base_values = self._generate_base_values(pattern, rows)
-
-             # Step 2: Apply distribution strategy
-             result_series = self.distribution_manager.apply_distribution(
-                 distribution, base_values, rows
-             )
-
-             # Step 3: Rename series to column name
-             return result_series.alias(column_name)
-
-         except Exception as e:
-             raise SyntheticDataError(f"Failed to generate column '{column_name}': {e}")
-
-     def generate_dataframe(self, context: GenerationContext) -> pl.DataFrame:
-         """Generate complete dataframe from generation context."""
-         # Only validate context if validation is enabled
-         if self.config.validate_patterns and not context.validate_context():
-             raise ValidationError("Invalid generation context")
-
-         try:
-             columns = []
-
-             # Generate each column
-             for field_name, field_def in context.schema.field_definitions.items():
-                 pattern = context.get_pattern(field_def.pattern_name)
-                 if not pattern:
-                     raise SyntheticDataError(f"Pattern '{field_def.pattern_name}' not found")
-
-                 column = self.generate_column(
-                     pattern=pattern,
-                     distribution=field_def.distribution,
-                     rows=context.target_rows,
-                     column_name=field_name
-                 )
-                 columns.append(column)
-
-             # Combine columns into dataframe
-             if not columns:
-                 raise SyntheticDataError("No columns to generate")
-
-             # Create dataframe from series
-             df_data = {col.name: col for col in columns}
-             return pl.DataFrame(df_data)
-
-         except Exception as e:
-             raise SyntheticDataError(f"Failed to generate dataframe: {e}")
-
-     def _generate_base_values(self, pattern: ResolvedPattern, rows: int) -> List[str]:
-         """Generate base values from a resolved pattern."""
-         # Only check pattern validity if validation is enabled
-         if self.config.validate_patterns and not pattern.is_valid:
-             raise ValidationError(f"Pattern '{pattern.name}' is not valid for generation")
-
-         # Calculate how many unique values we need
-         # Always generate at least 10 unique values to support all distribution strategies
-         # This ensures skewed distribution (which needs >=2) and others work properly
-         unique_count = max(10, min(rows, rows // 10 + 10))
-
-         return self.regex_generator.generate_values(pattern.regex, unique_count)
-
-     def validate_generation_context(self, context: GenerationContext) -> ValidationResult:
-         """Validate that the generation context is ready for data generation."""
-         # Use the comprehensive validation system for fail-fast validation
-         return self.validation_system.validate_complete_configuration(
-             context.schema,
-             context.resolved_patterns,
-             context.schema.source_file
-         )
-
-
- class OutputConverter:
-     """Handles format conversion and output optimization."""
-
-     def __init__(self):
-         """Initialize the output converter."""
-         pass
-
-     def to_pandas(self, df: pl.DataFrame) -> pd.DataFrame:
-         """Convert polars DataFrame to pandas with optimization."""
-         import time
-         start_time = time.time()
-
-         try:
-             # Use integration layer for optimized conversion
-             result = optimize_conversion(df, 'pandas')
-
-             # Track conversion time
-             conversion_time_ms = (time.time() - start_time) * 1000
-             performance_monitor.track_conversion_time(conversion_time_ms)
-
-             return result
-         except SyntheticDataError:
-             # Re-raise SyntheticDataError as-is
-             raise
-         except Exception as e:
-             # Wrap other exceptions
-             raise SyntheticDataError(f"Failed to convert to pandas: {e}")
-
-     def to_polars(self, df: pl.DataFrame) -> pl.DataFrame:
-         """Return polars DataFrame as-is (zero-copy operation)."""
-         # Zero-copy operation - return the same object
-         performance_monitor.track_conversion_time(0.0)  # Zero-copy operation
-         return df
-
-     def optimize_memory(self, df: pl.DataFrame) -> pl.DataFrame:
-         """Optimize memory usage of the dataframe."""
-         try:
-             # Use polars lazy evaluation for memory optimization
-             performance_monitor.track_polars_operation()
-             return df.lazy().collect()
-         except Exception as e:
-             raise SyntheticDataError(f"Failed to optimize memory: {e}")
-
-
- class SyntheticDataGenerator:
-     """High-level synthetic data generator combining all components."""
-
-     def __init__(self, config: Optional[GenerationConfig] = None):
-         """Initialize the synthetic data generator."""
-         self.config = config or GenerationConfig()
-         self.generator_core = PolarsGeneratorCore(self.config)
-         self.output_converter = OutputConverter()
-
-     def generate(self, context: GenerationContext) -> GenerationResult:
-         """Generate synthetic data from a generation context."""
-         import time
-         import gc
-
-         # Optimize context using integration layer
-         optimized_context = optimize_context(context)
-
-         # Auto-optimize batch size if enabled
-         if self.config.auto_optimize_batch_size:
-             optimal_batch_size = performance_optimizer.optimize_batch_size(
-                 optimized_context.target_rows,
-                 self.config.memory_limit_mb
-             )
-             optimized_context.batch_size = optimal_batch_size
-
-         # Start performance monitoring
-         monitor_context = None
-         if self.config.enable_performance_monitoring:
-             monitor_context = performance_monitor.monitor_operation(
-                 operation_name="synthetic_data_generation",
-                 rows=optimized_context.target_rows,
-                 columns=len(optimized_context.schema.field_definitions),
-                 batch_size=optimized_context.batch_size,
-                 output_engine=optimized_context.output_engine,
-                 patterns_count=len(optimized_context.resolved_patterns)
-             )
-             monitor_context.__enter__()
-
-         start_time = time.time()
-         conversion_start_time = None
-
-         try:
-             # Fail-fast validation - stop immediately on any validation error
-             if self.config.validate_patterns:
-                 self.generator_core.validation_system.validate_fail_fast(
-                     optimized_context.schema,
-                     optimized_context.resolved_patterns,
-                     optimized_context.schema.source_file
-                 )
-
-             # Update peak memory during generation
-             if monitor_context:
-                 performance_monitor.update_peak_memory()
-                 performance_monitor.track_polars_operation()
-
-             # Generate polars dataframe
-             df = self.generator_core.generate_dataframe(optimized_context)
-
-             # Track conversion time
-             conversion_start_time = time.time()
-
-             # Convert to requested output format
-             if optimized_context.output_engine.lower() == "pandas":
-                 output_df = self.output_converter.to_pandas(df)
-             else:
-                 output_df = self.output_converter.to_polars(df)
-
-             # Track conversion time
-             if monitor_context and conversion_start_time:
-                 conversion_time_ms = (time.time() - conversion_start_time) * 1000
-                 performance_monitor.track_conversion_time(conversion_time_ms)
-
-             # Calculate generation time
-             generation_time = time.time() - start_time
-
-             # End performance monitoring to get final metrics
-             if monitor_context:
-                 monitor_context.__exit__(None, None, None)
-                 monitor_context = None  # Mark as completed
-
-             # Create result with performance metrics
-             metadata = {
-                 'rows_generated': optimized_context.target_rows,
-                 'columns_generated': len(optimized_context.schema.field_definitions),
-                 'output_engine': optimized_context.output_engine,
-                 'batch_size': optimized_context.batch_size,
-                 'patterns_used': list(optimized_context.resolved_patterns.keys()),
-                 'generation_time_ms': generation_time * 1000
-             }
-
-             # Add performance metrics if monitoring is enabled
-             if self.config.enable_performance_monitoring:
-                 latest_metrics = performance_monitor.get_latest_metrics()
-                 if latest_metrics:
-                     metadata.update({
-                         'rows_per_second': latest_metrics.rows_per_second,
-                         'memory_delta_mb': latest_metrics.memory_delta_mb,
-                         'memory_per_row_kb': latest_metrics.memory_per_row_kb,
-                         'polars_operations': latest_metrics.polars_operations,
-                         'conversion_time_ms': latest_metrics.conversion_time_ms
-                     })
-
-             result = GenerationResult(
-                 dataframe=output_df,
-                 generation_time=generation_time,
-                 metadata=metadata
-             )
-
-             # Enhance result using integration layer
-             enhanced_result = enhance_result(result, use_memory_optimization=self.config.lazy_evaluation)
-
-             return enhanced_result
-
-         except Exception as e:
-             raise SyntheticDataError(f"Data generation failed: {e}")
-         finally:
-             # End performance monitoring if not already done
-             if monitor_context:
-                 monitor_context.__exit__(None, None, None)
-
-     def generate_batch(self, context: GenerationContext, batch_size: Optional[int] = None) -> GenerationResult:
-         """Generate data in batches for memory efficiency."""
-         import gc
-
-         # Auto-optimize batch size if enabled
-         if self.config.auto_optimize_batch_size and batch_size is None:
-             batch_size = performance_optimizer.optimize_batch_size(
-                 context.target_rows,
-                 self.config.memory_limit_mb
-             )
-         else:
-             batch_size = batch_size or self.config.batch_size
-
-         # Start performance monitoring for batch operation
-         monitor_context = None
-         if self.config.enable_performance_monitoring:
-             monitor_context = performance_monitor.monitor_operation(
-                 operation_name="batch_synthetic_data_generation",
-                 rows=context.target_rows,
-                 columns=len(context.schema.field_definitions),
-                 batch_size=batch_size,
-                 output_engine=context.output_engine,
-                 patterns_count=len(context.resolved_patterns)
-             )
-             monitor_context.__enter__()
-
-         try:
-             if context.target_rows <= batch_size:
-                 # Single batch - use regular generate method
-                 result = self.generate(context)
-                 if monitor_context:
-                     # Update batch count in metadata
-                     result.metadata['batch_count'] = 1
-                 return result
-
-             # Multi-batch generation
-             batches = []
-             remaining_rows = context.target_rows
-             batch_count = 0
-
-             while remaining_rows > 0:
-                 current_batch_size = min(batch_size, remaining_rows)
-                 batch_count += 1
-
-                 # Create batch context
-                 batch_context = GenerationContext(
-                     schema=context.schema,
-                     resolved_patterns=context.resolved_patterns,
-                     target_rows=current_batch_size,
-                     output_engine=context.output_engine,
-                     batch_size=current_batch_size,
-                     seed=context.seed
-                 )
-
-                 # Generate batch with individual monitoring disabled to avoid nested monitoring
-                 batch_config = GenerationConfig(
-                     batch_size=current_batch_size,
-                     seed=self.config.seed,
-                     validate_patterns=self.config.validate_patterns,
-                     memory_limit_mb=self.config.memory_limit_mb,
-                     enable_performance_monitoring=False,  # Disable for individual batches
-                     auto_optimize_batch_size=False,
-                     lazy_evaluation=self.config.lazy_evaluation,
-                     garbage_collection_frequency=self.config.garbage_collection_frequency
-                 )
-
-                 batch_generator = SyntheticDataGenerator(batch_config)
-                 batch_result = batch_generator.generate(batch_context)
-                 batches.append(batch_result.dataframe)
-
-                 remaining_rows -= current_batch_size
-
-                 # Update peak memory and perform garbage collection if needed
-                 if monitor_context:
-                     performance_monitor.update_peak_memory()
-                     performance_monitor.track_polars_operation()
-
-                 if batch_count % self.config.garbage_collection_frequency == 0:
-                     gc.collect()
-
-             # Combine batches
-             if context.output_engine.lower() == "pandas":
-                 combined_df = pd.concat(batches, ignore_index=True)
-             else:
-                 combined_df = pl.concat(batches)
-
-             # Create result with batch metadata
-             metadata = {
-                 'rows_generated': context.target_rows,
-                 'columns_generated': len(context.schema.field_definitions),
-                 'output_engine': context.output_engine,
-                 'batch_count': batch_count,
-                 'batch_size': batch_size,
-                 'total_batches': len(batches)
-             }
-
-             # Add performance metrics if monitoring is enabled
-             if self.config.enable_performance_monitoring:
-                 latest_metrics = performance_monitor.get_latest_metrics()
-                 if latest_metrics:
-                     metadata.update({
-                         'rows_per_second': latest_metrics.rows_per_second,
-                         'memory_delta_mb': latest_metrics.memory_delta_mb,
-                         'memory_per_row_kb': latest_metrics.memory_per_row_kb,
-                         'polars_operations': latest_metrics.polars_operations,
-                         'peak_memory_mb': latest_metrics.memory_peak_mb
-                     })
-
-             return GenerationResult(
-                 dataframe=combined_df,
-                 metadata=metadata
-             )
-
-         except Exception as e:
-             raise SyntheticDataError(f"Batch data generation failed: {e}")
-         finally:
-             # End performance monitoring
-             if monitor_context:
-                 monitor_context.__exit__(None, None, None)
-
-     def get_performance_metrics(self) -> Optional[PerformanceMetrics]:
-         """Get the latest performance metrics."""
-         return performance_monitor.get_latest_metrics()
-
-     def get_performance_summary(self) -> Dict[str, Any]:
-         """Get a summary of all performance metrics."""
-         return performance_monitor.get_metrics_summary()
-
-     def get_optimization_recommendations(self) -> List[str]:
-         """Get performance optimization recommendations."""
-         return performance_monitor.get_optimization_recommendations()
-
-     def compare_engine_performance(self, context: GenerationContext) -> Dict[str, Any]:
-         """
-         Compare performance between polars and pandas engines.
-
-         Args:
-             context: Generation context for comparison
-
-         Returns:
-             Dictionary with performance comparison results
-         """
-         if not self.config.enable_performance_monitoring:
-             return {"error": "Performance monitoring is disabled"}
-
-         # Generate with polars engine
-         polars_context = GenerationContext(
-             schema=context.schema,
-             resolved_patterns=context.resolved_patterns,
-             target_rows=context.target_rows,
-             output_engine="polars",
-             batch_size=context.batch_size,
-             seed=context.seed
-         )
-
-         polars_result = self.generate(polars_context)
-         polars_metrics = performance_monitor.get_latest_metrics()
-
-         # Generate with pandas engine
-         pandas_context = GenerationContext(
-             schema=context.schema,
-             resolved_patterns=context.resolved_patterns,
-             target_rows=context.target_rows,
-             output_engine="pandas",
-             batch_size=context.batch_size,
-             seed=context.seed
-         )
-
-         pandas_result = self.generate(pandas_context)
-         pandas_metrics = performance_monitor.get_latest_metrics()
-
-         # Create comparison
-         if polars_metrics and pandas_metrics:
-             comparison = performance_monitor.compare_engines(polars_metrics, pandas_metrics)
-             return {
-                 "polars_performance": {
-                     "duration_ms": polars_metrics.duration_ms,
-                     "memory_delta_mb": polars_metrics.memory_delta_mb,
-                     "rows_per_second": polars_metrics.rows_per_second,
-                     "conversion_time_ms": polars_metrics.conversion_time_ms
-                 },
-                 "pandas_performance": {
-                     "duration_ms": pandas_metrics.duration_ms,
-                     "memory_delta_mb": pandas_metrics.memory_delta_mb,
-                     "rows_per_second": pandas_metrics.rows_per_second,
-                     "conversion_time_ms": pandas_metrics.conversion_time_ms
-                 },
-                 "polars_advantage_speed": comparison.polars_advantage_speed,
-                 "polars_advantage_memory": comparison.polars_advantage_memory,
-                 "recommendation": comparison.recommendation
-             }
-         else:
-             return {"error": "Failed to collect performance metrics for comparison"}
-
-     def optimize_configuration(self, target_rows: int, columns: int) -> GenerationConfig:
-         """
-         Get optimized configuration for the given generation parameters.
-
-         Args:
-             target_rows: Number of rows to generate
-             columns: Number of columns to generate
-
-         Returns:
-             Optimized GenerationConfig
-         """
-         # Get memory optimization config
-         memory_config = performance_optimizer.get_memory_optimization_config()
-
-         # Determine if streaming should be used
-         use_streaming = performance_optimizer.should_use_streaming(target_rows, columns)
-
-         # Create optimized config
-         optimized_config = GenerationConfig(
-             batch_size=memory_config["batch_size"],
-             seed=self.config.seed,
-             validate_patterns=self.config.validate_patterns,
-             memory_limit_mb=memory_config["memory_limit_mb"],
-             enable_performance_monitoring=True,
-             auto_optimize_batch_size=True,
-             lazy_evaluation=memory_config["lazy_evaluation"],
-             garbage_collection_frequency=memory_config["garbage_collection_frequency"]
-         )
-
-         return optimized_config
-
-     def clear_performance_history(self):
-         """Clear performance monitoring history."""
-         performance_monitor.clear_history()
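
For context on what the deleted generator.py did, here is a minimal, self-contained sketch of its RegexGenerator heuristic: classify a regex by indicator substrings, then emit plausible values rather than truly sampling the pattern. This is illustrative only; it is not part of the 0.1.0a3 API, and the function name, seed parameter, and example domain below are assumptions made for the sketch.

    # Illustrative sketch (not package code): the email / \d{n} / fallback
    # branches of the removed RegexGenerator, compressed into one function.
    import random
    import re
    import string

    def generate_values(pattern, count, seed=None):
        rng = random.Random(seed)          # seeded for reproducibility
        clean = pattern.strip('^$')        # drop anchors, as the deleted code did
        if '@' in clean:
            # Email-like pattern: build plausible addresses instead of
            # sampling the regex. example.net is a placeholder domain.
            return [f"user{rng.randint(1, 9999)}@example.net" for _ in range(count)]
        match = re.search(r'\\d\{(\d+)\}', clean)
        if match:
            # \d{n}: emit n-digit numbers in the matching range
            n = int(match.group(1))
            low = 10 ** (n - 1) if n > 1 else 0
            return [str(rng.randint(low, 10 ** n - 1)) for _ in range(count)]
        # Fallback: alphanumeric strings of a guessed length
        k = rng.randint(3, 8)
        return [''.join(rng.choices(string.ascii_letters + string.digits, k=k))
                for _ in range(count)]

    print(generate_values(r'^\d{4}$', 3, seed=42))  # three 4-digit strings, deterministic per seed

The deleted module dispatched the same way across more pattern classes (phone, UUID, name) and then fed the generated pool through a distribution strategy; this refactor's replacement lives in additory/synthetic/synthesizer.py.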