additory-0.1.0a2-py3-none-any.whl → additory-0.1.0a3-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the versions exactly as they appear in that registry.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/polars_expression_engine.py +66 -16
- additory/dynamic_api.py +42 -46
- additory/expressions/proxy.py +4 -1
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +11 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/METADATA +10 -17
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
additory/synthetic/generator.py
DELETED
@@ -1,702 +0,0 @@
-"""
-Polars-native generation engine for synthetic data.
-
-Implements regex-based value generation using polars operations with
-distribution strategy integration and memory-efficient batch processing.
-"""
-
-import re
-import random
-import string
-from typing import Dict, List, Optional, Union, Any
-from dataclasses import dataclass
-import polars as pl
-import pandas as pd
-
-from .models import (
-    GenerationContext,
-    GenerationResult,
-    ResolvedPattern,
-    DistributionStrategy,
-    ValidationResult,
-    DistributionType
-)
-from .engines import DistributionManager
-from .exceptions import SyntheticDataError, ValidationError
-from .validator import ValidationSystem
-from .performance import performance_monitor, performance_optimizer, PerformanceMetrics
-from .polars_integration import optimize_conversion, enhance_result, optimize_context, optimize_memory
-
-
-@dataclass
-class GenerationConfig:
-    """Configuration for data generation operations."""
-    batch_size: int = 10000
-    seed: Optional[int] = None
-    validate_patterns: bool = True
-    memory_limit_mb: Optional[int] = None
-    enable_performance_monitoring: bool = True
-    auto_optimize_batch_size: bool = True
-    lazy_evaluation: bool = True
-    garbage_collection_frequency: int = 5
-
-
-class RegexGenerator:
-    """Generates realistic values from regex patterns."""
-
-    def __init__(self, seed: Optional[int] = None):
-        """Initialize the regex generator with optional seed."""
-        self.seed = seed
-        self.random = random.Random(seed)
-
-        # Regex compilation cache for performance
-        self._regex_cache: Dict[str, re.Pattern] = {}
-        self._pattern_cache: Dict[str, List[str]] = {}
-
-    def _get_compiled_regex(self, pattern: str) -> re.Pattern:
-        """Get compiled regex from cache or compile and cache it."""
-        if pattern not in self._regex_cache:
-            try:
-                self._regex_cache[pattern] = re.compile(pattern)
-            except re.error as e:
-                raise SyntheticDataError(f"Invalid regex pattern '{pattern}': {e}")
-        return self._regex_cache[pattern]
-
-    def warm_cache(self, patterns: List[str]) -> None:
-        """Pre-compile and cache regex patterns for better performance."""
-        for pattern in patterns:
-            self._get_compiled_regex(pattern)
-
-    def clear_cache(self) -> None:
-        """Clear the regex compilation cache."""
-        self._regex_cache.clear()
-        self._pattern_cache.clear()
-
-    def get_cache_stats(self) -> Dict[str, int]:
-        """Get cache statistics."""
-        return {
-            'regex_cache_size': len(self._regex_cache),
-            'pattern_cache_size': len(self._pattern_cache)
-        }
-
-    def generate_values(self, pattern: str, count: int) -> List[str]:
-        """Generate realistic values from a regex pattern."""
-        try:
-            # For now, implement basic regex generation
-            # This is a simplified implementation - in production, you'd want
-            # a more sophisticated regex-to-value generator
-            return self._generate_from_pattern(pattern, count)
-        except Exception as e:
-            raise SyntheticDataError(f"Failed to generate values from pattern '{pattern}': {e}")
-
-    def _generate_from_pattern(self, pattern: str, count: int) -> List[str]:
-        """Generate values from regex pattern using pattern analysis."""
-        # Remove anchors if present
-        clean_pattern = pattern.strip('^$')
-
-        # Handle common patterns
-        if self._is_email_pattern(clean_pattern):
-            return self._generate_emails(count)
-        elif self._is_phone_pattern(clean_pattern):
-            return self._generate_phones(count)
-        elif self._is_uuid_pattern(clean_pattern):
-            return self._generate_uuids(count)
-        elif self._is_numeric_pattern(clean_pattern):
-            return self._generate_numbers(clean_pattern, count)
-        elif self._is_name_pattern(clean_pattern):
-            return self._generate_names(count)
-        else:
-            return self._generate_generic(clean_pattern, count)
-
-    def _is_email_pattern(self, pattern: str) -> bool:
-        """Check if pattern looks like an email regex."""
-        email_indicators = ['@', r'\@', r'[A-Za-z0-9._%+-]+', r'\.[A-Za-z]{2,}']
-        return any(indicator in pattern for indicator in email_indicators)
-
-    def _is_phone_pattern(self, pattern: str) -> bool:
-        """Check if pattern looks like a phone regex."""
-        phone_indicators = [r'\+?', r'[0-9\s\-\(\)]', r'\d{3}', r'\(\d{3}\)']
-        return any(indicator in pattern for indicator in phone_indicators)
-
-    def _is_uuid_pattern(self, pattern: str) -> bool:
-        """Check if pattern looks like a UUID regex."""
-        uuid_indicators = [r'[0-9a-f]{8}', r'[0-9A-F]{8}', r'[a-f0-9]{4}', r'\-[a-f0-9]{4}\-']
-        return any(indicator in pattern for indicator in uuid_indicators)
-
-    def _is_numeric_pattern(self, pattern: str) -> bool:
-        """Check if pattern is primarily numeric."""
-        numeric_indicators = [r'\d+', r'[0-9]+', r'\d{', r'[0-9]{']
-        return any(indicator in pattern for indicator in numeric_indicators)
-
-    def _is_name_pattern(self, pattern: str) -> bool:
-        """Check if pattern looks like a name regex."""
-        name_indicators = [r'[A-Z][a-z]+', r'[A-Za-z]+\s[A-Za-z]+']
-        return any(indicator in pattern for indicator in name_indicators)
-
-    def _generate_emails(self, count: int) -> List[str]:
-        """Generate realistic email addresses."""
-        domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'company.com', 'test.org', 'example.net']
-        prefixes = ['user', 'john', 'jane', 'test', 'admin', 'info', 'contact', 'support', 'sales', 'dev']
-
-        emails = []
-        for i in range(count):
-            prefix = self.random.choice(prefixes)
-            number = self.random.randint(1, 9999) if self.random.random() > 0.3 else ""
-            domain = self.random.choice(domains)
-            # Add some variation with dots and underscores
-            separator = self.random.choice(['', '.', '_']) if self.random.random() > 0.5 else ""
-            emails.append(f"{prefix}{separator}{number}@{domain}")
-
-        return emails
-
-    def _generate_phones(self, count: int) -> List[str]:
-        """Generate realistic phone numbers."""
-        formats = [
-            lambda: f"+1-{self.random.randint(200, 999)}-{self.random.randint(200, 999)}-{self.random.randint(1000, 9999)}",
-            lambda: f"({self.random.randint(200, 999)}) {self.random.randint(200, 999)}-{self.random.randint(1000, 9999)}",
-            lambda: f"{self.random.randint(200, 999)}-{self.random.randint(200, 999)}-{self.random.randint(1000, 9999)}",
-            lambda: f"{self.random.randint(200, 999)}.{self.random.randint(200, 999)}.{self.random.randint(1000, 9999)}"
-        ]
-
-        return [self.random.choice(formats)() for _ in range(count)]
-
-    def _generate_uuids(self, count: int) -> List[str]:
-        """Generate realistic UUID-like strings."""
-        import uuid
-        return [str(uuid.uuid4()) for _ in range(count)]
-
-    def _generate_numbers(self, pattern: str, count: int) -> List[str]:
-        """Generate numbers based on pattern analysis."""
-        # Extract number ranges from pattern
-        if r'\d{' in pattern:
-            # Extract digit count
-            match = re.search(r'\\d\{(\d+)\}', pattern)
-            if match:
-                digit_count = int(match.group(1))
-                min_val = 10**(digit_count-1) if digit_count > 1 else 0
-                max_val = 10**digit_count - 1
-                return [str(self.random.randint(min_val, max_val)) for _ in range(count)]
-
-        # Default numeric generation
-        return [str(self.random.randint(1, 99999)) for _ in range(count)]
-
-    def _generate_names(self, count: int) -> List[str]:
-        """Generate realistic names."""
-        first_names = ['John', 'Jane', 'Michael', 'Sarah', 'David', 'Lisa', 'Robert', 'Emily']
-        last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis']
-
-        return [f"{self.random.choice(first_names)} {self.random.choice(last_names)}" for _ in range(count)]
-
-    def _generate_generic(self, pattern: str, count: int) -> List[str]:
-        """Generate generic values for unrecognized patterns."""
-        # Simple fallback - generate alphanumeric strings
-        length = self._estimate_length_from_pattern(pattern)
-
-        values = []
-        for _ in range(count):
-            value = ''.join(self.random.choices(string.ascii_letters + string.digits, k=length))
-            values.append(value)
-
-        return values
-
-    def _estimate_length_from_pattern(self, pattern: str) -> int:
-        """Estimate appropriate length for generated values."""
-        # Look for explicit length specifications
-        length_match = re.search(r'\{(\d+)\}', pattern)
-        if length_match:
-            return int(length_match.group(1))
-
-        range_match = re.search(r'\{(\d+),(\d+)\}', pattern)
-        if range_match:
-            min_len, max_len = int(range_match.group(1)), int(range_match.group(2))
-            return self.random.randint(min_len, max_len)
-
-        # Default length based on pattern complexity
-        if len(pattern) > 50:
-            return self.random.randint(8, 15)
-        elif len(pattern) > 20:
-            return self.random.randint(5, 10)
-        else:
-            return self.random.randint(3, 8)
-
-
-class PolarsGeneratorCore:
-    """Core polars-based data generation engine."""
-
-    def __init__(self, config: Optional[GenerationConfig] = None):
-        """Initialize the polars generator core."""
-        self.config = config or GenerationConfig()
-        self.regex_generator = RegexGenerator(seed=self.config.seed)
-        self.distribution_manager = DistributionManager(seed=self.config.seed)
-        self.validation_system = ValidationSystem()
-
-    def generate_column(self,
-                        pattern: ResolvedPattern,
-                        distribution: DistributionStrategy,
-                        rows: int,
-                        column_name: str = "values") -> pl.Series:
-        """Generate a single column using polars operations."""
-        try:
-            # Step 1: Generate base values from regex pattern
-            base_values = self._generate_base_values(pattern, rows)
-
-            # Step 2: Apply distribution strategy
-            result_series = self.distribution_manager.apply_distribution(
-                distribution, base_values, rows
-            )
-
-            # Step 3: Rename series to column name
-            return result_series.alias(column_name)
-
-        except Exception as e:
-            raise SyntheticDataError(f"Failed to generate column '{column_name}': {e}")
-
-    def generate_dataframe(self, context: GenerationContext) -> pl.DataFrame:
-        """Generate complete dataframe from generation context."""
-        # Only validate context if validation is enabled
-        if self.config.validate_patterns and not context.validate_context():
-            raise ValidationError("Invalid generation context")
-
-        try:
-            columns = []
-
-            # Generate each column
-            for field_name, field_def in context.schema.field_definitions.items():
-                pattern = context.get_pattern(field_def.pattern_name)
-                if not pattern:
-                    raise SyntheticDataError(f"Pattern '{field_def.pattern_name}' not found")
-
-                column = self.generate_column(
-                    pattern=pattern,
-                    distribution=field_def.distribution,
-                    rows=context.target_rows,
-                    column_name=field_name
-                )
-                columns.append(column)
-
-            # Combine columns into dataframe
-            if not columns:
-                raise SyntheticDataError("No columns to generate")
-
-            # Create dataframe from series
-            df_data = {col.name: col for col in columns}
-            return pl.DataFrame(df_data)
-
-        except Exception as e:
-            raise SyntheticDataError(f"Failed to generate dataframe: {e}")
-
-    def _generate_base_values(self, pattern: ResolvedPattern, rows: int) -> List[str]:
-        """Generate base values from a resolved pattern."""
-        # Only check pattern validity if validation is enabled
-        if self.config.validate_patterns and not pattern.is_valid:
-            raise ValidationError(f"Pattern '{pattern.name}' is not valid for generation")
-
-        # Calculate how many unique values we need
-        # Always generate at least 10 unique values to support all distribution strategies
-        # This ensures skewed distribution (which needs >=2) and others work properly
-        unique_count = max(10, min(rows, rows // 10 + 10))
-
-        return self.regex_generator.generate_values(pattern.regex, unique_count)
-
-    def validate_generation_context(self, context: GenerationContext) -> ValidationResult:
-        """Validate that the generation context is ready for data generation."""
-        # Use the comprehensive validation system for fail-fast validation
-        return self.validation_system.validate_complete_configuration(
-            context.schema,
-            context.resolved_patterns,
-            context.schema.source_file
-        )
-
-
-class OutputConverter:
-    """Handles format conversion and output optimization."""
-
-    def __init__(self):
-        """Initialize the output converter."""
-        pass
-
-    def to_pandas(self, df: pl.DataFrame) -> pd.DataFrame:
-        """Convert polars DataFrame to pandas with optimization."""
-        import time
-        start_time = time.time()
-
-        try:
-            # Use integration layer for optimized conversion
-            result = optimize_conversion(df, 'pandas')
-
-            # Track conversion time
-            conversion_time_ms = (time.time() - start_time) * 1000
-            performance_monitor.track_conversion_time(conversion_time_ms)
-
-            return result
-        except SyntheticDataError:
-            # Re-raise SyntheticDataError as-is
-            raise
-        except Exception as e:
-            # Wrap other exceptions
-            raise SyntheticDataError(f"Failed to convert to pandas: {e}")
-
-    def to_polars(self, df: pl.DataFrame) -> pl.DataFrame:
-        """Return polars DataFrame as-is (zero-copy operation)."""
-        # Zero-copy operation - return the same object
-        performance_monitor.track_conversion_time(0.0)  # Zero-copy operation
-        return df
-
-    def optimize_memory(self, df: pl.DataFrame) -> pl.DataFrame:
-        """Optimize memory usage of the dataframe."""
-        try:
-            # Use polars lazy evaluation for memory optimization
-            performance_monitor.track_polars_operation()
-            return df.lazy().collect()
-        except Exception as e:
-            raise SyntheticDataError(f"Failed to optimize memory: {e}")
-
-
-class SyntheticDataGenerator:
-    """High-level synthetic data generator combining all components."""
-
-    def __init__(self, config: Optional[GenerationConfig] = None):
-        """Initialize the synthetic data generator."""
-        self.config = config or GenerationConfig()
-        self.generator_core = PolarsGeneratorCore(self.config)
-        self.output_converter = OutputConverter()

-    def generate(self, context: GenerationContext) -> GenerationResult:
-        """Generate synthetic data from a generation context."""
-        import time
-        import gc
-
-        # Optimize context using integration layer
-        optimized_context = optimize_context(context)
-
-        # Auto-optimize batch size if enabled
-        if self.config.auto_optimize_batch_size:
-            optimal_batch_size = performance_optimizer.optimize_batch_size(
-                optimized_context.target_rows,
-                self.config.memory_limit_mb
-            )
-            optimized_context.batch_size = optimal_batch_size
-
-        # Start performance monitoring
-        monitor_context = None
-        if self.config.enable_performance_monitoring:
-            monitor_context = performance_monitor.monitor_operation(
-                operation_name="synthetic_data_generation",
-                rows=optimized_context.target_rows,
-                columns=len(optimized_context.schema.field_definitions),
-                batch_size=optimized_context.batch_size,
-                output_engine=optimized_context.output_engine,
-                patterns_count=len(optimized_context.resolved_patterns)
-            )
-            monitor_context.__enter__()
-
-        start_time = time.time()
-        conversion_start_time = None
-
-        try:
-            # Fail-fast validation - stop immediately on any validation error
-            if self.config.validate_patterns:
-                self.generator_core.validation_system.validate_fail_fast(
-                    optimized_context.schema,
-                    optimized_context.resolved_patterns,
-                    optimized_context.schema.source_file
-                )
-
-            # Update peak memory during generation
-            if monitor_context:
-                performance_monitor.update_peak_memory()
-                performance_monitor.track_polars_operation()
-
-            # Generate polars dataframe
-            df = self.generator_core.generate_dataframe(optimized_context)
-
-            # Track conversion time
-            conversion_start_time = time.time()
-
-            # Convert to requested output format
-            if optimized_context.output_engine.lower() == "pandas":
-                output_df = self.output_converter.to_pandas(df)
-            else:
-                output_df = self.output_converter.to_polars(df)
-
-            # Track conversion time
-            if monitor_context and conversion_start_time:
-                conversion_time_ms = (time.time() - conversion_start_time) * 1000
-                performance_monitor.track_conversion_time(conversion_time_ms)
-
-            # Calculate generation time
-            generation_time = time.time() - start_time
-
-            # End performance monitoring to get final metrics
-            if monitor_context:
-                monitor_context.__exit__(None, None, None)
-                monitor_context = None  # Mark as completed
-
-            # Create result with performance metrics
-            metadata = {
-                'rows_generated': optimized_context.target_rows,
-                'columns_generated': len(optimized_context.schema.field_definitions),
-                'output_engine': optimized_context.output_engine,
-                'batch_size': optimized_context.batch_size,
-                'patterns_used': list(optimized_context.resolved_patterns.keys()),
-                'generation_time_ms': generation_time * 1000
-            }
-
-            # Add performance metrics if monitoring is enabled
-            if self.config.enable_performance_monitoring:
-                latest_metrics = performance_monitor.get_latest_metrics()
-                if latest_metrics:
-                    metadata.update({
-                        'rows_per_second': latest_metrics.rows_per_second,
-                        'memory_delta_mb': latest_metrics.memory_delta_mb,
-                        'memory_per_row_kb': latest_metrics.memory_per_row_kb,
-                        'polars_operations': latest_metrics.polars_operations,
-                        'conversion_time_ms': latest_metrics.conversion_time_ms
-                    })
-
-            result = GenerationResult(
-                dataframe=output_df,
-                generation_time=generation_time,
-                metadata=metadata
-            )
-
-            # Enhance result using integration layer
-            enhanced_result = enhance_result(result, use_memory_optimization=self.config.lazy_evaluation)
-
-            return enhanced_result
-
-        except Exception as e:
-            raise SyntheticDataError(f"Data generation failed: {e}")
-        finally:
-            # End performance monitoring if not already done
-            if monitor_context:
-                monitor_context.__exit__(None, None, None)
-
-    def generate_batch(self, context: GenerationContext, batch_size: Optional[int] = None) -> GenerationResult:
-        """Generate data in batches for memory efficiency."""
-        import gc
-
-        # Auto-optimize batch size if enabled
-        if self.config.auto_optimize_batch_size and batch_size is None:
-            batch_size = performance_optimizer.optimize_batch_size(
-                context.target_rows,
-                self.config.memory_limit_mb
-            )
-        else:
-            batch_size = batch_size or self.config.batch_size
-
-        # Start performance monitoring for batch operation
-        monitor_context = None
-        if self.config.enable_performance_monitoring:
-            monitor_context = performance_monitor.monitor_operation(
-                operation_name="batch_synthetic_data_generation",
-                rows=context.target_rows,
-                columns=len(context.schema.field_definitions),
-                batch_size=batch_size,
-                output_engine=context.output_engine,
-                patterns_count=len(context.resolved_patterns)
-            )
-            monitor_context.__enter__()
-
-        try:
-            if context.target_rows <= batch_size:
-                # Single batch - use regular generate method
-                result = self.generate(context)
-                if monitor_context:
-                    # Update batch count in metadata
-                    result.metadata['batch_count'] = 1
-                return result
-
-            # Multi-batch generation
-            batches = []
-            remaining_rows = context.target_rows
-            batch_count = 0
-
-            while remaining_rows > 0:
-                current_batch_size = min(batch_size, remaining_rows)
-                batch_count += 1
-
-                # Create batch context
-                batch_context = GenerationContext(
-                    schema=context.schema,
-                    resolved_patterns=context.resolved_patterns,
-                    target_rows=current_batch_size,
-                    output_engine=context.output_engine,
-                    batch_size=current_batch_size,
-                    seed=context.seed
-                )
-
-                # Generate batch with individual monitoring disabled to avoid nested monitoring
-                batch_config = GenerationConfig(
-                    batch_size=current_batch_size,
-                    seed=self.config.seed,
-                    validate_patterns=self.config.validate_patterns,
-                    memory_limit_mb=self.config.memory_limit_mb,
-                    enable_performance_monitoring=False,  # Disable for individual batches
-                    auto_optimize_batch_size=False,
-                    lazy_evaluation=self.config.lazy_evaluation,
-                    garbage_collection_frequency=self.config.garbage_collection_frequency
-                )
-
-                batch_generator = SyntheticDataGenerator(batch_config)
-                batch_result = batch_generator.generate(batch_context)
-                batches.append(batch_result.dataframe)
-
-                remaining_rows -= current_batch_size
-
-                # Update peak memory and perform garbage collection if needed
-                if monitor_context:
-                    performance_monitor.update_peak_memory()
-                    performance_monitor.track_polars_operation()
-
-                if batch_count % self.config.garbage_collection_frequency == 0:
-                    gc.collect()
-
-            # Combine batches
-            if context.output_engine.lower() == "pandas":
-                combined_df = pd.concat(batches, ignore_index=True)
-            else:
-                combined_df = pl.concat(batches)
-
-            # Create result with batch metadata
-            metadata = {
-                'rows_generated': context.target_rows,
-                'columns_generated': len(context.schema.field_definitions),
-                'output_engine': context.output_engine,
-                'batch_count': batch_count,
-                'batch_size': batch_size,
-                'total_batches': len(batches)
-            }
-
-            # Add performance metrics if monitoring is enabled
-            if self.config.enable_performance_monitoring:
-                latest_metrics = performance_monitor.get_latest_metrics()
-                if latest_metrics:
-                    metadata.update({
-                        'rows_per_second': latest_metrics.rows_per_second,
-                        'memory_delta_mb': latest_metrics.memory_delta_mb,
-                        'memory_per_row_kb': latest_metrics.memory_per_row_kb,
-                        'polars_operations': latest_metrics.polars_operations,
-                        'peak_memory_mb': latest_metrics.memory_peak_mb
-                    })
-
-            return GenerationResult(
-                dataframe=combined_df,
-                metadata=metadata
-            )
-
-        except Exception as e:
-            raise SyntheticDataError(f"Batch data generation failed: {e}")
-        finally:
-            # End performance monitoring
-            if monitor_context:
-                monitor_context.__exit__(None, None, None)
-
-    def get_performance_metrics(self) -> Optional[PerformanceMetrics]:
-        """Get the latest performance metrics."""
-        return performance_monitor.get_latest_metrics()
-
-    def get_performance_summary(self) -> Dict[str, Any]:
-        """Get a summary of all performance metrics."""
-        return performance_monitor.get_metrics_summary()
-
-    def get_optimization_recommendations(self) -> List[str]:
-        """Get performance optimization recommendations."""
-        return performance_monitor.get_optimization_recommendations()
-
-    def compare_engine_performance(self, context: GenerationContext) -> Dict[str, Any]:
-        """
-        Compare performance between polars and pandas engines.
-
-        Args:
-            context: Generation context for comparison
-
-        Returns:
-            Dictionary with performance comparison results
-        """
-        if not self.config.enable_performance_monitoring:
-            return {"error": "Performance monitoring is disabled"}
-
-        # Generate with polars engine
-        polars_context = GenerationContext(
-            schema=context.schema,
-            resolved_patterns=context.resolved_patterns,
-            target_rows=context.target_rows,
-            output_engine="polars",
-            batch_size=context.batch_size,
-            seed=context.seed
-        )
-
-        polars_result = self.generate(polars_context)
-        polars_metrics = performance_monitor.get_latest_metrics()
-
-        # Generate with pandas engine
-        pandas_context = GenerationContext(
-            schema=context.schema,
-            resolved_patterns=context.resolved_patterns,
-            target_rows=context.target_rows,
-            output_engine="pandas",
-            batch_size=context.batch_size,
-            seed=context.seed
-        )
-
-        pandas_result = self.generate(pandas_context)
-        pandas_metrics = performance_monitor.get_latest_metrics()
-
-        # Create comparison
-        if polars_metrics and pandas_metrics:
-            comparison = performance_monitor.compare_engines(polars_metrics, pandas_metrics)
-            return {
-                "polars_performance": {
-                    "duration_ms": polars_metrics.duration_ms,
-                    "memory_delta_mb": polars_metrics.memory_delta_mb,
-                    "rows_per_second": polars_metrics.rows_per_second,
-                    "conversion_time_ms": polars_metrics.conversion_time_ms
-                },
-                "pandas_performance": {
-                    "duration_ms": pandas_metrics.duration_ms,
-                    "memory_delta_mb": pandas_metrics.memory_delta_mb,
-                    "rows_per_second": pandas_metrics.rows_per_second,
-                    "conversion_time_ms": pandas_metrics.conversion_time_ms
-                },
-                "polars_advantage_speed": comparison.polars_advantage_speed,
-                "polars_advantage_memory": comparison.polars_advantage_memory,
-                "recommendation": comparison.recommendation
-            }
-        else:
-            return {"error": "Failed to collect performance metrics for comparison"}
-
-    def optimize_configuration(self, target_rows: int, columns: int) -> GenerationConfig:
-        """
-        Get optimized configuration for the given generation parameters.
-
-        Args:
-            target_rows: Number of rows to generate
-            columns: Number of columns to generate
-
-        Returns:
-            Optimized GenerationConfig
-        """
-        # Get memory optimization config
-        memory_config = performance_optimizer.get_memory_optimization_config()
-
-        # Determine if streaming should be used
-        use_streaming = performance_optimizer.should_use_streaming(target_rows, columns)
-
-        # Create optimized config
-        optimized_config = GenerationConfig(
-            batch_size=memory_config["batch_size"],
-            seed=self.config.seed,
-            validate_patterns=self.config.validate_patterns,
-            memory_limit_mb=memory_config["memory_limit_mb"],
-            enable_performance_monitoring=True,
-            auto_optimize_batch_size=True,
-            lazy_evaluation=memory_config["lazy_evaluation"],
-            garbage_collection_frequency=memory_config["garbage_collection_frequency"]
-        )
-
-        return optimized_config
-
-    def clear_performance_history(self):
-        """Clear performance monitoring history."""
-        performance_monitor.clear_history()