additory-0.1.0a1-py3-none-any.whl → additory-0.1.0a3-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/polars_expression_engine.py +66 -16
- additory/dynamic_api.py +42 -46
- additory/expressions/proxy.py +4 -1
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +11 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/METADATA +12 -17
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
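The summary above shows the former `additory.augment` package being folded into `additory.synthetic` (with `augmentor.py` renamed to `synthesizer.py`). For callers that imported these modules by path, the moves imply import updates along these lines; the module paths come from the file list, but the diff does not indicate whether direct imports of these submodules are part of the supported API:

```python
# Before (0.1.0a1) — module paths taken from the file list above
import additory.augment.smote
import additory.augment.forecast

# After (0.1.0a3)
import additory.synthetic.smote
import additory.synthetic.forecast
```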
additory/synthetic/generator_parser.py
DELETED
@@ -1,68 +0,0 @@
-#!/usr/bin/env python3
-"""
-Generator parser for synthetic data generation
-Handles .gen file parsing and generator resolution
-"""
-
-import os
-from typing import Dict, List
-
-BASE = os.path.join(os.path.dirname(__file__), "..", "..", "reference", "schema_definitions")
-
-def load_gen_file(filename: str) -> Dict[str, str]:
-    """
-    Load generator definitions from a .gen file
-
-    Args:
-        filename: Name of the .gen file
-
-    Returns:
-        Dictionary mapping generator names to expressions
-    """
-    full_path = os.path.join(BASE, filename)
-    generators = {}
-
-    if not os.path.exists(full_path):
-        return generators
-
-    try:
-        with open(full_path, "r") as f:
-            for line in f:
-                line = line.strip()
-                if line and ":" in line and not line.startswith("#"):
-                    key, expr = line.split(":", 1)
-                    generators[key.strip()] = expr.strip()
-    except Exception:
-        pass
-
-    return generators
-
-
-def resolve_generators(imports: List[str], inline_gens: Dict[str, str]) -> Dict[str, str]:
-    """
-    Resolve generators from imports and inline definitions
-
-    Args:
-        imports: List of .gen files to import
-        inline_gens: Inline generator definitions
-
-    Returns:
-        Combined dictionary of all generators
-    """
-    generators = {}
-
-    # Load global.gen first (lowest priority)
-    global_gens = load_gen_file("global.gen")
-    generators.update(global_gens)
-
-    # Load imported .gen files (medium priority)
-    for import_file in imports:
-        if not import_file.endswith(".gen"):
-            import_file += ".gen"
-        imported_gens = load_gen_file(import_file)
-        generators.update(imported_gens)
-
-    # Apply inline generators (highest priority)
-    generators.update(inline_gens)
-
-    return generators
additory/synthetic/integration.py
DELETED
@@ -1,319 +0,0 @@
-"""
-Integration layer for synthetic data generation with fail-fast validation.
-
-Provides high-level interfaces that integrate all components with comprehensive
-validation and clear error reporting.
-"""
-
-from typing import Dict, List, Optional, Union
-from pathlib import Path
-import os
-
-from .models import (
-    SchemaDefinition,
-    GenerationContext,
-    GenerationResult,
-    ResolvedPattern,
-    ValidationResult
-)
-from .file_managers import SchemaFileManager, PatternFileManager, ParsedSchemaFile
-from .pattern_resolver import PatternHierarchyResolver
-from .generator import SyntheticDataGenerator, GenerationConfig
-from .validator import ValidationSystem
-from .exceptions import ValidationError, SyntheticDataError
-
-
-class SyntheticDataIntegrator:
-    """High-level integrator for synthetic data generation with validation."""
-
-    def __init__(self, config: Optional[GenerationConfig] = None):
-        """Initialize the integrator with all components."""
-        self.config = config or GenerationConfig()
-        self.validation_system = ValidationSystem()
-        self.schema_manager = SchemaFileManager()
-        self.pattern_manager = PatternFileManager()
-        self.pattern_resolver = PatternHierarchyResolver()
-        self.generator = SyntheticDataGenerator(self.config)
-
-    def generate_from_schema_file(self,
-                                  schema_path: str,
-                                  target_rows: int,
-                                  output_engine: str = "pandas") -> GenerationResult:
-        """
-        Generate synthetic data from a schema file with comprehensive validation.
-
-        This is the main entry point that performs:
-        1. File validation
-        2. Schema parsing and validation
-        3. Pattern resolution and validation
-        4. Fail-fast validation before generation
-        5. Data generation
-
-        Args:
-            schema_path: Path to the .toml schema file
-            target_rows: Number of rows to generate
-            output_engine: Output format ("pandas" or "polars")
-
-        Returns:
-            GenerationResult with generated data and metadata
-
-        Raises:
-            ValidationError: If any validation fails (fail-fast)
-            SyntheticDataError: If generation fails
-        """
-        # Step 1: Validate file exists and has correct extension
-        self._validate_schema_file_path(schema_path)
-
-        # Step 2: Validate file syntax
-        file_validation = self.validation_system.validate_file_syntax(schema_path)
-        if not file_validation.is_valid:
-            self._raise_validation_error("Schema file validation failed", file_validation)
-
-        # Step 3: Load and parse schema
-        try:
-            parsed_schema = self.schema_manager.load_toml_schema(schema_path)
-            schema = self._convert_parsed_schema_to_definition(parsed_schema)
-            schema.source_file = schema_path
-        except Exception as e:
-            raise SyntheticDataError(f"Failed to load schema file '{schema_path}': {e}")
-
-        # Step 4: Resolve all patterns
-        try:
-            resolved_patterns = self._resolve_schema_patterns(schema)
-        except Exception as e:
-            raise SyntheticDataError(f"Failed to resolve patterns: {e}")
-
-        # Step 5: Create generation context
-        context = GenerationContext(
-            schema=schema,
-            resolved_patterns=resolved_patterns,
-            target_rows=target_rows,
-            output_engine=output_engine,
-            seed=self.config.seed
-        )
-
-        # Step 6: Comprehensive fail-fast validation (if enabled)
-        if self.config.validate_patterns:
-            self.validation_system.validate_fail_fast(
-                schema, resolved_patterns, schema_path
-            )
-
-        # Step 7: Generate data (validation already passed)
-        return self.generator.generate(context)
-
-    def validate_schema_file(self, schema_path: str) -> ValidationResult:
-        """
-        Validate a schema file without generating data.
-
-        Performs comprehensive validation including:
-        - File syntax validation
-        - Schema structure validation
-        - Pattern resolution validation
-        - Distribution strategy validation
-
-        Args:
-            schema_path: Path to the .toml schema file
-
-        Returns:
-            ValidationResult with all validation issues
-        """
-        result = ValidationResult(is_valid=True)
-
-        try:
-            # Validate file path
-            self._validate_schema_file_path(schema_path)
-
-            # Validate file syntax
-            file_validation = self.validation_system.validate_file_syntax(schema_path)
-            result.merge(file_validation)
-
-            if not file_validation.is_valid:
-                return result  # Can't continue if file syntax is invalid
-
-            # Load schema
-            schema = self.schema_manager.load_schema_file(schema_path)
-            schema.source_file = schema_path
-
-            # Resolve patterns
-            resolved_patterns = self._resolve_schema_patterns(schema)
-
-            # Comprehensive validation
-            comprehensive_validation = self.validation_system.validate_complete_configuration(
-                schema, resolved_patterns, schema_path
-            )
-            result.merge(comprehensive_validation)
-
-        except Exception as e:
-            result.add_error(f"Validation failed: {e}")
-
-        return result
-
-    def validate_pattern_file(self, pattern_path: str) -> ValidationResult:
-        """
-        Validate a pattern file (.properties).
-
-        Args:
-            pattern_path: Path to the .properties file
-
-        Returns:
-            ValidationResult with validation issues
-        """
-        return self.validation_system.validate_file_syntax(pattern_path)
-
-    def get_available_patterns(self, search_paths: Optional[List[str]] = None) -> Dict[str, str]:
-        """
-        Get all available patterns from the pattern hierarchy.
-
-        Args:
-            search_paths: Optional list of paths to search for patterns
-
-        Returns:
-            Dictionary mapping pattern names to their regex definitions
-        """
-        if search_paths is None:
-            search_paths = self._get_default_pattern_paths()
-
-        all_patterns = {}
-
-        for path in search_paths:
-            if os.path.exists(path) and path.endswith('.properties'):
-                try:
-                    patterns = self.pattern_manager.load_properties_file(path)
-                    all_patterns.update(patterns)
-                except Exception:
-                    continue  # Skip files that can't be loaded
-
-        return all_patterns
-
-    def _validate_schema_file_path(self, schema_path: str) -> None:
-        """Validate schema file path and extension."""
-        if not os.path.exists(schema_path):
-            raise ValidationError(f"Schema file not found: {schema_path}")
-
-        if not schema_path.endswith('.toml'):
-            raise ValidationError(
-                f"Invalid schema file extension. Expected .toml, got: {Path(schema_path).suffix}"
-            )
-
-    def _resolve_schema_patterns(self, schema: SchemaDefinition) -> Dict[str, ResolvedPattern]:
-        """Resolve all patterns referenced in the schema."""
-        pattern_names = schema.get_all_pattern_names()
-
-        try:
-            resolution_results = self.pattern_resolver.resolve_multiple_patterns(
-                pattern_names,
-                inline_patterns=schema.inline_patterns,
-                user_imports=schema.imports
-            )
-
-            # Convert PatternResolutionResult to ResolvedPattern
-            resolved_patterns = {}
-            for name, result in resolution_results.items():
-                # Extract resolution chain from search order
-                resolution_chain = [source for source, _, _ in result.trace.search_order]
-
-                resolved_pattern = ResolvedPattern(
-                    definition=result.pattern,
-                    resolution_chain=resolution_chain,
-                    final_source=result.trace.resolved_source
-                )
-                resolved_patterns[name] = resolved_pattern
-
-            return resolved_patterns
-
-        except Exception as e:
-            raise SyntheticDataError(f"Pattern resolution failed: {e}")
-
-    def _convert_parsed_schema_to_definition(self, parsed_schema: ParsedSchemaFile) -> SchemaDefinition:
-        """Convert ParsedSchemaFile to SchemaDefinition."""
-        from .models import FieldDefinition, DistributionStrategy, DistributionType
-
-        schema = SchemaDefinition(
-            imports=parsed_schema.imports,
-            inline_patterns=parsed_schema.inline_patterns,
-            metadata=parsed_schema.metadata
-        )
-
-        # Convert schema definitions to field definitions
-        # For now, create simple field definitions - this will be enhanced in later tasks
-        for field_name, pattern_name in parsed_schema.schema_definitions.items():
-            # Create a simple equal distribution for now
-            distribution = DistributionStrategy(
-                strategy_type=DistributionType.EQUAL,
-                validate_on_init=False
-            )
-
-            field_def = FieldDefinition(
-                name=field_name,
-                pattern_name=pattern_name,
-                distribution=distribution
-            )
-
-            schema.add_field(field_def)
-
-        return schema
-
-    def _get_default_pattern_paths(self) -> List[str]:
-        """Get default paths to search for pattern files."""
-        paths = []
-
-        # Global patterns
-        global_path = os.path.join("reference", "schema_definitions", "global.properties")
-        if os.path.exists(global_path):
-            paths.append(global_path)
-
-        # User patterns (if they exist)
-        user_global = os.path.expanduser("~/.additory/patterns/global.properties")
-        if os.path.exists(user_global):
-            paths.append(user_global)
-
-        return paths
-
-    def _raise_validation_error(self, message: str, validation_result: ValidationResult) -> None:
-        """Raise a ValidationError with formatted validation results."""
-        error_details = []
-
-        for error in validation_result.errors:
-            error_details.append(f"  ERROR: {error}")
-
-        for warning in validation_result.warnings:
-            error_details.append(f"  WARNING: {warning}")
-
-        full_message = f"{message}:\n" + "\n".join(error_details)
-        raise ValidationError(full_message)
-
-
-class ValidationHelper:
-    """Helper class for validation operations and error reporting."""
-
-    @staticmethod
-    def format_validation_report(result: ValidationResult, title: str = "Validation Report") -> str:
-        """Format a validation result into a human-readable report."""
-        lines = [f"=== {title} ===", ""]
-
-        if result.is_valid:
-            lines.append("✅ All validations passed!")
-            if result.warnings:
-                lines.append(f"\n⚠️ {len(result.warnings)} warning(s):")
-                for warning in result.warnings:
-                    lines.append(f"  • {warning}")
-        else:
-            lines.append(f"❌ Validation failed with {len(result.errors)} error(s):")
-            for error in result.errors:
-                lines.append(f"  • {error}")
-
-            if result.warnings:
-                lines.append(f"\n⚠️ {len(result.warnings)} warning(s):")
-                for warning in result.warnings:
-                    lines.append(f"  • {warning}")
-
-        return "\n".join(lines)
-
-    @staticmethod
-    def get_validation_summary(result: ValidationResult) -> Dict[str, int]:
-        """Get a summary of validation results."""
-        return {
-            "errors": len(result.errors),
-            "warnings": len(result.warnings),
-            "is_valid": result.is_valid
-        }
additory/synthetic/models.py
DELETED
@@ -1,241 +0,0 @@
-"""
-Core data models for synthetic data generation system.
-
-Defines the data structures used throughout the system for patterns,
-distributions, schemas, and generation contexts.
-"""
-
-from dataclasses import dataclass, field
-from enum import Enum
-from typing import Dict, List, Any, Optional, Union
-import polars as pl
-
-
-class PatternSource(Enum):
-    """Sources for pattern definitions in the hierarchy."""
-    INLINE = "inline"
-    USER_IMPORT = "user_import"
-    USER_GLOBAL = "user_global"
-    CORE_NON_GLOBAL = "core_non_global"
-    CORE_GLOBAL = "core_global"
-
-
-class ValidationStatus(Enum):
-    """Validation status for patterns and schemas."""
-    VALID = "valid"
-    INVALID = "invalid"
-    PENDING = "pending"
-    NOT_VALIDATED = "not_validated"
-
-
-class DistributionType(Enum):
-    """Supported distribution strategy types."""
-    EQUAL = "equal"
-    CUSTOM = "custom"
-    CATEGORICAL = "categorical"
-    HIGH_CARDINALITY = "high_cardinality"
-    NUMERIC_RANGE = "numeric_range"
-    SKEWED = "skewed"
-
-
-@dataclass
-class PatternDefinition:
-    """Represents a regex pattern definition with metadata."""
-    name: str
-    regex: str
-    source: PatternSource
-    validation_status: ValidationStatus = ValidationStatus.NOT_VALIDATED
-    polars_compatible: bool = False
-    source_file: Optional[str] = None
-    line_number: Optional[int] = None
-
-    def __post_init__(self):
-        """Validate pattern definition after initialization."""
-        if not self.name:
-            raise ValueError("Pattern name cannot be empty")
-        if not self.regex:
-            raise ValueError("Pattern regex cannot be empty")
-
-
-@dataclass
-class ValidationRule:
-    """Represents a validation rule for distributions."""
-    rule_type: str
-    parameters: Dict[str, Any]
-    error_message: str
-
-
-@dataclass
-class DistributionStrategy:
-    """Represents a distribution strategy configuration."""
-    strategy_type: DistributionType
-    parameters: Dict[str, Any] = field(default_factory=dict)
-    validation_rules: List[ValidationRule] = field(default_factory=list)
-    validate_on_init: bool = True
-
-    def __post_init__(self):
-        """Validate distribution strategy after initialization."""
-        if self.validate_on_init:
-            self._validate_parameters()
-
-    def _validate_parameters(self):
-        """Validate parameters based on strategy type."""
-        if self.strategy_type == DistributionType.CUSTOM:
-            if 'weights' not in self.parameters:
-                raise ValueError("Custom distribution requires 'weights' parameter")
-        elif self.strategy_type == DistributionType.NUMERIC_RANGE:
-            if 'min' not in self.parameters or 'max' not in self.parameters:
-                raise ValueError("Numeric range distribution requires 'min' and 'max' parameters")
-        elif self.strategy_type == DistributionType.CATEGORICAL:
-            if 'categories' not in self.parameters:
-                raise ValueError("Categorical distribution requires 'categories' parameter")
-
-
-@dataclass
-class FieldDefinition:
-    """Represents a field definition in a schema."""
-    name: str
-    pattern_name: str
-    distribution: DistributionStrategy
-    constraints: List[str] = field(default_factory=list)
-
-    def __post_init__(self):
-        """Validate field definition after initialization."""
-        if not self.name:
-            raise ValueError("Field name cannot be empty")
-        if not self.pattern_name:
-            raise ValueError("Pattern name cannot be empty")
-
-
-@dataclass
-class SchemaDefinition:
-    """Represents a complete schema definition."""
-    imports: List[str] = field(default_factory=list)
-    inline_patterns: Dict[str, str] = field(default_factory=dict)
-    field_definitions: Dict[str, FieldDefinition] = field(default_factory=dict)
-    metadata: Dict[str, Any] = field(default_factory=dict)
-    source_file: Optional[str] = None
-
-    def add_field(self, field_def: FieldDefinition):
-        """Add a field definition to the schema."""
-        self.field_definitions[field_def.name] = field_def
-
-    def get_field(self, name: str) -> Optional[FieldDefinition]:
-        """Get a field definition by name."""
-        return self.field_definitions.get(name)
-
-    def get_all_pattern_names(self) -> List[str]:
-        """Get all pattern names referenced in the schema."""
-        pattern_names = list(self.inline_patterns.keys())
-        for field_def in self.field_definitions.values():
-            if field_def.pattern_name not in pattern_names:
-                pattern_names.append(field_def.pattern_name)
-        return pattern_names
-
-
-@dataclass
-class ResolvedPattern:
-    """Represents a pattern that has been resolved through the hierarchy."""
-    definition: PatternDefinition
-    resolution_chain: List[PatternSource]
-    final_source: PatternSource
-
-    @property
-    def name(self) -> str:
-        return self.definition.name
-
-    @property
-    def regex(self) -> str:
-        return self.definition.regex
-
-    @property
-    def is_valid(self) -> bool:
-        return (self.definition.validation_status == ValidationStatus.VALID and
-                self.definition.polars_compatible)
-
-
-@dataclass
-class GenerationContext:
-    """Context for data generation operations."""
-    schema: SchemaDefinition
-    resolved_patterns: Dict[str, ResolvedPattern]
-    target_rows: int
-    output_engine: str = "pandas"
-    batch_size: int = 10000
-    seed: Optional[int] = None
-
-    def get_pattern(self, name: str) -> Optional[ResolvedPattern]:
-        """Get a resolved pattern by name."""
-        return self.resolved_patterns.get(name)
-
-    def validate_context(self) -> bool:
-        """Validate that the generation context is complete and valid."""
-        # Check that all required patterns are resolved
-        for field_def in self.schema.field_definitions.values():
-            if field_def.pattern_name not in self.resolved_patterns:
-                return False
-
-            pattern = self.resolved_patterns[field_def.pattern_name]
-            if not pattern.is_valid:
-                return False
-
-        return True
-
-
-@dataclass
-class GenerationResult:
-    """Result of a data generation operation."""
-    dataframe: Union[pl.DataFrame, 'pd.DataFrame']
-    metadata: Dict[str, Any] = field(default_factory=dict)
-    generation_time: Optional[float] = None
-    memory_usage: Optional[int] = None
-
-    @property
-    def row_count(self) -> int:
-        """Get the number of rows in the generated dataframe."""
-        if hasattr(self.dataframe, 'height'):  # polars
-            return self.dataframe.height
-        else:  # pandas
-            return len(self.dataframe)
-
-    @property
-    def column_count(self) -> int:
-        """Get the number of columns in the generated dataframe."""
-        if hasattr(self.dataframe, 'width'):  # polars
-            return self.dataframe.width
-        else:  # pandas
-            return len(self.dataframe.columns)
-
-
-@dataclass
-class ValidationResult:
-    """Result of a validation operation."""
-    is_valid: bool
-    errors: List[str] = field(default_factory=list)
-    warnings: List[str] = field(default_factory=list)
-    suggestions: List[str] = field(default_factory=list)
-
-    def add_error(self, error: str, suggestion: Optional[str] = None):
-        """Add an error to the validation result."""
-        self.errors.append(error)
-        self.is_valid = False
-        if suggestion:
-            self.suggestions.append(suggestion)
-
-    def add_warning(self, warning: str):
-        """Add a warning to the validation result."""
-        self.warnings.append(warning)
-
-    def merge(self, other: 'ValidationResult'):
-        """Merge another validation result into this one."""
-        self.errors.extend(other.errors)
-        self.warnings.extend(other.warnings)
-        self.suggestions.extend(other.suggestions)
-        if not other.is_valid:
-            self.is_valid = False
-
-
-# Type aliases for common types
-PatternDict = Dict[str, PatternDefinition]
-ResolvedPatternDict = Dict[str, ResolvedPattern]
-DistributionDict = Dict[str, DistributionStrategy]