additory 0.1.0a2__py3-none-any.whl → 0.1.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/config.py +3 -3
- additory/core/polars_expression_engine.py +66 -16
- additory/core/registry.py +4 -3
- additory/dynamic_api.py +95 -51
- additory/expressions/proxy.py +4 -1
- additory/expressions/registry.py +3 -3
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/synthetic/deduce.py +259 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +87 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
|
@@ -1,685 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Schema parser for TOML schema processing.
|
|
3
|
-
|
|
4
|
-
Handles parsing of TOML schema files with import declarations, inline pattern overrides,
|
|
5
|
-
distribution strategy specifications, and comprehensive validation.
|
|
6
|
-
|
|
7
|
-
Enhanced to support:
|
|
8
|
-
- New [generation] section with imports and prefer_mode
|
|
9
|
-
- Pattern type detection (inline_list, inline_regex, reference)
|
|
10
|
-
- Integration with common module for pattern resolution
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
from dataclasses import dataclass, field
|
|
14
|
-
from pathlib import Path
|
|
15
|
-
from typing import Dict, List, Optional, Any, Union, Set
|
|
16
|
-
import toml
|
|
17
|
-
import logging
|
|
18
|
-
|
|
19
|
-
from .models import (
|
|
20
|
-
SchemaDefinition, FieldDefinition, DistributionStrategy, DistributionType,
|
|
21
|
-
ValidationResult, ValidationStatus
|
|
22
|
-
)
|
|
23
|
-
from .file_managers import SchemaFileManager, ParsedSchemaFile
|
|
24
|
-
from .pattern_resolver import PatternHierarchyResolver
|
|
25
|
-
from .common_integration import SyntheticPatternLoader, detect_pattern_type_from_toml
|
|
26
|
-
from .exceptions import (
|
|
27
|
-
SchemaParsingError, ValidationError, PatternResolutionError,
|
|
28
|
-
DistributionValidationError
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
logger = logging.getLogger(__name__)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@dataclass
|
|
36
|
-
class ParsedFieldDefinition:
|
|
37
|
-
"""Represents a parsed field definition from schema."""
|
|
38
|
-
name: str
|
|
39
|
-
pattern_value: Union[str, List[str]] # Can be reference, regex, or list
|
|
40
|
-
pattern_type: str # "inline_list", "inline_regex", or "reference"
|
|
41
|
-
distribution: Optional[DistributionStrategy] = None
|
|
42
|
-
constraints: List[str] = field(default_factory=list)
|
|
43
|
-
metadata: Dict[str, Any] = field(default_factory=dict)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
@dataclass
|
|
47
|
-
class SchemaParsingResult:
|
|
48
|
-
"""Result of comprehensive schema parsing."""
|
|
49
|
-
schema_definition: SchemaDefinition
|
|
50
|
-
validation_result: ValidationResult
|
|
51
|
-
referenced_patterns: Set[str]
|
|
52
|
-
distribution_strategies: Dict[str, DistributionStrategy]
|
|
53
|
-
parsing_metadata: Dict[str, Any] = field(default_factory=dict)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
class SchemaParser:
|
|
57
|
-
"""
|
|
58
|
-
Comprehensive TOML schema parser with validation and pattern resolution.
|
|
59
|
-
|
|
60
|
-
Handles:
|
|
61
|
-
- Import declaration parsing and validation
|
|
62
|
-
- Inline pattern override processing
|
|
63
|
-
- Distribution strategy specification parsing
|
|
64
|
-
- Pattern reference validation
|
|
65
|
-
- Schema structure validation
|
|
66
|
-
"""
|
|
67
|
-
|
|
68
|
-
def __init__(self, pattern_resolver: Optional[PatternHierarchyResolver] = None,
|
|
69
|
-
use_common_integration: bool = True):
|
|
70
|
-
"""
|
|
71
|
-
Initialize the schema parser.
|
|
72
|
-
|
|
73
|
-
Args:
|
|
74
|
-
pattern_resolver: Pattern hierarchy resolver for validation (legacy)
|
|
75
|
-
use_common_integration: Whether to use new common_integration module (default: True)
|
|
76
|
-
"""
|
|
77
|
-
self.schema_file_manager = SchemaFileManager()
|
|
78
|
-
self.pattern_resolver = pattern_resolver or PatternHierarchyResolver()
|
|
79
|
-
|
|
80
|
-
# New: Use common_integration for pattern loading
|
|
81
|
-
self.use_common_integration = use_common_integration
|
|
82
|
-
if use_common_integration:
|
|
83
|
-
self.pattern_loader = SyntheticPatternLoader()
|
|
84
|
-
|
|
85
|
-
# Supported distribution types for validation
|
|
86
|
-
self.supported_distributions = {
|
|
87
|
-
'equal', 'custom', 'categorical', 'high_cardinality',
|
|
88
|
-
'numeric_range', 'skewed'
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
def parse_schema_file(self, file_path: str, validate_patterns: bool = True) -> SchemaParsingResult:
|
|
92
|
-
"""
|
|
93
|
-
Parse a TOML schema file with comprehensive validation.
|
|
94
|
-
|
|
95
|
-
Args:
|
|
96
|
-
file_path: Path to the TOML schema file
|
|
97
|
-
validate_patterns: Whether to validate referenced patterns exist
|
|
98
|
-
|
|
99
|
-
Returns:
|
|
100
|
-
SchemaParsingResult with parsed schema and validation results
|
|
101
|
-
|
|
102
|
-
Raises:
|
|
103
|
-
SchemaParsingError: If schema parsing fails
|
|
104
|
-
ValidationError: If validation fails
|
|
105
|
-
"""
|
|
106
|
-
# Load the raw TOML file
|
|
107
|
-
parsed_file = self.schema_file_manager.load_toml_schema(file_path)
|
|
108
|
-
|
|
109
|
-
# Initialize validation result
|
|
110
|
-
validation_result = ValidationResult(is_valid=True)
|
|
111
|
-
|
|
112
|
-
# Parse field definitions from schema section
|
|
113
|
-
field_definitions, field_validation = self._parse_field_definitions(
|
|
114
|
-
parsed_file.schema_definitions, file_path
|
|
115
|
-
)
|
|
116
|
-
validation_result.merge(field_validation)
|
|
117
|
-
|
|
118
|
-
# Parse distribution strategies
|
|
119
|
-
distribution_strategies, dist_validation = self._parse_distribution_strategies(
|
|
120
|
-
parsed_file.schema_definitions, file_path
|
|
121
|
-
)
|
|
122
|
-
validation_result.merge(dist_validation)
|
|
123
|
-
|
|
124
|
-
# Validate imports
|
|
125
|
-
import_validation = self._validate_imports(parsed_file.imports, file_path)
|
|
126
|
-
validation_result.merge(import_validation)
|
|
127
|
-
|
|
128
|
-
# Validate inline patterns
|
|
129
|
-
inline_validation = self._validate_inline_patterns(parsed_file.inline_patterns, file_path)
|
|
130
|
-
validation_result.merge(inline_validation)
|
|
131
|
-
|
|
132
|
-
# Collect all referenced patterns
|
|
133
|
-
referenced_patterns = self._collect_referenced_patterns(
|
|
134
|
-
field_definitions, parsed_file.inline_patterns
|
|
135
|
-
)
|
|
136
|
-
|
|
137
|
-
# Validate pattern references if requested
|
|
138
|
-
if validate_patterns:
|
|
139
|
-
pattern_validation = self._validate_pattern_references(
|
|
140
|
-
referenced_patterns, parsed_file.imports,
|
|
141
|
-
parsed_file.inline_patterns, file_path
|
|
142
|
-
)
|
|
143
|
-
validation_result.merge(pattern_validation)
|
|
144
|
-
|
|
145
|
-
# Create schema definition
|
|
146
|
-
schema_definition = SchemaDefinition(
|
|
147
|
-
imports=parsed_file.imports,
|
|
148
|
-
inline_patterns=parsed_file.inline_patterns,
|
|
149
|
-
field_definitions={name: self._convert_to_field_definition(field_def, distribution_strategies.get(name))
|
|
150
|
-
for name, field_def in field_definitions.items()},
|
|
151
|
-
metadata=parsed_file.metadata,
|
|
152
|
-
source_file=file_path
|
|
153
|
-
)
|
|
154
|
-
|
|
155
|
-
# Create parsing metadata
|
|
156
|
-
parsing_metadata = {
|
|
157
|
-
'file_path': file_path,
|
|
158
|
-
'field_count': len(field_definitions),
|
|
159
|
-
'import_count': len(parsed_file.imports),
|
|
160
|
-
'inline_pattern_count': len(parsed_file.inline_patterns),
|
|
161
|
-
'distribution_strategy_count': len(distribution_strategies),
|
|
162
|
-
'referenced_pattern_count': len(referenced_patterns)
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
return SchemaParsingResult(
|
|
166
|
-
schema_definition=schema_definition,
|
|
167
|
-
validation_result=validation_result,
|
|
168
|
-
referenced_patterns=referenced_patterns,
|
|
169
|
-
distribution_strategies=distribution_strategies,
|
|
170
|
-
parsing_metadata=parsing_metadata
|
|
171
|
-
)
|
|
172
|
-
|
|
173
|
-
def _parse_field_definitions(self, schema_definitions: Dict[str, Any],
|
|
174
|
-
file_path: str) -> tuple[Dict[str, ParsedFieldDefinition], ValidationResult]:
|
|
175
|
-
"""
|
|
176
|
-
Parse field definitions from schema section.
|
|
177
|
-
|
|
178
|
-
Enhanced to support:
|
|
179
|
-
- String patterns (reference or inline regex)
|
|
180
|
-
- Array patterns (inline list)
|
|
181
|
-
- Pattern type detection
|
|
182
|
-
|
|
183
|
-
Args:
|
|
184
|
-
schema_definitions: Schema definitions from TOML
|
|
185
|
-
file_path: File path for error reporting
|
|
186
|
-
|
|
187
|
-
Returns:
|
|
188
|
-
Tuple of (field_definitions, validation_result)
|
|
189
|
-
"""
|
|
190
|
-
field_definitions = {}
|
|
191
|
-
validation_result = ValidationResult(is_valid=True)
|
|
192
|
-
|
|
193
|
-
for field_name, pattern_spec in schema_definitions.items():
|
|
194
|
-
try:
|
|
195
|
-
# Detect pattern type
|
|
196
|
-
pattern_type = detect_pattern_type_from_toml(pattern_spec)
|
|
197
|
-
|
|
198
|
-
if pattern_type == "unknown":
|
|
199
|
-
validation_result.add_error(
|
|
200
|
-
f"Invalid pattern type for field '{field_name}'",
|
|
201
|
-
f"Pattern must be a string (reference/regex) or array (list), got {type(pattern_spec)}"
|
|
202
|
-
)
|
|
203
|
-
continue
|
|
204
|
-
|
|
205
|
-
# Validate field name
|
|
206
|
-
if not self._is_valid_field_name(field_name):
|
|
207
|
-
validation_result.add_error(
|
|
208
|
-
f"Invalid field name '{field_name}' in schema",
|
|
209
|
-
"Field names must be valid identifiers"
|
|
210
|
-
)
|
|
211
|
-
continue
|
|
212
|
-
|
|
213
|
-
# Create field definition with pattern value and type
|
|
214
|
-
field_def = ParsedFieldDefinition(
|
|
215
|
-
name=field_name,
|
|
216
|
-
pattern_value=pattern_spec,
|
|
217
|
-
pattern_type=pattern_type
|
|
218
|
-
)
|
|
219
|
-
|
|
220
|
-
field_definitions[field_name] = field_def
|
|
221
|
-
|
|
222
|
-
logger.debug(f"Parsed field '{field_name}': type={pattern_type}, value={pattern_spec}")
|
|
223
|
-
|
|
224
|
-
except Exception as e:
|
|
225
|
-
validation_result.add_error(
|
|
226
|
-
f"Error parsing field '{field_name}': {e}",
|
|
227
|
-
"Check field definition syntax"
|
|
228
|
-
)
|
|
229
|
-
|
|
230
|
-
return field_definitions, validation_result
|
|
231
|
-
|
|
232
|
-
def _parse_distribution_strategies(self, schema_definitions: Dict[str, Any],
|
|
233
|
-
file_path: str) -> tuple[Dict[str, DistributionStrategy], ValidationResult]:
|
|
234
|
-
"""
|
|
235
|
-
Parse distribution strategy specifications from schema.
|
|
236
|
-
|
|
237
|
-
Enhanced to handle both string and array patterns.
|
|
238
|
-
|
|
239
|
-
Args:
|
|
240
|
-
schema_definitions: Schema definitions from TOML
|
|
241
|
-
file_path: File path for error reporting
|
|
242
|
-
|
|
243
|
-
Returns:
|
|
244
|
-
Tuple of (distribution_strategies, validation_result)
|
|
245
|
-
"""
|
|
246
|
-
distribution_strategies = {}
|
|
247
|
-
validation_result = ValidationResult(is_valid=True)
|
|
248
|
-
|
|
249
|
-
# Look for distribution specifications in metadata or special sections
|
|
250
|
-
# For now, we'll parse simple distribution specs from pattern specifications
|
|
251
|
-
for field_name, pattern_spec in schema_definitions.items():
|
|
252
|
-
try:
|
|
253
|
-
# Skip non-string patterns (arrays don't have distribution specs in the pattern)
|
|
254
|
-
if not isinstance(pattern_spec, str):
|
|
255
|
-
continue
|
|
256
|
-
|
|
257
|
-
distribution = self._extract_distribution_from_spec(pattern_spec)
|
|
258
|
-
if distribution:
|
|
259
|
-
# Validate distribution strategy
|
|
260
|
-
dist_validation = self._validate_distribution_strategy(distribution, field_name)
|
|
261
|
-
validation_result.merge(dist_validation)
|
|
262
|
-
|
|
263
|
-
if dist_validation.is_valid:
|
|
264
|
-
distribution_strategies[field_name] = distribution
|
|
265
|
-
|
|
266
|
-
except Exception as e:
|
|
267
|
-
validation_result.add_error(
|
|
268
|
-
f"Error parsing distribution for field '{field_name}': {e}",
|
|
269
|
-
"Check distribution strategy syntax"
|
|
270
|
-
)
|
|
271
|
-
|
|
272
|
-
return distribution_strategies, validation_result
|
|
273
|
-
|
|
274
|
-
def _validate_imports(self, imports: List[str], file_path: str) -> ValidationResult:
|
|
275
|
-
"""
|
|
276
|
-
Validate import declarations.
|
|
277
|
-
|
|
278
|
-
Enhanced to use common_integration for validation.
|
|
279
|
-
|
|
280
|
-
Args:
|
|
281
|
-
imports: List of import declarations
|
|
282
|
-
file_path: File path for error reporting
|
|
283
|
-
|
|
284
|
-
Returns:
|
|
285
|
-
ValidationResult with import validation status
|
|
286
|
-
"""
|
|
287
|
-
validation_result = ValidationResult(is_valid=True)
|
|
288
|
-
|
|
289
|
-
# Use common_integration for validation if enabled
|
|
290
|
-
if self.use_common_integration and hasattr(self, 'pattern_loader'):
|
|
291
|
-
is_valid, errors = self.pattern_loader.validate_imports(imports)
|
|
292
|
-
if not is_valid:
|
|
293
|
-
for error in errors:
|
|
294
|
-
validation_result.add_error(error, "Check import declarations")
|
|
295
|
-
return validation_result
|
|
296
|
-
|
|
297
|
-
# Legacy validation
|
|
298
|
-
for import_name in imports:
|
|
299
|
-
# Validate import name format
|
|
300
|
-
if not self._is_valid_import_name(import_name):
|
|
301
|
-
validation_result.add_error(
|
|
302
|
-
f"Invalid import name '{import_name}'",
|
|
303
|
-
"Import names should be valid pattern file names (without .properties extension)"
|
|
304
|
-
)
|
|
305
|
-
|
|
306
|
-
# Check for circular imports (basic check)
|
|
307
|
-
if import_name == "self" or import_name == Path(file_path).stem:
|
|
308
|
-
validation_result.add_error(
|
|
309
|
-
f"Circular import detected: '{import_name}'",
|
|
310
|
-
"Schemas cannot import themselves"
|
|
311
|
-
)
|
|
312
|
-
|
|
313
|
-
return validation_result
|
|
314
|
-
|
|
315
|
-
def _validate_inline_patterns(self, inline_patterns: Dict[str, Union[str, List[str]]],
|
|
316
|
-
file_path: str) -> ValidationResult:
|
|
317
|
-
"""
|
|
318
|
-
Validate inline pattern definitions.
|
|
319
|
-
|
|
320
|
-
Enhanced to support both string (regex) and array (list) patterns.
|
|
321
|
-
|
|
322
|
-
Args:
|
|
323
|
-
inline_patterns: Dictionary of inline patterns
|
|
324
|
-
file_path: File path for error reporting
|
|
325
|
-
|
|
326
|
-
Returns:
|
|
327
|
-
ValidationResult with inline pattern validation status
|
|
328
|
-
"""
|
|
329
|
-
validation_result = ValidationResult(is_valid=True)
|
|
330
|
-
|
|
331
|
-
for pattern_name, pattern_value in inline_patterns.items():
|
|
332
|
-
# Validate pattern name
|
|
333
|
-
if not self._is_valid_pattern_name(pattern_name):
|
|
334
|
-
validation_result.add_error(
|
|
335
|
-
f"Invalid inline pattern name '{pattern_name}'",
|
|
336
|
-
"Pattern names must start with a letter and contain only letters, numbers, and underscores"
|
|
337
|
-
)
|
|
338
|
-
|
|
339
|
-
# Validate based on pattern type
|
|
340
|
-
if isinstance(pattern_value, str):
|
|
341
|
-
# String pattern (regex)
|
|
342
|
-
if not pattern_value.strip():
|
|
343
|
-
validation_result.add_error(
|
|
344
|
-
f"Empty regex pattern for inline pattern '{pattern_name}'",
|
|
345
|
-
"Regex patterns must not be empty"
|
|
346
|
-
)
|
|
347
|
-
|
|
348
|
-
# Basic regex syntax validation
|
|
349
|
-
try:
|
|
350
|
-
import re
|
|
351
|
-
re.compile(pattern_value)
|
|
352
|
-
except re.error as e:
|
|
353
|
-
validation_result.add_error(
|
|
354
|
-
f"Invalid regex syntax in inline pattern '{pattern_name}': {e}",
|
|
355
|
-
"Check regex syntax for proper escaping and structure"
|
|
356
|
-
)
|
|
357
|
-
|
|
358
|
-
elif isinstance(pattern_value, list):
|
|
359
|
-
# Array pattern (list)
|
|
360
|
-
if not pattern_value:
|
|
361
|
-
validation_result.add_error(
|
|
362
|
-
f"Empty list for inline pattern '{pattern_name}'",
|
|
363
|
-
"List patterns must contain at least one value"
|
|
364
|
-
)
|
|
365
|
-
|
|
366
|
-
# Validate all items are strings
|
|
367
|
-
for i, item in enumerate(pattern_value):
|
|
368
|
-
if not isinstance(item, str):
|
|
369
|
-
validation_result.add_error(
|
|
370
|
-
f"Invalid item type in inline pattern '{pattern_name}' at index {i}",
|
|
371
|
-
f"All list items must be strings, got {type(item)}"
|
|
372
|
-
)
|
|
373
|
-
|
|
374
|
-
else:
|
|
375
|
-
validation_result.add_error(
|
|
376
|
-
f"Invalid pattern type for inline pattern '{pattern_name}'",
|
|
377
|
-
f"Pattern must be a string (regex) or array (list), got {type(pattern_value)}"
|
|
378
|
-
)
|
|
379
|
-
|
|
380
|
-
return validation_result
|
|
381
|
-
|
|
382
|
-
def _collect_referenced_patterns(self, field_definitions: Dict[str, ParsedFieldDefinition],
|
|
383
|
-
inline_patterns: Dict[str, Union[str, List[str]]]) -> Set[str]:
|
|
384
|
-
"""
|
|
385
|
-
Collect all pattern names referenced in the schema.
|
|
386
|
-
|
|
387
|
-
Enhanced to only collect references (not inline patterns).
|
|
388
|
-
|
|
389
|
-
Args:
|
|
390
|
-
field_definitions: Parsed field definitions
|
|
391
|
-
inline_patterns: Inline pattern definitions
|
|
392
|
-
|
|
393
|
-
Returns:
|
|
394
|
-
Set of all referenced pattern names (excluding inline patterns)
|
|
395
|
-
"""
|
|
396
|
-
referenced_patterns = set()
|
|
397
|
-
|
|
398
|
-
# Add patterns from field definitions (only references, not inline)
|
|
399
|
-
for field_def in field_definitions.values():
|
|
400
|
-
if field_def.pattern_type == "reference":
|
|
401
|
-
# Only references need to be resolved
|
|
402
|
-
referenced_patterns.add(field_def.pattern_value)
|
|
403
|
-
|
|
404
|
-
# Inline patterns don't need resolution (they're already defined)
|
|
405
|
-
# But we track them for completeness
|
|
406
|
-
referenced_patterns.update(inline_patterns.keys())
|
|
407
|
-
|
|
408
|
-
return referenced_patterns
|
|
409
|
-
|
|
410
|
-
def _validate_pattern_references(self, referenced_patterns: Set[str],
|
|
411
|
-
imports: List[str], inline_patterns: Dict[str, Union[str, List[str]]],
|
|
412
|
-
file_path: str) -> ValidationResult:
|
|
413
|
-
"""
|
|
414
|
-
Validate that all referenced patterns can be resolved.
|
|
415
|
-
|
|
416
|
-
Enhanced to use common_integration for pattern resolution.
|
|
417
|
-
|
|
418
|
-
Args:
|
|
419
|
-
referenced_patterns: Set of referenced pattern names
|
|
420
|
-
imports: List of import declarations
|
|
421
|
-
inline_patterns: Inline pattern definitions
|
|
422
|
-
file_path: File path for error reporting
|
|
423
|
-
|
|
424
|
-
Returns:
|
|
425
|
-
ValidationResult with pattern reference validation status
|
|
426
|
-
"""
|
|
427
|
-
validation_result = ValidationResult(is_valid=True)
|
|
428
|
-
|
|
429
|
-
# Get prefer_mode from parsed file metadata
|
|
430
|
-
prefer_mode = "default" # Default value
|
|
431
|
-
|
|
432
|
-
# Use common_integration for validation if enabled
|
|
433
|
-
if self.use_common_integration and hasattr(self, 'pattern_loader'):
|
|
434
|
-
for pattern_name in referenced_patterns:
|
|
435
|
-
# Skip inline patterns (they're already defined)
|
|
436
|
-
if pattern_name in inline_patterns:
|
|
437
|
-
continue
|
|
438
|
-
|
|
439
|
-
try:
|
|
440
|
-
# Try to resolve the pattern using common_integration
|
|
441
|
-
self.pattern_loader.load_pattern(
|
|
442
|
-
pattern_name,
|
|
443
|
-
imports=imports,
|
|
444
|
-
prefer_mode=prefer_mode
|
|
445
|
-
)
|
|
446
|
-
logger.debug(f"Successfully resolved pattern '{pattern_name}'")
|
|
447
|
-
except PatternResolutionError as e:
|
|
448
|
-
validation_result.add_error(
|
|
449
|
-
f"Cannot resolve pattern '{pattern_name}' referenced in schema",
|
|
450
|
-
f"Pattern not found in imports {imports} or inline patterns. {str(e)}"
|
|
451
|
-
)
|
|
452
|
-
else:
|
|
453
|
-
# Legacy validation using PatternHierarchyResolver
|
|
454
|
-
for pattern_name in referenced_patterns:
|
|
455
|
-
try:
|
|
456
|
-
# Try to resolve the pattern
|
|
457
|
-
self.pattern_resolver.resolve_pattern(
|
|
458
|
-
pattern_name,
|
|
459
|
-
inline_patterns=inline_patterns,
|
|
460
|
-
user_imports=imports
|
|
461
|
-
)
|
|
462
|
-
except PatternResolutionError as e:
|
|
463
|
-
validation_result.add_error(
|
|
464
|
-
f"Cannot resolve pattern '{pattern_name}' referenced in schema",
|
|
465
|
-
f"Pattern not found in imports {imports} or inline patterns. {str(e)}"
|
|
466
|
-
)
|
|
467
|
-
|
|
468
|
-
return validation_result
|
|
469
|
-
|
|
470
|
-
def _parse_pattern_specification(self, pattern_spec: str) -> tuple[str, List[str]]:
|
|
471
|
-
"""
|
|
472
|
-
Parse a pattern specification that might include constraints.
|
|
473
|
-
|
|
474
|
-
Args:
|
|
475
|
-
pattern_spec: Pattern specification string
|
|
476
|
-
|
|
477
|
-
Returns:
|
|
478
|
-
Tuple of (pattern_name, constraints)
|
|
479
|
-
"""
|
|
480
|
-
# For now, simple parsing - just return the pattern name
|
|
481
|
-
# Future enhancement: parse constraints like "email|min_length:5|max_length:50"
|
|
482
|
-
parts = pattern_spec.split('|')
|
|
483
|
-
pattern_name = parts[0].strip()
|
|
484
|
-
constraints = [part.strip() for part in parts[1:]] if len(parts) > 1 else []
|
|
485
|
-
|
|
486
|
-
return pattern_name, constraints
|
|
487
|
-
|
|
488
|
-
def _extract_distribution_from_spec(self, pattern_spec: str) -> Optional[DistributionStrategy]:
|
|
489
|
-
"""
|
|
490
|
-
Extract distribution strategy from pattern specification.
|
|
491
|
-
|
|
492
|
-
Args:
|
|
493
|
-
pattern_spec: Pattern specification string
|
|
494
|
-
|
|
495
|
-
Returns:
|
|
496
|
-
DistributionStrategy if found, None otherwise
|
|
497
|
-
"""
|
|
498
|
-
# Look for distribution specifications in the pattern spec
|
|
499
|
-
# Format: "pattern_name|distribution:equal" or "pattern_name|distribution:custom:20,30,50"
|
|
500
|
-
parts = pattern_spec.split('|')
|
|
501
|
-
|
|
502
|
-
for part in parts:
|
|
503
|
-
part = part.strip()
|
|
504
|
-
if part.startswith('distribution:'):
|
|
505
|
-
dist_spec = part[13:] # Remove 'distribution:' prefix
|
|
506
|
-
return self._parse_distribution_spec(dist_spec)
|
|
507
|
-
|
|
508
|
-
return None
|
|
509
|
-
|
|
510
|
-
def _parse_distribution_spec(self, dist_spec: str) -> DistributionStrategy:
|
|
511
|
-
"""
|
|
512
|
-
Parse a distribution specification string.
|
|
513
|
-
|
|
514
|
-
Args:
|
|
515
|
-
dist_spec: Distribution specification (e.g., "equal", "custom:20,30,50")
|
|
516
|
-
|
|
517
|
-
Returns:
|
|
518
|
-
DistributionStrategy object
|
|
519
|
-
|
|
520
|
-
Raises:
|
|
521
|
-
DistributionValidationError: If distribution spec is invalid
|
|
522
|
-
"""
|
|
523
|
-
parts = dist_spec.split(':')
|
|
524
|
-
dist_type = parts[0].strip()
|
|
525
|
-
|
|
526
|
-
if dist_type not in self.supported_distributions:
|
|
527
|
-
raise DistributionValidationError(
|
|
528
|
-
f"Unsupported distribution type '{dist_type}'",
|
|
529
|
-
dist_type,
|
|
530
|
-
f"Supported types: {', '.join(self.supported_distributions)}"
|
|
531
|
-
)
|
|
532
|
-
|
|
533
|
-
# Parse distribution parameters
|
|
534
|
-
parameters = {}
|
|
535
|
-
if len(parts) > 1:
|
|
536
|
-
param_str = parts[1].strip()
|
|
537
|
-
if dist_type == 'custom':
|
|
538
|
-
# Parse weights: "20,30,50"
|
|
539
|
-
try:
|
|
540
|
-
weights = [float(w.strip()) for w in param_str.split(',')]
|
|
541
|
-
parameters['weights'] = weights
|
|
542
|
-
except ValueError:
|
|
543
|
-
raise DistributionValidationError(
|
|
544
|
-
f"Invalid weights specification '{param_str}'",
|
|
545
|
-
dist_type,
|
|
546
|
-
"Weights must be comma-separated numbers"
|
|
547
|
-
)
|
|
548
|
-
elif dist_type == 'categorical':
|
|
549
|
-
# Parse categories: "A,B,C"
|
|
550
|
-
categories = [cat.strip() for cat in param_str.split(',')]
|
|
551
|
-
parameters['categories'] = categories
|
|
552
|
-
elif dist_type == 'numeric_range':
|
|
553
|
-
# Parse range: "1,100"
|
|
554
|
-
try:
|
|
555
|
-
range_parts = param_str.split(',')
|
|
556
|
-
if len(range_parts) == 2:
|
|
557
|
-
parameters['min'] = float(range_parts[0].strip())
|
|
558
|
-
parameters['max'] = float(range_parts[1].strip())
|
|
559
|
-
else:
|
|
560
|
-
raise ValueError("Range must have exactly 2 values")
|
|
561
|
-
except ValueError:
|
|
562
|
-
raise DistributionValidationError(
|
|
563
|
-
f"Invalid range specification '{param_str}'",
|
|
564
|
-
dist_type,
|
|
565
|
-
"Range must be in format 'min,max'"
|
|
566
|
-
)
|
|
567
|
-
|
|
568
|
-
return DistributionStrategy(
|
|
569
|
-
strategy_type=DistributionType(dist_type),
|
|
570
|
-
parameters=parameters
|
|
571
|
-
)
|
|
572
|
-
|
|
573
|
-
def _validate_distribution_strategy(self, distribution: DistributionStrategy,
|
|
574
|
-
field_name: str) -> ValidationResult:
|
|
575
|
-
"""
|
|
576
|
-
Validate a distribution strategy.
|
|
577
|
-
|
|
578
|
-
Args:
|
|
579
|
-
distribution: Distribution strategy to validate
|
|
580
|
-
field_name: Field name for error reporting
|
|
581
|
-
|
|
582
|
-
Returns:
|
|
583
|
-
ValidationResult with distribution validation status
|
|
584
|
-
"""
|
|
585
|
-
validation_result = ValidationResult(is_valid=True)
|
|
586
|
-
|
|
587
|
-
try:
|
|
588
|
-
# Validate based on distribution type
|
|
589
|
-
if distribution.strategy_type == DistributionType.CUSTOM:
|
|
590
|
-
weights = distribution.parameters.get('weights', [])
|
|
591
|
-
if not weights:
|
|
592
|
-
validation_result.add_error(
|
|
593
|
-
f"Custom distribution for field '{field_name}' missing weights",
|
|
594
|
-
"Custom distributions require weight parameters"
|
|
595
|
-
)
|
|
596
|
-
elif abs(sum(weights) - 100.0) > 0.01:
|
|
597
|
-
validation_result.add_error(
|
|
598
|
-
f"Custom distribution weights for field '{field_name}' must sum to 100",
|
|
599
|
-
f"Current sum: {sum(weights)}"
|
|
600
|
-
)
|
|
601
|
-
|
|
602
|
-
elif distribution.strategy_type == DistributionType.CATEGORICAL:
|
|
603
|
-
categories = distribution.parameters.get('categories', [])
|
|
604
|
-
if not categories:
|
|
605
|
-
validation_result.add_error(
|
|
606
|
-
f"Categorical distribution for field '{field_name}' missing categories",
|
|
607
|
-
"Categorical distributions require category parameters"
|
|
608
|
-
)
|
|
609
|
-
|
|
610
|
-
elif distribution.strategy_type == DistributionType.NUMERIC_RANGE:
|
|
611
|
-
min_val = distribution.parameters.get('min')
|
|
612
|
-
max_val = distribution.parameters.get('max')
|
|
613
|
-
if min_val is None or max_val is None:
|
|
614
|
-
validation_result.add_error(
|
|
615
|
-
f"Numeric range distribution for field '{field_name}' missing range parameters",
|
|
616
|
-
"Numeric range distributions require min and max parameters"
|
|
617
|
-
)
|
|
618
|
-
elif min_val >= max_val:
|
|
619
|
-
validation_result.add_error(
|
|
620
|
-
f"Invalid range for field '{field_name}': min must be less than max",
|
|
621
|
-
f"Current range: {min_val} to {max_val}"
|
|
622
|
-
)
|
|
623
|
-
|
|
624
|
-
except Exception as e:
|
|
625
|
-
validation_result.add_error(
|
|
626
|
-
f"Error validating distribution for field '{field_name}': {e}",
|
|
627
|
-
"Check distribution parameters"
|
|
628
|
-
)
|
|
629
|
-
|
|
630
|
-
return validation_result
|
|
631
|
-
|
|
632
|
-
def _convert_to_field_definition(self, parsed_field: ParsedFieldDefinition,
|
|
633
|
-
distribution: Optional[DistributionStrategy] = None) -> FieldDefinition:
|
|
634
|
-
"""
|
|
635
|
-
Convert parsed field definition to FieldDefinition model.
|
|
636
|
-
|
|
637
|
-
Enhanced to handle new pattern structure with pattern_value and pattern_type.
|
|
638
|
-
|
|
639
|
-
Args:
|
|
640
|
-
parsed_field: Parsed field definition
|
|
641
|
-
distribution: Optional distribution strategy
|
|
642
|
-
|
|
643
|
-
Returns:
|
|
644
|
-
FieldDefinition object
|
|
645
|
-
"""
|
|
646
|
-
# For reference patterns, use the pattern_value as pattern_name
|
|
647
|
-
# For inline patterns, we'll need to handle them differently in the generator
|
|
648
|
-
pattern_name = parsed_field.pattern_value if parsed_field.pattern_type == "reference" else parsed_field.name
|
|
649
|
-
|
|
650
|
-
return FieldDefinition(
|
|
651
|
-
name=parsed_field.name,
|
|
652
|
-
pattern_name=pattern_name,
|
|
653
|
-
distribution=distribution,
|
|
654
|
-
constraints=parsed_field.constraints
|
|
655
|
-
)
|
|
656
|
-
|
|
657
|
-
def _is_valid_field_name(self, name: str) -> bool:
|
|
658
|
-
"""Check if a field name is valid."""
|
|
659
|
-
import keyword
|
|
660
|
-
return (name and
|
|
661
|
-
name.isidentifier() and
|
|
662
|
-
not name.startswith('_') and
|
|
663
|
-
not keyword.iskeyword(name) and
|
|
664
|
-
name.replace('_', '').isalnum())
|
|
665
|
-
|
|
666
|
-
def _is_valid_pattern_name(self, name: str) -> bool:
|
|
667
|
-
"""Check if a pattern name is valid."""
|
|
668
|
-
import keyword
|
|
669
|
-
return (name and
|
|
670
|
-
name.isidentifier() and
|
|
671
|
-
not name.startswith('_') and
|
|
672
|
-
not keyword.iskeyword(name) and
|
|
673
|
-
name.replace('_', '').isalnum())
|
|
674
|
-
|
|
675
|
-
def _is_valid_import_name(self, name: str) -> bool:
|
|
676
|
-
"""Check if an import name is valid."""
|
|
677
|
-
return (name and
|
|
678
|
-
name.replace('_', '').replace('-', '').isalnum() and
|
|
679
|
-
not name.startswith('_') and
|
|
680
|
-
len(name) <= 50)
|
|
681
|
-
|
|
682
|
-
def clear_cache(self):
|
|
683
|
-
"""Clear all caches."""
|
|
684
|
-
self.schema_file_manager.clear_cache()
|
|
685
|
-
self.pattern_resolver.clear_cache()
|