additory-0.1.0a2-py3-none-any.whl → additory-0.1.0a4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/config.py +3 -3
  8. additory/core/polars_expression_engine.py +66 -16
  9. additory/core/registry.py +4 -3
  10. additory/dynamic_api.py +95 -51
  11. additory/expressions/proxy.py +4 -1
  12. additory/expressions/registry.py +3 -3
  13. additory/synthetic/__init__.py +7 -95
  14. additory/synthetic/column_name_resolver.py +149 -0
  15. additory/synthetic/deduce.py +259 -0
  16. additory/{augment → synthetic}/distributions.py +2 -2
  17. additory/{augment → synthetic}/forecast.py +1 -1
  18. additory/synthetic/linked_list_parser.py +415 -0
  19. additory/synthetic/namespace_lookup.py +129 -0
  20. additory/{augment → synthetic}/smote.py +1 -1
  21. additory/{augment → synthetic}/strategies.py +87 -44
  22. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  23. additory/utilities/units.py +4 -1
  24. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
  25. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
  26. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
  27. additory/augment/__init__.py +0 -24
  28. additory/augment/builtin_lists.py +0 -430
  29. additory/augment/list_registry.py +0 -177
  30. additory/synthetic/api.py +0 -220
  31. additory/synthetic/common_integration.py +0 -314
  32. additory/synthetic/config.py +0 -262
  33. additory/synthetic/engines.py +0 -529
  34. additory/synthetic/exceptions.py +0 -180
  35. additory/synthetic/file_managers.py +0 -518
  36. additory/synthetic/generator.py +0 -702
  37. additory/synthetic/generator_parser.py +0 -68
  38. additory/synthetic/integration.py +0 -319
  39. additory/synthetic/models.py +0 -241
  40. additory/synthetic/pattern_resolver.py +0 -573
  41. additory/synthetic/performance.py +0 -469
  42. additory/synthetic/polars_integration.py +0 -464
  43. additory/synthetic/proxy.py +0 -60
  44. additory/synthetic/schema_parser.py +0 -685
  45. additory/synthetic/validator.py +0 -553
  46. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
  47. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
@@ -1,685 +0,0 @@
- """
- Schema parser for TOML schema processing.
-
- Handles parsing of TOML schema files with import declarations, inline pattern overrides,
- distribution strategy specifications, and comprehensive validation.
-
- Enhanced to support:
- - New [generation] section with imports and prefer_mode
- - Pattern type detection (inline_list, inline_regex, reference)
- - Integration with common module for pattern resolution
- """
-
- from dataclasses import dataclass, field
- from pathlib import Path
- from typing import Dict, List, Optional, Any, Union, Set
- import toml
- import logging
-
- from .models import (
-     SchemaDefinition, FieldDefinition, DistributionStrategy, DistributionType,
-     ValidationResult, ValidationStatus
- )
- from .file_managers import SchemaFileManager, ParsedSchemaFile
- from .pattern_resolver import PatternHierarchyResolver
- from .common_integration import SyntheticPatternLoader, detect_pattern_type_from_toml
- from .exceptions import (
-     SchemaParsingError, ValidationError, PatternResolutionError,
-     DistributionValidationError
- )
-
-
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class ParsedFieldDefinition:
-     """Represents a parsed field definition from schema."""
-     name: str
-     pattern_value: Union[str, List[str]] # Can be reference, regex, or list
-     pattern_type: str # "inline_list", "inline_regex", or "reference"
-     distribution: Optional[DistributionStrategy] = None
-     constraints: List[str] = field(default_factory=list)
-     metadata: Dict[str, Any] = field(default_factory=dict)
-
-
- @dataclass
- class SchemaParsingResult:
-     """Result of comprehensive schema parsing."""
-     schema_definition: SchemaDefinition
-     validation_result: ValidationResult
-     referenced_patterns: Set[str]
-     distribution_strategies: Dict[str, DistributionStrategy]
-     parsing_metadata: Dict[str, Any] = field(default_factory=dict)
-
-
- class SchemaParser:
-     """
-     Comprehensive TOML schema parser with validation and pattern resolution.
-
-     Handles:
-     - Import declaration parsing and validation
-     - Inline pattern override processing
-     - Distribution strategy specification parsing
-     - Pattern reference validation
-     - Schema structure validation
-     """
-
-     def __init__(self, pattern_resolver: Optional[PatternHierarchyResolver] = None,
-                  use_common_integration: bool = True):
-         """
-         Initialize the schema parser.
-
-         Args:
-             pattern_resolver: Pattern hierarchy resolver for validation (legacy)
-             use_common_integration: Whether to use new common_integration module (default: True)
-         """
-         self.schema_file_manager = SchemaFileManager()
-         self.pattern_resolver = pattern_resolver or PatternHierarchyResolver()
-
-         # New: Use common_integration for pattern loading
-         self.use_common_integration = use_common_integration
-         if use_common_integration:
-             self.pattern_loader = SyntheticPatternLoader()
-
-         # Supported distribution types for validation
-         self.supported_distributions = {
-             'equal', 'custom', 'categorical', 'high_cardinality',
-             'numeric_range', 'skewed'
-         }
-
-     def parse_schema_file(self, file_path: str, validate_patterns: bool = True) -> SchemaParsingResult:
-         """
-         Parse a TOML schema file with comprehensive validation.
-
-         Args:
-             file_path: Path to the TOML schema file
-             validate_patterns: Whether to validate referenced patterns exist
-
-         Returns:
-             SchemaParsingResult with parsed schema and validation results
-
-         Raises:
-             SchemaParsingError: If schema parsing fails
-             ValidationError: If validation fails
-         """
-         # Load the raw TOML file
-         parsed_file = self.schema_file_manager.load_toml_schema(file_path)
-
-         # Initialize validation result
-         validation_result = ValidationResult(is_valid=True)
-
-         # Parse field definitions from schema section
-         field_definitions, field_validation = self._parse_field_definitions(
-             parsed_file.schema_definitions, file_path
-         )
-         validation_result.merge(field_validation)
-
-         # Parse distribution strategies
-         distribution_strategies, dist_validation = self._parse_distribution_strategies(
-             parsed_file.schema_definitions, file_path
-         )
-         validation_result.merge(dist_validation)
-
-         # Validate imports
-         import_validation = self._validate_imports(parsed_file.imports, file_path)
-         validation_result.merge(import_validation)
-
-         # Validate inline patterns
-         inline_validation = self._validate_inline_patterns(parsed_file.inline_patterns, file_path)
-         validation_result.merge(inline_validation)
-
-         # Collect all referenced patterns
-         referenced_patterns = self._collect_referenced_patterns(
-             field_definitions, parsed_file.inline_patterns
-         )
-
-         # Validate pattern references if requested
-         if validate_patterns:
-             pattern_validation = self._validate_pattern_references(
-                 referenced_patterns, parsed_file.imports,
-                 parsed_file.inline_patterns, file_path
-             )
-             validation_result.merge(pattern_validation)
-
-         # Create schema definition
-         schema_definition = SchemaDefinition(
-             imports=parsed_file.imports,
-             inline_patterns=parsed_file.inline_patterns,
-             field_definitions={name: self._convert_to_field_definition(field_def, distribution_strategies.get(name))
-                                for name, field_def in field_definitions.items()},
-             metadata=parsed_file.metadata,
-             source_file=file_path
-         )
-
-         # Create parsing metadata
-         parsing_metadata = {
-             'file_path': file_path,
-             'field_count': len(field_definitions),
-             'import_count': len(parsed_file.imports),
-             'inline_pattern_count': len(parsed_file.inline_patterns),
-             'distribution_strategy_count': len(distribution_strategies),
-             'referenced_pattern_count': len(referenced_patterns)
-         }
-
-         return SchemaParsingResult(
-             schema_definition=schema_definition,
-             validation_result=validation_result,
-             referenced_patterns=referenced_patterns,
-             distribution_strategies=distribution_strategies,
-             parsing_metadata=parsing_metadata
-         )
-
-     def _parse_field_definitions(self, schema_definitions: Dict[str, Any],
-                                  file_path: str) -> tuple[Dict[str, ParsedFieldDefinition], ValidationResult]:
-         """
-         Parse field definitions from schema section.
-
-         Enhanced to support:
-         - String patterns (reference or inline regex)
-         - Array patterns (inline list)
-         - Pattern type detection
-
-         Args:
-             schema_definitions: Schema definitions from TOML
-             file_path: File path for error reporting
-
-         Returns:
-             Tuple of (field_definitions, validation_result)
-         """
-         field_definitions = {}
-         validation_result = ValidationResult(is_valid=True)
-
-         for field_name, pattern_spec in schema_definitions.items():
-             try:
-                 # Detect pattern type
-                 pattern_type = detect_pattern_type_from_toml(pattern_spec)
-
-                 if pattern_type == "unknown":
-                     validation_result.add_error(
-                         f"Invalid pattern type for field '{field_name}'",
-                         f"Pattern must be a string (reference/regex) or array (list), got {type(pattern_spec)}"
-                     )
-                     continue
-
-                 # Validate field name
-                 if not self._is_valid_field_name(field_name):
-                     validation_result.add_error(
-                         f"Invalid field name '{field_name}' in schema",
-                         "Field names must be valid identifiers"
-                     )
-                     continue
-
-                 # Create field definition with pattern value and type
-                 field_def = ParsedFieldDefinition(
-                     name=field_name,
-                     pattern_value=pattern_spec,
-                     pattern_type=pattern_type
-                 )
-
-                 field_definitions[field_name] = field_def
-
-                 logger.debug(f"Parsed field '{field_name}': type={pattern_type}, value={pattern_spec}")
-
-             except Exception as e:
-                 validation_result.add_error(
-                     f"Error parsing field '{field_name}': {e}",
-                     "Check field definition syntax"
-                 )
-
-         return field_definitions, validation_result
-
-     def _parse_distribution_strategies(self, schema_definitions: Dict[str, Any],
-                                        file_path: str) -> tuple[Dict[str, DistributionStrategy], ValidationResult]:
-         """
-         Parse distribution strategy specifications from schema.
-
-         Enhanced to handle both string and array patterns.
-
-         Args:
-             schema_definitions: Schema definitions from TOML
-             file_path: File path for error reporting
-
-         Returns:
-             Tuple of (distribution_strategies, validation_result)
-         """
-         distribution_strategies = {}
-         validation_result = ValidationResult(is_valid=True)
-
-         # Look for distribution specifications in metadata or special sections
-         # For now, we'll parse simple distribution specs from pattern specifications
-         for field_name, pattern_spec in schema_definitions.items():
-             try:
-                 # Skip non-string patterns (arrays don't have distribution specs in the pattern)
-                 if not isinstance(pattern_spec, str):
-                     continue
-
-                 distribution = self._extract_distribution_from_spec(pattern_spec)
-                 if distribution:
-                     # Validate distribution strategy
-                     dist_validation = self._validate_distribution_strategy(distribution, field_name)
-                     validation_result.merge(dist_validation)
-
-                     if dist_validation.is_valid:
-                         distribution_strategies[field_name] = distribution
-
-             except Exception as e:
-                 validation_result.add_error(
-                     f"Error parsing distribution for field '{field_name}': {e}",
-                     "Check distribution strategy syntax"
-                 )
-
-         return distribution_strategies, validation_result
-
-     def _validate_imports(self, imports: List[str], file_path: str) -> ValidationResult:
-         """
-         Validate import declarations.
-
-         Enhanced to use common_integration for validation.
-
-         Args:
-             imports: List of import declarations
-             file_path: File path for error reporting
-
-         Returns:
-             ValidationResult with import validation status
-         """
-         validation_result = ValidationResult(is_valid=True)
-
-         # Use common_integration for validation if enabled
-         if self.use_common_integration and hasattr(self, 'pattern_loader'):
-             is_valid, errors = self.pattern_loader.validate_imports(imports)
-             if not is_valid:
-                 for error in errors:
-                     validation_result.add_error(error, "Check import declarations")
-             return validation_result
-
-         # Legacy validation
-         for import_name in imports:
-             # Validate import name format
-             if not self._is_valid_import_name(import_name):
-                 validation_result.add_error(
-                     f"Invalid import name '{import_name}'",
-                     "Import names should be valid pattern file names (without .properties extension)"
-                 )
-
-             # Check for circular imports (basic check)
-             if import_name == "self" or import_name == Path(file_path).stem:
-                 validation_result.add_error(
-                     f"Circular import detected: '{import_name}'",
-                     "Schemas cannot import themselves"
-                 )
-
-         return validation_result
-
-     def _validate_inline_patterns(self, inline_patterns: Dict[str, Union[str, List[str]]],
-                                   file_path: str) -> ValidationResult:
-         """
-         Validate inline pattern definitions.
-
-         Enhanced to support both string (regex) and array (list) patterns.
-
-         Args:
-             inline_patterns: Dictionary of inline patterns
-             file_path: File path for error reporting
-
-         Returns:
-             ValidationResult with inline pattern validation status
-         """
-         validation_result = ValidationResult(is_valid=True)
-
-         for pattern_name, pattern_value in inline_patterns.items():
-             # Validate pattern name
-             if not self._is_valid_pattern_name(pattern_name):
-                 validation_result.add_error(
-                     f"Invalid inline pattern name '{pattern_name}'",
-                     "Pattern names must start with a letter and contain only letters, numbers, and underscores"
-                 )
-
-             # Validate based on pattern type
-             if isinstance(pattern_value, str):
-                 # String pattern (regex)
-                 if not pattern_value.strip():
-                     validation_result.add_error(
-                         f"Empty regex pattern for inline pattern '{pattern_name}'",
-                         "Regex patterns must not be empty"
-                     )
-
-                 # Basic regex syntax validation
-                 try:
-                     import re
-                     re.compile(pattern_value)
-                 except re.error as e:
-                     validation_result.add_error(
-                         f"Invalid regex syntax in inline pattern '{pattern_name}': {e}",
-                         "Check regex syntax for proper escaping and structure"
-                     )
-
-             elif isinstance(pattern_value, list):
-                 # Array pattern (list)
-                 if not pattern_value:
-                     validation_result.add_error(
-                         f"Empty list for inline pattern '{pattern_name}'",
-                         "List patterns must contain at least one value"
-                     )
-
-                 # Validate all items are strings
-                 for i, item in enumerate(pattern_value):
-                     if not isinstance(item, str):
-                         validation_result.add_error(
-                             f"Invalid item type in inline pattern '{pattern_name}' at index {i}",
-                             f"All list items must be strings, got {type(item)}"
-                         )
-
-             else:
-                 validation_result.add_error(
-                     f"Invalid pattern type for inline pattern '{pattern_name}'",
-                     f"Pattern must be a string (regex) or array (list), got {type(pattern_value)}"
-                 )
-
-         return validation_result
-
-     def _collect_referenced_patterns(self, field_definitions: Dict[str, ParsedFieldDefinition],
-                                      inline_patterns: Dict[str, Union[str, List[str]]]) -> Set[str]:
-         """
-         Collect all pattern names referenced in the schema.
-
-         Enhanced to only collect references (not inline patterns).
-
-         Args:
-             field_definitions: Parsed field definitions
-             inline_patterns: Inline pattern definitions
-
-         Returns:
-             Set of all referenced pattern names (excluding inline patterns)
-         """
-         referenced_patterns = set()
-
-         # Add patterns from field definitions (only references, not inline)
-         for field_def in field_definitions.values():
-             if field_def.pattern_type == "reference":
-                 # Only references need to be resolved
-                 referenced_patterns.add(field_def.pattern_value)
-
-         # Inline patterns don't need resolution (they're already defined)
-         # But we track them for completeness
-         referenced_patterns.update(inline_patterns.keys())
-
-         return referenced_patterns
-
-     def _validate_pattern_references(self, referenced_patterns: Set[str],
-                                      imports: List[str], inline_patterns: Dict[str, Union[str, List[str]]],
-                                      file_path: str) -> ValidationResult:
-         """
-         Validate that all referenced patterns can be resolved.
-
-         Enhanced to use common_integration for pattern resolution.
-
-         Args:
-             referenced_patterns: Set of referenced pattern names
-             imports: List of import declarations
-             inline_patterns: Inline pattern definitions
-             file_path: File path for error reporting
-
-         Returns:
-             ValidationResult with pattern reference validation status
-         """
-         validation_result = ValidationResult(is_valid=True)
-
-         # Get prefer_mode from parsed file metadata
-         prefer_mode = "default" # Default value
-
-         # Use common_integration for validation if enabled
-         if self.use_common_integration and hasattr(self, 'pattern_loader'):
-             for pattern_name in referenced_patterns:
-                 # Skip inline patterns (they're already defined)
-                 if pattern_name in inline_patterns:
-                     continue
-
-                 try:
-                     # Try to resolve the pattern using common_integration
-                     self.pattern_loader.load_pattern(
-                         pattern_name,
-                         imports=imports,
-                         prefer_mode=prefer_mode
-                     )
-                     logger.debug(f"Successfully resolved pattern '{pattern_name}'")
-                 except PatternResolutionError as e:
-                     validation_result.add_error(
-                         f"Cannot resolve pattern '{pattern_name}' referenced in schema",
-                         f"Pattern not found in imports {imports} or inline patterns. {str(e)}"
-                     )
-         else:
-             # Legacy validation using PatternHierarchyResolver
-             for pattern_name in referenced_patterns:
-                 try:
-                     # Try to resolve the pattern
-                     self.pattern_resolver.resolve_pattern(
-                         pattern_name,
-                         inline_patterns=inline_patterns,
-                         user_imports=imports
-                     )
-                 except PatternResolutionError as e:
-                     validation_result.add_error(
-                         f"Cannot resolve pattern '{pattern_name}' referenced in schema",
-                         f"Pattern not found in imports {imports} or inline patterns. {str(e)}"
-                     )
-
-         return validation_result
-
-     def _parse_pattern_specification(self, pattern_spec: str) -> tuple[str, List[str]]:
-         """
-         Parse a pattern specification that might include constraints.
-
-         Args:
-             pattern_spec: Pattern specification string
-
-         Returns:
-             Tuple of (pattern_name, constraints)
-         """
-         # For now, simple parsing - just return the pattern name
-         # Future enhancement: parse constraints like "email|min_length:5|max_length:50"
-         parts = pattern_spec.split('|')
-         pattern_name = parts[0].strip()
-         constraints = [part.strip() for part in parts[1:]] if len(parts) > 1 else []
-
-         return pattern_name, constraints
-
-     def _extract_distribution_from_spec(self, pattern_spec: str) -> Optional[DistributionStrategy]:
-         """
-         Extract distribution strategy from pattern specification.
-
-         Args:
-             pattern_spec: Pattern specification string
-
-         Returns:
-             DistributionStrategy if found, None otherwise
-         """
-         # Look for distribution specifications in the pattern spec
-         # Format: "pattern_name|distribution:equal" or "pattern_name|distribution:custom:20,30,50"
-         parts = pattern_spec.split('|')
-
-         for part in parts:
-             part = part.strip()
-             if part.startswith('distribution:'):
-                 dist_spec = part[13:] # Remove 'distribution:' prefix
-                 return self._parse_distribution_spec(dist_spec)
-
-         return None
-
-     def _parse_distribution_spec(self, dist_spec: str) -> DistributionStrategy:
-         """
-         Parse a distribution specification string.
-
-         Args:
-             dist_spec: Distribution specification (e.g., "equal", "custom:20,30,50")
-
-         Returns:
-             DistributionStrategy object
-
-         Raises:
-             DistributionValidationError: If distribution spec is invalid
-         """
-         parts = dist_spec.split(':')
-         dist_type = parts[0].strip()
-
-         if dist_type not in self.supported_distributions:
-             raise DistributionValidationError(
-                 f"Unsupported distribution type '{dist_type}'",
-                 dist_type,
-                 f"Supported types: {', '.join(self.supported_distributions)}"
-             )
-
-         # Parse distribution parameters
-         parameters = {}
-         if len(parts) > 1:
-             param_str = parts[1].strip()
-             if dist_type == 'custom':
-                 # Parse weights: "20,30,50"
-                 try:
-                     weights = [float(w.strip()) for w in param_str.split(',')]
-                     parameters['weights'] = weights
-                 except ValueError:
-                     raise DistributionValidationError(
-                         f"Invalid weights specification '{param_str}'",
-                         dist_type,
-                         "Weights must be comma-separated numbers"
-                     )
-             elif dist_type == 'categorical':
-                 # Parse categories: "A,B,C"
-                 categories = [cat.strip() for cat in param_str.split(',')]
-                 parameters['categories'] = categories
-             elif dist_type == 'numeric_range':
-                 # Parse range: "1,100"
-                 try:
-                     range_parts = param_str.split(',')
-                     if len(range_parts) == 2:
-                         parameters['min'] = float(range_parts[0].strip())
-                         parameters['max'] = float(range_parts[1].strip())
-                     else:
-                         raise ValueError("Range must have exactly 2 values")
-                 except ValueError:
-                     raise DistributionValidationError(
-                         f"Invalid range specification '{param_str}'",
-                         dist_type,
-                         "Range must be in format 'min,max'"
-                     )
-
-         return DistributionStrategy(
-             strategy_type=DistributionType(dist_type),
-             parameters=parameters
-         )
-
-     def _validate_distribution_strategy(self, distribution: DistributionStrategy,
-                                         field_name: str) -> ValidationResult:
-         """
-         Validate a distribution strategy.
-
-         Args:
-             distribution: Distribution strategy to validate
-             field_name: Field name for error reporting
-
-         Returns:
-             ValidationResult with distribution validation status
-         """
-         validation_result = ValidationResult(is_valid=True)
-
-         try:
-             # Validate based on distribution type
-             if distribution.strategy_type == DistributionType.CUSTOM:
-                 weights = distribution.parameters.get('weights', [])
-                 if not weights:
-                     validation_result.add_error(
-                         f"Custom distribution for field '{field_name}' missing weights",
-                         "Custom distributions require weight parameters"
-                     )
-                 elif abs(sum(weights) - 100.0) > 0.01:
-                     validation_result.add_error(
-                         f"Custom distribution weights for field '{field_name}' must sum to 100",
-                         f"Current sum: {sum(weights)}"
-                     )
-
-             elif distribution.strategy_type == DistributionType.CATEGORICAL:
-                 categories = distribution.parameters.get('categories', [])
-                 if not categories:
-                     validation_result.add_error(
-                         f"Categorical distribution for field '{field_name}' missing categories",
-                         "Categorical distributions require category parameters"
-                     )
-
-             elif distribution.strategy_type == DistributionType.NUMERIC_RANGE:
-                 min_val = distribution.parameters.get('min')
-                 max_val = distribution.parameters.get('max')
-                 if min_val is None or max_val is None:
-                     validation_result.add_error(
-                         f"Numeric range distribution for field '{field_name}' missing range parameters",
-                         "Numeric range distributions require min and max parameters"
-                     )
-                 elif min_val >= max_val:
-                     validation_result.add_error(
-                         f"Invalid range for field '{field_name}': min must be less than max",
-                         f"Current range: {min_val} to {max_val}"
-                     )
-
-         except Exception as e:
-             validation_result.add_error(
-                 f"Error validating distribution for field '{field_name}': {e}",
-                 "Check distribution parameters"
-             )
-
-         return validation_result
-
-     def _convert_to_field_definition(self, parsed_field: ParsedFieldDefinition,
-                                      distribution: Optional[DistributionStrategy] = None) -> FieldDefinition:
-         """
-         Convert parsed field definition to FieldDefinition model.
-
-         Enhanced to handle new pattern structure with pattern_value and pattern_type.
-
-         Args:
-             parsed_field: Parsed field definition
-             distribution: Optional distribution strategy
-
-         Returns:
-             FieldDefinition object
-         """
-         # For reference patterns, use the pattern_value as pattern_name
-         # For inline patterns, we'll need to handle them differently in the generator
-         pattern_name = parsed_field.pattern_value if parsed_field.pattern_type == "reference" else parsed_field.name
-
-         return FieldDefinition(
-             name=parsed_field.name,
-             pattern_name=pattern_name,
-             distribution=distribution,
-             constraints=parsed_field.constraints
-         )
-
-     def _is_valid_field_name(self, name: str) -> bool:
-         """Check if a field name is valid."""
-         import keyword
-         return (name and
-                 name.isidentifier() and
-                 not name.startswith('_') and
-                 not keyword.iskeyword(name) and
-                 name.replace('_', '').isalnum())
-
-     def _is_valid_pattern_name(self, name: str) -> bool:
-         """Check if a pattern name is valid."""
-         import keyword
-         return (name and
-                 name.isidentifier() and
-                 not name.startswith('_') and
-                 not keyword.iskeyword(name) and
-                 name.replace('_', '').isalnum())
-
-     def _is_valid_import_name(self, name: str) -> bool:
-         """Check if an import name is valid."""
-         return (name and
-                 name.replace('_', '').replace('-', '').isalnum() and
-                 not name.startswith('_') and
-                 len(name) <= 50)
-
-     def clear_cache(self):
-         """Clear all caches."""
-         self.schema_file_manager.clear_cache()
-         self.pattern_resolver.clear_cache()
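
For context, the sketch below shows how the removed SchemaParser and its documented "pattern|distribution:type:params" spec could have been driven against 0.1.0a2; the module no longer exists in 0.1.0a4. Everything here is inferred only from the deleted code above: the schema file name and field entries are hypothetical, and while the module docstring confirms a [generation] section, the [schema] section name is an assumption based on the code's references to a "schema section".

# Hypothetical usage sketch (0.1.0a2 only; additory.synthetic.schema_parser was removed in 0.1.0a4).
#
# Assumed schema file "users_schema.toml":
#
#   [generation]                    # section documented in the module docstring
#   imports = ["contact_patterns"]  # hypothetical import name
#
#   [schema]                        # section name assumed, not confirmed by the source
#   email = "email|distribution:custom:20,30,50"  # reference pattern; custom weights must sum to 100
#   status = ["active", "inactive"]               # inline list pattern

from additory.synthetic.schema_parser import SchemaParser

parser = SchemaParser()  # defaults to the common_integration pattern loader
result = parser.parse_schema_file("users_schema.toml", validate_patterns=True)

if not result.validation_result.is_valid:
    raise SystemExit("schema validation failed")

print(result.parsing_metadata["field_count"])       # 2 fields parsed
print(result.distribution_strategies.get("email"))  # custom strategy, weights [20.0, 30.0, 50.0]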