additory 0.1.0a2__py3-none-any.whl → 0.1.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/config.py +3 -3
- additory/core/polars_expression_engine.py +66 -16
- additory/core/registry.py +4 -3
- additory/dynamic_api.py +95 -51
- additory/expressions/proxy.py +4 -1
- additory/expressions/registry.py +3 -3
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/synthetic/deduce.py +259 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +87 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
additory/synthetic/validator.py
DELETED
|
@@ -1,553 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Comprehensive validation system for synthetic data generation.
|
|
3
|
-
|
|
4
|
-
Implements fail-fast validation with clear error messages, line numbers,
|
|
5
|
-
and actionable suggestions for fixing configuration issues.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import re
|
|
9
|
-
import os
|
|
10
|
-
from typing import Dict, List, Optional, Set, Tuple, Any, Union
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
from dataclasses import dataclass
|
|
13
|
-
import polars as pl
|
|
14
|
-
|
|
15
|
-
from .models import (
|
|
16
|
-
ValidationResult,
|
|
17
|
-
PatternDefinition,
|
|
18
|
-
SchemaDefinition,
|
|
19
|
-
DistributionStrategy,
|
|
20
|
-
DistributionType,
|
|
21
|
-
PatternSource,
|
|
22
|
-
ValidationStatus,
|
|
23
|
-
ResolvedPattern,
|
|
24
|
-
FieldDefinition
|
|
25
|
-
)
|
|
26
|
-
from .exceptions import ValidationError, SyntheticDataError
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@dataclass
class ValidationContext:
    """Location details that can be attached to a validation finding.

    Every field is optional and defaults to ``None``; callers fill in only
    the pieces of location information they actually have.
    """

    # Path of the file the finding refers to.
    file_path: Optional[str] = None
    # Line number inside ``file_path`` the finding refers to.
    line_number: Optional[int] = None
    # Name of the schema field being validated.
    field_name: Optional[str] = None
    # Name of the pattern being validated.
    pattern_name: Optional[str] = None
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
@dataclass
class ValidationIssue:
    """One validation finding, with its location and an optional fix hint."""

    # Severity label: "error", "warning" or "info".
    severity: str
    # Human-readable description of the problem.
    message: str
    # Where the problem was found.
    context: ValidationContext
    # Optional hint describing how to resolve the problem.
    suggestion: Optional[str] = None
    # Optional machine-readable identifier, e.g. "REGEX_SYNTAX_ERROR".
    code: Optional[str] = None
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class RegexValidator:
    """Checks regex patterns for syntax errors and polars compatibility.

    Problems are reported as ``ValidationIssue`` objects instead of being
    raised, so callers can collect several findings in one pass.
    """

    def __init__(self):
        """Build the list of regex constructs that are reported as errors."""
        # Each entry is itself a regex that is searched for inside the
        # pattern under validation.  The ``(?<!\\)`` prefix skips a group
        # opener that is directly preceded by a backslash.
        # NOTE(review): named groups, non-capturing groups and \b are
        # treated as incompatible here - confirm against the regex engine
        # polars actually uses whether all of these must be rejected.
        self.polars_incompatible_features = [
            r'(?<!\\)\(\?P<',   # named groups
            r'(?<!\\)\(\?:',    # non-capturing groups (some cases)
            r'(?<!\\)\(\?=',    # positive lookahead
            r'(?<!\\)\(\?!',    # negative lookahead
            r'(?<!\\)\(\?<=',   # positive lookbehind
            r'(?<!\\)\(\?<!',   # negative lookbehind
            r'\\A',             # start of string (use ^ instead)
            r'\\Z',             # end of string (use $ instead)
            r'\\b',             # word boundary (limited support)
            r'\\B',             # non-word boundary
        ]

    def validate_regex_pattern(self, pattern: str, context: ValidationContext) -> List[ValidationIssue]:
        """Run every regex check on ``pattern`` and return the findings.

        Args:
            pattern: Regex source text to validate.
            context: Location information copied onto every finding.

        Returns:
            A list of issues.  When ``pattern`` does not compile with
            Python's ``re`` module, the list holds only the syntax error
            and the remaining checks are skipped.
        """
        # A pattern that does not even compile cannot be inspected further.
        try:
            re.compile(pattern)
        except re.error as exc:
            return [ValidationIssue(
                severity="error",
                message=f"Invalid regex syntax: {exc}",
                context=context,
                suggestion="Fix the regex syntax error. Common issues: unmatched brackets, invalid escape sequences",
                code="REGEX_SYNTAX_ERROR",
            )]

        findings: List[ValidationIssue] = []

        # One error per listed construct that occurs in the pattern.
        for feature_regex in self.polars_incompatible_features:
            if not re.search(feature_regex, pattern):
                continue
            label = self._get_feature_name(feature_regex)
            findings.append(ValidationIssue(
                severity="error",
                message=f"Regex pattern contains polars-incompatible feature: {label}",
                context=context,
                suggestion=self._get_compatibility_suggestion(label),
                code="POLARS_INCOMPATIBLE",
            ))

        # Heuristic warnings first, then the live polars probe.
        findings.extend(self._check_common_regex_issues(pattern, context))
        findings.extend(self._test_polars_compatibility(pattern, context))
        return findings

    def _get_feature_name(self, pattern: str) -> str:
        """Translate a detector regex into its human-readable feature name.

        Returns "unknown feature" when ``pattern`` is not a known detector.
        """
        names_by_detector = {
            r'(?<!\\)\(\?P<': "named groups",
            r'(?<!\\)\(\?:': "non-capturing groups",
            r'(?<!\\)\(\?=': "positive lookahead",
            r'(?<!\\)\(\?!': "negative lookahead",
            r'(?<!\\)\(\?<=': "positive lookbehind",
            r'(?<!\\)\(\?<!': "negative lookbehind",
            r'\\A': "start of string anchor (\\A)",
            r'\\Z': "end of string anchor (\\Z)",
            r'\\b': "word boundary (\\b)",
            r'\\B': "non-word boundary (\\B)",
        }
        return names_by_detector.get(pattern, "unknown feature")

    def _get_compatibility_suggestion(self, feature_name: str) -> str:
        """Return the fix hint for a feature name, or a generic fallback."""
        hints_by_feature = {
            "named groups": "Use regular capturing groups () instead of named groups (?P<name>)",
            "non-capturing groups": "Use regular capturing groups () instead of non-capturing groups (?:)",
            "positive lookahead": "Rewrite pattern without lookahead assertions",
            "negative lookahead": "Rewrite pattern without lookahead assertions",
            "positive lookbehind": "Rewrite pattern without lookbehind assertions",
            "negative lookbehind": "Rewrite pattern without lookbehind assertions",
            "start of string anchor (\\A)": "Use ^ instead of \\A for start of string",
            "end of string anchor (\\Z)": "Use $ instead of \\Z for end of string",
            "word boundary (\\b)": "Use character classes like [A-Za-z0-9] instead of \\b",
            "non-word boundary (\\B)": "Use character classes instead of \\B",
        }
        return hints_by_feature.get(feature_name, "Rewrite pattern to be polars-compatible")

    def _check_common_regex_issues(self, pattern: str, context: ValidationContext) -> List[ValidationIssue]:
        """Flag patterns that are very long, repeat ``.*`` or are unanchored.

        The three checks are independent; each adds at most one finding
        (two warnings and one info notice, in that order).
        """
        findings: List[ValidationIssue] = []

        # More than 200 characters counts as "very long".
        too_long = len(pattern) > 200
        if too_long:
            findings.append(ValidationIssue(
                severity="warning",
                message="Regex pattern is very long and may impact performance",
                context=context,
                suggestion="Consider simplifying the pattern or breaking it into multiple patterns",
                code="COMPLEX_PATTERN",
            ))

        # Two directly adjacent ".*" wildcards.
        doubled_wildcard = re.search(r'\.\*\.\*', pattern)
        if doubled_wildcard:
            findings.append(ValidationIssue(
                severity="warning",
                message="Pattern contains multiple .* which may be inefficient",
                context=context,
                suggestion="Consider using more specific character classes",
                code="INEFFICIENT_PATTERN",
            ))

        # Only reported when BOTH the leading ^ and the trailing $ are absent.
        if not (pattern.startswith('^') or pattern.endswith('$')):
            findings.append(ValidationIssue(
                severity="info",
                message="Pattern lacks anchors (^ and $) - may match partial strings",
                context=context,
                suggestion="Add ^ at start and $ at end for exact matching",
                code="MISSING_ANCHORS",
            ))

        return findings

    def _test_polars_compatibility(self, pattern: str, context: ValidationContext) -> List[ValidationIssue]:
        """Try ``pattern`` against a small polars series.

        Returns an empty list when ``str.contains`` accepts the pattern,
        otherwise a single error carrying the exception text.
        """
        try:
            # Small fixed sample; only whether the call succeeds matters.
            probe = pl.Series(
                "test",
                ["test@example.com", "123-456-7890", "John Doe", "invalid"],
            )
            probe.str.contains(pattern)
        except Exception as exc:
            return [ValidationIssue(
                severity="error",
                message=f"Pattern failed polars compatibility test: {exc}",
                context=context,
                suggestion="Modify pattern to be compatible with polars regex engine",
                code="POLARS_TEST_FAILED",
            )]
        return []
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
class SchemaValidator:
    """Validates schema fields, their patterns and distribution strategies."""

    def __init__(self, regex_validator: RegexValidator):
        """Store the regex validator used for all pattern checks."""
        self.regex_validator = regex_validator

    def validate_schema_definition(self, schema: SchemaDefinition,
                                   resolved_patterns: Dict[str, ResolvedPattern],
                                   file_path: Optional[str] = None) -> List[ValidationIssue]:
        """Validate every field and inline pattern of ``schema``.

        Args:
            schema: Schema whose ``field_definitions`` and
                ``inline_patterns`` are checked.
            resolved_patterns: Known patterns keyed by pattern name.
            file_path: Optional path recorded in each finding's context.

        Returns:
            All findings: field findings first, then inline-pattern findings.
        """
        findings: List[ValidationIssue] = []

        for field_name, field_def in schema.field_definitions.items():
            field_ctx = ValidationContext(
                file_path=file_path,
                field_name=field_name,
                pattern_name=field_def.pattern_name,
            )

            if field_def.pattern_name in resolved_patterns:
                resolved = resolved_patterns[field_def.pattern_name]
                # Pattern checks first, then the distribution strategy.
                findings.extend(self._validate_field_pattern(field_def, resolved, field_ctx))
                findings.extend(
                    self._validate_distribution_strategy(field_def.distribution, resolved, field_ctx)
                )
            else:
                # Unknown pattern: nothing further can be checked for this field.
                findings.append(ValidationIssue(
                    severity="error",
                    message=f"Pattern '{field_def.pattern_name}' not found for field '{field_name}'",
                    context=field_ctx,
                    suggestion=f"Define pattern '{field_def.pattern_name}' or use an existing pattern",
                    code="PATTERN_NOT_FOUND",
                ))

        # Inline patterns are raw regex strings and only get the regex checks.
        for inline_name, inline_regex in schema.inline_patterns.items():
            inline_ctx = ValidationContext(file_path=file_path, pattern_name=inline_name)
            findings.extend(self.regex_validator.validate_regex_pattern(inline_regex, inline_ctx))

        return findings

    def _validate_field_pattern(self, field_def: FieldDefinition,
                                pattern: ResolvedPattern,
                                context: ValidationContext) -> List[ValidationIssue]:
        """Check a resolved pattern's flags and then its regex.

        ``field_def`` is accepted for signature symmetry but is not read.
        """
        findings: List[ValidationIssue] = []

        if not pattern.is_valid:
            findings.append(ValidationIssue(
                severity="error",
                message=f"Pattern '{pattern.definition.name}' is marked as invalid",
                context=context,
                suggestion="Fix the pattern definition or use a different pattern",
                code="INVALID_PATTERN",
            ))

        if not pattern.definition.polars_compatible:
            findings.append(ValidationIssue(
                severity="error",
                message=f"Pattern '{pattern.definition.name}' is not polars-compatible",
                context=context,
                suggestion="Use a polars-compatible pattern or modify the regex",
                code="PATTERN_NOT_POLARS_COMPATIBLE",
            ))

        # The regex text is always validated, whatever the flags above say.
        regex_findings = self.regex_validator.validate_regex_pattern(
            pattern.definition.regex, context
        )
        findings.extend(regex_findings)
        return findings

    def _validate_distribution_strategy(self, distribution: DistributionStrategy,
                                        pattern: ResolvedPattern,
                                        context: ValidationContext) -> List[ValidationIssue]:
        """Check that ``distribution`` fits ``pattern`` and has its parameters.

        Only NUMERIC_RANGE, CUSTOM and CATEGORICAL are inspected; any other
        strategy type yields no findings.
        """
        if distribution.strategy_type == DistributionType.NUMERIC_RANGE:
            if self._is_numeric_pattern(pattern.definition.regex):
                return []
            return [ValidationIssue(
                severity="error",
                message="Numeric range distribution requires a numeric pattern",
                context=context,
                suggestion="Use a numeric pattern or change distribution strategy",
                code="DISTRIBUTION_PATTERN_MISMATCH",
            )]

        if distribution.strategy_type == DistributionType.CUSTOM:
            if 'weights' in distribution.parameters:
                return []
            return [ValidationIssue(
                severity="error",
                message="Custom distribution requires 'weights' parameter",
                context=context,
                suggestion="Add weights parameter: custom[value1:50%, value2:30%, value3:20%]",
                code="MISSING_DISTRIBUTION_PARAMETER",
            )]

        if distribution.strategy_type == DistributionType.CATEGORICAL:
            if 'categories' in distribution.parameters:
                return []
            return [ValidationIssue(
                severity="error",
                message="Categorical distribution requires 'categories' parameter",
                context=context,
                suggestion="Add categories parameter: categorical[cat1, cat2, cat3]",
                code="MISSING_DISTRIBUTION_PARAMETER",
            )]

        return []

    def _is_numeric_pattern(self, regex: str) -> bool:
        """Return True when ``regex`` contains a digit class as a substring.

        This is a plain substring test, not a semantic analysis of the regex.
        """
        # For string input the "+" variants are already covered by the
        # shorter tokens; they are kept to mirror the original list.
        for token in (r'\d', r'[0-9]', r'\d+', r'[0-9]+'):
            if token in regex:
                return True
        return False
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
class FileValidator:
    """Validates the syntax of .properties and TOML configuration text."""

    def validate_properties_file(self, file_path: str, content: str) -> List[ValidationIssue]:
        """Check each ``key=value`` line of a .properties document.

        Args:
            file_path: Path recorded in each finding's context; the file
                itself is not read here.
            content: Full text of the properties file.

        Returns:
            Findings for lines without ``=``, invalid keys and empty values.
            Blank lines and lines starting with ``#`` are skipped.
        """
        findings: List[ValidationIssue] = []

        # Line numbers are 1-based, matching what an editor shows.
        for line_num, raw_line in enumerate(content.split('\n'), 1):
            text = raw_line.strip()
            if text == '' or text.startswith('#'):
                continue

            where = ValidationContext(file_path=file_path, line_number=line_num)

            # Split on the first "=" only, so the regex value may contain "=".
            key, separator, value = text.partition('=')
            if not separator:
                findings.append(ValidationIssue(
                    severity="error",
                    message="Invalid properties format - missing '=' separator",
                    context=where,
                    suggestion="Use format: pattern_name=^regex_pattern$",
                    code="INVALID_PROPERTIES_FORMAT",
                ))
                continue

            key = key.strip()
            value = value.strip()

            # Keys must look like identifiers.
            if re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', key) is None:
                findings.append(ValidationIssue(
                    severity="error",
                    message=f"Invalid pattern name '{key}' - must start with letter/underscore",
                    context=where,
                    suggestion="Use alphanumeric characters and underscores only",
                    code="INVALID_PATTERN_NAME",
                ))

            # An empty value is reported even when the key was also invalid.
            if not value:
                findings.append(ValidationIssue(
                    severity="error",
                    message=f"Empty regex pattern for '{key}'",
                    context=where,
                    suggestion="Provide a valid regex pattern",
                    code="EMPTY_PATTERN",
                ))

        return findings

    def validate_toml_file(self, file_path: str, content: str) -> List[ValidationIssue]:
        """Parse ``content`` as TOML and check for a top-level ``schema`` key.

        Args:
            file_path: Path recorded in each finding's context.
            content: Full text of the TOML file.

        Returns:
            One error if no TOML parser can be imported or parsing fails,
            one warning if ``schema`` is missing, otherwise an empty list.
        """
        where = ValidationContext(file_path=file_path)

        # Prefer the stdlib parser and fall back to the ``tomli`` package.
        try:
            import tomllib
        except ImportError:
            try:
                import tomli as tomllib
            except ImportError:
                return [ValidationIssue(
                    severity="error",
                    message="TOML parsing library not available",
                    context=where,
                    suggestion="Install tomli: pip install tomli",
                    code="TOML_LIBRARY_MISSING",
                )]

        try:
            document = tomllib.loads(content)
        except Exception as exc:
            return [ValidationIssue(
                severity="error",
                message=f"Invalid TOML syntax: {exc}",
                context=where,
                suggestion="Fix TOML syntax errors",
                code="INVALID_TOML_SYNTAX",
            )]

        if 'schema' in document:
            return []

        return [ValidationIssue(
            severity="warning",
            message="No [schema] section found in TOML file",
            context=where,
            suggestion="Add [schema] section with field definitions",
            code="MISSING_SCHEMA_SECTION",
        )]
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
class ValidationSystem:
    """Entry point combining the regex, schema and file validators."""

    def __init__(self):
        """Create the three validators; the schema one shares the regex one."""
        self.regex_validator = RegexValidator()
        self.schema_validator = SchemaValidator(self.regex_validator)
        self.file_validator = FileValidator()

    def validate_complete_configuration(self,
                                        schema: SchemaDefinition,
                                        resolved_patterns: Dict[str, ResolvedPattern],
                                        schema_file_path: Optional[str] = None) -> ValidationResult:
        """Validate the schema and every resolved pattern.

        Args:
            schema: Schema definition to check.
            resolved_patterns: Known patterns keyed by pattern name.
            schema_file_path: Optional path used in schema findings.

        Returns:
            A ``ValidationResult`` holding the "error" and "warning"
            findings.  Findings of any other severity (e.g. "info") are
            not copied into the result.
        """
        # Schema-level findings come first ...
        findings = list(self.schema_validator.validate_schema_definition(
            schema, resolved_patterns, schema_file_path
        ))

        # ... followed by a regex check of every resolved pattern, located
        # at the pattern's own source file and line.
        for pattern_name, pattern in resolved_patterns.items():
            origin = ValidationContext(
                file_path=pattern.definition.source_file,
                line_number=pattern.definition.line_number,
                pattern_name=pattern_name,
            )
            findings.extend(
                self.regex_validator.validate_regex_pattern(pattern.definition.regex, origin)
            )

        # Starts out valid; presumably add_error marks it invalid
        # (ValidationResult is defined in .models - confirm there).
        outcome = ValidationResult(is_valid=True)
        for finding in findings:
            if finding.severity == "error":
                outcome.add_error(
                    self._format_issue_message(finding),
                    finding.suggestion or "No suggestion available",
                )
            elif finding.severity == "warning":
                outcome.add_warning(self._format_issue_message(finding))

        return outcome

    def validate_file_syntax(self, file_path: str) -> ValidationResult:
        """Read ``file_path`` and validate it according to its extension.

        ``.properties`` and ``.toml`` files are handed to the file
        validator; any other extension is reported as an error.  A missing
        or unreadable file produces a result created with
        ``is_valid=False`` and a single error.
        """
        if not os.path.exists(file_path):
            missing = ValidationResult(is_valid=False)
            missing.add_error(f"File not found: {file_path}", "Check file path and permissions")
            return missing

        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                content = handle.read()
        except Exception as exc:
            unreadable = ValidationResult(is_valid=False)
            unreadable.add_error(f"Cannot read file: {exc}", "Check file permissions and encoding")
            return unreadable

        # Dispatch on the (case-sensitive) file extension.
        if file_path.endswith('.properties'):
            findings = self.file_validator.validate_properties_file(file_path, content)
        elif file_path.endswith('.toml'):
            findings = self.file_validator.validate_toml_file(file_path, content)
        else:
            findings = [ValidationIssue(
                severity="error",
                message=f"Unsupported file type: {Path(file_path).suffix}",
                context=ValidationContext(file_path=file_path),
                suggestion="Use .properties for patterns or .toml for schemas",
                code="UNSUPPORTED_FILE_TYPE",
            )]

        outcome = ValidationResult(is_valid=True)
        for finding in findings:
            if finding.severity == "error":
                # Here the suggestion is appended to the message text
                # instead of being passed as a separate argument.
                text = self._format_issue_message(finding)
                if finding.suggestion:
                    text += f" | Suggestion: {finding.suggestion}"
                outcome.add_error(text)
            elif finding.severity == "warning":
                outcome.add_warning(self._format_issue_message(finding))

        return outcome

    def _format_issue_message(self, issue: ValidationIssue) -> str:
        """Prefix the issue message with whatever context fields are set.

        Fields appear in the order File, Line, Field, Pattern and are
        joined with " | "; unset (falsy) fields are omitted.
        """
        ctx = issue.context
        labelled = (
            ("File", ctx.file_path),
            ("Line", ctx.line_number),
            ("Field", ctx.field_name),
            ("Pattern", ctx.pattern_name),
        )
        prefix = " | ".join(f"{label}: {value}" for label, value in labelled if value)

        if not prefix:
            return issue.message
        return f"{prefix} | {issue.message}"

    def validate_fail_fast(self,
                           schema: SchemaDefinition,
                           resolved_patterns: Dict[str, ResolvedPattern],
                           schema_file_path: Optional[str] = None) -> None:
        """Validate the configuration and raise if it is not valid.

        Raises:
            ValidationError: When the result is not valid; the message
                lists every error on its own line.
        """
        outcome = self.validate_complete_configuration(schema, resolved_patterns, schema_file_path)
        if outcome.is_valid:
            return

        collected = list(outcome.errors)
        bullet_lines = "\n".join(f" - {error}" for error in collected)
        raise ValidationError(
            f"Validation failed with {len(outcome.errors)} error(s):\n" + bullet_lines
        )
|
|
File without changes
|
|
File without changes
|