additory 0.1.0a2__py3-none-any.whl → 0.1.0a4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/config.py +3 -3
  8. additory/core/polars_expression_engine.py +66 -16
  9. additory/core/registry.py +4 -3
  10. additory/dynamic_api.py +95 -51
  11. additory/expressions/proxy.py +4 -1
  12. additory/expressions/registry.py +3 -3
  13. additory/synthetic/__init__.py +7 -95
  14. additory/synthetic/column_name_resolver.py +149 -0
  15. additory/synthetic/deduce.py +259 -0
  16. additory/{augment → synthetic}/distributions.py +2 -2
  17. additory/{augment → synthetic}/forecast.py +1 -1
  18. additory/synthetic/linked_list_parser.py +415 -0
  19. additory/synthetic/namespace_lookup.py +129 -0
  20. additory/{augment → synthetic}/smote.py +1 -1
  21. additory/{augment → synthetic}/strategies.py +87 -44
  22. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  23. additory/utilities/units.py +4 -1
  24. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
  25. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
  26. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
  27. additory/augment/__init__.py +0 -24
  28. additory/augment/builtin_lists.py +0 -430
  29. additory/augment/list_registry.py +0 -177
  30. additory/synthetic/api.py +0 -220
  31. additory/synthetic/common_integration.py +0 -314
  32. additory/synthetic/config.py +0 -262
  33. additory/synthetic/engines.py +0 -529
  34. additory/synthetic/exceptions.py +0 -180
  35. additory/synthetic/file_managers.py +0 -518
  36. additory/synthetic/generator.py +0 -702
  37. additory/synthetic/generator_parser.py +0 -68
  38. additory/synthetic/integration.py +0 -319
  39. additory/synthetic/models.py +0 -241
  40. additory/synthetic/pattern_resolver.py +0 -573
  41. additory/synthetic/performance.py +0 -469
  42. additory/synthetic/polars_integration.py +0 -464
  43. additory/synthetic/proxy.py +0 -60
  44. additory/synthetic/schema_parser.py +0 -685
  45. additory/synthetic/validator.py +0 -553
  46. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
  47. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
@@ -1,553 +0,0 @@
1
- """
2
- Comprehensive validation system for synthetic data generation.
3
-
4
- Implements fail-fast validation with clear error messages, line numbers,
5
- and actionable suggestions for fixing configuration issues.
6
- """
7
-
8
- import re
9
- import os
10
- from typing import Dict, List, Optional, Set, Tuple, Any, Union
11
- from pathlib import Path
12
- from dataclasses import dataclass
13
- import polars as pl
14
-
15
- from .models import (
16
- ValidationResult,
17
- PatternDefinition,
18
- SchemaDefinition,
19
- DistributionStrategy,
20
- DistributionType,
21
- PatternSource,
22
- ValidationStatus,
23
- ResolvedPattern,
24
- FieldDefinition
25
- )
26
- from .exceptions import ValidationError, SyntheticDataError
27
-
28
-
29
- @dataclass
30
- class ValidationContext:
31
- """Context information for validation operations."""
32
- file_path: Optional[str] = None
33
- line_number: Optional[int] = None
34
- field_name: Optional[str] = None
35
- pattern_name: Optional[str] = None
36
-
37
-
38
- @dataclass
39
- class ValidationIssue:
40
- """Represents a validation issue with context and suggestions."""
41
- severity: str # "error", "warning", "info"
42
- message: str
43
- context: ValidationContext
44
- suggestion: Optional[str] = None
45
- code: Optional[str] = None
46
-
47
-
48
- class RegexValidator:
49
- """Validates regex patterns for polars compatibility and correctness."""
50
-
51
- def __init__(self):
52
- """Initialize the regex validator."""
53
- self.polars_incompatible_features = [
54
- r'(?<!\\)\(\?P<', # Named groups
55
- r'(?<!\\)\(\?:', # Non-capturing groups (some cases)
56
- r'(?<!\\)\(\?=', # Positive lookahead
57
- r'(?<!\\)\(\?!', # Negative lookahead
58
- r'(?<!\\)\(\?<=', # Positive lookbehind
59
- r'(?<!\\)\(\?<!', # Negative lookbehind
60
- r'\\A', # Start of string (use ^ instead)
61
- r'\\Z', # End of string (use $ instead)
62
- r'\\b', # Word boundary (limited support)
63
- r'\\B', # Non-word boundary
64
- ]
65
-
66
- def validate_regex_pattern(self, pattern: str, context: ValidationContext) -> List[ValidationIssue]:
67
- """Validate a regex pattern for correctness and polars compatibility."""
68
- issues = []
69
-
70
- # Check basic regex syntax
71
- try:
72
- re.compile(pattern)
73
- except re.error as e:
74
- issues.append(ValidationIssue(
75
- severity="error",
76
- message=f"Invalid regex syntax: {e}",
77
- context=context,
78
- suggestion="Fix the regex syntax error. Common issues: unmatched brackets, invalid escape sequences",
79
- code="REGEX_SYNTAX_ERROR"
80
- ))
81
- return issues # Can't continue validation with invalid regex
82
-
83
- # Check for polars incompatible features
84
- for incompatible_feature in self.polars_incompatible_features:
85
- if re.search(incompatible_feature, pattern):
86
- feature_name = self._get_feature_name(incompatible_feature)
87
- issues.append(ValidationIssue(
88
- severity="error",
89
- message=f"Regex pattern contains polars-incompatible feature: {feature_name}",
90
- context=context,
91
- suggestion=self._get_compatibility_suggestion(feature_name),
92
- code="POLARS_INCOMPATIBLE"
93
- ))
94
-
95
- # Check for common issues
96
- issues.extend(self._check_common_regex_issues(pattern, context))
97
-
98
- # Test pattern with polars
99
- issues.extend(self._test_polars_compatibility(pattern, context))
100
-
101
- return issues
102
-
103
- def _get_feature_name(self, pattern: str) -> str:
104
- """Get human-readable name for regex feature."""
105
- feature_map = {
106
- r'(?<!\\)\(\?P<': "named groups",
107
- r'(?<!\\)\(\?:': "non-capturing groups",
108
- r'(?<!\\)\(\?=': "positive lookahead",
109
- r'(?<!\\)\(\?!': "negative lookahead",
110
- r'(?<!\\)\(\?<=': "positive lookbehind",
111
- r'(?<!\\)\(\?<!': "negative lookbehind",
112
- r'\\A': "start of string anchor (\\A)",
113
- r'\\Z': "end of string anchor (\\Z)",
114
- r'\\b': "word boundary (\\b)",
115
- r'\\B': "non-word boundary (\\B)",
116
- }
117
- return feature_map.get(pattern, "unknown feature")
118
-
119
- def _get_compatibility_suggestion(self, feature_name: str) -> str:
120
- """Get suggestion for making regex polars-compatible."""
121
- suggestions = {
122
- "named groups": "Use regular capturing groups () instead of named groups (?P<name>)",
123
- "non-capturing groups": "Use regular capturing groups () instead of non-capturing groups (?:)",
124
- "positive lookahead": "Rewrite pattern without lookahead assertions",
125
- "negative lookahead": "Rewrite pattern without lookahead assertions",
126
- "positive lookbehind": "Rewrite pattern without lookbehind assertions",
127
- "negative lookbehind": "Rewrite pattern without lookbehind assertions",
128
- "start of string anchor (\\A)": "Use ^ instead of \\A for start of string",
129
- "end of string anchor (\\Z)": "Use $ instead of \\Z for end of string",
130
- "word boundary (\\b)": "Use character classes like [A-Za-z0-9] instead of \\b",
131
- "non-word boundary (\\B)": "Use character classes instead of \\B",
132
- }
133
- return suggestions.get(feature_name, "Rewrite pattern to be polars-compatible")
134
-
135
- def _check_common_regex_issues(self, pattern: str, context: ValidationContext) -> List[ValidationIssue]:
136
- """Check for common regex issues that might cause problems."""
137
- issues = []
138
-
139
- # Check for overly complex patterns
140
- if len(pattern) > 200:
141
- issues.append(ValidationIssue(
142
- severity="warning",
143
- message="Regex pattern is very long and may impact performance",
144
- context=context,
145
- suggestion="Consider simplifying the pattern or breaking it into multiple patterns",
146
- code="COMPLEX_PATTERN"
147
- ))
148
-
149
- # Check for potentially inefficient patterns
150
- if re.search(r'\.\*\.\*', pattern):
151
- issues.append(ValidationIssue(
152
- severity="warning",
153
- message="Pattern contains multiple .* which may be inefficient",
154
- context=context,
155
- suggestion="Consider using more specific character classes",
156
- code="INEFFICIENT_PATTERN"
157
- ))
158
-
159
- # Check for missing anchors
160
- if not pattern.startswith('^') and not pattern.endswith('$'):
161
- issues.append(ValidationIssue(
162
- severity="info",
163
- message="Pattern lacks anchors (^ and $) - may match partial strings",
164
- context=context,
165
- suggestion="Add ^ at start and $ at end for exact matching",
166
- code="MISSING_ANCHORS"
167
- ))
168
-
169
- return issues
170
-
171
- def _test_polars_compatibility(self, pattern: str, context: ValidationContext) -> List[ValidationIssue]:
172
- """Test regex pattern with polars to ensure compatibility."""
173
- issues = []
174
-
175
- try:
176
- # Create a test series with some sample data
177
- test_data = ["test@example.com", "123-456-7890", "John Doe", "invalid"]
178
- test_series = pl.Series("test", test_data)
179
-
180
- # Try to use the pattern with polars
181
- test_series.str.contains(pattern)
182
-
183
- except Exception as e:
184
- issues.append(ValidationIssue(
185
- severity="error",
186
- message=f"Pattern failed polars compatibility test: {e}",
187
- context=context,
188
- suggestion="Modify pattern to be compatible with polars regex engine",
189
- code="POLARS_TEST_FAILED"
190
- ))
191
-
192
- return issues
193
-
194
-
195
- class SchemaValidator:
196
- """Validates schema definitions and structure."""
197
-
198
- def __init__(self, regex_validator: RegexValidator):
199
- """Initialize schema validator."""
200
- self.regex_validator = regex_validator
201
-
202
- def validate_schema_definition(self, schema: SchemaDefinition,
203
- resolved_patterns: Dict[str, ResolvedPattern],
204
- file_path: Optional[str] = None) -> List[ValidationIssue]:
205
- """Validate a complete schema definition."""
206
- issues = []
207
-
208
- # Validate field definitions
209
- for field_name, field_def in schema.field_definitions.items():
210
- context = ValidationContext(
211
- file_path=file_path,
212
- field_name=field_name,
213
- pattern_name=field_def.pattern_name
214
- )
215
-
216
- # Check if pattern exists
217
- if field_def.pattern_name not in resolved_patterns:
218
- issues.append(ValidationIssue(
219
- severity="error",
220
- message=f"Pattern '{field_def.pattern_name}' not found for field '{field_name}'",
221
- context=context,
222
- suggestion=f"Define pattern '{field_def.pattern_name}' or use an existing pattern",
223
- code="PATTERN_NOT_FOUND"
224
- ))
225
- continue
226
-
227
- # Validate pattern
228
- pattern = resolved_patterns[field_def.pattern_name]
229
- issues.extend(self._validate_field_pattern(field_def, pattern, context))
230
-
231
- # Validate distribution strategy
232
- issues.extend(self._validate_distribution_strategy(field_def.distribution, pattern, context))
233
-
234
- # Validate inline patterns
235
- for pattern_name, pattern_regex in schema.inline_patterns.items():
236
- context = ValidationContext(
237
- file_path=file_path,
238
- pattern_name=pattern_name
239
- )
240
- issues.extend(self.regex_validator.validate_regex_pattern(pattern_regex, context))
241
-
242
- return issues
243
-
244
- def _validate_field_pattern(self, field_def: FieldDefinition,
245
- pattern: ResolvedPattern,
246
- context: ValidationContext) -> List[ValidationIssue]:
247
- """Validate a field's pattern definition."""
248
- issues = []
249
-
250
- # Check pattern validity
251
- if not pattern.is_valid:
252
- issues.append(ValidationIssue(
253
- severity="error",
254
- message=f"Pattern '{pattern.definition.name}' is marked as invalid",
255
- context=context,
256
- suggestion="Fix the pattern definition or use a different pattern",
257
- code="INVALID_PATTERN"
258
- ))
259
-
260
- # Check polars compatibility
261
- if not pattern.definition.polars_compatible:
262
- issues.append(ValidationIssue(
263
- severity="error",
264
- message=f"Pattern '{pattern.definition.name}' is not polars-compatible",
265
- context=context,
266
- suggestion="Use a polars-compatible pattern or modify the regex",
267
- code="PATTERN_NOT_POLARS_COMPATIBLE"
268
- ))
269
-
270
- # Validate the regex itself
271
- issues.extend(self.regex_validator.validate_regex_pattern(
272
- pattern.definition.regex, context
273
- ))
274
-
275
- return issues
276
-
277
- def _validate_distribution_strategy(self, distribution: DistributionStrategy,
278
- pattern: ResolvedPattern,
279
- context: ValidationContext) -> List[ValidationIssue]:
280
- """Validate distribution strategy compatibility with pattern."""
281
- issues = []
282
-
283
- # Check strategy-specific requirements
284
- if distribution.strategy_type == DistributionType.NUMERIC_RANGE:
285
- if not self._is_numeric_pattern(pattern.definition.regex):
286
- issues.append(ValidationIssue(
287
- severity="error",
288
- message="Numeric range distribution requires a numeric pattern",
289
- context=context,
290
- suggestion="Use a numeric pattern or change distribution strategy",
291
- code="DISTRIBUTION_PATTERN_MISMATCH"
292
- ))
293
-
294
- elif distribution.strategy_type == DistributionType.CUSTOM:
295
- if 'weights' not in distribution.parameters:
296
- issues.append(ValidationIssue(
297
- severity="error",
298
- message="Custom distribution requires 'weights' parameter",
299
- context=context,
300
- suggestion="Add weights parameter: custom[value1:50%, value2:30%, value3:20%]",
301
- code="MISSING_DISTRIBUTION_PARAMETER"
302
- ))
303
-
304
- elif distribution.strategy_type == DistributionType.CATEGORICAL:
305
- if 'categories' not in distribution.parameters:
306
- issues.append(ValidationIssue(
307
- severity="error",
308
- message="Categorical distribution requires 'categories' parameter",
309
- context=context,
310
- suggestion="Add categories parameter: categorical[cat1, cat2, cat3]",
311
- code="MISSING_DISTRIBUTION_PARAMETER"
312
- ))
313
-
314
- return issues
315
-
316
- def _is_numeric_pattern(self, regex: str) -> bool:
317
- """Check if a regex pattern is primarily numeric."""
318
- numeric_indicators = [r'\d', r'[0-9]', r'\d+', r'[0-9]+']
319
- return any(indicator in regex for indicator in numeric_indicators)
320
-
321
-
322
- class FileValidator:
323
- """Validates file syntax and structure."""
324
-
325
- def validate_properties_file(self, file_path: str, content: str) -> List[ValidationIssue]:
326
- """Validate .properties file syntax and structure."""
327
- issues = []
328
- lines = content.split('\n')
329
-
330
- for line_num, line in enumerate(lines, 1):
331
- line = line.strip()
332
-
333
- # Skip empty lines and comments
334
- if not line or line.startswith('#'):
335
- continue
336
-
337
- context = ValidationContext(
338
- file_path=file_path,
339
- line_number=line_num
340
- )
341
-
342
- # Check for valid key=value format
343
- if '=' not in line:
344
- issues.append(ValidationIssue(
345
- severity="error",
346
- message="Invalid properties format - missing '=' separator",
347
- context=context,
348
- suggestion="Use format: pattern_name=^regex_pattern$",
349
- code="INVALID_PROPERTIES_FORMAT"
350
- ))
351
- continue
352
-
353
- key, value = line.split('=', 1)
354
- key = key.strip()
355
- value = value.strip()
356
-
357
- # Validate key format
358
- if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', key):
359
- issues.append(ValidationIssue(
360
- severity="error",
361
- message=f"Invalid pattern name '{key}' - must start with letter/underscore",
362
- context=context,
363
- suggestion="Use alphanumeric characters and underscores only",
364
- code="INVALID_PATTERN_NAME"
365
- ))
366
-
367
- # Validate regex value
368
- if not value:
369
- issues.append(ValidationIssue(
370
- severity="error",
371
- message=f"Empty regex pattern for '{key}'",
372
- context=context,
373
- suggestion="Provide a valid regex pattern",
374
- code="EMPTY_PATTERN"
375
- ))
376
-
377
- return issues
378
-
379
- def validate_toml_file(self, file_path: str, content: str) -> List[ValidationIssue]:
380
- """Validate TOML file syntax and structure."""
381
- issues = []
382
-
383
- try:
384
- import tomllib
385
- except ImportError:
386
- try:
387
- import tomli as tomllib
388
- except ImportError:
389
- issues.append(ValidationIssue(
390
- severity="error",
391
- message="TOML parsing library not available",
392
- context=ValidationContext(file_path=file_path),
393
- suggestion="Install tomli: pip install tomli",
394
- code="TOML_LIBRARY_MISSING"
395
- ))
396
- return issues
397
-
398
- try:
399
- parsed = tomllib.loads(content)
400
- except Exception as e:
401
- issues.append(ValidationIssue(
402
- severity="error",
403
- message=f"Invalid TOML syntax: {e}",
404
- context=ValidationContext(file_path=file_path),
405
- suggestion="Fix TOML syntax errors",
406
- code="INVALID_TOML_SYNTAX"
407
- ))
408
- return issues
409
-
410
- # Validate TOML structure
411
- if 'schema' not in parsed:
412
- issues.append(ValidationIssue(
413
- severity="warning",
414
- message="No [schema] section found in TOML file",
415
- context=ValidationContext(file_path=file_path),
416
- suggestion="Add [schema] section with field definitions",
417
- code="MISSING_SCHEMA_SECTION"
418
- ))
419
-
420
- return issues
421
-
422
-
423
- class ValidationSystem:
424
- """Comprehensive validation system for synthetic data generation."""
425
-
426
- def __init__(self):
427
- """Initialize the validation system."""
428
- self.regex_validator = RegexValidator()
429
- self.schema_validator = SchemaValidator(self.regex_validator)
430
- self.file_validator = FileValidator()
431
-
432
- def validate_complete_configuration(self,
433
- schema: SchemaDefinition,
434
- resolved_patterns: Dict[str, ResolvedPattern],
435
- schema_file_path: Optional[str] = None) -> ValidationResult:
436
- """Perform comprehensive validation of the complete configuration."""
437
- all_issues = []
438
-
439
- # Validate schema definition
440
- schema_issues = self.schema_validator.validate_schema_definition(
441
- schema, resolved_patterns, schema_file_path
442
- )
443
- all_issues.extend(schema_issues)
444
-
445
- # Validate all resolved patterns
446
- for pattern_name, pattern in resolved_patterns.items():
447
- context = ValidationContext(
448
- file_path=pattern.definition.source_file,
449
- line_number=pattern.definition.line_number,
450
- pattern_name=pattern_name
451
- )
452
-
453
- pattern_issues = self.regex_validator.validate_regex_pattern(
454
- pattern.definition.regex, context
455
- )
456
- all_issues.extend(pattern_issues)
457
-
458
- # Convert issues to ValidationResult
459
- result = ValidationResult(is_valid=True)
460
-
461
- for issue in all_issues:
462
- if issue.severity == "error":
463
- result.add_error(
464
- self._format_issue_message(issue),
465
- issue.suggestion or "No suggestion available"
466
- )
467
- elif issue.severity == "warning":
468
- result.add_warning(self._format_issue_message(issue))
469
-
470
- return result
471
-
472
- def validate_file_syntax(self, file_path: str) -> ValidationResult:
473
- """Validate file syntax based on file extension."""
474
- if not os.path.exists(file_path):
475
- result = ValidationResult(is_valid=False)
476
- result.add_error(f"File not found: {file_path}", "Check file path and permissions")
477
- return result
478
-
479
- try:
480
- with open(file_path, 'r', encoding='utf-8') as f:
481
- content = f.read()
482
- except Exception as e:
483
- result = ValidationResult(is_valid=False)
484
- result.add_error(f"Cannot read file: {e}", "Check file permissions and encoding")
485
- return result
486
-
487
- issues = []
488
-
489
- if file_path.endswith('.properties'):
490
- issues = self.file_validator.validate_properties_file(file_path, content)
491
- elif file_path.endswith('.toml'):
492
- issues = self.file_validator.validate_toml_file(file_path, content)
493
- else:
494
- issues.append(ValidationIssue(
495
- severity="error",
496
- message=f"Unsupported file type: {Path(file_path).suffix}",
497
- context=ValidationContext(file_path=file_path),
498
- suggestion="Use .properties for patterns or .toml for schemas",
499
- code="UNSUPPORTED_FILE_TYPE"
500
- ))
501
-
502
- # Convert to ValidationResult
503
- result = ValidationResult(is_valid=True)
504
- for issue in issues:
505
- if issue.severity == "error":
506
- error_message = self._format_issue_message(issue)
507
- if issue.suggestion:
508
- error_message += f" | Suggestion: {issue.suggestion}"
509
- result.add_error(error_message)
510
- elif issue.severity == "warning":
511
- result.add_warning(self._format_issue_message(issue))
512
-
513
- return result
514
-
515
- def _format_issue_message(self, issue: ValidationIssue) -> str:
516
- """Format validation issue message with context."""
517
- parts = []
518
-
519
- if issue.context.file_path:
520
- parts.append(f"File: {issue.context.file_path}")
521
-
522
- if issue.context.line_number:
523
- parts.append(f"Line: {issue.context.line_number}")
524
-
525
- if issue.context.field_name:
526
- parts.append(f"Field: {issue.context.field_name}")
527
-
528
- if issue.context.pattern_name:
529
- parts.append(f"Pattern: {issue.context.pattern_name}")
530
-
531
- context_str = " | ".join(parts)
532
-
533
- if context_str:
534
- return f"{context_str} | {issue.message}"
535
- else:
536
- return issue.message
537
-
538
- def validate_fail_fast(self,
539
- schema: SchemaDefinition,
540
- resolved_patterns: Dict[str, ResolvedPattern],
541
- schema_file_path: Optional[str] = None) -> None:
542
- """Perform fail-fast validation that raises exception on any error."""
543
- result = self.validate_complete_configuration(schema, resolved_patterns, schema_file_path)
544
-
545
- if not result.is_valid:
546
- error_messages = []
547
- for error in result.errors:
548
- error_messages.append(error)
549
-
550
- raise ValidationError(
551
- f"Validation failed with {len(result.errors)} error(s):\n" +
552
- "\n".join(f" - {error}" for error in error_messages)
553
- )