additory 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/polars_expression_engine.py +66 -16
  8. additory/dynamic_api.py +42 -46
  9. additory/expressions/proxy.py +4 -1
  10. additory/synthetic/__init__.py +7 -95
  11. additory/synthetic/column_name_resolver.py +149 -0
  12. additory/{augment → synthetic}/distributions.py +2 -2
  13. additory/{augment → synthetic}/forecast.py +1 -1
  14. additory/synthetic/linked_list_parser.py +415 -0
  15. additory/synthetic/namespace_lookup.py +129 -0
  16. additory/{augment → synthetic}/smote.py +1 -1
  17. additory/{augment → synthetic}/strategies.py +11 -44
  18. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  19. additory/utilities/units.py +4 -1
  20. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/METADATA +12 -17
  21. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
  22. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
  23. additory/augment/__init__.py +0 -24
  24. additory/augment/builtin_lists.py +0 -430
  25. additory/augment/list_registry.py +0 -177
  26. additory/synthetic/api.py +0 -220
  27. additory/synthetic/common_integration.py +0 -314
  28. additory/synthetic/config.py +0 -262
  29. additory/synthetic/engines.py +0 -529
  30. additory/synthetic/exceptions.py +0 -180
  31. additory/synthetic/file_managers.py +0 -518
  32. additory/synthetic/generator.py +0 -702
  33. additory/synthetic/generator_parser.py +0 -68
  34. additory/synthetic/integration.py +0 -319
  35. additory/synthetic/models.py +0 -241
  36. additory/synthetic/pattern_resolver.py +0 -573
  37. additory/synthetic/performance.py +0 -469
  38. additory/synthetic/polars_integration.py +0 -464
  39. additory/synthetic/proxy.py +0 -60
  40. additory/synthetic/schema_parser.py +0 -685
  41. additory/synthetic/validator.py +0 -553
  42. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
  43. {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
@@ -1,573 +0,0 @@
1
- """
2
- Pattern hierarchy resolution system for synthetic data generation.
3
-
4
- Implements the 5-level pattern hierarchy with caching and tracing capabilities.
5
- """
6
-
7
- import time
8
- import logging
9
- from pathlib import Path
10
- from typing import Dict, List, Optional, Tuple, Set
11
- from dataclasses import dataclass
12
- from enum import Enum
13
-
14
- from .models import PatternDefinition, PatternSource, ValidationStatus
15
- from .file_managers import PatternFileManager
16
- from .exceptions import PatternResolutionError, PatternImportError
17
-
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- @dataclass
23
- class ResolutionTrace:
24
- """Trace information for pattern resolution debugging."""
25
- pattern_name: str
26
- resolved_source: PatternSource
27
- resolved_value: str
28
- source_file: str
29
- search_order: List[Tuple[PatternSource, str, bool]] # (source, file, found)
30
- resolution_time_ms: float
31
-
32
-
33
- @dataclass
34
- class PatternResolutionResult:
35
- """Result of pattern resolution with tracing information."""
36
- pattern: PatternDefinition
37
- trace: ResolutionTrace
38
-
39
-
40
- class PatternHierarchyResolver:
41
- """
42
- Resolves patterns according to the 5-level hierarchy system.
43
-
44
- Hierarchy (highest to lowest priority):
45
- 1. Inline Schema Parameters
46
- 2. User Imports (non-global)
47
- 3. User Global.properties
48
- 4. Core Non-Global.properties (regional/domain)
49
- 5. Core Global.properties (fallback)
50
- """
51
-
52
- def __init__(self, core_patterns_path: str = "reference/schema_definitions"):
53
- """
54
- Initialize the pattern hierarchy resolver.
55
-
56
- Args:
57
- core_patterns_path: Path to core pattern files directory
58
- """
59
- self.core_patterns_path = Path(core_patterns_path)
60
- self.file_manager = PatternFileManager()
61
-
62
- # Caches for performance
63
- self._pattern_cache: Dict[str, PatternDefinition] = {}
64
- self._file_cache: Dict[str, Dict[str, str]] = {}
65
- self._resolution_traces: List[ResolutionTrace] = []
66
-
67
- # Track loaded files to avoid circular imports
68
- self._loading_files: Set[str] = set()
69
-
70
- def resolve_pattern(self,
71
- pattern_name: str,
72
- inline_patterns: Optional[Dict[str, str]] = None,
73
- user_imports: Optional[List[str]] = None,
74
- user_global_path: Optional[str] = None) -> PatternResolutionResult:
75
- """
76
- Resolve a pattern according to the 5-level hierarchy.
77
-
78
- Args:
79
- pattern_name: Name of pattern to resolve
80
- inline_patterns: Inline patterns from schema (highest priority)
81
- user_imports: List of user import files
82
- user_global_path: Path to user global.properties file
83
-
84
- Returns:
85
- PatternResolutionResult with pattern and trace information
86
-
87
- Raises:
88
- PatternResolutionError: If pattern cannot be resolved
89
- """
90
- import time
91
- start_time = time.time()
92
-
93
- search_order = []
94
-
95
- # Level 1: Inline Schema Parameters (Highest Priority)
96
- if inline_patterns and pattern_name in inline_patterns:
97
- pattern_def = PatternDefinition(
98
- name=pattern_name,
99
- regex=inline_patterns[pattern_name],
100
- source=PatternSource.INLINE,
101
- validation_status=ValidationStatus.VALID, # Assume inline patterns are valid
102
- source_file="<inline>",
103
- polars_compatible=self._is_polars_compatible(inline_patterns[pattern_name]) # Test compatibility
104
- )
105
-
106
- search_order.append((PatternSource.INLINE, "<inline>", True))
107
- trace = self._create_trace(pattern_name, pattern_def, search_order, start_time)
108
-
109
- return PatternResolutionResult(pattern=pattern_def, trace=trace)
110
- else:
111
- search_order.append((PatternSource.INLINE, "<inline>", False))
112
-
113
- # Level 2: User Imports (non-global)
114
- if user_imports:
115
- for import_file in user_imports:
116
- if import_file == "global":
117
- continue # Skip global, handle in level 3
118
-
119
- try:
120
- patterns = self._load_user_import_file(import_file)
121
- if pattern_name in patterns:
122
- pattern_def = PatternDefinition(
123
- name=pattern_name,
124
- regex=patterns[pattern_name],
125
- source=PatternSource.USER_IMPORT,
126
- validation_status=ValidationStatus.VALID, # Assume user patterns are valid
127
- source_file=import_file,
128
- polars_compatible=self._is_polars_compatible(patterns[pattern_name]) # Test compatibility
129
- )
130
-
131
- search_order.append((PatternSource.USER_IMPORT, import_file, True))
132
- trace = self._create_trace(pattern_name, pattern_def, search_order, start_time)
133
-
134
- return PatternResolutionResult(pattern=pattern_def, trace=trace)
135
- else:
136
- search_order.append((PatternSource.USER_IMPORT, import_file, False))
137
-
138
- except (FileNotFoundError, PatternImportError, PermissionError) as e:
139
- search_order.append((PatternSource.USER_IMPORT, import_file, False))
140
- # Log the specific error for debugging but continue searching
141
- logger.debug(f"Failed to load user import '{import_file}': {e}")
142
-
143
- # Level 3: User Global.properties
144
- user_global_checked = False
145
- if user_global_path:
146
- try:
147
- patterns = self._load_user_global_file(user_global_path)
148
- if pattern_name in patterns:
149
- pattern_def = PatternDefinition(
150
- name=pattern_name,
151
- regex=patterns[pattern_name],
152
- source=PatternSource.USER_GLOBAL,
153
- validation_status=ValidationStatus.VALID, # Assume user patterns are valid
154
- source_file=user_global_path,
155
- polars_compatible=self._is_polars_compatible(patterns[pattern_name]) # Test compatibility
156
- )
157
-
158
- search_order.append((PatternSource.USER_GLOBAL, user_global_path, True))
159
- trace = self._create_trace(pattern_name, pattern_def, search_order, start_time)
160
-
161
- return PatternResolutionResult(pattern=pattern_def, trace=trace)
162
- else:
163
- search_order.append((PatternSource.USER_GLOBAL, user_global_path, False))
164
- user_global_checked = True
165
-
166
- except (FileNotFoundError, PatternImportError, PermissionError) as e:
167
- search_order.append((PatternSource.USER_GLOBAL, user_global_path, False))
168
- user_global_checked = True
169
- logger.debug(f"Failed to load user global '{user_global_path}': {e}")
170
-
171
- # Also check for "global" in user imports (treated as user global)
172
- if user_imports and "global" in user_imports and not user_global_checked:
173
- try:
174
- patterns = self._load_user_global_file("global.properties")
175
- if pattern_name in patterns:
176
- pattern_def = PatternDefinition(
177
- name=pattern_name,
178
- regex=patterns[pattern_name],
179
- source=PatternSource.USER_GLOBAL,
180
- validation_status=ValidationStatus.VALID, # Assume user patterns are valid
181
- source_file="global.properties",
182
- polars_compatible=self._is_polars_compatible(patterns[pattern_name]) # Test compatibility
183
- )
184
-
185
- search_order.append((PatternSource.USER_GLOBAL, "global.properties", True))
186
- trace = self._create_trace(pattern_name, pattern_def, search_order, start_time)
187
-
188
- return PatternResolutionResult(pattern=pattern_def, trace=trace)
189
- else:
190
- search_order.append((PatternSource.USER_GLOBAL, "global.properties", False))
191
-
192
- except (FileNotFoundError, PatternImportError, PermissionError) as e:
193
- search_order.append((PatternSource.USER_GLOBAL, "global.properties", False))
194
- logger.debug(f"Failed to load global.properties: {e}")
195
-
196
- # Level 4: Core Non-Global.properties (regional/domain)
197
- core_non_global_files = self._discover_core_non_global_files()
198
- for core_file in core_non_global_files:
199
- try:
200
- patterns = self._load_core_file(core_file)
201
- if pattern_name in patterns:
202
- pattern_def = PatternDefinition(
203
- name=pattern_name,
204
- regex=patterns[pattern_name],
205
- source=PatternSource.CORE_NON_GLOBAL,
206
- validation_status=ValidationStatus.VALID, # Mark builtin patterns as valid
207
- source_file=str(core_file),
208
- polars_compatible=self._is_polars_compatible(patterns[pattern_name]) # Test compatibility
209
- )
210
-
211
- search_order.append((PatternSource.CORE_NON_GLOBAL, str(core_file), True))
212
- trace = self._create_trace(pattern_name, pattern_def, search_order, start_time)
213
-
214
- return PatternResolutionResult(pattern=pattern_def, trace=trace)
215
- else:
216
- search_order.append((PatternSource.CORE_NON_GLOBAL, str(core_file), False))
217
-
218
- except (FileNotFoundError, PatternImportError, PermissionError) as e:
219
- search_order.append((PatternSource.CORE_NON_GLOBAL, str(core_file), False))
220
- logger.debug(f"Failed to load core file '{core_file}': {e}")
221
-
222
- # Level 5: Core Global.properties (Lowest Priority - Ultimate Fallback)
223
- core_global_file = self.core_patterns_path / "global.properties"
224
- try:
225
- patterns = self._load_core_file(core_global_file)
226
- if pattern_name in patterns:
227
- pattern_def = PatternDefinition(
228
- name=pattern_name,
229
- regex=patterns[pattern_name],
230
- source=PatternSource.CORE_GLOBAL,
231
- validation_status=ValidationStatus.VALID, # Mark builtin patterns as valid
232
- source_file=str(core_global_file),
233
- polars_compatible=self._is_polars_compatible(patterns[pattern_name]) # Test compatibility
234
- )
235
-
236
- search_order.append((PatternSource.CORE_GLOBAL, str(core_global_file), True))
237
- trace = self._create_trace(pattern_name, pattern_def, search_order, start_time)
238
-
239
- return PatternResolutionResult(pattern=pattern_def, trace=trace)
240
- else:
241
- search_order.append((PatternSource.CORE_GLOBAL, str(core_global_file), False))
242
-
243
- except (FileNotFoundError, PatternImportError, PermissionError) as e:
244
- search_order.append((PatternSource.CORE_GLOBAL, str(core_global_file), False))
245
- logger.debug(f"Failed to load core global file '{core_global_file}': {e}")
246
-
247
- # Pattern not found in any source
248
- trace = ResolutionTrace(
249
- pattern_name=pattern_name,
250
- resolved_source=PatternSource.CORE_GLOBAL, # Placeholder
251
- resolved_value="",
252
- source_file="",
253
- search_order=search_order,
254
- resolution_time_ms=(time.time() - start_time) * 1000
255
- )
256
-
257
- self._resolution_traces.append(trace)
258
-
259
- # Create detailed error message with search information
260
- searched_locations = []
261
- for source, file_path, found in search_order:
262
- status = "✓" if found else "✗"
263
- searched_locations.append(f" {status} {source.value}: {file_path}")
264
-
265
- search_details = "\n".join(searched_locations)
266
-
267
- raise PatternResolutionError(
268
- f"Pattern '{pattern_name}' not found in any source.\n\nSearched locations:\n{search_details}",
269
- pattern_name,
270
- [source.value for source, _, _ in search_order],
271
- f"Searched {len(search_order)} sources in hierarchy order. Consider adding the pattern to global.properties or defining it inline in your schema."
272
- )
273
-
274
- def resolve_multiple_patterns(self,
275
- pattern_names: List[str],
276
- inline_patterns: Optional[Dict[str, str]] = None,
277
- user_imports: Optional[List[str]] = None,
278
- user_global_path: Optional[str] = None) -> Dict[str, PatternResolutionResult]:
279
- """
280
- Resolve multiple patterns efficiently with shared caching.
281
-
282
- Args:
283
- pattern_names: List of pattern names to resolve
284
- inline_patterns: Inline patterns from schema
285
- user_imports: List of user import files
286
- user_global_path: Path to user global.properties file
287
-
288
- Returns:
289
- Dictionary mapping pattern names to resolution results
290
-
291
- Raises:
292
- PatternResolutionError: If any pattern cannot be resolved
293
- """
294
- results = {}
295
- failed_patterns = []
296
-
297
- for pattern_name in pattern_names:
298
- try:
299
- result = self.resolve_pattern(
300
- pattern_name, inline_patterns, user_imports, user_global_path
301
- )
302
- results[pattern_name] = result
303
- except PatternResolutionError as e:
304
- failed_patterns.append((pattern_name, str(e)))
305
-
306
- if failed_patterns:
307
- failed_names = [name for name, _ in failed_patterns]
308
- raise PatternResolutionError(
309
- f"Failed to resolve {len(failed_patterns)} patterns: {', '.join(failed_names)}",
310
- failed_names[0] if failed_names else "",
311
- [],
312
- f"Multiple pattern resolution failures: {failed_patterns}"
313
- )
314
-
315
- return results
316
-
317
- def get_resolution_traces(self) -> List[ResolutionTrace]:
318
- """Get all resolution traces for debugging."""
319
- return self._resolution_traces.copy()
320
-
321
- def clear_traces(self) -> None:
322
- """Clear resolution traces."""
323
- self._resolution_traces.clear()
324
-
325
- def get_available_regional_patterns(self) -> Dict[str, List[str]]:
326
- """
327
- Get information about available regional patterns.
328
-
329
- Returns:
330
- Dictionary mapping region names to lists of available patterns
331
- """
332
- regional_patterns = {}
333
-
334
- # Known regional file prefixes
335
- regional_prefixes = {'us', 'eu', 'ca', 'uk', 'au', 'jp', 'in', 'br', 'mx'}
336
-
337
- for file_path in self.core_patterns_path.glob("*.properties"):
338
- if file_path.name == "global.properties":
339
- continue
340
-
341
- file_stem = file_path.stem.lower()
342
-
343
- if file_stem in regional_prefixes or file_stem.endswith('_region'):
344
- try:
345
- patterns = self._load_core_file(file_path)
346
- regional_patterns[file_stem] = list(patterns.keys())
347
- except Exception:
348
- # Skip files that can't be loaded
349
- continue
350
-
351
- return regional_patterns
352
-
353
- def get_available_domain_patterns(self) -> Dict[str, List[str]]:
354
- """
355
- Get information about available domain-specific patterns.
356
-
357
- Returns:
358
- Dictionary mapping domain names to lists of available patterns
359
- """
360
- domain_patterns = {}
361
-
362
- # Known regional file prefixes
363
- regional_prefixes = {'us', 'eu', 'ca', 'uk', 'au', 'jp', 'in', 'br', 'mx'}
364
-
365
- for file_path in self.core_patterns_path.glob("*.properties"):
366
- if file_path.name == "global.properties":
367
- continue
368
-
369
- file_stem = file_path.stem.lower()
370
-
371
- if file_stem not in regional_prefixes and not file_stem.endswith('_region'):
372
- try:
373
- patterns = self._load_core_file(file_path)
374
- domain_patterns[file_stem] = list(patterns.keys())
375
- except Exception:
376
- # Skip files that can't be loaded
377
- continue
378
-
379
- return domain_patterns
380
-
381
- def clear_cache(self) -> None:
382
- """Clear all caches."""
383
- self._pattern_cache.clear()
384
- self._file_cache.clear()
385
- self.file_manager.clear_cache()
386
-
387
- def _create_trace(self, pattern_name: str, pattern_def: PatternDefinition,
388
- search_order: List[Tuple[PatternSource, str, bool]],
389
- start_time: float) -> ResolutionTrace:
390
- """Create a resolution trace for debugging."""
391
- trace = ResolutionTrace(
392
- pattern_name=pattern_name,
393
- resolved_source=pattern_def.source,
394
- resolved_value=pattern_def.regex,
395
- source_file=pattern_def.source_file,
396
- search_order=search_order,
397
- resolution_time_ms=(time.time() - start_time) * 1000
398
- )
399
-
400
- self._resolution_traces.append(trace)
401
- return trace
402
-
403
- def _load_user_import_file(self, import_name: str) -> Dict[str, str]:
404
- """Load a user import file with improved error handling."""
405
- # Try different possible paths for user imports
406
- possible_paths = [
407
- f"{import_name}.properties",
408
- f"patterns/{import_name}.properties",
409
- f"./{import_name}.properties"
410
- ]
411
-
412
- errors_encountered = []
413
-
414
- for path in possible_paths:
415
- if path in self._file_cache:
416
- return self._file_cache[path]
417
-
418
- try:
419
- if Path(path).exists():
420
- parsed = self.file_manager.load_properties_file(path)
421
- self._file_cache[path] = parsed.patterns
422
- return parsed.patterns
423
- else:
424
- errors_encountered.append(f"File not found: {path}")
425
- except PermissionError:
426
- errors_encountered.append(f"Permission denied: {path}")
427
- except FileNotFoundError:
428
- errors_encountered.append(f"File not found: {path}")
429
- except Exception as e:
430
- errors_encountered.append(f"Error reading {path}: {str(e)}")
431
-
432
- # Create detailed error message
433
- error_details = "; ".join(errors_encountered)
434
- raise PatternImportError(
435
- import_name,
436
- file_path=", ".join(possible_paths),
437
- reason=f"Tried multiple locations but failed: {error_details}"
438
- )
439
-
440
- def _load_user_global_file(self, global_path: str) -> Dict[str, str]:
441
- """Load user global.properties file with improved error handling."""
442
- if global_path in self._file_cache:
443
- return self._file_cache[global_path]
444
-
445
- # Try different possible paths for user global
446
- possible_paths = [
447
- global_path,
448
- "global.properties",
449
- "./global.properties",
450
- "patterns/global.properties"
451
- ]
452
-
453
- errors_encountered = []
454
-
455
- for path in possible_paths:
456
- try:
457
- if Path(path).exists():
458
- parsed = self.file_manager.load_properties_file(path)
459
- self._file_cache[global_path] = parsed.patterns
460
- return parsed.patterns
461
- else:
462
- errors_encountered.append(f"File not found: {path}")
463
- except PermissionError:
464
- errors_encountered.append(f"Permission denied: {path}")
465
- except FileNotFoundError:
466
- errors_encountered.append(f"File not found: {path}")
467
- except Exception as e:
468
- errors_encountered.append(f"Error reading {path}: {str(e)}")
469
-
470
- # Create detailed error message
471
- error_details = "; ".join(errors_encountered)
472
- raise PatternImportError(
473
- "global",
474
- file_path=", ".join(possible_paths),
475
- reason=f"Tried multiple locations but failed: {error_details}"
476
- )
477
-
478
- def _load_core_file(self, file_path: Path) -> Dict[str, str]:
479
- """Load a core pattern file with improved error handling."""
480
- file_key = str(file_path)
481
-
482
- if file_key in self._file_cache:
483
- return self._file_cache[file_key]
484
-
485
- try:
486
- if not file_path.exists():
487
- raise FileNotFoundError(f"Core pattern file not found: {file_path}")
488
-
489
- parsed = self.file_manager.load_properties_file(str(file_path))
490
- self._file_cache[file_key] = parsed.patterns
491
- return parsed.patterns
492
-
493
- except PermissionError:
494
- raise PatternImportError(
495
- file_path.stem,
496
- file_path=str(file_path),
497
- reason="Permission denied - check file permissions"
498
- )
499
- except FileNotFoundError:
500
- raise PatternImportError(
501
- file_path.stem,
502
- file_path=str(file_path),
503
- reason="Core pattern file not found"
504
- )
505
- except Exception as e:
506
- raise PatternImportError(
507
- file_path.stem,
508
- file_path=str(file_path),
509
- reason=f"Error reading core pattern file: {str(e)}"
510
- )
511
-
512
- def _discover_core_non_global_files(self) -> List[Path]:
513
- """
514
- Discover all core non-global pattern files.
515
-
516
- Returns files in priority order:
517
- 1. Regional files (us.properties, eu.properties, ca.properties, etc.)
518
- 2. Domain-specific files (finance.properties, healthcare.properties, etc.)
519
-
520
- This ensures regional patterns are checked before domain-specific ones.
521
- """
522
- if not self.core_patterns_path.exists():
523
- return []
524
-
525
- regional_files = []
526
- domain_files = []
527
-
528
- # Known regional file prefixes (can be extended)
529
- regional_prefixes = {'us', 'eu', 'ca', 'uk', 'au', 'jp', 'in', 'br', 'mx'}
530
-
531
- for file_path in self.core_patterns_path.glob("*.properties"):
532
- if file_path.name == "global.properties":
533
- continue
534
-
535
- file_stem = file_path.stem.lower()
536
-
537
- # Check if it's a regional file
538
- if file_stem in regional_prefixes or file_stem.endswith('_region'):
539
- regional_files.append(file_path)
540
- else:
541
- domain_files.append(file_path)
542
-
543
- # Sort each category for consistent ordering
544
- regional_files.sort()
545
- domain_files.sort()
546
-
547
- # Return regional files first, then domain files
548
- return regional_files + domain_files
549
-
550
- def _is_polars_compatible(self, pattern: str) -> bool:
551
- """
552
- Test if a regex pattern is compatible with polars.
553
-
554
- Args:
555
- pattern: Regex pattern to test
556
-
557
- Returns:
558
- True if pattern is polars-compatible, False otherwise
559
- """
560
- try:
561
- import polars as pl
562
-
563
- # Create a test series with some sample data
564
- test_data = ["test@example.com", "123-456-7890", "John Doe", "invalid", "123", "ABC"]
565
- test_series = pl.Series("test", test_data)
566
-
567
- # Try to use the pattern with polars
568
- test_series.str.contains(pattern)
569
- return True
570
-
571
- except Exception:
572
- # If any error occurs, consider it incompatible
573
- return False