additory 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/polars_expression_engine.py +66 -16
- additory/dynamic_api.py +42 -46
- additory/expressions/proxy.py +4 -1
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +11 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/METADATA +12 -17
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
|
@@ -1,573 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Pattern hierarchy resolution system for synthetic data generation.
|
|
3
|
-
|
|
4
|
-
Implements the 5-level pattern hierarchy with caching and tracing capabilities.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import time
|
|
8
|
-
import logging
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
from typing import Dict, List, Optional, Tuple, Set
|
|
11
|
-
from dataclasses import dataclass
|
|
12
|
-
from enum import Enum
|
|
13
|
-
|
|
14
|
-
from .models import PatternDefinition, PatternSource, ValidationStatus
|
|
15
|
-
from .file_managers import PatternFileManager
|
|
16
|
-
from .exceptions import PatternResolutionError, PatternImportError
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
logger = logging.getLogger(__name__)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@dataclass
class ResolutionTrace:
    """Debug record describing how one pattern lookup went.

    Attributes:
        pattern_name: Name of the pattern that was looked up.
        resolved_source: Hierarchy level that supplied the pattern.
        resolved_value: The regex string the pattern resolved to.
        source_file: File (or pseudo-file label) the value came from.
        search_order: Every location consulted, in order, as
            ``(source, file, found)`` tuples.
        resolution_time_ms: Time spent on the lookup, in milliseconds.
    """

    pattern_name: str
    resolved_source: PatternSource
    resolved_value: str
    source_file: str
    search_order: List[Tuple[PatternSource, str, bool]]  # (source, file, found)
    resolution_time_ms: float
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
@dataclass
class PatternResolutionResult:
    """Pairs a resolved pattern with the trace of how it was found.

    Attributes:
        pattern: The pattern definition that was resolved.
        trace: The lookup trace recorded while resolving ``pattern``.
    """

    pattern: PatternDefinition
    trace: ResolutionTrace
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class PatternHierarchyResolver:
    """
    Resolves patterns according to the 5-level hierarchy system.

    Hierarchy (highest to lowest priority):
    1. Inline Schema Parameters
    2. User Imports (non-global)
    3. User Global.properties
    4. Core Non-Global.properties (regional/domain)
    5. Core Global.properties (fallback)

    Parsed property files are memoised in ``_file_cache``. Every call to
    ``resolve_pattern`` appends a ``ResolutionTrace`` to an internal list,
    whether the lookup succeeded or not.
    """

    # File stems treated as regional pattern files (can be extended).
    # Stems ending in "_region" are regional as well.
    _REGIONAL_PREFIXES = frozenset({'us', 'eu', 'ca', 'uk', 'au', 'jp', 'in', 'br', 'mx'})

    def __init__(self, core_patterns_path: str = "reference/schema_definitions"):
        """
        Initialize the pattern hierarchy resolver.

        Args:
            core_patterns_path: Path to core pattern files directory
        """
        self.core_patterns_path = Path(core_patterns_path)
        self.file_manager = PatternFileManager()

        # Caches for performance. Only _file_cache is populated by the
        # methods of this class; _pattern_cache is merely cleared by
        # clear_cache().
        self._pattern_cache: Dict[str, PatternDefinition] = {}
        self._file_cache: Dict[str, Dict[str, str]] = {}
        self._resolution_traces: List[ResolutionTrace] = []

        # Track loaded files to avoid circular imports
        # (not consulted by any method of this class).
        self._loading_files: Set[str] = set()

    def resolve_pattern(self,
                        pattern_name: str,
                        inline_patterns: Optional[Dict[str, str]] = None,
                        user_imports: Optional[List[str]] = None,
                        user_global_path: Optional[str] = None) -> PatternResolutionResult:
        """
        Resolve a pattern according to the 5-level hierarchy.

        Sources are consulted from highest to lowest priority and the first
        one that defines ``pattern_name`` wins. A source that cannot be
        loaded is recorded as a miss, logged at debug level, and skipped.

        Args:
            pattern_name: Name of pattern to resolve
            inline_patterns: Inline patterns from schema (highest priority)
            user_imports: List of user import files. The entry "global" is
                not treated as an import; it requests the user
                global.properties lookup when no ``user_global_path`` is given.
            user_global_path: Path to user global.properties file

        Returns:
            PatternResolutionResult with pattern and trace information

        Raises:
            PatternResolutionError: If pattern cannot be resolved
        """
        start_time = time.perf_counter()
        search_order: List[Tuple[PatternSource, str, bool]] = []

        def build_result(regex, source, source_file):
            # Found: build the definition, record the hit and the trace.
            pattern_def = PatternDefinition(
                name=pattern_name,
                regex=regex,
                source=source,
                # Patterns are assumed valid here; no validation is run.
                validation_status=ValidationStatus.VALID,
                source_file=source_file,
                polars_compatible=self._is_polars_compatible(regex),
            )
            search_order.append((source, source_file, True))
            trace = self._create_trace(pattern_name, pattern_def, search_order, start_time)
            return PatternResolutionResult(pattern=pattern_def, trace=trace)

        def search(source, source_file, load, target, failure_label):
            # Look the pattern up in one file-backed source. Returns the
            # result on a hit, or None after recording the miss.
            try:
                patterns = load(target)
                if pattern_name in patterns:
                    return build_result(patterns[pattern_name], source, source_file)
                search_order.append((source, source_file, False))
            except (FileNotFoundError, PatternImportError, PermissionError) as exc:
                search_order.append((source, source_file, False))
                # Log the specific error for debugging but continue searching
                logger.debug("Failed to load %s: %s", failure_label, exc)
            return None

        # Level 1: Inline Schema Parameters (Highest Priority)
        if inline_patterns and pattern_name in inline_patterns:
            return build_result(inline_patterns[pattern_name], PatternSource.INLINE, "<inline>")
        search_order.append((PatternSource.INLINE, "<inline>", False))

        # Level 2: User Imports (non-global)
        for import_file in user_imports or ():
            if import_file == "global":
                continue  # Skip global, handle in level 3
            result = search(
                PatternSource.USER_IMPORT, import_file,
                self._load_user_import_file, import_file,
                f"user import '{import_file}'",
            )
            if result is not None:
                return result

        # Level 3: User Global.properties
        # NOTE: _load_user_global_file falls back to default locations, so the
        # reported source_file is the requested path, not necessarily the
        # file that was actually read.
        if user_global_path:
            result = search(
                PatternSource.USER_GLOBAL, user_global_path,
                self._load_user_global_file, user_global_path,
                f"user global '{user_global_path}'",
            )
            if result is not None:
                return result
        elif user_imports and "global" in user_imports:
            # "global" in user imports is treated as user global, but only
            # when no explicit user global path was already checked.
            result = search(
                PatternSource.USER_GLOBAL, "global.properties",
                self._load_user_global_file, "global.properties",
                "global.properties",
            )
            if result is not None:
                return result

        # Level 4: Core Non-Global.properties (regional/domain)
        for core_file in self._discover_core_non_global_files():
            result = search(
                PatternSource.CORE_NON_GLOBAL, str(core_file),
                self._load_core_file, core_file,
                f"core file '{core_file}'",
            )
            if result is not None:
                return result

        # Level 5: Core Global.properties (Lowest Priority - Ultimate Fallback)
        core_global_file = self.core_patterns_path / "global.properties"
        result = search(
            PatternSource.CORE_GLOBAL, str(core_global_file),
            self._load_core_file, core_global_file,
            f"core global file '{core_global_file}'",
        )
        if result is not None:
            return result

        # Pattern not found in any source: still record a trace of the search.
        trace = ResolutionTrace(
            pattern_name=pattern_name,
            resolved_source=PatternSource.CORE_GLOBAL,  # Placeholder
            resolved_value="",
            source_file="",
            search_order=search_order,
            resolution_time_ms=(time.perf_counter() - start_time) * 1000
        )
        self._resolution_traces.append(trace)

        # Create detailed error message with search information
        searched_locations = []
        for source, file_path, found in search_order:
            status = "✓" if found else "✗"
            searched_locations.append(f" {status} {source.value}: {file_path}")
        search_details = "\n".join(searched_locations)

        raise PatternResolutionError(
            f"Pattern '{pattern_name}' not found in any source.\n\nSearched locations:\n{search_details}",
            pattern_name,
            [source.value for source, _, _ in search_order],
            f"Searched {len(search_order)} sources in hierarchy order. Consider adding the pattern to global.properties or defining it inline in your schema."
        )

    def resolve_multiple_patterns(self,
                                  pattern_names: List[str],
                                  inline_patterns: Optional[Dict[str, str]] = None,
                                  user_imports: Optional[List[str]] = None,
                                  user_global_path: Optional[str] = None) -> Dict[str, PatternResolutionResult]:
        """
        Resolve multiple patterns efficiently with shared caching.

        All names are attempted before any failure is reported, so the
        raised error lists every pattern that could not be resolved.

        Args:
            pattern_names: List of pattern names to resolve
            inline_patterns: Inline patterns from schema
            user_imports: List of user import files
            user_global_path: Path to user global.properties file

        Returns:
            Dictionary mapping pattern names to resolution results

        Raises:
            PatternResolutionError: If any pattern cannot be resolved
        """
        results = {}
        failed_patterns = []

        for pattern_name in pattern_names:
            try:
                results[pattern_name] = self.resolve_pattern(
                    pattern_name, inline_patterns, user_imports, user_global_path
                )
            except PatternResolutionError as e:
                failed_patterns.append((pattern_name, str(e)))

        if failed_patterns:
            failed_names = [name for name, _ in failed_patterns]
            raise PatternResolutionError(
                f"Failed to resolve {len(failed_patterns)} patterns: {', '.join(failed_names)}",
                failed_names[0],
                [],
                f"Multiple pattern resolution failures: {failed_patterns}"
            )

        return results

    def get_resolution_traces(self) -> List[ResolutionTrace]:
        """Get all resolution traces for debugging (as a shallow copy)."""
        return self._resolution_traces.copy()

    def clear_traces(self) -> None:
        """Clear resolution traces."""
        self._resolution_traces.clear()

    def get_available_regional_patterns(self) -> Dict[str, List[str]]:
        """
        Get information about available regional patterns.

        Files that cannot be loaded are skipped.

        Returns:
            Dictionary mapping region names to lists of available patterns
        """
        return self._collect_available_patterns(regional=True)

    def get_available_domain_patterns(self) -> Dict[str, List[str]]:
        """
        Get information about available domain-specific patterns.

        Files that cannot be loaded are skipped.

        Returns:
            Dictionary mapping domain names to lists of available patterns
        """
        return self._collect_available_patterns(regional=False)

    def clear_cache(self) -> None:
        """Clear all caches, including the file manager's own cache."""
        self._pattern_cache.clear()
        self._file_cache.clear()
        self.file_manager.clear_cache()

    @classmethod
    def _is_regional_stem(cls, file_stem: str) -> bool:
        """Return True if a lower-cased file stem names a regional pattern file."""
        return file_stem in cls._REGIONAL_PREFIXES or file_stem.endswith('_region')

    def _collect_available_patterns(self, regional: bool) -> Dict[str, List[str]]:
        """Map each core non-global file stem of the requested kind to its pattern names.

        Args:
            regional: True to collect regional files, False for domain files.

        Returns:
            Dictionary mapping lower-cased file stems to pattern name lists.
        """
        available = {}

        for file_path in self.core_patterns_path.glob("*.properties"):
            if file_path.name == "global.properties":
                continue

            file_stem = file_path.stem.lower()
            if self._is_regional_stem(file_stem) != regional:
                continue

            try:
                patterns = self._load_core_file(file_path)
                available[file_stem] = list(patterns.keys())
            except Exception as exc:
                # Best effort: skip files that can't be loaded, but leave a
                # breadcrumb instead of failing silently.
                logger.debug("Skipping core pattern file '%s': %s", file_path, exc)
                continue

        return available

    def _create_trace(self, pattern_name: str, pattern_def: PatternDefinition,
                      search_order: List[Tuple[PatternSource, str, bool]],
                      start_time: float) -> ResolutionTrace:
        """Create and store a resolution trace for a successful lookup.

        Args:
            pattern_name: Name of the resolved pattern.
            pattern_def: Definition that was resolved.
            search_order: Locations consulted so far, as (source, file, found).
            start_time: ``time.perf_counter()`` reading taken when the
                lookup started.

        Returns:
            The trace, which has also been appended to the internal list.
        """
        trace = ResolutionTrace(
            pattern_name=pattern_name,
            resolved_source=pattern_def.source,
            resolved_value=pattern_def.regex,
            source_file=pattern_def.source_file,
            search_order=search_order,
            resolution_time_ms=(time.perf_counter() - start_time) * 1000
        )

        self._resolution_traces.append(trace)
        return trace

    def _load_into_cache(self, path: str, cache_key: str, errors: List[str]) -> bool:
        """Try to parse one candidate properties file into ``_file_cache``.

        Args:
            path: Candidate file location to read.
            cache_key: Key under which the parsed patterns are cached.
            errors: Collector that receives a description of any failure.

        Returns:
            True if the file was loaded and cached, False otherwise.
        """
        try:
            if Path(path).exists():
                parsed = self.file_manager.load_properties_file(path)
                self._file_cache[cache_key] = parsed.patterns
                return True
            errors.append(f"File not found: {path}")
        except PermissionError:
            errors.append(f"Permission denied: {path}")
        except FileNotFoundError:
            errors.append(f"File not found: {path}")
        except Exception as e:
            # Any other failure is recorded and the next candidate is tried.
            errors.append(f"Error reading {path}: {str(e)}")
        return False

    def _load_user_import_file(self, import_name: str) -> Dict[str, str]:
        """Load a user import file, trying several relative locations.

        Args:
            import_name: Import name without the ``.properties`` suffix.

        Returns:
            Pattern name to regex mapping of the first candidate that loads
            (or that is already cached).

        Raises:
            PatternImportError: If none of the candidate locations could be loaded.
        """
        # Try different possible paths for user imports
        possible_paths = [
            f"{import_name}.properties",
            f"patterns/{import_name}.properties",
            f"./{import_name}.properties"
        ]

        errors_encountered = []

        for path in possible_paths:
            if path in self._file_cache:
                return self._file_cache[path]
            if self._load_into_cache(path, path, errors_encountered):
                return self._file_cache[path]

        # Create detailed error message
        error_details = "; ".join(errors_encountered)
        raise PatternImportError(
            import_name,
            file_path=", ".join(possible_paths),
            reason=f"Tried multiple locations but failed: {error_details}"
        )

    def _load_user_global_file(self, global_path: str) -> Dict[str, str]:
        """Load the user global.properties file, falling back to default locations.

        The result is cached under ``global_path`` even when it was read from
        one of the fallback locations.

        Args:
            global_path: Preferred location of the user global file.

        Returns:
            Pattern name to regex mapping of the first candidate that loads.

        Raises:
            PatternImportError: If none of the candidate locations could be loaded.
        """
        if global_path in self._file_cache:
            return self._file_cache[global_path]

        # Try different possible paths for user global. dict.fromkeys drops
        # repeats while keeping order, so a global_path of "global.properties"
        # is not probed (and reported) twice.
        possible_paths = list(dict.fromkeys([
            global_path,
            "global.properties",
            "./global.properties",
            "patterns/global.properties"
        ]))

        errors_encountered = []

        for path in possible_paths:
            if self._load_into_cache(path, global_path, errors_encountered):
                return self._file_cache[global_path]

        # Create detailed error message
        error_details = "; ".join(errors_encountered)
        raise PatternImportError(
            "global",
            file_path=", ".join(possible_paths),
            reason=f"Tried multiple locations but failed: {error_details}"
        )

    def _load_core_file(self, file_path: Path) -> Dict[str, str]:
        """Load a core pattern file, caching the parsed patterns by path.

        Args:
            file_path: Location of the core properties file.

        Returns:
            Pattern name to regex mapping parsed from the file.

        Raises:
            PatternImportError: If the file is missing, unreadable, or fails
                to parse. The underlying exception is chained as the cause.
        """
        file_key = str(file_path)

        if file_key in self._file_cache:
            return self._file_cache[file_key]

        try:
            if not file_path.exists():
                raise FileNotFoundError(f"Core pattern file not found: {file_path}")

            parsed = self.file_manager.load_properties_file(str(file_path))
            self._file_cache[file_key] = parsed.patterns
            return parsed.patterns

        except PermissionError as exc:
            raise PatternImportError(
                file_path.stem,
                file_path=str(file_path),
                reason="Permission denied - check file permissions"
            ) from exc
        except FileNotFoundError as exc:
            raise PatternImportError(
                file_path.stem,
                file_path=str(file_path),
                reason="Core pattern file not found"
            ) from exc
        except Exception as exc:
            raise PatternImportError(
                file_path.stem,
                file_path=str(file_path),
                reason=f"Error reading core pattern file: {str(exc)}"
            ) from exc

    def _discover_core_non_global_files(self) -> List[Path]:
        """
        Discover all core non-global pattern files.

        Returns files in priority order:
        1. Regional files (us.properties, eu.properties, ca.properties, etc.)
        2. Domain-specific files (finance.properties, healthcare.properties, etc.)

        This ensures regional patterns are checked before domain-specific ones.
        Each group is sorted for consistent ordering. An empty list is
        returned when the core patterns directory does not exist.
        """
        if not self.core_patterns_path.exists():
            return []

        regional_files = []
        domain_files = []

        for file_path in self.core_patterns_path.glob("*.properties"):
            if file_path.name == "global.properties":
                continue

            if self._is_regional_stem(file_path.stem.lower()):
                regional_files.append(file_path)
            else:
                domain_files.append(file_path)

        # Return regional files first, then domain files
        return sorted(regional_files) + sorted(domain_files)

    def _is_polars_compatible(self, pattern: str) -> bool:
        """
        Test if a regex pattern is compatible with polars.

        The pattern is applied to a small sample series with
        ``str.contains``. Any exception counts as incompatible, including a
        failure to import polars.

        Args:
            pattern: Regex pattern to test

        Returns:
            True if pattern is polars-compatible, False otherwise
        """
        try:
            import polars as pl

            # Create a test series with some sample data
            test_data = ["test@example.com", "123-456-7890", "John Doe", "invalid", "123", "ABC"]
            test_series = pl.Series("test", test_data)

            # Try to use the pattern with polars
            test_series.str.contains(pattern)
            return True

        except Exception:
            # If any error occurs, consider it incompatible
            return False
|