additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
|
@@ -1,473 +0,0 @@
|
|
|
1
|
-
# additory/core/enhanced_matchers.py
|
|
2
|
-
|
|
3
|
-
"""
|
|
4
|
-
Enhanced Match Parameters System for Intelligent String Matching
|
|
5
|
-
|
|
6
|
-
This module provides the enhanced matching system for add.to() with case-insensitive
|
|
7
|
-
defaults and comprehensive matching strategies. It builds upon the existing matchers
|
|
8
|
-
but provides a more user-friendly interface with intelligent defaults.
|
|
9
|
-
|
|
10
|
-
New Match Parameter Design:
|
|
11
|
-
- "exact": Case-insensitive exact match (DEFAULT) - maps to "iexact"
|
|
12
|
-
- "exact_case": Case-sensitive exact match - maps to "exact"
|
|
13
|
-
- "contains": Case-insensitive substring matching - maps to "icontains"
|
|
14
|
-
- "contains_case": Case-sensitive substring matching - maps to "contains"
|
|
15
|
-
- "startswith": Case-insensitive prefix matching - maps to "ibeginswith"
|
|
16
|
-
- "startswith_case": Case-sensitive prefix matching - maps to "beginswith"
|
|
17
|
-
- "endswith": Case-insensitive suffix matching - maps to "iendswith"
|
|
18
|
-
- "endswith_case": Case-sensitive suffix matching - maps to "endswith"
|
|
19
|
-
- "regex": Regular expression matching (case-sensitive by default)
|
|
20
|
-
- "range": Numeric range matching
|
|
21
|
-
- "fuzzy": Fuzzy string matching with configurable threshold
|
|
22
|
-
|
|
23
|
-
Design Philosophy:
|
|
24
|
-
- Case-insensitive by default for real-world messy data
|
|
25
|
-
- Explicit "_case" suffix when case sensitivity is needed
|
|
26
|
-
- Backward compatibility with existing matchers
|
|
27
|
-
- Enhanced fuzzy matching with configurable parameters
|
|
28
|
-
- Comprehensive validation and helpful error messages
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
import logging
|
|
32
|
-
import re
|
|
33
|
-
from typing import List, Dict, Any, Tuple, Optional, Union
|
|
34
|
-
from dataclasses import dataclass
|
|
35
|
-
|
|
36
|
-
# Import existing matchers
|
|
37
|
-
from ..ops.matchers import (
|
|
38
|
-
match_exact, match_iexact, match_contains, match_icontains,
|
|
39
|
-
match_beginswith, match_ibeginswith, match_endswith, match_iendswith,
|
|
40
|
-
match_regex, match_numeric_range, match_fuzzy,
|
|
41
|
-
_safe_contains, _safe_startswith, _safe_endswith, _calculate_similarity
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
logger = logging.getLogger(__name__)
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
@dataclass
class MatchConfig:
    """Configuration record describing one enhanced match strategy.

    Instances are built by ``EnhancedMatcherSystem._initialize_match_configs``
    and keyed by strategy name.
    """

    # User-facing strategy name, e.g. "exact" or "contains_case".
    strategy: str
    # Whether the strategy distinguishes letter case. Reported back by
    # validate_match_strategy(); matcher dispatch is keyed on the strategy name.
    case_sensitive: bool
    # Minimum similarity score a candidate must reach for the "fuzzy" strategy.
    fuzzy_threshold: float = 0.8
    # NOTE(review): not read by any code visible in this file; presumably
    # intended as ``re`` flags for the "regex" strategy -- confirm before relying on it.
    regex_flags: int = 0
    # NOTE(review): not read by any code visible in this file; presumably
    # intended as slack for the "range" strategy -- confirm before relying on it.
    numeric_tolerance: float = 0.0
    # Human-readable one-line summary of the strategy.
    description: str = ""
    # Short usage examples; None means "no examples" (consumers use ``examples or []``).
    examples: Optional[List[str]] = None
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
class EnhancedMatcherSystem:
    """
    Enhanced matching system with case-insensitive defaults and comprehensive strategies.

    Resolves user-facing strategy names (for example "exact" or "contains_case")
    to matcher callables invoked as ``matcher(key, lookup)``, validates and
    describes those names, and counts how often each kind of matcher is called.
    """

    # Counter names, in reporting order. Every counter starts at zero.
    _STAT_KEYS = (
        'total_matches',
        'exact_matches',
        'case_insensitive_matches',
        'case_sensitive_matches',
        'fuzzy_matches',
        'regex_matches',
        'contains_matches',
        'prefix_suffix_matches',
        'range_matches',
    )

    # Counters (besides 'total_matches') bumped on each call of a wrapped matcher.
    _STAT_COUNTERS = {
        'exact': ('case_insensitive_matches', 'exact_matches'),
        'exact_case': ('case_sensitive_matches', 'exact_matches'),
        'contains': ('case_insensitive_matches', 'contains_matches'),
        'contains_case': ('case_sensitive_matches', 'contains_matches'),
        'startswith': ('case_insensitive_matches', 'prefix_suffix_matches'),
        'startswith_case': ('case_sensitive_matches', 'prefix_suffix_matches'),
        'endswith': ('case_insensitive_matches', 'prefix_suffix_matches'),
        'endswith_case': ('case_sensitive_matches', 'prefix_suffix_matches'),
        'fuzzy': ('fuzzy_matches',),
        'regex': ('regex_matches',),
        'range': ('range_matches',),
    }

    def __init__(self):
        self._match_configs = self._initialize_match_configs()
        self._match_stats = self._empty_stats()

    @classmethod
    def _empty_stats(cls) -> Dict[str, int]:
        """Return a fresh statistics dict with every counter set to zero."""
        return dict.fromkeys(cls._STAT_KEYS, 0)

    def get_matcher_function(self, match_strategy: str) -> callable:
        """
        Get the appropriate matcher function for the given strategy

        Args:
            match_strategy: Enhanced match strategy name

        Returns:
            Callable ``matcher(key, lookup)`` that also updates the statistics
            counters each time it is invoked

        Raises:
            ValueError: If the strategy is unknown or has no matcher implementation
        """
        # Validate strategy
        if match_strategy not in self._match_configs:
            available = list(self._match_configs.keys())
            raise ValueError(f"Unknown match strategy: '{match_strategy}'. Available: {available}")

        config = self._match_configs[match_strategy]

        if match_strategy == "fuzzy":
            # Only build the threshold closure when it is actually requested.
            matcher_func = self._create_fuzzy_matcher(config.fuzzy_threshold)
        else:
            # Map enhanced strategies to existing matcher functions
            strategy_mapping = {
                "exact": match_iexact,               # Case-insensitive by default
                "exact_case": match_exact,           # Case-sensitive when explicit
                "contains": match_icontains,         # Case-insensitive by default
                "contains_case": match_contains,     # Case-sensitive when explicit
                "startswith": match_ibeginswith,     # Case-insensitive by default
                "startswith_case": match_beginswith, # Case-sensitive when explicit
                "endswith": match_iendswith,         # Case-insensitive by default
                "endswith_case": match_endswith,     # Case-sensitive when explicit
                "regex": match_regex,                # Case-sensitive (standard regex behavior)
                "range": match_numeric_range,        # Numeric range matching
            }
            matcher_func = strategy_mapping.get(match_strategy)

        # Defensive: only reachable if a config exists without a mapping entry.
        if matcher_func is None:
            raise ValueError(f"No matcher implementation for strategy: '{match_strategy}'")

        # Wrap the matcher to collect statistics
        return self._wrap_matcher_with_stats(matcher_func, match_strategy)

    def _create_fuzzy_matcher(self, threshold: float = 0.8) -> callable:
        """Create a fuzzy matcher with configurable threshold

        The returned callable scores ``key`` against every key of ``lookup``
        with ``_calculate_similarity`` and collects the rows of each entry whose
        score is at least ``threshold``.
        """

        def fuzzy_matcher_with_threshold(key, lookup):
            """Fuzzy matching with configurable threshold"""
            matches = []
            for k, rows in lookup.items():
                if _calculate_similarity(key, k) >= threshold:
                    matches.extend(rows)
            return matches

        return fuzzy_matcher_with_threshold

    def _wrap_matcher_with_stats(self, matcher_func: callable, strategy: str) -> callable:
        """Wrap matcher function to collect statistics

        Counters are incremented once per call of the wrapped matcher (after the
        underlying matcher returns), regardless of how many rows it matched.
        """
        counters = self._STAT_COUNTERS.get(strategy, ())

        def wrapped_matcher(key, lookup):
            """Wrapped matcher that collects statistics"""
            matches = matcher_func(key, lookup)

            # Look the dict up on every call so reset_stats() takes effect for
            # matchers that were handed out before the reset.
            stats = self._match_stats
            stats['total_matches'] += 1
            for counter in counters:
                stats[counter] += 1

            return matches

        return wrapped_matcher

    def validate_match_strategy(self, strategy: str) -> Dict[str, Any]:
        """
        Validate match strategy and provide helpful information

        Args:
            strategy: Strategy name to check. Non-string values are reported as
                invalid instead of raising.

        Returns:
            Dict with validation results, description, and examples. For invalid
            names it carries a help listing in 'suggestions' and up to three
            substring-related names in 'similar_strategies'.
        """
        is_text = isinstance(strategy, str)

        result = {
            'valid': is_text and strategy in self._match_configs,
            'strategy': strategy,
            'config': None,
            'suggestions': [],
            'similar_strategies': []
        }

        if result['valid']:
            config = self._match_configs[strategy]
            result['config'] = {
                'description': config.description,
                'case_sensitive': config.case_sensitive,
                'examples': config.examples or [],
                'fuzzy_threshold': config.fuzzy_threshold if strategy == 'fuzzy' else None
            }
            return result

        # Provide suggestions for invalid strategies
        result['suggestions'] = [
            "Valid match strategies:",
            " Case-insensitive (default):",
            " - 'exact': Exact match, ignoring case",
            " - 'contains': Substring match, ignoring case",
            " - 'startswith': Prefix match, ignoring case",
            " - 'endswith': Suffix match, ignoring case",
            " Case-sensitive (explicit):",
            " - 'exact_case': Exact match, case-sensitive",
            " - 'contains_case': Substring match, case-sensitive",
            " - 'startswith_case': Prefix match, case-sensitive",
            " - 'endswith_case': Suffix match, case-sensitive",
            " Advanced:",
            " - 'fuzzy': Fuzzy string matching",
            " - 'regex': Regular expression matching",
            " - 'range': Numeric range matching"
        ]

        # Find similar strategies. An empty needle is a substring of every name,
        # so it must be skipped or it would "match" everything.
        needle = strategy.lower() if is_text else ""
        if needle:
            result['similar_strategies'] = [
                s for s in self._match_configs
                if needle in s.lower() or s.lower() in needle
            ][:3]  # Top 3 similar

        return result

    def get_match_examples(self, strategy: str) -> List[Dict[str, Any]]:
        """Get practical examples for a match strategy

        Returns an empty list for strategies that have no examples recorded here.
        """
        examples = {
            "exact": [
                {"target": "Apple", "reference": "APPLE", "matches": True, "reason": "Case-insensitive exact match"},
                {"target": "Apple", "reference": "Orange", "matches": False, "reason": "Different values"},
                {"target": "Apple Inc", "reference": "apple inc", "matches": True, "reason": "Case-insensitive exact match"}
            ],
            "exact_case": [
                {"target": "Apple", "reference": "APPLE", "matches": False, "reason": "Case-sensitive, different case"},
                {"target": "Apple", "reference": "Apple", "matches": True, "reason": "Exact case match"},
                {"target": "Apple", "reference": "apple", "matches": False, "reason": "Case-sensitive, different case"}
            ],
            "contains": [
                {"target": "laptop", "reference": "Gaming Laptop Pro", "matches": True, "reason": "Contains 'laptop' (case-insensitive)"},
                {"target": "MOUSE", "reference": "wireless mouse", "matches": True, "reason": "Contains 'mouse' (case-insensitive)"},
                {"target": "keyboard", "reference": "Monitor", "matches": False, "reason": "Does not contain 'keyboard'"}
            ],
            "contains_case": [
                {"target": "Laptop", "reference": "Gaming Laptop Pro", "matches": True, "reason": "Contains 'Laptop' (exact case)"},
                {"target": "laptop", "reference": "Gaming Laptop Pro", "matches": False, "reason": "Case-sensitive, different case"},
                {"target": "MOUSE", "reference": "wireless mouse", "matches": False, "reason": "Case-sensitive, different case"}
            ],
            "startswith": [
                {"target": "tech", "reference": "TechCorp Inc", "matches": True, "reason": "Starts with 'tech' (case-insensitive)"},
                {"target": "PROD", "reference": "product-001", "matches": True, "reason": "Starts with 'prod' (case-insensitive)"},
                {"target": "sales", "reference": "Marketing Dept", "matches": False, "reason": "Does not start with 'sales'"}
            ],
            "fuzzy": [
                {"target": "John Smith", "reference": "Jon Smith", "matches": True, "reason": "High similarity (typo tolerance)"},
                {"target": "TechCorp", "reference": "Tech Corp", "matches": True, "reason": "High similarity (spacing difference)"},
                {"target": "Apple", "reference": "Orange", "matches": False, "reason": "Low similarity, different words"}
            ],
            "regex": [
                {"target": r"P\d{3}", "reference": "P001", "matches": True, "reason": "Matches pattern P + 3 digits"},
                {"target": r"^[A-Z]{2}\d{4}$", "reference": "AB1234", "matches": True, "reason": "Matches 2 letters + 4 digits"},
                {"target": r"\d+", "reference": "Product123", "matches": True, "reason": "Contains digits"}
            ]
        }

        return examples.get(strategy, [])

    def _initialize_match_configs(self) -> Dict[str, "MatchConfig"]:
        """Initialize match strategy configurations"""
        return {
            "exact": MatchConfig(
                strategy="exact",
                case_sensitive=False,
                description="Case-insensitive exact match (default)",
                examples=["'Apple' matches 'APPLE', 'apple', 'Apple'"]
            ),
            "exact_case": MatchConfig(
                strategy="exact_case",
                case_sensitive=True,
                description="Case-sensitive exact match",
                examples=["'Apple' matches only 'Apple', not 'APPLE' or 'apple'"]
            ),
            "contains": MatchConfig(
                strategy="contains",
                case_sensitive=False,
                description="Case-insensitive substring matching",
                examples=["'laptop' matches 'Gaming Laptop Pro', 'LAPTOP-001'"]
            ),
            "contains_case": MatchConfig(
                strategy="contains_case",
                case_sensitive=True,
                description="Case-sensitive substring matching",
                examples=["'Laptop' matches 'Gaming Laptop Pro' but not 'gaming laptop pro'"]
            ),
            "startswith": MatchConfig(
                strategy="startswith",
                case_sensitive=False,
                description="Case-insensitive prefix matching",
                examples=["'tech' matches 'TechCorp', 'TECHNOLOGY', 'tech-support'"]
            ),
            "startswith_case": MatchConfig(
                strategy="startswith_case",
                case_sensitive=True,
                description="Case-sensitive prefix matching",
                examples=["'Tech' matches 'TechCorp' but not 'technology'"]
            ),
            "endswith": MatchConfig(
                strategy="endswith",
                case_sensitive=False,
                description="Case-insensitive suffix matching",
                examples=["'corp' matches 'TechCorp', 'RETAILCORP', 'my-corp'"]
            ),
            "endswith_case": MatchConfig(
                strategy="endswith_case",
                case_sensitive=True,
                description="Case-sensitive suffix matching",
                examples=["'Corp' matches 'TechCorp' but not 'techcorp'"]
            ),
            "fuzzy": MatchConfig(
                strategy="fuzzy",
                case_sensitive=False,
                fuzzy_threshold=0.8,
                description="Fuzzy string matching with similarity threshold",
                examples=["'John Smith' matches 'Jon Smith', 'John Smyth' (typo tolerance)"]
            ),
            "regex": MatchConfig(
                strategy="regex",
                case_sensitive=True,
                description="Regular expression pattern matching",
                examples=["r'P\\d{3}' matches 'P001', 'P123' (product codes)"]
            ),
            "range": MatchConfig(
                strategy="range",
                case_sensitive=False,
                description="Numeric range matching",
                examples=["(10, 50) matches values between 10 and 50 inclusive"]
            )
        }

    def get_strategy_recommendations(self, use_case: str) -> List[str]:
        """Get strategy recommendations based on use case

        The use case is matched case-insensitively; unknown use cases fall back
        to a generic ["exact", "contains", "fuzzy"] ordering.
        """
        recommendations = {
            "messy_data": ["exact", "fuzzy", "contains"],
            "clean_data": ["exact_case", "exact", "contains_case"],
            "product_codes": ["exact", "regex", "startswith"],
            "names": ["fuzzy", "exact", "contains"],
            "categories": ["exact", "contains", "startswith"],
            "ids": ["exact_case", "exact", "regex"],
            "text_search": ["contains", "fuzzy", "startswith"],
            "strict_matching": ["exact_case", "regex"],
            "flexible_matching": ["fuzzy", "contains", "exact"]
        }

        return recommendations.get(use_case.lower(), ["exact", "contains", "fuzzy"])

    def get_stats(self) -> Dict[str, Any]:
        """Get matching statistics (a copy; mutating it does not affect the counters)"""
        return self._match_stats.copy()

    def reset_stats(self):
        """Reset matching statistics"""
        self._match_stats = self._empty_stats()
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
# Module-level EnhancedMatcherSystem instance, created at import time and shared
# by the convenience functions in this module (so their statistics accumulate
# in one place).
_enhanced_matcher = EnhancedMatcherSystem()
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
# Convenience functions
|
|
376
|
-
def get_enhanced_matcher(strategy: str) -> callable:
    """Return the matcher callable for ``strategy`` from the shared matcher system.

    Thin wrapper around ``get_matcher_function`` on the module-level
    ``_enhanced_matcher`` instance; calls to the returned matcher are counted in
    that instance's statistics.
    """
    shared_system = _enhanced_matcher
    return shared_system.get_matcher_function(strategy)
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
def validate_enhanced_match_strategy(strategy: str) -> Dict[str, Any]:
    """Return the validation report for ``strategy`` from the shared matcher system.

    Thin wrapper around ``validate_match_strategy`` on the module-level
    ``_enhanced_matcher`` instance.
    """
    report = _enhanced_matcher.validate_match_strategy(strategy)
    return report
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
def get_enhanced_match_examples(strategy: str) -> List[Dict[str, Any]]:
    """Return the worked examples recorded for ``strategy``.

    Thin wrapper around ``get_match_examples`` on the module-level
    ``_enhanced_matcher`` instance.
    """
    worked_examples = _enhanced_matcher.get_match_examples(strategy)
    return worked_examples
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
def get_strategy_recommendations(use_case: str) -> List[str]:
    """Return the recommended strategy names for ``use_case``.

    Thin wrapper around the method of the same name on the module-level
    ``_enhanced_matcher`` instance.
    """
    suggested = _enhanced_matcher.get_strategy_recommendations(use_case)
    return suggested
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
def get_enhanced_match_stats() -> Dict[str, Any]:
    """Return the statistics counters of the shared matcher system.

    Thin wrapper around ``get_stats`` on the module-level ``_enhanced_matcher``
    instance.
    """
    counters = _enhanced_matcher.get_stats()
    return counters
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
# Enhanced matcher mapping for backward compatibility
|
|
402
|
-
ENHANCED_MATCHERS = {
    # Un-suffixed names are case-insensitive and map to the legacy "i*" matchers.
    "exact": "iexact",
    "contains": "icontains",
    "startswith": "ibeginswith",
    "endswith": "iendswith",

    # "_case" names are case-sensitive and map to the legacy un-prefixed matchers.
    "exact_case": "exact",
    "contains_case": "contains",
    "startswith_case": "beginswith",
    "endswith_case": "endswith",

    # Advanced strategies keep the same name in both schemes.
    "fuzzy": "fuzzy",
    "regex": "regex",
    "range": "range",
}
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
def map_enhanced_to_legacy_strategy(enhanced_strategy: str) -> str:
    """Translate an enhanced strategy name into its legacy matcher name.

    Names without an entry in ``ENHANCED_MATCHERS`` are returned unchanged.
    """
    try:
        return ENHANCED_MATCHERS[enhanced_strategy]
    except KeyError:
        return enhanced_strategy
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
# Demonstration and validation
|
|
428
|
-
def demonstrate_enhanced_matching():
    """Print a console walkthrough of several enhanced match strategies.

    For each scenario, resolves the matcher through ``get_enhanced_matcher``,
    runs it against a small lookup built from the candidate values, and prints
    a check or cross per candidate. Any exception raised while handling a
    scenario is printed and the demo moves on. Finishes by printing the
    statistics from ``get_enhanced_match_stats``.
    """
    print("Enhanced Match Parameters Demonstration")
    print("=" * 50)

    # Each scenario: (strategy name, target value, candidate reference values)
    scenarios = [
        ("exact", "Apple", ["APPLE", "apple", "Apple Inc"]),
        ("exact_case", "Apple", ["APPLE", "apple", "Apple"]),
        ("contains", "laptop", ["Gaming Laptop", "LAPTOP-001", "My Laptop"]),
        ("startswith", "tech", ["TechCorp", "TECHNOLOGY", "tech-support"]),
        ("fuzzy", "John Smith", ["Jon Smith", "John Smyth", "Jane Doe"])
    ]

    for strategy, target, candidates in scenarios:
        print(f"\nStrategy: {strategy}")
        print(f"Target: '{target}'")
        print("Candidates:")

        try:
            matcher = get_enhanced_matcher(strategy)

            # Lookup maps a one-element key tuple to the list of candidate positions.
            lookup = {}
            for position, candidate in enumerate(candidates):
                lookup[(candidate,)] = [position]

            matched_positions = matcher((target,), lookup)

            for position, candidate in enumerate(candidates):
                if position in matched_positions:
                    marker = "✅"
                else:
                    marker = "❌"
                print(f" {marker} '{candidate}'")

        except Exception as e:
            # Best-effort demo: report the failure and continue with the next scenario.
            print(f" Error: {e}")

    # Show statistics
    print(f"\nMatching Statistics:")
    for name, count in get_enhanced_match_stats().items():
        print(f" {name}: {count}")
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
# Run the console demonstration only when this module is executed as a script.
if __name__ == "__main__":
    demonstrate_enhanced_matching()
|