code2flow-toon 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code2flow/__init__.py +47 -0
- code2flow/__main__.py +6 -0
- code2flow/analysis/__init__.py +17 -0
- code2flow/analysis/call_graph.py +210 -0
- code2flow/analysis/cfg.py +293 -0
- code2flow/analysis/coupling.py +77 -0
- code2flow/analysis/data_analysis.py +249 -0
- code2flow/analysis/dfg.py +224 -0
- code2flow/analysis/smells.py +192 -0
- code2flow/cli.py +464 -0
- code2flow/core/__init__.py +36 -0
- code2flow/core/analyzer.py +765 -0
- code2flow/core/config.py +177 -0
- code2flow/core/models.py +194 -0
- code2flow/core/streaming_analyzer.py +666 -0
- code2flow/exporters/__init__.py +17 -0
- code2flow/exporters/base.py +13 -0
- code2flow/exporters/json_exporter.py +17 -0
- code2flow/exporters/llm_exporter.py +199 -0
- code2flow/exporters/mermaid_exporter.py +67 -0
- code2flow/exporters/toon.py +401 -0
- code2flow/exporters/yaml_exporter.py +108 -0
- code2flow/llm_flow_generator.py +451 -0
- code2flow/llm_task_generator.py +263 -0
- code2flow/mermaid_generator.py +481 -0
- code2flow/nlp/__init__.py +23 -0
- code2flow/nlp/config.py +174 -0
- code2flow/nlp/entity_resolution.py +326 -0
- code2flow/nlp/intent_matching.py +297 -0
- code2flow/nlp/normalization.py +122 -0
- code2flow/nlp/pipeline.py +388 -0
- code2flow/patterns/__init__.py +0 -0
- code2flow/patterns/detector.py +168 -0
- code2flow/refactor/__init__.py +0 -0
- code2flow/refactor/prompt_engine.py +150 -0
- code2flow/visualizers/__init__.py +0 -0
- code2flow/visualizers/graph.py +196 -0
- code2flow_toon-0.2.4.dist-info/METADATA +599 -0
- code2flow_toon-0.2.4.dist-info/RECORD +43 -0
- code2flow_toon-0.2.4.dist-info/WHEEL +5 -0
- code2flow_toon-0.2.4.dist-info/entry_points.txt +2 -0
- code2flow_toon-0.2.4.dist-info/licenses/LICENSE +201 -0
- code2flow_toon-0.2.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
"""Entity Resolution - Steps 3a-3e.
|
|
2
|
+
|
|
3
|
+
3a. Entity types to extract (function, class, module, variable, file)
|
|
4
|
+
3b. Name matching threshold
|
|
5
|
+
3c. Context-aware disambiguation
|
|
6
|
+
3d. Hierarchical resolution (class.method -> method)
|
|
7
|
+
3e. Alias resolution (short names -> qualified names)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import List, Dict, Optional, Set, Tuple
|
|
13
|
+
from difflib import SequenceMatcher
|
|
14
|
+
|
|
15
|
+
from .config import EntityResolutionConfig
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class Entity:
    """Resolved entity.

    Instances come from two places: codebase analysis (confidence 1.0,
    see EntityResolver.load_from_analysis) and heuristic matching against
    query text (confidence below 1.0).
    """
    # Short (unqualified) name, e.g. "method".
    name: str
    # Fully qualified dotted name, e.g. "module.Class.method".
    qualified_name: str
    entity_type: str  # function, class, module, variable, file
    # Match confidence; resolver steps may boost this in place (capped at 1.0).
    confidence: float
    # Path of the defining source file, when known.
    source_file: Optional[str] = None
    # Line number of the definition, when known.
    line_number: Optional[int] = None
    # Free-text context carried along during resolution, if any.
    context: Optional[str] = None
    # Alternative names this entity is known by (e.g. its dotted form).
    aliases: List[str] = field(default_factory=list)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class EntityResolutionResult:
    """Outcome of resolving entities for a single query."""
    query: str
    # All matched entities, possibly of mixed types.
    entities: List[Entity] = field(default_factory=list)
    # The entity chosen as the most likely referent, if any.
    primary_entity: Optional[Entity] = None
    # True when the top two matches are too close in confidence to choose.
    disambiguation_needed: bool = False

    def get_by_type(self, entity_type: str) -> List[Entity]:
        """Return only the matched entities whose type equals *entity_type*."""
        selected: List[Entity] = []
        for entity in self.entities:
            if entity.entity_type == entity_type:
                selected.append(entity)
        return selected

    def get_best_match(self) -> Optional[Entity]:
        """Return the entity with the highest confidence, or None if empty."""
        best: Optional[Entity] = None
        for entity in self.entities:
            # Strict comparison keeps the earliest entity on ties,
            # matching max()'s first-maximal behavior.
            if best is None or entity.confidence > best.confidence:
                best = entity
        return best
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class EntityResolver:
    """Resolve entities (functions, classes, etc.) from queries.

    Candidates come from two sources: entities previously loaded from
    codebase analysis (see load_from_analysis) and regex-based extraction
    from the query text itself.  Candidates are then filtered and refined
    through steps 3a-3e described in the module docstring.
    """

    def __init__(
        self,
        config: Optional[EntityResolutionConfig] = None,
        codebase_entities: Optional[Dict[str, List[Entity]]] = None
    ):
        self.config = config or EntityResolutionConfig()
        # Maps entity type ("function", "class", ...) to known entities.
        # Starts empty; populated via load_from_analysis() or the argument.
        self.codebase_entities = codebase_entities or {}

    def resolve(
        self,
        query: str,
        context: Optional[str] = None,
        expected_types: Optional[List[str]] = None
    ) -> EntityResolutionResult:
        """Resolve entities from query (steps 3a-3e).

        Args:
            query: Query text that may mention code entities.
            context: Optional surrounding text used for disambiguation.
            expected_types: Entity types to consider; defaults to the
                configured ``entity_types``.

        Returns:
            EntityResolutionResult whose entities are sorted by confidence
            (highest first) and whose primary_entity is the best match.
        """
        result = EntityResolutionResult(query=query)

        # Determine expected entity types
        if expected_types is None:
            expected_types = self.config.entity_types

        # 3a. Extract candidate entities by type
        candidates = []
        for entity_type in expected_types:
            candidates.extend(self._extract_candidates(query, entity_type))

        # 3b. Match with threshold
        threshold = self.config.name_match_threshold
        matches = [c for c in candidates if c.confidence >= threshold]

        # 3c. Context-aware disambiguation
        if context and self.config.context_disambiguation:
            matches = self._disambiguate(matches, context)

        # 3d. Hierarchical resolution
        if self.config.hierarchical_resolution:
            matches = self._resolve_hierarchical(matches)

        # 3e. Alias resolution
        if self.config.alias_resolution:
            matches = self._resolve_aliases(matches)

        # FIX: the disambiguation check and primary-entity selection below
        # assume confidence-descending order, but earlier steps only sort
        # when _disambiguate() runs (context given and enabled), and
        # _resolve_hierarchical() can interleave new entries.  Sort
        # explicitly so matches[0] is always the highest-confidence entity.
        matches.sort(key=lambda e: e.confidence, reverse=True)
        result.entities = matches

        # Flag ambiguity when the two strongest matches are nearly tied.
        if len(matches) > 1:
            top_confidences = [m.confidence for m in matches[:2]]
            if abs(top_confidences[0] - top_confidences[1]) < 0.1:
                result.disambiguation_needed = True

        # Set primary entity
        if matches:
            result.primary_entity = matches[0]

        return result

    def _extract_candidates(self, query: str, entity_type: str) -> List[Entity]:
        """3a. Extract candidate entities of given type from query.

        Combines similarity matches against known codebase entities with
        regex-pattern extraction from the raw query text.
        """
        candidates = []

        # Known entities of this type (empty list if none were loaded).
        type_entities = self.codebase_entities.get(entity_type, [])

        for entity in type_entities:
            similarity = self._name_similarity(query, entity.name)

            if similarity > 0.5:  # Minimum threshold for candidacy
                candidates.append(Entity(
                    name=entity.name,
                    qualified_name=entity.qualified_name,
                    entity_type=entity_type,
                    confidence=similarity,
                    source_file=entity.source_file,
                    line_number=entity.line_number,
                ))

        # Also extract potential entities from query text patterns.
        candidates.extend(self._extract_from_patterns(query, entity_type))

        return candidates

    def _extract_from_patterns(self, query: str, entity_type: str) -> List[Entity]:
        """Extract entities using regex patterns (English and Polish cues)."""
        candidates = []

        if entity_type == "function":
            # Match patterns like: function_name(), call function_name, etc.
            patterns = [
                r'\b(\w+)\s*\(',  # function_name(
                r'function\s+(\w+)',
                r'call\s+(\w+)',
                r'wywołaj\s+(\w+)',  # Polish
            ]
        elif entity_type == "class":
            patterns = [
                r'class\s+(\w+)',
                r'klasa\s+(\w+)',  # Polish
                r'(\w+)\s*\.\s*\w+\s*\(',  # ClassName.method()
            ]
        elif entity_type == "file":
            patterns = [
                r'(\w+\.py)\b',
                r'file\s+(\w+)',
                r'plik\s+(\w+)',  # Polish
            ]
        else:
            # Fallback: every word is a (weak) candidate name.
            patterns = [r'\b(\w+)\b']

        for pattern in patterns:
            for match in re.finditer(pattern, query, re.IGNORECASE):
                name = match.group(1)
                candidates.append(Entity(
                    name=name,
                    qualified_name=name,  # no qualification known from text alone
                    entity_type=entity_type,
                    confidence=0.7,  # Pattern-based confidence
                ))

        return candidates

    def _disambiguate(
        self,
        candidates: List[Entity],
        context: str
    ) -> List[Entity]:
        """3c. Disambiguate entities using context.

        Boosts (in place) the confidence of candidates whose name or
        defining file appears in the context, then re-sorts by confidence.
        """
        if not candidates:
            return candidates

        context_lower = context.lower()

        for candidate in candidates:
            # Entity name mentioned directly in the context.
            if candidate.name.lower() in context_lower:
                candidate.confidence = min(1.0, candidate.confidence + 0.15)

            # Defining file mentioned in the context.
            if candidate.source_file:
                file_name = candidate.source_file.split('/')[-1].lower()
                if file_name in context_lower:
                    candidate.confidence = min(1.0, candidate.confidence + 0.1)

        # Re-sort by updated confidence
        return sorted(candidates, key=lambda e: e.confidence, reverse=True)

    def _resolve_hierarchical(self, candidates: List[Entity]) -> List[Entity]:
        """3d. Resolve hierarchical names (Class.method -> method).

        For every dotted candidate, adds a short-name variant (keeping the
        dotted form as an alias) alongside the original candidate.
        """
        resolved = []

        for candidate in candidates:
            if '.' in candidate.name:
                short_name = candidate.name.split('.')[-1]
                resolved.append(Entity(
                    name=short_name,
                    qualified_name=candidate.qualified_name,
                    entity_type=candidate.entity_type,
                    confidence=candidate.confidence,
                    source_file=candidate.source_file,
                    line_number=candidate.line_number,
                    context=candidate.context,
                    aliases=candidate.aliases + [candidate.name]
                ))
                # Also keep the original dotted candidate.
                resolved.append(candidate)
            else:
                resolved.append(candidate)

        return resolved

    def _resolve_aliases(self, candidates: List[Entity]) -> List[Entity]:
        """3e. Resolve aliases to canonical names.

        NOTE(review): only candidates that already carry aliases are
        canonicalized; alias-free candidates pass through unchanged.
        """
        resolved = []

        for candidate in candidates:
            if candidate.aliases:
                # Prefer the (longer) qualified name as canonical, demoting
                # the short name to an alias.
                if len(candidate.qualified_name) > len(candidate.name):
                    candidate.aliases.append(candidate.name)
                    candidate.name = candidate.qualified_name

            resolved.append(candidate)

        return resolved

    def _name_similarity(self, query: str, name: str) -> float:
        """Calculate similarity between query and entity name.

        Substring containment (either direction) scores higher than the
        generic fuzzy ratio.
        """
        # Direct containment of the name in the query.
        if name.lower() in query.lower():
            return 0.95

        if query.lower() in name.lower():
            return 0.9

        # Fuzzy match
        return SequenceMatcher(None, query.lower(), name.lower()).ratio()

    def load_from_analysis(self, analysis_result) -> None:
        """Load entities from a code analysis result.

        Expects ``analysis_result`` to expose ``functions``, ``classes``
        and ``modules`` dicts whose values carry name/qualified_name/file
        (and, for functions/classes, line) attributes — TODO confirm
        against the analyzer's result type.  Replaces any previously
        loaded entities.
        """
        self.codebase_entities = {
            "function": [],
            "class": [],
            "module": [],
        }

        # Load functions (confidence 1.0: known from analysis, not guessed).
        for func_info in analysis_result.functions.values():
            self.codebase_entities["function"].append(Entity(
                name=func_info.name,
                qualified_name=func_info.qualified_name,
                entity_type="function",
                confidence=1.0,
                source_file=func_info.file,
                line_number=func_info.line,
            ))

        # Load classes
        for class_info in analysis_result.classes.values():
            self.codebase_entities["class"].append(Entity(
                name=class_info.name,
                qualified_name=class_info.qualified_name,
                entity_type="class",
                confidence=1.0,
                source_file=class_info.file,
                line_number=class_info.line,
            ))

        # Load modules (no separate qualified name or line number).
        for mod_info in analysis_result.modules.values():
            self.codebase_entities["module"].append(Entity(
                name=mod_info.name,
                qualified_name=mod_info.name,
                entity_type="module",
                confidence=1.0,
                source_file=mod_info.file,
            ))

    # Individual step methods (thin public wrappers for each sub-step).
    def step_3a_extract_entities(self, query: str, entity_type: str) -> List[Entity]:
        """Step 3a: Extract entities by type."""
        return self._extract_candidates(query, entity_type)

    def step_3b_match_threshold(self, candidates: List[Entity]) -> List[Entity]:
        """Step 3b: Apply name matching threshold."""
        threshold = self.config.name_match_threshold
        return [c for c in candidates if c.confidence >= threshold]

    def step_3c_disambiguate(self, candidates: List[Entity], context: str) -> List[Entity]:
        """Step 3c: Context-aware disambiguation."""
        return self._disambiguate(candidates, context)

    def step_3d_hierarchical_resolve(self, candidates: List[Entity]) -> List[Entity]:
        """Step 3d: Resolve hierarchical names."""
        return self._resolve_hierarchical(candidates)

    def step_3e_alias_resolve(self, candidates: List[Entity]) -> List[Entity]:
        """Step 3e: Resolve aliases."""
        return self._resolve_aliases(candidates)
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""Intent Matching - Steps 2a-2e.
|
|
2
|
+
|
|
3
|
+
2a. Fuzzy matching threshold (0.0-1.0)
|
|
4
|
+
2b. Semantic similarity threshold
|
|
5
|
+
2c. Keyword matching weight
|
|
6
|
+
2d. Context window size
|
|
7
|
+
2e. Multi-intent resolution strategy
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import List, Dict, Optional, Tuple
|
|
13
|
+
from difflib import SequenceMatcher
|
|
14
|
+
|
|
15
|
+
from .config import IntentMatchingConfig
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class IntentMatch:
    """Single intent match result."""
    # Intent identifier, e.g. "find_function"; may be a "+"-joined pair
    # when produced by the "combine" resolution strategy.
    intent: str
    # Match confidence; context scoring may boost this (capped at 1.0).
    confidence: float
    # The intent phrase that produced this match.
    matched_phrase: str
    match_type: str  # exact, fuzzy, keyword, semantic
    # Boost contributed by conversation context (0.0 when no context given).
    context_score: float = 0.0
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class IntentMatchingResult:
    """Outcome of matching a query against the known intents."""
    query: str
    # Winning match after multi-intent resolution, if any.
    primary_intent: Optional[IntentMatch] = None
    # Deduplicated matches, strongest first.
    all_matches: List[IntentMatch] = field(default_factory=list)
    strategy_used: str = "best_match"

    def get_best_intent(self) -> Optional[str]:
        """Name of the winning intent, or None when nothing matched."""
        primary = self.primary_intent
        return primary.intent if primary else None

    def get_confidence(self) -> float:
        """Confidence of the winning intent; 0.0 when nothing matched."""
        primary = self.primary_intent
        return primary.confidence if primary else 0.0
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class IntentMatcher:
    """Match queries to intents using fuzzy and keyword matching."""

    # Built-in intent vocabulary: intent id -> trigger phrases
    # (English and Polish).
    DEFAULT_INTENTS = {
        "find_function": [
            "find function", "lookup function", "search function",
            "znajdź funkcję", "szukaj funkcji", "gdzie jest funkcja"
        ],
        "find_class": [
            "find class", "lookup class", "search class",
            "znajdź klasę", "szukaj klasy", "gdzie jest klasa"
        ],
        "analyze_flow": [
            "analyze flow", "show flow", "trace flow",
            "analizuj przepływ", "pokaż przepływ", "śledź przepływ"
        ],
        "show_call_graph": [
            "show call graph", "display call graph", "call graph",
            "pokaż graf wywołań", "graf wywołań"
        ],
        "find_dependencies": [
            "find dependencies", "show dependencies", "what depends on",
            "znajdź zależności", "pokaż zależności"
        ],
        "explain_code": [
            "explain code", "explain function", "what does",
            "wyjaśnij kod", "wyjaśnij funkcję", "co robi"
        ],
    }

    def __init__(
        self,
        config: Optional[IntentMatchingConfig] = None,
        intents: Optional[Dict[str, List[str]]] = None
    ):
        self.config = config or IntentMatchingConfig()
        # Custom intent vocabulary may replace the built-in one wholesale.
        self.intents = intents or self.DEFAULT_INTENTS

    def match(
        self,
        query: str,
        context: Optional[List[str]] = None
    ) -> IntentMatchingResult:
        """Match query to intents (steps 2a-2e)."""
        outcome = IntentMatchingResult(query=query)

        # 2a: fuzzy phrase matching; 2c: keyword-overlap matching.
        candidates = self._fuzzy_match(query) + self._keyword_match(query)

        # 2d: boost candidates whose intent vocabulary appears in recent
        # context (mutates the match objects in place).
        if context:
            self._apply_context(query, candidates, context)

        # Deduplicate per intent, keeping the strongest candidate.
        outcome.all_matches = self._combine_matches(candidates)

        # 2e: pick (or synthesize) the primary intent.
        outcome.primary_intent = self._resolve_multi_intent(outcome.all_matches)
        outcome.strategy_used = self.config.multi_intent_strategy

        return outcome

    def _fuzzy_match(self, query: str) -> List[IntentMatch]:
        """2a. Fuzzy matching with configurable threshold."""
        results: List[IntentMatch] = []
        min_score = self.config.fuzzy_threshold

        for intent, phrases in self.intents.items():
            for phrase in phrases:
                score = self._calculate_similarity(query, phrase)
                if score < min_score:
                    continue
                results.append(IntentMatch(
                    intent=intent,
                    confidence=score,
                    matched_phrase=phrase,
                    match_type="fuzzy"
                ))

        return results

    def _keyword_match(self, query: str) -> List[IntentMatch]:
        """2c. Keyword matching with configurable weight."""
        query_words = set(query.lower().split())

        def jaccard(phrase: str) -> float:
            # |intersection| / |union| over lowercase word sets.
            phrase_words = set(phrase.lower().split())
            union = query_words | phrase_words
            if not union:
                return 0.0
            return len(query_words & phrase_words) / len(union)

        results: List[IntentMatch] = []
        for intent, phrases in self.intents.items():
            best_phrase, best_score = "", 0.0
            for phrase in phrases:
                score = jaccard(phrase)
                # Strict '>' keeps the earliest phrase on ties.
                if score > best_score:
                    best_phrase, best_score = phrase, score

            # Apply the configured keyword weight before reporting.
            weighted = best_score * self.config.keyword_weight
            if weighted > 0:
                results.append(IntentMatch(
                    intent=intent,
                    confidence=weighted,
                    matched_phrase=best_phrase,
                    match_type="keyword"
                ))

        return results

    def _apply_context(
        self,
        query: str,
        matches: List[IntentMatch],
        context: List[str]
    ) -> None:
        """2d. Apply context window scoring (mutates matches in place)."""
        recent = ' '.join(context[-self.config.context_window:]).lower()

        for candidate in matches:
            # Each intent-vocabulary word found in the recent context
            # contributes a small boost.
            bonus = 0.0
            for phrase in self.intents.get(candidate.intent, []):
                for token in phrase.lower().split():
                    if token in recent:
                        bonus += 0.1

            candidate.context_score = min(bonus, 0.5)  # cap the boost
            candidate.confidence = min(1.0, candidate.confidence + candidate.context_score)

    def _combine_matches(self, matches: List[IntentMatch]) -> List[IntentMatch]:
        """Combine and deduplicate matches, keeping highest confidence per intent."""
        strongest: Dict[str, IntentMatch] = {}

        for candidate in matches:
            current = strongest.get(candidate.intent)
            # Strict '>' keeps the earlier match on ties, like the
            # original not-in / elif-greater pair.
            if current is None or candidate.confidence > current.confidence:
                strongest[candidate.intent] = candidate

        # Strongest first.
        return sorted(strongest.values(), key=lambda m: m.confidence, reverse=True)

    def _resolve_multi_intent(self, matches: List[IntentMatch]) -> Optional[IntentMatch]:
        """2e. Multi-intent resolution strategy."""
        if not matches:
            return None

        top = matches[0]

        if self.config.multi_intent_strategy == "combine" and len(matches) >= 2:
            runner_up = matches[1]
            # Merge the two leaders only when they are nearly tied.
            if top.confidence - runner_up.confidence < 0.1:
                return IntentMatch(
                    intent=f"{top.intent}+{runner_up.intent}",
                    confidence=(top.confidence + runner_up.confidence) / 2,
                    matched_phrase=f"{top.matched_phrase} | {runner_up.matched_phrase}",
                    match_type="combined"
                )

        # "best_match", "sequential" and any unknown strategy all fall
        # back to the single strongest match.
        return top

    def _calculate_similarity(self, a: str, b: str) -> float:
        """Calculate string similarity using configured algorithm."""
        left, right = a.lower(), b.lower()
        algorithm = self.config.fuzzy_algorithm

        if algorithm == "partial_ratio":
            # Substring containment counts as a near-match.
            if left in right or right in left:
                return 0.9
            return SequenceMatcher(None, left, right).ratio()

        if algorithm == "token_sort_ratio":
            # Word order should not matter: compare sorted token strings.
            return SequenceMatcher(
                None,
                ' '.join(sorted(left.split())),
                ' '.join(sorted(right.split())),
            ).ratio()

        # "ratio" and any unknown algorithm: plain sequence ratio.
        return SequenceMatcher(None, left, right).ratio()

    # Individual step methods (thin public wrappers for each sub-step).
    def step_2a_fuzzy_match(self, query: str, phrase: str) -> float:
        """Step 2a: Calculate fuzzy match score."""
        return self._calculate_similarity(query, phrase)

    def step_2b_semantic_match(self, query: str, phrase: str) -> float:
        """Step 2b: Semantic similarity (placeholder for embeddings)."""
        # Placeholder - would use sentence embeddings in production.
        return self._calculate_similarity(query, phrase)

    def step_2c_keyword_match(self, query: str, phrase: str) -> float:
        """Step 2c: Keyword matching score (query coverage of the phrase)."""
        phrase_words = set(phrase.lower().split())
        if not phrase_words:
            return 0.0
        overlap = set(query.lower().split()) & phrase_words
        return len(overlap) / len(phrase_words)

    def step_2d_context_score(self, query: str, context: List[str]) -> float:
        """Step 2d: Calculate context relevance score."""
        window = ' '.join(context[-self.config.context_window:]).lower()
        score = sum(0.1 for token in query.lower().split() if token in window)
        return min(score, 0.5)

    def step_2e_resolve_intents(self, matches: List[IntentMatch]) -> Optional[IntentMatch]:
        """Step 2e: Resolve multiple intent matches."""
        return self._resolve_multi_intent(matches)
|