stravinsky 0.2.67__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of stravinsky might be problematic. Click here for more details.

@@ -0,0 +1,323 @@
1
+ """Query classifier for intelligent search routing.
2
+
3
+ This module provides a fast, regex-based system that categorizes search queries
4
+ into four types: PATTERN (exact text matching), STRUCTURAL (AST-aware code structure),
5
+ SEMANTIC (conceptual/behavioral), and HYBRID (multi-modal).
6
+
7
+ It enables intelligent routing to the optimal search tool without LLM overhead.
8
+
9
+ Design Goals:
10
+ - Fast: <10ms classification per query
11
+ - No LLM calls: Pure regex-based detection (no API overhead)
12
+ - Confidence scoring: Return probability (0.0-1.0) for each category
13
+ - Fallback safe: Default to HYBRID when ambiguous
14
+ - Extensible: Easy to add new patterns/indicators
15
+ """
16
+
17
+ import logging
18
+ import re
19
+ from dataclasses import dataclass
20
+ from enum import Enum
21
+ from typing import Literal
22
+
23
+ # Module-level logger
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class QueryCategory(Enum):
    """Query classification categories.

    Each member's value is the lowercase string form of its name.
    """

    SEMANTIC = "semantic"  # Conceptual, "what it does" queries
    PATTERN = "pattern"  # Exact text/regex matching
    STRUCTURAL = "structural"  # AST-aware code structure queries
    HYBRID = "hybrid"  # Multi-modal search recommended
34
+
35
+
36
@dataclass
class QueryClassification:
    """Result of query classification.

    Attributes:
        category: The classified query category (SEMANTIC, PATTERN,
            STRUCTURAL, HYBRID).
        confidence: Confidence score from 0.0 (low) to 1.0 (high).
        indicators: List of matched patterns/reasons that led to this
            classification. classify_query() fills it with category-level
            labels ("pattern_match", "structural_match", "semantic_match",
            "hybrid_match") or a single sentinel ("invalid_input",
            "too_short", "error").
        suggested_tool: The recommended search tool to use:
            - "grep_search" for PATTERN queries
            - "ast_grep_search" for STRUCTURAL queries
            - "semantic_search" for SEMANTIC queries
            - "enhanced_search" for HYBRID queries
        reasoning: Human-readable explanation of the classification.
    """

    category: QueryCategory
    confidence: float  # 0.0-1.0
    indicators: list[str]  # Matched patterns/reasons
    suggested_tool: Literal[
        "semantic_search", "grep_search", "ast_grep_search", "enhanced_search"
    ]
    reasoning: str  # Human-readable explanation
59
+
60
+
61
# Phase 1: Exact Pattern Detection
# Triggered when the query contains quoted identifiers, call syntax, dotted
# names, file paths, slash-delimited text, or uppercase constant names.
# NOTE: the last entry is case-sensitive and can only match text whose
# original casing has been preserved.
PATTERN_INDICATORS = [
    r'["\'][\w_]+["\']',  # Quoted bare identifiers like "authenticate" or 'API_KEY'
    r'\b\w+\(\)',  # Empty-paren calls like authenticate()
    r'[\w_]+\.[\w_]+',  # Dot notation (Class.method) like database.query
    r'[\w/]+\.\w{2,4}$',  # File path with a 2-4 char extension at end of query
    r'/.*?/',  # Slash-delimited text such as /regex/ (any two slashes match)
    r'\b[A-Z_]{4,}\b',  # CONSTANT_NAMES (4+ uppercase letters/underscores)
]

# Phase 2: Structural Detection
# Triggered when the query contains AST keywords, structural relationships,
# decorator references, or code structure terms. Alternatives match only the
# base word and its optional suffix (e.g. "inherit"/"inherits", not
# "inheriting"; "definition", not "definitions").
STRUCTURAL_INDICATORS = [
    r'\b(class|function|method|async|interface)\b',  # AST keywords
    r'\b(inherits?|extends?|implements?|overrides?)\b',  # Structural relationships
    r'\b(decorated?)\s+(with|by)\b',  # Decorator patterns
    r'\@\w+',  # Decorator syntax
    r'\b(definition|declaration|signature)\b',  # Code structure terms
]

# Phase 3: Conceptual Detection
# Triggered when the query contains how/why/where questions, intent verbs,
# conceptual nouns, design-pattern wording, or cross-cutting concerns.
# Intent verbs match the base and "-s" forms only (e.g. "handle"/"handles",
# not "handled" or "handling").
SEMANTIC_INDICATORS = [
    r'\b(how|why|where)\s+(does|is|are)',  # How/why/where questions
    r'\b(handles?|manages?|processes?|validates?|transforms?)\b',  # Intent verbs
    r'\b(logic|mechanism|strategy|approach|workflow|implementation)\b',  # Conceptual nouns
    r'\b(pattern|anti-pattern)\b',  # Design patterns
    r'\b(authentication|authorization|caching|logging|error handling)\b',  # Cross-cutting
    r'\bfind\s+(all\s+)?(code|places|instances|implementations)\s+that\b',  # Find code pattern
]

# Phase 4: Hybrid Detection
# Triggered when the query contains conjunctions, broad scopes, vague
# qualifiers, or broad quantifiers.
HYBRID_INDICATORS = [
    r'\s+(and|then|also|plus|with)\s+',  # Conjunctions
    r'\b(across|throughout|in all|system-wide)\b',  # Broad scopes
    r'\b(similar|related|like|kind of|type of)\b',  # Vague qualifiers
    r'\b(all|every|any)\s+\w+\s+(that|which|where)\b',  # Broad quantifiers
]
105
+
106
# Tool routing: maps each query category to the search tool suggested for it.
TOOL_ROUTING = {
    QueryCategory.PATTERN: "grep_search",
    QueryCategory.STRUCTURAL: "ast_grep_search",
    QueryCategory.SEMANTIC: "semantic_search",
    QueryCategory.HYBRID: "enhanced_search",
}
113
+
114
+
115
def classify_query(query: str) -> QueryClassification:
    """Classify a search query into one of four categories.

    The query is matched against the module-level regex indicator lists and
    each category receives a score of (number of matching indicators) x
    (per-match weight). The highest-scoring category wins.

    Scoring phases:
        1. Pattern detection (0.15 per match): exact identifiers, quoted
           strings, file paths. Matched against the case-preserved query so
           that the uppercase CONSTANT_NAME indicator can match.
        2. Structural detection (0.20 per match): AST keywords (class,
           function, etc.). Matched against the lowercased query.
        3. Semantic detection (0.15 per match): intent verbs and conceptual
           terms. Matched against the lowercased query.
        4. Hybrid detection (0.10 per match): conjunctions and broad scopes.
           Matched against the lowercased query.

    Fallbacks (all return HYBRID / "enhanced_search" with confidence 0.5):
        - query is not a string or is empty -> indicators ["invalid_input"]
        - stripped query is shorter than 3 characters -> ["too_short"]
        - no indicator matched at all -> []
        - any exception raised during classification -> ["error"]

    If two or more categories share the top score, the result is HYBRID with
    that score as its confidence.

    Args:
        query: Natural language search query (e.g., "Find authenticate()" or
            "Where is authentication handled?").

    Returns:
        QueryClassification object containing:
            - category: One of SEMANTIC, PATTERN, STRUCTURAL, HYBRID
            - confidence: The winning score, capped at 0.95
            - indicators: Category-level labels for every list that matched
              ("pattern_match", "structural_match", "semantic_match",
              "hybrid_match"), or a fallback sentinel as listed above
            - suggested_tool: TOOL_ROUTING entry for the category
            - reasoning: Human-readable explanation

    Examples:
        >>> result = classify_query("Find all calls to authenticate()")
        >>> result.category
        <QueryCategory.PATTERN: 'pattern'>
        >>> result.confidence
        0.15
        >>> result.suggested_tool
        'grep_search'

        >>> result = classify_query("Where is authentication handled?")
        >>> result.category
        <QueryCategory.SEMANTIC: 'semantic'>
        >>> result.confidence
        0.3
        >>> result.suggested_tool
        'semantic_search'

        >>> result = classify_query("Find class definitions inheriting from Base")
        >>> result.category
        <QueryCategory.STRUCTURAL: 'structural'>
        >>> result.confidence
        0.2
        >>> result.suggested_tool
        'ast_grep_search'

    Performance:
        - Uses only the stdlib ``re`` module; no external calls.
    """
    # Scores are sums of decimal fractions; products such as 3 * 0.20 and
    # 4 * 0.15 differ in the last bit, so ties are detected with a tolerance.
    tie_tolerance = 1e-9

    try:
        # Check the type first so truthiness is only evaluated on real strings.
        if not isinstance(query, str) or not query:
            return QueryClassification(
                category=QueryCategory.HYBRID,
                confidence=0.5,
                indicators=["invalid_input"],
                suggested_tool="enhanced_search",
                reasoning="Invalid or empty query, using safe default",
            )

        query_normalized = query.strip()
        if len(query_normalized) < 3:
            return QueryClassification(
                category=QueryCategory.HYBRID,
                confidence=0.5,
                indicators=["too_short"],
                suggested_tool="enhanced_search",
                reasoning="Query too short for accurate classification",
            )

        query_lower = query_normalized.lower()

        # Phase 1: pattern indicators keep the original casing; the
        # CONSTANT_NAME regex is uppercase-only and would never match a
        # lowercased query.
        pattern_matches = [
            p for p in PATTERN_INDICATORS if re.search(p, query_normalized)
        ]
        # Phases 2-4: keyword indicators are written in lowercase.
        structural_matches = [
            p for p in STRUCTURAL_INDICATORS if re.search(p, query_lower)
        ]
        semantic_matches = [
            p for p in SEMANTIC_INDICATORS if re.search(p, query_lower)
        ]
        hybrid_matches = [
            p for p in HYBRID_INDICATORS if re.search(p, query_lower)
        ]

        scores = {
            QueryCategory.PATTERN: len(pattern_matches) * 0.15,
            QueryCategory.STRUCTURAL: len(structural_matches) * 0.20,
            QueryCategory.SEMANTIC: len(semantic_matches) * 0.15,
            QueryCategory.HYBRID: len(hybrid_matches) * 0.10,
        }
        max_score = max(scores.values())

        if max_score == 0:
            # Nothing matched in any list: fall back to multi-modal search.
            result = QueryClassification(
                category=QueryCategory.HYBRID,
                confidence=0.5,
                indicators=[],
                suggested_tool="enhanced_search",
                reasoning="No clear indicators found, using multi-modal search",
            )
        else:
            winners = [
                cat
                for cat, score in scores.items()
                if abs(score - max_score) < tie_tolerance
            ]
            # A shared top score is treated as ambiguous -> HYBRID.
            category = winners[0] if len(winners) == 1 else QueryCategory.HYBRID
            confidence = min(max_score, 0.95)

            # Report which indicator lists fired, not the individual regexes.
            all_indicators = [
                label
                for label, matches in (
                    ("pattern_match", pattern_matches),
                    ("structural_match", structural_matches),
                    ("semantic_match", semantic_matches),
                    ("hybrid_match", hybrid_matches),
                )
                if matches
            ]

            reasoning_by_category = {
                QueryCategory.PATTERN: (
                    "Query contains exact identifiers or code syntax"
                ),
                QueryCategory.STRUCTURAL: (
                    "Query requires AST-level understanding of code structure"
                ),
                QueryCategory.SEMANTIC: (
                    "Query asks about conceptual logic or behavior"
                ),
                QueryCategory.HYBRID: (
                    "Query combines multiple search approaches or is ambiguous"
                ),
            }

            result = QueryClassification(
                category=category,
                confidence=confidence,
                indicators=all_indicators,
                suggested_tool=TOOL_ROUTING[category],
                reasoning=reasoning_by_category[category],
            )

        # Log classification for analytics (lazy formatting: the message is
        # only built when DEBUG is enabled).
        logger.debug(
            "QUERY-CLASSIFY: query='%s...' category=%s confidence=%.2f tool=%s",
            query_normalized[:50],
            result.category.value,
            result.confidence,
            result.suggested_tool,
        )
        return result

    except Exception as e:
        # Deliberate best-effort boundary: classification must never raise,
        # so any failure is logged and routed to the safe default.
        logger.exception("Error classifying query: %s", e)
        return QueryClassification(
            category=QueryCategory.HYBRID,
            confidence=0.5,
            indicators=["error"],
            suggested_tool="enhanced_search",
            reasoning=f"Classification error: {e}, using safe default",
        )