claude-code-workflow 6.3.2 → 6.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/CLAUDE.md +9 -1
- package/.claude/commands/{clean.md → workflow/clean.md} +5 -5
- package/.claude/commands/workflow/docs/analyze.md +1467 -0
- package/.claude/commands/workflow/docs/copyright.md +1265 -0
- package/.claude/commands/workflow/lite-plan.md +1 -1
- package/.claude/commands/workflow/tools/conflict-resolution.md +76 -240
- package/.claude/commands/workflow/tools/task-generate-agent.md +81 -8
- package/.claude/skills/_shared/mermaid-utils.md +584 -0
- package/.claude/skills/copyright-docs/SKILL.md +132 -0
- package/.claude/skills/copyright-docs/phases/01-metadata-collection.md +78 -0
- package/.claude/skills/copyright-docs/phases/02-deep-analysis.md +454 -0
- package/.claude/skills/copyright-docs/phases/02.5-consolidation.md +192 -0
- package/.claude/skills/copyright-docs/phases/04-document-assembly.md +261 -0
- package/.claude/skills/copyright-docs/phases/05-compliance-refinement.md +192 -0
- package/.claude/skills/copyright-docs/specs/cpcc-requirements.md +121 -0
- package/.claude/skills/copyright-docs/templates/agent-base.md +200 -0
- package/.claude/skills/project-analyze/SKILL.md +162 -0
- package/.claude/skills/project-analyze/phases/01-requirements-discovery.md +79 -0
- package/.claude/skills/project-analyze/phases/02-project-exploration.md +75 -0
- package/.claude/skills/project-analyze/phases/03-deep-analysis.md +640 -0
- package/.claude/skills/project-analyze/phases/03.5-consolidation.md +208 -0
- package/.claude/skills/project-analyze/phases/04-report-generation.md +217 -0
- package/.claude/skills/project-analyze/phases/05-iterative-refinement.md +124 -0
- package/.claude/skills/project-analyze/specs/quality-standards.md +115 -0
- package/.claude/skills/project-analyze/specs/writing-style.md +152 -0
- package/.claude/workflows/cli-templates/schemas/conflict-resolution-schema.json +79 -65
- package/.claude/workflows/cli-tools-usage.md +515 -516
- package/README.md +11 -1
- package/ccw/dist/cli.d.ts.map +1 -1
- package/ccw/dist/cli.js +7 -1
- package/ccw/dist/cli.js.map +1 -1
- package/ccw/dist/commands/cli.d.ts +1 -1
- package/ccw/dist/commands/cli.d.ts.map +1 -1
- package/ccw/dist/commands/cli.js +116 -14
- package/ccw/dist/commands/cli.js.map +1 -1
- package/ccw/dist/core/routes/cli-routes.js +2 -2
- package/ccw/dist/core/routes/cli-routes.js.map +1 -1
- package/ccw/dist/tools/claude-cli-tools.d.ts +7 -3
- package/ccw/dist/tools/claude-cli-tools.d.ts.map +1 -1
- package/ccw/dist/tools/claude-cli-tools.js +31 -17
- package/ccw/dist/tools/claude-cli-tools.js.map +1 -1
- package/ccw/dist/tools/cli-executor.d.ts.map +1 -1
- package/ccw/dist/tools/cli-executor.js +19 -7
- package/ccw/dist/tools/cli-executor.js.map +1 -1
- package/ccw/dist/tools/cli-history-store.d.ts +33 -0
- package/ccw/dist/tools/cli-history-store.d.ts.map +1 -1
- package/ccw/dist/tools/cli-history-store.js +89 -5
- package/ccw/dist/tools/cli-history-store.js.map +1 -1
- package/ccw/dist/tools/smart-search.d.ts +25 -0
- package/ccw/dist/tools/smart-search.d.ts.map +1 -1
- package/ccw/dist/tools/smart-search.js +121 -17
- package/ccw/dist/tools/smart-search.js.map +1 -1
- package/ccw/src/cli.ts +264 -258
- package/ccw/src/commands/cli.ts +1009 -884
- package/ccw/src/core/routes/cli-routes.ts +3 -3
- package/ccw/src/templates/dashboard-js/components/cli-history.js +40 -13
- package/ccw/src/templates/dashboard-js/components/cli-status.js +26 -2
- package/ccw/src/templates/dashboard-js/views/cli-manager.js +5 -0
- package/ccw/src/templates/dashboard-js/views/history.js +19 -4
- package/ccw/src/tools/claude-cli-tools.ts +37 -20
- package/ccw/src/tools/cli-executor.ts +20 -7
- package/ccw/src/tools/cli-history-store.ts +125 -5
- package/ccw/src/tools/smart-search.ts +157 -16
- package/codex-lens/src/codexlens/__pycache__/config.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/config.py +8 -0
- package/codex-lens/src/codexlens/search/__pycache__/chain_search.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/__pycache__/hybrid_search.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/__pycache__/ranking.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/chain_search.py +71 -1
- package/codex-lens/src/codexlens/search/hybrid_search.py +144 -11
- package/codex-lens/src/codexlens/search/ranking.py +540 -274
- package/codex-lens/src/codexlens/semantic/__pycache__/chunker.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/chunker.py +55 -10
- package/codex-lens/src/codexlens/storage/__pycache__/dir_index.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/storage/__pycache__/global_index.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/storage/__pycache__/index_tree.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/storage/dir_index.py +1888 -1850
- package/codex-lens/src/codexlens/storage/global_index.py +365 -0
- package/codex-lens/src/codexlens/storage/index_tree.py +83 -10
- package/package.json +2 -2
|
@@ -1,274 +1,540 @@
|
|
|
1
|
-
"""Ranking algorithms for hybrid search result fusion.
|
|
2
|
-
|
|
3
|
-
Implements Reciprocal Rank Fusion (RRF) and score normalization utilities
|
|
4
|
-
for combining results from heterogeneous search backends (exact FTS, fuzzy FTS, vector search).
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
import
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
from
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
""
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
k:
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
"""
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
0
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
source:
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
for
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
""
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
1
|
+
"""Ranking algorithms for hybrid search result fusion.
|
|
2
|
+
|
|
3
|
+
Implements Reciprocal Rank Fusion (RRF) and score normalization utilities
|
|
4
|
+
for combining results from heterogeneous search backends (exact FTS, fuzzy FTS, vector search).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
import math
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from typing import Any, Dict, List
|
|
13
|
+
|
|
14
|
+
from codexlens.entities import SearchResult, AdditionalLocation
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class QueryIntent(str, Enum):
    """Query intent for adaptive RRF weights (Python/TypeScript parity).

    Values are kept in sync with the TypeScript implementation in
    ``ccw/src/tools/smart-search.ts`` so both search paths select the
    same weight profiles for a given query.
    """

    # Code-like query (identifiers, operators, language keywords).
    KEYWORD = "keyword"
    # Natural-language query (questions, verbs, longer phrasing).
    SEMANTIC = "semantic"
    # Ambiguous or empty query; both signal types (or neither) present.
    MIXED = "mixed"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def normalize_weights(weights: Dict[str, float]) -> Dict[str, float]:
    """Scale *weights* so the values sum to 1.0 (best-effort).

    When the sum is non-positive or non-finite the values cannot be
    meaningfully rescaled, so a float-cast copy is returned unchanged.
    """
    total = sum(float(value) for value in weights.values() if value is not None)
    if math.isfinite(total) and total > 0:
        return {name: float(value) / total for name, value in weights.items()}
    # Degenerate total: hand back the weights as floats without scaling.
    return {name: float(value) for name, value in weights.items()}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def detect_query_intent(query: str) -> QueryIntent:
    """Classify *query* as code-like, natural-language, or mixed.

    Heuristic signals kept aligned with `ccw/src/tools/smart-search.ts`.
    """
    text = (query or "").strip()
    if not text:
        return QueryIntent.MIXED

    lowered = text.lower()
    word_total = sum(1 for token in re.split(r"\s+", text) if token)

    # Code-like signals: scope/member operators, camelCase, snake_case,
    # or common programming-language keywords.
    keyword_pattern = (
        r"\b(def|class|function|const|let|var|import|from|return|async"
        r"|await|interface|type)\b"
    )
    looks_like_code = (
        re.search(r"(::|->|\.)", text) is not None
        or re.search(r"[A-Z][a-z]+[A-Z]", text) is not None
        or re.search(r"\b\w+_\w+\b", text) is not None
        or re.search(keyword_pattern, lowered, flags=re.IGNORECASE) is not None
    )

    # Natural-language signals: longer phrasing, questions, wh-words,
    # or common task verbs.
    verb_pattern = (
        r"\b(handle|explain|fix|implement|create|build|use|find|search"
        r"|convert|parse|generate|support)\b"
    )
    looks_like_prose = (
        word_total > 5
        or "?" in text
        or re.search(r"\b(how|what|why|when|where)\b", text, flags=re.IGNORECASE)
        is not None
        or re.search(verb_pattern, text, flags=re.IGNORECASE) is not None
    )

    if looks_like_code and looks_like_prose:
        return QueryIntent.MIXED
    if looks_like_code:
        return QueryIntent.KEYWORD
    if looks_like_prose:
        return QueryIntent.SEMANTIC
    return QueryIntent.MIXED
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def adjust_weights_by_intent(
    intent: QueryIntent,
    base_weights: Dict[str, float],
) -> Dict[str, float]:
    """Translate a query intent into backend weights.

    The intent → weight mapping is kept aligned with the TypeScript
    implementation. Only backends present in *base_weights* are kept,
    and the surviving weights are renormalized to sum to 1.0.
    """
    if intent == QueryIntent.KEYWORD:
        # Code-like queries: favour exact FTS while keeping vector recall.
        profile = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4}
    elif intent == QueryIntent.SEMANTIC:
        # Natural-language queries: favour vector search.
        profile = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7}
    else:
        # MIXED (or unknown) intent keeps the caller-supplied profile.
        profile = dict(base_weights)

    # Restrict to the caller's active backends, then renormalize.
    active = {name: float(profile.get(name, 0.0)) for name in base_weights}
    return normalize_weights(active)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_rrf_weights(
    query: str,
    base_weights: Dict[str, float],
) -> Dict[str, float]:
    """Compute adaptive RRF weights for *query* from its detected intent."""
    intent = detect_query_intent(query)
    return adjust_weights_by_intent(intent, base_weights)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def reciprocal_rank_fusion(
    results_map: Dict[str, List[SearchResult]],
    weights: Dict[str, float] | None = None,
    k: int = 60,
) -> List[SearchResult]:
    """Combine search results from multiple sources using Reciprocal Rank Fusion.

    RRF formula: score(d) = Σ weight_source / (k + rank_source(d))

    Args:
        results_map: Dictionary mapping source name to list of SearchResult objects
            Sources: 'exact', 'fuzzy', 'vector'
        weights: Dictionary mapping source name to weight (default: equal weights)
            Example: {'exact': 0.3, 'fuzzy': 0.1, 'vector': 0.6}
            Weights that do not sum to 1.0 are renormalized; a non-positive or
            non-finite sum falls back to equal weights instead of dividing by it.
        k: Constant to avoid division by zero and control rank influence (default 60)

    Returns:
        List of SearchResult objects sorted by fused score (descending)

    Examples:
        >>> exact_results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        >>> fuzzy_results = [SearchResult(path="b.py", score=8.0, excerpt="...")]
        >>> results_map = {'exact': exact_results, 'fuzzy': fuzzy_results}
        >>> fused = reciprocal_rank_fusion(results_map)
    """
    if not results_map:
        return []

    # Default equal weights if not provided.
    if weights is None:
        num_sources = len(results_map)
        weights = {source: 1.0 / num_sources for source in results_map}

    # Renormalize weights so they sum to 1.0.  A zero, negative, or non-finite
    # sum cannot be normalized (the old code divided by it and raised
    # ZeroDivisionError for all-zero weights), so fall back to equal weights.
    weight_sum = sum(weights.values())
    if not math.isfinite(weight_sum) or weight_sum <= 0:
        num_sources = len(results_map)
        weights = {source: 1.0 / num_sources for source in results_map}
    elif not math.isclose(weight_sum, 1.0, abs_tol=0.01):
        weights = {source: w / weight_sum for source, w in weights.items()}

    # Accumulate RRF contributions per path; the first-seen result object is
    # kept as the representative payload for that path.
    path_to_result: Dict[str, SearchResult] = {}
    path_to_fusion_score: Dict[str, float] = {}

    for source_name, results in results_map.items():
        weight = weights.get(source_name, 0.0)
        if weight == 0:
            continue

        for rank, result in enumerate(results, start=1):
            path = result.path
            rrf_contribution = weight / (k + rank)

            # Initialize or accumulate fusion score.
            if path not in path_to_fusion_score:
                path_to_fusion_score[path] = 0.0
                path_to_result[path] = result

            path_to_fusion_score[path] += rrf_contribution

    # Materialize new SearchResult objects carrying the fused score; the
    # pre-fusion score is preserved in metadata for debugging.
    fused_results = []
    for path, base_result in path_to_result.items():
        fusion_score = path_to_fusion_score[path]

        fused_result = SearchResult(
            path=base_result.path,
            score=fusion_score,
            excerpt=base_result.excerpt,
            content=base_result.content,
            symbol=base_result.symbol,
            chunk=base_result.chunk,
            metadata={
                **base_result.metadata,
                "fusion_score": fusion_score,
                "original_score": base_result.score,
            },
            start_line=base_result.start_line,
            end_line=base_result.end_line,
            symbol_name=base_result.symbol_name,
            symbol_kind=base_result.symbol_kind,
        )
        fused_results.append(fused_result)

    # Sort by fusion score descending.
    fused_results.sort(key=lambda r: r.score, reverse=True)

    return fused_results
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def apply_symbol_boost(
    results: List[SearchResult],
    boost_factor: float = 1.5,
) -> List[SearchResult]:
    """Boost fused scores for results that include an explicit symbol match.

    The boost is multiplicative on the current result.score (typically the RRF
    fusion score).  Boosted entries record the pre-boost score under
    metadata["original_fusion_score"] and set metadata["boosted"] to True.
    Input objects are never mutated; new SearchResult instances are returned.
    """

    def clone(src: SearchResult, new_score, meta) -> SearchResult:
        # Immutable-transformation helper: copy every field, swapping in the
        # given score and metadata dict.
        return SearchResult(
            path=src.path,
            score=new_score,
            excerpt=src.excerpt,
            content=src.content,
            symbol=src.symbol,
            chunk=src.chunk,
            metadata=meta,
            start_line=src.start_line,
            end_line=src.end_line,
            symbol_name=src.symbol_name,
            symbol_kind=src.symbol_kind,
            additional_locations=list(src.additional_locations),
        )

    if not results:
        return []

    if boost_factor <= 1.0:
        # No effective boost, but still return new objects to follow the
        # immutable transformation pattern.
        return [clone(entry, entry.score, {**entry.metadata}) for entry in results]

    boosted: List[SearchResult] = []
    for entry in results:
        base_score = float(entry.score)
        meta = {**entry.metadata}

        if entry.symbol_name:
            new_score = base_score * boost_factor
            # Keep the first pre-boost score; fall back to the current score
            # when no fusion score was recorded.
            meta.setdefault("original_fusion_score", meta.get("fusion_score", base_score))
            meta["boosted"] = True
            meta["symbol_boost_factor"] = boost_factor
        else:
            new_score = base_score

        boosted.append(clone(entry, new_score, meta))

    boosted.sort(key=lambda r: r.score, reverse=True)
    return boosted
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def rerank_results(
    query: str,
    results: List[SearchResult],
    embedder: Any,
    top_k: int = 50,
) -> List[SearchResult]:
    """Re-rank results with embedding cosine similarity, combined with current score.

    Combined score formula:
        0.5 * rrf_score + 0.5 * cosine_similarity

    If embedder is None or embedding fails, returns results as-is.

    Args:
        query: The search query to embed and compare against documents.
        results: Results to re-rank; only the first ``top_k`` are re-scored,
            the remainder are copied through with their scores unchanged.
        embedder: Duck-typed embedding provider.  Uses ``embed_single(text)``
            when available, otherwise ``embed(text)[0]`` for the query, and
            ``embed(list_of_texts)`` for the documents.
        top_k: Maximum number of leading results to re-score (default 50).

    Returns:
        A new list of SearchResult objects sorted by score descending.
        Input objects are never mutated.
    """
    if not results:
        return []

    # No embedder or a non-positive top_k disables re-ranking entirely.
    if embedder is None or top_k <= 0:
        return results

    rerank_count = min(int(top_k), len(results))

    def cosine_similarity(vec_a: List[float], vec_b: List[float]) -> float:
        # Defensive: handle mismatched lengths and zero vectors.
        n = min(len(vec_a), len(vec_b))
        if n == 0:
            return 0.0
        dot = 0.0
        norm_a = 0.0
        norm_b = 0.0
        for i in range(n):
            a = float(vec_a[i])
            b = float(vec_b[i])
            dot += a * b
            norm_a += a * a
            norm_b += b * b
        if norm_a <= 0.0 or norm_b <= 0.0:
            return 0.0
        sim = dot / (math.sqrt(norm_a) * math.sqrt(norm_b))
        # SearchResult.score requires non-negative scores; clamp cosine similarity to [0, 1].
        return max(0.0, min(1.0, sim))

    def text_for_embedding(r: SearchResult) -> str:
        # Prefer the richest non-blank text field available on the result.
        if r.excerpt and r.excerpt.strip():
            return r.excerpt
        if r.content and r.content.strip():
            return r.content
        if r.chunk and r.chunk.content and r.chunk.content.strip():
            return r.chunk.content
        # Fallback: stable, non-empty text.
        return r.symbol_name or r.path

    # Best-effort embedding: any failure (network, model, shape) degrades
    # gracefully to the un-reranked input rather than raising.
    try:
        if hasattr(embedder, "embed_single"):
            query_vec = embedder.embed_single(query)
        else:
            query_vec = embedder.embed(query)[0]

        doc_texts = [text_for_embedding(r) for r in results[:rerank_count]]
        doc_vecs = embedder.embed(doc_texts)
    except Exception:
        return results

    reranked_results: List[SearchResult] = []

    for idx, result in enumerate(results):
        if idx < rerank_count:
            # Blend the existing (RRF) score with embedding similarity 50/50;
            # both components are recorded in metadata for debugging.
            rrf_score = float(result.score)
            sim = cosine_similarity(query_vec, doc_vecs[idx])
            combined_score = 0.5 * rrf_score + 0.5 * sim

            reranked_results.append(
                SearchResult(
                    path=result.path,
                    score=combined_score,
                    excerpt=result.excerpt,
                    content=result.content,
                    symbol=result.symbol,
                    chunk=result.chunk,
                    metadata={
                        **result.metadata,
                        "rrf_score": rrf_score,
                        "cosine_similarity": sim,
                        "reranked": True,
                    },
                    start_line=result.start_line,
                    end_line=result.end_line,
                    symbol_name=result.symbol_name,
                    symbol_kind=result.symbol_kind,
                    additional_locations=list(result.additional_locations),
                )
            )
        else:
            # Preserve remaining results without re-ranking, but keep immutability.
            reranked_results.append(
                SearchResult(
                    path=result.path,
                    score=result.score,
                    excerpt=result.excerpt,
                    content=result.content,
                    symbol=result.symbol,
                    chunk=result.chunk,
                    metadata={**result.metadata},
                    start_line=result.start_line,
                    end_line=result.end_line,
                    symbol_name=result.symbol_name,
                    symbol_kind=result.symbol_kind,
                    additional_locations=list(result.additional_locations),
                )
            )

    reranked_results.sort(key=lambda r: r.score, reverse=True)
    return reranked_results
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def normalize_bm25_score(score: float) -> float:
    """Normalize BM25 scores from SQLite FTS5 to 0-1 range.

    SQLite FTS5 returns negative BM25 scores (more negative = better match).
    Uses sigmoid transformation for normalization.

    Args:
        score: Raw BM25 score from SQLite (typically negative)

    Returns:
        Normalized score in range [0, 1]

    Examples:
        >>> round(normalize_bm25_score(-10.5), 2)  # Good match
        0.74
        >>> round(normalize_bm25_score(-1.2), 2)  # Weak match
        0.53
    """
    # Take absolute value (BM25 is negative in SQLite).
    abs_score = abs(score)

    # Sigmoid transformation: 1 / (1 + e^(-x)) with a 0.1 scale factor, which
    # maps the typical BM25 range (-20 to 0) onto roughly (0.5, 0.88].
    normalized = 1.0 / (1.0 + math.exp(-abs_score * 0.1))

    return normalized
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def tag_search_source(results: List[SearchResult], source: str) -> List[SearchResult]:
    """Tag search results with their source for RRF tracking.

    Args:
        results: List of SearchResult objects
        source: Source identifier ('exact', 'fuzzy', 'vector')

    Returns:
        New list of SearchResult objects with 'search_source' in metadata.
        Input objects are not mutated; all other fields — including any
        additional_locations — are carried over unchanged.
    """
    tagged_results = []
    for result in results:
        tagged_result = SearchResult(
            path=result.path,
            score=result.score,
            excerpt=result.excerpt,
            content=result.content,
            symbol=result.symbol,
            chunk=result.chunk,
            metadata={**result.metadata, "search_source": source},
            start_line=result.start_line,
            end_line=result.end_line,
            symbol_name=result.symbol_name,
            symbol_kind=result.symbol_kind,
            # Previously dropped during tagging; preserved here so grouped
            # locations survive re-tagging (consistent with the other copy
            # sites in this module, e.g. apply_symbol_boost).
            additional_locations=list(result.additional_locations),
        )
        tagged_results.append(tagged_result)

    return tagged_results
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def group_similar_results(
    results: List[SearchResult],
    score_threshold_abs: float = 0.01,
    content_field: str = "excerpt"
) -> List[SearchResult]:
    """Group search results by content and score similarity.

    Groups results that have similar content and similar scores into a single
    representative result, with other locations stored in additional_locations.

    Algorithm:
        1. Group results by content (using excerpt or content field)
        2. Within each content group, create subgroups based on score similarity
        3. Select highest-scoring result as representative for each subgroup
        4. Store other results in subgroup as additional_locations

    Args:
        results: A list of SearchResult objects (typically sorted by score)
        score_threshold_abs: Absolute score difference to consider results similar.
            Results with |score_a - score_b| <= threshold are grouped.
            Default 0.01 is suitable for RRF fusion scores.
        content_field: The field to use for content grouping ('excerpt' or 'content')

    Returns:
        A new list of SearchResult objects where similar items are grouped.
        The list is sorted by score descending.

    Examples:
        >>> results = [SearchResult(path="a.py", score=0.5, excerpt="def foo()"),
        ...            SearchResult(path="b.py", score=0.5, excerpt="def foo()")]
        >>> grouped = group_similar_results(results)
        >>> len(grouped)  # Two results merged into one
        1
        >>> len(grouped[0].additional_locations)  # One additional location
        1
    """
    if not results:
        return []

    # Group results by content
    content_map: Dict[str, List[SearchResult]] = {}
    unidentifiable_results: List[SearchResult] = []

    for r in results:
        key = getattr(r, content_field, None)
        if key and key.strip():
            content_map.setdefault(key, []).append(r)
        else:
            # Results without content can't be grouped by content
            unidentifiable_results.append(r)

    final_results: List[SearchResult] = []

    # Process each content group
    for content_group in content_map.values():
        # Sort by score descending within group
        content_group.sort(key=lambda r: r.score, reverse=True)

        # Repeatedly peel off the best remaining result and everything whose
        # score is within the threshold of it; leftovers form the next pass.
        # Similarity is measured against the representative only, not
        # transitively chained through the subgroup members.
        while content_group:
            # Take highest scoring as representative
            representative = content_group.pop(0)
            others_in_group = []
            remaining_for_next_pass = []

            # Find results with similar scores
            for item in content_group:
                if abs(representative.score - item.score) <= score_threshold_abs:
                    others_in_group.append(item)
                else:
                    remaining_for_next_pass.append(item)

            # Create grouped result with additional locations
            if others_in_group:
                # Build new result with additional_locations populated
                grouped_result = SearchResult(
                    path=representative.path,
                    score=representative.score,
                    excerpt=representative.excerpt,
                    content=representative.content,
                    symbol=representative.symbol,
                    chunk=representative.chunk,
                    metadata={
                        **representative.metadata,
                        # Total members merged, representative included.
                        "grouped_count": len(others_in_group) + 1,
                    },
                    start_line=representative.start_line,
                    end_line=representative.end_line,
                    symbol_name=representative.symbol_name,
                    symbol_kind=representative.symbol_kind,
                    additional_locations=[
                        AdditionalLocation(
                            path=other.path,
                            score=other.score,
                            start_line=other.start_line,
                            end_line=other.end_line,
                            symbol_name=other.symbol_name,
                        ) for other in others_in_group
                    ],
                )
                final_results.append(grouped_result)
            else:
                # Singleton subgroup: pass the original object through as-is.
                final_results.append(representative)

            content_group = remaining_for_next_pass

    # Add ungroupable results
    final_results.extend(unidentifiable_results)

    # Sort final results by score descending
    final_results.sort(key=lambda r: r.score, reverse=True)

    return final_results