docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/repo_search.py
ADDED
|
@@ -0,0 +1,1505 @@
|
|
|
1
|
+
"""Repo-scoped search cache, evidence blending, and ranking.
|
|
2
|
+
|
|
3
|
+
The public repo lifecycle API stays in :mod:`cairn.repo`. This module owns the
|
|
4
|
+
large, performance-sensitive search implementation so repository status/sync
|
|
5
|
+
logic does not have to carry ranking internals.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import heapq
|
|
12
|
+
import math
|
|
13
|
+
import re
|
|
14
|
+
from collections import Counter, defaultdict
|
|
15
|
+
from collections.abc import Collection
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any, Final, Protocol
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
from cairn.index.vectors import l2_normalize
|
|
23
|
+
from cairn.tools.base import DocumentIndex
|
|
24
|
+
from cairn.tools.search_semantic import IncludeField, _evidence_snippet, _query_terms
|
|
25
|
+
|
|
26
|
+
# Maximum number of per-root caches kept in ``_REPO_SEARCH_CACHES`` before the
# first-inserted entry is dropped (see ``_get_repo_search_cache``).
_REPO_SEARCH_CACHE_MAX: Final = 4
# Size of the semaphore that bounds concurrent document loads while a cache is
# being built.
_REPO_SEARCH_LOAD_CONCURRENCY: Final = 16
# At or below this many compatible sections every section gets the full ranker
# pass; above it ``_repo_candidate_indices`` builds a shortlist.
_REPO_SEARCH_FULL_SCORE_LIMIT: Final = 2048
# Shortlist sizing: the target is the largest of this floor, ``k`` times the
# per-result factor, and ``k * sections_per_doc`` times the per-doc-result
# factor.
_REPO_SEARCH_SHORTLIST_MIN: Final = 768
_REPO_SEARCH_SHORTLIST_PER_RESULT: Final = 96
_REPO_SEARCH_SHORTLIST_PER_DOC_RESULT: Final = 64
# Extra head-room added to the shortlist target when expanding candidates
# along graph edges.
_REPO_SEARCH_GRAPH_EXPANSION_LIMIT: Final = 256
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class RepoSearchCandidate(Protocol):
    """Status fields needed by repo search without importing ``cairn.repo``.

    Any object exposing these read-only attributes can be searched; the
    protocol is structural, so no inheritance is required.
    """

    # Document identifier; reported as ``doc`` in hits and skip entries.
    @property
    def id(self) -> str: ...

    # Source label copied onto every section record of the document.
    @property
    def source(self) -> str: ...

    # Index directory, joined onto the repo root when loading the index.
    @property
    def doc_dir(self) -> str: ...

    # Lifecycle state; the value ``"stale"`` is surfaced in the search payload.
    @property
    def state(self) -> str: ...

    # The remaining fields only feed the cache signature, so a change in any
    # of them forces the search cache to be rebuilt.
    @property
    def indexed_hash(self) -> str | None: ...

    @property
    def source_file_hash(self) -> str | None: ...

    @property
    def indexed_source_file_hash(self) -> str | None: ...

    @property
    def section_count(self) -> int | None: ...
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass(slots=True)
class _RepoSectionRecord:
    """One searchable section of one document, with precomputed search data."""

    # Identity of the owning document and its source label.
    doc_id: str
    source: str
    # Loaded index of the owning document (used later to resolve anchors).
    index: DocumentIndex
    section_id: str
    title: str
    # Raw section text and its summary synopsis ("" when no summary exists).
    body: str
    synopsis: str
    # Embedding vector of the section.
    vector: tuple[float, ...]
    # Normalized text of (doc id, source, title, synopsis, first 2000 body
    # characters), in that order, used for substring matching.
    haystacks: tuple[str, str, str, str, str]
    # Token -> occurrence count, and the total number of counted tokens.
    token_counts: dict[str, int]
    token_count: int
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass(slots=True)
class _RepoLexicalQuery:
    """Parsed lexical form of a search query.

    NOTE(review): instances are built by ``_build_repo_lexical_query``, which
    is outside this view; field notes below reflect only how the visible
    scorers read them.
    """

    # Query terms, iterated in order by the scorers.
    terms: tuple[str, ...]
    # Term -> accepted spellings; variants containing a space are ignored for
    # token-count lookups.
    variants: dict[str, tuple[str, ...]]
    # Term -> weight applied to that term's contribution.
    weights: dict[str, float]
    # Phrases matched as substrings of the joined haystacks.
    phrases: tuple[str, ...]
    # presumably the normalizing ceiling for the lexical score — TODO confirm
    max_score: float
    # Locales requested by the caller of the search.
    preferred_locales: tuple[str, ...]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass(slots=True)
class _RepoScoredHit:
    """A section record together with its per-signal and combined scores."""

    record: _RepoSectionRecord
    # Final ranking score; rewritten once graph scores have been applied.
    score: float
    # Individual signals reported back in the hit payload.
    vector_score: float
    lexical_score: float
    sparse_score: float
    # 0.0 until ``_apply_graph_scores`` fills it in.
    graph_score: float
    # Score computed without any graph contribution; neighbors read this value
    # when graph scores are propagated.
    base_score: float
    # Multiplicative factor applied after the combined score is capped at 1.0.
    rank_factor: float
    # Additive bonus applied before the cap.
    identity_bonus: float
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass(slots=True)
class _RepoSearchCache:
    """Everything precomputed for searching one repo root."""

    # Per-candidate status tuples; a mismatch on lookup triggers a rebuild.
    signature: tuple[tuple[str, str, str, str, str, str, int], ...]
    # All section records of all successfully loaded documents.
    records: tuple[_RepoSectionRecord, ...]
    # ``{"doc": ..., "reason": ...}`` entries for documents that failed to load.
    skipped: tuple[dict[str, str], ...]
    # Document id -> embedding dimension of its vector index.
    doc_dims: dict[str, int]
    # Token -> number of sections containing it, plus the mean token count.
    df: dict[str, int]
    avg_token_count: float
    # (doc id, section id) -> ((neighbor key, edge weight), ...).
    graph_neighbors: dict[tuple[str, str], tuple[tuple[tuple[str, str], float], ...]]
    # (doc id, section id) -> position of that record in ``records``.
    record_index_by_key: dict[tuple[str, str], int]
    # Vector length -> stacked float32 matrix of the vectors of that length.
    vector_matrices: dict[int, Any]
    # Vector length -> positions in ``records`` of the matrix rows, row order.
    vector_record_indices: dict[int, tuple[int, ...]]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass(slots=True)
class _RepoSearchDocumentChunk:
    """Result of loading a single document for the search cache."""

    # Both are ``None`` when the document could not be loaded.
    doc_id: str | None
    dim: int | None
    # Section records of the document (empty on failure).
    records: list[_RepoSectionRecord]
    # ``{"doc": ..., "reason": ...}`` on failure, ``None`` on success.
    skipped: dict[str, str] | None
    # Token -> number of this document's sections containing the token.
    df: Counter[str]
    # (left key, right key, weight) edges between sections of this document.
    graph_edges: list[tuple[tuple[str, str], tuple[str, str], float]]
    # Lower-cased ``"kind:canonical"`` entity key -> sections mentioning it.
    entity_sections: dict[str, set[tuple[str, str]]]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass(frozen=True, slots=True)
class _RepoRankProfile:
    """Immutable bundle of ranking weights, bonuses and factors.

    NOTE(review): the helpers that consume these values are outside this
    view; the grouping comments below follow the field names only and should
    be confirmed against those helpers.
    """

    # presumably aligned with the five haystack fields (doc id, source, title,
    # synopsis, body) — TODO confirm against the lexical scorer
    field_weights: tuple[float, float, float, float, float] = (2.5, 2.0, 3.0, 1.0, 1.0)
    # Signal blend weights.
    vector_weight: float = 0.22
    lexical_weight: float = 0.50
    sparse_weight: float = 0.28
    graph_weight: float = 0.10
    no_lexical_vector_weight: float = 0.25
    # Sparse-signal gating.
    sparse_floor_gate: float = 0.25
    sparse_lexical_gate_multiplier: float = 2.0
    # Overview-intent bonuses.
    overview_doc_bonus: float = 0.16
    overview_title_bonus: float = 0.12
    overview_shallow_bonus: float = 0.04
    overview_max_bonus: float = 0.22
    # Focus-field support.
    focus_support_floor: float = 0.30
    focus_support_weight: float = 0.70
    focus_synopsis_support: float = 0.65
    focus_body_support: float = 0.45
    root_meta_doc_factor: float = 0.55
    # Term-coverage scaling.
    coverage_floor: float = 0.45
    coverage_weight: float = 0.55
    doc_identity_bonus_weight: float = 0.25
    history_doc_generic_factor: float = 0.45
    # Locale preference factors.
    locale_match_factor: float = 1.04
    locale_mismatch_factor: float = 0.72
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Process-wide search caches keyed by resolved repo root; the number of
# entries is bounded in ``_get_repo_search_cache``.
_REPO_SEARCH_CACHES: dict[Path, _RepoSearchCache] = {}
# Rank profile built from the dataclass defaults.
_DEFAULT_RANK_PROFILE = _RepoRankProfile()
# Words naming changelog/history style documents.
# NOTE(review): presumably read by ``_history_doc_factor`` (not in view) — confirm.
_HISTORY_DOC_TERMS: Final = frozenset(
    {
        "changelog",
        "changes",
        "history",
        "release",
        "releases",
        "migration",
        "migrations",
    }
)
# Query words that signal interest in release/change history.
# NOTE(review): presumably the counterpart of ``_HISTORY_DOC_TERMS`` — confirm.
_HISTORY_QUERY_TERMS: Final = frozenset(
    {
        "breaking",
        "change",
        "changes",
        "changelog",
        "deprecated",
        "deprecation",
        "history",
        "migration",
        "migrations",
        "release",
        "released",
        "releases",
        "upgrade",
        "version",
        "versions",
    }
)
# Two-letter language codes recognised as locales.
# NOTE(review): presumably matched against document paths by the locale
# factor helper (not in view) — confirm.
_KNOWN_LOCALES: Final = frozenset(
    {
        "ar",
        "de",
        "en",
        "es",
        "fa",
        "fr",
        "hi",
        "id",
        "it",
        "ja",
        "ko",
        "nl",
        "pl",
        "pt",
        "ru",
        "tr",
        "uk",
        "vi",
        "zh",
    }
)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
async def search_repo_index(
    root: Path,
    *,
    candidates: Collection[RepoSearchCandidate],
    query: str,
    query_vec: list[float],
    k: int,
    include_set: Collection[IncludeField],
    sections_per_doc: int,
    preferred_locales: tuple[str, ...],
) -> dict[str, Any]:
    """Run a repo-wide search and return the response payload.

    Inputs are expected to be validated by the caller. The cache for *root*
    is fetched (or built), candidate sections are scored, graph scores are
    applied, and the best hits are diversified down to *k* results.

    Returns:
        A dict with the keys ``query``, ``hits``, ``sections_per_doc``,
        ``searched_documents``, ``ranker``, ``stale_documents``,
        ``skipped_documents`` and ``cursor`` (always ``None``).
    """
    cache = await _get_repo_search_cache(root, candidates)
    lexical_query = _build_repo_lexical_query(
        query,
        cache=cache,
        preferred_locales=preferred_locales,
    )

    # Documents indexed with a different embedding size cannot be compared
    # against this query vector; report them alongside load failures.
    query_dim = len(query_vec)
    incompatible_docs = {
        doc_id for doc_id, dim in cache.doc_dims.items() if dim != query_dim
    }
    skipped: list[dict[str, str]] = [
        *cache.skipped,
        *(
            {
                "doc": doc_id,
                "reason": f"query embedding dim {query_dim} != index dim {cache.doc_dims[doc_id]}",
            }
            for doc_id in sorted(incompatible_docs)
        ),
    ]

    vector_scores = _repo_vector_scores(cache, l2_normalize(query_vec), query_dim)
    candidate_indices, ranker_mode, compatible_count = _repo_candidate_indices(
        cache,
        query=lexical_query,
        vector_scores=vector_scores,
        incompatible_docs=incompatible_docs,
        k=k,
        sections_per_doc=sections_per_doc,
    )

    scored_hits = (
        _score_repo_record(
            cache.records[position],
            query=lexical_query,
            cache=cache,
            vector_score=vector_scores[position],
        )
        for position in candidate_indices
    )
    # Keyed by (doc, section) so a later duplicate replaces an earlier one.
    hits_by_key: dict[tuple[str, str], _RepoScoredHit] = {
        (scored.record.doc_id, scored.record.section_id): scored
        for scored in scored_hits
    }

    hits = list(hits_by_key.values())
    _apply_graph_scores(hits, cache)
    hits.sort(key=lambda scored: scored.score, reverse=True)

    chosen = _diversify_repo_hits(
        hits,
        limit=k,
        sections_per_doc=sections_per_doc,
    )
    payload_hits = [
        _repo_scored_payload(
            hit,
            query=query,
            lexical_query=lexical_query,
            include_set=include_set,
        )
        for hit in chosen
    ]
    ranker_info = {
        "mode": ranker_mode,
        "total_sections": len(cache.records),
        "compatible_sections": compatible_count,
        "scored_sections": len(candidate_indices),
    }
    stale_ids = [doc.id for doc in candidates if doc.state == "stale"]
    return {
        "query": query,
        "hits": payload_hits,
        "sections_per_doc": sections_per_doc,
        "searched_documents": len(candidates),
        "ranker": ranker_info,
        "stale_documents": stale_ids,
        "skipped_documents": skipped,
        "cursor": None,
    }
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
async def _get_repo_search_cache(
    root: Path,
    candidates: Collection[RepoSearchCandidate],
) -> _RepoSearchCache:
    """Return the search cache for *root*, rebuilding it when out of date.

    A cached entry is reused only if its signature equals the signature of
    the current *candidates*. Otherwise every candidate is loaded (with
    bounded concurrency), merged into a fresh cache, and stored under the
    resolved root.
    """
    cache_key = root.resolve()
    signature = _repo_search_signature(candidates)
    existing = _REPO_SEARCH_CACHES.get(cache_key)
    if existing is not None and existing.signature == signature:
        return existing

    gate = asyncio.Semaphore(_REPO_SEARCH_LOAD_CONCURRENCY)

    async def load(doc: RepoSearchCandidate) -> _RepoSearchDocumentChunk:
        async with gate:
            return await _load_repo_search_document(root, doc)

    chunks = await asyncio.gather(*(load(doc) for doc in candidates))

    records: list[_RepoSectionRecord] = []
    skipped: list[dict[str, str]] = []
    doc_dims: dict[str, int] = {}
    df_counter: Counter[str] = Counter()
    graph_weights: dict[tuple[str, str], dict[tuple[str, str], float]] = defaultdict(dict)
    entity_sections: dict[str, set[tuple[str, str]]] = defaultdict(set)

    for chunk in chunks:
        if chunk.skipped is not None:
            # Failed loads contribute nothing except their skip entry.
            skipped.append(chunk.skipped)
            continue
        if chunk.doc_id is not None and chunk.dim is not None:
            doc_dims[chunk.doc_id] = chunk.dim
        records.extend(chunk.records)
        df_counter.update(chunk.df)
        for left, right, weight in chunk.graph_edges:
            _add_graph_edge(graph_weights, left, right, weight=weight)
        for entity_key, keys in chunk.entity_sections.items():
            entity_sections[entity_key].update(keys)

    # Connect every pair of sections mentioning the same entity, but only for
    # entities mentioned in 2..24 sections.
    for keys in entity_sections.values():
        if 2 <= len(keys) <= 24:
            ordered = sorted(keys)
            for offset, src in enumerate(ordered, start=1):
                for dst in ordered[offset:]:
                    _add_graph_edge(graph_weights, src, dst, weight=0.18)

    if records:
        avg_token_count = sum(record.token_count for record in records) / len(records)
    else:
        avg_token_count = 0.0
    graph_neighbors = {
        key: tuple(neighbors.items()) for key, neighbors in graph_weights.items()
    }
    record_index_by_key = {
        (record.doc_id, record.section_id): position
        for position, record in enumerate(records)
    }
    vector_matrices, vector_record_indices = _repo_vector_matrices(records)

    cache = _RepoSearchCache(
        signature=signature,
        records=tuple(records),
        skipped=tuple(skipped),
        doc_dims=doc_dims,
        df=dict(df_counter),
        avg_token_count=avg_token_count,
        graph_neighbors=graph_neighbors,
        record_index_by_key=record_index_by_key,
        vector_matrices=vector_matrices,
        vector_record_indices=vector_record_indices,
    )

    # Make room for a new root by dropping the first-inserted entry.
    is_new_root = cache_key not in _REPO_SEARCH_CACHES
    if is_new_root and len(_REPO_SEARCH_CACHES) >= _REPO_SEARCH_CACHE_MAX:
        _REPO_SEARCH_CACHES.pop(next(iter(_REPO_SEARCH_CACHES)))
    _REPO_SEARCH_CACHES[cache_key] = cache
    return cache
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
async def _load_repo_search_document(
    root: Path,
    doc: RepoSearchCandidate,
) -> _RepoSearchDocumentChunk:
    """Load one document's index and turn it into search-cache material.

    Any exception raised while loading the index or its vectors is converted
    into a chunk whose ``skipped`` entry carries the error text. Only
    sections that have a stored vector become records or graph endpoints.
    """
    try:
        index = DocumentIndex.load(root / doc.doc_dir)
        vectors = {
            entry.id: tuple(entry.vector)
            for entry in await index.vectors.entries()
        }
    except Exception as exc:
        return _RepoSearchDocumentChunk(
            doc_id=None,
            dim=None,
            records=[],
            skipped={"doc": doc.id, "reason": str(exc)},
            df=Counter(),
            graph_edges=[],
            entity_sections={},
        )

    doc_id = doc.id
    doc_source = doc.source
    records: list[_RepoSectionRecord] = []
    df_counter: Counter[str] = Counter()

    for node in index.tree:
        vector = vectors.get(node.id)
        if vector is None:
            continue
        summary = index.summaries.get(node.id)
        synopsis = "" if summary is None else summary.synopsis
        body = node.raw_text
        counts = _section_token_counts(
            doc_id=doc_id,
            source=doc_source,
            title=node.title,
            synopsis=synopsis,
            body=body,
        )
        # Document frequency counts each token once per section.
        df_counter.update(counts.keys())
        haystacks = (
            _normalize_field_text(doc_id),
            _normalize_field_text(doc_source),
            _normalize_field_text(node.title),
            _normalize_field_text(synopsis),
            _normalize_field_text(body[:2000]),
        )
        records.append(
            _RepoSectionRecord(
                doc_id=doc_id,
                source=doc_source,
                index=index,
                section_id=node.id,
                title=node.title,
                body=body,
                synopsis=synopsis,
                vector=vector,
                haystacks=haystacks,
                token_counts=dict(counts),
                token_count=sum(counts.values()),
            )
        )

    # Parent/child edges, then cross references; both ends must have vectors.
    graph_edges: list[tuple[tuple[str, str], tuple[str, str], float]] = [
        ((doc_id, node.id), (doc_id, node.parent), 0.55)
        for node in index.tree
        if node.parent is not None and node.id in vectors and node.parent in vectors
    ]
    if index.xrefs is not None:
        graph_edges.extend(
            (
                (doc_id, ref.src),
                (doc_id, ref.dst),
                max(0.2, min(1.0, ref.confidence)),
            )
            for ref in index.xrefs
            if ref.src in vectors and ref.dst in vectors
        )

    entity_sections: dict[str, set[tuple[str, str]]] = defaultdict(set)
    if index.entities is not None:
        for entity in index.entities:
            entity_key = f"{entity.kind}:{entity.canonical}".lower()
            for mention in entity.mentions:
                if mention.section_id in vectors:
                    entity_sections[entity_key].add((doc_id, mention.section_id))

    return _RepoSearchDocumentChunk(
        doc_id=doc_id,
        dim=index.vectors.dim,
        records=records,
        skipped=None,
        df=df_counter,
        graph_edges=graph_edges,
        entity_sections=dict(entity_sections),
    )
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def _repo_search_signature(
|
|
472
|
+
candidates: Collection[RepoSearchCandidate],
|
|
473
|
+
) -> tuple[tuple[str, str, str, str, str, str, int], ...]:
|
|
474
|
+
return tuple(
|
|
475
|
+
(
|
|
476
|
+
doc.id,
|
|
477
|
+
doc.doc_dir,
|
|
478
|
+
doc.state,
|
|
479
|
+
doc.indexed_hash or "",
|
|
480
|
+
doc.source_file_hash or "",
|
|
481
|
+
doc.indexed_source_file_hash or "",
|
|
482
|
+
doc.section_count or 0,
|
|
483
|
+
)
|
|
484
|
+
for doc in candidates
|
|
485
|
+
)
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def _repo_vector_matrices(
|
|
489
|
+
records: list[_RepoSectionRecord],
|
|
490
|
+
) -> tuple[dict[int, Any], dict[int, tuple[int, ...]]]:
|
|
491
|
+
by_dim: dict[int, list[tuple[int, tuple[float, ...]]]] = defaultdict(list)
|
|
492
|
+
for index, record in enumerate(records):
|
|
493
|
+
by_dim[len(record.vector)].append((index, record.vector))
|
|
494
|
+
matrices: dict[int, Any] = {}
|
|
495
|
+
indices: dict[int, tuple[int, ...]] = {}
|
|
496
|
+
for dim, rows in by_dim.items():
|
|
497
|
+
indices[dim] = tuple(index for index, _ in rows)
|
|
498
|
+
matrices[dim] = np.asarray([vector for _, vector in rows], dtype=np.float32)
|
|
499
|
+
return matrices, indices
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def _repo_vector_scores(
|
|
503
|
+
cache: _RepoSearchCache,
|
|
504
|
+
query: list[float],
|
|
505
|
+
query_dim: int,
|
|
506
|
+
) -> list[float]:
|
|
507
|
+
scores = [0.0] * len(cache.records)
|
|
508
|
+
matrix = cache.vector_matrices.get(query_dim)
|
|
509
|
+
indices = cache.vector_record_indices.get(query_dim)
|
|
510
|
+
if matrix is None or indices is None:
|
|
511
|
+
return scores
|
|
512
|
+
query_array = np.asarray(query, dtype=np.float32)
|
|
513
|
+
raw_scores = matrix @ query_array
|
|
514
|
+
clipped = np.clip(raw_scores, 0.0, 1.0)
|
|
515
|
+
for record_index, score in zip(indices, clipped.tolist(), strict=True):
|
|
516
|
+
scores[record_index] = float(score)
|
|
517
|
+
return scores
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def _repo_candidate_indices(
    cache: _RepoSearchCache,
    *,
    query: _RepoLexicalQuery,
    vector_scores: list[float],
    incompatible_docs: set[str],
    k: int,
    sections_per_doc: int,
) -> tuple[tuple[int, ...], str, int]:
    """Choose records that should receive the full ranker pass.

    Args:
        cache: Search cache holding the records and the section graph.
        query: Parsed lexical query used for the quick recall pass.
        vector_scores: One vector score per record in ``cache.records``.
        incompatible_docs: Ids of documents that must never be scored.
        k: Number of results requested.
        sections_per_doc: Maximum sections per document in the result.

    Returns:
        ``(indices, mode, compatible_count)`` where ``indices`` are ascending
        positions into ``cache.records``, ``mode`` is ``"full"`` when every
        compatible record is returned and ``"shortlist"`` otherwise, and
        ``compatible_count`` is the number of compatible records.
    """
    compatible = tuple(
        index
        for index, record in enumerate(cache.records)
        if record.doc_id not in incompatible_docs
    )
    compatible_count = len(compatible)
    if compatible_count <= _REPO_SEARCH_FULL_SCORE_LIMIT:
        return compatible, "full", compatible_count

    target = max(
        _REPO_SEARCH_SHORTLIST_MIN,
        k * _REPO_SEARCH_SHORTLIST_PER_RESULT,
        k * sections_per_doc * _REPO_SEARCH_SHORTLIST_PER_DOC_RESULT,
    )
    target = min(target, compatible_count)
    if target >= compatible_count:
        return compatible, "full", compatible_count

    def by_vector(index: int) -> float:
        return vector_scores[index]

    # Seed with the best vector matches, then the best quick lexical matches.
    candidate_set: set[int] = set()
    vector_budget = min(target, max(k * 32, target // 2))
    candidate_set.update(heapq.nlargest(vector_budget, compatible, key=by_vector))
    if query.terms or query.phrases:
        candidate_set.update(
            heapq.nlargest(
                target,
                compatible,
                key=lambda index: _repo_quick_recall_score(
                    query,
                    cache.records[index],
                ),
            )
        )
    if len(candidate_set) < target:
        # Top up with further vector matches when the two seeds overlapped.
        candidate_set.update(heapq.nlargest(target, compatible, key=by_vector))

    _expand_repo_candidate_neighbors(
        candidate_set,
        cache,
        limit=min(
            compatible_count,
            target + _REPO_SEARCH_GRAPH_EXPANSION_LIMIT,
        ),
    )
    # Graph edges may cross documents (entity co-mentions), so expansion can
    # reach sections of incompatible documents. Drop them so shortlist mode
    # honours the same exclusion as full mode.
    candidate_set.intersection_update(compatible)
    return tuple(sorted(candidate_set)), "shortlist", compatible_count
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
def _expand_repo_candidate_neighbors(
|
|
589
|
+
candidate_set: set[int],
|
|
590
|
+
cache: _RepoSearchCache,
|
|
591
|
+
*,
|
|
592
|
+
limit: int,
|
|
593
|
+
) -> None:
|
|
594
|
+
for index in tuple(candidate_set):
|
|
595
|
+
record = cache.records[index]
|
|
596
|
+
key = (record.doc_id, record.section_id)
|
|
597
|
+
for neighbor_key, _ in cache.graph_neighbors.get(key, ()):
|
|
598
|
+
neighbor_index = cache.record_index_by_key.get(neighbor_key)
|
|
599
|
+
if neighbor_index is None:
|
|
600
|
+
continue
|
|
601
|
+
candidate_set.add(neighbor_index)
|
|
602
|
+
if len(candidate_set) >= limit:
|
|
603
|
+
return
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def _repo_quick_recall_score(
|
|
607
|
+
query: _RepoLexicalQuery,
|
|
608
|
+
record: _RepoSectionRecord,
|
|
609
|
+
) -> float:
|
|
610
|
+
if not query.terms and not query.phrases:
|
|
611
|
+
return 0.0
|
|
612
|
+
doc_id, source, title, synopsis, body = record.haystacks
|
|
613
|
+
score = 0.0
|
|
614
|
+
for term in query.terms:
|
|
615
|
+
weight = query.weights.get(term, 0.0)
|
|
616
|
+
variants = query.variants.get(term, ())
|
|
617
|
+
token_variants = tuple(variant for variant in variants if " " not in variant)
|
|
618
|
+
if any(
|
|
619
|
+
variant in doc_id or variant in source or variant in title
|
|
620
|
+
for variant in token_variants
|
|
621
|
+
):
|
|
622
|
+
score += weight * 6.0
|
|
623
|
+
elif any(variant in synopsis for variant in token_variants):
|
|
624
|
+
score += weight * 2.5
|
|
625
|
+
elif any(variant in body for variant in token_variants):
|
|
626
|
+
score += weight
|
|
627
|
+
term_frequency = max(
|
|
628
|
+
(
|
|
629
|
+
record.token_counts.get(variant, 0)
|
|
630
|
+
for variant in token_variants
|
|
631
|
+
),
|
|
632
|
+
default=0,
|
|
633
|
+
)
|
|
634
|
+
if term_frequency:
|
|
635
|
+
score += weight * (1.0 + min(3.0, float(term_frequency)) * 0.25)
|
|
636
|
+
if query.phrases:
|
|
637
|
+
combined = " ".join(record.haystacks)
|
|
638
|
+
for phrase in query.phrases:
|
|
639
|
+
if phrase in combined:
|
|
640
|
+
score += _repo_phrase_weight(phrase)
|
|
641
|
+
score += _doc_identity_bonus(query, record.haystacks) * 8.0
|
|
642
|
+
score += _overview_intent_bonus(query, record) * 4.0
|
|
643
|
+
return score * _root_meta_doc_factor(query, record) * _history_doc_factor(
|
|
644
|
+
query,
|
|
645
|
+
record,
|
|
646
|
+
) * _locale_doc_factor(query, record)
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
def _section_token_counts(
    *,
    doc_id: str,
    source: str,
    title: str,
    synopsis: str,
    body: str,
) -> Counter[str]:
    """Count the search tokens of one section.

    The counted text is the doc id, source, title (included twice), synopsis
    and the first 4000 characters of the body, joined by single spaces.
    """
    fields = [doc_id, source, title, title, synopsis, body[:4000]]
    tokens = _tokenize_search_text(" ".join(fields))
    return Counter(tokens)
|
|
668
|
+
|
|
669
|
+
|
|
670
|
+
def _tokenize_search_text(text: str) -> list[str]:
|
|
671
|
+
return re.findall(
|
|
672
|
+
r"[a-z0-9][a-z0-9]*",
|
|
673
|
+
text.lower().replace("/", " ").replace("-", " ").replace("_", " "),
|
|
674
|
+
)
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
def _bm25_sparse_score(
|
|
678
|
+
query: _RepoLexicalQuery,
|
|
679
|
+
record: _RepoSectionRecord,
|
|
680
|
+
cache: _RepoSearchCache,
|
|
681
|
+
) -> float:
|
|
682
|
+
if not query.terms or record.token_count <= 0 or cache.avg_token_count <= 0:
|
|
683
|
+
return 0.0
|
|
684
|
+
corpus_size = max(1, len(cache.records))
|
|
685
|
+
k1 = 1.2
|
|
686
|
+
b = 0.75
|
|
687
|
+
raw = 0.0
|
|
688
|
+
max_raw = 0.0
|
|
689
|
+
length_norm = k1 * (
|
|
690
|
+
(1.0 - b) + b * (record.token_count / cache.avg_token_count)
|
|
691
|
+
)
|
|
692
|
+
for term in query.terms:
|
|
693
|
+
tf = max(
|
|
694
|
+
(
|
|
695
|
+
record.token_counts.get(variant, 0)
|
|
696
|
+
for variant in query.variants[term]
|
|
697
|
+
if " " not in variant
|
|
698
|
+
),
|
|
699
|
+
default=0,
|
|
700
|
+
)
|
|
701
|
+
df = max(
|
|
702
|
+
(
|
|
703
|
+
cache.df.get(variant, 0)
|
|
704
|
+
for variant in query.variants[term]
|
|
705
|
+
if " " not in variant
|
|
706
|
+
),
|
|
707
|
+
default=0,
|
|
708
|
+
)
|
|
709
|
+
if tf <= 0 or df <= 0:
|
|
710
|
+
continue
|
|
711
|
+
idf = math.log(1.0 + ((corpus_size - df + 0.5) / (df + 0.5)))
|
|
712
|
+
weighted_idf = idf * query.weights[term]
|
|
713
|
+
raw += weighted_idf * ((tf * (k1 + 1.0)) / (tf + length_norm))
|
|
714
|
+
max_raw += weighted_idf * (k1 + 1.0)
|
|
715
|
+
if max_raw <= 0:
|
|
716
|
+
return 0.0
|
|
717
|
+
return max(0.0, min(1.0, raw / max_raw))
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
def _add_graph_edge(
|
|
721
|
+
graph: dict[tuple[str, str], dict[tuple[str, str], float]],
|
|
722
|
+
left: tuple[str, str],
|
|
723
|
+
right: tuple[str, str],
|
|
724
|
+
*,
|
|
725
|
+
weight: float,
|
|
726
|
+
) -> None:
|
|
727
|
+
if left == right:
|
|
728
|
+
return
|
|
729
|
+
graph[left][right] = max(graph[left].get(right, 0.0), weight)
|
|
730
|
+
graph[right][left] = max(graph[right].get(left, 0.0), weight)
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def _score_repo_record(
    record: _RepoSectionRecord,
    *,
    query: _RepoLexicalQuery,
    cache: _RepoSearchCache,
    vector_score: float,
) -> _RepoScoredHit:
    """Run the full ranker on one record, without graph contribution.

    The lexical, sparse and vector signals are combined with a graph score
    of ``0.0``. The identity bonus is added, the sum is capped at ``1.0`` and
    then multiplied by the rank factor. The returned hit starts with
    ``score`` equal to ``base_score`` and ``graph_score`` of ``0.0``.
    """
    focus_support = _focus_field_support(query, record.haystacks)
    coverage = _weighted_term_coverage(query, record.haystacks)

    profile_score = _lexical_score_from_profile(query, record.haystacks)
    supported_score = _field_supported_lexical_score(
        profile_score,
        focus_support=focus_support,
    )
    uncapped_lexical = (
        supported_score * _coverage_factor(coverage)
        + _overview_intent_bonus(query, record)
    )
    lexical_score = min(1.0, uncapped_lexical)

    sparse_score = _bm25_sparse_score(query, record, cache)

    root_factor = _root_meta_doc_factor(query, record)
    history_factor = _history_doc_factor(
        query,
        record,
    )
    rank_factor = root_factor * history_factor * _locale_doc_factor(query, record)

    identity_bonus = _doc_identity_bonus(query, record.haystacks)
    combined = _combine_repo_scores(
        vector_score,
        lexical_score,
        sparse_score=sparse_score,
        graph_score=0.0,
    )
    # Cap first, then apply the rank factor.
    base_score = min(1.0, combined + identity_bonus) * rank_factor

    return _RepoScoredHit(
        record=record,
        score=base_score,
        vector_score=vector_score,
        lexical_score=lexical_score,
        sparse_score=sparse_score,
        graph_score=0.0,
        base_score=base_score,
        rank_factor=rank_factor,
        identity_bonus=identity_bonus,
    )
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
def _apply_graph_scores(
    hits: list[_RepoScoredHit],
    cache: _RepoSearchCache,
) -> None:
    """Fill in ``graph_score`` and recompute ``score`` for every hit, in place.

    A hit's graph score is the sum of ``base_score * edge weight`` over its
    neighbors that are also hits, divided by the total weight of all of its
    edges. Neighbors that were not scored therefore count as zero.
    """
    hit_by_key = {(hit.record.doc_id, hit.record.section_id): hit for hit in hits}
    for hit in hits:
        edges = cache.graph_neighbors.get(
            (hit.record.doc_id, hit.record.section_id),
            (),
        )
        weighted_total = 0.0
        weight_sum = 0.0
        for neighbor_key, weight in edges:
            weight_sum += weight
            neighbor = hit_by_key.get(neighbor_key)
            if neighbor is not None:
                weighted_total += neighbor.base_score * weight
        graph_score = weighted_total / weight_sum if weight_sum else 0.0

        combined = _combine_repo_scores(
            hit.vector_score,
            hit.lexical_score,
            sparse_score=hit.sparse_score,
            graph_score=graph_score,
            graph_present=bool(edges),
        )
        hit.graph_score = graph_score
        hit.score = min(1.0, combined + hit.identity_bonus) * hit.rank_factor
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
def _repo_scored_payload(
    hit: _RepoScoredHit,
    *,
    query: str,
    lexical_query: _RepoLexicalQuery,
    include_set: Collection[IncludeField],
) -> dict[str, Any]:
    """Serialize one scored hit into a result dictionary.

    The identity fields, the final and per-signal scores, the anchor and the
    explanation are always present.  ``synopsis`` (only when non-empty),
    ``head`` (first 200 characters of the body) and ``evidence`` are added
    when their name is in ``include_set``.
    """
    section = hit.record
    payload: dict[str, Any] = {
        "doc": section.doc_id,
        "source": section.source,
        "id": section.section_id,
        "title": section.title,
    }
    payload.update(
        {
            "score": hit.score,
            "vector_score": hit.vector_score,
            "lexical_score": hit.lexical_score,
            "sparse_score": hit.sparse_score,
            "graph_score": hit.graph_score,
            "anchor": section.index.anchor(section.section_id),
            "explanation": _repo_hit_explanation(hit, lexical_query),
        }
    )

    if "synopsis" in include_set and section.synopsis:
        payload["synopsis"] = section.synopsis
    if "head" in include_set:
        payload["head"] = section.body[:200]
    if "evidence" in include_set:
        payload["evidence"] = _evidence_snippet(section.body, query)
    return payload
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
def _repo_hit_explanation(
    hit: _RepoScoredHit,
    query: _RepoLexicalQuery,
) -> dict[str, Any]:
    """Describe why a hit scored the way it did.

    Returns a dictionary with the dominant signal, the query terms found in
    the record, each signal's rounded score alongside its profile weight, the
    rounded rank factor and identity bonus, and human-readable notes.
    """
    profile = _DEFAULT_RANK_PROFILE

    # (name, raw score, tie-break priority); on equal scores the higher
    # priority wins, so lexical beats sparse beats vector beats graph.
    contenders = (
        ("lexical", hit.lexical_score, 4),
        ("sparse", hit.sparse_score, 3),
        ("vector", hit.vector_score, 2),
        ("graph", hit.graph_score, 1),
    )
    dominant_signal = max(contenders, key=lambda entry: (entry[1], entry[2]))[0]

    matched_terms = _repo_matched_terms(query, hit.record)

    note_rules = (
        (
            bool(matched_terms),
            "matched query terms in doc/source/title/summary/body fields",
        ),
        (hit.sparse_score > 0, "BM25-style sparse evidence contributed"),
        (
            hit.graph_score > 0,
            "tree/xref/entity neighborhood support contributed",
        ),
        (hit.identity_bonus > 0, "document or path identity matched the query"),
        (
            hit.rank_factor != 1.0,
            "rank factor adjusted broad root-document placement",
        ),
    )
    notes: list[str] = [message for applies, message in note_rules if applies]

    reported = (
        ("vector", hit.vector_score, profile.vector_weight),
        ("lexical", hit.lexical_score, profile.lexical_weight),
        ("sparse", hit.sparse_score, profile.sparse_weight),
        ("graph", hit.graph_score, profile.graph_weight),
    )
    signals = {
        name: {"score": _round_score(raw), "weight": weight}
        for name, raw, weight in reported
    }

    return {
        "dominant_signal": dominant_signal,
        "matched_terms": matched_terms,
        "signals": signals,
        "rank_factor": _round_score(hit.rank_factor),
        "identity_bonus": _round_score(hit.identity_bonus),
        "notes": notes,
    }
|
|
898
|
+
|
|
899
|
+
|
|
900
|
+
def _repo_matched_terms(
    query: _RepoLexicalQuery,
    record: _RepoSectionRecord,
) -> list[str]:
    """Return the query terms that occur anywhere in the record's haystacks.

    A term counts as matched when any of its variants is a substring of the
    space-joined haystacks.  Query order is preserved.
    """
    if not query.terms:
        return []
    searchable = " ".join(record.haystacks)
    found: list[str] = []
    for term in query.terms:
        for variant in query.variants[term]:
            if variant in searchable:
                found.append(term)
                break
    return found
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
def _round_score(value: float) -> float:
    """Coerce ``value`` to ``float`` and round it to four decimal places."""
    as_float = float(value)
    return round(as_float, 4)
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def _combine_repo_scores(
    vector_score: float,
    lexical_score: float,
    *,
    sparse_score: float,
    graph_score: float,
    graph_present: bool = False,
) -> float:
    """Blend the per-signal scores into one relevance score.

    With neither lexical nor sparse evidence the result is the vector score
    scaled by the profile's ``no_lexical_vector_weight``; that path is not
    capped and ignores the graph signal.  Otherwise the vector, lexical and
    gated sparse scores are summed with their profile weights.  If the hit
    has graph neighbours or a positive graph score, that sum is mixed with
    the graph score using ``graph_weight``.  Results of the blended paths are
    capped at 1.0.
    """
    profile = _DEFAULT_RANK_PROFILE
    if lexical_score <= 0 and sparse_score <= 0:
        return vector_score * profile.no_lexical_vector_weight

    gated_sparse = _trusted_sparse_score(
        lexical_score=lexical_score,
        sparse_score=sparse_score,
    )
    vector_part = vector_score * profile.vector_weight
    lexical_part = lexical_score * profile.lexical_weight
    sparse_part = gated_sparse * profile.sparse_weight
    blended = vector_part + lexical_part + sparse_part

    if not graph_present and graph_score <= 0:
        return min(1.0, blended)

    graph_share = profile.graph_weight
    with_graph = (blended * (1.0 - graph_share)) + (graph_score * graph_share)
    return min(1.0, with_graph)
|
|
944
|
+
|
|
945
|
+
|
|
946
|
+
def _trusted_sparse_score(*, lexical_score: float, sparse_score: float) -> float:
    """Scale the sparse score by how much lexical evidence backs it up.

    A non-positive sparse score yields 0.0.  Without lexical support the
    sparse score is kept at 15%.  Otherwise it is multiplied by a gate equal
    to the larger of the profile's ``sparse_floor_gate`` and the lexical
    score times ``sparse_lexical_gate_multiplier``, capped at 1.0.
    """
    if sparse_score <= 0:
        return 0.0
    if lexical_score <= 0:
        return sparse_score * 0.15

    profile = _DEFAULT_RANK_PROFILE
    lexical_gate = lexical_score * profile.sparse_lexical_gate_multiplier
    uncapped_gate = max(profile.sparse_floor_gate, lexical_gate)
    return sparse_score * min(1.0, uncapped_gate)
|
|
960
|
+
|
|
961
|
+
|
|
962
|
+
def _build_repo_lexical_query(
    query: str,
    *,
    cache: _RepoSearchCache | None = None,
    preferred_locales: tuple[str, ...] = (),
) -> _RepoLexicalQuery:
    """Precompute everything lexical scoring needs for ``query``.

    Extracts the query terms, expands each into its variants, and weights
    each term by its own term weight times its corpus weight from ``cache``.
    ``max_score`` is the total obtained if every term matched every field.
    When no known locale preference is supplied, the locale inferred from
    the query text is used as the fallback.
    """
    terms = tuple(_repo_query_terms(query))
    all_fields_weight = sum(_DEFAULT_RANK_PROFILE.field_weights)

    variants = {}
    weights = {}
    for term in terms:
        expanded = _term_variants(term)
        variants[term] = expanded
        weights[term] = _repo_term_weight(term) * _repo_corpus_term_weight(
            expanded,
            cache,
        )

    max_score = sum(weights[term] * all_fields_weight for term in terms)
    locales = _normalized_preferred_locales(
        preferred_locales,
        fallback=_infer_query_locale(query),
    )
    return _RepoLexicalQuery(
        terms=terms,
        variants=variants,
        weights=weights,
        phrases=tuple(_command_phrases(query)),
        max_score=max_score,
        preferred_locales=locales,
    )
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
def _repo_corpus_term_weight(
    variants: tuple[str, ...],
    cache: _RepoSearchCache | None,
) -> float:
    """Weight a term by how rare its single-word variants are in the corpus.

    Returns 1.0 when there is no cache, no records, no single-word variant,
    or no record containing any variant.  Otherwise an IDF-style value over
    ``cache.records`` is normalized by its maximum and mapped into the range
    0.35 to 1.0, so common terms are damped but never zeroed.
    """
    if cache is None or not cache.records:
        return 1.0

    single_words = {variant for variant in variants if " " not in variant}
    if not single_words:
        return 1.0

    matching_records = 0
    for record in cache.records:
        counts = record.token_counts
        for word in single_words:
            if word in counts:
                matching_records += 1
                break
    if matching_records <= 0:
        return 1.0

    total_records = len(cache.records)
    idf = math.log(
        1.0 + ((total_records - matching_records + 0.5) / (matching_records + 0.5))
    )
    ceiling = math.log(1.0 + ((total_records + 0.5) / 0.5))
    if ceiling <= 0:
        return 1.0
    rarity = max(0.0, min(1.0, idf / ceiling))
    return 0.35 + (0.65 * rarity)
|
|
1012
|
+
|
|
1013
|
+
|
|
1014
|
+
def _lexical_score_from_profile(
    query: _RepoLexicalQuery,
    haystacks: tuple[str, str, str, str, str],
) -> float:
    """Score lexical overlap between the query and a record's five fields.

    Each term adds ``term weight * field weight`` for every field containing
    one of its variants.  Runs of two to four consecutive query terms found
    in the normalized combined text add their length, and each matching
    command phrase adds its phrase weight.  The total is divided by
    ``query.max_score`` and capped at 1.0.
    """
    if not query.terms or query.max_score <= 0:
        return 0.0

    field_weights = _DEFAULT_RANK_PROFILE.field_weights
    total = 0.0
    for term in query.terms:
        term_weight = query.weights[term]
        term_variants = query.variants[term]
        for field_text, field_weight in zip(haystacks, field_weights, strict=True):
            for variant in term_variants:
                if variant in field_text:
                    total += term_weight * field_weight
                    break

    combined = _normalize_search_text(" ".join(haystacks))
    term_count = len(query.terms)
    # Longest runs first, down to pairs; every run that matches adds its size.
    for run_length in range(min(4, term_count), 1, -1):
        for offset in range(term_count - run_length + 1):
            run = " ".join(query.terms[offset : offset + run_length])
            if run in combined:
                total += float(run_length)

    for phrase in query.phrases:
        if phrase in combined:
            total += _repo_phrase_weight(phrase)

    return min(1.0, total / query.max_score)
|
|
1038
|
+
|
|
1039
|
+
|
|
1040
|
+
def _repo_query_terms(query: str) -> list[str]:
    """Extract the search terms for a repo-wide query.

    Starts from ``_query_terms(query)`` minus a fixed list of generic words,
    then appends any word from the lower-cased query that is at least two
    characters long, not generic, not already present, and looks like a
    compact identifier.
    """
    stopwords = {
        "about",
        "do",
        "does",
        "from",
        "how",
        "in",
        "into",
        "it",
        "on",
        "using",
        "what",
        "when",
        "where",
        "which",
        "work",
        "works",
        "with",
    }
    kept = [term for term in _query_terms(query) if term not in stopwords]
    known = set(kept)
    for candidate in re.findall(r"[A-Za-z0-9_][A-Za-z0-9_-]*", query.lower()):
        if len(candidate) < 2 or candidate in stopwords or candidate in known:
            continue
        if not _looks_like_compact_identifier(candidate):
            continue
        known.add(candidate)
        kept.append(candidate)
    return kept
|
|
1072
|
+
|
|
1073
|
+
|
|
1074
|
+
def _normalize_field_text(text: str) -> str:
    """Normalize a record field into space-separated lower-case ASCII tokens.

    ``_normalize_search_text`` already lower-cases its input and treats "/",
    "-" and "_" as separators, so this delegates directly.  Doing those steps
    here first, as before, produced the same string with a redundant pass
    over the text.
    """
    return _normalize_search_text(text)
|
|
1077
|
+
|
|
1078
|
+
|
|
1079
|
+
def _normalize_search_text(text: str) -> str:
    """Reduce ``text`` to lower-case ASCII alphanumeric tokens joined by spaces.

    "/", "-" and "_" are turned into spaces first; every other character
    outside ``a-z0-9`` acts as a token boundary and is dropped.
    """
    lowered = text.lower()
    for separator in ("/", "-", "_"):
        lowered = lowered.replace(separator, " ")
    tokens = re.findall(r"[a-z0-9][a-z0-9]*", lowered)
    return " ".join(tokens)
|
|
1082
|
+
|
|
1083
|
+
|
|
1084
|
+
def _looks_like_compact_identifier(token: str) -> bool:
    """Return True for tokens of at most four characters, or containing a digit or "_"."""
    if len(token) <= 4:
        return True
    if "_" in token:
        return True
    return any(character.isdigit() for character in token)
|
|
1086
|
+
|
|
1087
|
+
|
|
1088
|
+
def _repo_term_weight(term: str) -> float:
    """Down-weight broad verbs that otherwise dominate docs-heavy repos.

    Returns 0.35 for the listed run/test/use/write word forms and 1.0 for
    every other term.  Matching is exact and case-sensitive.
    """
    broad_verbs = (
        "run",
        "runs",
        "test",
        "tests",
        "testing",
        "use",
        "using",
        "write",
        "writes",
        "written",
    )
    return 0.35 if term in broad_verbs else 1.0
|
|
1104
|
+
|
|
1105
|
+
|
|
1106
|
+
def _overview_intent_bonus(
    query: _RepoLexicalQuery,
    record: _RepoSectionRecord,
) -> float:
    """Bonus for records that look like the overview of the first query term.

    Only single-word variants of ``query.terms[0]`` are considered.  The doc
    bonus applies when the tokenized doc id is exactly ``(variant,)`` or
    ``("docs", variant)``; the title bonus applies when the normalized title
    equals a variant.  The larger one is used, a shallow-section bonus is
    added when the section id has at most one "/", and the result is capped
    at the profile's ``overview_max_bonus``.
    """
    if not query.terms:
        return 0.0
    lead_term = query.terms[0]

    doc_tokens = tuple(_tokenize_search_text(record.doc_id))
    normalized_title = _normalize_search_text(record.title)
    profile = _DEFAULT_RANK_PROFILE
    single_words = {
        candidate
        for candidate in query.variants.get(lead_term, ())
        if " " not in candidate
    }
    if not single_words:
        return 0.0

    bonus = 0.0
    doc_named_after_term = any(
        doc_tokens == (candidate,) or doc_tokens == ("docs", candidate)
        for candidate in single_words
    )
    if doc_named_after_term:
        bonus = max(bonus, profile.overview_doc_bonus)
    if normalized_title in single_words:
        bonus = max(bonus, profile.overview_title_bonus)

    is_shallow = record.section_id.count("/") <= 1
    if bonus > 0 and is_shallow:
        bonus += profile.overview_shallow_bonus
    return min(profile.overview_max_bonus, bonus)
|
|
1133
|
+
|
|
1134
|
+
|
|
1135
|
+
def _focus_field_support(
    query: _RepoLexicalQuery,
    haystacks: tuple[str, str, str, str, str],
) -> float:
    """Measure how prominently the first two weighted terms appear.

    For each focus term, a single-word variant found in the doc id, source
    or title earns the full term weight.  One found only in the synopsis or
    only in the body earns the weight scaled by the profile's
    ``focus_synopsis_support`` or ``focus_body_support``.  The result is the
    earned share of the focus terms' total weight, clamped to 0.0-1.0;
    queries with nothing to focus on return 1.0.
    """
    if not query.terms:
        return 1.0
    doc_id, source, title, synopsis, body = haystacks
    profile = _DEFAULT_RANK_PROFILE

    positive_terms = [
        term for term in query.terms if query.weights.get(term, 0.0) > 0
    ]
    focus_terms = tuple(positive_terms[:2])
    if not focus_terms:
        return 1.0
    total = sum(query.weights[term] for term in focus_terms)
    if total <= 0:
        return 1.0

    def found_in(words, *fields):
        # True when any single-word variant is a substring of any field.
        return any(word in field for word in words for field in fields)

    earned = 0.0
    for term in focus_terms:
        single_words = {
            candidate
            for candidate in query.variants.get(term, ())
            if " " not in candidate
        }
        if not single_words:
            continue
        term_weight = query.weights[term]
        if found_in(single_words, doc_id, source, title):
            earned += term_weight
        elif found_in(single_words, synopsis):
            earned += term_weight * profile.focus_synopsis_support
        elif found_in(single_words, body):
            earned += term_weight * profile.focus_body_support
    return max(0.0, min(1.0, earned / total))
|
|
1170
|
+
|
|
1171
|
+
|
|
1172
|
+
def _field_supported_lexical_score(score: float, *, focus_support: float) -> float:
    """Damp a lexical score when the focus terms lack field support.

    Non-positive scores and full support (``focus_support >= 1``) pass
    through unchanged.  Otherwise the score is multiplied by the profile's
    ``focus_support_floor`` plus ``focus_support_weight`` times the support,
    with negative support treated as zero.
    """
    if score <= 0 or focus_support >= 1:
        return score
    profile = _DEFAULT_RANK_PROFILE
    usable_support = max(0.0, focus_support)
    damping = profile.focus_support_floor + (
        profile.focus_support_weight * usable_support
    )
    return score * damping
|
|
1180
|
+
|
|
1181
|
+
|
|
1182
|
+
def _weighted_term_coverage(
    query: _RepoLexicalQuery,
    haystacks: tuple[str, str, str, str, str],
) -> float:
    """Return the weight share of query terms found anywhere in the record.

    A term is covered when any of its variants is a substring of the
    space-joined haystacks.  The result is clamped to 0.0-1.0; a query with
    no terms or no positive total weight counts as fully covered (1.0).
    """
    if not query.terms:
        return 1.0
    searchable = " ".join(haystacks)
    weights = query.weights
    total_weight = sum(weights[term] for term in query.terms)
    if total_weight <= 0:
        return 1.0

    covered_weight = 0.0
    for term in query.terms:
        for variant in query.variants.get(term, ()):
            if variant in searchable:
                covered_weight += weights[term]
                break
    share = covered_weight / total_weight
    return max(0.0, min(1.0, share))
|
|
1198
|
+
|
|
1199
|
+
|
|
1200
|
+
def _coverage_factor(coverage: float) -> float:
    """Map term coverage to a score multiplier.

    ``coverage`` is clamped to 0.0-1.0, scaled by the profile's
    ``coverage_weight`` and added to ``coverage_floor``.
    """
    profile = _DEFAULT_RANK_PROFILE
    bounded = max(0.0, min(1.0, coverage))
    return profile.coverage_floor + (profile.coverage_weight * bounded)
|
|
1205
|
+
|
|
1206
|
+
|
|
1207
|
+
def _doc_identity_bonus(
    query: _RepoLexicalQuery,
    haystacks: tuple[str, str, str, str, str],
) -> float:
    """Bonus for records whose doc id or source path names the query terms.

    Looks at the first three positively weighted terms and sums the weight
    of those with a single-word variant inside the doc id or source
    haystack.  That share of the focus weight, clamped to 0.0-1.0, is scaled
    by the profile's ``doc_identity_bonus_weight``.
    """
    if not query.terms:
        return 0.0
    doc_id, source, _, _, _ = haystacks
    weights = query.weights

    focus_terms = tuple(
        term for term in query.terms if weights.get(term, 0.0) > 0
    )[:3]
    total = sum(weights[term] for term in focus_terms)
    if total <= 0:
        return 0.0

    matched = 0.0
    for term in focus_terms:
        for variant in query.variants.get(term, ()):
            if " " in variant:
                continue
            if variant in doc_id or variant in source:
                matched += weights[term]
                break

    share = max(0.0, min(1.0, matched / total))
    return _DEFAULT_RANK_PROFILE.doc_identity_bonus_weight * share
|
|
1234
|
+
|
|
1235
|
+
|
|
1236
|
+
def _root_meta_doc_factor(
    query: _RepoLexicalQuery,
    record: _RepoSectionRecord,
) -> float:
    """Penalty factor for upper-case root-level documents the query did not name.

    The profile's ``root_meta_doc_factor`` is returned only when the source
    sits directly in the root directory, its stem is fully upper-case and is
    not ``README`` or ``CHANGELOG``, and the first query term has no match in
    the doc id, source or title.  Every other record gets 1.0.
    """
    path = Path(record.source)
    stem = path.stem
    in_root = path.parent == Path(".")
    penalizable = (
        in_root
        and stem not in {"README", "CHANGELOG"}
        and stem == stem.upper()
    )
    if not penalizable or _first_term_has_structural_support(
        query, record.haystacks
    ):
        return 1.0
    return _DEFAULT_RANK_PROFILE.root_meta_doc_factor
|
|
1248
|
+
|
|
1249
|
+
|
|
1250
|
+
def _history_doc_factor(query: _RepoLexicalQuery, record: _RepoSectionRecord) -> float:
    """Penalty factor for history documents when the query is not about history.

    Returns the profile's ``history_doc_generic_factor`` for a history
    document hit by a query that does not ask for history, otherwise 1.0.
    """
    if _query_wants_history(query) or not _is_history_doc(record):
        return 1.0
    return _DEFAULT_RANK_PROFILE.history_doc_generic_factor
|
|
1256
|
+
|
|
1257
|
+
|
|
1258
|
+
def _is_history_doc(record: _RepoSectionRecord) -> bool:
    """Return True when the doc id, source or title contains a history-doc term."""
    identity_text = f"{record.doc_id} {record.source} {record.title}"
    identity_tokens = set(_tokenize_search_text(identity_text))
    return not identity_tokens.isdisjoint(_HISTORY_DOC_TERMS)
|
|
1261
|
+
|
|
1262
|
+
|
|
1263
|
+
def _query_wants_history(query: _RepoLexicalQuery) -> bool:
    """Return True when a query term, or a single-word variant of one, is a history term."""
    for term in query.terms:
        if term in _HISTORY_QUERY_TERMS:
            return True
    for term in query.terms:
        for variant in query.variants.get(term, ()):
            if " " not in variant and variant in _HISTORY_QUERY_TERMS:
                return True
    return False
|
|
1273
|
+
|
|
1274
|
+
|
|
1275
|
+
def _locale_doc_factor(query: _RepoLexicalQuery, record: _RepoSectionRecord) -> float:
    """Factor that favours documents in one of the query's preferred locales.

    Returns 1.0 when the query has no preferred locales or the source path
    carries no recognizable locale.  Otherwise returns the profile's
    ``locale_match_factor`` or ``locale_mismatch_factor``.
    """
    preferred = query.preferred_locales
    if not preferred:
        return 1.0
    doc_locale = _source_locale(record.source)
    if doc_locale is None:
        return 1.0
    profile = _DEFAULT_RANK_PROFILE
    if doc_locale in preferred:
        return profile.locale_match_factor
    return profile.locale_mismatch_factor
|
|
1285
|
+
|
|
1286
|
+
|
|
1287
|
+
def _source_locale(source: str) -> str | None:
|
|
1288
|
+
for part in Path(source).parts:
|
|
1289
|
+
normalized = part.lower().replace("_", "-")
|
|
1290
|
+
if normalized in _KNOWN_LOCALES:
|
|
1291
|
+
return normalized
|
|
1292
|
+
match = re.fullmatch(r"([a-z]{2})(?:-[a-z0-9]{2,8})+", normalized)
|
|
1293
|
+
if match and match.group(1) in _KNOWN_LOCALES:
|
|
1294
|
+
return match.group(1)
|
|
1295
|
+
return None
|
|
1296
|
+
|
|
1297
|
+
|
|
1298
|
+
def _infer_query_locale(query: str) -> str | None:
|
|
1299
|
+
if re.search(r"[\u3040-\u30ff\u3400-\u9fff\uac00-\ud7af]", query):
|
|
1300
|
+
return None
|
|
1301
|
+
if re.search(r"[A-Za-z]", query):
|
|
1302
|
+
return "en"
|
|
1303
|
+
return None
|
|
1304
|
+
|
|
1305
|
+
|
|
1306
|
+
def _normalized_preferred_locales(
|
|
1307
|
+
locales: tuple[str, ...],
|
|
1308
|
+
*,
|
|
1309
|
+
fallback: str | None,
|
|
1310
|
+
) -> tuple[str, ...]:
|
|
1311
|
+
normalized = tuple(
|
|
1312
|
+
locale.lower().replace("_", "-").split("-", 1)[0]
|
|
1313
|
+
for locale in locales
|
|
1314
|
+
if locale.strip()
|
|
1315
|
+
)
|
|
1316
|
+
if normalized:
|
|
1317
|
+
return tuple(
|
|
1318
|
+
locale for locale in normalized if locale in _KNOWN_LOCALES
|
|
1319
|
+
)
|
|
1320
|
+
if fallback is None:
|
|
1321
|
+
return ()
|
|
1322
|
+
return (fallback,)
|
|
1323
|
+
|
|
1324
|
+
|
|
1325
|
+
def _first_term_has_structural_support(
    query: _RepoLexicalQuery,
    haystacks: tuple[str, str, str, str, str],
) -> bool:
    """Tell whether the first query term appears in the doc id, source or title.

    Only single-word variants of the first term are checked, as substrings.
    A query with no terms, or whose first term has no single-word variant,
    counts as supported.
    """
    if not query.terms:
        return True
    lead_variants = [
        candidate
        for candidate in query.variants.get(query.terms[0], ())
        if " " not in candidate
    ]
    if not lead_variants:
        return True
    doc_id, source, title, _, _ = haystacks
    for candidate in lead_variants:
        if candidate in doc_id or candidate in source or candidate in title:
            return True
    return False
|
|
1343
|
+
|
|
1344
|
+
|
|
1345
|
+
def _term_variants(term: str) -> tuple[str, ...]:
    """Expand a query term into the word forms that should also match it.

    Adds simple singular/plural forms based on the term's ending and length,
    then every member of each hand-written word family whose prefix the term
    starts with.  The result has no duplicates and is ordered longest first,
    with equal lengths in reverse alphabetical order.
    """
    forms = {term}
    length = len(term)
    ends_with_s = term.endswith("s")

    if length > 4 and term.endswith("ies"):
        forms.add(f"{term[:-3]}y")
    if ends_with_s:
        if length > 3:
            forms.add(term[:-1])
    elif length > 2:
        forms.add(f"{term}s")
    if length > 3 and term.endswith("y"):
        forms.add(f"{term[:-1]}ies")

    # (prefixes that select the family, members added when one matches)
    families = (
        (
            ("eval", "evaluat"),
            (
                "eval",
                "evals",
                "evaluate",
                "evaluates",
                "evaluated",
                "evaluating",
                "evaluation",
                "evaluations",
                "evaluator",
                "evaluators",
            ),
        ),
        (
            ("depend", "deps"),
            (
                "dep",
                "deps",
                "depend",
                "depends",
                "dependency",
                "dependencies",
                "dependent",
                "dependents",
            ),
        ),
        (
            ("inject",),
            (
                "inject",
                "injects",
                "injected",
                "injecting",
                "injection",
                "injections",
            ),
        ),
        (
            ("config",),
            ("config", "configs", "configure", "configured", "configuration"),
        ),
        (
            ("auth",),
            ("auth", "authenticate", "authentication", "authorization"),
        ),
        (
            ("install",),
            (
                "install",
                "installs",
                "installed",
                "installer",
                "installers",
                "installing",
                "installation",
            ),
        ),
        (("login",), ("login", "logins", "logged", "logging")),
        (("publish",), ("publish", "published", "publishes", "publishing")),
        (("store",), ("store", "stored", "stores", "storage")),
        (("stream",), ("stream", "streams", "streamed", "streaming")),
    )
    for prefixes, members in families:
        if term.startswith(prefixes):
            forms.update(members)

    return tuple(sorted(forms, key=lambda form: (len(form), form), reverse=True))
|
|
1423
|
+
|
|
1424
|
+
|
|
1425
|
+
def _repo_phrase_weight(phrase: str) -> float:
    """Weight a matched command phrase by its length and identifier content.

    Starts at 2.0 plus the token count (counted up to four), adds 2.0 when
    any token looks like a compact identifier, and caps the result at 8.0.
    """
    words = phrase.split()
    weight = 2.0 + min(4.0, float(len(words)))
    for word in words:
        if _looks_like_compact_identifier(word):
            weight += 2.0
            break
    return min(8.0, weight)
|
|
1431
|
+
|
|
1432
|
+
|
|
1433
|
+
def _command_phrases(query: str) -> list[str]:
    """Extract command-like phrases of two to four tokens from the query.

    A window of consecutive lower-cased tokens becomes a phrase when it has
    no generic word and at least one token that looks like a compact
    identifier.  Phrases are unique and ordered longest window first, then
    by position in the query.
    """
    tokens = re.findall(r"[a-z0-9_][a-z0-9_-]*", query.lower())
    token_count = len(tokens)
    if token_count < 2:
        return []

    stopwords = {
        "a",
        "an",
        "and",
        "are",
        "for",
        "how",
        "in",
        "is",
        "of",
        "or",
        "the",
        "to",
        "what",
        "where",
        "with",
    }
    # A dict serves as an insertion-ordered set of phrases.
    ordered: dict[str, None] = {}
    for width in range(min(4, token_count), 1, -1):
        for offset in range(token_count - width + 1):
            window = tokens[offset : offset + width]
            if not stopwords.isdisjoint(window):
                continue
            if not any(_looks_like_compact_identifier(token) for token in window):
                continue
            ordered.setdefault(" ".join(window), None)
    return list(ordered)
|
|
1468
|
+
|
|
1469
|
+
|
|
1470
|
+
def _diversify_repo_hits(
    hits: list[_RepoScoredHit],
    *,
    limit: int,
    sections_per_doc: int,
) -> list[_RepoScoredHit]:
    """Pick up to ``limit`` hits, spreading them across documents first.

    The first pass walks ``hits`` in order and takes the first hit of every
    document.  If room remains and ``sections_per_doc`` is greater than one,
    a second pass appends further sections in ``hits`` order until each
    document holds at most ``sections_per_doc`` entries.  Second-pass hits
    therefore come after all first-pass hits, whatever their score.

    Args:
        hits: Candidate hits in the order they should be considered.
        limit: Maximum number of hits to return.
        sections_per_doc: Maximum number of hits kept per document.

    Returns:
        The selected hits.  An empty list when ``limit`` is not positive.
    """
    # Without this guard the first pass appended one hit before comparing
    # against the limit, so limit=0 still returned a result.
    if limit <= 0:
        return []

    selected: list[_RepoScoredHit] = []
    per_doc: dict[str, int] = {}

    # Pass 1: the first hit of each document.
    for hit in hits:
        doc_id = hit.record.doc_id
        if per_doc.get(doc_id, 0) > 0:
            continue
        selected.append(hit)
        per_doc[doc_id] = 1
        if len(selected) >= limit:
            return selected

    if sections_per_doc <= 1:
        return selected

    # Pass 2: fill remaining slots with further sections, per-document capped.
    taken = {(hit.record.doc_id, hit.record.section_id) for hit in selected}
    for hit in hits:
        doc_id = hit.record.doc_id
        key = (doc_id, hit.record.section_id)
        if key in taken or per_doc.get(doc_id, 0) >= sections_per_doc:
            continue
        selected.append(hit)
        per_doc[doc_id] = per_doc.get(doc_id, 0) + 1
        taken.add(key)
        if len(selected) >= limit:
            return selected
    return selected
|