docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/repo_search.py ADDED
@@ -0,0 +1,1505 @@
1
+ """Repo-scoped search cache, evidence blending, and ranking.
2
+
3
+ The public repo lifecycle API stays in :mod:`cairn.repo`. This module owns the
4
+ large, performance-sensitive search implementation so repository status/sync
5
+ logic does not have to carry ranking internals.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import asyncio
11
+ import heapq
12
+ import math
13
+ import re
14
+ from collections import Counter, defaultdict
15
+ from collections.abc import Collection
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+ from typing import Any, Final, Protocol
19
+
20
+ import numpy as np
21
+
22
+ from cairn.index.vectors import l2_normalize
23
+ from cairn.tools.base import DocumentIndex
24
+ from cairn.tools.search_semantic import IncludeField, _evidence_snippet, _query_terms
25
+
26
# Maximum number of repo roots whose search cache is kept in
# ``_REPO_SEARCH_CACHES`` before an older entry is evicted.
_REPO_SEARCH_CACHE_MAX: Final = 4
# Upper bound on documents loaded concurrently while building a cache.
_REPO_SEARCH_LOAD_CONCURRENCY: Final = 16
# When at most this many compatible sections exist, every one of them gets the
# full ranker pass and no shortlist is built.
_REPO_SEARCH_FULL_SCORE_LIMIT: Final = 2048
# Shortlist sizing: the target is the largest of the minimum, ``k`` times the
# per-result budget, and ``k * sections_per_doc`` times the per-doc-result budget.
_REPO_SEARCH_SHORTLIST_MIN: Final = 768
_REPO_SEARCH_SHORTLIST_PER_RESULT: Final = 96
_REPO_SEARCH_SHORTLIST_PER_DOC_RESULT: Final = 64
# Headroom above the shortlist target that graph-neighbor expansion may use.
_REPO_SEARCH_GRAPH_EXPANSION_LIMIT: Final = 256
33
+
34
+
35
class RepoSearchCandidate(Protocol):
    """Status fields needed by repo search without importing ``cairn.repo``.

    Any object exposing these read-only attributes can be searched. All of
    them except ``source`` feed the cache signature; ``id``, ``source`` and
    ``doc_dir`` are also used when loading the document's index.
    """

    # Document identifier; reported as ``doc`` in hits and skip entries.
    @property
    def id(self) -> str: ...

    # Source label of the document; copied onto every section record.
    @property
    def source(self) -> str: ...

    # Index directory, joined onto the repo root when loading.
    @property
    def doc_dir(self) -> str: ...

    # Lifecycle state; the value "stale" is surfaced in the search payload.
    @property
    def state(self) -> str: ...

    # The three hashes below are only compared for cache invalidation here;
    # ``None`` is treated as an empty string.
    @property
    def indexed_hash(self) -> str | None: ...

    @property
    def source_file_hash(self) -> str | None: ...

    @property
    def indexed_source_file_hash(self) -> str | None: ...

    # Section count used for cache invalidation; ``None`` is treated as 0.
    @property
    def section_count(self) -> int | None: ...
61
+
62
+
63
@dataclass(slots=True)
class _RepoSectionRecord:
    """One searchable section of one document, with precomputed search data."""

    # Owning document id and source label.
    doc_id: str
    source: str
    # Loaded index of the owning document (used later to resolve anchors).
    index: DocumentIndex
    # Tree node id, title and raw text of the section.
    section_id: str
    title: str
    body: str
    # Summary synopsis, or "" when the section has no summary.
    synopsis: str
    # Embedding vector stored for this section.
    vector: tuple[float, ...]
    # Normalized text fields in the fixed order
    # (doc id, source, title, synopsis, first 2000 characters of the body).
    haystacks: tuple[str, str, str, str, str]
    # Token -> occurrence count for the section, and the sum of those counts.
    token_counts: dict[str, int]
    token_count: int
76
+
77
+
78
@dataclass(slots=True)
class _RepoLexicalQuery:
    """Pre-analysed lexical form of a query, built once per search."""

    # Query terms in the order produced by the query-term extractor.
    terms: tuple[str, ...]
    # Per-term spelling variants; variants containing a space are treated as
    # phrases and skipped by the token-count based scorers.
    variants: dict[str, tuple[str, ...]]
    # Per-term weight (term weight multiplied by the corpus rarity weight).
    weights: dict[str, float]
    # Multi-word phrases extracted from the query.
    phrases: tuple[str, ...]
    # Sum over terms of weight * total field weight; used as a normaliser.
    max_score: float
    # Locales to prefer when ranking.
    preferred_locales: tuple[str, ...]
86
+
87
+
88
@dataclass(slots=True)
class _RepoScoredHit:
    """A section record together with every score component computed for it."""

    record: _RepoSectionRecord
    # Final score; equals ``base_score`` until graph scores are applied.
    score: float
    # Individual evidence signals.
    vector_score: float
    lexical_score: float
    sparse_score: float
    # Neighborhood support; 0.0 until graph scores are applied.
    graph_score: float
    # Score computed without graph evidence; neighbors read this value so graph
    # propagation does not depend on processing order.
    base_score: float
    # Multiplicative factor (root/meta doc, history doc and locale factors).
    rank_factor: float
    # Additive bonus for a document-identity match.
    identity_bonus: float
99
+
100
+
101
@dataclass(slots=True)
class _RepoSearchCache:
    """Everything precomputed for one repo root and one set of candidates."""

    # Per-document tuples from ``_repo_search_signature``; a mismatch
    # invalidates the cache.
    signature: tuple[tuple[str, str, str, str, str, str, int], ...]
    # All section records of all successfully loaded documents.
    records: tuple[_RepoSectionRecord, ...]
    # ``{"doc": ..., "reason": ...}`` entries for documents that failed to load.
    skipped: tuple[dict[str, str], ...]
    # Document id -> embedding dimension reported by its vector index.
    doc_dims: dict[str, int]
    # Token -> number of sections containing it.
    df: dict[str, int]
    # Mean ``token_count`` over ``records`` (0.0 when there are none).
    avg_token_count: float
    # (doc id, section id) -> weighted neighbors from tree, xref and entity links.
    graph_neighbors: dict[tuple[str, str], tuple[tuple[tuple[str, str], float], ...]]
    # (doc id, section id) -> position in ``records``.
    record_index_by_key: dict[tuple[str, str], int]
    # Vector length -> float32 matrix of the vectors with that length.
    vector_matrices: dict[int, Any]
    # Vector length -> positions in ``records`` for the matching matrix rows.
    vector_record_indices: dict[int, tuple[int, ...]]
113
+
114
+
115
@dataclass(slots=True)
class _RepoSearchDocumentChunk:
    """Result of loading one document, merged into the cache afterwards."""

    # Document id and embedding dimension; both ``None`` when loading failed.
    doc_id: str | None
    dim: int | None
    # Section records built for the document (empty on failure).
    records: list[_RepoSectionRecord]
    # ``{"doc": ..., "reason": ...}`` when loading failed, otherwise ``None``.
    skipped: dict[str, str] | None
    # Token -> number of this document's sections containing it.
    df: Counter[str]
    # (left key, right key, weight) edges from parent links and xrefs.
    graph_edges: list[tuple[tuple[str, str], tuple[str, str], float]]
    # Lower-cased "kind:canonical" entity key -> sections mentioning the entity.
    entity_sections: dict[str, set[tuple[str, str]]]
124
+
125
+
126
@dataclass(frozen=True, slots=True)
class _RepoRankProfile:
    """Immutable bundle of the tunable weights used by the repo ranker."""

    # Per-field weights, paired positionally with a record's ``haystacks``
    # (doc id, source, title, synopsis, body).
    field_weights: tuple[float, float, float, float, float] = (2.5, 2.0, 3.0, 1.0, 1.0)
    # Blend weights for the vector, lexical and sparse signals.
    vector_weight: float = 0.22
    lexical_weight: float = 0.50
    sparse_weight: float = 0.28
    # Share of the final score given to graph evidence when it is blended in.
    graph_weight: float = 0.10
    # Weight applied to the vector score when there is no lexical or sparse
    # evidence at all.
    no_lexical_vector_weight: float = 0.25
    # Sparse evidence is scaled by a gate of
    # min(1, max(sparse_floor_gate, lexical * sparse_lexical_gate_multiplier)).
    sparse_floor_gate: float = 0.25
    sparse_lexical_gate_multiplier: float = 2.0
    # NOTE(review): the fields below are consumed by helpers outside the
    # visible part of this module (overview bonus, focus support, root/meta
    # doc, coverage, identity, history and locale factors); meanings are
    # inferred from their names only -- confirm against those helpers.
    overview_doc_bonus: float = 0.16
    overview_title_bonus: float = 0.12
    overview_shallow_bonus: float = 0.04
    overview_max_bonus: float = 0.22
    focus_support_floor: float = 0.30
    focus_support_weight: float = 0.70
    focus_synopsis_support: float = 0.65
    focus_body_support: float = 0.45
    root_meta_doc_factor: float = 0.55
    coverage_floor: float = 0.45
    coverage_weight: float = 0.55
    doc_identity_bonus_weight: float = 0.25
    history_doc_generic_factor: float = 0.45
    locale_match_factor: float = 1.04
    locale_mismatch_factor: float = 0.72
151
+
152
+
153
# Process-wide search caches keyed by resolved repo root; bounded by
# ``_REPO_SEARCH_CACHE_MAX``.
_REPO_SEARCH_CACHES: dict[Path, _RepoSearchCache] = {}
# Single shared ranking profile read by the scoring helpers.
_DEFAULT_RANK_PROFILE = _RepoRankProfile()
# NOTE(review): the three sets below are read by helpers outside the visible
# part of this module; presumably document-name terms marking history-style
# documents, query terms signalling history intent, and recognised locale
# codes -- confirm against the history/locale factor helpers.
_HISTORY_DOC_TERMS: Final = frozenset(
    {
        "changelog",
        "changes",
        "history",
        "release",
        "releases",
        "migration",
        "migrations",
    }
)
_HISTORY_QUERY_TERMS: Final = frozenset(
    {
        "breaking",
        "change",
        "changes",
        "changelog",
        "deprecated",
        "deprecation",
        "history",
        "migration",
        "migrations",
        "release",
        "released",
        "releases",
        "upgrade",
        "version",
        "versions",
    }
)
_KNOWN_LOCALES: Final = frozenset(
    {
        "ar",
        "de",
        "en",
        "es",
        "fa",
        "fr",
        "hi",
        "id",
        "it",
        "ja",
        "ko",
        "nl",
        "pl",
        "pt",
        "ru",
        "tr",
        "uk",
        "vi",
        "zh",
    }
)
208
+
209
+
210
async def search_repo_index(
    root: Path,
    *,
    candidates: Collection[RepoSearchCandidate],
    query: str,
    query_vec: list[float],
    k: int,
    include_set: Collection[IncludeField],
    sections_per_doc: int,
    preferred_locales: tuple[str, ...],
) -> dict[str, Any]:
    """Search every candidate document under ``root`` and build the response.

    Inputs are expected to be validated by the caller. The steps are: fetch
    (or build) the per-root cache, analyse the query lexically, score vectors,
    choose which sections get the full ranker pass, score them, blend in graph
    evidence, sort, diversify across documents and serialise the winners.

    Documents whose index dimension differs from ``len(query_vec)`` are not
    scored; they are listed in ``skipped_documents`` together with documents
    that failed to load.
    """
    cache = await _get_repo_search_cache(root, candidates)
    lexical_query = _build_repo_lexical_query(
        query,
        cache=cache,
        preferred_locales=preferred_locales,
    )

    query_dim = len(query_vec)
    incompatible_docs = {
        doc_id
        for doc_id, index_dim in cache.doc_dims.items()
        if index_dim != query_dim
    }
    # Load failures first, then dimension mismatches in sorted doc order.
    skipped: list[dict[str, str]] = [
        *cache.skipped,
        *(
            {
                "doc": doc_id,
                "reason": f"query embedding dim {query_dim} != index dim {cache.doc_dims[doc_id]}",
            }
            for doc_id in sorted(incompatible_docs)
        ),
    ]

    vector_scores = _repo_vector_scores(cache, l2_normalize(query_vec), query_dim)
    candidate_indices, ranker_mode, compatible_count = _repo_candidate_indices(
        cache,
        query=lexical_query,
        vector_scores=vector_scores,
        incompatible_docs=incompatible_docs,
        k=k,
        sections_per_doc=sections_per_doc,
    )

    # Keyed by (doc, section) so a section can only appear once.
    scored_by_key: dict[tuple[str, str], _RepoScoredHit] = {}
    for position in candidate_indices:
        section = cache.records[position]
        scored_by_key[(section.doc_id, section.section_id)] = _score_repo_record(
            section,
            query=lexical_query,
            cache=cache,
            vector_score=vector_scores[position],
        )

    hits = list(scored_by_key.values())
    _apply_graph_scores(hits, cache)
    hits.sort(key=lambda hit: hit.score, reverse=True)
    chosen = _diversify_repo_hits(
        hits,
        limit=k,
        sections_per_doc=sections_per_doc,
    )
    payloads = [
        _repo_scored_payload(
            hit,
            query=query,
            lexical_query=lexical_query,
            include_set=include_set,
        )
        for hit in chosen
    ]
    stale_ids = [doc.id for doc in candidates if doc.state == "stale"]
    ranker_info = {
        "mode": ranker_mode,
        "total_sections": len(cache.records),
        "compatible_sections": compatible_count,
        "scored_sections": len(candidate_indices),
    }
    return {
        "query": query,
        "hits": payloads,
        "sections_per_doc": sections_per_doc,
        "searched_documents": len(candidates),
        "ranker": ranker_info,
        "stale_documents": stale_ids,
        "skipped_documents": skipped,
        "cursor": None,
    }
295
+
296
+
297
async def _get_repo_search_cache(
    root: Path,
    candidates: Collection[RepoSearchCandidate],
) -> _RepoSearchCache:
    """Return the search cache for ``root``, rebuilding it when outdated.

    The cache is keyed by the resolved root and is reused only while the
    candidate signature is unchanged. On a miss every candidate document is
    loaded (at most ``_REPO_SEARCH_LOAD_CONCURRENCY`` at a time), the
    per-document chunks are merged, entity co-mention edges are added, and the
    vector matrices are built.

    ``_REPO_SEARCH_CACHES`` is kept in least-recently-used order: a hit or a
    rebuild moves the root to the most-recent end, and eviction removes from
    the least-recent end, so an actively searched root is not evicted merely
    because it was inserted first.
    """
    resolved_root = root.resolve()
    signature = _repo_search_signature(candidates)
    cached = _REPO_SEARCH_CACHES.get(resolved_root)
    if cached is not None and cached.signature == signature:
        # Re-insert so dict order reflects recency of use.
        _REPO_SEARCH_CACHES[resolved_root] = _REPO_SEARCH_CACHES.pop(resolved_root)
        return cached

    records: list[_RepoSectionRecord] = []
    skipped: list[dict[str, str]] = []
    doc_dims: dict[str, int] = {}
    df_counter: Counter[str] = Counter()
    graph_weights: dict[tuple[str, str], dict[tuple[str, str], float]] = defaultdict(dict)
    entity_sections: dict[str, set[tuple[str, str]]] = defaultdict(set)
    semaphore = asyncio.Semaphore(_REPO_SEARCH_LOAD_CONCURRENCY)

    async def load(doc: RepoSearchCandidate) -> _RepoSearchDocumentChunk:
        async with semaphore:
            return await _load_repo_search_document(root, doc)

    chunks = await asyncio.gather(*(load(doc) for doc in candidates))
    for chunk in chunks:
        if chunk.skipped is not None:
            skipped.append(chunk.skipped)
            continue
        if chunk.doc_id is not None and chunk.dim is not None:
            doc_dims[chunk.doc_id] = chunk.dim
        records.extend(chunk.records)
        df_counter.update(chunk.df)
        for left, right, weight in chunk.graph_edges:
            _add_graph_edge(graph_weights, left, right, weight=weight)
        for key, section_keys in chunk.entity_sections.items():
            entity_sections[key].update(section_keys)

    # Link sections that mention the same entity. Entities seen in a single
    # section carry no link, and entities seen in more than 24 sections are
    # skipped (the pairwise expansion grows quadratically).
    for section_keys in entity_sections.values():
        if len(section_keys) < 2 or len(section_keys) > 24:
            continue
        ordered = sorted(section_keys)
        for i, src in enumerate(ordered):
            for dst in ordered[i + 1 :]:
                _add_graph_edge(graph_weights, src, dst, weight=0.18)

    vector_matrices, vector_record_indices = _repo_vector_matrices(records)
    cache = _RepoSearchCache(
        signature=signature,
        records=tuple(records),
        skipped=tuple(skipped),
        doc_dims=doc_dims,
        df=dict(df_counter),
        avg_token_count=(
            sum(record.token_count for record in records) / len(records)
            if records
            else 0.0
        ),
        graph_neighbors={
            key: tuple(neighbors.items())
            for key, neighbors in graph_weights.items()
        },
        record_index_by_key={
            (record.doc_id, record.section_id): index
            for index, record in enumerate(records)
        },
        vector_matrices=vector_matrices,
        vector_record_indices=vector_record_indices,
    )
    # Remove any outdated entry for this root first so the rebuilt cache is
    # appended at the most-recent end instead of inheriting the old slot.
    _REPO_SEARCH_CACHES.pop(resolved_root, None)
    while _REPO_SEARCH_CACHES and len(_REPO_SEARCH_CACHES) >= _REPO_SEARCH_CACHE_MAX:
        least_recent = next(iter(_REPO_SEARCH_CACHES))
        del _REPO_SEARCH_CACHES[least_recent]
    _REPO_SEARCH_CACHES[resolved_root] = cache
    return cache
372
+
373
+
374
async def _load_repo_search_document(
    root: Path,
    doc: RepoSearchCandidate,
) -> _RepoSearchDocumentChunk:
    """Load one document's index and turn it into search records and edges.

    Loading is best-effort: any exception raised while opening the index or
    reading its vectors yields a chunk whose ``skipped`` entry names the
    document and the failure, and no records. Only tree nodes that have a
    stored vector become records, and only links between such nodes become
    graph edges.
    """
    records: list[_RepoSectionRecord] = []
    df_counter: Counter[str] = Counter()
    graph_edges: list[tuple[tuple[str, str], tuple[str, str], float]] = []
    entity_sections: dict[str, set[tuple[str, str]]] = defaultdict(set)
    try:
        index = DocumentIndex.load(root / doc.doc_dir)
        vectors = {
            entry.id: tuple(entry.vector)
            for entry in await index.vectors.entries()
        }
    except Exception as exc:
        # Deliberately broad: one unreadable document must not abort the whole
        # search. Exceptions raised without a message stringify to "", so fall
        # back to the class name to keep the reported reason informative.
        return _RepoSearchDocumentChunk(
            doc_id=None,
            dim=None,
            records=[],
            skipped={"doc": doc.id, "reason": str(exc) or type(exc).__name__},
            df=Counter(),
            graph_edges=[],
            entity_sections={},
        )

    vector_section_ids = set(vectors)
    for node in index.tree:
        vector = vectors.get(node.id)
        if vector is None:
            continue
        summary = index.summaries.get(node.id)
        synopsis = summary.synopsis if summary is not None else ""
        token_counts = _section_token_counts(
            doc_id=doc.id,
            source=doc.source,
            title=node.title,
            synopsis=synopsis,
            body=node.raw_text,
        )
        # Document frequency counts each token once per section.
        df_counter.update(token_counts.keys())
        records.append(
            _RepoSectionRecord(
                doc_id=doc.id,
                source=doc.source,
                index=index,
                section_id=node.id,
                title=node.title,
                body=node.raw_text,
                synopsis=synopsis,
                vector=vector,
                haystacks=(
                    _normalize_field_text(doc.id),
                    _normalize_field_text(doc.source),
                    _normalize_field_text(node.title),
                    _normalize_field_text(synopsis),
                    _normalize_field_text(node.raw_text[:2000]),
                ),
                token_counts=dict(token_counts),
                token_count=sum(token_counts.values()),
            )
        )

    # Parent/child edges between sections that both have vectors.
    for node in index.tree:
        if (
            node.parent is not None
            and node.id in vector_section_ids
            and node.parent in vector_section_ids
        ):
            graph_edges.append(((doc.id, node.id), (doc.id, node.parent), 0.55))
    # Cross-reference edges, weighted by confidence clamped to [0.2, 1.0].
    if index.xrefs is not None:
        for ref in index.xrefs:
            if ref.src in vector_section_ids and ref.dst in vector_section_ids:
                graph_edges.append(
                    (
                        (doc.id, ref.src),
                        (doc.id, ref.dst),
                        max(0.2, min(1.0, ref.confidence)),
                    )
                )
    # Entity mentions are only collected here; the caller links co-mentioning
    # sections once all documents are merged.
    if index.entities is not None:
        for entity in index.entities:
            key = f"{entity.kind}:{entity.canonical}".lower()
            for mention in entity.mentions:
                if mention.section_id in vector_section_ids:
                    entity_sections[key].add((doc.id, mention.section_id))

    return _RepoSearchDocumentChunk(
        doc_id=doc.id,
        dim=index.vectors.dim,
        records=records,
        skipped=None,
        df=df_counter,
        graph_edges=graph_edges,
        entity_sections=dict(entity_sections),
    )
469
+
470
+
471
+ def _repo_search_signature(
472
+ candidates: Collection[RepoSearchCandidate],
473
+ ) -> tuple[tuple[str, str, str, str, str, str, int], ...]:
474
+ return tuple(
475
+ (
476
+ doc.id,
477
+ doc.doc_dir,
478
+ doc.state,
479
+ doc.indexed_hash or "",
480
+ doc.source_file_hash or "",
481
+ doc.indexed_source_file_hash or "",
482
+ doc.section_count or 0,
483
+ )
484
+ for doc in candidates
485
+ )
486
+
487
+
488
+ def _repo_vector_matrices(
489
+ records: list[_RepoSectionRecord],
490
+ ) -> tuple[dict[int, Any], dict[int, tuple[int, ...]]]:
491
+ by_dim: dict[int, list[tuple[int, tuple[float, ...]]]] = defaultdict(list)
492
+ for index, record in enumerate(records):
493
+ by_dim[len(record.vector)].append((index, record.vector))
494
+ matrices: dict[int, Any] = {}
495
+ indices: dict[int, tuple[int, ...]] = {}
496
+ for dim, rows in by_dim.items():
497
+ indices[dim] = tuple(index for index, _ in rows)
498
+ matrices[dim] = np.asarray([vector for _, vector in rows], dtype=np.float32)
499
+ return matrices, indices
500
+
501
+
502
+ def _repo_vector_scores(
503
+ cache: _RepoSearchCache,
504
+ query: list[float],
505
+ query_dim: int,
506
+ ) -> list[float]:
507
+ scores = [0.0] * len(cache.records)
508
+ matrix = cache.vector_matrices.get(query_dim)
509
+ indices = cache.vector_record_indices.get(query_dim)
510
+ if matrix is None or indices is None:
511
+ return scores
512
+ query_array = np.asarray(query, dtype=np.float32)
513
+ raw_scores = matrix @ query_array
514
+ clipped = np.clip(raw_scores, 0.0, 1.0)
515
+ for record_index, score in zip(indices, clipped.tolist(), strict=True):
516
+ scores[record_index] = float(score)
517
+ return scores
518
+
519
+
520
def _repo_candidate_indices(
    cache: _RepoSearchCache,
    *,
    query: _RepoLexicalQuery,
    vector_scores: list[float],
    incompatible_docs: set[str],
    k: int,
    sections_per_doc: int,
) -> tuple[tuple[int, ...], str, int]:
    """Choose records that should receive the full ranker pass.

    Returns ``(indices, mode, compatible_count)``. ``mode`` is ``"full"`` when
    every compatible record is returned and ``"shortlist"`` when the corpus is
    large enough that only the best vector matches, the best quick lexical
    matches and their graph neighbors are kept.

    Records of documents listed in ``incompatible_docs`` are never returned,
    in either mode.
    """
    compatible = tuple(
        index
        for index, record in enumerate(cache.records)
        if record.doc_id not in incompatible_docs
    )
    compatible_count = len(compatible)
    if compatible_count <= _REPO_SEARCH_FULL_SCORE_LIMIT:
        return compatible, "full", compatible_count

    target = max(
        _REPO_SEARCH_SHORTLIST_MIN,
        k * _REPO_SEARCH_SHORTLIST_PER_RESULT,
        k * sections_per_doc * _REPO_SEARCH_SHORTLIST_PER_DOC_RESULT,
    )
    target = min(target, compatible_count)
    if target >= compatible_count:
        return compatible, "full", compatible_count

    candidate_set: set[int] = set()
    # Reserve part of the budget for pure vector recall.
    vector_budget = min(target, max(k * 32, target // 2))
    candidate_set.update(
        heapq.nlargest(
            vector_budget,
            compatible,
            key=lambda index: vector_scores[index],
        )
    )
    if query.terms or query.phrases:
        candidate_set.update(
            heapq.nlargest(
                target,
                compatible,
                key=lambda index: _repo_quick_recall_score(
                    query,
                    cache.records[index],
                ),
            )
        )
    # Top up with more vector matches when the two lists overlapped heavily
    # (or there was no lexical query at all).
    if len(candidate_set) < target:
        candidate_set.update(
            heapq.nlargest(
                target,
                compatible,
                key=lambda index: vector_scores[index],
            )
        )

    _expand_repo_candidate_neighbors(
        candidate_set,
        cache,
        limit=min(
            compatible_count,
            target + _REPO_SEARCH_GRAPH_EXPANSION_LIMIT,
        ),
    )
    # Graph expansion resolves neighbors against every cached record, and
    # entity edges cross document boundaries, so it can pull in sections of
    # documents that were excluded for an embedding-dimension mismatch. Drop
    # them so shortlist mode honours the same exclusion as full mode.
    candidate_set.intersection_update(compatible)
    return tuple(sorted(candidate_set)), "shortlist", compatible_count
586
+
587
+
588
+ def _expand_repo_candidate_neighbors(
589
+ candidate_set: set[int],
590
+ cache: _RepoSearchCache,
591
+ *,
592
+ limit: int,
593
+ ) -> None:
594
+ for index in tuple(candidate_set):
595
+ record = cache.records[index]
596
+ key = (record.doc_id, record.section_id)
597
+ for neighbor_key, _ in cache.graph_neighbors.get(key, ()):
598
+ neighbor_index = cache.record_index_by_key.get(neighbor_key)
599
+ if neighbor_index is None:
600
+ continue
601
+ candidate_set.add(neighbor_index)
602
+ if len(candidate_set) >= limit:
603
+ return
604
+
605
+
606
def _repo_quick_recall_score(
    query: _RepoLexicalQuery,
    record: _RepoSectionRecord,
) -> float:
    """Cheap lexical relevance estimate used only to build the shortlist.

    Each term scores by the best field tier it appears in (doc id, source or
    title, then synopsis, then body) plus a capped term-frequency boost.
    Phrase hits, the identity bonus and the overview bonus are added, and the
    sum is scaled by the root/meta, history and locale factors.
    """
    if not (query.terms or query.phrases):
        return 0.0

    doc_id, source, title, synopsis, body = record.haystacks
    identity_fields = (doc_id, source, title)
    total = 0.0
    for term in query.terms:
        term_weight = query.weights.get(term, 0.0)
        # Multi-word variants are handled as phrases, not here.
        tokens = tuple(
            candidate
            for candidate in query.variants.get(term, ())
            if " " not in candidate
        )
        if any(token in field for token in tokens for field in identity_fields):
            total += term_weight * 6.0
        elif any(token in synopsis for token in tokens):
            total += term_weight * 2.5
        elif any(token in body for token in tokens):
            total += term_weight

        best_frequency = max(
            (record.token_counts.get(token, 0) for token in tokens),
            default=0,
        )
        if best_frequency:
            total += term_weight * (1.0 + min(3.0, float(best_frequency)) * 0.25)

    if query.phrases:
        joined_fields = " ".join(record.haystacks)
        for phrase in query.phrases:
            if phrase in joined_fields:
                total += _repo_phrase_weight(phrase)

    total += _doc_identity_bonus(query, record.haystacks) * 8.0
    total += _overview_intent_bonus(query, record) * 4.0
    return (
        total
        * _root_meta_doc_factor(query, record)
        * _history_doc_factor(query, record)
        * _locale_doc_factor(query, record)
    )
647
+
648
+
649
def _section_token_counts(
    *,
    doc_id: str,
    source: str,
    title: str,
    synopsis: str,
    body: str,
) -> Counter[str]:
    """Count search tokens over a section's identifying text and body prefix.

    The title is included twice so its tokens count double, and only the
    first 4000 characters of the body are tokenized.
    """
    pieces = [doc_id, source, title, title, synopsis, body[:4000]]
    tokens = _tokenize_search_text(" ".join(pieces))
    return Counter(tokens)
668
+
669
+
670
+ def _tokenize_search_text(text: str) -> list[str]:
671
+ return re.findall(
672
+ r"[a-z0-9][a-z0-9]*",
673
+ text.lower().replace("/", " ").replace("-", " ").replace("_", " "),
674
+ )
675
+
676
+
677
+ def _bm25_sparse_score(
678
+ query: _RepoLexicalQuery,
679
+ record: _RepoSectionRecord,
680
+ cache: _RepoSearchCache,
681
+ ) -> float:
682
+ if not query.terms or record.token_count <= 0 or cache.avg_token_count <= 0:
683
+ return 0.0
684
+ corpus_size = max(1, len(cache.records))
685
+ k1 = 1.2
686
+ b = 0.75
687
+ raw = 0.0
688
+ max_raw = 0.0
689
+ length_norm = k1 * (
690
+ (1.0 - b) + b * (record.token_count / cache.avg_token_count)
691
+ )
692
+ for term in query.terms:
693
+ tf = max(
694
+ (
695
+ record.token_counts.get(variant, 0)
696
+ for variant in query.variants[term]
697
+ if " " not in variant
698
+ ),
699
+ default=0,
700
+ )
701
+ df = max(
702
+ (
703
+ cache.df.get(variant, 0)
704
+ for variant in query.variants[term]
705
+ if " " not in variant
706
+ ),
707
+ default=0,
708
+ )
709
+ if tf <= 0 or df <= 0:
710
+ continue
711
+ idf = math.log(1.0 + ((corpus_size - df + 0.5) / (df + 0.5)))
712
+ weighted_idf = idf * query.weights[term]
713
+ raw += weighted_idf * ((tf * (k1 + 1.0)) / (tf + length_norm))
714
+ max_raw += weighted_idf * (k1 + 1.0)
715
+ if max_raw <= 0:
716
+ return 0.0
717
+ return max(0.0, min(1.0, raw / max_raw))
718
+
719
+
720
+ def _add_graph_edge(
721
+ graph: dict[tuple[str, str], dict[tuple[str, str], float]],
722
+ left: tuple[str, str],
723
+ right: tuple[str, str],
724
+ *,
725
+ weight: float,
726
+ ) -> None:
727
+ if left == right:
728
+ return
729
+ graph[left][right] = max(graph[left].get(right, 0.0), weight)
730
+ graph[right][left] = max(graph[right].get(left, 0.0), weight)
731
+
732
+
733
def _score_repo_record(
    record: _RepoSectionRecord,
    *,
    query: _RepoLexicalQuery,
    cache: _RepoSearchCache,
    vector_score: float,
) -> _RepoScoredHit:
    """Run the full (graph-free) ranker on one record.

    The lexical score is the field-supported profile score scaled by term
    coverage plus the overview bonus, capped at 1. It is blended with the
    vector and sparse scores, the identity bonus is added, the sum is capped
    at 1 and finally multiplied by the rank factor. Graph evidence is applied
    later, so ``graph_score`` is ``0.0`` and ``score`` equals ``base_score``.
    """
    haystacks = record.haystacks
    focus_support = _focus_field_support(query, haystacks)
    coverage = _weighted_term_coverage(query, haystacks)

    supported_lexical = _field_supported_lexical_score(
        _lexical_score_from_profile(query, haystacks),
        focus_support=focus_support,
    )
    lexical_score = min(
        1.0,
        supported_lexical * _coverage_factor(coverage)
        + _overview_intent_bonus(query, record),
    )
    sparse_score = _bm25_sparse_score(query, record, cache)
    rank_factor = (
        _root_meta_doc_factor(query, record)
        * _history_doc_factor(query, record)
        * _locale_doc_factor(query, record)
    )
    identity_bonus = _doc_identity_bonus(query, haystacks)

    blended = _combine_repo_scores(
        vector_score,
        lexical_score,
        sparse_score=sparse_score,
        graph_score=0.0,
    )
    base_score = min(1.0, blended + identity_bonus) * rank_factor
    return _RepoScoredHit(
        record=record,
        score=base_score,
        vector_score=vector_score,
        lexical_score=lexical_score,
        sparse_score=sparse_score,
        graph_score=0.0,
        base_score=base_score,
        rank_factor=rank_factor,
        identity_bonus=identity_bonus,
    )
778
+
779
+
780
def _apply_graph_scores(
    hits: list[_RepoScoredHit],
    cache: _RepoSearchCache,
) -> None:
    """Blend neighborhood support into every hit, in place.

    A hit's graph score is the edge-weighted average of its neighbors'
    ``base_score``. Neighbors that were not scored contribute nothing to the
    numerator but still count in the denominator. ``score`` is then recomputed
    from the stored signals with the graph score included.
    """
    scored_lookup = {
        (hit.record.doc_id, hit.record.section_id): hit
        for hit in hits
    }
    for hit in hits:
        neighbors = cache.graph_neighbors.get(
            (hit.record.doc_id, hit.record.section_id),
            (),
        )
        support = 0.0
        total_weight = 0.0
        for neighbor_key, edge_weight in neighbors:
            total_weight += edge_weight
            scored_neighbor = scored_lookup.get(neighbor_key)
            if scored_neighbor is not None:
                support += scored_neighbor.base_score * edge_weight

        graph_score = support / total_weight if total_weight else 0.0
        hit.graph_score = graph_score
        blended = _combine_repo_scores(
            hit.vector_score,
            hit.lexical_score,
            sparse_score=hit.sparse_score,
            graph_score=graph_score,
            graph_present=bool(neighbors),
        )
        hit.score = min(1.0, blended + hit.identity_bonus) * hit.rank_factor
812
+
813
+
814
def _repo_scored_payload(
    hit: _RepoScoredHit,
    *,
    query: str,
    lexical_query: _RepoLexicalQuery,
    include_set: Collection[IncludeField],
) -> dict[str, Any]:
    """Serialise one scored hit into its response dictionary.

    The identifying fields, all scores, the anchor and the explanation are
    always present. ``synopsis`` (when non-empty), ``head`` (first 200
    characters of the body) and ``evidence`` are added only when requested
    through ``include_set``.
    """
    section = hit.record
    payload: dict[str, Any] = {
        "doc": section.doc_id,
        "source": section.source,
        "id": section.section_id,
        "title": section.title,
        "score": hit.score,
        "vector_score": hit.vector_score,
        "lexical_score": hit.lexical_score,
        "sparse_score": hit.sparse_score,
        "graph_score": hit.graph_score,
        "anchor": section.index.anchor(section.section_id),
        "explanation": _repo_hit_explanation(hit, lexical_query),
    }
    wants_synopsis = "synopsis" in include_set
    if wants_synopsis and section.synopsis:
        payload["synopsis"] = section.synopsis
    if "head" in include_set:
        payload["head"] = section.body[:200]
    if "evidence" in include_set:
        payload["evidence"] = _evidence_snippet(section.body, query)
    return payload
842
+
843
+
844
def _repo_hit_explanation(
    hit: _RepoScoredHit,
    query: _RepoLexicalQuery,
) -> dict[str, Any]:
    """Build the human-readable explanation attached to a hit.

    Reports the dominant signal (highest raw score; ties resolved in the
    order lexical, sparse, vector, graph), the matched query terms, each
    signal's rounded score and profile weight, the rounded rank factor and
    identity bonus, and a list of notes describing which evidence contributed.
    """
    profile = _DEFAULT_RANK_PROFILE
    signal_scores = {
        "lexical": hit.lexical_score,
        "sparse": hit.sparse_score,
        "vector": hit.vector_score,
        "graph": hit.graph_score,
    }
    tie_break = {"lexical": 4, "sparse": 3, "vector": 2, "graph": 1}
    dominant_signal, _ = max(
        signal_scores.items(),
        key=lambda item: (item[1], tie_break[item[0]]),
    )

    matched_terms = _repo_matched_terms(query, hit.record)
    note_rules = (
        (bool(matched_terms), "matched query terms in doc/source/title/summary/body fields"),
        (hit.sparse_score > 0, "BM25-style sparse evidence contributed"),
        (hit.graph_score > 0, "tree/xref/entity neighborhood support contributed"),
        (hit.identity_bonus > 0, "document or path identity matched the query"),
        (hit.rank_factor != 1.0, "rank factor adjusted broad root-document placement"),
    )
    notes: list[str] = [text for applies, text in note_rules if applies]

    weighted_signals = (
        ("vector", hit.vector_score, profile.vector_weight),
        ("lexical", hit.lexical_score, profile.lexical_weight),
        ("sparse", hit.sparse_score, profile.sparse_weight),
        ("graph", hit.graph_score, profile.graph_weight),
    )
    signals = {
        name: {"score": _round_score(score), "weight": weight}
        for name, score, weight in weighted_signals
    }
    return {
        "dominant_signal": dominant_signal,
        "matched_terms": matched_terms,
        "signals": signals,
        "rank_factor": _round_score(hit.rank_factor),
        "identity_bonus": _round_score(hit.identity_bonus),
        "notes": notes,
    }
898
+
899
+
900
+ def _repo_matched_terms(
901
+ query: _RepoLexicalQuery,
902
+ record: _RepoSectionRecord,
903
+ ) -> list[str]:
904
+ if not query.terms:
905
+ return []
906
+ haystack = " ".join(record.haystacks)
907
+ return [
908
+ term
909
+ for term in query.terms
910
+ if any(variant in haystack for variant in query.variants[term])
911
+ ]
912
+
913
+
914
+ def _round_score(value: float) -> float:
915
+ return round(float(value), 4)
916
+
917
+
918
def _combine_repo_scores(
    vector_score: float,
    lexical_score: float,
    *,
    sparse_score: float,
    graph_score: float,
    graph_present: bool = False,
) -> float:
    """Blend the individual signals into one score using the default profile.

    With no lexical and no sparse evidence the result is just the down-weighted
    vector score. Otherwise the vector, lexical and gated sparse scores are
    combined by their profile weights and capped at 1. Graph evidence is mixed
    in only when the section has neighbors or a positive graph score.
    """
    profile = _DEFAULT_RANK_PROFILE
    if lexical_score <= 0 and sparse_score <= 0:
        return vector_score * profile.no_lexical_vector_weight

    trusted_sparse = _trusted_sparse_score(
        lexical_score=lexical_score,
        sparse_score=sparse_score,
    )
    vector_part = vector_score * profile.vector_weight
    lexical_part = lexical_score * profile.lexical_weight
    sparse_part = trusted_sparse * profile.sparse_weight
    base = vector_part + lexical_part + sparse_part

    if not graph_present and graph_score <= 0:
        return min(1.0, base)

    graph_share = profile.graph_weight
    blended = (base * (1.0 - graph_share)) + (graph_score * graph_share)
    return min(1.0, blended)
944
+
945
+
946
def _trusted_sparse_score(*, lexical_score: float, sparse_score: float) -> float:
    """Scale sparse evidence by how much the lexical score corroborates it.

    No sparse evidence gives ``0.0``. Sparse evidence without any lexical
    support keeps only 15% of its value. Otherwise it is multiplied by a gate
    that grows with the lexical score, bounded below by the profile's floor
    and above by 1.
    """
    if sparse_score <= 0:
        return 0.0
    if lexical_score <= 0:
        return sparse_score * 0.15

    profile = _DEFAULT_RANK_PROFILE
    lexical_gate = lexical_score * profile.sparse_lexical_gate_multiplier
    floored_gate = max(profile.sparse_floor_gate, lexical_gate)
    return sparse_score * min(1.0, floored_gate)
960
+
961
+
962
def _build_repo_lexical_query(
    query: str,
    *,
    cache: _RepoSearchCache | None = None,
    preferred_locales: tuple[str, ...] = (),
) -> _RepoLexicalQuery:
    """Analyse ``query`` once into terms, variants, weights and phrases.

    A term's weight is its intrinsic weight multiplied by a corpus-rarity
    weight derived from ``cache`` (neutral when no cache is given).
    ``max_score`` is the score a record would reach if every term matched in
    every field. Preferred locales fall back to the locale inferred from the
    query text.
    """
    terms = tuple(_repo_query_terms(query))
    variants = {term: _term_variants(term) for term in terms}
    weights = {
        term: _repo_term_weight(term) * _repo_corpus_term_weight(variants[term], cache)
        for term in terms
    }
    # Every term shares the same field-weight total, so compute it once.
    field_weight_total = sum(_DEFAULT_RANK_PROFILE.field_weights)
    max_score = sum(weights[term] * field_weight_total for term in terms)
    phrases = tuple(_command_phrases(query))
    locales = _normalized_preferred_locales(
        preferred_locales,
        fallback=_infer_query_locale(query),
    )
    return _RepoLexicalQuery(
        terms=terms,
        variants=variants,
        weights=weights,
        phrases=phrases,
        max_score=max_score,
        preferred_locales=locales,
    )
988
+
989
+
990
+ def _repo_corpus_term_weight(
991
+ variants: tuple[str, ...],
992
+ cache: _RepoSearchCache | None,
993
+ ) -> float:
994
+ if cache is None or not cache.records:
995
+ return 1.0
996
+ token_variants = {variant for variant in variants if " " not in variant}
997
+ if not token_variants:
998
+ return 1.0
999
+ coverage = sum(
1000
+ 1
1001
+ for record in cache.records
1002
+ if any(variant in record.token_counts for variant in token_variants)
1003
+ )
1004
+ if coverage <= 0:
1005
+ return 1.0
1006
+ corpus_size = len(cache.records)
1007
+ idf = math.log(1.0 + ((corpus_size - coverage + 0.5) / (coverage + 0.5)))
1008
+ max_idf = math.log(1.0 + ((corpus_size + 0.5) / 0.5))
1009
+ if max_idf <= 0:
1010
+ return 1.0
1011
+ return 0.35 + (0.65 * max(0.0, min(1.0, idf / max_idf)))
1012
+
1013
+
1014
def _lexical_score_from_profile(
    query: _RepoLexicalQuery,
    haystacks: tuple[str, str, str, str, str],
) -> float:
    """Score how well the haystacks match the query, capped at 1.0.

    Each term adds ``term weight * field weight`` for every field in which
    one of its variants appears. Runs of two to four consecutive query
    terms found in the normalized, joined haystacks add their length, and
    each matching entry of ``query.phrases`` adds its phrase weight. The
    sum is divided by ``query.max_score``. Returns 0.0 when the query has
    no terms or a non-positive ``max_score``.
    """
    terms = query.terms
    if not terms or query.max_score <= 0:
        return 0.0

    field_weights = _DEFAULT_RANK_PROFILE.field_weights
    total = 0.0
    for term in terms:
        term_weight = query.weights[term]
        term_variants = query.variants[term]
        for field_text, field_weight in zip(haystacks, field_weights, strict=True):
            if any(variant in field_text for variant in term_variants):
                total += term_weight * field_weight

    # Phrase matching runs against one normalized string covering all fields.
    normalized = _normalize_search_text(" ".join(haystacks))
    term_count = len(terms)
    for width in range(min(4, term_count), 1, -1):
        for offset in range(term_count - width + 1):
            if " ".join(terms[offset : offset + width]) in normalized:
                total += float(width)

    for phrase in query.phrases:
        if phrase in normalized:
            total += _repo_phrase_weight(phrase)

    return min(1.0, total / query.max_score)
1038
+
1039
+
1040
def _repo_query_terms(query: str) -> list[str]:
    """Extract the search terms for a repo query.

    Starts from ``_query_terms(query)`` with generic question words
    removed, then appends any word of the lower-cased query that is at
    least two characters long, not generic, not already present, and
    looks like a compact identifier.
    """
    generic = frozenset(
        (
            "about",
            "do",
            "does",
            "from",
            "how",
            "in",
            "into",
            "it",
            "on",
            "using",
            "what",
            "when",
            "where",
            "which",
            "work",
            "works",
            "with",
        )
    )
    terms = [term for term in _query_terms(query) if term not in generic]
    seen = set(terms)
    for word in re.findall(r"[A-Za-z0-9_][A-Za-z0-9_-]*", query.lower()):
        if len(word) < 2 or word in generic or word in seen:
            continue
        if not _looks_like_compact_identifier(word):
            continue
        seen.add(word)
        terms.append(word)
    return terms
1072
+
1073
+
1074
def _normalize_field_text(text: str) -> str:
    """Normalize one field's text for matching.

    Lower-cases ``text``, turns ``/``, ``-`` and ``_`` into spaces, and
    passes the result through ``_normalize_search_text``.
    """
    spaced = text.lower()
    for separator in ("/", "-", "_"):
        spaced = spaced.replace(separator, " ")
    return _normalize_search_text(spaced)
1077
+
1078
+
1079
+ def _normalize_search_text(text: str) -> str:
1080
+ normalized = text.lower().replace("/", " ").replace("-", " ").replace("_", " ")
1081
+ return " ".join(re.findall(r"[a-z0-9][a-z0-9]*", normalized))
1082
+
1083
+
1084
+ def _looks_like_compact_identifier(token: str) -> bool:
1085
+ return len(token) <= 4 or any(char.isdigit() for char in token) or "_" in token
1086
+
1087
+
1088
+ def _repo_term_weight(term: str) -> float:
1089
+ """Down-weight broad verbs that otherwise dominate docs-heavy repos."""
1090
+ if term in {
1091
+ "run",
1092
+ "runs",
1093
+ "test",
1094
+ "tests",
1095
+ "testing",
1096
+ "use",
1097
+ "using",
1098
+ "write",
1099
+ "writes",
1100
+ "written",
1101
+ }:
1102
+ return 0.35
1103
+ return 1.0
1104
+
1105
+
1106
def _overview_intent_bonus(
    query: _RepoLexicalQuery,
    record: _RepoSectionRecord,
) -> float:
    """Reward sections that look like the overview of the first query term.

    Only single-token variants of ``query.terms[0]`` are considered. A bonus
    applies when the tokenized ``doc_id`` is exactly one variant (optionally
    preceded by ``"docs"``) or when the normalized title equals a variant.
    If either applies and ``section_id`` has at most one ``/``, the shallow
    bonus is added. The result is capped at ``overview_max_bonus``.
    Returns 0.0 when the query has no terms or no usable variant.
    """
    if not query.terms:
        return 0.0
    lead_term = query.terms[0]

    doc_tokens = tuple(_tokenize_search_text(record.doc_id))
    normalized_title = _normalize_search_text(record.title)
    profile = _DEFAULT_RANK_PROFILE

    single_tokens = {
        variant for variant in query.variants.get(lead_term, ()) if " " not in variant
    }
    if not single_tokens:
        return 0.0

    bonus = 0.0
    doc_is_overview = any(
        doc_tokens == (token,) or doc_tokens == ("docs", token)
        for token in single_tokens
    )
    if doc_is_overview:
        bonus = max(bonus, profile.overview_doc_bonus)
    if normalized_title in single_tokens:
        bonus = max(bonus, profile.overview_title_bonus)

    # The shallow bonus only stacks on top of a doc or title match.
    if bonus > 0 and record.section_id.count("/") <= 1:
        bonus += profile.overview_shallow_bonus
    return min(profile.overview_max_bonus, bonus)
1133
+
1134
+
1135
+ def _focus_field_support(
1136
+ query: _RepoLexicalQuery,
1137
+ haystacks: tuple[str, str, str, str, str],
1138
+ ) -> float:
1139
+ if not query.terms:
1140
+ return 1.0
1141
+ doc_id, source, title, synopsis, body = haystacks
1142
+ profile = _DEFAULT_RANK_PROFILE
1143
+ focus_terms = tuple(
1144
+ term for term in query.terms if query.weights.get(term, 0.0) > 0
1145
+ )[:2]
1146
+ if not focus_terms:
1147
+ return 1.0
1148
+ total = sum(query.weights[term] for term in focus_terms)
1149
+ if total <= 0:
1150
+ return 1.0
1151
+ support = 0.0
1152
+ for term in focus_terms:
1153
+ variants = {
1154
+ variant
1155
+ for variant in query.variants.get(term, ())
1156
+ if " " not in variant
1157
+ }
1158
+ if not variants:
1159
+ continue
1160
+ if any(
1161
+ variant in doc_id or variant in source or variant in title
1162
+ for variant in variants
1163
+ ):
1164
+ support += query.weights[term]
1165
+ elif any(variant in synopsis for variant in variants):
1166
+ support += query.weights[term] * profile.focus_synopsis_support
1167
+ elif any(variant in body for variant in variants):
1168
+ support += query.weights[term] * profile.focus_body_support
1169
+ return max(0.0, min(1.0, support / total))
1170
+
1171
+
1172
+ def _field_supported_lexical_score(score: float, *, focus_support: float) -> float:
1173
+ if score <= 0 or focus_support >= 1:
1174
+ return score
1175
+ profile = _DEFAULT_RANK_PROFILE
1176
+ multiplier = profile.focus_support_floor + (
1177
+ profile.focus_support_weight * max(0.0, focus_support)
1178
+ )
1179
+ return score * multiplier
1180
+
1181
+
1182
+ def _weighted_term_coverage(
1183
+ query: _RepoLexicalQuery,
1184
+ haystacks: tuple[str, str, str, str, str],
1185
+ ) -> float:
1186
+ if not query.terms:
1187
+ return 1.0
1188
+ combined = " ".join(haystacks)
1189
+ total = sum(query.weights[term] for term in query.terms)
1190
+ if total <= 0:
1191
+ return 1.0
1192
+ matched = 0.0
1193
+ for term in query.terms:
1194
+ variants = query.variants.get(term, ())
1195
+ if any(variant in combined for variant in variants):
1196
+ matched += query.weights[term]
1197
+ return max(0.0, min(1.0, matched / total))
1198
+
1199
+
1200
def _coverage_factor(coverage: float) -> float:
    """Map a coverage ratio onto a ranking multiplier.

    ``coverage`` is clamped to ``[0, 1]`` and the result is
    ``coverage_floor + coverage_weight * coverage``.
    """
    profile = _DEFAULT_RANK_PROFILE
    clamped = max(0.0, min(1.0, coverage))
    return profile.coverage_floor + (profile.coverage_weight * clamped)
1205
+
1206
+
1207
+ def _doc_identity_bonus(
1208
+ query: _RepoLexicalQuery,
1209
+ haystacks: tuple[str, str, str, str, str],
1210
+ ) -> float:
1211
+ if not query.terms:
1212
+ return 0.0
1213
+ doc_id, source, _, _, _ = haystacks
1214
+ focus_terms = tuple(
1215
+ term for term in query.terms if query.weights.get(term, 0.0) > 0
1216
+ )[:3]
1217
+ total = sum(query.weights[term] for term in focus_terms)
1218
+ if total <= 0:
1219
+ return 0.0
1220
+ matched = 0.0
1221
+ for term in focus_terms:
1222
+ variants = {
1223
+ variant
1224
+ for variant in query.variants.get(term, ())
1225
+ if " " not in variant
1226
+ }
1227
+ if any(variant in doc_id or variant in source for variant in variants):
1228
+ matched += query.weights[term]
1229
+ support = matched / total
1230
+ return _DEFAULT_RANK_PROFILE.doc_identity_bonus_weight * max(
1231
+ 0.0,
1232
+ min(1.0, support),
1233
+ )
1234
+
1235
+
1236
+ def _root_meta_doc_factor(
1237
+ query: _RepoLexicalQuery,
1238
+ record: _RepoSectionRecord,
1239
+ ) -> float:
1240
+ source_path = Path(record.source)
1241
+ if source_path.parent != Path(".") or source_path.stem in {"README", "CHANGELOG"}:
1242
+ return 1.0
1243
+ if source_path.stem != source_path.stem.upper():
1244
+ return 1.0
1245
+ if _first_term_has_structural_support(query, record.haystacks):
1246
+ return 1.0
1247
+ return _DEFAULT_RANK_PROFILE.root_meta_doc_factor
1248
+
1249
+
1250
def _history_doc_factor(query: _RepoLexicalQuery, record: _RepoSectionRecord) -> float:
    """Return a ranking factor for history documents.

    The profile's ``history_doc_generic_factor`` applies when the record is
    a history document and the query does not ask for history. Every other
    case yields 1.0.
    """
    if _query_wants_history(query) or not _is_history_doc(record):
        return 1.0
    return _DEFAULT_RANK_PROFILE.history_doc_generic_factor
1256
+
1257
+
1258
def _is_history_doc(record: _RepoSectionRecord) -> bool:
    """Tell whether the doc id, source or title contains a history term."""
    identity = f"{record.doc_id} {record.source} {record.title}"
    tokens = set(_tokenize_search_text(identity))
    shared = tokens & _HISTORY_DOC_TERMS
    return len(shared) > 0
1261
+
1262
+
1263
def _query_wants_history(query: _RepoLexicalQuery) -> bool:
    """Tell whether the query explicitly asks about history.

    True when a query term, or any single-token variant of a query term,
    is one of ``_HISTORY_QUERY_TERMS``.
    """
    if set(query.terms) & _HISTORY_QUERY_TERMS:
        return True
    for term in query.terms:
        for variant in query.variants.get(term, ()):
            if " " not in variant and variant in _HISTORY_QUERY_TERMS:
                return True
    return False
1273
+
1274
+
1275
+ def _locale_doc_factor(query: _RepoLexicalQuery, record: _RepoSectionRecord) -> float:
1276
+ if not query.preferred_locales:
1277
+ return 1.0
1278
+ doc_locale = _source_locale(record.source)
1279
+ if doc_locale is None:
1280
+ return 1.0
1281
+ profile = _DEFAULT_RANK_PROFILE
1282
+ if doc_locale in query.preferred_locales:
1283
+ return profile.locale_match_factor
1284
+ return profile.locale_mismatch_factor
1285
+
1286
+
1287
+ def _source_locale(source: str) -> str | None:
1288
+ for part in Path(source).parts:
1289
+ normalized = part.lower().replace("_", "-")
1290
+ if normalized in _KNOWN_LOCALES:
1291
+ return normalized
1292
+ match = re.fullmatch(r"([a-z]{2})(?:-[a-z0-9]{2,8})+", normalized)
1293
+ if match and match.group(1) in _KNOWN_LOCALES:
1294
+ return match.group(1)
1295
+ return None
1296
+
1297
+
1298
+ def _infer_query_locale(query: str) -> str | None:
1299
+ if re.search(r"[\u3040-\u30ff\u3400-\u9fff\uac00-\ud7af]", query):
1300
+ return None
1301
+ if re.search(r"[A-Za-z]", query):
1302
+ return "en"
1303
+ return None
1304
+
1305
+
1306
+ def _normalized_preferred_locales(
1307
+ locales: tuple[str, ...],
1308
+ *,
1309
+ fallback: str | None,
1310
+ ) -> tuple[str, ...]:
1311
+ normalized = tuple(
1312
+ locale.lower().replace("_", "-").split("-", 1)[0]
1313
+ for locale in locales
1314
+ if locale.strip()
1315
+ )
1316
+ if normalized:
1317
+ return tuple(
1318
+ locale for locale in normalized if locale in _KNOWN_LOCALES
1319
+ )
1320
+ if fallback is None:
1321
+ return ()
1322
+ return (fallback,)
1323
+
1324
+
1325
+ def _first_term_has_structural_support(
1326
+ query: _RepoLexicalQuery,
1327
+ haystacks: tuple[str, str, str, str, str],
1328
+ ) -> bool:
1329
+ if not query.terms:
1330
+ return True
1331
+ variants = {
1332
+ variant
1333
+ for variant in query.variants.get(query.terms[0], ())
1334
+ if " " not in variant
1335
+ }
1336
+ if not variants:
1337
+ return True
1338
+ doc_id, source, title, _, _ = haystacks
1339
+ return any(
1340
+ variant in doc_id or variant in source or variant in title
1341
+ for variant in variants
1342
+ )
1343
+
1344
+
1345
+ def _term_variants(term: str) -> tuple[str, ...]:
1346
+ variants = {term}
1347
+ if term.endswith("ies") and len(term) > 4:
1348
+ variants.add(f"{term[:-3]}y")
1349
+ if term.endswith("s") and len(term) > 3:
1350
+ variants.add(term[:-1])
1351
+ if not term.endswith("s") and len(term) > 2:
1352
+ variants.add(f"{term}s")
1353
+ if term.endswith("y") and len(term) > 3:
1354
+ variants.add(f"{term[:-1]}ies")
1355
+
1356
+ if term.startswith(("eval", "evaluat")):
1357
+ variants.update(
1358
+ {
1359
+ "eval",
1360
+ "evals",
1361
+ "evaluate",
1362
+ "evaluates",
1363
+ "evaluated",
1364
+ "evaluating",
1365
+ "evaluation",
1366
+ "evaluations",
1367
+ "evaluator",
1368
+ "evaluators",
1369
+ }
1370
+ )
1371
+ if term.startswith(("depend", "deps")):
1372
+ variants.update(
1373
+ {
1374
+ "dep",
1375
+ "deps",
1376
+ "depend",
1377
+ "depends",
1378
+ "dependency",
1379
+ "dependencies",
1380
+ "dependent",
1381
+ "dependents",
1382
+ }
1383
+ )
1384
+ if term.startswith("inject"):
1385
+ variants.update(
1386
+ {
1387
+ "inject",
1388
+ "injects",
1389
+ "injected",
1390
+ "injecting",
1391
+ "injection",
1392
+ "injections",
1393
+ }
1394
+ )
1395
+ if term.startswith("config"):
1396
+ variants.update(
1397
+ {"config", "configs", "configure", "configured", "configuration"}
1398
+ )
1399
+ if term.startswith("auth"):
1400
+ variants.update({"auth", "authenticate", "authentication", "authorization"})
1401
+ if term.startswith("install"):
1402
+ variants.update(
1403
+ {
1404
+ "install",
1405
+ "installs",
1406
+ "installed",
1407
+ "installer",
1408
+ "installers",
1409
+ "installing",
1410
+ "installation",
1411
+ }
1412
+ )
1413
+ if term.startswith("login"):
1414
+ variants.update({"login", "logins", "logged", "logging"})
1415
+ if term.startswith("publish"):
1416
+ variants.update({"publish", "published", "publishes", "publishing"})
1417
+ if term.startswith("store"):
1418
+ variants.update({"store", "stored", "stores", "storage"})
1419
+ if term.startswith("stream"):
1420
+ variants.update({"stream", "streams", "streamed", "streaming"})
1421
+
1422
+ return tuple(sorted(variants, key=lambda item: (len(item), item), reverse=True))
1423
+
1424
+
1425
def _repo_phrase_weight(phrase: str) -> float:
    """Return the score added when ``phrase`` matches.

    Starts at 2.0, adds one per whitespace-separated token up to four, adds
    2.0 more when any token looks like a compact identifier, and caps the
    total at 8.0.
    """
    tokens = phrase.split()
    weight = 2.0 + min(4.0, float(len(tokens)))
    has_identifier = any(_looks_like_compact_identifier(token) for token in tokens)
    if has_identifier:
        weight += 2.0
    return min(8.0, weight)
1431
+
1432
+
1433
+ def _command_phrases(query: str) -> list[str]:
1434
+ tokens = re.findall(r"[a-z0-9_][a-z0-9_-]*", query.lower())
1435
+ if len(tokens) < 2:
1436
+ return []
1437
+ generic = {
1438
+ "a",
1439
+ "an",
1440
+ "and",
1441
+ "are",
1442
+ "for",
1443
+ "how",
1444
+ "in",
1445
+ "is",
1446
+ "of",
1447
+ "or",
1448
+ "the",
1449
+ "to",
1450
+ "what",
1451
+ "where",
1452
+ "with",
1453
+ }
1454
+ phrases: list[str] = []
1455
+ seen: set[str] = set()
1456
+ for size in range(min(4, len(tokens)), 1, -1):
1457
+ for start in range(0, len(tokens) - size + 1):
1458
+ window = tokens[start : start + size]
1459
+ if any(token in generic for token in window):
1460
+ continue
1461
+ if not any(_looks_like_compact_identifier(token) for token in window):
1462
+ continue
1463
+ phrase = " ".join(window)
1464
+ if phrase not in seen:
1465
+ seen.add(phrase)
1466
+ phrases.append(phrase)
1467
+ return phrases
1468
+
1469
+
1470
+ def _diversify_repo_hits(
1471
+ hits: list[_RepoScoredHit],
1472
+ *,
1473
+ limit: int,
1474
+ sections_per_doc: int,
1475
+ ) -> list[_RepoScoredHit]:
1476
+ selected: list[_RepoScoredHit] = []
1477
+ counts: dict[str, int] = {}
1478
+
1479
+ def add(hit: _RepoScoredHit) -> None:
1480
+ doc_id = hit.record.doc_id
1481
+ selected.append(hit)
1482
+ counts[doc_id] = counts.get(doc_id, 0) + 1
1483
+
1484
+ for hit in hits:
1485
+ doc_id = hit.record.doc_id
1486
+ if counts.get(doc_id, 0) > 0:
1487
+ continue
1488
+ add(hit)
1489
+ if len(selected) >= limit:
1490
+ return selected
1491
+
1492
+ if sections_per_doc <= 1:
1493
+ return selected
1494
+
1495
+ seen = {(hit.record.doc_id, hit.record.section_id) for hit in selected}
1496
+ for hit in hits:
1497
+ key = (hit.record.doc_id, hit.record.section_id)
1498
+ doc_id = key[0]
1499
+ if key in seen or counts.get(doc_id, 0) >= sections_per_doc:
1500
+ continue
1501
+ add(hit)
1502
+ seen.add(key)
1503
+ if len(selected) >= limit:
1504
+ return selected
1505
+ return selected