codespine 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-0.3.0 → codespine-0.4.0}/PKG-INFO +1 -1
- {codespine-0.3.0 → codespine-0.4.0}/codespine/__init__.py +1 -1
- {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/deadcode.py +48 -2
- {codespine-0.3.0 → codespine-0.4.0}/codespine/config.py +1 -1
- {codespine-0.3.0 → codespine-0.4.0}/codespine/db/schema.py +5 -2
- {codespine-0.3.0 → codespine-0.4.0}/codespine/db/store.py +21 -11
- {codespine-0.3.0 → codespine-0.4.0}/codespine/mcp/server.py +67 -13
- {codespine-0.3.0 → codespine-0.4.0}/codespine/search/bm25.py +17 -1
- {codespine-0.3.0 → codespine-0.4.0}/codespine/search/hybrid.py +31 -9
- codespine-0.4.0/codespine/search/vector.py +139 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/PKG-INFO +1 -1
- {codespine-0.3.0 → codespine-0.4.0}/pyproject.toml +1 -1
- codespine-0.3.0/codespine/search/vector.py +0 -122
- {codespine-0.3.0 → codespine-0.4.0}/LICENSE +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/README.md +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/__init__.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/community.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/context.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/coupling.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/flow.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/impact.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/cli.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/db/__init__.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/diff/__init__.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/diff/branch_diff.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/__init__.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/call_resolver.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/engine.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/java_parser.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/mcp/__init__.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/noise/__init__.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/noise/blocklist.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/search/__init__.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/search/fuzzy.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/search/rrf.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/watch/__init__.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine/watch/watcher.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/SOURCES.txt +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/requires.txt +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/top_level.txt +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/gindex.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/setup.cfg +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/tests/test_call_resolver.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/tests/test_index_and_hybrid.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/tests/test_java_parser.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/tests/test_multimodule_index.py +0 -0
- {codespine-0.3.0 → codespine-0.4.0}/tests/test_search_ranking.py +0 -0
|
@@ -1,17 +1,63 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
EXEMPT_ANNOTATIONS = {
|
|
4
|
+
# Java standard
|
|
4
5
|
"Override",
|
|
6
|
+
# JUnit / testing
|
|
5
7
|
"Test",
|
|
6
8
|
"ParameterizedTest",
|
|
9
|
+
"BeforeEach",
|
|
10
|
+
"AfterEach",
|
|
11
|
+
"BeforeAll",
|
|
12
|
+
"AfterAll",
|
|
13
|
+
# Spring – component model (class-level; methods inside are never "dead")
|
|
14
|
+
"Component",
|
|
15
|
+
"Service",
|
|
16
|
+
"Repository",
|
|
17
|
+
"Controller",
|
|
18
|
+
"RestController",
|
|
19
|
+
"Configuration",
|
|
7
20
|
"Bean",
|
|
21
|
+
"Aspect",
|
|
22
|
+
# Spring – lifecycle / event hooks
|
|
8
23
|
"PostConstruct",
|
|
9
24
|
"PreDestroy",
|
|
25
|
+
"EventListener",
|
|
26
|
+
"TransactionalEventListener",
|
|
10
27
|
"Scheduled",
|
|
28
|
+
# Spring – web entry points
|
|
29
|
+
"RequestMapping",
|
|
30
|
+
"GetMapping",
|
|
31
|
+
"PostMapping",
|
|
32
|
+
"PutMapping",
|
|
33
|
+
"DeleteMapping",
|
|
34
|
+
"PatchMapping",
|
|
35
|
+
"MessageMapping",
|
|
36
|
+
# Spring – messaging / async
|
|
11
37
|
"KafkaListener",
|
|
12
|
-
"
|
|
13
|
-
"
|
|
38
|
+
"RabbitListener",
|
|
39
|
+
"JmsListener",
|
|
40
|
+
"SqsListener",
|
|
41
|
+
"StreamListener",
|
|
42
|
+
# Spring Data / persistence
|
|
43
|
+
"Query",
|
|
44
|
+
"Modifying",
|
|
45
|
+
# Guice DI
|
|
14
46
|
"Inject",
|
|
47
|
+
"Provides",
|
|
48
|
+
"Singleton",
|
|
49
|
+
"Named",
|
|
50
|
+
"Qualifier",
|
|
51
|
+
# Jakarta / javax DI (same semantics as Guice/Spring variants)
|
|
52
|
+
"ApplicationScoped",
|
|
53
|
+
"RequestScoped",
|
|
54
|
+
"SessionScoped",
|
|
55
|
+
"Dependent",
|
|
56
|
+
# Jackson / serialization (called reflectively)
|
|
57
|
+
"JsonCreator",
|
|
58
|
+
"JsonProperty",
|
|
59
|
+
"JsonDeserialize",
|
|
60
|
+
"JsonSerialize",
|
|
15
61
|
}
|
|
16
62
|
|
|
17
63
|
EXEMPT_CONTRACT_METHODS = {
|
|
@@ -7,7 +7,7 @@ class Settings:
|
|
|
7
7
|
db_path: str = os.path.expanduser("~/.codespine_db")
|
|
8
8
|
pid_file: str = os.path.expanduser("~/.codespine.pid")
|
|
9
9
|
log_file: str = os.path.expanduser("~/.codespine.log")
|
|
10
|
-
|
|
10
|
+
embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")
|
|
11
11
|
index_meta_dir: str = os.path.expanduser("~/.codespine_index_meta")
|
|
12
12
|
embedding_model: str = "BAAI/bge-small-en-v1.5"
|
|
13
13
|
vector_dim: int = 384
|
|
@@ -10,7 +10,7 @@ NODE_TABLES: list[tuple[str, str]] = [
|
|
|
10
10
|
("SchemaMeta", "CREATE NODE TABLE SchemaMeta(key STRING, value STRING, PRIMARY KEY (key))"),
|
|
11
11
|
(
|
|
12
12
|
"Project",
|
|
13
|
-
"CREATE NODE TABLE Project(id STRING, path STRING, language STRING, PRIMARY KEY (id))",
|
|
13
|
+
"CREATE NODE TABLE Project(id STRING, path STRING, language STRING, indexed_at STRING, PRIMARY KEY (id))",
|
|
14
14
|
),
|
|
15
15
|
(
|
|
16
16
|
"File",
|
|
@@ -76,7 +76,10 @@ def ensure_schema(conn) -> None:
|
|
|
76
76
|
_safe_execute(conn, "CALL CREATE_FTS_INDEX('method_fts', 'Method', ['name', 'signature'])")
|
|
77
77
|
_safe_execute(conn, "CALL CREATE_FTS_INDEX('class_fts', 'Class', ['name', 'fqcn'])")
|
|
78
78
|
|
|
79
|
+
# Best-effort migration: add indexed_at column to existing Project tables.
|
|
80
|
+
_safe_execute(conn, "ALTER TABLE Project ADD indexed_at STRING DEFAULT ''")
|
|
81
|
+
|
|
79
82
|
_safe_execute(
|
|
80
83
|
conn,
|
|
81
|
-
"MERGE (s:SchemaMeta {key: 'schema_version'}) SET s.value = '
|
|
84
|
+
"MERGE (s:SchemaMeta {key: 'schema_version'}) SET s.value = '3'",
|
|
82
85
|
)
|
|
@@ -5,6 +5,7 @@ import json
|
|
|
5
5
|
import logging
|
|
6
6
|
import os
|
|
7
7
|
import threading
|
|
8
|
+
import time
|
|
8
9
|
from contextlib import contextmanager
|
|
9
10
|
from dataclasses import dataclass
|
|
10
11
|
from typing import Any
|
|
@@ -69,7 +70,12 @@ class GraphStore:
|
|
|
69
70
|
self.execute("COMMIT")
|
|
70
71
|
except Exception:
|
|
71
72
|
if tx_started:
|
|
72
|
-
|
|
73
|
+
try:
|
|
74
|
+
self.execute("ROLLBACK")
|
|
75
|
+
except Exception:
|
|
76
|
+
# Kuzu may have already rolled back (e.g. on OOM), making a
|
|
77
|
+
# second ROLLBACK raise "No active transaction". Swallow it.
|
|
78
|
+
pass
|
|
73
79
|
raise
|
|
74
80
|
|
|
75
81
|
def clear_project(self, project_id: str) -> None:
|
|
@@ -108,8 +114,8 @@ class GraphStore:
|
|
|
108
114
|
|
|
109
115
|
def upsert_project(self, project_id: str, path: str) -> None:
|
|
110
116
|
self.execute(
|
|
111
|
-
"MERGE (p:Project {id: $id}) SET p.path = $path, p.language = 'java'",
|
|
112
|
-
{"id": project_id, "path": path},
|
|
117
|
+
"MERGE (p:Project {id: $id}) SET p.path = $path, p.language = 'java', p.indexed_at = $ts",
|
|
118
|
+
{"id": project_id, "path": path, "ts": str(int(time.time()))},
|
|
113
119
|
)
|
|
114
120
|
|
|
115
121
|
def project_file_hashes(self, project_id: str) -> dict[str, dict[str, str]]:
|
|
@@ -297,14 +303,18 @@ class GraphStore:
|
|
|
297
303
|
"MERGE (c:Community {id: $id}) SET c.label = $label, c.cohesion = $cohesion",
|
|
298
304
|
{"id": community_id, "label": label, "cohesion": cohesion},
|
|
299
305
|
)
|
|
300
|
-
#
|
|
301
|
-
#
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
306
|
+
# Commit in batches of 50 to keep Kuzu's buffer pool from OOMing on large
|
|
307
|
+
# communities. A single transaction over thousands of MERGE statements exhausts
|
|
308
|
+
# the 256 MB buffer pool before it can page out.
|
|
309
|
+
_BATCH = 50
|
|
310
|
+
for i in range(0, len(symbol_ids), _BATCH):
|
|
311
|
+
batch = symbol_ids[i : i + _BATCH]
|
|
312
|
+
with self.transaction():
|
|
313
|
+
for sid in batch:
|
|
314
|
+
self.execute(
|
|
315
|
+
"MATCH (s:Symbol {id: $sid}), (c:Community {id: $cid}) MERGE (s)-[:IN_COMMUNITY]->(c)",
|
|
316
|
+
{"sid": sid, "cid": community_id},
|
|
317
|
+
)
|
|
308
318
|
|
|
309
319
|
def set_flow(self, flow_id: str, entry_symbol_id: str, kind: str, symbols_at_depth: list[tuple[str, int]]) -> None:
|
|
310
320
|
self.execute(
|
|
@@ -71,7 +71,9 @@ def build_mcp_server(store, repo_path_provider):
|
|
|
71
71
|
Call this before other tools so you know what's ready without trial-and-error.
|
|
72
72
|
Features marked false may need 'codespine analyse --deep' or optional dependencies.
|
|
73
73
|
"""
|
|
74
|
-
projects = store.query_records(
|
|
74
|
+
projects = store.query_records(
|
|
75
|
+
"MATCH (p:Project) RETURN p.id as id, p.path as path, p.indexed_at as indexed_at"
|
|
76
|
+
)
|
|
75
77
|
sym_q = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
|
|
76
78
|
comm_q = store.query_records("MATCH (c:Community) RETURN count(c) as count")
|
|
77
79
|
flow_q = store.query_records("MATCH (f:Flow) RETURN count(f) as count")
|
|
@@ -97,7 +99,20 @@ def build_mcp_server(store, repo_path_provider):
|
|
|
97
99
|
watch_running = _watch["proc"] is not None and _watch["proc"].poll() is None
|
|
98
100
|
analyse_running = _analyse["proc"] is not None and _analyse["proc"].poll() is None
|
|
99
101
|
|
|
102
|
+
now = int(time.time())
|
|
103
|
+
stale_projects = []
|
|
104
|
+
for p in projects:
|
|
105
|
+
ts = int(p.get("indexed_at") or 0)
|
|
106
|
+
if ts and (now - ts) > 3600 and not watch_running:
|
|
107
|
+
age_h = (now - ts) // 3600
|
|
108
|
+
stale_projects.append(f"{p['id']} ({age_h}h old)")
|
|
109
|
+
|
|
100
110
|
notes: dict[str, str] = {}
|
|
111
|
+
if stale_projects:
|
|
112
|
+
notes["stale_index"] = (
|
|
113
|
+
f"Index is stale for: {', '.join(stale_projects)}. "
|
|
114
|
+
"Run analyse_project() or start_watch() to refresh."
|
|
115
|
+
)
|
|
101
116
|
if not n_comm:
|
|
102
117
|
notes["community_detection"] = "Run 'codespine analyse --deep' to enable"
|
|
103
118
|
if not n_flows:
|
|
@@ -156,9 +171,12 @@ def build_mcp_server(store, repo_path_provider):
|
|
|
156
171
|
@mcp.tool()
|
|
157
172
|
def list_projects():
|
|
158
173
|
"""List all indexed projects with their symbol and file counts."""
|
|
159
|
-
projects = store.query_records(
|
|
174
|
+
projects = store.query_records(
|
|
175
|
+
"MATCH (p:Project) RETURN p.id as id, p.path as path, p.indexed_at as indexed_at"
|
|
176
|
+
)
|
|
160
177
|
if not projects:
|
|
161
178
|
return {"available": False, "note": "No projects indexed yet. Run 'codespine analyse <path>'."}
|
|
179
|
+
now = int(time.time())
|
|
162
180
|
result = []
|
|
163
181
|
for p in projects:
|
|
164
182
|
sym = store.query_records(
|
|
@@ -173,14 +191,22 @@ def build_mcp_server(store, repo_path_provider):
|
|
|
173
191
|
"MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as count",
|
|
174
192
|
{"pid": p["id"]},
|
|
175
193
|
)
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
194
|
+
indexed_at_ts = int(p.get("indexed_at") or 0)
|
|
195
|
+
age_s = now - indexed_at_ts if indexed_at_ts else None
|
|
196
|
+
entry: dict = {
|
|
197
|
+
"project_id": p["id"],
|
|
198
|
+
"path": p["path"],
|
|
199
|
+
"symbol_count": sym[0]["count"] if sym else 0,
|
|
200
|
+
"file_count": files[0]["count"] if files else 0,
|
|
201
|
+
"indexed_at_epoch": indexed_at_ts or None,
|
|
202
|
+
"index_age_seconds": age_s,
|
|
203
|
+
}
|
|
204
|
+
if age_s is not None and age_s > 3600:
|
|
205
|
+
entry["stale_warning"] = (
|
|
206
|
+
f"Index is {age_s // 3600}h {(age_s % 3600) // 60}m old. "
|
|
207
|
+
"Run analyse_project() or start_watch() to refresh."
|
|
208
|
+
)
|
|
209
|
+
result.append(entry)
|
|
184
210
|
return {"available": True, "projects": result}
|
|
185
211
|
|
|
186
212
|
# ------------------------------------------------------------------
|
|
@@ -371,7 +397,10 @@ def build_mcp_server(store, repo_path_provider):
|
|
|
371
397
|
"""
|
|
372
398
|
name_lower = name.lower()
|
|
373
399
|
project_clause = "AND f.project_id = $proj" if project else ""
|
|
374
|
-
|
|
400
|
+
# Note: only $namel and $lim are referenced in the queries below.
|
|
401
|
+
# Do NOT add extra keys here — some Kuzu versions raise "Parameter not found"
|
|
402
|
+
# when the params dict contains keys absent from the query string.
|
|
403
|
+
params: dict = {"namel": name_lower, "lim": limit}
|
|
375
404
|
if project:
|
|
376
405
|
params["proj"] = project
|
|
377
406
|
|
|
@@ -591,15 +620,40 @@ def build_mcp_server(store, repo_path_provider):
|
|
|
591
620
|
if not os.path.isdir(abs_path):
|
|
592
621
|
return {"available": False, "note": f"Path does not exist or is not a directory: {abs_path}"}
|
|
593
622
|
|
|
623
|
+
import tempfile as _tempfile
|
|
624
|
+
watch_err_file = _tempfile.NamedTemporaryFile(
|
|
625
|
+
mode="w", suffix=".log", prefix="codespine_watch_", delete=False
|
|
626
|
+
)
|
|
627
|
+
watch_err_path = watch_err_file.name
|
|
628
|
+
watch_err_file.close()
|
|
629
|
+
|
|
594
630
|
proc = subprocess.Popen(
|
|
595
631
|
[
|
|
596
632
|
sys.executable, "-m", "codespine.cli",
|
|
597
633
|
"watch", "--path", abs_path,
|
|
598
634
|
"--global-interval", str(global_interval),
|
|
599
635
|
],
|
|
600
|
-
stdout=
|
|
601
|
-
stderr=subprocess.
|
|
636
|
+
stdout=open(watch_err_path, "w", encoding="utf-8"),
|
|
637
|
+
stderr=subprocess.STDOUT,
|
|
602
638
|
)
|
|
639
|
+
|
|
640
|
+
# Brief health check — if the process dies within 1 s it crashed at startup.
|
|
641
|
+
time.sleep(1)
|
|
642
|
+
if proc.poll() is not None:
|
|
643
|
+
try:
|
|
644
|
+
with open(watch_err_path, "r", encoding="utf-8", errors="replace") as fh:
|
|
645
|
+
err_tail = fh.read().strip().splitlines()[-10:]
|
|
646
|
+
except Exception:
|
|
647
|
+
err_tail = []
|
|
648
|
+
return {
|
|
649
|
+
"available": False,
|
|
650
|
+
"note": (
|
|
651
|
+
f"Watch mode process exited immediately (code {proc.returncode}). "
|
|
652
|
+
"Check that the path is valid and watchfiles is installed."
|
|
653
|
+
),
|
|
654
|
+
"error_tail": err_tail,
|
|
655
|
+
}
|
|
656
|
+
|
|
603
657
|
_watch["proc"] = proc
|
|
604
658
|
_watch["path"] = abs_path
|
|
605
659
|
_watch["started_at"] = time.time()
|
|
@@ -5,10 +5,26 @@ import re
|
|
|
5
5
|
from collections import Counter
|
|
6
6
|
|
|
7
7
|
TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
|
|
8
|
+
_CAMEL_SPLIT_RE = re.compile(r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=\D)(?=\d)|(?<=\d)(?=\D)")
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
def tokenize(text: str) -> list[str]:
|
|
11
|
-
|
|
12
|
+
"""Tokenize text, splitting on camelCase and underscores in addition to whitespace.
|
|
13
|
+
|
|
14
|
+
'SolicitPanFetchActionCompletionEvent' → ['solicit', 'pan', 'fetch', 'action', 'completion', 'event']
|
|
15
|
+
'get_symbol_context' → ['get', 'symbol', 'context']
|
|
16
|
+
"""
|
|
17
|
+
raw_tokens = TOKEN_RE.findall(text or "")
|
|
18
|
+
out: list[str] = []
|
|
19
|
+
for tok in raw_tokens:
|
|
20
|
+
# Split underscore-joined segments first, then camelCase within each
|
|
21
|
+
for segment in tok.split("_"):
|
|
22
|
+
if not segment:
|
|
23
|
+
continue
|
|
24
|
+
for part in _CAMEL_SPLIT_RE.split(segment):
|
|
25
|
+
if part:
|
|
26
|
+
out.append(part.lower())
|
|
27
|
+
return out
|
|
12
28
|
|
|
13
29
|
|
|
14
30
|
def rank_bm25(query: str, docs: list[tuple[str, str]], k1: float = 1.2, b: float = 0.75) -> list[tuple[str, float]]:
|
|
@@ -1,18 +1,22 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from codespine.config import SETTINGS
|
|
4
3
|
from codespine.search.bm25 import rank_bm25
|
|
5
4
|
from codespine.search.fuzzy import rank_fuzzy
|
|
6
5
|
from codespine.search.rrf import reciprocal_rank_fusion
|
|
7
6
|
from codespine.search.vector import rank_semantic
|
|
8
7
|
|
|
8
|
+
_LOW_CONFIDENCE_THRESHOLD = 0.05
|
|
9
|
+
|
|
9
10
|
|
|
10
11
|
def hybrid_search(store, query: str, k: int = 20, project: str | None = None) -> list[dict]:
|
|
11
12
|
project_clause = "AND f.project_id = $proj" if project else ""
|
|
12
|
-
params: dict = {
|
|
13
|
+
params: dict = {}
|
|
13
14
|
if project:
|
|
14
15
|
params["proj"] = project
|
|
15
16
|
|
|
17
|
+
# No LIMIT — load all symbols for the scoped project so that exact class names
|
|
18
|
+
# are never missing from the candidate pool (previously capped at 2000 which
|
|
19
|
+
# caused exact matches on 4000+ file projects to be silently dropped).
|
|
16
20
|
recs = store.query_records(
|
|
17
21
|
f"""
|
|
18
22
|
MATCH (s:Symbol), (f:File)
|
|
@@ -24,7 +28,6 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
|
|
|
24
28
|
s.embedding as embedding,
|
|
25
29
|
f.path as file_path,
|
|
26
30
|
f.is_test as is_test
|
|
27
|
-
LIMIT $lim
|
|
28
31
|
""",
|
|
29
32
|
params,
|
|
30
33
|
)
|
|
@@ -32,6 +35,8 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
|
|
|
32
35
|
if not recs:
|
|
33
36
|
return []
|
|
34
37
|
|
|
38
|
+
query_lower = query.lower().strip()
|
|
39
|
+
|
|
35
40
|
lexical_docs = [(r["id"], f"{r.get('name', '')} {r.get('fqname', '')}") for r in recs]
|
|
36
41
|
fuzzy_docs = [(r["id"], r.get("name", "")) for r in recs]
|
|
37
42
|
vector_docs = [(r["id"], r.get("embedding")) for r in recs]
|
|
@@ -40,11 +45,11 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
|
|
|
40
45
|
fuzzy_rank = rank_fuzzy(query, fuzzy_docs)
|
|
41
46
|
semantic_rank = rank_semantic(query, vector_docs)
|
|
42
47
|
|
|
43
|
-
fused = reciprocal_rank_fusion([bm25_rank, semantic_rank, fuzzy_rank]
|
|
48
|
+
fused = reciprocal_rank_fusion([bm25_rank, semantic_rank, fuzzy_rank])
|
|
44
49
|
rec_by_id = {r["id"]: r for r in recs}
|
|
45
50
|
|
|
46
51
|
results = []
|
|
47
|
-
for doc_id, score in fused
|
|
52
|
+
for doc_id, score in fused:
|
|
48
53
|
rec = rec_by_id.get(doc_id)
|
|
49
54
|
if not rec:
|
|
50
55
|
continue
|
|
@@ -55,7 +60,12 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
|
|
|
55
60
|
if rec.get("kind") in {"method", "class"}:
|
|
56
61
|
multiplier *= 1.2
|
|
57
62
|
|
|
58
|
-
|
|
63
|
+
# Exact name match: guarantee this symbol ranks first regardless of RRF score.
|
|
64
|
+
name_lower = (rec.get("name") or "").lower()
|
|
65
|
+
fqname_lower = (rec.get("fqname") or "").lower()
|
|
66
|
+
if name_lower == query_lower or fqname_lower == query_lower:
|
|
67
|
+
multiplier *= 5.0
|
|
68
|
+
|
|
59
69
|
results.append(
|
|
60
70
|
{
|
|
61
71
|
"id": doc_id,
|
|
@@ -63,14 +73,15 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
|
|
|
63
73
|
"name": rec.get("name"),
|
|
64
74
|
"fqname": rec.get("fqname"),
|
|
65
75
|
"file_path": rec.get("file_path"),
|
|
66
|
-
"score":
|
|
76
|
+
"score": score * multiplier,
|
|
67
77
|
}
|
|
68
78
|
)
|
|
69
79
|
|
|
70
80
|
results.sort(key=lambda x: x["score"], reverse=True)
|
|
81
|
+
top_k = results[:k]
|
|
71
82
|
|
|
72
83
|
# Attach architectural context in same response.
|
|
73
|
-
for item in
|
|
84
|
+
for item in top_k:
|
|
74
85
|
ctx = store.query_records(
|
|
75
86
|
"""
|
|
76
87
|
MATCH (s:Symbol {id: $sid})-[:IN_COMMUNITY]->(c:Community)
|
|
@@ -83,4 +94,15 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
|
|
|
83
94
|
)
|
|
84
95
|
item["context"] = ctx
|
|
85
96
|
|
|
86
|
-
|
|
97
|
+
# Warn when all scores are near zero — the results are likely noise.
|
|
98
|
+
if top_k and top_k[0]["score"] < _LOW_CONFIDENCE_THRESHOLD:
|
|
99
|
+
for item in top_k:
|
|
100
|
+
item["low_confidence"] = True
|
|
101
|
+
top_k.append({
|
|
102
|
+
"note": (
|
|
103
|
+
"Low confidence results — all scores below threshold. "
|
|
104
|
+
"If searching for an exact class or method name, use find_symbol instead."
|
|
105
|
+
)
|
|
106
|
+
})
|
|
107
|
+
|
|
108
|
+
return top_k
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import math
|
|
6
|
+
import os
|
|
7
|
+
import threading
|
|
8
|
+
from functools import lru_cache
|
|
9
|
+
|
|
10
|
+
from codespine.config import SETTINGS
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _hash_vector(text: str, dim: int) -> list[float]:
|
|
14
|
+
"""Deterministic fallback embedding when sentence-transformers is unavailable."""
|
|
15
|
+
vec = [0.0] * dim
|
|
16
|
+
if not text:
|
|
17
|
+
return vec
|
|
18
|
+
tokens = text.lower().split()
|
|
19
|
+
for token in tokens:
|
|
20
|
+
digest = hashlib.sha1(token.encode("utf-8")).digest()
|
|
21
|
+
idx = int.from_bytes(digest[:2], "big") % dim
|
|
22
|
+
sign = 1.0 if digest[2] % 2 == 0 else -1.0
|
|
23
|
+
vec[idx] += sign
|
|
24
|
+
norm = math.sqrt(sum(v * v for v in vec)) or 1.0
|
|
25
|
+
return [v / norm for v in vec]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@lru_cache(maxsize=1)
|
|
29
|
+
def _load_model():
|
|
30
|
+
try:
|
|
31
|
+
from sentence_transformers import SentenceTransformer
|
|
32
|
+
|
|
33
|
+
return SentenceTransformer(SETTINGS.embedding_model)
|
|
34
|
+
except Exception:
|
|
35
|
+
return None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class _EmbeddingCache:
|
|
39
|
+
"""Thread-safe in-memory embedding cache backed by a JSON file.
|
|
40
|
+
|
|
41
|
+
Replaces the previous SQLite-based cache which caused threading issues
|
|
42
|
+
(database is locked / created in wrong thread) under MCP server concurrency.
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
def __init__(self, path: str) -> None:
|
|
46
|
+
self._path = path
|
|
47
|
+
self._lock = threading.Lock()
|
|
48
|
+
self._data: dict[str, str] | None = None # loaded lazily
|
|
49
|
+
|
|
50
|
+
def _ensure_loaded(self) -> None:
|
|
51
|
+
"""Load cache from disk. Must be called with _lock held."""
|
|
52
|
+
if self._data is not None:
|
|
53
|
+
return
|
|
54
|
+
if os.path.isfile(self._path):
|
|
55
|
+
try:
|
|
56
|
+
with open(self._path, "r", encoding="utf-8") as f:
|
|
57
|
+
loaded = json.load(f)
|
|
58
|
+
if isinstance(loaded, dict):
|
|
59
|
+
self._data = loaded
|
|
60
|
+
return
|
|
61
|
+
except Exception:
|
|
62
|
+
pass
|
|
63
|
+
self._data = {}
|
|
64
|
+
|
|
65
|
+
def _flush(self) -> None:
|
|
66
|
+
"""Persist cache to disk atomically. Must be called with _lock held."""
|
|
67
|
+
try:
|
|
68
|
+
dir_path = os.path.dirname(self._path)
|
|
69
|
+
if dir_path:
|
|
70
|
+
os.makedirs(dir_path, exist_ok=True)
|
|
71
|
+
tmp = self._path + ".tmp"
|
|
72
|
+
with open(tmp, "w", encoding="utf-8") as f:
|
|
73
|
+
json.dump(self._data, f, separators=(",", ":"))
|
|
74
|
+
os.replace(tmp, self._path)
|
|
75
|
+
except Exception:
|
|
76
|
+
pass
|
|
77
|
+
|
|
78
|
+
def get(self, key: str) -> list[float] | None:
|
|
79
|
+
with self._lock:
|
|
80
|
+
self._ensure_loaded()
|
|
81
|
+
raw = self._data.get(key) # type: ignore[union-attr]
|
|
82
|
+
if raw is None:
|
|
83
|
+
return None
|
|
84
|
+
try:
|
|
85
|
+
return [float(x) for x in json.loads(raw)]
|
|
86
|
+
except Exception:
|
|
87
|
+
return None
|
|
88
|
+
|
|
89
|
+
def set(self, key: str, vec: list[float]) -> None:
|
|
90
|
+
with self._lock:
|
|
91
|
+
self._ensure_loaded()
|
|
92
|
+
self._data[key] = json.dumps(vec) # type: ignore[index]
|
|
93
|
+
self._flush()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
_CACHE = _EmbeddingCache(SETTINGS.embedding_cache_path)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _cache_key(text: str, dim: int) -> str:
|
|
100
|
+
return hashlib.sha1(f"{SETTINGS.embedding_model}|{dim}|{text}".encode("utf-8")).hexdigest()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def embed_text(text: str, dim: int | None = None) -> list[float]:
|
|
104
|
+
dim = dim or SETTINGS.vector_dim
|
|
105
|
+
key = _cache_key(text or "", dim)
|
|
106
|
+
|
|
107
|
+
cached = _CACHE.get(key)
|
|
108
|
+
if cached is not None:
|
|
109
|
+
return cached
|
|
110
|
+
|
|
111
|
+
model = _load_model()
|
|
112
|
+
if model is None:
|
|
113
|
+
vec = _hash_vector(text, dim)
|
|
114
|
+
else:
|
|
115
|
+
vec = [float(x) for x in model.encode([text or ""], normalize_embeddings=True)[0]]
|
|
116
|
+
|
|
117
|
+
_CACHE.set(key, vec)
|
|
118
|
+
return vec
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
|
|
122
|
+
if not vec_a or not vec_b:
|
|
123
|
+
return 0.0
|
|
124
|
+
n = min(len(vec_a), len(vec_b))
|
|
125
|
+
dot = sum(vec_a[i] * vec_b[i] for i in range(n))
|
|
126
|
+
na = math.sqrt(sum(vec_a[i] * vec_a[i] for i in range(n))) or 1.0
|
|
127
|
+
nb = math.sqrt(sum(vec_b[i] * vec_b[i] for i in range(n))) or 1.0
|
|
128
|
+
return dot / (na * nb)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def rank_semantic(query: str, docs: list[tuple[str, list[float] | None]]) -> list[tuple[str, float]]:
|
|
132
|
+
qv = embed_text(query)
|
|
133
|
+
ranked: list[tuple[str, float]] = []
|
|
134
|
+
for doc_id, emb in docs:
|
|
135
|
+
if emb is None:
|
|
136
|
+
continue
|
|
137
|
+
ranked.append((doc_id, cosine_similarity(qv, emb)))
|
|
138
|
+
ranked.sort(key=lambda x: x[1], reverse=True)
|
|
139
|
+
return ranked
|
|
@@ -1,122 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import hashlib
|
|
4
|
-
import math
|
|
5
|
-
import sqlite3
|
|
6
|
-
from functools import lru_cache
|
|
7
|
-
|
|
8
|
-
from codespine.config import SETTINGS
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def _hash_vector(text: str, dim: int) -> list[float]:
|
|
12
|
-
"""Deterministic fallback embedding when sentence-transformers is unavailable."""
|
|
13
|
-
vec = [0.0] * dim
|
|
14
|
-
if not text:
|
|
15
|
-
return vec
|
|
16
|
-
tokens = text.lower().split()
|
|
17
|
-
for token in tokens:
|
|
18
|
-
digest = hashlib.sha1(token.encode("utf-8")).digest()
|
|
19
|
-
idx = int.from_bytes(digest[:2], "big") % dim
|
|
20
|
-
sign = 1.0 if digest[2] % 2 == 0 else -1.0
|
|
21
|
-
vec[idx] += sign
|
|
22
|
-
norm = math.sqrt(sum(v * v for v in vec)) or 1.0
|
|
23
|
-
return [v / norm for v in vec]
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
@lru_cache(maxsize=1)
|
|
27
|
-
def _load_model():
|
|
28
|
-
try:
|
|
29
|
-
from sentence_transformers import SentenceTransformer
|
|
30
|
-
|
|
31
|
-
return SentenceTransformer(SETTINGS.embedding_model)
|
|
32
|
-
except Exception:
|
|
33
|
-
return None
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
@lru_cache(maxsize=1)
|
|
37
|
-
def _embedding_cache_conn():
|
|
38
|
-
path = SETTINGS.embedding_cache_db
|
|
39
|
-
try:
|
|
40
|
-
os_dir = path.rsplit("/", 1)[0] if "/" in path else ""
|
|
41
|
-
if os_dir:
|
|
42
|
-
import os
|
|
43
|
-
|
|
44
|
-
os.makedirs(os_dir, exist_ok=True)
|
|
45
|
-
conn = sqlite3.connect(path, check_same_thread=False)
|
|
46
|
-
except Exception:
|
|
47
|
-
conn = sqlite3.connect("/tmp/.codespine_embedding_cache.sqlite3", check_same_thread=False)
|
|
48
|
-
conn.execute(
|
|
49
|
-
"""
|
|
50
|
-
CREATE TABLE IF NOT EXISTS embedding_cache (
|
|
51
|
-
cache_key TEXT PRIMARY KEY,
|
|
52
|
-
dim INTEGER NOT NULL,
|
|
53
|
-
vector_json TEXT NOT NULL
|
|
54
|
-
)
|
|
55
|
-
"""
|
|
56
|
-
)
|
|
57
|
-
return conn
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def _cache_key(text: str, dim: int) -> str:
|
|
61
|
-
return hashlib.sha1(f"{SETTINGS.embedding_model}|{dim}|{text}".encode("utf-8")).hexdigest()
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
def _get_cached_embedding(text: str, dim: int) -> list[float] | None:
|
|
65
|
-
key = _cache_key(text, dim)
|
|
66
|
-
conn = _embedding_cache_conn()
|
|
67
|
-
row = conn.execute("SELECT vector_json FROM embedding_cache WHERE cache_key = ? AND dim = ?", (key, dim)).fetchone()
|
|
68
|
-
if not row:
|
|
69
|
-
return None
|
|
70
|
-
import json
|
|
71
|
-
|
|
72
|
-
return [float(x) for x in json.loads(row[0])]
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
def _set_cached_embedding(text: str, dim: int, vec: list[float]) -> None:
|
|
76
|
-
key = _cache_key(text, dim)
|
|
77
|
-
conn = _embedding_cache_conn()
|
|
78
|
-
import json
|
|
79
|
-
|
|
80
|
-
conn.execute(
|
|
81
|
-
"INSERT OR REPLACE INTO embedding_cache(cache_key, dim, vector_json) VALUES (?, ?, ?)",
|
|
82
|
-
(key, dim, json.dumps(vec)),
|
|
83
|
-
)
|
|
84
|
-
conn.commit()
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def embed_text(text: str, dim: int | None = None) -> list[float]:
|
|
88
|
-
dim = dim or SETTINGS.vector_dim
|
|
89
|
-
cached = _get_cached_embedding(text or "", dim)
|
|
90
|
-
if cached is not None:
|
|
91
|
-
return cached
|
|
92
|
-
|
|
93
|
-
model = _load_model()
|
|
94
|
-
if model is None:
|
|
95
|
-
vec = _hash_vector(text, dim)
|
|
96
|
-
_set_cached_embedding(text or "", dim, vec)
|
|
97
|
-
return vec
|
|
98
|
-
|
|
99
|
-
vec = [float(x) for x in model.encode([text or ""], normalize_embeddings=True)[0]]
|
|
100
|
-
_set_cached_embedding(text or "", dim, vec)
|
|
101
|
-
return vec
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
|
|
105
|
-
if not vec_a or not vec_b:
|
|
106
|
-
return 0.0
|
|
107
|
-
n = min(len(vec_a), len(vec_b))
|
|
108
|
-
dot = sum(vec_a[i] * vec_b[i] for i in range(n))
|
|
109
|
-
na = math.sqrt(sum(vec_a[i] * vec_a[i] for i in range(n))) or 1.0
|
|
110
|
-
nb = math.sqrt(sum(vec_b[i] * vec_b[i] for i in range(n))) or 1.0
|
|
111
|
-
return dot / (na * nb)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
def rank_semantic(query: str, docs: list[tuple[str, list[float] | None]]) -> list[tuple[str, float]]:
|
|
115
|
-
qv = embed_text(query)
|
|
116
|
-
ranked: list[tuple[str, float]] = []
|
|
117
|
-
for doc_id, emb in docs:
|
|
118
|
-
if emb is None:
|
|
119
|
-
continue
|
|
120
|
-
ranked.append((doc_id, cosine_similarity(qv, emb)))
|
|
121
|
-
ranked.sort(key=lambda x: x[1], reverse=True)
|
|
122
|
-
return ranked
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|