codespine: codespine-0.3.0.tar.gz → codespine-0.4.0.tar.gz

This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (51)
  1. {codespine-0.3.0 → codespine-0.4.0}/PKG-INFO +1 -1
  2. {codespine-0.3.0 → codespine-0.4.0}/codespine/__init__.py +1 -1
  3. {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/deadcode.py +48 -2
  4. {codespine-0.3.0 → codespine-0.4.0}/codespine/config.py +1 -1
  5. {codespine-0.3.0 → codespine-0.4.0}/codespine/db/schema.py +5 -2
  6. {codespine-0.3.0 → codespine-0.4.0}/codespine/db/store.py +21 -11
  7. {codespine-0.3.0 → codespine-0.4.0}/codespine/mcp/server.py +67 -13
  8. {codespine-0.3.0 → codespine-0.4.0}/codespine/search/bm25.py +17 -1
  9. {codespine-0.3.0 → codespine-0.4.0}/codespine/search/hybrid.py +31 -9
  10. codespine-0.4.0/codespine/search/vector.py +139 -0
  11. {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/PKG-INFO +1 -1
  12. {codespine-0.3.0 → codespine-0.4.0}/pyproject.toml +1 -1
  13. codespine-0.3.0/codespine/search/vector.py +0 -122
  14. {codespine-0.3.0 → codespine-0.4.0}/LICENSE +0 -0
  15. {codespine-0.3.0 → codespine-0.4.0}/README.md +0 -0
  16. {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/__init__.py +0 -0
  17. {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/community.py +0 -0
  18. {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/context.py +0 -0
  19. {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/coupling.py +0 -0
  20. {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/flow.py +0 -0
  21. {codespine-0.3.0 → codespine-0.4.0}/codespine/analysis/impact.py +0 -0
  22. {codespine-0.3.0 → codespine-0.4.0}/codespine/cli.py +0 -0
  23. {codespine-0.3.0 → codespine-0.4.0}/codespine/db/__init__.py +0 -0
  24. {codespine-0.3.0 → codespine-0.4.0}/codespine/diff/__init__.py +0 -0
  25. {codespine-0.3.0 → codespine-0.4.0}/codespine/diff/branch_diff.py +0 -0
  26. {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/__init__.py +0 -0
  27. {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/call_resolver.py +0 -0
  28. {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/engine.py +0 -0
  29. {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/java_parser.py +0 -0
  30. {codespine-0.3.0 → codespine-0.4.0}/codespine/indexer/symbol_builder.py +0 -0
  31. {codespine-0.3.0 → codespine-0.4.0}/codespine/mcp/__init__.py +0 -0
  32. {codespine-0.3.0 → codespine-0.4.0}/codespine/noise/__init__.py +0 -0
  33. {codespine-0.3.0 → codespine-0.4.0}/codespine/noise/blocklist.py +0 -0
  34. {codespine-0.3.0 → codespine-0.4.0}/codespine/search/__init__.py +0 -0
  35. {codespine-0.3.0 → codespine-0.4.0}/codespine/search/fuzzy.py +0 -0
  36. {codespine-0.3.0 → codespine-0.4.0}/codespine/search/rrf.py +0 -0
  37. {codespine-0.3.0 → codespine-0.4.0}/codespine/watch/__init__.py +0 -0
  38. {codespine-0.3.0 → codespine-0.4.0}/codespine/watch/watcher.py +0 -0
  39. {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/SOURCES.txt +0 -0
  40. {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/dependency_links.txt +0 -0
  41. {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/entry_points.txt +0 -0
  42. {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/requires.txt +0 -0
  43. {codespine-0.3.0 → codespine-0.4.0}/codespine.egg-info/top_level.txt +0 -0
  44. {codespine-0.3.0 → codespine-0.4.0}/gindex.py +0 -0
  45. {codespine-0.3.0 → codespine-0.4.0}/setup.cfg +0 -0
  46. {codespine-0.3.0 → codespine-0.4.0}/tests/test_branch_diff_normalize.py +0 -0
  47. {codespine-0.3.0 → codespine-0.4.0}/tests/test_call_resolver.py +0 -0
  48. {codespine-0.3.0 → codespine-0.4.0}/tests/test_index_and_hybrid.py +0 -0
  49. {codespine-0.3.0 → codespine-0.4.0}/tests/test_java_parser.py +0 -0
  50. {codespine-0.3.0 → codespine-0.4.0}/tests/test_multimodule_index.py +0 -0
  51. {codespine-0.3.0 → codespine-0.4.0}/tests/test_search_ranking.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "0.3.0"
4
+ __version__ = "0.4.0"
@@ -1,17 +1,63 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  EXEMPT_ANNOTATIONS = {
4
+ # Java standard
4
5
  "Override",
6
+ # JUnit / testing
5
7
  "Test",
6
8
  "ParameterizedTest",
9
+ "BeforeEach",
10
+ "AfterEach",
11
+ "BeforeAll",
12
+ "AfterAll",
13
+ # Spring – component model (class-level; methods inside are never "dead")
14
+ "Component",
15
+ "Service",
16
+ "Repository",
17
+ "Controller",
18
+ "RestController",
19
+ "Configuration",
7
20
  "Bean",
21
+ "Aspect",
22
+ # Spring – lifecycle / event hooks
8
23
  "PostConstruct",
9
24
  "PreDestroy",
25
+ "EventListener",
26
+ "TransactionalEventListener",
10
27
  "Scheduled",
28
+ # Spring – web entry points
29
+ "RequestMapping",
30
+ "GetMapping",
31
+ "PostMapping",
32
+ "PutMapping",
33
+ "DeleteMapping",
34
+ "PatchMapping",
35
+ "MessageMapping",
36
+ # Spring – messaging / async
11
37
  "KafkaListener",
12
- "EventListener",
13
- "JsonCreator",
38
+ "RabbitListener",
39
+ "JmsListener",
40
+ "SqsListener",
41
+ "StreamListener",
42
+ # Spring Data / persistence
43
+ "Query",
44
+ "Modifying",
45
+ # Guice DI
14
46
  "Inject",
47
+ "Provides",
48
+ "Singleton",
49
+ "Named",
50
+ "Qualifier",
51
+ # Jakarta / javax DI (same semantics as Guice/Spring variants)
52
+ "ApplicationScoped",
53
+ "RequestScoped",
54
+ "SessionScoped",
55
+ "Dependent",
56
+ # Jackson / serialization (called reflectively)
57
+ "JsonCreator",
58
+ "JsonProperty",
59
+ "JsonDeserialize",
60
+ "JsonSerialize",
15
61
  }
16
62
 
17
63
  EXEMPT_CONTRACT_METHODS = {
@@ -7,7 +7,7 @@ class Settings:
7
7
  db_path: str = os.path.expanduser("~/.codespine_db")
8
8
  pid_file: str = os.path.expanduser("~/.codespine.pid")
9
9
  log_file: str = os.path.expanduser("~/.codespine.log")
10
- embedding_cache_db: str = os.path.expanduser("~/.codespine_embedding_cache.sqlite3")
10
+ embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")
11
11
  index_meta_dir: str = os.path.expanduser("~/.codespine_index_meta")
12
12
  embedding_model: str = "BAAI/bge-small-en-v1.5"
13
13
  vector_dim: int = 384
@@ -10,7 +10,7 @@ NODE_TABLES: list[tuple[str, str]] = [
10
10
  ("SchemaMeta", "CREATE NODE TABLE SchemaMeta(key STRING, value STRING, PRIMARY KEY (key))"),
11
11
  (
12
12
  "Project",
13
- "CREATE NODE TABLE Project(id STRING, path STRING, language STRING, PRIMARY KEY (id))",
13
+ "CREATE NODE TABLE Project(id STRING, path STRING, language STRING, indexed_at STRING, PRIMARY KEY (id))",
14
14
  ),
15
15
  (
16
16
  "File",
@@ -76,7 +76,10 @@ def ensure_schema(conn) -> None:
76
76
  _safe_execute(conn, "CALL CREATE_FTS_INDEX('method_fts', 'Method', ['name', 'signature'])")
77
77
  _safe_execute(conn, "CALL CREATE_FTS_INDEX('class_fts', 'Class', ['name', 'fqcn'])")
78
78
 
79
+ # Best-effort migration: add indexed_at column to existing Project tables.
80
+ _safe_execute(conn, "ALTER TABLE Project ADD indexed_at STRING DEFAULT ''")
81
+
79
82
  _safe_execute(
80
83
  conn,
81
- "MERGE (s:SchemaMeta {key: 'schema_version'}) SET s.value = '2'",
84
+ "MERGE (s:SchemaMeta {key: 'schema_version'}) SET s.value = '3'",
82
85
  )
@@ -5,6 +5,7 @@ import json
5
5
  import logging
6
6
  import os
7
7
  import threading
8
+ import time
8
9
  from contextlib import contextmanager
9
10
  from dataclasses import dataclass
10
11
  from typing import Any
@@ -69,7 +70,12 @@ class GraphStore:
69
70
  self.execute("COMMIT")
70
71
  except Exception:
71
72
  if tx_started:
72
- self.execute("ROLLBACK")
73
+ try:
74
+ self.execute("ROLLBACK")
75
+ except Exception:
76
+ # Kuzu may have already rolled back (e.g. on OOM), making a
77
+ # second ROLLBACK raise "No active transaction". Swallow it.
78
+ pass
73
79
  raise
74
80
 
75
81
  def clear_project(self, project_id: str) -> None:
@@ -108,8 +114,8 @@ class GraphStore:
108
114
 
109
115
  def upsert_project(self, project_id: str, path: str) -> None:
110
116
  self.execute(
111
- "MERGE (p:Project {id: $id}) SET p.path = $path, p.language = 'java'",
112
- {"id": project_id, "path": path},
117
+ "MERGE (p:Project {id: $id}) SET p.path = $path, p.language = 'java', p.indexed_at = $ts",
118
+ {"id": project_id, "path": path, "ts": str(int(time.time()))},
113
119
  )
114
120
 
115
121
  def project_file_hashes(self, project_id: str) -> dict[str, dict[str, str]]:
@@ -297,14 +303,18 @@ class GraphStore:
297
303
  "MERGE (c:Community {id: $id}) SET c.label = $label, c.cohesion = $cohesion",
298
304
  {"id": community_id, "label": label, "cohesion": cohesion},
299
305
  )
300
- # Batch all symbol→community edges in one transaction to prevent buffer pool exhaustion
301
- # on large projects (53 K+ symbols would OOM without a single commit boundary).
302
- with self.transaction():
303
- for sid in symbol_ids:
304
- self.execute(
305
- "MATCH (s:Symbol {id: $sid}), (c:Community {id: $cid}) MERGE (s)-[:IN_COMMUNITY]->(c)",
306
- {"sid": sid, "cid": community_id},
307
- )
306
+ # Commit in batches of 50 to keep Kuzu's buffer pool from OOMing on large
307
+ # communities. A single transaction over thousands of MERGE statements exhausts
308
+ # the 256 MB buffer pool before it can page out.
309
+ _BATCH = 50
310
+ for i in range(0, len(symbol_ids), _BATCH):
311
+ batch = symbol_ids[i : i + _BATCH]
312
+ with self.transaction():
313
+ for sid in batch:
314
+ self.execute(
315
+ "MATCH (s:Symbol {id: $sid}), (c:Community {id: $cid}) MERGE (s)-[:IN_COMMUNITY]->(c)",
316
+ {"sid": sid, "cid": community_id},
317
+ )
308
318
 
309
319
  def set_flow(self, flow_id: str, entry_symbol_id: str, kind: str, symbols_at_depth: list[tuple[str, int]]) -> None:
310
320
  self.execute(
@@ -71,7 +71,9 @@ def build_mcp_server(store, repo_path_provider):
71
71
  Call this before other tools so you know what's ready without trial-and-error.
72
72
  Features marked false may need 'codespine analyse --deep' or optional dependencies.
73
73
  """
74
- projects = store.query_records("MATCH (p:Project) RETURN p.id as id, p.path as path")
74
+ projects = store.query_records(
75
+ "MATCH (p:Project) RETURN p.id as id, p.path as path, p.indexed_at as indexed_at"
76
+ )
75
77
  sym_q = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
76
78
  comm_q = store.query_records("MATCH (c:Community) RETURN count(c) as count")
77
79
  flow_q = store.query_records("MATCH (f:Flow) RETURN count(f) as count")
@@ -97,7 +99,20 @@ def build_mcp_server(store, repo_path_provider):
97
99
  watch_running = _watch["proc"] is not None and _watch["proc"].poll() is None
98
100
  analyse_running = _analyse["proc"] is not None and _analyse["proc"].poll() is None
99
101
 
102
+ now = int(time.time())
103
+ stale_projects = []
104
+ for p in projects:
105
+ ts = int(p.get("indexed_at") or 0)
106
+ if ts and (now - ts) > 3600 and not watch_running:
107
+ age_h = (now - ts) // 3600
108
+ stale_projects.append(f"{p['id']} ({age_h}h old)")
109
+
100
110
  notes: dict[str, str] = {}
111
+ if stale_projects:
112
+ notes["stale_index"] = (
113
+ f"Index is stale for: {', '.join(stale_projects)}. "
114
+ "Run analyse_project() or start_watch() to refresh."
115
+ )
101
116
  if not n_comm:
102
117
  notes["community_detection"] = "Run 'codespine analyse --deep' to enable"
103
118
  if not n_flows:
@@ -156,9 +171,12 @@ def build_mcp_server(store, repo_path_provider):
156
171
  @mcp.tool()
157
172
  def list_projects():
158
173
  """List all indexed projects with their symbol and file counts."""
159
- projects = store.query_records("MATCH (p:Project) RETURN p.id as id, p.path as path")
174
+ projects = store.query_records(
175
+ "MATCH (p:Project) RETURN p.id as id, p.path as path, p.indexed_at as indexed_at"
176
+ )
160
177
  if not projects:
161
178
  return {"available": False, "note": "No projects indexed yet. Run 'codespine analyse <path>'."}
179
+ now = int(time.time())
162
180
  result = []
163
181
  for p in projects:
164
182
  sym = store.query_records(
@@ -173,14 +191,22 @@ def build_mcp_server(store, repo_path_provider):
173
191
  "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as count",
174
192
  {"pid": p["id"]},
175
193
  )
176
- result.append(
177
- {
178
- "project_id": p["id"],
179
- "path": p["path"],
180
- "symbol_count": sym[0]["count"] if sym else 0,
181
- "file_count": files[0]["count"] if files else 0,
182
- }
183
- )
194
+ indexed_at_ts = int(p.get("indexed_at") or 0)
195
+ age_s = now - indexed_at_ts if indexed_at_ts else None
196
+ entry: dict = {
197
+ "project_id": p["id"],
198
+ "path": p["path"],
199
+ "symbol_count": sym[0]["count"] if sym else 0,
200
+ "file_count": files[0]["count"] if files else 0,
201
+ "indexed_at_epoch": indexed_at_ts or None,
202
+ "index_age_seconds": age_s,
203
+ }
204
+ if age_s is not None and age_s > 3600:
205
+ entry["stale_warning"] = (
206
+ f"Index is {age_s // 3600}h {(age_s % 3600) // 60}m old. "
207
+ "Run analyse_project() or start_watch() to refresh."
208
+ )
209
+ result.append(entry)
184
210
  return {"available": True, "projects": result}
185
211
 
186
212
  # ------------------------------------------------------------------
@@ -371,7 +397,10 @@ def build_mcp_server(store, repo_path_provider):
371
397
  """
372
398
  name_lower = name.lower()
373
399
  project_clause = "AND f.project_id = $proj" if project else ""
374
- params: dict = {"name": name, "namel": name_lower, "lim": limit}
400
+ # Note: only $namel and $lim are referenced in the queries below.
401
+ # Do NOT add extra keys here — some Kuzu versions raise "Parameter not found"
402
+ # when the params dict contains keys absent from the query string.
403
+ params: dict = {"namel": name_lower, "lim": limit}
375
404
  if project:
376
405
  params["proj"] = project
377
406
 
@@ -591,15 +620,40 @@ def build_mcp_server(store, repo_path_provider):
591
620
  if not os.path.isdir(abs_path):
592
621
  return {"available": False, "note": f"Path does not exist or is not a directory: {abs_path}"}
593
622
 
623
+ import tempfile as _tempfile
624
+ watch_err_file = _tempfile.NamedTemporaryFile(
625
+ mode="w", suffix=".log", prefix="codespine_watch_", delete=False
626
+ )
627
+ watch_err_path = watch_err_file.name
628
+ watch_err_file.close()
629
+
594
630
  proc = subprocess.Popen(
595
631
  [
596
632
  sys.executable, "-m", "codespine.cli",
597
633
  "watch", "--path", abs_path,
598
634
  "--global-interval", str(global_interval),
599
635
  ],
600
- stdout=subprocess.DEVNULL,
601
- stderr=subprocess.DEVNULL,
636
+ stdout=open(watch_err_path, "w", encoding="utf-8"),
637
+ stderr=subprocess.STDOUT,
602
638
  )
639
+
640
+ # Brief health check — if the process dies within 1 s it crashed at startup.
641
+ time.sleep(1)
642
+ if proc.poll() is not None:
643
+ try:
644
+ with open(watch_err_path, "r", encoding="utf-8", errors="replace") as fh:
645
+ err_tail = fh.read().strip().splitlines()[-10:]
646
+ except Exception:
647
+ err_tail = []
648
+ return {
649
+ "available": False,
650
+ "note": (
651
+ f"Watch mode process exited immediately (code {proc.returncode}). "
652
+ "Check that the path is valid and watchfiles is installed."
653
+ ),
654
+ "error_tail": err_tail,
655
+ }
656
+
603
657
  _watch["proc"] = proc
604
658
  _watch["path"] = abs_path
605
659
  _watch["started_at"] = time.time()
@@ -5,10 +5,26 @@ import re
5
5
  from collections import Counter
6
6
 
7
7
  TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
8
+ _CAMEL_SPLIT_RE = re.compile(r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=\D)(?=\d)|(?<=\d)(?=\D)")
8
9
 
9
10
 
10
11
  def tokenize(text: str) -> list[str]:
11
- return [t.lower() for t in TOKEN_RE.findall(text or "")]
12
+ """Tokenize text, splitting on camelCase and underscores in addition to whitespace.
13
+
14
+ 'SolicitPanFetchActionCompletionEvent' → ['solicit', 'pan', 'fetch', 'action', 'completion', 'event']
15
+ 'get_symbol_context' → ['get', 'symbol', 'context']
16
+ """
17
+ raw_tokens = TOKEN_RE.findall(text or "")
18
+ out: list[str] = []
19
+ for tok in raw_tokens:
20
+ # Split underscore-joined segments first, then camelCase within each
21
+ for segment in tok.split("_"):
22
+ if not segment:
23
+ continue
24
+ for part in _CAMEL_SPLIT_RE.split(segment):
25
+ if part:
26
+ out.append(part.lower())
27
+ return out
12
28
 
13
29
 
14
30
  def rank_bm25(query: str, docs: list[tuple[str, str]], k1: float = 1.2, b: float = 0.75) -> list[tuple[str, float]]:
@@ -1,18 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
- from codespine.config import SETTINGS
4
3
  from codespine.search.bm25 import rank_bm25
5
4
  from codespine.search.fuzzy import rank_fuzzy
6
5
  from codespine.search.rrf import reciprocal_rank_fusion
7
6
  from codespine.search.vector import rank_semantic
8
7
 
8
+ _LOW_CONFIDENCE_THRESHOLD = 0.05
9
+
9
10
 
10
11
  def hybrid_search(store, query: str, k: int = 20, project: str | None = None) -> list[dict]:
11
12
  project_clause = "AND f.project_id = $proj" if project else ""
12
- params: dict = {"lim": SETTINGS.semantic_candidate_pool}
13
+ params: dict = {}
13
14
  if project:
14
15
  params["proj"] = project
15
16
 
17
+ # No LIMIT — load all symbols for the scoped project so that exact class names
18
+ # are never missing from the candidate pool (previously capped at 2000 which
19
+ # caused exact matches on 4000+ file projects to be silently dropped).
16
20
  recs = store.query_records(
17
21
  f"""
18
22
  MATCH (s:Symbol), (f:File)
@@ -24,7 +28,6 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
24
28
  s.embedding as embedding,
25
29
  f.path as file_path,
26
30
  f.is_test as is_test
27
- LIMIT $lim
28
31
  """,
29
32
  params,
30
33
  )
@@ -32,6 +35,8 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
32
35
  if not recs:
33
36
  return []
34
37
 
38
+ query_lower = query.lower().strip()
39
+
35
40
  lexical_docs = [(r["id"], f"{r.get('name', '')} {r.get('fqname', '')}") for r in recs]
36
41
  fuzzy_docs = [(r["id"], r.get("name", "")) for r in recs]
37
42
  vector_docs = [(r["id"], r.get("embedding")) for r in recs]
@@ -40,11 +45,11 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
40
45
  fuzzy_rank = rank_fuzzy(query, fuzzy_docs)
41
46
  semantic_rank = rank_semantic(query, vector_docs)
42
47
 
43
- fused = reciprocal_rank_fusion([bm25_rank, semantic_rank, fuzzy_rank], k=SETTINGS.rrf_k)
48
+ fused = reciprocal_rank_fusion([bm25_rank, semantic_rank, fuzzy_rank])
44
49
  rec_by_id = {r["id"]: r for r in recs}
45
50
 
46
51
  results = []
47
- for doc_id, score in fused[: max(k * 3, k)]:
52
+ for doc_id, score in fused:
48
53
  rec = rec_by_id.get(doc_id)
49
54
  if not rec:
50
55
  continue
@@ -55,7 +60,12 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
55
60
  if rec.get("kind") in {"method", "class"}:
56
61
  multiplier *= 1.2
57
62
 
58
- final_score = score * multiplier
63
+ # Exact name match: guarantee this symbol ranks first regardless of RRF score.
64
+ name_lower = (rec.get("name") or "").lower()
65
+ fqname_lower = (rec.get("fqname") or "").lower()
66
+ if name_lower == query_lower or fqname_lower == query_lower:
67
+ multiplier *= 5.0
68
+
59
69
  results.append(
60
70
  {
61
71
  "id": doc_id,
@@ -63,14 +73,15 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
63
73
  "name": rec.get("name"),
64
74
  "fqname": rec.get("fqname"),
65
75
  "file_path": rec.get("file_path"),
66
- "score": final_score,
76
+ "score": score * multiplier,
67
77
  }
68
78
  )
69
79
 
70
80
  results.sort(key=lambda x: x["score"], reverse=True)
81
+ top_k = results[:k]
71
82
 
72
83
  # Attach architectural context in same response.
73
- for item in results[:k]:
84
+ for item in top_k:
74
85
  ctx = store.query_records(
75
86
  """
76
87
  MATCH (s:Symbol {id: $sid})-[:IN_COMMUNITY]->(c:Community)
@@ -83,4 +94,15 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
83
94
  )
84
95
  item["context"] = ctx
85
96
 
86
- return results[:k]
97
+ # Warn when all scores are near zero — the results are likely noise.
98
+ if top_k and top_k[0]["score"] < _LOW_CONFIDENCE_THRESHOLD:
99
+ for item in top_k:
100
+ item["low_confidence"] = True
101
+ top_k.append({
102
+ "note": (
103
+ "Low confidence results — all scores below threshold. "
104
+ "If searching for an exact class or method name, use find_symbol instead."
105
+ )
106
+ })
107
+
108
+ return top_k
@@ -0,0 +1,139 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import math
6
+ import os
7
+ import threading
8
+ from functools import lru_cache
9
+
10
+ from codespine.config import SETTINGS
11
+
12
+
13
+ def _hash_vector(text: str, dim: int) -> list[float]:
14
+ """Deterministic fallback embedding when sentence-transformers is unavailable."""
15
+ vec = [0.0] * dim
16
+ if not text:
17
+ return vec
18
+ tokens = text.lower().split()
19
+ for token in tokens:
20
+ digest = hashlib.sha1(token.encode("utf-8")).digest()
21
+ idx = int.from_bytes(digest[:2], "big") % dim
22
+ sign = 1.0 if digest[2] % 2 == 0 else -1.0
23
+ vec[idx] += sign
24
+ norm = math.sqrt(sum(v * v for v in vec)) or 1.0
25
+ return [v / norm for v in vec]
26
+
27
+
28
+ @lru_cache(maxsize=1)
29
+ def _load_model():
30
+ try:
31
+ from sentence_transformers import SentenceTransformer
32
+
33
+ return SentenceTransformer(SETTINGS.embedding_model)
34
+ except Exception:
35
+ return None
36
+
37
+
38
+ class _EmbeddingCache:
39
+ """Thread-safe in-memory embedding cache backed by a JSON file.
40
+
41
+ Replaces the previous SQLite-based cache which caused threading issues
42
+ (database is locked / created in wrong thread) under MCP server concurrency.
43
+ """
44
+
45
+ def __init__(self, path: str) -> None:
46
+ self._path = path
47
+ self._lock = threading.Lock()
48
+ self._data: dict[str, str] | None = None # loaded lazily
49
+
50
+ def _ensure_loaded(self) -> None:
51
+ """Load cache from disk. Must be called with _lock held."""
52
+ if self._data is not None:
53
+ return
54
+ if os.path.isfile(self._path):
55
+ try:
56
+ with open(self._path, "r", encoding="utf-8") as f:
57
+ loaded = json.load(f)
58
+ if isinstance(loaded, dict):
59
+ self._data = loaded
60
+ return
61
+ except Exception:
62
+ pass
63
+ self._data = {}
64
+
65
+ def _flush(self) -> None:
66
+ """Persist cache to disk atomically. Must be called with _lock held."""
67
+ try:
68
+ dir_path = os.path.dirname(self._path)
69
+ if dir_path:
70
+ os.makedirs(dir_path, exist_ok=True)
71
+ tmp = self._path + ".tmp"
72
+ with open(tmp, "w", encoding="utf-8") as f:
73
+ json.dump(self._data, f, separators=(",", ":"))
74
+ os.replace(tmp, self._path)
75
+ except Exception:
76
+ pass
77
+
78
+ def get(self, key: str) -> list[float] | None:
79
+ with self._lock:
80
+ self._ensure_loaded()
81
+ raw = self._data.get(key) # type: ignore[union-attr]
82
+ if raw is None:
83
+ return None
84
+ try:
85
+ return [float(x) for x in json.loads(raw)]
86
+ except Exception:
87
+ return None
88
+
89
+ def set(self, key: str, vec: list[float]) -> None:
90
+ with self._lock:
91
+ self._ensure_loaded()
92
+ self._data[key] = json.dumps(vec) # type: ignore[index]
93
+ self._flush()
94
+
95
+
96
+ _CACHE = _EmbeddingCache(SETTINGS.embedding_cache_path)
97
+
98
+
99
+ def _cache_key(text: str, dim: int) -> str:
100
+ return hashlib.sha1(f"{SETTINGS.embedding_model}|{dim}|{text}".encode("utf-8")).hexdigest()
101
+
102
+
103
+ def embed_text(text: str, dim: int | None = None) -> list[float]:
104
+ dim = dim or SETTINGS.vector_dim
105
+ key = _cache_key(text or "", dim)
106
+
107
+ cached = _CACHE.get(key)
108
+ if cached is not None:
109
+ return cached
110
+
111
+ model = _load_model()
112
+ if model is None:
113
+ vec = _hash_vector(text, dim)
114
+ else:
115
+ vec = [float(x) for x in model.encode([text or ""], normalize_embeddings=True)[0]]
116
+
117
+ _CACHE.set(key, vec)
118
+ return vec
119
+
120
+
121
+ def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
122
+ if not vec_a or not vec_b:
123
+ return 0.0
124
+ n = min(len(vec_a), len(vec_b))
125
+ dot = sum(vec_a[i] * vec_b[i] for i in range(n))
126
+ na = math.sqrt(sum(vec_a[i] * vec_a[i] for i in range(n))) or 1.0
127
+ nb = math.sqrt(sum(vec_b[i] * vec_b[i] for i in range(n))) or 1.0
128
+ return dot / (na * nb)
129
+
130
+
131
+ def rank_semantic(query: str, docs: list[tuple[str, list[float] | None]]) -> list[tuple[str, float]]:
132
+ qv = embed_text(query)
133
+ ranked: list[tuple[str, float]] = []
134
+ for doc_id, emb in docs:
135
+ if emb is None:
136
+ continue
137
+ ranked.append((doc_id, cosine_similarity(qv, emb)))
138
+ ranked.sort(key=lambda x: x[1], reverse=True)
139
+ return ranked
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "codespine"
7
- version = "0.3.0"
7
+ version = "0.4.0"
8
8
  description = "Local Java code intelligence indexer backed by a graph database"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -1,122 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import hashlib
4
- import math
5
- import sqlite3
6
- from functools import lru_cache
7
-
8
- from codespine.config import SETTINGS
9
-
10
-
11
- def _hash_vector(text: str, dim: int) -> list[float]:
12
- """Deterministic fallback embedding when sentence-transformers is unavailable."""
13
- vec = [0.0] * dim
14
- if not text:
15
- return vec
16
- tokens = text.lower().split()
17
- for token in tokens:
18
- digest = hashlib.sha1(token.encode("utf-8")).digest()
19
- idx = int.from_bytes(digest[:2], "big") % dim
20
- sign = 1.0 if digest[2] % 2 == 0 else -1.0
21
- vec[idx] += sign
22
- norm = math.sqrt(sum(v * v for v in vec)) or 1.0
23
- return [v / norm for v in vec]
24
-
25
-
26
- @lru_cache(maxsize=1)
27
- def _load_model():
28
- try:
29
- from sentence_transformers import SentenceTransformer
30
-
31
- return SentenceTransformer(SETTINGS.embedding_model)
32
- except Exception:
33
- return None
34
-
35
-
36
- @lru_cache(maxsize=1)
37
- def _embedding_cache_conn():
38
- path = SETTINGS.embedding_cache_db
39
- try:
40
- os_dir = path.rsplit("/", 1)[0] if "/" in path else ""
41
- if os_dir:
42
- import os
43
-
44
- os.makedirs(os_dir, exist_ok=True)
45
- conn = sqlite3.connect(path, check_same_thread=False)
46
- except Exception:
47
- conn = sqlite3.connect("/tmp/.codespine_embedding_cache.sqlite3", check_same_thread=False)
48
- conn.execute(
49
- """
50
- CREATE TABLE IF NOT EXISTS embedding_cache (
51
- cache_key TEXT PRIMARY KEY,
52
- dim INTEGER NOT NULL,
53
- vector_json TEXT NOT NULL
54
- )
55
- """
56
- )
57
- return conn
58
-
59
-
60
- def _cache_key(text: str, dim: int) -> str:
61
- return hashlib.sha1(f"{SETTINGS.embedding_model}|{dim}|{text}".encode("utf-8")).hexdigest()
62
-
63
-
64
- def _get_cached_embedding(text: str, dim: int) -> list[float] | None:
65
- key = _cache_key(text, dim)
66
- conn = _embedding_cache_conn()
67
- row = conn.execute("SELECT vector_json FROM embedding_cache WHERE cache_key = ? AND dim = ?", (key, dim)).fetchone()
68
- if not row:
69
- return None
70
- import json
71
-
72
- return [float(x) for x in json.loads(row[0])]
73
-
74
-
75
- def _set_cached_embedding(text: str, dim: int, vec: list[float]) -> None:
76
- key = _cache_key(text, dim)
77
- conn = _embedding_cache_conn()
78
- import json
79
-
80
- conn.execute(
81
- "INSERT OR REPLACE INTO embedding_cache(cache_key, dim, vector_json) VALUES (?, ?, ?)",
82
- (key, dim, json.dumps(vec)),
83
- )
84
- conn.commit()
85
-
86
-
87
- def embed_text(text: str, dim: int | None = None) -> list[float]:
88
- dim = dim or SETTINGS.vector_dim
89
- cached = _get_cached_embedding(text or "", dim)
90
- if cached is not None:
91
- return cached
92
-
93
- model = _load_model()
94
- if model is None:
95
- vec = _hash_vector(text, dim)
96
- _set_cached_embedding(text or "", dim, vec)
97
- return vec
98
-
99
- vec = [float(x) for x in model.encode([text or ""], normalize_embeddings=True)[0]]
100
- _set_cached_embedding(text or "", dim, vec)
101
- return vec
102
-
103
-
104
- def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
105
- if not vec_a or not vec_b:
106
- return 0.0
107
- n = min(len(vec_a), len(vec_b))
108
- dot = sum(vec_a[i] * vec_b[i] for i in range(n))
109
- na = math.sqrt(sum(vec_a[i] * vec_a[i] for i in range(n))) or 1.0
110
- nb = math.sqrt(sum(vec_b[i] * vec_b[i] for i in range(n))) or 1.0
111
- return dot / (na * nb)
112
-
113
-
114
- def rank_semantic(query: str, docs: list[tuple[str, list[float] | None]]) -> list[tuple[str, float]]:
115
- qv = embed_text(query)
116
- ranked: list[tuple[str, float]] = []
117
- for doc_id, emb in docs:
118
- if emb is None:
119
- continue
120
- ranked.append((doc_id, cosine_similarity(qv, emb)))
121
- ranked.sort(key=lambda x: x[1], reverse=True)
122
- return ranked
File without changes
File without changes
File without changes
File without changes
File without changes