codespine 0.3.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {codespine-0.3.0 → codespine-0.4.1}/PKG-INFO +1 -1
  2. {codespine-0.3.0 → codespine-0.4.1}/codespine/__init__.py +1 -1
  3. {codespine-0.3.0 → codespine-0.4.1}/codespine/analysis/deadcode.py +48 -2
  4. {codespine-0.3.0 → codespine-0.4.1}/codespine/config.py +1 -1
  5. {codespine-0.3.0 → codespine-0.4.1}/codespine/db/schema.py +5 -2
  6. {codespine-0.3.0 → codespine-0.4.1}/codespine/db/store.py +21 -11
  7. {codespine-0.3.0 → codespine-0.4.1}/codespine/indexer/engine.py +4 -0
  8. {codespine-0.3.0 → codespine-0.4.1}/codespine/mcp/server.py +79 -13
  9. {codespine-0.3.0 → codespine-0.4.1}/codespine/search/bm25.py +17 -1
  10. {codespine-0.3.0 → codespine-0.4.1}/codespine/search/hybrid.py +31 -9
  11. codespine-0.4.1/codespine/search/vector.py +155 -0
  12. {codespine-0.3.0 → codespine-0.4.1}/codespine.egg-info/PKG-INFO +1 -1
  13. {codespine-0.3.0 → codespine-0.4.1}/pyproject.toml +1 -1
  14. codespine-0.3.0/codespine/search/vector.py +0 -122
  15. {codespine-0.3.0 → codespine-0.4.1}/LICENSE +0 -0
  16. {codespine-0.3.0 → codespine-0.4.1}/README.md +0 -0
  17. {codespine-0.3.0 → codespine-0.4.1}/codespine/analysis/__init__.py +0 -0
  18. {codespine-0.3.0 → codespine-0.4.1}/codespine/analysis/community.py +0 -0
  19. {codespine-0.3.0 → codespine-0.4.1}/codespine/analysis/context.py +0 -0
  20. {codespine-0.3.0 → codespine-0.4.1}/codespine/analysis/coupling.py +0 -0
  21. {codespine-0.3.0 → codespine-0.4.1}/codespine/analysis/flow.py +0 -0
  22. {codespine-0.3.0 → codespine-0.4.1}/codespine/analysis/impact.py +0 -0
  23. {codespine-0.3.0 → codespine-0.4.1}/codespine/cli.py +0 -0
  24. {codespine-0.3.0 → codespine-0.4.1}/codespine/db/__init__.py +0 -0
  25. {codespine-0.3.0 → codespine-0.4.1}/codespine/diff/__init__.py +0 -0
  26. {codespine-0.3.0 → codespine-0.4.1}/codespine/diff/branch_diff.py +0 -0
  27. {codespine-0.3.0 → codespine-0.4.1}/codespine/indexer/__init__.py +0 -0
  28. {codespine-0.3.0 → codespine-0.4.1}/codespine/indexer/call_resolver.py +0 -0
  29. {codespine-0.3.0 → codespine-0.4.1}/codespine/indexer/java_parser.py +0 -0
  30. {codespine-0.3.0 → codespine-0.4.1}/codespine/indexer/symbol_builder.py +0 -0
  31. {codespine-0.3.0 → codespine-0.4.1}/codespine/mcp/__init__.py +0 -0
  32. {codespine-0.3.0 → codespine-0.4.1}/codespine/noise/__init__.py +0 -0
  33. {codespine-0.3.0 → codespine-0.4.1}/codespine/noise/blocklist.py +0 -0
  34. {codespine-0.3.0 → codespine-0.4.1}/codespine/search/__init__.py +0 -0
  35. {codespine-0.3.0 → codespine-0.4.1}/codespine/search/fuzzy.py +0 -0
  36. {codespine-0.3.0 → codespine-0.4.1}/codespine/search/rrf.py +0 -0
  37. {codespine-0.3.0 → codespine-0.4.1}/codespine/watch/__init__.py +0 -0
  38. {codespine-0.3.0 → codespine-0.4.1}/codespine/watch/watcher.py +0 -0
  39. {codespine-0.3.0 → codespine-0.4.1}/codespine.egg-info/SOURCES.txt +0 -0
  40. {codespine-0.3.0 → codespine-0.4.1}/codespine.egg-info/dependency_links.txt +0 -0
  41. {codespine-0.3.0 → codespine-0.4.1}/codespine.egg-info/entry_points.txt +0 -0
  42. {codespine-0.3.0 → codespine-0.4.1}/codespine.egg-info/requires.txt +0 -0
  43. {codespine-0.3.0 → codespine-0.4.1}/codespine.egg-info/top_level.txt +0 -0
  44. {codespine-0.3.0 → codespine-0.4.1}/gindex.py +0 -0
  45. {codespine-0.3.0 → codespine-0.4.1}/setup.cfg +0 -0
  46. {codespine-0.3.0 → codespine-0.4.1}/tests/test_branch_diff_normalize.py +0 -0
  47. {codespine-0.3.0 → codespine-0.4.1}/tests/test_call_resolver.py +0 -0
  48. {codespine-0.3.0 → codespine-0.4.1}/tests/test_index_and_hybrid.py +0 -0
  49. {codespine-0.3.0 → codespine-0.4.1}/tests/test_java_parser.py +0 -0
  50. {codespine-0.3.0 → codespine-0.4.1}/tests/test_multimodule_index.py +0 -0
  51. {codespine-0.3.0 → codespine-0.4.1}/tests/test_search_ranking.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "0.3.0"
4
+ __version__ = "0.4.1"
@@ -1,17 +1,63 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  EXEMPT_ANNOTATIONS = {
4
+ # Java standard
4
5
  "Override",
6
+ # JUnit / testing
5
7
  "Test",
6
8
  "ParameterizedTest",
9
+ "BeforeEach",
10
+ "AfterEach",
11
+ "BeforeAll",
12
+ "AfterAll",
13
+ # Spring – component model (class-level; methods inside are never "dead")
14
+ "Component",
15
+ "Service",
16
+ "Repository",
17
+ "Controller",
18
+ "RestController",
19
+ "Configuration",
7
20
  "Bean",
21
+ "Aspect",
22
+ # Spring – lifecycle / event hooks
8
23
  "PostConstruct",
9
24
  "PreDestroy",
25
+ "EventListener",
26
+ "TransactionalEventListener",
10
27
  "Scheduled",
28
+ # Spring – web entry points
29
+ "RequestMapping",
30
+ "GetMapping",
31
+ "PostMapping",
32
+ "PutMapping",
33
+ "DeleteMapping",
34
+ "PatchMapping",
35
+ "MessageMapping",
36
+ # Spring – messaging / async
11
37
  "KafkaListener",
12
- "EventListener",
13
- "JsonCreator",
38
+ "RabbitListener",
39
+ "JmsListener",
40
+ "SqsListener",
41
+ "StreamListener",
42
+ # Spring Data / persistence
43
+ "Query",
44
+ "Modifying",
45
+ # Guice DI
14
46
  "Inject",
47
+ "Provides",
48
+ "Singleton",
49
+ "Named",
50
+ "Qualifier",
51
+ # Jakarta / javax DI (same semantics as Guice/Spring variants)
52
+ "ApplicationScoped",
53
+ "RequestScoped",
54
+ "SessionScoped",
55
+ "Dependent",
56
+ # Jackson / serialization (called reflectively)
57
+ "JsonCreator",
58
+ "JsonProperty",
59
+ "JsonDeserialize",
60
+ "JsonSerialize",
15
61
  }
16
62
 
17
63
  EXEMPT_CONTRACT_METHODS = {
@@ -7,7 +7,7 @@ class Settings:
7
7
  db_path: str = os.path.expanduser("~/.codespine_db")
8
8
  pid_file: str = os.path.expanduser("~/.codespine.pid")
9
9
  log_file: str = os.path.expanduser("~/.codespine.log")
10
- embedding_cache_db: str = os.path.expanduser("~/.codespine_embedding_cache.sqlite3")
10
+ embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")
11
11
  index_meta_dir: str = os.path.expanduser("~/.codespine_index_meta")
12
12
  embedding_model: str = "BAAI/bge-small-en-v1.5"
13
13
  vector_dim: int = 384
@@ -10,7 +10,7 @@ NODE_TABLES: list[tuple[str, str]] = [
10
10
  ("SchemaMeta", "CREATE NODE TABLE SchemaMeta(key STRING, value STRING, PRIMARY KEY (key))"),
11
11
  (
12
12
  "Project",
13
- "CREATE NODE TABLE Project(id STRING, path STRING, language STRING, PRIMARY KEY (id))",
13
+ "CREATE NODE TABLE Project(id STRING, path STRING, language STRING, indexed_at STRING, PRIMARY KEY (id))",
14
14
  ),
15
15
  (
16
16
  "File",
@@ -76,7 +76,10 @@ def ensure_schema(conn) -> None:
76
76
  _safe_execute(conn, "CALL CREATE_FTS_INDEX('method_fts', 'Method', ['name', 'signature'])")
77
77
  _safe_execute(conn, "CALL CREATE_FTS_INDEX('class_fts', 'Class', ['name', 'fqcn'])")
78
78
 
79
+ # Best-effort migration: add indexed_at column to existing Project tables.
80
+ _safe_execute(conn, "ALTER TABLE Project ADD indexed_at STRING DEFAULT ''")
81
+
79
82
  _safe_execute(
80
83
  conn,
81
- "MERGE (s:SchemaMeta {key: 'schema_version'}) SET s.value = '2'",
84
+ "MERGE (s:SchemaMeta {key: 'schema_version'}) SET s.value = '3'",
82
85
  )
@@ -5,6 +5,7 @@ import json
5
5
  import logging
6
6
  import os
7
7
  import threading
8
+ import time
8
9
  from contextlib import contextmanager
9
10
  from dataclasses import dataclass
10
11
  from typing import Any
@@ -69,7 +70,12 @@ class GraphStore:
69
70
  self.execute("COMMIT")
70
71
  except Exception:
71
72
  if tx_started:
72
- self.execute("ROLLBACK")
73
+ try:
74
+ self.execute("ROLLBACK")
75
+ except Exception:
76
+ # Kuzu may have already rolled back (e.g. on OOM), making a
77
+ # second ROLLBACK raise "No active transaction". Swallow it.
78
+ pass
73
79
  raise
74
80
 
75
81
  def clear_project(self, project_id: str) -> None:
@@ -108,8 +114,8 @@ class GraphStore:
108
114
 
109
115
  def upsert_project(self, project_id: str, path: str) -> None:
110
116
  self.execute(
111
- "MERGE (p:Project {id: $id}) SET p.path = $path, p.language = 'java'",
112
- {"id": project_id, "path": path},
117
+ "MERGE (p:Project {id: $id}) SET p.path = $path, p.language = 'java', p.indexed_at = $ts",
118
+ {"id": project_id, "path": path, "ts": str(int(time.time()))},
113
119
  )
114
120
 
115
121
  def project_file_hashes(self, project_id: str) -> dict[str, dict[str, str]]:
@@ -297,14 +303,18 @@ class GraphStore:
297
303
  "MERGE (c:Community {id: $id}) SET c.label = $label, c.cohesion = $cohesion",
298
304
  {"id": community_id, "label": label, "cohesion": cohesion},
299
305
  )
300
- # Batch all symbol→community edges in one transaction to prevent buffer pool exhaustion
301
- # on large projects (53 K+ symbols would OOM without a single commit boundary).
302
- with self.transaction():
303
- for sid in symbol_ids:
304
- self.execute(
305
- "MATCH (s:Symbol {id: $sid}), (c:Community {id: $cid}) MERGE (s)-[:IN_COMMUNITY]->(c)",
306
- {"sid": sid, "cid": community_id},
307
- )
306
+ # Commit in batches of 50 to keep Kuzu's buffer pool from OOMing on large
307
+ # communities. A single transaction over thousands of MERGE statements exhausts
308
+ # the 256 MB buffer pool before it can page out.
309
+ _BATCH = 50
310
+ for i in range(0, len(symbol_ids), _BATCH):
311
+ batch = symbol_ids[i : i + _BATCH]
312
+ with self.transaction():
313
+ for sid in batch:
314
+ self.execute(
315
+ "MATCH (s:Symbol {id: $sid}), (c:Community {id: $cid}) MERGE (s)-[:IN_COMMUNITY]->(c)",
316
+ {"sid": sid, "cid": community_id},
317
+ )
308
318
 
309
319
  def set_flow(self, flow_id: str, entry_symbol_id: str, kind: str, symbols_at_depth: list[tuple[str, int]]) -> None:
310
320
  self.execute(
@@ -167,6 +167,10 @@ class JavaIndexer:
167
167
  to_reindex = current_files
168
168
  deleted_file_ids = []
169
169
  meta_cache = {}
170
+ # Wipe the embedding cache on a full re-index so stale embeddings
171
+ # (including those from the old SQLite format) are not carried over.
172
+ from codespine.search.vector import _CACHE as _embed_cache
173
+ _embed_cache.clear()
170
174
  else:
171
175
  to_reindex, deleted_file_ids, meta_cache = self._plan_incremental(
172
176
  project_id,
@@ -71,7 +71,16 @@ def build_mcp_server(store, repo_path_provider):
71
71
  Call this before other tools so you know what's ready without trial-and-error.
72
72
  Features marked false may need 'codespine analyse --deep' or optional dependencies.
73
73
  """
74
- projects = store.query_records("MATCH (p:Project) RETURN p.id as id, p.path as path")
74
+ try:
75
+ projects = store.query_records(
76
+ "MATCH (p:Project) RETURN p.id as id, p.path as path, p.indexed_at as indexed_at"
77
+ )
78
+ except Exception:
79
+ # Old DB schema (pre-0.4.0) doesn't have indexed_at column yet.
80
+ # Falls back gracefully; column is added next time 'analyse' runs.
81
+ projects = store.query_records(
82
+ "MATCH (p:Project) RETURN p.id as id, p.path as path"
83
+ )
75
84
  sym_q = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
76
85
  comm_q = store.query_records("MATCH (c:Community) RETURN count(c) as count")
77
86
  flow_q = store.query_records("MATCH (f:Flow) RETURN count(f) as count")
@@ -97,7 +106,20 @@ def build_mcp_server(store, repo_path_provider):
97
106
  watch_running = _watch["proc"] is not None and _watch["proc"].poll() is None
98
107
  analyse_running = _analyse["proc"] is not None and _analyse["proc"].poll() is None
99
108
 
109
+ now = int(time.time())
110
+ stale_projects = []
111
+ for p in projects:
112
+ ts = int(p.get("indexed_at") or 0)
113
+ if ts and (now - ts) > 3600 and not watch_running:
114
+ age_h = (now - ts) // 3600
115
+ stale_projects.append(f"{p['id']} ({age_h}h old)")
116
+
100
117
  notes: dict[str, str] = {}
118
+ if stale_projects:
119
+ notes["stale_index"] = (
120
+ f"Index is stale for: {', '.join(stale_projects)}. "
121
+ "Run analyse_project() or start_watch() to refresh."
122
+ )
101
123
  if not n_comm:
102
124
  notes["community_detection"] = "Run 'codespine analyse --deep' to enable"
103
125
  if not n_flows:
@@ -156,9 +178,17 @@ def build_mcp_server(store, repo_path_provider):
156
178
  @mcp.tool()
157
179
  def list_projects():
158
180
  """List all indexed projects with their symbol and file counts."""
159
- projects = store.query_records("MATCH (p:Project) RETURN p.id as id, p.path as path")
181
+ try:
182
+ projects = store.query_records(
183
+ "MATCH (p:Project) RETURN p.id as id, p.path as path, p.indexed_at as indexed_at"
184
+ )
185
+ except Exception:
186
+ projects = store.query_records(
187
+ "MATCH (p:Project) RETURN p.id as id, p.path as path"
188
+ )
160
189
  if not projects:
161
190
  return {"available": False, "note": "No projects indexed yet. Run 'codespine analyse <path>'."}
191
+ now = int(time.time())
162
192
  result = []
163
193
  for p in projects:
164
194
  sym = store.query_records(
@@ -173,14 +203,22 @@ def build_mcp_server(store, repo_path_provider):
173
203
  "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as count",
174
204
  {"pid": p["id"]},
175
205
  )
176
- result.append(
177
- {
178
- "project_id": p["id"],
179
- "path": p["path"],
180
- "symbol_count": sym[0]["count"] if sym else 0,
181
- "file_count": files[0]["count"] if files else 0,
182
- }
183
- )
206
+ indexed_at_ts = int(p.get("indexed_at") or 0)
207
+ age_s = now - indexed_at_ts if indexed_at_ts else None
208
+ entry: dict = {
209
+ "project_id": p["id"],
210
+ "path": p["path"],
211
+ "symbol_count": sym[0]["count"] if sym else 0,
212
+ "file_count": files[0]["count"] if files else 0,
213
+ "indexed_at_epoch": indexed_at_ts or None,
214
+ "index_age_seconds": age_s,
215
+ }
216
+ if age_s is not None and age_s > 3600:
217
+ entry["stale_warning"] = (
218
+ f"Index is {age_s // 3600}h {(age_s % 3600) // 60}m old. "
219
+ "Run analyse_project() or start_watch() to refresh."
220
+ )
221
+ result.append(entry)
184
222
  return {"available": True, "projects": result}
185
223
 
186
224
  # ------------------------------------------------------------------
@@ -371,7 +409,10 @@ def build_mcp_server(store, repo_path_provider):
371
409
  """
372
410
  name_lower = name.lower()
373
411
  project_clause = "AND f.project_id = $proj" if project else ""
374
- params: dict = {"name": name, "namel": name_lower, "lim": limit}
412
+ # Note: only $namel and $lim are referenced in the queries below.
413
+ # Do NOT add extra keys here — some Kuzu versions raise "Parameter not found"
414
+ # when the params dict contains keys absent from the query string.
415
+ params: dict = {"namel": name_lower, "lim": limit}
375
416
  if project:
376
417
  params["proj"] = project
377
418
 
@@ -591,15 +632,40 @@ def build_mcp_server(store, repo_path_provider):
591
632
  if not os.path.isdir(abs_path):
592
633
  return {"available": False, "note": f"Path does not exist or is not a directory: {abs_path}"}
593
634
 
635
+ import tempfile as _tempfile
636
+ watch_err_file = _tempfile.NamedTemporaryFile(
637
+ mode="w", suffix=".log", prefix="codespine_watch_", delete=False
638
+ )
639
+ watch_err_path = watch_err_file.name
640
+ watch_err_file.close()
641
+
594
642
  proc = subprocess.Popen(
595
643
  [
596
644
  sys.executable, "-m", "codespine.cli",
597
645
  "watch", "--path", abs_path,
598
646
  "--global-interval", str(global_interval),
599
647
  ],
600
- stdout=subprocess.DEVNULL,
601
- stderr=subprocess.DEVNULL,
648
+ stdout=open(watch_err_path, "w", encoding="utf-8"),
649
+ stderr=subprocess.STDOUT,
602
650
  )
651
+
652
+ # Brief health check — if the process dies within 1 s it crashed at startup.
653
+ time.sleep(1)
654
+ if proc.poll() is not None:
655
+ try:
656
+ with open(watch_err_path, "r", encoding="utf-8", errors="replace") as fh:
657
+ err_tail = fh.read().strip().splitlines()[-10:]
658
+ except Exception:
659
+ err_tail = []
660
+ return {
661
+ "available": False,
662
+ "note": (
663
+ f"Watch mode process exited immediately (code {proc.returncode}). "
664
+ "Check that the path is valid and watchfiles is installed."
665
+ ),
666
+ "error_tail": err_tail,
667
+ }
668
+
603
669
  _watch["proc"] = proc
604
670
  _watch["path"] = abs_path
605
671
  _watch["started_at"] = time.time()
@@ -5,10 +5,26 @@ import re
5
5
  from collections import Counter
6
6
 
7
7
  TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
8
+ _CAMEL_SPLIT_RE = re.compile(r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=\D)(?=\d)|(?<=\d)(?=\D)")
8
9
 
9
10
 
10
11
  def tokenize(text: str) -> list[str]:
11
- return [t.lower() for t in TOKEN_RE.findall(text or "")]
12
+ """Tokenize text, splitting on camelCase and underscores in addition to whitespace.
13
+
14
+ 'SolicitPanFetchActionCompletionEvent' → ['solicit', 'pan', 'fetch', 'action', 'completion', 'event']
15
+ 'get_symbol_context' → ['get', 'symbol', 'context']
16
+ """
17
+ raw_tokens = TOKEN_RE.findall(text or "")
18
+ out: list[str] = []
19
+ for tok in raw_tokens:
20
+ # Split underscore-joined segments first, then camelCase within each
21
+ for segment in tok.split("_"):
22
+ if not segment:
23
+ continue
24
+ for part in _CAMEL_SPLIT_RE.split(segment):
25
+ if part:
26
+ out.append(part.lower())
27
+ return out
12
28
 
13
29
 
14
30
  def rank_bm25(query: str, docs: list[tuple[str, str]], k1: float = 1.2, b: float = 0.75) -> list[tuple[str, float]]:
@@ -1,18 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
- from codespine.config import SETTINGS
4
3
  from codespine.search.bm25 import rank_bm25
5
4
  from codespine.search.fuzzy import rank_fuzzy
6
5
  from codespine.search.rrf import reciprocal_rank_fusion
7
6
  from codespine.search.vector import rank_semantic
8
7
 
8
+ _LOW_CONFIDENCE_THRESHOLD = 0.05
9
+
9
10
 
10
11
  def hybrid_search(store, query: str, k: int = 20, project: str | None = None) -> list[dict]:
11
12
  project_clause = "AND f.project_id = $proj" if project else ""
12
- params: dict = {"lim": SETTINGS.semantic_candidate_pool}
13
+ params: dict = {}
13
14
  if project:
14
15
  params["proj"] = project
15
16
 
17
+ # No LIMIT — load all symbols for the scoped project so that exact class names
18
+ # are never missing from the candidate pool (previously capped at 2000 which
19
+ # caused exact matches on 4000+ file projects to be silently dropped).
16
20
  recs = store.query_records(
17
21
  f"""
18
22
  MATCH (s:Symbol), (f:File)
@@ -24,7 +28,6 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
24
28
  s.embedding as embedding,
25
29
  f.path as file_path,
26
30
  f.is_test as is_test
27
- LIMIT $lim
28
31
  """,
29
32
  params,
30
33
  )
@@ -32,6 +35,8 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
32
35
  if not recs:
33
36
  return []
34
37
 
38
+ query_lower = query.lower().strip()
39
+
35
40
  lexical_docs = [(r["id"], f"{r.get('name', '')} {r.get('fqname', '')}") for r in recs]
36
41
  fuzzy_docs = [(r["id"], r.get("name", "")) for r in recs]
37
42
  vector_docs = [(r["id"], r.get("embedding")) for r in recs]
@@ -40,11 +45,11 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
40
45
  fuzzy_rank = rank_fuzzy(query, fuzzy_docs)
41
46
  semantic_rank = rank_semantic(query, vector_docs)
42
47
 
43
- fused = reciprocal_rank_fusion([bm25_rank, semantic_rank, fuzzy_rank], k=SETTINGS.rrf_k)
48
+ fused = reciprocal_rank_fusion([bm25_rank, semantic_rank, fuzzy_rank])
44
49
  rec_by_id = {r["id"]: r for r in recs}
45
50
 
46
51
  results = []
47
- for doc_id, score in fused[: max(k * 3, k)]:
52
+ for doc_id, score in fused:
48
53
  rec = rec_by_id.get(doc_id)
49
54
  if not rec:
50
55
  continue
@@ -55,7 +60,12 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
55
60
  if rec.get("kind") in {"method", "class"}:
56
61
  multiplier *= 1.2
57
62
 
58
- final_score = score * multiplier
63
+ # Exact name match: guarantee this symbol ranks first regardless of RRF score.
64
+ name_lower = (rec.get("name") or "").lower()
65
+ fqname_lower = (rec.get("fqname") or "").lower()
66
+ if name_lower == query_lower or fqname_lower == query_lower:
67
+ multiplier *= 5.0
68
+
59
69
  results.append(
60
70
  {
61
71
  "id": doc_id,
@@ -63,14 +73,15 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
63
73
  "name": rec.get("name"),
64
74
  "fqname": rec.get("fqname"),
65
75
  "file_path": rec.get("file_path"),
66
- "score": final_score,
76
+ "score": score * multiplier,
67
77
  }
68
78
  )
69
79
 
70
80
  results.sort(key=lambda x: x["score"], reverse=True)
81
+ top_k = results[:k]
71
82
 
72
83
  # Attach architectural context in same response.
73
- for item in results[:k]:
84
+ for item in top_k:
74
85
  ctx = store.query_records(
75
86
  """
76
87
  MATCH (s:Symbol {id: $sid})-[:IN_COMMUNITY]->(c:Community)
@@ -83,4 +94,15 @@ def hybrid_search(store, query: str, k: int = 20, project: str | None = None) ->
83
94
  )
84
95
  item["context"] = ctx
85
96
 
86
- return results[:k]
97
+ # Warn when all scores are near zero — the results are likely noise.
98
+ if top_k and top_k[0]["score"] < _LOW_CONFIDENCE_THRESHOLD:
99
+ for item in top_k:
100
+ item["low_confidence"] = True
101
+ top_k.append({
102
+ "note": (
103
+ "Low confidence results — all scores below threshold. "
104
+ "If searching for an exact class or method name, use find_symbol instead."
105
+ )
106
+ })
107
+
108
+ return top_k
@@ -0,0 +1,155 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import math
6
+ import os
7
+ import threading
8
+ from functools import lru_cache
9
+
10
+ from codespine.config import SETTINGS
11
+
12
+
13
def _hash_vector(text: str, dim: int) -> list[float]:
    """Build a deterministic pseudo-embedding of ``text`` by feature hashing.

    This is the fallback used when no sentence-transformers model is
    available.  The text is lower-cased and split on whitespace; each token
    is hashed with SHA-1, the first two digest bytes pick a bucket (modulo
    ``dim``) and the parity of the third byte picks the sign added to that
    bucket.  The bucket vector is then scaled to unit Euclidean length.

    Returns a list of ``dim`` floats, all zero when ``text`` is empty or
    contains no tokens.
    """
    buckets = [0.0] * dim
    if text:
        for word in text.lower().split():
            sha = hashlib.sha1(word.encode("utf-8")).digest()
            slot = int.from_bytes(sha[:2], "big") % dim
            # Even third byte -> +1, odd third byte -> -1.
            buckets[slot] += -1.0 if sha[2] % 2 else 1.0
        # A zero length (no tokens, or signs cancelled out) falls back to 1.0
        # so the division below never divides by zero.
        length = math.sqrt(sum(b * b for b in buckets)) or 1.0
        buckets = [b / length for b in buckets]
    return buckets
26
+
27
+
28
@lru_cache(maxsize=1)
def _load_model():
    """Load the sentence-transformers model named by ``SETTINGS.embedding_model``.

    Returns the model instance, or ``None`` when the optional dependency is
    missing or the model cannot be constructed.  The outcome, including a
    ``None`` failure result, is memoised by ``lru_cache``, so loading is
    attempted at most once per process.

    Failures are logged rather than silently discarded so that the fallback
    to hash-based vectors can be diagnosed.
    """
    import logging

    log = logging.getLogger(__name__)
    try:
        from sentence_transformers import SentenceTransformer

        return SentenceTransformer(SETTINGS.embedding_model)
    except ImportError:
        # Optional dependency absent: an expected configuration, so keep quiet
        # unless debug logging is enabled.
        log.debug(
            "sentence-transformers is unavailable; using hash-based vectors",
            exc_info=True,
        )
        return None
    except Exception:
        # Best-effort by design: any other load failure must not break search.
        log.warning(
            "Embedding model could not be loaded; using hash-based vectors",
            exc_info=True,
        )
        return None
36
+
37
+
38
class _EmbeddingCache:
    """Thread-safe in-memory embedding cache backed by a JSON file.

    Replaces the previous SQLite-based cache which caused threading issues
    (database is locked / created in wrong thread) under MCP server concurrency.

    Vectors are held as JSON-encoded strings keyed by an opaque cache key.
    The mapping is loaded lazily on first access and the whole file is
    rewritten on every ``set``.  All disk I/O is best-effort: a failure to
    read or write the file never propagates to the caller.
    """

    def __init__(self, path: str) -> None:
        self._path = path
        self._lock = threading.Lock()
        self._data: dict[str, str] | None = None  # loaded lazily

    def _legacy_sqlite_path(self) -> str | None:
        """Return the pre-0.4.0 SQLite cache path, or None if it cannot be derived.

        Only the file extension is rewritten.  A cache path that does not end
        in ``.json`` has no legacy sibling; returning None there keeps the
        cleanup in ``_ensure_loaded`` from ever targeting the live cache file.
        """
        stem, ext = os.path.splitext(self._path)
        if ext != ".json":
            return None
        return stem + ".sqlite3"

    def _ensure_loaded(self) -> None:
        """Load cache from disk. Must be called with _lock held."""
        if self._data is not None:
            return
        # Delete the old SQLite cache file left by versions < 0.4.0.
        legacy = self._legacy_sqlite_path()
        if legacy is not None and os.path.isfile(legacy):
            try:
                os.remove(legacy)
            except OSError:
                pass
        self._data = self._read_file()

    def _read_file(self) -> dict[str, str]:
        """Return the mapping stored on disk, or an empty dict if unusable."""
        try:
            with open(self._path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception:
            # Missing, unreadable or corrupt cache: start empty.  The file
            # content is arbitrary on-disk data, so every failure is a miss.
            return {}
        return loaded if isinstance(loaded, dict) else {}

    def clear(self) -> None:
        """Wipe the in-memory cache and delete the backing file."""
        with self._lock:
            self._data = {}
            try:
                os.remove(self._path)
            except OSError:
                pass

    def _flush(self) -> None:
        """Persist cache to disk atomically. Must be called with _lock held."""
        tmp = self._path + ".tmp"
        try:
            dir_path = os.path.dirname(self._path)
            if dir_path:
                os.makedirs(dir_path, exist_ok=True)
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump(self._data, f, separators=(",", ":"))
            # Write-then-rename so readers never observe a half-written file.
            os.replace(tmp, self._path)
        except Exception:
            # Persistence is best-effort; the in-memory copy stays valid.
            # Do not leave a partially written temp file behind.
            try:
                os.remove(tmp)
            except OSError:
                pass

    def get(self, key: str) -> list[float] | None:
        """Return the cached vector for ``key``, or None on a miss or bad entry."""
        with self._lock:
            self._ensure_loaded()
            raw = self._data.get(key)  # type: ignore[union-attr]
        if raw is None:
            return None
        try:
            return [float(x) for x in json.loads(raw)]
        except Exception:
            # An undecodable entry is treated as a cache miss.
            return None

    def set(self, key: str, vec: list[float]) -> None:
        """Store ``vec`` under ``key`` and persist the cache to disk."""
        with self._lock:
            self._ensure_loaded()
            self._data[key] = json.dumps(vec)  # type: ignore[index]
            # NOTE(review): rewrites the entire file per entry; cost grows
            # with cache size. Left unchanged to keep persistence semantics.
            self._flush()
110
+
111
+
112
+ _CACHE = _EmbeddingCache(SETTINGS.embedding_cache_path)
113
+
114
+
115
def _cache_key(text: str, dim: int) -> str:
    """Return the SHA-1 hex digest used as the cache key for ``text``.

    The digest covers the configured embedding model name, the dimension and
    the text itself, so a different model or dimension yields a different key.
    """
    material = f"{SETTINGS.embedding_model}|{dim}|{text}"
    return hashlib.sha1(material.encode("utf-8")).hexdigest()
117
+
118
+
119
def embed_text(text: str, dim: int | None = None) -> list[float]:
    """Return an embedding for ``text``, consulting the cache first.

    ``dim`` defaults to ``SETTINGS.vector_dim`` when omitted or falsy; it is
    part of the cache key and is the length of the hash-based fallback vector.
    On a cache miss the vector comes from the sentence-transformers model when
    one loads, otherwise from ``_hash_vector``; either way it is written back
    to the cache before being returned.
    """
    source = text or ""
    width = dim or SETTINGS.vector_dim
    cache_id = _cache_key(source, width)

    hit = _CACHE.get(cache_id)
    if hit is not None:
        return hit

    encoder = _load_model()
    if encoder is not None:
        embedding = [float(x) for x in encoder.encode([source], normalize_embeddings=True)[0]]
    else:
        # No model available: fall back to the deterministic hash embedding.
        embedding = _hash_vector(text, width)

    _CACHE.set(cache_id, embedding)
    return embedding
135
+
136
+
137
def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
    """Return the cosine similarity of two vectors.

    Vectors of unequal length are compared over their common prefix only.
    Returns 0.0 when either vector is empty; a zero-length norm is replaced
    by 1.0 so the division is always defined.
    """
    if not (vec_a and vec_b):
        return 0.0
    # zip stops at the shorter vector, i.e. the common prefix.
    pairs = list(zip(vec_a, vec_b))
    dot = sum(a * b for a, b in pairs)
    norm_a = math.sqrt(sum(a * a for a, _ in pairs)) or 1.0
    norm_b = math.sqrt(sum(b * b for _, b in pairs)) or 1.0
    return dot / (norm_a * norm_b)
145
+
146
+
147
def rank_semantic(query: str, docs: list[tuple[str, list[float] | None]]) -> list[tuple[str, float]]:
    """Rank documents by cosine similarity between their embedding and the query's.

    ``docs`` is a list of ``(doc_id, embedding)`` pairs; pairs whose embedding
    is ``None`` are left out of the result.  Returns ``(doc_id, score)``
    tuples ordered from highest to lowest score.
    """
    query_vec = embed_text(query)
    scored = [
        (doc_id, cosine_similarity(query_vec, emb))
        for doc_id, emb in docs
        if emb is not None
    ]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "codespine"
7
- version = "0.3.0"
7
+ version = "0.4.1"
8
8
  description = "Local Java code intelligence indexer backed by a graph database"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -1,122 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import hashlib
4
- import math
5
- import sqlite3
6
- from functools import lru_cache
7
-
8
- from codespine.config import SETTINGS
9
-
10
-
11
- def _hash_vector(text: str, dim: int) -> list[float]:
12
- """Deterministic fallback embedding when sentence-transformers is unavailable."""
13
- vec = [0.0] * dim
14
- if not text:
15
- return vec
16
- tokens = text.lower().split()
17
- for token in tokens:
18
- digest = hashlib.sha1(token.encode("utf-8")).digest()
19
- idx = int.from_bytes(digest[:2], "big") % dim
20
- sign = 1.0 if digest[2] % 2 == 0 else -1.0
21
- vec[idx] += sign
22
- norm = math.sqrt(sum(v * v for v in vec)) or 1.0
23
- return [v / norm for v in vec]
24
-
25
-
26
- @lru_cache(maxsize=1)
27
- def _load_model():
28
- try:
29
- from sentence_transformers import SentenceTransformer
30
-
31
- return SentenceTransformer(SETTINGS.embedding_model)
32
- except Exception:
33
- return None
34
-
35
-
36
- @lru_cache(maxsize=1)
37
- def _embedding_cache_conn():
38
- path = SETTINGS.embedding_cache_db
39
- try:
40
- os_dir = path.rsplit("/", 1)[0] if "/" in path else ""
41
- if os_dir:
42
- import os
43
-
44
- os.makedirs(os_dir, exist_ok=True)
45
- conn = sqlite3.connect(path, check_same_thread=False)
46
- except Exception:
47
- conn = sqlite3.connect("/tmp/.codespine_embedding_cache.sqlite3", check_same_thread=False)
48
- conn.execute(
49
- """
50
- CREATE TABLE IF NOT EXISTS embedding_cache (
51
- cache_key TEXT PRIMARY KEY,
52
- dim INTEGER NOT NULL,
53
- vector_json TEXT NOT NULL
54
- )
55
- """
56
- )
57
- return conn
58
-
59
-
60
- def _cache_key(text: str, dim: int) -> str:
61
- return hashlib.sha1(f"{SETTINGS.embedding_model}|{dim}|{text}".encode("utf-8")).hexdigest()
62
-
63
-
64
- def _get_cached_embedding(text: str, dim: int) -> list[float] | None:
65
- key = _cache_key(text, dim)
66
- conn = _embedding_cache_conn()
67
- row = conn.execute("SELECT vector_json FROM embedding_cache WHERE cache_key = ? AND dim = ?", (key, dim)).fetchone()
68
- if not row:
69
- return None
70
- import json
71
-
72
- return [float(x) for x in json.loads(row[0])]
73
-
74
-
75
- def _set_cached_embedding(text: str, dim: int, vec: list[float]) -> None:
76
- key = _cache_key(text, dim)
77
- conn = _embedding_cache_conn()
78
- import json
79
-
80
- conn.execute(
81
- "INSERT OR REPLACE INTO embedding_cache(cache_key, dim, vector_json) VALUES (?, ?, ?)",
82
- (key, dim, json.dumps(vec)),
83
- )
84
- conn.commit()
85
-
86
-
87
- def embed_text(text: str, dim: int | None = None) -> list[float]:
88
- dim = dim or SETTINGS.vector_dim
89
- cached = _get_cached_embedding(text or "", dim)
90
- if cached is not None:
91
- return cached
92
-
93
- model = _load_model()
94
- if model is None:
95
- vec = _hash_vector(text, dim)
96
- _set_cached_embedding(text or "", dim, vec)
97
- return vec
98
-
99
- vec = [float(x) for x in model.encode([text or ""], normalize_embeddings=True)[0]]
100
- _set_cached_embedding(text or "", dim, vec)
101
- return vec
102
-
103
-
104
- def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
105
- if not vec_a or not vec_b:
106
- return 0.0
107
- n = min(len(vec_a), len(vec_b))
108
- dot = sum(vec_a[i] * vec_b[i] for i in range(n))
109
- na = math.sqrt(sum(vec_a[i] * vec_a[i] for i in range(n))) or 1.0
110
- nb = math.sqrt(sum(vec_b[i] * vec_b[i] for i in range(n))) or 1.0
111
- return dot / (na * nb)
112
-
113
-
114
- def rank_semantic(query: str, docs: list[tuple[str, list[float] | None]]) -> list[tuple[str, float]]:
115
- qv = embed_text(query)
116
- ranked: list[tuple[str, float]] = []
117
- for doc_id, emb in docs:
118
- if emb is None:
119
- continue
120
- ranked.append((doc_id, cosine_similarity(qv, emb)))
121
- ranked.sort(key=lambda x: x[1], reverse=True)
122
- return ranked
File without changes
File without changes
File without changes
File without changes
File without changes