code_context_mcp-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. code_context/__init__.py +3 -0
  2. code_context/_background.py +93 -0
  3. code_context/_composition.py +425 -0
  4. code_context/_watcher.py +89 -0
  5. code_context/adapters/__init__.py +0 -0
  6. code_context/adapters/driven/__init__.py +0 -0
  7. code_context/adapters/driven/chunker_dispatcher.py +43 -0
  8. code_context/adapters/driven/chunker_line.py +54 -0
  9. code_context/adapters/driven/chunker_treesitter.py +215 -0
  10. code_context/adapters/driven/chunker_treesitter_queries.py +111 -0
  11. code_context/adapters/driven/code_source_fs.py +122 -0
  12. code_context/adapters/driven/embeddings_local.py +111 -0
  13. code_context/adapters/driven/embeddings_openai.py +58 -0
  14. code_context/adapters/driven/git_source_cli.py +211 -0
  15. code_context/adapters/driven/introspector_fs.py +224 -0
  16. code_context/adapters/driven/keyword_index_sqlite.py +206 -0
  17. code_context/adapters/driven/reranker_crossencoder.py +61 -0
  18. code_context/adapters/driven/symbol_index_sqlite.py +264 -0
  19. code_context/adapters/driven/vector_store_numpy.py +119 -0
  20. code_context/adapters/driving/__init__.py +0 -0
  21. code_context/adapters/driving/mcp_server.py +365 -0
  22. code_context/cli.py +161 -0
  23. code_context/config.py +114 -0
  24. code_context/domain/__init__.py +0 -0
  25. code_context/domain/index_bus.py +52 -0
  26. code_context/domain/models.py +140 -0
  27. code_context/domain/ports.py +205 -0
  28. code_context/domain/use_cases/__init__.py +0 -0
  29. code_context/domain/use_cases/explain_diff.py +98 -0
  30. code_context/domain/use_cases/find_definition.py +30 -0
  31. code_context/domain/use_cases/find_references.py +22 -0
  32. code_context/domain/use_cases/get_file_tree.py +36 -0
  33. code_context/domain/use_cases/get_summary.py +24 -0
  34. code_context/domain/use_cases/indexer.py +336 -0
  35. code_context/domain/use_cases/recent_changes.py +36 -0
  36. code_context/domain/use_cases/search_repo.py +131 -0
  37. code_context/server.py +151 -0
  38. code_context_mcp-1.0.0.dist-info/METADATA +181 -0
  39. code_context_mcp-1.0.0.dist-info/RECORD +43 -0
  40. code_context_mcp-1.0.0.dist-info/WHEEL +5 -0
  41. code_context_mcp-1.0.0.dist-info/entry_points.txt +3 -0
  42. code_context_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
  43. code_context_mcp-1.0.0.dist-info/top_level.txt +1 -0
code_context/adapters/driven/chunker_line.py
@@ -0,0 +1,54 @@
+ """LineChunker — splits text into N-line windows with overlap."""
+
+ from __future__ import annotations
+
+ import hashlib
+ from dataclasses import dataclass
+
+ from code_context.domain.models import Chunk
+
+ _MIN_LINES = 5
+
+
+ @dataclass
+ class LineChunker:
+     """Splits content into `chunk_lines`-line chunks with `overlap` between consecutive chunks."""
+
+     chunk_lines: int = 50
+     overlap: int = 10
+
+     @property
+     def version(self) -> str:
+         return f"line-{self.chunk_lines}-{self.overlap}-v1"
+
+     def chunk(self, content: str, path: str) -> list[Chunk]:
+         if not content:
+             return []
+         lines = content.splitlines()
+         if len(lines) < _MIN_LINES:
+             return []
+
+         step = self.chunk_lines - self.overlap
+         if step <= 0:
+             raise ValueError(
+                 f"overlap ({self.overlap}) must be less than chunk_lines ({self.chunk_lines})"
+             )
+
+         chunks: list[Chunk] = []
+         i = 0
+         while i < len(lines):
+             j = min(i + self.chunk_lines, len(lines))
+             snippet = "\n".join(lines[i:j])
+             chunks.append(
+                 Chunk(
+                     path=path,
+                     line_start=i + 1,
+                     line_end=j,
+                     content_hash=hashlib.sha256(snippet.encode("utf-8")).hexdigest(),
+                     snippet=snippet,
+                 )
+             )
+             if j >= len(lines):
+                 break
+             i += step
+         return chunks
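
To make the overlap arithmetic concrete: with the defaults chunk_lines=50 and overlap=10, the window advances by step = 40 lines per chunk. A quick sketch against the adapter above (the 120-line input is synthetic, not from the package's tests):

    from code_context.adapters.driven.chunker_line import LineChunker

    content = "\n".join(f"line {n}" for n in range(1, 121))  # 120 synthetic lines
    chunker = LineChunker()  # chunk_lines=50, overlap=10 -> step of 40
    spans = [(c.line_start, c.line_end) for c in chunker.chunk(content, "demo.py")]
    assert spans == [(1, 50), (41, 90), (81, 120)]
    # Inputs shorter than _MIN_LINES (5) return [] rather than a tiny chunk.
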
code_context/adapters/driven/chunker_treesitter.py
@@ -0,0 +1,215 @@
+ """TreeSitterChunker — AST-aware chunking via tree-sitter.
+
+ Lazy-loads parsers per language. Returns whole-function / whole-class
+ chunks. On unsupported language or parse failure, returns []. Caller
+ (usually ChunkerDispatcher) is responsible for routing unsupported
+ files to LineChunker.
+ """
+
+ from __future__ import annotations
+
+ import hashlib
+ import logging
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any
+
+ from code_context.adapters.driven.chunker_treesitter_queries import QUERIES_BY_LANG
+ from code_context.domain.models import Chunk, SymbolDef
+
+ log = logging.getLogger(__name__)
+
+ _EXT_TO_LANG: dict[str, str] = {
+     ".py": "python",
+     ".js": "javascript",
+     ".jsx": "javascript",
+     ".ts": "typescript",
+     ".tsx": "typescript",
+     ".go": "go",
+     ".rs": "rust",
+     ".cs": "csharp",
+ }
+
+
+ def _load_language(lang: str) -> tuple[Any, Any]:  # pragma: no cover - exercised in tests
+     """Lazy import + load. Patched in unit tests where needed."""
+     from tree_sitter_language_pack import get_language, get_parser
+
+     return get_language(lang), get_parser(lang)
+
+
+ def _make_query_cursor(language: Any, source: str) -> Any:  # pragma: no cover
+     """Lazy import of tree-sitter's Query + QueryCursor."""
+     from tree_sitter import Query, QueryCursor
+
+     return QueryCursor(Query(language, source))
+
+
+ @dataclass
+ class TreeSitterChunker:
+     """Splits source code into chunks aligned to AST node boundaries."""
+
+     @property
+     def version(self) -> str:
+         # Bump the trailing -vN when query semantics change — invalidates the index cache.
+         return "treesitter-v2"
+
+     def chunk(self, content: str, path: str) -> list[Chunk]:
+         if not content:
+             return []
+         lang = _detect_language(path)
+         if lang is None or lang not in QUERIES_BY_LANG:
+             return []
+         try:
+             return _chunk_via_treesitter(content, path, lang)
+         except Exception as exc:  # parse errors are rare; LineChunker fallback handles them
+             log.warning("treesitter parse failed for %s (%s); returning []", path, exc)
+             return []
+
+     def extract_definitions(self, content: str, path: str) -> list[SymbolDef]:
+         """Walk the AST and emit a SymbolDef per @chunk node paired with its @name."""
+         if not content:
+             return []
+         lang = _detect_language(path)
+         if lang is None or lang not in QUERIES_BY_LANG:
+             return []
+         try:
+             return _extract_via_treesitter(content, path, lang)
+         except Exception as exc:
+             log.warning("treesitter extract_definitions failed for %s (%s)", path, exc)
+             return []
+
+
+ def _detect_language(path: str) -> str | None:
+     suffix = Path(path).suffix.lower()
+     return _EXT_TO_LANG.get(suffix)
+
+
+ def _chunk_via_treesitter(content: str, path: str, lang: str) -> list[Chunk]:
+     language, parser = _load_language(lang)
+     tree = parser.parse(content.encode("utf-8"))
+     cursor = _make_query_cursor(language, QUERIES_BY_LANG[lang])
+     captures = cursor.captures(tree.root_node)
+     # QueryCursor.captures returns dict[capture_name, list[Node]] in tree-sitter ≥0.24.
+     # Older fallback: list of (Node, capture_name) tuples.
+     chunk_nodes = _flatten_chunk_nodes(captures)
+     # Sort by start line for stable, document-order output.
+     chunk_nodes.sort(key=lambda n: (n.start_point[0], n.start_point[1]))
+     source_lines = content.splitlines()
+     chunks: list[Chunk] = []
+     for node in chunk_nodes:
+         start_line = node.start_point[0] + 1
+         end_line = node.end_point[0] + 1
+         # Use the source-line slice (not node.text) so leading indentation is
+         # preserved — matters for indented methods whose tree-sitter node.text
+         # starts at the column where the keyword sits.
+         snippet = "\n".join(source_lines[start_line - 1 : end_line])
+         chunks.append(
+             Chunk(
+                 path=path,
+                 line_start=start_line,
+                 line_end=end_line,
+                 content_hash=hashlib.sha256(snippet.encode("utf-8")).hexdigest(),
+                 snippet=snippet,
+             )
+         )
+     return chunks
+
+
+ def _extract_via_treesitter(content: str, path: str, lang: str) -> list[SymbolDef]:
+     """Pair @chunk nodes with their @name children to produce SymbolDef list."""
+     language, parser = _load_language(lang)
+     tree = parser.parse(content.encode("utf-8"))
+     cursor = _make_query_cursor(language, QUERIES_BY_LANG[lang])
+     captures = cursor.captures(tree.root_node)
+     # captures is dict[capture_name, list[Node]] in tree-sitter ≥0.24.
+
+     chunk_nodes = list(captures.get("chunk", [])) if isinstance(captures, dict) else []
+     name_nodes = list(captures.get("name", [])) if isinstance(captures, dict) else []
+     if not chunk_nodes or not name_nodes:
+         return []
+
+     # Pair @name with the closest enclosing @chunk by walking up parents.
+     # Use node.id (the underlying tree-sitter AST node identity) rather than
+     # id(node) (Python wrapper identity) — re-fetched nodes get fresh wrappers
+     # but the same underlying tree-sitter id.
+     chunk_set = {n.id: n for n in chunk_nodes}
+     pairs: list[tuple[Any, Any]] = []
+     for name_node in name_nodes:
+         cur = name_node.parent
+         while cur is not None and cur.id not in chunk_set:
+             cur = cur.parent
+         if cur is not None:
+             pairs.append((chunk_set[cur.id], name_node))
+
+     defs: list[SymbolDef] = []
+     seen_keys: set[tuple[str, int, int]] = set()
+     for chunk_node, name_node in pairs:
+         try:
+             symbol_name = name_node.text.decode("utf-8", errors="replace")
+         except (AttributeError, UnicodeDecodeError):
+             continue
+         start_line = chunk_node.start_point[0] + 1
+         end_line = chunk_node.end_point[0] + 1
+         # Skip duplicates (same chunk node with multiple matched names).
+         key = (symbol_name, start_line, end_line)
+         if key in seen_keys:
+             continue
+         seen_keys.add(key)
+         defs.append(
+             SymbolDef(
+                 name=symbol_name,
+                 path=path,
+                 lines=(start_line, end_line),
+                 kind=_kind_from_node(chunk_node, lang),
+                 language=lang,
+             )
+         )
+     # Sort for stable output (by start_line, then name).
+     defs.sort(key=lambda d: (d.lines[0], d.name))
+     return defs
+
+
+ def _kind_from_node(node: Any, lang: str) -> str:
+     """Map tree-sitter node types to our SymbolDef.kind vocabulary."""
+     kind_map = {
+         # Python
+         "function_definition": "function",
+         "class_definition": "class",
+         # JS / TS
+         "function_declaration": "function",
+         "class_declaration": "class",
+         "method_definition": "method",
+         "interface_declaration": "interface",
+         "type_alias_declaration": "type",
+         "variable_declarator": "function",  # arrow function bound to const
+         # Go
+         "method_declaration": "method",
+         "type_declaration": "type",
+         # Rust
+         "function_item": "function",
+         "struct_item": "struct",
+         "enum_item": "enum",
+         "impl_item": "impl",
+         "trait_item": "trait",
+         # C#: node types C# shares with JS/TS or Go (class_declaration,
+         # interface_declaration, method_declaration) are already mapped above
+         # and resolve to the same kinds; only C#-specific types are added here.
+         "constructor_declaration": "constructor",
+         "struct_declaration": "struct",
+         "record_declaration": "record",
+         "enum_declaration": "enum",
+     }
+     return kind_map.get(node.type, "unknown")
+
+
+ def _flatten_chunk_nodes(captures: Any) -> list[Any]:
+     """Return the @chunk-tagged nodes regardless of which API shape we got."""
+     if isinstance(captures, dict):
+         return list(captures.get("chunk", []))
+     out: list[Any] = []
+     for item in captures:
+         # item is (node, name) in older bindings.
+         if isinstance(item, tuple) and len(item) == 2 and item[1] == "chunk":
+             out.append(item[0])
+     return out
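
Usage sketch for the adapter above (assumes tree-sitter and tree-sitter-language-pack are installed; the printed values follow the Chunk and SymbolDef shapes this module emits):

    from code_context.adapters.driven.chunker_treesitter import TreeSitterChunker

    src = "def greet(name):\n    return f'hi {name}'\n\nclass Greeter:\n    pass\n"
    ts = TreeSitterChunker()
    # One chunk per captured definition, aligned to AST node boundaries.
    for c in ts.chunk(src, "demo.py"):
        print(c.line_start, c.line_end)    # e.g. 1 2, then 4 5
    # SymbolDefs pair each @chunk node with its @name identifier.
    for d in ts.extract_definitions(src, "demo.py"):
        print(d.kind, d.name)              # e.g. function greet, class Greeter
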
code_context/adapters/driven/chunker_treesitter_queries.py
@@ -0,0 +1,111 @@
+ """Tree-sitter S-expression queries: one per supported language.
+
+ Each query captures the AST nodes we want to emit as chunks. A node is
+ "chunk-worthy" when it represents a complete top-level semantic unit:
+
+ - Python: function_definition, class_definition.
+ - JavaScript / TypeScript: function_declaration, class_declaration,
+   method_definition, arrow function assigned at module scope.
+ - Go: function_declaration, method_declaration, type_declaration.
+ - Rust: function_item, struct_item, enum_item, impl_item, trait_item.
+ - C#: method_declaration, constructor_declaration, class_declaration,
+   interface_declaration, struct_declaration, record_declaration,
+   enum_declaration.
+
+ Each query also captures the symbol's identifier as ``@name`` so callers
+ that want to mine ``SymbolDef`` objects (extract_definitions) can pair
+ the chunk node with its name child. Existing chunk-only consumers that
+ filter ``captures["chunk"]`` continue to work unchanged.
+
+ Smaller nodes (assignments, single statements) are NOT captured — they
+ are rolled up into a synthetic "module-prelude" chunk in the chunker
+ (future work; v0.2.0 simply skips them).
+ """
+
+ from __future__ import annotations
+
+ PYTHON = """
+ (function_definition
+   name: (identifier) @name) @chunk
+ (class_definition
+   name: (identifier) @name) @chunk
+ """
+
+ JAVASCRIPT = """
+ (function_declaration
+   name: (identifier) @name) @chunk
+ (class_declaration
+   name: (identifier) @name) @chunk
+ (method_definition
+   name: (property_identifier) @name) @chunk
+ (variable_declarator
+   name: (identifier) @name
+   value: (arrow_function)) @chunk
+ """
+
+ # TypeScript shares JS's overall shape but the grammar names the class with
+ # (type_identifier) rather than (identifier), so we cannot reuse the JS
+ # string verbatim — the TS class_declaration pattern needs its own form.
+ TYPESCRIPT = """
+ (function_declaration
+   name: (identifier) @name) @chunk
+ (class_declaration
+   name: (type_identifier) @name) @chunk
+ (method_definition
+   name: (property_identifier) @name) @chunk
+ (variable_declarator
+   name: (identifier) @name
+   value: (arrow_function)) @chunk
+ (interface_declaration
+   name: (type_identifier) @name) @chunk
+ (type_alias_declaration
+   name: (type_identifier) @name) @chunk
+ """
+
+ GO = """
+ (function_declaration
+   name: (identifier) @name) @chunk
+ (method_declaration
+   name: (field_identifier) @name) @chunk
+ (type_declaration
+   (type_spec name: (type_identifier) @name)) @chunk
+ """
+
+ RUST = """
+ (function_item
+   name: (identifier) @name) @chunk
+ (struct_item
+   name: (type_identifier) @name) @chunk
+ (enum_item
+   name: (type_identifier) @name) @chunk
+ (impl_item
+   type: (type_identifier) @name) @chunk
+ (trait_item
+   name: (type_identifier) @name) @chunk
+ """
+
+ CSHARP = """
+ (method_declaration
+   name: (identifier) @name) @chunk
+ (constructor_declaration
+   name: (identifier) @name) @chunk
+ (class_declaration
+   name: (identifier) @name) @chunk
+ (interface_declaration
+   name: (identifier) @name) @chunk
+ (struct_declaration
+   name: (identifier) @name) @chunk
+ (record_declaration
+   name: (identifier) @name) @chunk
+ (enum_declaration
+   name: (identifier) @name) @chunk
+ """
+
+ QUERIES_BY_LANG: dict[str, str] = {
+     "python": PYTHON,
+     "javascript": JAVASCRIPT,
+     "typescript": TYPESCRIPT,
+     "go": GO,
+     "rust": RUST,
+     "csharp": CSHARP,
+ }
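
For reference, this is roughly what these queries produce when run directly against the newer py-tree-sitter API (a sketch, assuming tree-sitter >= 0.24 and tree-sitter-language-pack, the same stack the chunker lazy-loads):

    from tree_sitter import Query, QueryCursor
    from tree_sitter_language_pack import get_language, get_parser

    from code_context.adapters.driven.chunker_treesitter_queries import QUERIES_BY_LANG

    lang = get_language("python")
    tree = get_parser("python").parse(b"def f():\n    pass\n")
    captures = QueryCursor(Query(lang, QUERIES_BY_LANG["python"])).captures(tree.root_node)
    # dict[capture_name, list[Node]]: the function_definition lands under
    # "chunk", its identifier under "name".
    print([n.type for n in captures["chunk"]])  # ['function_definition']
    print(captures["name"][0].text)             # b'f'
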
code_context/adapters/driven/code_source_fs.py
@@ -0,0 +1,122 @@
+ """FilesystemSource — gitignore-aware walk + binary detection."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import pathspec
+
+ from code_context.domain.models import FileTreeNode
+
+ _BINARY_PROBE_BYTES = 4096
+
+
+ class FilesystemSource:
+     def list_files(self, root: Path, include_exts: list[str], max_bytes: int) -> list[Path]:
+         gitignore = self._load_gitignore(root)
+         results: list[Path] = []
+         ext_set = {e.lower() for e in include_exts}
+
+         for path in sorted(root.rglob("*")):
+             if not path.is_file():
+                 continue
+             rel = path.relative_to(root).as_posix()
+             if gitignore.match_file(rel):
+                 continue
+             if path.suffix.lower() not in ext_set:
+                 continue
+             try:
+                 size = path.stat().st_size
+             except OSError:
+                 continue
+             if size > max_bytes:
+                 continue
+             if self._looks_binary(path):
+                 continue
+             results.append(path)
+         return results
+
+     def read(self, path: Path) -> str:
+         return path.read_text(encoding="utf-8", errors="replace")
+
+     def walk_tree(
+         self,
+         root: Path,
+         max_depth: int = 4,
+         include_hidden: bool = False,
+         subpath: Path | None = None,
+     ) -> FileTreeNode:
+         root_resolved = root.resolve()
+         target = root_resolved if subpath is None else (root / subpath).resolve()
+         # Refuse to walk outside the root.
+         try:
+             target.relative_to(root_resolved)
+         except ValueError as exc:
+             raise ValueError(f"subpath {subpath!r} escapes root {root!r}") from exc
+
+         gitignore = self._load_gitignore(root)
+         return self._walk_node(target, root_resolved, gitignore, max_depth, include_hidden, 0)
+
+     def _walk_node(
+         self,
+         node: Path,
+         root: Path,
+         gitignore: pathspec.PathSpec,
+         max_depth: int,
+         include_hidden: bool,
+         current_depth: int,
+     ) -> FileTreeNode:
+         rel = node.relative_to(root).as_posix() if node != root else ""
+         rel_display = rel if rel else "."
+
+         if node.is_file():
+             try:
+                 size = node.stat().st_size
+             except OSError:
+                 size = None
+             return FileTreeNode(path=rel_display, kind="file", children=(), size=size)
+
+         # Directory.
+         if current_depth >= max_depth:
+             # Cap reached — empty dir node (signals depth cap to caller).
+             return FileTreeNode(path=rel_display, kind="dir", children=(), size=None)
+
+         children: list[FileTreeNode] = []
+         try:
+             entries = sorted(node.iterdir(), key=lambda p: (p.is_file(), p.name.lower()))
+         except OSError:
+             return FileTreeNode(path=rel_display, kind="dir", children=(), size=None)
+
+         for child in entries:
+             name = child.name
+             if not include_hidden and name.startswith("."):
+                 continue
+             child_rel = child.relative_to(root).as_posix()
+             # gitignore matching: dirs need trailing slash to match dir patterns.
+             match_path = child_rel + ("/" if child.is_dir() else "")
+             if gitignore.match_file(match_path) or gitignore.match_file(child_rel):
+                 continue
+             children.append(
+                 self._walk_node(
+                     child, root, gitignore, max_depth, include_hidden, current_depth + 1
+                 )
+             )
+
+         return FileTreeNode(path=rel_display, kind="dir", children=tuple(children), size=None)
+
+     @staticmethod
+     def _load_gitignore(root: Path) -> pathspec.PathSpec:
+         lines = [".git/"]
+         gi = root / ".gitignore"
+         if gi.exists():
+             lines.extend(gi.read_text().splitlines())
+         return pathspec.PathSpec.from_lines("gitignore", lines)
+
+     @staticmethod
+     def _looks_binary(path: Path) -> bool:
+         try:
+             with path.open("rb") as fh:
+                 probe = fh.read(_BINARY_PROBE_BYTES)
+                 return b"\x00" in probe
+         except OSError:
+             return True
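
Usage sketch for the adapter above (the extensions and byte cap here are illustrative values, not package defaults):

    from pathlib import Path

    from code_context.adapters.driven.code_source_fs import FilesystemSource

    fs = FilesystemSource()
    # Honours .gitignore, skips files over max_bytes, and drops anything whose
    # first 4096 bytes contain a NUL (the binary heuristic above).
    files = fs.list_files(Path("."), include_exts=[".py", ".md"], max_bytes=200_000)
    # Depth-capped tree; hidden entries are excluded unless include_hidden=True.
    tree = fs.walk_tree(Path("."), max_depth=2)
    print(len(files), tree.kind, len(tree.children))
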
code_context/adapters/driven/embeddings_local.py
@@ -0,0 +1,111 @@
+ """LocalST — sentence-transformers wrapped as an EmbeddingsProvider.
+
+ The sentence-transformers import is lazy: constructing this adapter doesn't
+ trigger torch loading. The model is loaded on first `embed()` call.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Any
+
+ import numpy as np
+
+ log = logging.getLogger(__name__)
+
+
+ # MODEL_REGISTRY enumerates models we have verified and characterised. Models
+ # missing from here still work, but staleness, dimension hints, and benchmarks
+ # won't recognise them and the adapter will warn at construction time.
+ #
+ # v0.3.3 trimmed this list to verified entries only. v0.3.0-v0.3.2 listed
+ # `BAAI/bge-code-v1.5` which never existed on Hugging Face — a planning error
+ # corrected here. `jinaai/jina-embeddings-v2-base-code` has since been verified
+ # and registered below; other candidates (`BAAI/bge-code-v1`) work via the
+ # `CC_EMBEDDINGS_MODEL` override but are not pre-characterised because their
+ # embedding dims have not been independently verified. v0.4 will re-introduce a
+ # verified code-tuned default after benchmark validation and an HF-API CI check.
+ MODEL_REGISTRY: dict[str, dict[str, int | str]] = {
+     "sentence-transformers/all-MiniLM-L6-v2": {"dimension": 384, "kind": "general"},
+     "all-MiniLM-L6-v2": {"dimension": 384, "kind": "general"},  # short alias
+     # Code-tuned (opt-in via CC_EMBEDDINGS_MODEL + CC_TRUST_REMOTE_CODE=true).
+     # 161M params (~640 MB FP32), Apache-2.0, English + 30 programming languages.
+     # Verified existing on HF as of v0.6.0 release; CI's hf-guard job re-checks
+     # on every push.
+     "jinaai/jina-embeddings-v2-base-code": {"dimension": 768, "kind": "code"},
+ }
+
+
+ # Whole-function chunks from tree-sitter can run 5K+ chars and overflow the
+ # 512-token context of BERT-family encoders. We embed the truncated head; the
+ # full snippet is preserved in the chunk for the search response payload, so
+ # users still see the complete code. 2048 chars ~= 512 tokens for code-heavy
+ # text.
+ _MAX_EMBED_CHARS = 2048
+
+
+ def _load_model(
+     model_name: str, *, trust_remote_code: bool = False
+ ) -> Any:  # pragma: no cover - integration-tested
+     """Lazy import + load. Patched in unit tests."""
+     from sentence_transformers import SentenceTransformer
+
+     log.info(
+         "loading sentence-transformers model: %s (trust_remote_code=%s)",
+         model_name,
+         trust_remote_code,
+     )
+     return SentenceTransformer(model_name, trust_remote_code=trust_remote_code)
+
+
+ def _lib_version() -> str:
+     try:
+         from importlib.metadata import version
+
+         return version("sentence-transformers")
+     except Exception:  # pragma: no cover
+         return "unknown"
+
+
+ class LocalST:
+     def __init__(
+         self,
+         model_name: str = "all-MiniLM-L6-v2",
+         *,
+         trust_remote_code: bool = False,
+     ) -> None:
+         if model_name not in MODEL_REGISTRY:
+             log.warning(
+                 "embeddings model %r not in MODEL_REGISTRY; staleness, "
+                 "dimension hints, and benchmarks won't recognise it",
+                 model_name,
+             )
+         self.model_name = model_name
+         self.trust_remote_code = trust_remote_code
+         self._model: Any = None
+
+     @property
+     def dimension(self) -> int:
+         self._ensure_loaded()
+         # sentence-transformers >= 5 renamed the method; fall back to the old name
+         # so we work across both lines without a hard pin.
+         getter = getattr(self._model, "get_embedding_dimension", None) or (
+             self._model.get_sentence_embedding_dimension
+         )
+         return int(getter())
+
+     @property
+     def model_id(self) -> str:
+         return f"local:{self.model_name}@v{_lib_version()}"
+
+     def embed(self, texts: list[str]) -> np.ndarray:
+         self._ensure_loaded()
+         if not texts:
+             return np.empty((0, self.dimension), dtype=np.float32)
+         truncated = [t[:_MAX_EMBED_CHARS] for t in texts]
+         out = self._model.encode(truncated, convert_to_numpy=True, show_progress_bar=False)
+         return out.astype(np.float32, copy=False)
+
+     def _ensure_loaded(self) -> None:
+         if self._model is None:
+             self._model = _load_model(self.model_name, trust_remote_code=self.trust_remote_code)
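
Usage sketch for the adapter above (assumes sentence-transformers is installed; the model download happens lazily on the first embed() call):

    from code_context.adapters.driven.embeddings_local import LocalST

    st = LocalST()  # default all-MiniLM-L6-v2; torch is not imported until used
    vecs = st.embed(["def add(a, b): return a + b"])
    print(vecs.shape, vecs.dtype)  # (1, 384) float32, per the registry entry
    # Code-tuned opt-in, per the registry note above:
    #   LocalST("jinaai/jina-embeddings-v2-base-code", trust_remote_code=True)
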
code_context/adapters/driven/embeddings_openai.py
@@ -0,0 +1,58 @@
+ """OpenAIProvider — OpenAI embeddings via the openai SDK."""
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Any
+
+ import numpy as np
+
+ log = logging.getLogger(__name__)
+
+ _DIMENSION_BY_MODEL = {
+     "text-embedding-3-small": 1536,
+     "text-embedding-3-large": 3072,
+     "text-embedding-ada-002": 1536,
+ }
+
+
+ def _load_client(api_key: str) -> Any:  # pragma: no cover - patched in tests
+     """Lazy import + construct."""
+     from openai import OpenAI
+
+     return OpenAI(api_key=api_key)
+
+
+ def _lib_version() -> str:
+     try:
+         from importlib.metadata import version
+
+         return version("openai")
+     except Exception:  # pragma: no cover
+         return "unknown"
+
+
+ class OpenAIProvider:
+     def __init__(self, model: str, api_key: str) -> None:
+         if not api_key:
+             raise ValueError("api_key is required for OpenAIProvider")
+         self.model = model
+         self._api_key = api_key
+         self._client: Any = None
+
+     @property
+     def dimension(self) -> int:
+         return _DIMENSION_BY_MODEL.get(self.model, 1536)
+
+     @property
+     def model_id(self) -> str:
+         return f"openai:{self.model}@v{_lib_version()}"
+
+     def embed(self, texts: list[str]) -> np.ndarray:
+         if self._client is None:
+             self._client = _load_client(self._api_key)
+         if not texts:
+             return np.empty((0, self.dimension), dtype=np.float32)
+         resp = self._client.embeddings.create(model=self.model, input=texts)
+         out = np.array([d.embedding for d in resp.data], dtype=np.float32)
+         return out
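
Usage sketch for the adapter above (requires a real OPENAI_API_KEY in the environment; the client is constructed lazily on the first embed() call):

    import os

    from code_context.adapters.driven.embeddings_openai import OpenAIProvider

    provider = OpenAIProvider("text-embedding-3-small", api_key=os.environ["OPENAI_API_KEY"])
    vecs = provider.embed(["binary search over sorted chunks"])
    print(vecs.shape)         # (1, 1536), per _DIMENSION_BY_MODEL
    print(provider.model_id)  # e.g. "openai:text-embedding-3-small@v1.x"
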