code-context-mcp 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context/__init__.py +3 -0
- code_context/_background.py +93 -0
- code_context/_composition.py +425 -0
- code_context/_watcher.py +89 -0
- code_context/adapters/__init__.py +0 -0
- code_context/adapters/driven/__init__.py +0 -0
- code_context/adapters/driven/chunker_dispatcher.py +43 -0
- code_context/adapters/driven/chunker_line.py +54 -0
- code_context/adapters/driven/chunker_treesitter.py +215 -0
- code_context/adapters/driven/chunker_treesitter_queries.py +111 -0
- code_context/adapters/driven/code_source_fs.py +122 -0
- code_context/adapters/driven/embeddings_local.py +111 -0
- code_context/adapters/driven/embeddings_openai.py +58 -0
- code_context/adapters/driven/git_source_cli.py +211 -0
- code_context/adapters/driven/introspector_fs.py +224 -0
- code_context/adapters/driven/keyword_index_sqlite.py +206 -0
- code_context/adapters/driven/reranker_crossencoder.py +61 -0
- code_context/adapters/driven/symbol_index_sqlite.py +264 -0
- code_context/adapters/driven/vector_store_numpy.py +119 -0
- code_context/adapters/driving/__init__.py +0 -0
- code_context/adapters/driving/mcp_server.py +365 -0
- code_context/cli.py +161 -0
- code_context/config.py +114 -0
- code_context/domain/__init__.py +0 -0
- code_context/domain/index_bus.py +52 -0
- code_context/domain/models.py +140 -0
- code_context/domain/ports.py +205 -0
- code_context/domain/use_cases/__init__.py +0 -0
- code_context/domain/use_cases/explain_diff.py +98 -0
- code_context/domain/use_cases/find_definition.py +30 -0
- code_context/domain/use_cases/find_references.py +22 -0
- code_context/domain/use_cases/get_file_tree.py +36 -0
- code_context/domain/use_cases/get_summary.py +24 -0
- code_context/domain/use_cases/indexer.py +336 -0
- code_context/domain/use_cases/recent_changes.py +36 -0
- code_context/domain/use_cases/search_repo.py +131 -0
- code_context/server.py +151 -0
- code_context_mcp-1.0.0.dist-info/METADATA +181 -0
- code_context_mcp-1.0.0.dist-info/RECORD +43 -0
- code_context_mcp-1.0.0.dist-info/WHEEL +5 -0
- code_context_mcp-1.0.0.dist-info/entry_points.txt +3 -0
- code_context_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
- code_context_mcp-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""LineChunker — splits text into N-line windows with overlap."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
from code_context.domain.models import Chunk
|
|
9
|
+
|
|
10
|
+
_MIN_LINES = 5


@dataclass
class LineChunker:
    """Fixed-window chunker: emits `chunk_lines`-line windows where each
    window shares `overlap` lines with its predecessor."""

    chunk_lines: int = 50
    overlap: int = 10

    @property
    def version(self) -> str:
        # Encodes both parameters so a config change invalidates cached chunks.
        return f"line-{self.chunk_lines}-{self.overlap}-v1"

    def chunk(self, content: str, path: str) -> list[Chunk]:
        """Split `content` into overlapping line windows.

        Empty content and files shorter than _MIN_LINES yield [].
        Raises ValueError when overlap >= chunk_lines (the window would
        never advance).
        """
        if not content:
            return []
        all_lines = content.splitlines()
        total = len(all_lines)
        if total < _MIN_LINES:
            return []

        stride = self.chunk_lines - self.overlap
        if stride <= 0:
            raise ValueError(
                f"overlap ({self.overlap}) must be less than chunk_lines ({self.chunk_lines})"
            )

        out: list[Chunk] = []
        start = 0
        while start < total:
            end = min(start + self.chunk_lines, total)
            text = "\n".join(all_lines[start:end])
            out.append(
                Chunk(
                    path=path,
                    line_start=start + 1,  # 1-based, inclusive
                    line_end=end,
                    content_hash=hashlib.sha256(text.encode("utf-8")).hexdigest(),
                    snippet=text,
                )
            )
            if end >= total:
                break
            start += stride
        return out
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""TreeSitterChunker — AST-aware chunking via tree-sitter.
|
|
2
|
+
|
|
3
|
+
Lazy-loads parsers per language. Returns whole-function / whole-class
|
|
4
|
+
chunks. On unsupported language or parse failure, returns []. Caller
|
|
5
|
+
(usually ChunkerDispatcher) is responsible for routing unsupported
|
|
6
|
+
files to LineChunker.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import logging
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from code_context.adapters.driven.chunker_treesitter_queries import QUERIES_BY_LANG
|
|
18
|
+
from code_context.domain.models import Chunk, SymbolDef
|
|
19
|
+
|
|
20
|
+
log = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
# File-extension → tree-sitter language name. Extensions missing from this
# table are reported as unsupported by _detect_language; per the module
# docstring, the dispatcher then routes such files to the LineChunker.
_EXT_TO_LANG: dict[str, str] = {
    ".py": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".go": "go",
    ".rs": "rust",
    ".cs": "csharp",
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _load_language(lang: str) -> tuple[Any, Any]:  # pragma: no cover - exercised in tests
    """Lazily import tree_sitter_language_pack and return (language, parser).

    Kept as a free function so unit tests can patch it where needed.
    """
    from tree_sitter_language_pack import get_language, get_parser

    language = get_language(lang)
    parser = get_parser(lang)
    return language, parser
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _make_query_cursor(language: Any, source: str) -> Any:  # pragma: no cover
    """Compile `source` against `language` and wrap it in a QueryCursor (lazy import)."""
    from tree_sitter import Query, QueryCursor

    compiled = Query(language, source)
    return QueryCursor(compiled)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class TreeSitterChunker:
    """Chunker that aligns chunk boundaries to AST definition nodes."""

    @property
    def version(self) -> str:
        # Bump the trailing -vN when query semantics change — invalidates the index cache.
        return "treesitter-v2"

    def chunk(self, content: str, path: str) -> list[Chunk]:
        """One Chunk per captured definition node; [] on empty content,
        unsupported language, or parse failure."""
        lang = self._resolve_lang(content, path)
        if lang is None:
            return []
        try:
            return _chunk_via_treesitter(content, path, lang)
        except Exception as exc:
            # Parse errors are rare; the dispatcher's LineChunker fallback covers them.
            log.warning("treesitter parse failed for %s (%s); returning []", path, exc)
            return []

    def extract_definitions(self, content: str, path: str) -> list[SymbolDef]:
        """One SymbolDef per @chunk node paired with its @name capture."""
        lang = self._resolve_lang(content, path)
        if lang is None:
            return []
        try:
            return _extract_via_treesitter(content, path, lang)
        except Exception as exc:
            log.warning("treesitter extract_definitions failed for %s (%s)", path, exc)
            return []

    @staticmethod
    def _resolve_lang(content: str, path: str) -> str | None:
        """Shared guard: language name when content is non-empty and a query exists."""
        if not content:
            return None
        lang = _detect_language(path)
        if lang is None or lang not in QUERIES_BY_LANG:
            return None
        return lang
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _detect_language(path: str) -> str | None:
    """Return the tree-sitter language name for `path`'s extension, or None."""
    return _EXT_TO_LANG.get(Path(path).suffix.lower())
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _chunk_via_treesitter(content: str, path: str, lang: str) -> list[Chunk]:
    """Parse `content` with the `lang` grammar and emit one Chunk per @chunk node."""
    language, parser = _load_language(lang)
    tree = parser.parse(content.encode("utf-8"))
    cursor = _make_query_cursor(language, QUERIES_BY_LANG[lang])
    # QueryCursor.captures returns dict[capture_name, list[Node]] in
    # tree-sitter ≥0.24, and (node, name) tuples in older bindings;
    # _flatten_chunk_nodes normalises both shapes.
    nodes = sorted(
        _flatten_chunk_nodes(cursor.captures(tree.root_node)),
        key=lambda n: (n.start_point[0], n.start_point[1]),  # stable document order
    )

    lines = content.splitlines()
    result: list[Chunk] = []
    for node in nodes:
        first = node.start_point[0] + 1
        last = node.end_point[0] + 1
        # Slice the original source lines instead of node.text so leading
        # indentation is preserved — matters for indented methods whose
        # tree-sitter node text starts at the keyword's column.
        text = "\n".join(lines[first - 1 : last])
        result.append(
            Chunk(
                path=path,
                line_start=first,
                line_end=last,
                content_hash=hashlib.sha256(text.encode("utf-8")).hexdigest(),
                snippet=text,
            )
        )
    return result
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _extract_via_treesitter(content: str, path: str, lang: str) -> list[SymbolDef]:
    """Pair @chunk nodes with their @name children to produce SymbolDef list.

    Returns [] when either capture group is empty or when the bindings hand
    back the older tuple-shaped capture API (only the dict shape is handled
    here; `chunk()` still works with tuples via _flatten_chunk_nodes).
    """
    language, parser = _load_language(lang)
    tree = parser.parse(content.encode("utf-8"))
    cursor = _make_query_cursor(language, QUERIES_BY_LANG[lang])
    captures = cursor.captures(tree.root_node)
    # captures is dict[capture_name, list[Node]] in tree-sitter ≥0.24.

    chunk_nodes = list(captures.get("chunk", [])) if isinstance(captures, dict) else []
    name_nodes = list(captures.get("name", [])) if isinstance(captures, dict) else []
    if not chunk_nodes or not name_nodes:
        return []

    # Pair @name with the closest enclosing @chunk by walking up parents.
    # Use node.id (the underlying tree-sitter AST node identity) rather than
    # id(node) (Python wrapper identity) — re-fetched nodes get fresh wrappers
    # but the same underlying tree-sitter id.
    chunk_set = {n.id: n for n in chunk_nodes}
    pairs: list[tuple[Any, Any]] = []
    for name_node in name_nodes:
        cur = name_node.parent
        while cur is not None and cur.id not in chunk_set:
            cur = cur.parent
        # Names with no enclosing @chunk ancestor are dropped silently.
        if cur is not None:
            pairs.append((chunk_set[cur.id], name_node))

    defs: list[SymbolDef] = []
    seen_keys: set[tuple[str, int, int]] = set()
    for chunk_node, name_node in pairs:
        try:
            # node.text is bytes; replace undecodable sequences rather than fail.
            symbol_name = name_node.text.decode("utf-8", errors="replace")
        except (AttributeError, UnicodeDecodeError):
            continue
        # tree-sitter points are 0-based; SymbolDef lines are 1-based.
        start_line = chunk_node.start_point[0] + 1
        end_line = chunk_node.end_point[0] + 1
        # Skip duplicates (same chunk node with multiple matched names).
        key = (symbol_name, start_line, end_line)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        defs.append(
            SymbolDef(
                name=symbol_name,
                path=path,
                lines=(start_line, end_line),
                kind=_kind_from_node(chunk_node, lang),
                language=lang,
            )
        )
    # Sort for stable output (by start_line, then name).
    defs.sort(key=lambda d: (d.lines[0], d.name))
    return defs
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _kind_from_node(node: Any, lang: str) -> str:
|
|
174
|
+
"""Map tree-sitter node types to our SymbolDef.kind vocabulary."""
|
|
175
|
+
kind_map = {
|
|
176
|
+
# Python
|
|
177
|
+
"function_definition": "function",
|
|
178
|
+
"class_definition": "class",
|
|
179
|
+
# JS / TS
|
|
180
|
+
"function_declaration": "function",
|
|
181
|
+
"class_declaration": "class",
|
|
182
|
+
"method_definition": "method",
|
|
183
|
+
"interface_declaration": "interface",
|
|
184
|
+
"type_alias_declaration": "type",
|
|
185
|
+
"variable_declarator": "function", # arrow function bound to const
|
|
186
|
+
# Go
|
|
187
|
+
"method_declaration": "method",
|
|
188
|
+
"type_declaration": "type",
|
|
189
|
+
# Rust
|
|
190
|
+
"function_item": "function",
|
|
191
|
+
"struct_item": "struct",
|
|
192
|
+
"enum_item": "enum",
|
|
193
|
+
"impl_item": "impl",
|
|
194
|
+
"trait_item": "trait",
|
|
195
|
+
# C# (some overlap with the above; latest hit wins, ordering matters
|
|
196
|
+
# because dicts preserve insertion order — listed here last on purpose
|
|
197
|
+
# so e.g. C# class_declaration produces "class" not "class" via the JS path).
|
|
198
|
+
"constructor_declaration": "constructor",
|
|
199
|
+
"struct_declaration": "struct",
|
|
200
|
+
"record_declaration": "record",
|
|
201
|
+
"enum_declaration": "enum",
|
|
202
|
+
}
|
|
203
|
+
return kind_map.get(node.type, "unknown")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _flatten_chunk_nodes(captures: Any) -> list[Any]:
|
|
207
|
+
"""Return the @chunk-tagged nodes regardless of which API shape we got."""
|
|
208
|
+
if isinstance(captures, dict):
|
|
209
|
+
return list(captures.get("chunk", []))
|
|
210
|
+
out: list[Any] = []
|
|
211
|
+
for item in captures:
|
|
212
|
+
# item is (node, name) in older bindings.
|
|
213
|
+
if isinstance(item, tuple) and len(item) == 2 and item[1] == "chunk":
|
|
214
|
+
out.append(item[0])
|
|
215
|
+
return out
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Tree-sitter S-expression queries: one per supported language.
|
|
2
|
+
|
|
3
|
+
Each query captures the AST nodes we want to emit as chunks. A node is
|
|
4
|
+
"chunk-worthy" when it represents a complete top-level semantic unit:
|
|
5
|
+
|
|
6
|
+
- Python: function_definition, class_definition.
|
|
7
|
+
- JavaScript / TypeScript: function_declaration, class_declaration,
|
|
8
|
+
method_definition, arrow function assigned at module scope.
|
|
9
|
+
- Go: function_declaration, method_declaration, type_declaration.
|
|
10
|
+
- Rust: function_item, struct_item, enum_item, impl_item, trait_item.
|
|
11
|
+
- C#: method_declaration, constructor_declaration, class_declaration,
|
|
12
|
+
interface_declaration, struct_declaration, record_declaration,
|
|
13
|
+
enum_declaration.
|
|
14
|
+
|
|
15
|
+
Each query also captures the symbol's identifier as ``@name`` so callers
|
|
16
|
+
that want to mine ``SymbolDef`` objects (extract_definitions) can pair
|
|
17
|
+
the chunk node with its name child. Existing chunk-only consumers that
|
|
18
|
+
filter ``captures["chunk"]`` continue to work unchanged.
|
|
19
|
+
|
|
20
|
+
Smaller nodes (assignments, single statements) are NOT captured — they
|
|
21
|
+
are rolled up into a synthetic "module-prelude" chunk in the chunker
|
|
22
|
+
(future work; v0.2.0 simply skips them).
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
PYTHON = """
|
|
28
|
+
(function_definition
|
|
29
|
+
name: (identifier) @name) @chunk
|
|
30
|
+
(class_definition
|
|
31
|
+
name: (identifier) @name) @chunk
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
JAVASCRIPT = """
|
|
35
|
+
(function_declaration
|
|
36
|
+
name: (identifier) @name) @chunk
|
|
37
|
+
(class_declaration
|
|
38
|
+
name: (identifier) @name) @chunk
|
|
39
|
+
(method_definition
|
|
40
|
+
name: (property_identifier) @name) @chunk
|
|
41
|
+
(variable_declarator
|
|
42
|
+
name: (identifier) @name
|
|
43
|
+
value: (arrow_function)) @chunk
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
# TypeScript shares JS's overall shape but the grammar names the class with
|
|
47
|
+
# (type_identifier) rather than (identifier), so we cannot reuse the JS
|
|
48
|
+
# string verbatim — the TS class_declaration pattern needs its own form.
|
|
49
|
+
TYPESCRIPT = """
|
|
50
|
+
(function_declaration
|
|
51
|
+
name: (identifier) @name) @chunk
|
|
52
|
+
(class_declaration
|
|
53
|
+
name: (type_identifier) @name) @chunk
|
|
54
|
+
(method_definition
|
|
55
|
+
name: (property_identifier) @name) @chunk
|
|
56
|
+
(variable_declarator
|
|
57
|
+
name: (identifier) @name
|
|
58
|
+
value: (arrow_function)) @chunk
|
|
59
|
+
(interface_declaration
|
|
60
|
+
name: (type_identifier) @name) @chunk
|
|
61
|
+
(type_alias_declaration
|
|
62
|
+
name: (type_identifier) @name) @chunk
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
GO = """
|
|
66
|
+
(function_declaration
|
|
67
|
+
name: (identifier) @name) @chunk
|
|
68
|
+
(method_declaration
|
|
69
|
+
name: (field_identifier) @name) @chunk
|
|
70
|
+
(type_declaration
|
|
71
|
+
(type_spec name: (type_identifier) @name)) @chunk
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
RUST = """
|
|
75
|
+
(function_item
|
|
76
|
+
name: (identifier) @name) @chunk
|
|
77
|
+
(struct_item
|
|
78
|
+
name: (type_identifier) @name) @chunk
|
|
79
|
+
(enum_item
|
|
80
|
+
name: (type_identifier) @name) @chunk
|
|
81
|
+
(impl_item
|
|
82
|
+
type: (type_identifier) @name) @chunk
|
|
83
|
+
(trait_item
|
|
84
|
+
name: (type_identifier) @name) @chunk
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
CSHARP = """
|
|
88
|
+
(method_declaration
|
|
89
|
+
name: (identifier) @name) @chunk
|
|
90
|
+
(constructor_declaration
|
|
91
|
+
name: (identifier) @name) @chunk
|
|
92
|
+
(class_declaration
|
|
93
|
+
name: (identifier) @name) @chunk
|
|
94
|
+
(interface_declaration
|
|
95
|
+
name: (identifier) @name) @chunk
|
|
96
|
+
(struct_declaration
|
|
97
|
+
name: (identifier) @name) @chunk
|
|
98
|
+
(record_declaration
|
|
99
|
+
name: (identifier) @name) @chunk
|
|
100
|
+
(enum_declaration
|
|
101
|
+
name: (identifier) @name) @chunk
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
QUERIES_BY_LANG: dict[str, str] = {
|
|
105
|
+
"python": PYTHON,
|
|
106
|
+
"javascript": JAVASCRIPT,
|
|
107
|
+
"typescript": TYPESCRIPT,
|
|
108
|
+
"go": GO,
|
|
109
|
+
"rust": RUST,
|
|
110
|
+
"csharp": CSHARP,
|
|
111
|
+
}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""FilesystemSource — gitignore-aware walk + binary detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pathspec
|
|
8
|
+
|
|
9
|
+
from code_context.domain.models import FileTreeNode
|
|
10
|
+
|
|
11
|
+
# Number of head bytes sniffed per file for the NUL-byte binary heuristic.
_BINARY_PROBE_BYTES = 4096


class FilesystemSource:
    """Filesystem code source: gitignore-aware listing, tolerant reads,
    and depth-capped tree walks."""

    def list_files(self, root: Path, include_exts: list[str], max_bytes: int) -> list[Path]:
        """Return sorted files under `root` whose extension is in `include_exts`.

        Skips gitignored paths, files larger than `max_bytes`, files whose
        stat fails, and files that look binary.
        """
        gitignore = self._load_gitignore(root)
        results: list[Path] = []
        ext_set = {e.lower() for e in include_exts}

        for path in sorted(root.rglob("*")):
            if not path.is_file():
                continue
            rel = path.relative_to(root).as_posix()
            if gitignore.match_file(rel):
                continue
            if path.suffix.lower() not in ext_set:
                continue
            try:
                size = path.stat().st_size
            except OSError:
                # File vanished mid-walk or is unreadable — skip, don't abort.
                continue
            if size > max_bytes:
                continue
            if self._looks_binary(path):
                continue
            results.append(path)
        return results

    def read(self, path: Path) -> str:
        # errors="replace": a stray non-UTF-8 byte must not crash indexing.
        return path.read_text(encoding="utf-8", errors="replace")

    def walk_tree(
        self,
        root: Path,
        max_depth: int = 4,
        include_hidden: bool = False,
        subpath: Path | None = None,
    ) -> FileTreeNode:
        """Build a FileTreeNode tree rooted at `subpath` (default: `root` itself).

        Raises ValueError when `subpath` resolves outside `root`.
        """
        root_resolved = root.resolve()
        target = root_resolved if subpath is None else (root / subpath).resolve()
        # Refuse to walk outside the root.
        try:
            target.relative_to(root_resolved)
        except ValueError as exc:
            raise ValueError(f"subpath {subpath!r} escapes root {root!r}") from exc

        gitignore = self._load_gitignore(root)
        return self._walk_node(target, root_resolved, gitignore, max_depth, include_hidden, 0)

    def _walk_node(
        self,
        node: Path,
        root: Path,
        gitignore: pathspec.PathSpec,
        max_depth: int,
        include_hidden: bool,
        current_depth: int,
    ) -> FileTreeNode:
        """Recursive helper: one FileTreeNode per visited file/directory."""
        rel = node.relative_to(root).as_posix() if node != root else ""
        rel_display = rel if rel else "."

        if node.is_file():
            try:
                size = node.stat().st_size
            except OSError:
                size = None  # unreadable: keep the node, drop the size
            return FileTreeNode(path=rel_display, kind="file", children=(), size=size)

        # Directory.
        if current_depth >= max_depth:
            # Cap reached — empty dir node (signals depth cap to caller).
            return FileTreeNode(path=rel_display, kind="dir", children=(), size=None)

        children: list[FileTreeNode] = []
        try:
            # Directories first, then files; each group case-insensitively by name.
            entries = sorted(node.iterdir(), key=lambda p: (p.is_file(), p.name.lower()))
        except OSError:
            return FileTreeNode(path=rel_display, kind="dir", children=(), size=None)

        for child in entries:
            name = child.name
            if not include_hidden and name.startswith("."):
                continue
            child_rel = child.relative_to(root).as_posix()
            # gitignore matching: dirs need trailing slash to match dir patterns.
            match_path = child_rel + ("/" if child.is_dir() else "")
            if gitignore.match_file(match_path) or gitignore.match_file(child_rel):
                continue
            children.append(
                self._walk_node(
                    child, root, gitignore, max_depth, include_hidden, current_depth + 1
                )
            )

        return FileTreeNode(path=rel_display, kind="dir", children=tuple(children), size=None)

    @staticmethod
    def _load_gitignore(root: Path) -> pathspec.PathSpec:
        """Compile root/.gitignore — plus an implicit `.git/` rule — into a PathSpec."""
        lines = [".git/"]
        gi = root / ".gitignore"
        if gi.exists():
            lines.extend(gi.read_text().splitlines())
        return pathspec.PathSpec.from_lines("gitignore", lines)

    @staticmethod
    def _looks_binary(path: Path) -> bool:
        """Heuristic: a NUL byte in the first _BINARY_PROBE_BYTES marks a binary.

        Unreadable files report True so callers skip them.
        """
        try:
            with path.open("rb") as fh:
                probe = fh.read(_BINARY_PROBE_BYTES)
            return b"\x00" in probe
        except OSError:
            return True
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""LocalST — sentence-transformers wrapped as an EmbeddingsProvider.
|
|
2
|
+
|
|
3
|
+
The sentence-transformers import is lazy: constructing this adapter doesn't
|
|
4
|
+
trigger torch loading. The model is loaded on first `embed()` call.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
log = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# MODEL_REGISTRY enumerates models we have verified and characterised. Models
# missing from here still work, but staleness, dimension hints, and benchmarks
# won't recognise them and the adapter will warn at construction time
# (see LocalST.__init__).
#
# v0.3.3 trimmed this list to verified entries only. v0.3.0-v0.3.2 listed
# `BAAI/bge-code-v1.5` which never existed on Hugging Face — a planning error
# corrected here. Other code-tuned candidates (`jinaai/jina-embeddings-v2-base-code`,
# `BAAI/bge-code-v1`) work via `CC_EMBEDDINGS_MODEL` override but are not yet
# pre-characterised here because their embedding dims have not been independently
# verified. v0.4 will re-introduce a verified code-tuned default after benchmark
# validation and a CI check that pings the HF API for each registered name.
MODEL_REGISTRY: dict[str, dict[str, int | str]] = {
    "sentence-transformers/all-MiniLM-L6-v2": {"dimension": 384, "kind": "general"},
    "all-MiniLM-L6-v2": {"dimension": 384, "kind": "general"},  # short alias
    # Code-tuned (opt-in via CC_EMBEDDINGS_MODEL + CC_TRUST_REMOTE_CODE=true).
    # 161M params (~640 MB FP32), Apache-2.0, English + 30 programming languages.
    # Verified existing on HF as of v0.6.0 release; CI's hf-guard job re-checks
    # on every push.
    "jinaai/jina-embeddings-v2-base-code": {"dimension": 768, "kind": "code"},
}


# Whole-function chunks from tree-sitter can run 5K+ chars and overflow the
# 512-token context of BERT-family encoders. We embed the truncated head; the
# full snippet is preserved in the chunk for the search response payload, so
# users still see the complete code. 2048 chars ~= 512 tokens for code-heavy
# text. Truncation is applied in LocalST.embed().
_MAX_EMBED_CHARS = 2048
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _load_model(
    model_name: str, *, trust_remote_code: bool = False
) -> Any:  # pragma: no cover - integration-tested
    """Lazily import sentence-transformers and load `model_name`.

    Patched in unit tests so no model download happens there.
    """
    from sentence_transformers import SentenceTransformer

    log.info(
        "loading sentence-transformers model: %s (trust_remote_code=%s)",
        model_name,
        trust_remote_code,
    )
    model = SentenceTransformer(model_name, trust_remote_code=trust_remote_code)
    return model
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _lib_version() -> str:
|
|
62
|
+
try:
|
|
63
|
+
from importlib.metadata import version
|
|
64
|
+
|
|
65
|
+
return version("sentence-transformers")
|
|
66
|
+
except Exception: # pragma: no cover
|
|
67
|
+
return "unknown"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class LocalST:
    """EmbeddingsProvider backed by a locally-run sentence-transformers model.

    The model is loaded lazily on first use, so constructing this adapter
    does not trigger torch loading.
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        *,
        trust_remote_code: bool = False,
    ) -> None:
        if model_name not in MODEL_REGISTRY:
            log.warning(
                "embeddings model %r not in MODEL_REGISTRY; staleness, "
                "dimension hints, and benchmarks won't recognise it",
                model_name,
            )
        self.model_name = model_name
        self.trust_remote_code = trust_remote_code
        self._model: Any = None  # populated by _ensure_loaded()

    @property
    def dimension(self) -> int:
        """Embedding width reported by the loaded model."""
        self._ensure_loaded()
        # sentence-transformers >= 5 renamed the accessor; try the new name
        # first, then fall back so both major lines work without a hard pin.
        new_api = getattr(self._model, "get_embedding_dimension", None)
        getter = new_api or self._model.get_sentence_embedding_dimension
        return int(getter())

    @property
    def model_id(self) -> str:
        return f"local:{self.model_name}@v{_lib_version()}"

    def embed(self, texts: list[str]) -> np.ndarray:
        """Encode `texts` as a float32 array of shape (len(texts), dimension).

        Each text is truncated to _MAX_EMBED_CHARS before encoding.
        """
        self._ensure_loaded()
        if not texts:
            return np.empty((0, self.dimension), dtype=np.float32)
        heads = [text[:_MAX_EMBED_CHARS] for text in texts]
        vectors = self._model.encode(heads, convert_to_numpy=True, show_progress_bar=False)
        return vectors.astype(np.float32, copy=False)

    def _ensure_loaded(self) -> None:
        """Load the model on first call; subsequent calls are no-ops."""
        if self._model is None:
            self._model = _load_model(self.model_name, trust_remote_code=self.trust_remote_code)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""OpenAIProvider — OpenAI embeddings via the openai SDK."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
log = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
# Known embedding widths per OpenAI model. OpenAIProvider.dimension falls back
# to 1536 for models missing from this table.
_DIMENSION_BY_MODEL = {
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "text-embedding-ada-002": 1536,
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load_client(api_key: str) -> Any:  # pragma: no cover - patched in tests
    """Lazily import the openai SDK and construct a client."""
    from openai import OpenAI

    client = OpenAI(api_key=api_key)
    return client
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _lib_version() -> str:
|
|
27
|
+
try:
|
|
28
|
+
from importlib.metadata import version
|
|
29
|
+
|
|
30
|
+
return version("openai")
|
|
31
|
+
except Exception: # pragma: no cover
|
|
32
|
+
return "unknown"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class OpenAIProvider:
    """EmbeddingsProvider backed by the OpenAI embeddings endpoint.

    The SDK client is constructed lazily on first embed() call.
    """

    def __init__(self, model: str, api_key: str) -> None:
        if not api_key:
            raise ValueError("api_key is required for OpenAIProvider")
        self.model = model
        self._api_key = api_key
        self._client: Any = None  # built lazily by embed()

    @property
    def dimension(self) -> int:
        # Unknown models default to 1536 (the small/ada width).
        return _DIMENSION_BY_MODEL.get(self.model, 1536)

    @property
    def model_id(self) -> str:
        return f"openai:{self.model}@v{_lib_version()}"

    def embed(self, texts: list[str]) -> np.ndarray:
        """Embed `texts`; returns a float32 array of shape (len(texts), dimension)."""
        if self._client is None:
            self._client = _load_client(self._api_key)
        if not texts:
            return np.empty((0, self.dimension), dtype=np.float32)
        response = self._client.embeddings.create(model=self.model, input=texts)
        vectors = [item.embedding for item in response.data]
        return np.array(vectors, dtype=np.float32)
|