sourcefire 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sourcefire/__init__.py +0 -0
- sourcefire/api/__init__.py +0 -0
- sourcefire/api/models.py +24 -0
- sourcefire/api/routes.py +166 -0
- sourcefire/chain/__init__.py +0 -0
- sourcefire/chain/prompts.py +195 -0
- sourcefire/chain/rag_chain.py +967 -0
- sourcefire/cli.py +293 -0
- sourcefire/config.py +148 -0
- sourcefire/db.py +196 -0
- sourcefire/indexer/__init__.py +0 -0
- sourcefire/indexer/embeddings.py +27 -0
- sourcefire/indexer/language_profiles.py +448 -0
- sourcefire/indexer/metadata.py +289 -0
- sourcefire/indexer/pipeline.py +406 -0
- sourcefire/init.py +189 -0
- sourcefire/prompts/system.md +28 -0
- sourcefire/retriever/__init__.py +0 -0
- sourcefire/retriever/graph.py +162 -0
- sourcefire/retriever/search.py +86 -0
- sourcefire/static/.DS_Store +0 -0
- sourcefire/static/app.js +414 -0
- sourcefire/static/index.html +102 -0
- sourcefire/static/styles.css +607 -0
- sourcefire/watcher.py +105 -0
- sourcefire-0.2.0.dist-info/METADATA +145 -0
- sourcefire-0.2.0.dist-info/RECORD +31 -0
- sourcefire-0.2.0.dist-info/WHEEL +5 -0
- sourcefire-0.2.0.dist-info/entry_points.txt +2 -0
- sourcefire-0.2.0.dist-info/licenses/LICENSE +21 -0
- sourcefire-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
"""Language-agnostic code metadata extractor.
|
|
2
|
+
|
|
3
|
+
Extracts structural metadata from source files using tree-sitter (when available)
|
|
4
|
+
or regex fallback. Language-specific behavior is driven by LanguageProfile.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
from sourcefire.indexer.language_profiles import LanguageProfile, get_profile_for_extension
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Optional tree-sitter import
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
# tree_sitter_languages is an optional dependency. Any failure while importing
# it (not only ImportError) simply disables the tree-sitter code paths; the
# flag below is what _get_parser() consults before touching get_parser.
try:
    from tree_sitter_languages import get_language, get_parser  # type: ignore

    _HAS_TREE_SITTER = True
except Exception:
    _HAS_TREE_SITTER = False

# Cache loaded parsers by language name. A value of None records a language
# whose parser could not be created, so the failing load is not retried.
_PARSERS: dict[str, Any] = {}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _get_parser(language: str) -> Any | None:
    """Return the cached tree-sitter parser for *language*, loading it on first use.

    Args:
        language: Tree-sitter language name handed to ``get_parser``.

    Returns:
        The parser object, or ``None`` when tree-sitter is unavailable or the
        parser for this language could not be created.
    """
    if not _HAS_TREE_SITTER:
        return None

    try:
        return _PARSERS[language]
    except KeyError:
        pass  # first request for this language: load it below

    try:
        loaded = get_parser(language)
    except Exception:
        # Remember the failure so later calls do not retry the load.
        loaded = None

    _PARSERS[language] = loaded
    return loaded
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# Public API
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def extract_metadata(source: str, file_path: str, profile: Optional[LanguageProfile] = None) -> dict[str, Any]:
    """Build the metadata dict describing one source file.

    Args:
        source: Full text of the file. An empty string skips import/export
            extraction and yields empty lists for both.
        file_path: Path of the file; used to look up a profile by extension
            (when *profile* is None) and for the path-based inferences.
        profile: Language profile to use, or None to resolve one from the
            file extension.

    Returns:
        A dict with the keys:
            imports   : list[str] — imported URIs/modules
            exports   : list[str] — declaration names
            layer     : str — architecture layer inferred from path
            feature   : str — feature name inferred from path
            file_type : str — file role inferred from path
        When no profile can be resolved, the lists are empty and the three
        string fields are "unknown".
    """
    resolved = profile
    if resolved is None:
        resolved = get_profile_for_extension(Path(file_path).suffix)

    if resolved is None:
        # No profile — nothing can be inferred, return the defaults.
        return {
            "imports": [],
            "exports": [],
            "layer": "unknown",
            "feature": "unknown",
            "file_type": "unknown",
        }

    # The parser is resolved even for empty sources, which keeps the parser
    # cache warm exactly as before.
    parser = None
    if resolved.tree_sitter_language:
        parser = _get_parser(resolved.tree_sitter_language)

    imports: list[str] = []
    exports: list[str] = []
    if source:
        if parser is None:
            imports = _extract_imports_regex(source, resolved)
            exports = _extract_exports_regex(source, resolved)
        else:
            imports = _extract_imports_tree_sitter(source, parser, resolved)
            exports = _extract_exports_tree_sitter(source, parser, resolved)

    return {
        "imports": imports,
        "exports": exports,
        "layer": _infer_layer(file_path, resolved),
        "feature": _infer_feature(file_path, resolved),
        "file_type": _infer_file_type(file_path, resolved),
    }
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def chunk_source_file(
    source: str,
    file_path: str,
    profile: Optional[LanguageProfile] = None,
    chunk_size: int = 1000,
) -> list[dict[str, Any]]:
    """Split a source file into chunks, each paired with the file's metadata.

    The file-level metadata is computed once with ``extract_metadata`` and the
    same dict object is attached to every chunk.

    Splitting strategy:
      1. Without a language profile, the whole source is one chunk.
      2. With a profile and an available tree-sitter parser, chunking is
         delegated to ``_chunk_tree_sitter``.
      3. Otherwise chunking is delegated to ``_chunk_regex``.

    Args:
        source: Full text of the file.
        file_path: Path used for profile lookup and metadata inference.
        profile: Language profile, or None to resolve one from the extension.
        chunk_size: Target chunk size in characters, passed to the chunker.

    Returns:
        A list of dicts with the keys:
            text     : str  — the chunk text
            metadata : dict — output of extract_metadata for the whole file
    """
    active_profile = profile
    if active_profile is None:
        active_profile = get_profile_for_extension(Path(file_path).suffix)

    file_metadata = extract_metadata(source, file_path, active_profile)

    if active_profile is None:
        # No profile — return source as a single chunk
        return [{"text": source, "metadata": file_metadata}]

    language = active_profile.tree_sitter_language
    parser = _get_parser(language) if language else None

    pieces = (
        _chunk_regex(source, active_profile, chunk_size)
        if parser is None
        else _chunk_tree_sitter(source, parser, active_profile, chunk_size)
    )

    chunk_dicts: list[dict[str, Any]] = []
    for piece in pieces:
        chunk_dicts.append({"text": piece, "metadata": file_metadata})
    return chunk_dicts
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# Tree-sitter implementations
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _extract_imports_tree_sitter(source: str, parser: Any, profile: LanguageProfile) -> list[str]:
    """Parse *source* with tree-sitter and collect its import URIs/modules.

    Args:
        source: Full text of the file; encoded to bytes before parsing.
        parser: Tree-sitter parser exposing ``parse``.
        profile: Profile consumed by ``_walk_for_imports``.

    Returns:
        The import strings gathered by ``_walk_for_imports`` from the root node.
    """
    collected: list[str] = []
    root = parser.parse(source.encode()).root_node
    _walk_for_imports(root, collected, profile)
    return collected
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _walk_for_imports(node: Any, imports: list[str], profile: LanguageProfile) -> None:
    """Collect import strings from *node* and all of its descendants.

    For every node whose type is listed in ``profile.import_node_types``, each
    direct child of type ``profile.string_literal_type`` is decoded, stripped
    of surrounding single/double quote characters and appended to *imports*.
    Nodes are visited in pre-order (parent first, then children left to right).

    The traversal uses an explicit stack instead of recursion so that very
    deeply nested syntax trees cannot raise ``RecursionError``.

    Args:
        node: Root of the (sub)tree to scan; must expose ``type``, ``text``
            and ``children``.
        imports: Output list, extended in place.
        profile: Supplies ``import_node_types`` and ``string_literal_type``.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        children = current.children
        if current.type in profile.import_node_types:
            imports.extend(
                child.text.decode().strip("'\"")
                for child in children
                if child.type == profile.string_literal_type
            )
        # Reversed so the left-most child is popped (visited) first.
        pending.extend(reversed(children))
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _extract_exports_tree_sitter(source: str, parser: Any, profile: LanguageProfile) -> list[str]:
    """Parse *source* with tree-sitter and collect declaration names.

    Args:
        source: Full text of the file; encoded to bytes before parsing.
        parser: Tree-sitter parser exposing ``parse``.
        profile: Profile consumed by ``_walk_for_exports``.

    Returns:
        The names gathered by ``_walk_for_exports`` starting at the root node.
    """
    names: list[str] = []
    root = parser.parse(source.encode()).root_node
    _walk_for_exports(root, names, profile)
    return names
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _walk_for_exports(node: Any, exports: list[str], profile: LanguageProfile) -> None:
    """Collect declaration names from *node* and all of its descendants.

    For every node whose type is listed in ``profile.export_node_types``, the
    text of its first direct child of type ``"identifier"`` (if any) is
    appended to *exports*. Matching nodes are considered at every depth, not
    only at the top level. Nodes are visited in pre-order.

    The traversal uses an explicit stack instead of recursion so that very
    deeply nested syntax trees cannot raise ``RecursionError``.

    Args:
        node: Root of the (sub)tree to scan; must expose ``type``, ``text``
            and ``children``.
        exports: Output list, extended in place.
        profile: Supplies ``export_node_types``.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        children = current.children
        if current.type in profile.export_node_types:
            name_node = next((c for c in children if c.type == "identifier"), None)
            if name_node is not None:
                exports.append(name_node.text.decode())
        # Reversed so the left-most child is popped (visited) first.
        pending.extend(reversed(children))
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _chunk_tree_sitter(source: str, parser: Any, profile: LanguageProfile, chunk_size: int) -> list[str]:
    """Split source at top-level declaration boundaries using tree-sitter.

    Top-level nodes whose type is in ``profile.boundary_node_types`` are
    grouped greedily: a new chunk is started when adding the next declaration
    would push the current chunk past *chunk_size* characters. A single
    declaration larger than *chunk_size* is kept whole.

    Top-level nodes that are NOT boundary nodes (imports, comments, constants,
    ...) are kept rather than discarded, mirroring the regex chunker which
    retains the preamble: they are attached to the declaration that follows
    them, and any trailing ones are attached to the last chunk.

    Args:
        source: Full text of the file.
        parser: Tree-sitter parser exposing ``parse``.
        profile: Supplies ``boundary_node_types``.
        chunk_size: Soft upper bound for a chunk, in characters.

    Returns:
        The chunk texts (node texts joined with newlines, stripped). When the
        file has no boundary node at all, ``[source]`` is returned unchanged.
    """
    tree = parser.parse(source.encode())
    chunks: list[str] = []
    current: list[str] = []
    current_len = 0
    # Non-boundary nodes seen since the last declaration; they travel with
    # the next declaration so leading comments stay next to their code.
    carried: list[str] = []
    saw_boundary = False

    for node in tree.root_node.children:
        text = node.text.decode()
        if node.type not in profile.boundary_node_types:
            carried.append(text)
            continue

        saw_boundary = True
        unit = carried + [text]
        carried = []
        unit_len = sum(len(part) for part in unit)

        if current and current_len + unit_len > chunk_size:
            chunks.append("\n".join(current).strip())
            current = []
            current_len = 0
        current.extend(unit)
        current_len += unit_len

    if not saw_boundary:
        # Nothing to split on: hand back the source exactly as given.
        return [source]

    current.extend(carried)  # trailing non-declaration nodes
    if current:
        chunks.append("\n".join(current).strip())

    return chunks or [source]
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# ---------------------------------------------------------------------------
|
|
193
|
+
# Regex implementations
|
|
194
|
+
# ---------------------------------------------------------------------------
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _extract_imports_regex(source: str, profile: LanguageProfile) -> list[str]:
    """Extract import targets from *source* with ``profile.import_pattern``.

    The pattern is compiled with ``re.MULTILINE``. For each match, the first
    non-empty capture group is taken (different alternatives of a pattern may
    populate different groups); matches without any non-empty group add
    nothing.

    Args:
        source: Full text of the file.
        profile: Supplies ``import_pattern``; a falsy pattern yields ``[]``.

    Returns:
        One string per contributing match, in match order.
    """
    pattern = profile.import_pattern
    if not pattern:
        return []

    matcher = re.compile(pattern, re.MULTILINE)
    first_groups = (
        next((group for group in found.groups() if group), None)
        for found in matcher.finditer(source)
    )
    return [group for group in first_groups if group is not None]
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _extract_exports_regex(source: str, profile: LanguageProfile) -> list[str]:
    """Extract declaration names from *source* with ``profile.export_pattern``.

    Args:
        source: Full text of the file.
        profile: Supplies ``export_pattern``; a falsy pattern yields ``[]``.

    Returns:
        The result of ``findall`` for the pattern compiled with
        ``re.MULTILINE``.
    """
    pattern = profile.export_pattern
    if pattern:
        return re.compile(pattern, re.MULTILINE).findall(source)
    return []
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _chunk_regex(source: str, profile: LanguageProfile, chunk_size: int) -> list[str]:
    """Split source at declaration boundaries found with a regex.

    Behaviour, in order:
      * A source of at most *chunk_size* characters is returned as one chunk.
      * Without a boundary pattern, or when the pattern never matches, the
        source is cut into consecutive fixed-size windows of *chunk_size*.
      * Otherwise any non-blank text before the first boundary becomes its
        own chunk, and the boundary-delimited segments that follow are merged
        greedily (joined by a blank line) while the merged text stays within
        *chunk_size*.

    Args:
        source: Full text of the file.
        profile: Supplies ``boundary_pattern`` (compiled with re.MULTILINE).
        chunk_size: Target chunk size in characters.

    Returns:
        The list of chunk texts.
    """
    if len(source) <= chunk_size:
        return [source]

    pattern = profile.boundary_pattern
    cut_points: list[int] = []
    if pattern:
        cut_points = [hit.start() for hit in re.compile(pattern, re.MULTILINE).finditer(source)]

    if not cut_points:
        # No usable boundaries — fall back to size-based splitting.
        return [source[pos : pos + chunk_size] for pos in range(0, len(source), chunk_size)]

    pieces: list[str] = []

    # Preamble (imports, top-level comments) ahead of the first boundary.
    preamble = source[: cut_points[0]].strip()
    if preamble:
        pieces.append(preamble)

    buffered = ""
    segment_ends = cut_points[1:] + [len(source)]
    for begin, finish in zip(cut_points, segment_ends):
        segment = source[begin:finish].strip()
        if not buffered:
            buffered = segment
        elif len(buffered) + len(segment) > chunk_size:
            pieces.append(buffered.strip())
            buffered = segment
        else:
            buffered = (buffered + "\n\n" + segment).strip()

    if buffered:
        pieces.append(buffered.strip())

    return pieces or [source]
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
# ---------------------------------------------------------------------------
|
|
257
|
+
# Path-based inference helpers
|
|
258
|
+
# ---------------------------------------------------------------------------
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _infer_layer(file_path: str, profile: LanguageProfile) -> str:
    """Infer the architecture layer of a file from its directory names.

    The first entry of ``profile.layer_parts`` that appears as a complete
    directory component of *file_path* wins.

    Relative paths are matched as if rooted at ``/`` so that a layer
    directory at the very start of the path (e.g. ``domain/user.py``) is
    recognised as well as a nested one (``lib/domain/user.py``).

    Args:
        file_path: POSIX-style path of the file, relative or absolute.
        profile: Supplies ``layer_parts``.

    Returns:
        The matching layer name, or ``"unknown"``.
    """
    rooted = file_path if file_path.startswith("/") else f"/{file_path}"
    for part in profile.layer_parts:
        if f"/{part}/" in rooted:
            return part
    return "unknown"
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _infer_feature(file_path: str, profile: LanguageProfile) -> str:
    """Infer the feature a file belongs to from its path.

    ``profile.feature_regex`` (when set) is searched in the unmodified
    *file_path* and its first capture group is returned on a match. Failing
    that, a ``core`` directory component maps to the feature ``"core"``.

    For the ``core`` check, relative paths are matched as if rooted at ``/``
    so that a leading ``core/`` directory is recognised as well as a nested
    one.

    Args:
        file_path: POSIX-style path of the file, relative or absolute.
        profile: Supplies ``feature_regex``.

    Returns:
        The feature name, ``"core"``, or ``"unknown"``.
    """
    if profile.feature_regex:
        match = re.search(profile.feature_regex, file_path)
        if match:
            return match.group(1)

    rooted = file_path if file_path.startswith("/") else f"/{file_path}"
    if "/core/" in rooted:
        return "core"
    return "unknown"
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _infer_file_type(file_path: str, profile: LanguageProfile) -> str:
    """Infer the role of a file from its name and location.

    Checks, in order:
      1. ``profile.file_type_suffixes`` — (suffix, file_type) pairs compared
         case-insensitively against the end of the file stem.
      2. ``profile.directory_type_patterns`` — mapping of substring to
         file_type, tested against the whole *file_path*.

    Args:
        file_path: Path of the file.
        profile: Supplies the two lookup tables above.

    Returns:
        The first matching file type, or ``"unknown"``.
    """
    lowered_stem = Path(file_path).stem.lower()
    no_match = object()  # sentinel: a table entry may legitimately map to any value

    by_name = next(
        (kind for suffix, kind in profile.file_type_suffixes if lowered_stem.endswith(suffix.lower())),
        no_match,
    )
    if by_name is not no_match:
        return by_name

    by_directory = next(
        (kind for fragment, kind in profile.directory_type_patterns.items() if fragment in file_path),
        no_match,
    )
    if by_directory is not no_match:
        return by_directory

    return "unknown"
|
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
"""Full indexing pipeline for Sourcefire.
|
|
2
|
+
|
|
3
|
+
Scans a codebase, chunks files (AST-aware when a language profile exists,
|
|
4
|
+
simple split otherwise), embeds all chunks, and inserts them into ChromaDB.
|
|
5
|
+
Also builds the import graph for graph-augmented retrieval.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import fnmatch
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import chromadb
|
|
15
|
+
|
|
16
|
+
from sourcefire.config import SourcefireConfig
|
|
17
|
+
from sourcefire.db import add_chunks, reset_collection, delete_file_chunks, get_indexed_files, get_stored_mtimes
|
|
18
|
+
from sourcefire.indexer.embeddings import embed_batch
|
|
19
|
+
from sourcefire.indexer.language_profiles import LanguageProfile, get_profile, get_profile_for_extension
|
|
20
|
+
from sourcefire.indexer.metadata import chunk_source_file, extract_metadata
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# .gitignore parsing
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _parse_gitignore(codebase_path: Path) -> list[str]:
    """Parse .gitignore and return a list of glob patterns to exclude.

    The returned patterns are meant for ``fnmatch`` against POSIX-style paths
    relative to *codebase_path* (where ``*`` also matches ``/``). Each
    .gitignore entry is expanded so that the usual gitignore meaning survives
    that translation:

      * blank lines and ``#`` comments are skipped;
      * ``!negation`` entries are skipped — they cannot be expressed as an
        exclusion pattern, and treating them as one would be wrong;
      * an entry may name a directory, so ``<entry>/**`` is always emitted;
        entries without a trailing ``/`` also emit ``<entry>`` itself;
      * entries containing no ``/`` other than a trailing one (or starting
        with ``**/``) apply at any depth, so ``*/``-prefixed variants are
        emitted too; entries with a leading or inner ``/`` stay anchored to
        the codebase root.

    Args:
        codebase_path: Directory expected to contain the ``.gitignore`` file.

    Returns:
        The exclusion patterns; empty when the file is missing or unreadable.
    """
    gitignore_path = codebase_path / ".gitignore"
    if not gitignore_path.is_file():
        return []

    try:
        content = gitignore_path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        # Best effort: indexing continues without .gitignore exclusions, but
        # the reason is reported instead of being dropped silently.
        print(f"[pipeline] Warning: could not read {gitignore_path}: {exc}")
        return []

    patterns: list[str] = []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or line.startswith("!"):
            continue

        directory_only = line.endswith("/")
        body = line.strip("/")
        if not body:
            continue

        # A slash at the start or in the middle anchors the entry to the root.
        anchored = line.startswith("/") or "/" in body
        if body.startswith("**/"):
            # "**/name" explicitly means "name at any depth".
            body = body[3:]
            anchored = False

        variants = [f"{body}/**"] if directory_only else [body, f"{body}/**"]
        patterns.extend(variants)
        if not anchored:
            patterns.extend(f"*/{variant}" for variant in variants)

    return patterns
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# Pattern matching helpers
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _match_patterns(rel_path: str, patterns: list[str]) -> bool:
    """Tell whether *rel_path* is matched by at least one fnmatch pattern.

    Args:
        rel_path: Path to test.
        patterns: Glob patterns understood by ``fnmatch.fnmatch``.

    Returns:
        True on the first matching pattern, False when none match (including
        when *patterns* is empty).
    """
    return any(fnmatch.fnmatch(rel_path, candidate) for candidate in patterns)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
# Non-AST chunker (simple recursive text split)
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _chunk_plain_text(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 300,
) -> list[str]:
    """Split *text* into overlapping chunks of at most *chunk_size* characters.

    Each chunk after the first starts ``chunk_size - chunk_overlap``
    characters after the previous one; the last chunk ends at the end of the
    text and may be shorter.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared between consecutive
            chunks. An overlap that is not smaller than *chunk_size* would
            never advance through the text, so it is replaced by half of
            *chunk_size* in that case.

    Returns:
        The list of chunks; ``[text]`` when the text already fits.

    Raises:
        ValueError: If the text does not fit in one chunk and *chunk_size*
            is not positive (no forward progress would be possible).
    """
    if len(text) <= chunk_size:
        return [text]
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    overlap = chunk_overlap
    if overlap >= chunk_size:
        # Guarantee forward progress (step >= 1) instead of looping forever.
        overlap = chunk_size // 2
    step = chunk_size - overlap

    chunks: list[str] = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            break

    return chunks
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
# File scanning
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _collect_files(
    codebase_path: Path,
    config: SourcefireConfig,
    profile: LanguageProfile | None,
) -> list[Path]:
    """Return the files under *codebase_path* that pass the include/exclude filters.

    Pattern sources, in priority order:
      * ``config.include`` / ``config.exclude`` when non-empty;
      * otherwise the profile's ``include_patterns`` / ``exclude_patterns``
        (each list falls back independently);
      * if there are still no include patterns, ``**/*`` is used.
    Patterns parsed from ``.gitignore`` are always added to the exclusions.

    Args:
        codebase_path: Root directory to scan.
        config: Supplies the ``include`` and ``exclude`` pattern lists.
        profile: Optional language profile used as a fallback.

    Returns:
        Matching regular files, without duplicates, in order of first
        discovery across the include patterns.
    """
    includes: list[str] = list(config.include or [])
    excludes: list[str] = list(config.exclude or [])

    # Config patterns are authoritative; the profile only fills the gaps.
    if profile and not includes:
        includes = list(profile.include_patterns)
    if profile and not excludes:
        excludes = list(profile.exclude_patterns)

    # Always exclude .gitignore patterns
    excludes.extend(_parse_gitignore(codebase_path))

    if not includes:
        includes = ["**/*"]

    already_seen: set[Path] = set()
    selected: list[Path] = []
    for glob_pattern in includes:
        for candidate in codebase_path.glob(glob_pattern):
            if not candidate.is_file():
                continue
            relative = candidate.relative_to(codebase_path).as_posix()
            if _match_patterns(relative, excludes):
                continue
            if candidate in already_seen:
                continue  # already picked up by an earlier include pattern
            already_seen.add(candidate)
            selected.append(candidate)

    return selected
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ---------------------------------------------------------------------------
|
|
139
|
+
# Chunk production
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _chunks_for_file(
    file_path: Path,
    codebase_path: Path,
    profile: LanguageProfile | None,
    chunk_size: int = 1000,
) -> list[dict[str, Any]]:
    """Read *file_path* and return its chunk dicts.

    The profile registered for the file's extension is preferred; *profile*
    is the fallback. When the chosen profile lists the file's extension, the
    file is chunked with ``chunk_source_file``; otherwise it is split with
    ``_chunk_plain_text`` and only path-based metadata is computed.

    Args:
        file_path: File to read (decoded as UTF-8, undecodable bytes replaced).
        codebase_path: Root used to derive the relative POSIX path.
        profile: Fallback language profile, may be None.
        chunk_size: Target chunk size in characters.

    Returns:
        One dict per chunk with the keys ``filename``, ``location``
        (``"<relative path>:<chunk index>"``), ``code``, ``feature``,
        ``layer`` and ``file_type``.
    """
    rel = file_path.relative_to(codebase_path).as_posix()
    source = file_path.read_text(encoding="utf-8", errors="replace")

    file_profile = get_profile_for_extension(file_path.suffix) or profile

    # Normalise both strategies to (text, metadata) pairs.
    if file_profile and file_path.suffix in list(file_profile.file_extensions):
        structured = chunk_source_file(source, rel, file_profile, chunk_size=chunk_size)
        pairs = [(entry["text"], entry["metadata"]) for entry in structured]
    else:
        path_meta = extract_metadata("", rel, file_profile)
        pairs = [(piece, path_meta) for piece in _chunk_plain_text(source, chunk_size=chunk_size)]

    records: list[dict[str, Any]] = []
    for position, (text, meta) in enumerate(pairs):
        records.append({
            "filename": rel,
            "location": f"{rel}:{position}",
            "code": text,
            "feature": meta.get("feature", ""),
            "layer": meta.get("layer", ""),
            "file_type": meta.get("file_type", ""),
        })
    return records
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# ---------------------------------------------------------------------------
|
|
186
|
+
# Public entry points
|
|
187
|
+
# ---------------------------------------------------------------------------
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def run_indexing(
    collection: chromadb.Collection,
    config: SourcefireConfig,
    client: chromadb.ClientAPI | None = None,
    full: bool = True,
) -> dict[str, Any]:
    """Run the indexing pipeline.

    Progress is reported with ``print`` using a ``[pipeline]`` prefix.

    Args:
        collection: ChromaDB collection to write to.
        config: Sourcefire configuration. ``project_dir``, ``language``,
            ``chunk_size`` and (via ``_collect_files``) the include/exclude
            patterns are read.
        client: ChromaDB client (needed for full reset).
        full: If True, reset collection and re-index everything.
              If False, incremental — compare file mtimes and only
              re-index changed/new files, delete removed files.
              Note: with ``full=True`` but no *client*, every file is
              re-indexed into the given collection WITHOUT resetting it.

    Returns:
        A stats dict with keys: files, chunks, edges, language, import_edges.
        ``files`` is always the number of files found on disk. ``chunks``,
        ``edges`` and ``import_edges`` describe only the files processed in
        this run, except for the "up to date" early return where ``chunks``
        is the collection's current count and ``import_edges`` is empty.
    """
    codebase_path = config.project_dir
    print(f"[pipeline] Scanning codebase at: {codebase_path}")

    # "auto" means: let get_profile() detect the language itself.
    language_override = config.language if config.language != "auto" else None
    profile = get_profile(codebase_path, language_override)
    lang_name = profile.language if profile else "generic"
    print(f"[pipeline] Detected language: {lang_name}")

    # Collect all files on disk
    all_disk_files = _collect_files(codebase_path, config, profile)
    print(f"[pipeline] Found {len(all_disk_files)} files to index.")

    if not all_disk_files:
        print("[pipeline] Error: No source files found matching the configured patterns.")
        print("Run `sourcefire --reinit` to regenerate patterns, or edit .sourcefire/config.toml manually.")
        return {
            "files": 0, "chunks": 0, "edges": 0,
            "language": lang_name, "import_edges": {},
        }

    # Determine which files to process
    if full and client:
        # reset_collection() returns the collection to use from here on.
        collection = reset_collection(client)
        print("[pipeline] Collection reset for full re-index.")
        files_to_index = all_disk_files
    elif not full:
        # Incremental: compare mtimes
        indexed_files = get_indexed_files(collection)
        stored_mtimes = get_stored_mtimes(collection)

        # Relative POSIX path -> absolute path for everything currently on disk.
        current_files: dict[str, Path] = {}
        for f in all_disk_files:
            rel = f.relative_to(codebase_path).as_posix()
            current_files[rel] = f

        # Find changed/new files
        changed: list[Path] = []
        for rel, f in current_files.items():
            # A missing stored mtime defaults to 0.0, so any positive on-disk
            # mtime compares as newer.
            stored_mtime = stored_mtimes.get(rel, 0.0)
            if rel not in indexed_files or f.stat().st_mtime > stored_mtime:
                changed.append(f)

        # Find deleted files
        deleted = indexed_files - set(current_files.keys())
        for rel in deleted:
            delete_file_chunks(collection, rel)

        if not changed and not deleted:
            print("[pipeline] Index is up to date.")
            return {
                "files": len(all_disk_files), "chunks": collection.count(), "edges": 0,
                "language": lang_name, "import_edges": {},
            }

        print(f"[pipeline] {len(changed)} changed, {len(deleted)} deleted files.")
        files_to_index = changed

        # Delete old chunks for changed files before re-inserting
        for f in changed:
            rel = f.relative_to(codebase_path).as_posix()
            delete_file_chunks(collection, rel)
    else:
        # full=True without a client: index everything, but the existing
        # collection contents are left in place (no reset is possible).
        files_to_index = all_disk_files

    # Produce chunks and collect imports
    all_chunks: list[dict[str, Any]] = []
    file_imports: dict[str, list[str]] = {}

    for file_path in files_to_index:
        rel = file_path.relative_to(codebase_path).as_posix()
        chunks = _chunks_for_file(file_path, codebase_path, profile, chunk_size=config.chunk_size)
        all_chunks.extend(chunks)

        # Imports are only extracted for files whose extension is covered by
        # the selected profile; the file is read a second time for this.
        file_profile = get_profile_for_extension(file_path.suffix) or profile
        if file_profile and file_path.suffix in file_profile.file_extensions:
            source = file_path.read_text(encoding="utf-8", errors="replace")
            meta = extract_metadata(source, rel, file_profile)
            if meta.get("imports"):
                file_imports[rel] = meta["imports"]

    print(f"[pipeline] Produced {len(all_chunks)} chunks.")

    if not all_chunks:
        return {
            "files": len(all_disk_files), "chunks": 0, "edges": 0,
            "language": lang_name, "import_edges": file_imports,
        }

    # Embed
    print("[pipeline] Embedding chunks...")
    texts = [c["code"] for c in all_chunks]
    embeddings = embed_batch(texts)
    print("[pipeline] Embeddings done.")

    # Build mtime lookup (stored as strings in the chunk metadata)
    file_mtimes: dict[str, str] = {}
    for file_path in files_to_index:
        rel = file_path.relative_to(codebase_path).as_posix()
        file_mtimes[rel] = str(file_path.stat().st_mtime)

    # Insert into ChromaDB in batches
    BATCH_SIZE = 5000
    print("[pipeline] Inserting into ChromaDB...")
    for i in range(0, len(all_chunks), BATCH_SIZE):
        # Chunks and embeddings are sliced with the same bounds so they stay aligned.
        batch = all_chunks[i:i + BATCH_SIZE]
        batch_emb = embeddings[i:i + BATCH_SIZE]
        add_chunks(
            collection,
            ids=[c["location"] for c in batch],
            documents=[c["code"] for c in batch],
            embeddings=batch_emb,
            metadatas=[
                {
                    "filename": c["filename"],
                    "location": c["location"],
                    "feature": c["feature"],
                    "layer": c["layer"],
                    "file_type": c["file_type"],
                    "mtime": file_mtimes.get(c["filename"], "0"),
                }
                for c in batch
            ],
        )
    print(f"[pipeline] Inserted {len(all_chunks)} chunks.")

    # One edge per (file, imported module) pair.
    edge_count = sum(len(v) for v in file_imports.values())
    print(f"[pipeline] Import edges: {edge_count}")

    return {
        "files": len(all_disk_files),
        "chunks": len(all_chunks),
        "edges": edge_count,
        "language": lang_name,
        "import_edges": file_imports,
    }
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def index_files(
    collection: chromadb.Collection,
    file_paths: list[Path],
    config: SourcefireConfig,
    profile: LanguageProfile | None,
) -> dict[str, list[str]]:
    """Index specific files (for incremental re-indexing by the watcher).

    For each path the existing chunks are deleted first; the file is then
    re-chunked, embedded and inserted. A path that is no longer a regular
    file by the time it is processed (deleted or replaced since the change
    was noticed) only has its old chunks removed and is otherwise skipped, so
    one vanished file does not abort the whole batch.

    Args:
        collection: ChromaDB collection to update.
        file_paths: Files to re-index; each must live under
            ``config.project_dir``.
        config: Supplies ``project_dir`` and ``chunk_size``.
        profile: Fallback language profile for files whose extension has no
            profile of its own; may be None.

    Returns:
        The import map for updated files: relative POSIX path -> imports.
        Files without imports are omitted.
    """
    codebase_path = config.project_dir
    file_imports: dict[str, list[str]] = {}
    file_mtimes: dict[str, str] = {}
    all_chunks: list[dict[str, Any]] = []

    for file_path in file_paths:
        rel = file_path.relative_to(codebase_path).as_posix()
        delete_file_chunks(collection, rel)

        if not file_path.is_file():
            # Gone since the event was queued: its chunks are purged above
            # and there is nothing left to read.
            continue

        chunks = _chunks_for_file(file_path, codebase_path, profile, chunk_size=config.chunk_size)
        all_chunks.extend(chunks)
        # Stored as a string in the chunk metadata.
        file_mtimes[rel] = str(file_path.stat().st_mtime)

        file_profile = get_profile_for_extension(file_path.suffix) or profile
        if file_profile and file_path.suffix in file_profile.file_extensions:
            source = file_path.read_text(encoding="utf-8", errors="replace")
            meta = extract_metadata(source, rel, file_profile)
            if meta.get("imports"):
                file_imports[rel] = meta["imports"]

    if not all_chunks:
        return file_imports

    embeddings = embed_batch([c["code"] for c in all_chunks])

    BATCH_SIZE = 5000
    for i in range(0, len(all_chunks), BATCH_SIZE):
        # Chunks and embeddings are sliced with the same bounds so they stay aligned.
        batch = all_chunks[i:i + BATCH_SIZE]
        batch_emb = embeddings[i:i + BATCH_SIZE]
        add_chunks(
            collection,
            ids=[c["location"] for c in batch],
            documents=[c["code"] for c in batch],
            embeddings=batch_emb,
            metadatas=[
                {
                    "filename": c["filename"],
                    "location": c["location"],
                    "feature": c["feature"],
                    "layer": c["layer"],
                    "file_type": c["file_type"],
                    "mtime": file_mtimes.get(c["filename"], "0"),
                }
                for c in batch
            ],
        )

    return file_imports
|