sourcefire 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
1
+ """Language-agnostic code metadata extractor.
2
+
3
+ Extracts structural metadata from source files using tree-sitter (when available)
4
+ or regex fallback. Language-specific behavior is driven by LanguageProfile.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from pathlib import Path
11
+ from typing import Any, Optional
12
+
13
+ from sourcefire.indexer.language_profiles import LanguageProfile, get_profile_for_extension
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Optional tree-sitter import
17
+ # ---------------------------------------------------------------------------
18
+ try:
19
+ from tree_sitter_languages import get_language, get_parser # type: ignore
20
+
21
+ _HAS_TREE_SITTER = True
22
+ except Exception:
23
+ _HAS_TREE_SITTER = False
24
+
25
+ # Cache loaded parsers by language name
26
+ _PARSERS: dict[str, Any] = {}
27
+
28
+
29
def _get_parser(language: str) -> Any | None:
    """Return the cached tree-sitter parser for *language*, or None.

    None is returned when the optional tree-sitter import failed or when
    loading the parser for *language* raised. A failed load is cached as None
    as well, so each language is attempted at most once per process.
    """
    if not _HAS_TREE_SITTER:
        return None

    try:
        return _PARSERS[language]
    except KeyError:
        pass  # first request for this language: load it below

    try:
        loaded = get_parser(language)
    except Exception:
        # Best effort: an unsupported grammar just disables tree-sitter here.
        loaded = None

    _PARSERS[language] = loaded
    return loaded
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Public API
43
+ # ---------------------------------------------------------------------------
44
+
45
+
46
def extract_metadata(source: str, file_path: str, profile: Optional[LanguageProfile] = None) -> dict[str, Any]:
    """Build the metadata dict describing one source file.

    When *profile* is None it is looked up from the extension of *file_path*.
    If no profile can be found, every path-derived field is "unknown" and the
    import/export lists are empty.

    Keys:
        imports   : list[str] — imported URIs/modules
        exports   : list[str] — top-level declaration names
        layer     : str — architecture layer inferred from path
        feature   : str — feature name inferred from path
        file_type : str — file role inferred from path
    """
    if profile is None:
        profile = get_profile_for_extension(Path(file_path).suffix)
        if profile is None:
            # Nothing is known about this file type: path-only defaults.
            return {
                "imports": [],
                "exports": [],
                "layer": "unknown",
                "feature": "unknown",
                "file_type": "unknown",
            }

    parser = _get_parser(profile.tree_sitter_language) if profile.tree_sitter_language else None

    if not source:
        imports = []
        exports = []
    elif parser is None:
        # No usable grammar: fall back to the profile's regex patterns.
        imports = _extract_imports_regex(source, profile)
        exports = _extract_exports_regex(source, profile)
    else:
        imports = _extract_imports_tree_sitter(source, parser, profile)
        exports = _extract_exports_tree_sitter(source, parser, profile)

    return {
        "imports": imports,
        "exports": exports,
        "layer": _infer_layer(file_path, profile),
        "feature": _infer_feature(file_path, profile),
        "file_type": _infer_file_type(file_path, profile),
    }
89
+
90
+
91
def chunk_source_file(
    source: str,
    file_path: str,
    profile: Optional[LanguageProfile] = None,
    chunk_size: int = 1000,
) -> list[dict[str, Any]]:
    """Split a source file into chunks and attach metadata to each chunk.

    Splitting strategy:
      1. If there is no language profile, or the source is no longer than
         *chunk_size*, return the whole source as a single chunk.
      2. If a tree-sitter parser is available for the profile, split at
         declaration boundaries.
      3. Otherwise, use regex boundary splitting.

    Each chunk dict has keys:
        text     : str  — the chunk text
        metadata : dict — output of extract_metadata for the whole file
                          (the same dict object is shared by every chunk)
    """
    if profile is None:
        profile = get_profile_for_extension(Path(file_path).suffix)

    metadata = extract_metadata(source, file_path, profile)

    # Short files are returned verbatim. Previously only the regex path honoured
    # this documented rule; the tree-sitter path kept just the boundary nodes
    # and silently dropped imports/comments from small files.
    if profile is None or len(source) <= chunk_size:
        return [{"text": source, "metadata": metadata}]

    parser = _get_parser(profile.tree_sitter_language) if profile.tree_sitter_language else None

    if parser is not None:
        raw_chunks = _chunk_tree_sitter(source, parser, profile, chunk_size)
    else:
        raw_chunks = _chunk_regex(source, profile, chunk_size)

    return [{"text": text, "metadata": metadata} for text in raw_chunks]
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # Tree-sitter implementations
130
+ # ---------------------------------------------------------------------------
131
+
132
+
133
def _extract_imports_tree_sitter(source: str, parser: Any, profile: LanguageProfile) -> list[str]:
    """Parse *source* with *parser* and collect the import URIs/modules found."""
    found: list[str] = []
    root = parser.parse(source.encode()).root_node
    _walk_for_imports(root, found, profile)
    return found
139
+
140
+
141
+ def _walk_for_imports(node: Any, imports: list[str], profile: LanguageProfile) -> None:
142
+ if node.type in profile.import_node_types:
143
+ for child in node.children:
144
+ if child.type == profile.string_literal_type:
145
+ uri = child.text.decode().strip("'\"")
146
+ imports.append(uri)
147
+ for child in node.children:
148
+ _walk_for_imports(child, imports, profile)
149
+
150
+
151
def _extract_exports_tree_sitter(source: str, parser: Any, profile: LanguageProfile) -> list[str]:
    """Parse *source* with *parser* and collect the declaration names found."""
    names: list[str] = []
    root = parser.parse(source.encode()).root_node
    _walk_for_exports(root, names, profile)
    return names
157
+
158
+
159
+ def _walk_for_exports(node: Any, exports: list[str], profile: LanguageProfile) -> None:
160
+ if node.type in profile.export_node_types:
161
+ for child in node.children:
162
+ if child.type == "identifier":
163
+ exports.append(child.text.decode())
164
+ break
165
+ for child in node.children:
166
+ _walk_for_exports(child, exports, profile)
167
+
168
+
169
+ def _chunk_tree_sitter(source: str, parser: Any, profile: LanguageProfile, chunk_size: int) -> list[str]:
170
+ """Split source at top-level declaration boundaries using tree-sitter."""
171
+ tree = parser.parse(source.encode())
172
+ chunks: list[str] = []
173
+ current: list[str] = []
174
+ current_len = 0
175
+
176
+ for node in tree.root_node.children:
177
+ if node.type in profile.boundary_node_types:
178
+ text = node.text.decode()
179
+ if current_len + len(text) > chunk_size and current:
180
+ chunks.append("\n".join(current).strip())
181
+ current = []
182
+ current_len = 0
183
+ current.append(text)
184
+ current_len += len(text)
185
+
186
+ if current:
187
+ chunks.append("\n".join(current).strip())
188
+
189
+ return chunks or [source]
190
+
191
+
192
+ # ---------------------------------------------------------------------------
193
+ # Regex implementations
194
+ # ---------------------------------------------------------------------------
195
+
196
+
197
+ def _extract_imports_regex(source: str, profile: LanguageProfile) -> list[str]:
198
+ if not profile.import_pattern:
199
+ return []
200
+ regex = re.compile(profile.import_pattern, re.MULTILINE)
201
+ results = []
202
+ for m in regex.finditer(source):
203
+ # Take the first non-None group (different patterns use different groups)
204
+ for g in m.groups():
205
+ if g:
206
+ results.append(g)
207
+ break
208
+ return results
209
+
210
+
211
+ def _extract_exports_regex(source: str, profile: LanguageProfile) -> list[str]:
212
+ if not profile.export_pattern:
213
+ return []
214
+ return re.compile(profile.export_pattern, re.MULTILINE).findall(source)
215
+
216
+
217
+ def _chunk_regex(source: str, profile: LanguageProfile, chunk_size: int) -> list[str]:
218
+ """Split source at declaration boundaries using regex."""
219
+ if len(source) <= chunk_size:
220
+ return [source]
221
+
222
+ if not profile.boundary_pattern:
223
+ # No boundary pattern — fall back to size-based splitting
224
+ return [source[i : i + chunk_size] for i in range(0, len(source), chunk_size)]
225
+
226
+ boundary_re = re.compile(profile.boundary_pattern, re.MULTILINE)
227
+ boundaries = [m.start() for m in boundary_re.finditer(source)]
228
+
229
+ if not boundaries:
230
+ return [source[i : i + chunk_size] for i in range(0, len(source), chunk_size)]
231
+
232
+ # Include the preamble (imports, top-level comments) before first boundary.
233
+ segments: list[str] = []
234
+ starts = boundaries + [len(source)]
235
+ if boundaries[0] > 0:
236
+ preamble = source[: boundaries[0]].strip()
237
+ if preamble:
238
+ segments.append(preamble)
239
+
240
+ current_text = ""
241
+ for idx, start in enumerate(boundaries):
242
+ end = starts[idx + 1]
243
+ segment = source[start:end].strip()
244
+ if len(current_text) + len(segment) > chunk_size and current_text:
245
+ segments.append(current_text.strip())
246
+ current_text = segment
247
+ else:
248
+ current_text = (current_text + "\n\n" + segment).strip() if current_text else segment
249
+
250
+ if current_text:
251
+ segments.append(current_text.strip())
252
+
253
+ return segments or [source]
254
+
255
+
256
+ # ---------------------------------------------------------------------------
257
+ # Path-based inference helpers
258
+ # ---------------------------------------------------------------------------
259
+
260
+
261
+ def _infer_layer(file_path: str, profile: LanguageProfile) -> str:
262
+ for part in profile.layer_parts:
263
+ if f"/{part}/" in file_path:
264
+ return part
265
+ return "unknown"
266
+
267
+
268
+ def _infer_feature(file_path: str, profile: LanguageProfile) -> str:
269
+ if profile.feature_regex:
270
+ match = re.search(profile.feature_regex, file_path)
271
+ if match:
272
+ return match.group(1)
273
+ if "/core/" in file_path:
274
+ return "core"
275
+ return "unknown"
276
+
277
+
278
+ def _infer_file_type(file_path: str, profile: LanguageProfile) -> str:
279
+ stem = Path(file_path).stem.lower()
280
+
281
+ for suffix, file_type in profile.file_type_suffixes:
282
+ if stem.endswith(suffix.lower()):
283
+ return file_type
284
+
285
+ for pattern, file_type in profile.directory_type_patterns.items():
286
+ if pattern in file_path:
287
+ return file_type
288
+
289
+ return "unknown"
@@ -0,0 +1,406 @@
1
+ """Full indexing pipeline for Sourcefire.
2
+
3
+ Scans a codebase, chunks files (AST-aware when a language profile exists,
4
+ simple split otherwise), embeds all chunks, and inserts them into ChromaDB.
5
+ Also builds the import graph for graph-augmented retrieval.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import fnmatch
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import chromadb
15
+
16
+ from sourcefire.config import SourcefireConfig
17
+ from sourcefire.db import add_chunks, reset_collection, delete_file_chunks, get_indexed_files, get_stored_mtimes
18
+ from sourcefire.indexer.embeddings import embed_batch
19
+ from sourcefire.indexer.language_profiles import LanguageProfile, get_profile, get_profile_for_extension
20
+ from sourcefire.indexer.metadata import chunk_source_file, extract_metadata
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # .gitignore parsing
25
+ # ---------------------------------------------------------------------------
26
+
27
+
28
+ def _parse_gitignore(codebase_path: Path) -> list[str]:
29
+ """Parse .gitignore and return a list of glob patterns to exclude."""
30
+ gitignore_path = codebase_path / ".gitignore"
31
+ if not gitignore_path.is_file():
32
+ return []
33
+
34
+ patterns: list[str] = []
35
+ try:
36
+ for line in gitignore_path.read_text(encoding="utf-8", errors="replace").splitlines():
37
+ line = line.strip()
38
+ if not line or line.startswith("#"):
39
+ continue
40
+ if line.startswith("/"):
41
+ line = line[1:]
42
+ if line.endswith("/"):
43
+ line = line + "**"
44
+ patterns.append(line)
45
+ except OSError:
46
+ pass
47
+ return patterns
48
+
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Pattern matching helpers
52
+ # ---------------------------------------------------------------------------
53
+
54
+
55
+ def _match_patterns(rel_path: str, patterns: list[str]) -> bool:
56
+ """Return True if *rel_path* matches any of *patterns* using fnmatch."""
57
+ for pattern in patterns:
58
+ if fnmatch.fnmatch(rel_path, pattern):
59
+ return True
60
+ return False
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Non-AST chunker (simple recursive text split)
65
+ # ---------------------------------------------------------------------------
66
+
67
+
68
+ def _chunk_plain_text(
69
+ text: str,
70
+ chunk_size: int = 1000,
71
+ chunk_overlap: int = 300,
72
+ ) -> list[str]:
73
+ """Split *text* into overlapping chunks of at most *chunk_size* characters."""
74
+ if len(text) <= chunk_size:
75
+ return [text]
76
+
77
+ chunks: list[str] = []
78
+ start = 0
79
+ while start < len(text):
80
+ end = start + chunk_size
81
+ chunks.append(text[start:end])
82
+ if end >= len(text):
83
+ break
84
+ start = end - chunk_overlap
85
+
86
+ return chunks
87
+
88
+
89
+ # ---------------------------------------------------------------------------
90
+ # File scanning
91
+ # ---------------------------------------------------------------------------
92
+
93
+
94
def _collect_files(
    codebase_path: Path,
    config: SourcefireConfig,
    profile: LanguageProfile | None,
) -> list[Path]:
    """Gather the files under *codebase_path* that should be indexed.

    Include and exclude globs come from *config*; each list independently
    falls back to the language profile's patterns when the config leaves it
    empty. Patterns parsed from ``.gitignore`` are always added to the
    excludes, and ``**/*`` is used when no include pattern is available.

    The result keeps the order in which files were first matched and contains
    each path only once.
    """
    includes: list[str] = list(config.include or [])
    excludes: list[str] = list(config.exclude or [])

    if profile:
        if not includes:
            includes = list(profile.include_patterns)
        if not excludes:
            excludes = list(profile.exclude_patterns)

    excludes += _parse_gitignore(codebase_path)

    if not includes:
        includes = ["**/*"]

    # A dict doubles as an insertion-ordered set for de-duplication.
    ordered: dict[Path, None] = {}
    for glob_pattern in includes:
        for candidate in codebase_path.glob(glob_pattern):
            if not candidate.is_file():
                continue
            relative = candidate.relative_to(codebase_path).as_posix()
            if _match_patterns(relative, excludes):
                continue
            ordered.setdefault(candidate, None)

    return list(ordered)
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # Chunk production
140
+ # ---------------------------------------------------------------------------
141
+
142
+
143
def _chunks_for_file(
    file_path: Path,
    codebase_path: Path,
    profile: LanguageProfile | None,
    chunk_size: int = 1000,
) -> list[dict[str, Any]]:
    """Read *file_path* and turn it into chunk records ready for indexing.

    The profile registered for the file's extension is preferred over the
    project-wide *profile*. When the chosen profile lists the extension, the
    file is split with ``chunk_source_file``; otherwise it is split as plain
    text and only path-derived metadata is attached.

    Each record has the keys ``filename``, ``location`` (``"<rel>:<index>"``),
    ``code``, ``feature``, ``layer`` and ``file_type``.
    """
    rel = file_path.relative_to(codebase_path).as_posix()
    source = file_path.read_text(encoding="utf-8", errors="replace")

    file_profile = get_profile_for_extension(file_path.suffix) or profile
    profile_applies = bool(file_profile) and file_path.suffix in list(file_profile.file_extensions)

    if profile_applies:
        pairs = [
            (chunk["text"], chunk["metadata"])
            for chunk in chunk_source_file(source, rel, file_profile, chunk_size=chunk_size)
        ]
    else:
        # Empty source: only the path-based fields of the metadata are needed.
        shared_meta = extract_metadata("", rel, file_profile)
        pairs = [(text, shared_meta) for text in _chunk_plain_text(source, chunk_size=chunk_size)]

    return [
        {
            "filename": rel,
            "location": f"{rel}:{idx}",
            "code": text,
            "feature": meta.get("feature", ""),
            "layer": meta.get("layer", ""),
            "file_type": meta.get("file_type", ""),
        }
        for idx, (text, meta) in enumerate(pairs)
    ]
183
+
184
+
185
+ # ---------------------------------------------------------------------------
186
+ # Public entry points
187
+ # ---------------------------------------------------------------------------
188
+
189
+
190
def run_indexing(
    collection: chromadb.Collection,
    config: SourcefireConfig,
    client: chromadb.ClientAPI | None = None,
    full: bool = True,
) -> dict[str, Any]:
    """Run the indexing pipeline: scan, chunk, embed, and store in ChromaDB.

    Args:
        collection: ChromaDB collection to write to.
        config: Sourcefire configuration (project_dir, language, chunk_size and
            the include/exclude patterns are read from it).
        client: ChromaDB client. The collection is only reset when this is
            provided together with ``full=True``.
        full: If True, re-index every file found on disk (after resetting the
            collection when *client* is given). If False, incremental —
            compare file mtimes and only re-index changed/new files, and
            delete the chunks of removed files.

    Returns:
        A stats dict with keys:
            files        — number of files found on disk (not just re-indexed)
            chunks       — chunks produced in this run, or the collection's
                           total count when the index was already up to date
            edges        — total number of imports collected in this run
            language     — detected profile language, or "generic"
            import_edges — mapping of relative path -> list of imports, only
                           for files processed in this run
    """
    codebase_path = config.project_dir
    print(f"[pipeline] Scanning codebase at: {codebase_path}")

    # "auto" means: let get_profile pick the language itself.
    language_override = config.language if config.language != "auto" else None
    profile = get_profile(codebase_path, language_override)
    lang_name = profile.language if profile else "generic"
    print(f"[pipeline] Detected language: {lang_name}")

    # Collect all files on disk
    all_disk_files = _collect_files(codebase_path, config, profile)
    print(f"[pipeline] Found {len(all_disk_files)} files to index.")

    if not all_disk_files:
        print("[pipeline] Error: No source files found matching the configured patterns.")
        print("Run `sourcefire --reinit` to regenerate patterns, or edit .sourcefire/config.toml manually.")
        return {
            "files": 0, "chunks": 0, "edges": 0,
            "language": lang_name, "import_edges": {},
        }

    # Determine which files to process
    if full and client:
        # Rebind to the collection returned by the reset.
        collection = reset_collection(client)
        print("[pipeline] Collection reset for full re-index.")
        files_to_index = all_disk_files
    elif not full:
        # Incremental: compare mtimes
        # NOTE(review): assumes get_indexed_files returns a set of relative
        # paths and get_stored_mtimes a dict of relative path -> float mtime
        # — confirm against sourcefire.db.
        indexed_files = get_indexed_files(collection)
        stored_mtimes = get_stored_mtimes(collection)

        current_files: dict[str, Path] = {}
        for f in all_disk_files:
            rel = f.relative_to(codebase_path).as_posix()
            current_files[rel] = f

        # Find changed/new files
        changed: list[Path] = []
        for rel, f in current_files.items():
            stored_mtime = stored_mtimes.get(rel, 0.0)
            if rel not in indexed_files or f.stat().st_mtime > stored_mtime:
                changed.append(f)

        # Find deleted files
        deleted = indexed_files - set(current_files.keys())
        for rel in deleted:
            delete_file_chunks(collection, rel)

        if not changed and not deleted:
            print("[pipeline] Index is up to date.")
            return {
                "files": len(all_disk_files), "chunks": collection.count(), "edges": 0,
                "language": lang_name, "import_edges": {},
            }

        print(f"[pipeline] {len(changed)} changed, {len(deleted)} deleted files.")
        files_to_index = changed

        # Delete old chunks for changed files before re-inserting
        for f in changed:
            rel = f.relative_to(codebase_path).as_posix()
            delete_file_chunks(collection, rel)
    else:
        # full=True without a client: everything is re-indexed, but the
        # collection is neither reset nor are existing chunks deleted first.
        files_to_index = all_disk_files

    # Produce chunks and collect imports
    all_chunks: list[dict[str, Any]] = []
    file_imports: dict[str, list[str]] = {}

    for file_path in files_to_index:
        rel = file_path.relative_to(codebase_path).as_posix()
        chunks = _chunks_for_file(file_path, codebase_path, profile, chunk_size=config.chunk_size)
        all_chunks.extend(chunks)

        # Same profile selection as in _chunks_for_file; the file is read a
        # second time here to extract its imports.
        file_profile = get_profile_for_extension(file_path.suffix) or profile
        if file_profile and file_path.suffix in file_profile.file_extensions:
            source = file_path.read_text(encoding="utf-8", errors="replace")
            meta = extract_metadata(source, rel, file_profile)
            if meta.get("imports"):
                file_imports[rel] = meta["imports"]

    print(f"[pipeline] Produced {len(all_chunks)} chunks.")

    if not all_chunks:
        return {
            "files": len(all_disk_files), "chunks": 0, "edges": 0,
            "language": lang_name, "import_edges": file_imports,
        }

    # Embed
    print("[pipeline] Embedding chunks...")
    texts = [c["code"] for c in all_chunks]
    embeddings = embed_batch(texts)
    print("[pipeline] Embeddings done.")

    # Build mtime lookup (stored as strings in the chunk metadata)
    file_mtimes: dict[str, str] = {}
    for file_path in files_to_index:
        rel = file_path.relative_to(codebase_path).as_posix()
        file_mtimes[rel] = str(file_path.stat().st_mtime)

    # Insert into ChromaDB in batches
    BATCH_SIZE = 5000
    print("[pipeline] Inserting into ChromaDB...")
    for i in range(0, len(all_chunks), BATCH_SIZE):
        # Chunks and embeddings are sliced with the same bounds so they stay aligned.
        batch = all_chunks[i:i + BATCH_SIZE]
        batch_emb = embeddings[i:i + BATCH_SIZE]
        add_chunks(
            collection,
            ids=[c["location"] for c in batch],
            documents=[c["code"] for c in batch],
            embeddings=batch_emb,
            metadatas=[
                {
                    "filename": c["filename"],
                    "location": c["location"],
                    "feature": c["feature"],
                    "layer": c["layer"],
                    "file_type": c["file_type"],
                    "mtime": file_mtimes.get(c["filename"], "0"),
                }
                for c in batch
            ],
        )
    print(f"[pipeline] Inserted {len(all_chunks)} chunks.")

    edge_count = sum(len(v) for v in file_imports.values())
    print(f"[pipeline] Import edges: {edge_count}")

    return {
        "files": len(all_disk_files),
        "chunks": len(all_chunks),
        "edges": edge_count,
        "language": lang_name,
        "import_edges": file_imports,
    }
344
+
345
+
346
def index_files(
    collection: chromadb.Collection,
    file_paths: list[Path],
    config: SourcefireConfig,
    profile: LanguageProfile | None,
) -> dict[str, list[str]]:
    """Re-index the given files (used by the watcher for incremental updates).

    For every path the existing chunks are deleted first, then the file is
    chunked again and its imports are extracted. All new chunks are embedded in
    one call and inserted into *collection* in batches.

    Returns a mapping of relative path -> imports for the files that have any.
    """
    codebase_path = config.project_dir
    import_map: dict[str, list[str]] = {}
    pending: list[dict[str, Any]] = []

    for path in file_paths:
        rel = path.relative_to(codebase_path).as_posix()
        delete_file_chunks(collection, rel)
        pending.extend(_chunks_for_file(path, codebase_path, profile, chunk_size=config.chunk_size))

        file_profile = get_profile_for_extension(path.suffix) or profile
        if not file_profile or path.suffix not in file_profile.file_extensions:
            continue
        source = path.read_text(encoding="utf-8", errors="replace")
        found = extract_metadata(source, rel, file_profile).get("imports")
        if found:
            import_map[rel] = found

    if not pending:
        return import_map

    vectors = embed_batch([record["code"] for record in pending])

    # mtimes are stored as strings in the chunk metadata.
    mtimes: dict[str, str] = {
        path.relative_to(codebase_path).as_posix(): str(path.stat().st_mtime)
        for path in file_paths
    }

    batch_size = 5000
    for offset in range(0, len(pending), batch_size):
        window = pending[offset:offset + batch_size]
        add_chunks(
            collection,
            ids=[record["location"] for record in window],
            documents=[record["code"] for record in window],
            embeddings=vectors[offset:offset + batch_size],
            metadatas=[
                {
                    "filename": record["filename"],
                    "location": record["location"],
                    "feature": record["feature"],
                    "layer": record["layer"],
                    "file_type": record["file_type"],
                    "mtime": mtimes.get(record["filename"], "0"),
                }
                for record in window
            ],
        )

    return import_map