sourcefire 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sourcefire/__init__.py +0 -0
- sourcefire/api/__init__.py +0 -0
- sourcefire/api/models.py +24 -0
- sourcefire/api/routes.py +166 -0
- sourcefire/chain/__init__.py +0 -0
- sourcefire/chain/prompts.py +195 -0
- sourcefire/chain/rag_chain.py +967 -0
- sourcefire/cli.py +293 -0
- sourcefire/config.py +148 -0
- sourcefire/db.py +196 -0
- sourcefire/indexer/__init__.py +0 -0
- sourcefire/indexer/embeddings.py +27 -0
- sourcefire/indexer/language_profiles.py +448 -0
- sourcefire/indexer/metadata.py +289 -0
- sourcefire/indexer/pipeline.py +406 -0
- sourcefire/init.py +189 -0
- sourcefire/prompts/system.md +28 -0
- sourcefire/retriever/__init__.py +0 -0
- sourcefire/retriever/graph.py +162 -0
- sourcefire/retriever/search.py +86 -0
- sourcefire/static/.DS_Store +0 -0
- sourcefire/static/app.js +414 -0
- sourcefire/static/index.html +102 -0
- sourcefire/static/styles.css +607 -0
- sourcefire/watcher.py +105 -0
- sourcefire-0.2.0.dist-info/METADATA +145 -0
- sourcefire-0.2.0.dist-info/RECORD +31 -0
- sourcefire-0.2.0.dist-info/WHEEL +5 -0
- sourcefire-0.2.0.dist-info/entry_points.txt +2 -0
- sourcefire-0.2.0.dist-info/licenses/LICENSE +21 -0
- sourcefire-0.2.0.dist-info/top_level.txt +1 -0
sourcefire/init.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Auto-initialization for Sourcefire — creates .sourcefire/ with LLM-generated config."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import tomllib
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from sourcefire.config import SourcefireConfig, default_config, save_config
|
|
11
|
+
from sourcefire.indexer.language_profiles import get_profile
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def scan_file_tree(project_dir: Path, max_files: int = 5000) -> str:
    """Return a newline-separated listing of the files under *project_dir*.

    Paths are POSIX-style and relative to *project_dir*. Directories named in
    the skip set, and any directory whose name starts with ``.``, are pruned
    and never descended into. Directories and files are visited in sorted
    order, so the listing -- and the subset that survives when *max_files*
    truncates it -- does not depend on filesystem enumeration order.

    Args:
        project_dir: Root directory to walk.
        max_files: Maximum number of file paths to include in the listing.

    Returns:
        The relative file paths joined with ``"\\n"`` (an empty string when
        no file is listed).
    """
    skip_dirs = {
        ".git", "node_modules", "__pycache__", "build", "dist", "target",
        ".dart_tool", ".next", "venv", ".venv", ".idea", ".vs", ".sourcefire",
        ".tox", "eggs",
    }

    lines: list[str] = []

    for root, dirs, files in os.walk(project_dir):
        # Assign in place: os.walk (top-down) only honours pruning and ordering
        # when the very same list object is modified.
        dirs[:] = sorted(
            d for d in dirs if d not in skip_dirs and not d.startswith(".")
        )

        rel_root = Path(root).relative_to(project_dir).as_posix()
        if rel_root == ".":
            # The walk root itself: emit bare file names without a prefix.
            rel_root = ""

        for f in sorted(files):
            if len(lines) >= max_files:
                break
            lines.append(f"{rel_root}/{f}" if rel_root else f)

        if len(lines) >= max_files:
            break

    return "\n".join(lines)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _generate_patterns_via_llm(file_tree: str, api_key: str) -> dict[str, list[str]] | None:
|
|
46
|
+
"""Ask the LLM to generate include/exclude patterns from the file tree."""
|
|
47
|
+
try:
|
|
48
|
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
49
|
+
from langchain_core.messages import HumanMessage
|
|
50
|
+
|
|
51
|
+
llm = ChatGoogleGenerativeAI(
|
|
52
|
+
model="gemini-2.5-flash",
|
|
53
|
+
google_api_key=api_key,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
prompt = (
|
|
57
|
+
"Given this project file tree, determine which files are source code worth "
|
|
58
|
+
"indexing for a code RAG system. Respond with ONLY a TOML code block containing "
|
|
59
|
+
"two arrays: `include` (glob patterns for source files, configs, and docs) and "
|
|
60
|
+
"`exclude` (glob patterns for build artifacts, dependencies, generated files, "
|
|
61
|
+
"and non-code assets). Be comprehensive but conservative.\n\n"
|
|
62
|
+
"Always include these in exclude: .git/**, .sourcefire/**\n\n"
|
|
63
|
+
f"```\n{file_tree}\n```"
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
response = llm.invoke([HumanMessage(content=prompt)])
|
|
67
|
+
text = response.content if hasattr(response, "content") else str(response)
|
|
68
|
+
|
|
69
|
+
match = re.search(r"```(?:toml)?\s*\n(.*?)\n```", text, re.DOTALL)
|
|
70
|
+
if not match:
|
|
71
|
+
return None
|
|
72
|
+
|
|
73
|
+
toml_str = match.group(1)
|
|
74
|
+
data = tomllib.loads(toml_str)
|
|
75
|
+
|
|
76
|
+
include = data.get("include", [])
|
|
77
|
+
exclude = data.get("exclude", [])
|
|
78
|
+
|
|
79
|
+
if isinstance(include, list) and isinstance(exclude, list):
|
|
80
|
+
if ".sourcefire/**" not in exclude:
|
|
81
|
+
exclude.append(".sourcefire/**")
|
|
82
|
+
return {"include": include, "exclude": exclude}
|
|
83
|
+
|
|
84
|
+
return None
|
|
85
|
+
|
|
86
|
+
except Exception as exc:
|
|
87
|
+
print(f"[init] LLM config generation failed: {exc}")
|
|
88
|
+
return None
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _fallback_patterns(project_dir: Path, language_override: str | None = None) -> dict[str, list[str]]:
    """Build include/exclude patterns without consulting the LLM.

    Starts from the include/exclude patterns of the profile returned by
    ``get_profile`` (or ``**/*`` with no excludes when no profile is
    returned), then appends a fixed set of exclude patterns and the
    ``README.md`` / ``CLAUDE.md`` include patterns wherever they are missing.

    Args:
        project_dir: Project root handed to ``get_profile``.
        language_override: Optional language name handed to ``get_profile``.

    Returns:
        A dict with ``"include"`` and ``"exclude"`` pattern lists.
    """
    always_exclude = (
        ".git/**", ".sourcefire/**", "node_modules/**", "__pycache__/**",
        "*.pyc", ".venv/**", "venv/**", "dist/**", "build/**",
    )
    always_include = ("README.md", "CLAUDE.md")

    detected = get_profile(project_dir, language_override)
    if detected:
        # Copy so the profile's own pattern collections are never mutated.
        include = list(detected.include_patterns)
        exclude = list(detected.exclude_patterns)
    else:
        include = ["**/*"]
        exclude = []

    # Excludes are topped up first, then includes; existing entries keep
    # their position and nothing is duplicated.
    for bucket, required in ((exclude, always_exclude), (include, always_include)):
        for pattern in required:
            if pattern not in bucket:
                bucket.append(pattern)

    return {"include": include, "exclude": exclude}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def auto_init(
    project_dir: Path,
    sourcefire_dir: Path | None = None,
    api_key: str = "",
    language_override: str | None = None,
) -> SourcefireConfig:
    """Create the Sourcefire directory and write an initial config for a project.

    Include/exclude patterns come from the LLM when *api_key* is given and
    the generation succeeds; otherwise they come from ``_fallback_patterns``.
    Progress is reported on stdout.

    Args:
        project_dir: Root of the project being initialized.
        sourcefire_dir: Directory to create; defaults to
            ``project_dir / ".sourcefire"``.
        api_key: API key for LLM pattern generation; empty skips the LLM.
        language_override: Optional language name passed to ``get_profile``.

    Returns:
        The config object that was passed to ``save_config``.
    """
    target_dir = project_dir / ".sourcefire" if sourcefire_dir is None else sourcefire_dir

    print(f"[init] Initializing Sourcefire for: {project_dir.name}")
    target_dir.mkdir(parents=True, exist_ok=True)

    print("[init] Scanning project structure...")
    tree_listing = scan_file_tree(project_dir)

    resolved: dict[str, list[str]] | None = None
    if api_key:
        print("[init] Generating config via LLM...")
        resolved = _generate_patterns_via_llm(tree_listing, api_key)

    if not resolved:
        print("[init] Using language-profile defaults for patterns.")
        resolved = _fallback_patterns(project_dir, language_override)
    else:
        n_include = len(resolved["include"])
        n_exclude = len(resolved["exclude"])
        print(f"[init] LLM generated {n_include} include, {n_exclude} exclude patterns.")

    detected = get_profile(project_dir, language_override)
    language = "auto" if not detected else detected.language

    config = default_config(project_dir)
    config.sourcefire_dir = target_dir
    config.include = resolved["include"]
    config.exclude = resolved["exclude"]
    config.language = language

    save_config(config)
    print(f"[init] Config written to: {config.config_path}")

    # Reminder about generated artifacts that should stay out of version control.
    gitignore_tip = (
        "\nTip: Add to your .gitignore:",
        " .sourcefire/chroma/",
        " .sourcefire/graph.json",
        " .sourcefire/.lock\n",
    )
    for tip_line in gitignore_tip:
        print(tip_line)

    return config
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def reinit_patterns(
    config: SourcefireConfig,
    api_key: str = "",
) -> SourcefireConfig:
    """Regenerate the include/exclude patterns of an existing config and save it.

    Only ``config.include`` and ``config.exclude`` are reassigned; every other
    field of *config* is left as it was. Patterns come from the LLM when
    *api_key* is given and generation succeeds, otherwise from
    ``_fallback_patterns``.

    Args:
        config: Existing configuration; mutated in place and saved.
        api_key: API key for LLM pattern generation; empty skips the LLM.

    Returns:
        The same *config* object, after saving.
    """
    print(f"[init] Regenerating patterns for: {config.project_dir.name}")

    tree_listing = scan_file_tree(config.project_dir)

    resolved: dict[str, list[str]] | None = None
    if api_key:
        print("[init] Generating patterns via LLM...")
        resolved = _generate_patterns_via_llm(tree_listing, api_key)

    if not resolved:
        print("[init] Using language-profile defaults.")
        # "auto" means no explicit language is pinned, so pass no override.
        language_override = None if config.language == "auto" else config.language
        resolved = _fallback_patterns(config.project_dir, language_override)
    else:
        n_include = len(resolved["include"])
        n_exclude = len(resolved["exclude"])
        print(f"[init] LLM generated {n_include} include, {n_exclude} exclude patterns.")

    config.include = resolved["include"]
    config.exclude = resolved["exclude"]

    save_config(config)
    print(f"[init] Updated patterns in: {config.config_path}")

    return config
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
You are Sourcefire — an expert codebase guide. You answer questions about the codebase accurately and concisely.
|
|
2
|
+
|
|
3
|
+
## How you work
|
|
4
|
+
|
|
5
|
+
You receive two types of information:
|
|
6
|
+
|
|
7
|
+
1. **Retrieved Context** (in the user message under "Retrieved Code Context") — these are semantically relevant code chunks pulled from a vector database. They are your PRIMARY source of truth. Trust them. Base your answers on what you see in these chunks first.
|
|
8
|
+
|
|
9
|
+
2. **Tools** — you have tools to read files, search code, query the vector DB, trace imports, and check git history. Use them ONLY when the retrieved context is insufficient — for example when you need to see a full file, trace a call chain deeper, or verify something not covered by the chunks.
|
|
10
|
+
|
|
11
|
+
## Rules
|
|
12
|
+
|
|
13
|
+
- NEVER invent file paths, function names, or code that isn't in the retrieved context or tool results. If you don't know, say so and use a tool to find out.
|
|
14
|
+
- NEVER mix in knowledge from other projects. Only reference code you can see.
|
|
15
|
+
- When referencing a file, format it as `[filename](file://path)` so the UI can make it clickable.
|
|
16
|
+
- Show full file paths so the developer can navigate to the source.
|
|
17
|
+
- Trace connections between files — show the chain, not just the endpoint.
|
|
18
|
+
- Be concise. Lead with the answer, then support with code references.
|
|
19
|
+
|
|
20
|
+
## Tool strategy
|
|
21
|
+
|
|
22
|
+
1. First, answer from the retrieved context — it's fast, semantic, and already relevant.
|
|
23
|
+
2. If the context is partial or you need more detail, use `semantic_code_search` to find related code by meaning.
|
|
24
|
+
3. If you need exact code, use `read_local_file` or `read_lines`.
|
|
25
|
+
4. If you need to find where something is defined or used, use `find_definition` or `find_references`.
|
|
26
|
+
5. If you need project structure, use `get_file_structure` or `find_files_by_name`.
|
|
27
|
+
6. For history/blame, use git tools.
|
|
28
|
+
7. Do NOT call tools for information already present in the retrieved context.
|
|
File without changes
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Bidirectional import graph for source files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import posixpath
|
|
7
|
+
from collections import defaultdict, deque
|
|
8
|
+
from pathlib import Path, PurePosixPath
|
|
9
|
+
from typing import ClassVar
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ImportGraph:
    """Directed graph that tracks import relationships between source files.

    Edges are stored in both forward (imports) and reverse (importers)
    directions so either direction can be looked up directly by file path.
    Query results and the serialized form are sorted, so output is
    deterministic regardless of set iteration order.
    """

    # Default external schemes — overridden at construction time by the profile
    _external_prefixes: tuple[str, ...] = ()

    def __init__(self, external_prefixes: tuple[str, ...] = ()) -> None:
        """Create an empty graph.

        Args:
            external_prefixes: Import prefixes that denote external packages.
        """
        self._forward: defaultdict[str, set[str]] = defaultdict(set)
        self._reverse: defaultdict[str, set[str]] = defaultdict(set)
        self._external_prefixes = external_prefixes

    # ------------------------------------------------------------------
    # Mutation
    # ------------------------------------------------------------------

    def add_edge(self, source: str, target: str) -> None:
        """Record that *source* imports *target*."""
        self._forward[source].add(target)
        self._reverse[target].add(source)

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    def get_imports(self, file_path: str) -> list[str]:
        """Return the sorted files that *file_path* imports (forward edges)."""
        # .get() avoids creating an empty defaultdict entry for unknown files.
        return sorted(self._forward.get(file_path, ()))

    def get_importers(self, file_path: str) -> list[str]:
        """Return the sorted files that import *file_path* (reverse edges)."""
        return sorted(self._reverse.get(file_path, ()))

    def get_neighbors(self, file_path: str, hops: int = 1) -> list[str]:
        """Return all files reachable from *file_path* within *hops* steps.

        Traversal goes in *both* directions (imports and importers).
        The seed file itself is excluded from the result, which is sorted.
        """
        visited: set[str] = {file_path}
        queue: deque[tuple[str, int]] = deque([(file_path, 0)])

        while queue:
            current, depth = queue.popleft()
            if depth >= hops:
                # Nodes at the hop limit are reported but not expanded.
                continue
            for neighbor in (*self._forward.get(current, ()), *self._reverse.get(current, ())):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append((neighbor, depth + 1))

        visited.discard(file_path)
        return sorted(visited)

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def node_count(self) -> int:
        """Number of unique nodes (files) in the graph."""
        return len(self._forward.keys() | self._reverse.keys())

    # ------------------------------------------------------------------
    # Construction helpers
    # ------------------------------------------------------------------

    @classmethod
    def from_import_map(
        cls,
        file_imports: dict[str, list[str]],
        base_dir: str = "",
        external_prefixes: tuple[str, ...] = (),
    ) -> "ImportGraph":
        """Build an ImportGraph from a mapping of ``{file: [import_strings]}``.

        - Imports starting with any prefix in *external_prefixes* are skipped.
        - Every remaining import string (e.g. ``../foo``, ``./bar``) is
          resolved relative to the importing file's directory.

        Args:
            file_imports: Mapping from source file path to its raw import list.
            base_dir: Optional prefix (unused, kept for forward-compatibility).
            external_prefixes: Prefixes that indicate external packages.
        """
        graph = cls(external_prefixes=external_prefixes)
        for source_file, imports in file_imports.items():
            for raw_import in imports:
                # str.startswith accepts a tuple; an empty tuple matches nothing.
                if raw_import.startswith(external_prefixes):
                    continue
                graph.add_edge(source_file, cls._resolve_import(source_file, raw_import))
        return graph

    @staticmethod
    def _resolve_import(source_file: str, relative_import: str) -> str:
        """Resolve *relative_import* relative to *source_file*'s directory."""
        source_dir = str(PurePosixPath(source_file).parent)
        return posixpath.normpath(posixpath.join(source_dir, relative_import))

    # ------------------------------------------------------------------
    # File removal (for incremental re-index)
    # ------------------------------------------------------------------

    def remove_file(self, file_path: str) -> None:
        """Remove all edges involving *file_path*.

        Adjacency entries that become empty are deleted as well, so files
        left without any edge no longer count towards ``node_count``.
        """
        for target in self._forward.pop(file_path, ()):
            importers = self._reverse.get(target)
            if importers is None:
                continue
            importers.discard(file_path)
            if not importers:
                del self._reverse[target]

        for source in self._reverse.pop(file_path, ()):
            imports = self._forward.get(source)
            if imports is None:
                continue
            imports.discard(file_path)
            if not imports:
                del self._forward[source]

    # ------------------------------------------------------------------
    # JSON persistence
    # ------------------------------------------------------------------

    def to_dict(self) -> dict:
        """Serialize to a dict for JSON storage, with edges in sorted order."""
        edges = [
            {"source": source, "target": target}
            for source in sorted(self._forward)
            for target in sorted(self._forward[source])
        ]
        return {"edges": edges}

    @classmethod
    def from_dict(cls, data: dict, external_prefixes: tuple[str, ...] = ()) -> "ImportGraph":
        """Deserialize from a dict produced by :meth:`to_dict`."""
        graph = cls(external_prefixes=external_prefixes)
        for edge in data.get("edges", []):
            graph.add_edge(edge["source"], edge["target"])
        return graph

    def save(self, path: Path) -> None:
        """Save graph to a JSON file."""
        path.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")

    @classmethod
    def load(cls, path: Path, external_prefixes: tuple[str, ...] = ()) -> "ImportGraph":
        """Load graph from a JSON file; return an empty graph if it is missing."""
        if not path.is_file():
            return cls(external_prefixes=external_prefixes)
        data = json.loads(path.read_text(encoding="utf-8"))
        return cls.from_dict(data, external_prefixes=external_prefixes)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Vector search retriever for Sourcefire.
|
|
2
|
+
|
|
3
|
+
Provides:
|
|
4
|
+
- parse_file_references: extract file references from stack traces and error
|
|
5
|
+
messages, driven by language profile patterns.
|
|
6
|
+
- semantic_search: cosine similarity search against ChromaDB.
|
|
7
|
+
- get_chunks_by_filenames: retrieve all chunks for a specific set of files.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from sourcefire.db import async_query_similar, async_get_chunks_by_files
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Stack trace / error message parsing
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
# Compiled regexes keyed by the string form of the pattern list they came from.
_COMPILED_FILE_REF_PATTERNS: dict[str, list[re.Pattern]] = {}


def parse_file_references(text: str, file_ref_patterns: list[str] | None = None) -> list[dict[str, Any]]:
    """Extract file references from stack traces and error messages.

    Each pattern is expected to capture the file path in group 1 and,
    optionally, a line number in group 2. A pattern without any capture
    group is tolerated: its whole match is used as the path. A missing or
    non-numeric line capture yields line ``0``.

    Args:
        text: Text to scan (e.g. a stack trace).
        file_ref_patterns: Regex strings to apply; when empty or ``None`` a
            generic ``path.ext[:line]`` pattern is used.

    Returns:
        A list of ``{"file": str, "line": int}`` dicts in match order, with
        duplicate ``(file, line)`` pairs removed.
    """
    if not file_ref_patterns:
        file_ref_patterns = [r"\b([\w./\\-]+\.\w+)(?::(\d+))?"]

    cache_key = str(file_ref_patterns)
    compiled = _COMPILED_FILE_REF_PATTERNS.get(cache_key)
    if compiled is None:
        compiled = [re.compile(p) for p in file_ref_patterns]
        _COMPILED_FILE_REF_PATTERNS[cache_key] = compiled

    results: list[dict[str, Any]] = []
    seen: set[tuple[str, int]] = set()

    for regex in compiled:
        # Decide group layout once per pattern rather than per match;
        # group 0 (the whole match) stands in when nothing is captured.
        path_group = 1 if regex.groups >= 1 else 0
        has_line_group = regex.groups >= 2

        for m in regex.finditer(text):
            raw_path = m.group(path_group)
            if not raw_path:
                continue

            line_text = m.group(2) if has_line_group else None
            # isdecimal() admits exactly the digit strings int() can parse.
            line = int(line_text) if line_text and line_text.isdecimal() else 0

            key = (raw_path, line)
            if key not in seen:
                seen.add(key)
                results.append({"file": raw_path, "line": line})

    return results
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
# ChromaDB search functions
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def semantic_search(
    collection: Any,
    query_vector: list[float],
    top_k: int = 8,
    threshold: float = 0.3,
    feature: str | None = None,
    filenames: list[str] | None = None,
) -> list[dict[str, Any]]:
    """Run a similarity query and keep only sufficiently relevant rows.

    Args:
        collection: Collection handle passed through to ``async_query_similar``.
        query_vector: Embedding of the query.
        top_k: Number of results requested from the query.
        threshold: Minimum ``"relevance"`` a row needs to be returned; rows
            without that key are treated as relevance 0.
        feature: When set, restrict results to rows whose ``feature`` matches.
        filenames: When non-empty, restrict results to rows whose
            ``filename`` is in this list.

    Returns:
        The rows from the query whose relevance is at least *threshold*.
    """
    # Collect the active metadata conditions, then combine them: none ->
    # no filter, one -> used directly, both -> wrapped in "$and".
    conditions: list[dict] = []
    if feature:
        conditions.append({"feature": feature})
    if filenames:
        conditions.append({"filename": {"$in": filenames}})

    where: dict | None
    if len(conditions) == 2:
        where = {"$and": conditions}
    elif conditions:
        where = conditions[0]
    else:
        where = None

    candidates = await async_query_similar(collection, query_vector, n_results=top_k, where=where)

    relevant: list[dict[str, Any]] = []
    for row in candidates:
        if row.get("relevance", 0) >= threshold:
            relevant.append(row)
    return relevant
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def get_chunks_by_filenames(
    collection: Any,
    filenames: list[str],
) -> list[dict[str, Any]]:
    """Fetch every stored chunk belonging to the given file paths.

    Thin async wrapper that delegates to ``async_get_chunks_by_files``.

    Args:
        collection: Collection handle passed through to the DB helper.
        filenames: File paths whose chunks should be returned.

    Returns:
        Whatever ``async_get_chunks_by_files`` returns for these files.
    """
    chunks = await async_get_chunks_by_files(collection, filenames)
    return chunks
|
|
Binary file
|