sourcefire 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sourcefire/init.py ADDED
@@ -0,0 +1,189 @@
1
+ """Auto-initialization for Sourcefire — creates .sourcefire/ with LLM-generated config."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import tomllib
8
+ from pathlib import Path
9
+
10
+ from sourcefire.config import SourcefireConfig, default_config, save_config
11
+ from sourcefire.indexer.language_profiles import get_profile
12
+
13
+
14
def scan_file_tree(project_dir: Path, max_files: int = 5000) -> str:
    """Return a newline-separated listing of the files under *project_dir*.

    Paths are relative to *project_dir* and use forward slashes. Directories
    named in ``skip_dirs`` and any directory whose name starts with a dot are
    never descended into. Hidden *files* are still listed.

    The traversal is deterministic: sub-directories and files are both visited
    in sorted order, so the same tree always yields the same listing (and the
    same subset when the listing is truncated by *max_files*).

    Args:
        project_dir: Root directory to scan.
        max_files: Upper bound on the number of paths returned.

    Returns:
        The relative paths joined by ``"\\n"``; an empty string when nothing
        is listed.
    """
    skip_dirs = {
        ".git", "node_modules", "__pycache__", "build", "dist", "target",
        ".dart_tool", ".next", "venv", ".venv", ".idea", ".vs", ".sourcefire",
        ".tox", "eggs",
    }

    lines: list[str] = []

    for root, dirs, files in os.walk(project_dir):
        # Prune in place so os.walk skips these directories entirely, and sort
        # so the walk order does not depend on the filesystem's listing order.
        dirs[:] = sorted(
            d for d in dirs if d not in skip_dirs and not d.startswith(".")
        )

        rel_root = Path(root).relative_to(project_dir).as_posix()
        if rel_root == ".":
            rel_root = ""

        for f in sorted(files):
            if len(lines) >= max_files:
                break
            lines.append(f"{rel_root}/{f}" if rel_root else f)

        # Stop walking as soon as the cap is reached.
        if len(lines) >= max_files:
            break

    return "\n".join(lines)
43
+
44
+
45
def _generate_patterns_via_llm(file_tree: str, api_key: str) -> dict[str, list[str]] | None:
    """Ask the LLM to generate include/exclude glob patterns from the file tree.

    The model is prompted to answer with a fenced TOML block containing an
    ``include`` and an ``exclude`` array. The first fenced block in the reply
    is parsed with ``tomllib``.

    Args:
        file_tree: Newline-separated project file listing sent to the model.
        api_key: Google API key passed to the chat model client.

    Returns:
        ``{"include": [...], "exclude": [...]}`` on success, with ``.git/**``
        and ``.sourcefire/**`` guaranteed to be present in ``exclude``.
        ``None`` when the reply has no fenced block, the arrays are missing
        the expected shape, or anything raises (missing dependency, network
        failure, invalid TOML); exceptions are reported on stdout.
    """
    # This is a best-effort step: the caller falls back to language-profile
    # defaults when None is returned, so every failure is caught here.
    try:
        from langchain_google_genai import ChatGoogleGenerativeAI
        from langchain_core.messages import HumanMessage

        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",
            google_api_key=api_key,
        )

        prompt = (
            "Given this project file tree, determine which files are source code worth "
            "indexing for a code RAG system. Respond with ONLY a TOML code block containing "
            "two arrays: `include` (glob patterns for source files, configs, and docs) and "
            "`exclude` (glob patterns for build artifacts, dependencies, generated files, "
            "and non-code assets). Be comprehensive but conservative.\n\n"
            "Always include these in exclude: .git/**, .sourcefire/**\n\n"
            f"```\n{file_tree}\n```"
        )

        response = llm.invoke([HumanMessage(content=prompt)])
        text = response.content if hasattr(response, "content") else str(response)

        # Message content may be a list of parts (plain strings or dicts that
        # carry a "text" entry) rather than one string; flatten it so the
        # regex below always receives a str.
        if isinstance(text, list):
            text = "".join(
                part if isinstance(part, str) else str(part.get("text", ""))
                for part in text
                if isinstance(part, (str, dict))
            )
        elif not isinstance(text, str):
            text = str(text)

        match = re.search(r"```(?:toml)?\s*\n(.*?)\n```", text, re.DOTALL)
        if not match:
            return None

        data = tomllib.loads(match.group(1))

        include = data.get("include", [])
        exclude = data.get("exclude", [])

        if not (isinstance(include, list) and isinstance(exclude, list)):
            return None
        # Reject replies whose arrays hold non-string entries so that only
        # usable glob patterns ever reach the saved config.
        if not all(isinstance(pattern, str) for pattern in (*include, *exclude)):
            return None

        # The prompt asks for both of these; enforce them regardless of what
        # the model actually returned.
        for required in (".git/**", ".sourcefire/**"):
            if required not in exclude:
                exclude.append(required)

        return {"include": include, "exclude": exclude}

    except Exception as exc:
        print(f"[init] LLM config generation failed: {exc}")
        return None
89
+
90
+
91
def _fallback_patterns(project_dir: Path, language_override: str | None = None) -> dict[str, list[str]]:
    """Build include/exclude patterns without the LLM.

    Starts from the language profile returned by ``get_profile`` (or from a
    catch-all ``**/*`` include with no excludes when there is no profile),
    then appends a fixed set of excludes and documentation includes that are
    not already present.

    Args:
        project_dir: Project root handed to ``get_profile``.
        language_override: Optional language name handed to ``get_profile``.

    Returns:
        ``{"include": [...], "exclude": [...]}``.
    """
    profile = get_profile(project_dir, language_override)

    if not profile:
        include, exclude = ["**/*"], []
    else:
        include = list(profile.include_patterns)
        exclude = list(profile.exclude_patterns)

    always_excluded = (
        ".git/**", ".sourcefire/**", "node_modules/**", "__pycache__/**",
        "*.pyc", ".venv/**", "venv/**", "dist/**", "build/**",
    )
    always_included = ("README.md", "CLAUDE.md")

    # Append only what is missing so profile-provided ordering is preserved.
    exclude += [pattern for pattern in always_excluded if pattern not in exclude]
    include += [pattern for pattern in always_included if pattern not in include]

    return {"include": include, "exclude": exclude}
112
+
113
+
114
def auto_init(
    project_dir: Path,
    sourcefire_dir: Path | None = None,
    api_key: str = "",
    language_override: str | None = None,
) -> SourcefireConfig:
    """Create the Sourcefire directory and write an initial config.

    Include/exclude patterns come from the LLM when *api_key* is non-empty
    and the LLM call yields a result; otherwise they come from the
    language-profile fallback. Progress is reported on stdout.

    Args:
        project_dir: Root of the project being initialized.
        sourcefire_dir: Directory to create and record in the config;
            defaults to ``project_dir / ".sourcefire"``.
        api_key: API key for the LLM; empty skips the LLM step.
        language_override: Optional language name passed to the profile lookup.

    Returns:
        The config object that was passed to ``save_config``.
    """
    target_dir = project_dir / ".sourcefire" if sourcefire_dir is None else sourcefire_dir

    print(f"[init] Initializing Sourcefire for: {project_dir.name}")
    target_dir.mkdir(parents=True, exist_ok=True)

    print("[init] Scanning project structure...")
    tree_listing = scan_file_tree(project_dir)

    chosen: dict[str, list[str]] | None = None
    if api_key:
        print("[init] Generating config via LLM...")
        chosen = _generate_patterns_via_llm(tree_listing, api_key)

    if not chosen:
        print("[init] Using language-profile defaults for patterns.")
        chosen = _fallback_patterns(project_dir, language_override)
    else:
        include_count = len(chosen["include"])
        exclude_count = len(chosen["exclude"])
        print(f"[init] LLM generated {include_count} include, {exclude_count} exclude patterns.")

    # The detected language is recorded even when the patterns came from the LLM.
    profile = get_profile(project_dir, language_override)

    config = default_config(project_dir)
    config.sourcefire_dir = target_dir
    config.include = chosen["include"]
    config.exclude = chosen["exclude"]
    config.language = profile.language if profile else "auto"

    save_config(config)
    print(f"[init] Config written to: {config.config_path}")

    for tip_line in (
        "\nTip: Add to your .gitignore:",
        " .sourcefire/chroma/",
        " .sourcefire/graph.json",
        " .sourcefire/.lock\n",
    ):
        print(tip_line)

    return config
160
+
161
+
162
def reinit_patterns(
    config: SourcefireConfig,
    api_key: str = "",
) -> SourcefireConfig:
    """Regenerate the include/exclude patterns of an existing config.

    Only ``config.include`` and ``config.exclude`` are reassigned; the config
    is then saved. Patterns come from the LLM when *api_key* is non-empty and
    the call yields a result, otherwise from the language-profile fallback.

    Args:
        config: Existing configuration, updated in place.
        api_key: API key for the LLM; empty skips the LLM step.

    Returns:
        The same *config* object.
    """
    project_root = config.project_dir
    print(f"[init] Regenerating patterns for: {project_root.name}")

    tree_listing = scan_file_tree(project_root)

    chosen: dict[str, list[str]] | None = None
    if api_key:
        print("[init] Generating patterns via LLM...")
        chosen = _generate_patterns_via_llm(tree_listing, api_key)

    if not chosen:
        print("[init] Using language-profile defaults.")
        # "auto" means no language was pinned, so let the profile lookup detect it.
        pinned_language = None if config.language == "auto" else config.language
        chosen = _fallback_patterns(config.project_dir, pinned_language)
    else:
        include_count = len(chosen["include"])
        exclude_count = len(chosen["exclude"])
        print(f"[init] LLM generated {include_count} include, {exclude_count} exclude patterns.")

    config.include = chosen["include"]
    config.exclude = chosen["exclude"]

    save_config(config)
    print(f"[init] Updated patterns in: {config.config_path}")

    return config
@@ -0,0 +1,28 @@
1
+ You are Sourcefire — an expert codebase guide. You answer questions about the codebase accurately and concisely.
2
+
3
+ ## How you work
4
+
5
+ You receive two types of information:
6
+
7
+ 1. **Retrieved Context** (in the user message under "Retrieved Code Context") — these are semantically relevant code chunks pulled from a vector database. They are your PRIMARY source of truth. Trust them. Base your answers on what you see in these chunks first.
8
+
9
+ 2. **Tools** — you have tools to read files, search code, query the vector DB, trace imports, and check git history. Use them ONLY when the retrieved context is insufficient — for example when you need to see a full file, trace a call chain deeper, or verify something not covered by the chunks.
10
+
11
+ ## Rules
12
+
13
+ - NEVER invent file paths, function names, or code that isn't in the retrieved context or tool results. If you don't know, say so and use a tool to find out.
14
+ - NEVER mix in knowledge from other projects. Only reference code you can see.
15
+ - When referencing a file, format it as `[filename](file://path)` so the UI can make it clickable.
16
+ - Show full file paths so the developer can navigate to the source.
17
+ - Trace connections between files — show the chain, not just the endpoint.
18
+ - Be concise. Lead with the answer, then support with code references.
19
+
20
+ ## Tool strategy
21
+
22
+ 1. First, answer from the retrieved context — it's fast, semantic, and already relevant.
23
+ 2. If the context is partial or you need more detail, use `semantic_code_search` to find related code by meaning.
24
+ 3. If you need exact code, use `read_local_file` or `read_lines`.
25
+ 4. If you need to find where something is defined or used, use `find_definition` or `find_references`.
26
+ 5. If you need project structure, use `get_file_structure` or `find_files_by_name`.
27
+ 6. For history/blame, use git tools.
28
+ 7. Do NOT call tools for information already present in the retrieved context.
File without changes
@@ -0,0 +1,162 @@
1
+ """Bidirectional import graph for source files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import posixpath
7
+ from collections import defaultdict, deque
8
+ from pathlib import Path, PurePosixPath
9
+ from typing import ClassVar
10
+
11
+
12
class ImportGraph:
    """Directed graph that tracks import relationships between source files.

    Edges are stored in both forward (imports) and reverse (importers)
    directions so either direction can be looked up directly by file path.
    A file is a node only while it takes part in at least one edge.
    Query results and serialized edges are returned in sorted order so that
    output is reproducible across runs.
    """

    # Default external schemes — overridden at construction time by the profile
    _external_prefixes: tuple[str, ...] = ()

    def __init__(self, external_prefixes: tuple[str, ...] = ()) -> None:
        """Create an empty graph.

        Args:
            external_prefixes: Import prefixes regarded as external packages;
                stored on the instance.
        """
        self._forward: defaultdict[str, set[str]] = defaultdict(set)
        self._reverse: defaultdict[str, set[str]] = defaultdict(set)
        self._external_prefixes = external_prefixes

    # ------------------------------------------------------------------
    # Mutation
    # ------------------------------------------------------------------

    def add_edge(self, source: str, target: str) -> None:
        """Record that *source* imports *target*."""
        self._forward[source].add(target)
        self._reverse[target].add(source)

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    def get_imports(self, file_path: str) -> list[str]:
        """Return the files that *file_path* imports (forward edges), sorted."""
        # .get() avoids creating an empty defaultdict entry for unknown files.
        return sorted(self._forward.get(file_path, ()))

    def get_importers(self, file_path: str) -> list[str]:
        """Return the files that import *file_path* (reverse edges), sorted."""
        return sorted(self._reverse.get(file_path, ()))

    def get_neighbors(self, file_path: str, hops: int = 1) -> list[str]:
        """Return all files reachable from *file_path* within *hops* steps.

        Traversal goes in *both* directions (imports and importers).
        The seed file itself is excluded from the result, which is sorted.
        A *hops* value of zero or less yields an empty list.
        """
        visited: set[str] = {file_path}
        queue: deque[tuple[str, int]] = deque([(file_path, 0)])

        while queue:
            current, depth = queue.popleft()
            if depth >= hops:
                continue
            for neighbor in (*self._forward.get(current, ()), *self._reverse.get(current, ())):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append((neighbor, depth + 1))

        visited.discard(file_path)
        return sorted(visited)

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def node_count(self) -> int:
        """Number of unique nodes (files) in the graph."""
        return len(self._forward.keys() | self._reverse.keys())

    # ------------------------------------------------------------------
    # Construction helpers
    # ------------------------------------------------------------------

    @classmethod
    def from_import_map(
        cls,
        file_imports: dict[str, list[str]],
        base_dir: str = "",
        external_prefixes: tuple[str, ...] = (),
    ) -> "ImportGraph":
        """Build an ImportGraph from a mapping of ``{file: [import_strings]}``.

        - Imports starting with any prefix in *external_prefixes* are skipped.
        - Every other import string is resolved relative to the importing
          file's directory (so ``../foo`` and ``./bar`` are normalized).

        Args:
            file_imports: Mapping from source file path to its raw import list.
            base_dir: Optional prefix (unused, kept for forward-compatibility).
            external_prefixes: Prefixes that indicate external packages.
        """
        graph = cls(external_prefixes=external_prefixes)
        # str.startswith needs a tuple; normalize once outside the loops.
        skip_prefixes = tuple(external_prefixes)
        for source_file, imports in file_imports.items():
            for raw_import in imports:
                if skip_prefixes and raw_import.startswith(skip_prefixes):
                    continue
                graph.add_edge(source_file, cls._resolve_import(source_file, raw_import))
        return graph

    @staticmethod
    def _resolve_import(source_file: str, relative_import: str) -> str:
        """Resolve *relative_import* relative to *source_file*'s directory."""
        source_dir = str(PurePosixPath(source_file).parent)
        return posixpath.normpath(posixpath.join(source_dir, relative_import))

    # ------------------------------------------------------------------
    # File removal (for incremental re-index)
    # ------------------------------------------------------------------

    def remove_file(self, file_path: str) -> None:
        """Remove all edges involving *file_path*.

        Adjacency sets that become empty are deleted as well, so files left
        without any edge stop being counted by ``node_count``.
        """
        for target in self._forward.pop(file_path, ()):
            importers = self._reverse.get(target)
            if importers is None:
                continue
            importers.discard(file_path)
            if not importers:
                del self._reverse[target]

        for source in self._reverse.pop(file_path, ()):
            imported = self._forward.get(source)
            if imported is None:
                continue
            imported.discard(file_path)
            if not imported:
                del self._forward[source]

    # ------------------------------------------------------------------
    # JSON persistence
    # ------------------------------------------------------------------

    def to_dict(self) -> dict:
        """Serialize to a dict for JSON storage.

        Edges are emitted sorted by source, then target, so the saved file
        is stable between runs.
        """
        edges = [
            {"source": source, "target": target}
            for source in sorted(self._forward)
            for target in sorted(self._forward[source])
        ]
        return {"edges": edges}

    @classmethod
    def from_dict(cls, data: dict, external_prefixes: tuple[str, ...] = ()) -> "ImportGraph":
        """Deserialize from a dict produced by :meth:`to_dict`."""
        graph = cls(external_prefixes=external_prefixes)
        for edge in data.get("edges", []):
            graph.add_edge(edge["source"], edge["target"])
        return graph

    def save(self, path: Path) -> None:
        """Save graph to a JSON file."""
        path.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")

    @classmethod
    def load(cls, path: Path, external_prefixes: tuple[str, ...] = ()) -> "ImportGraph":
        """Load graph from a JSON file; return an empty graph if it does not exist."""
        if not path.is_file():
            return cls(external_prefixes=external_prefixes)
        data = json.loads(path.read_text(encoding="utf-8"))
        return cls.from_dict(data, external_prefixes=external_prefixes)
@@ -0,0 +1,86 @@
1
+ """Vector search retriever for Sourcefire.
2
+
3
+ Provides:
4
+ - parse_file_references: extract file references from stack traces and error
5
+ messages, driven by language profile patterns.
6
+ - semantic_search: cosine similarity search against ChromaDB.
7
+ - get_chunks_by_filenames: retrieve all chunks for a specific set of files.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ from typing import Any
14
+
15
+ from sourcefire.db import async_query_similar, async_get_chunks_by_files
16
+
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Stack trace / error message parsing
20
+ # ---------------------------------------------------------------------------
21
+
22
# Cache of compiled pattern lists, keyed by str() of the pattern list.
_COMPILED_FILE_REF_PATTERNS: dict[str, list[re.Pattern]] = {}


def parse_file_references(text: str, file_ref_patterns: list[str] | None = None) -> list[dict[str, Any]]:
    """Extract file references from stack traces and error messages.

    Each pattern contributes the file path from its first capture group and,
    when the pattern has one, a line number from its second capture group.
    A pattern without any capture group contributes its whole match as the
    path. A missing or non-numeric line number is reported as ``0``.

    Args:
        text: Text to scan.
        file_ref_patterns: Regular expressions to apply; when empty or
            ``None`` a generic ``path.ext[:line]`` pattern is used.

    Returns:
        ``{"file": ..., "line": ...}`` dicts in order of first appearance,
        de-duplicated on the ``(file, line)`` pair.
    """
    if not file_ref_patterns:
        file_ref_patterns = [r"\b([\w./\\-]+\.\w+)(?::(\d+))?"]

    cache_key = str(file_ref_patterns)
    compiled = _COMPILED_FILE_REF_PATTERNS.get(cache_key)
    if compiled is None:
        compiled = [re.compile(p) for p in file_ref_patterns]
        _COMPILED_FILE_REF_PATTERNS[cache_key] = compiled

    results: list[dict[str, Any]] = []
    seen: set[tuple[str, int]] = set()

    for regex in compiled:
        # Choose the groups from the pattern's own group count so a pattern
        # with fewer groups than expected cannot raise "no such group".
        path_group = 1 if regex.groups >= 1 else 0
        has_line_group = regex.groups >= 2

        for m in regex.finditer(text):
            raw_path = m.group(path_group)
            if not raw_path:
                continue

            line_text = m.group(2) if has_line_group else None
            try:
                line = int(line_text) if line_text else 0
            except ValueError:
                # A custom pattern captured something that is not a number.
                line = 0

            key = (raw_path, line)
            if key not in seen:
                seen.add(key)
                results.append({"file": raw_path, "line": line})

    return results
51
+
52
+
53
+ # ---------------------------------------------------------------------------
54
+ # ChromaDB search functions
55
+ # ---------------------------------------------------------------------------
56
+
57
+
58
async def semantic_search(
    collection: Any,
    query_vector: list[float],
    top_k: int = 8,
    threshold: float = 0.3,
    feature: str | None = None,
    filenames: list[str] | None = None,
) -> list[dict[str, Any]]:
    """Run a similarity query and keep the rows that meet *threshold*.

    Args:
        collection: Collection object forwarded to ``async_query_similar``.
        query_vector: Embedding of the query.
        top_k: Number of results requested from the query.
        threshold: Minimum ``relevance`` a row needs to be returned; rows
            without a ``relevance`` key are treated as ``0``.
        feature: When set, restrict the query to rows with this ``feature``.
        filenames: When non-empty, restrict the query to these ``filename``
            values.

    Returns:
        The rows whose ``relevance`` is at least *threshold*.
    """
    # Collect whichever metadata filters apply, then combine them.
    conditions: list[dict] = []
    if feature:
        conditions.append({"feature": feature})
    if filenames:
        conditions.append({"filename": {"$in": filenames}})

    where: dict | None
    if not conditions:
        where = None
    elif len(conditions) == 1:
        where = conditions[0]
    else:
        where = {"$and": conditions}

    rows = await async_query_similar(collection, query_vector, n_results=top_k, where=where)

    relevant: list[dict[str, Any]] = []
    for row in rows:
        if row.get("relevance", 0) >= threshold:
            relevant.append(row)
    return relevant
79
+
80
+
81
async def get_chunks_by_filenames(
    collection: Any,
    filenames: list[str],
) -> list[dict[str, Any]]:
    """Fetch every stored chunk that belongs to one of *filenames*.

    Delegates to ``async_get_chunks_by_files`` and returns its result as is.
    """
    chunks = await async_get_chunks_by_files(collection, filenames)
    return chunks
Binary file