@pmaddire/gcie 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENT.md +256 -0
- package/AGENT_USAGE.md +231 -0
- package/ARCHITECTURE.md +151 -0
- package/CLAUDE.md +69 -0
- package/DEBUGGING_PLAYBOOK.md +160 -0
- package/KNOWLEDGE_INDEX.md +154 -0
- package/POTENTIAL_UPDATES +130 -0
- package/PROJECT.md +141 -0
- package/README.md +371 -0
- package/REPO_DIGITAL_TWIN.md +98 -0
- package/ROADMAP.md +301 -0
- package/SETUP_ANY_REPO.md +85 -0
- package/bin/gcie-init.js +20 -0
- package/bin/gcie.js +45 -0
- package/cli/__init__.py +1 -0
- package/cli/app.py +163 -0
- package/cli/commands/__init__.py +1 -0
- package/cli/commands/cache.py +35 -0
- package/cli/commands/context.py +2426 -0
- package/cli/commands/context_slices.py +617 -0
- package/cli/commands/debug.py +24 -0
- package/cli/commands/index.py +17 -0
- package/cli/commands/query.py +20 -0
- package/cli/commands/setup.py +73 -0
- package/config/__init__.py +1 -0
- package/config/scanner_config.py +82 -0
- package/context/__init__.py +1 -0
- package/context/architecture_bootstrap.py +170 -0
- package/context/architecture_index.py +185 -0
- package/context/architecture_parser.py +170 -0
- package/context/architecture_slicer.py +308 -0
- package/context/context_router.py +70 -0
- package/context/fallback_evaluator.py +21 -0
- package/coverage_integration/__init__.py +1 -0
- package/coverage_integration/coverage_loader.py +55 -0
- package/debugging/__init__.py +12 -0
- package/debugging/bug_localizer.py +81 -0
- package/debugging/execution_path_analyzer.py +42 -0
- package/embeddings/__init__.py +6 -0
- package/embeddings/encoder.py +45 -0
- package/embeddings/faiss_index.py +72 -0
- package/git_integration/__init__.py +1 -0
- package/git_integration/git_miner.py +78 -0
- package/graphs/__init__.py +17 -0
- package/graphs/call_graph.py +70 -0
- package/graphs/code_graph.py +81 -0
- package/graphs/execution_graph.py +35 -0
- package/graphs/git_graph.py +43 -0
- package/graphs/graph_store.py +25 -0
- package/graphs/node_factory.py +21 -0
- package/graphs/test_graph.py +65 -0
- package/graphs/validators.py +28 -0
- package/graphs/variable_graph.py +51 -0
- package/knowledge_index/__init__.py +1 -0
- package/knowledge_index/index_builder.py +60 -0
- package/knowledge_index/models.py +35 -0
- package/knowledge_index/query_api.py +38 -0
- package/knowledge_index/store.py +23 -0
- package/llm_context/__init__.py +6 -0
- package/llm_context/context_builder.py +67 -0
- package/llm_context/snippet_selector.py +57 -0
- package/package.json +14 -0
- package/parser/__init__.py +18 -0
- package/parser/ast_parser.py +216 -0
- package/parser/call_resolver.py +52 -0
- package/parser/models.py +75 -0
- package/parser/tree_sitter_adapter.py +56 -0
- package/parser/variable_extractor.py +31 -0
- package/retrieval/__init__.py +17 -0
- package/retrieval/cache.py +22 -0
- package/retrieval/hybrid_retriever.py +249 -0
- package/retrieval/query_parser.py +38 -0
- package/retrieval/ranking.py +43 -0
- package/retrieval/semantic_retriever.py +39 -0
- package/retrieval/symbolic_retriever.py +80 -0
- package/scanner/__init__.py +5 -0
- package/scanner/file_filters.py +37 -0
- package/scanner/models.py +44 -0
- package/scanner/repository_scanner.py +55 -0
- package/scripts/bootstrap_from_github.ps1 +41 -0
- package/tracing/__init__.py +1 -0
- package/tracing/runtime_tracer.py +60 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Variable dependency extraction helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from .models import FunctionEntry, ModuleParseResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True, slots=True)
|
|
11
|
+
class VariableDependency:
|
|
12
|
+
"""Represents variable usage by a function."""
|
|
13
|
+
|
|
14
|
+
function_name: str
|
|
15
|
+
variable_name: str
|
|
16
|
+
access_type: str # READS | WRITES | MODIFIES
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def extract_variable_dependencies(module: ModuleParseResult) -> tuple[VariableDependency, ...]:
|
|
20
|
+
"""Extract variable read/write/modifies relationships from parsed functions."""
|
|
21
|
+
out: list[VariableDependency] = []
|
|
22
|
+
|
|
23
|
+
for fn in module.functions:
|
|
24
|
+
for name in sorted(set(fn.variables_read)):
|
|
25
|
+
out.append(VariableDependency(function_name=fn.name, variable_name=name, access_type="READS"))
|
|
26
|
+
|
|
27
|
+
for name in sorted(set(fn.variables_written)):
|
|
28
|
+
out.append(VariableDependency(function_name=fn.name, variable_name=name, access_type="WRITES"))
|
|
29
|
+
out.append(VariableDependency(function_name=fn.name, variable_name=name, access_type="MODIFIES"))
|
|
30
|
+
|
|
31
|
+
return tuple(out)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Retrieval package."""
|
|
2
|
+
|
|
3
|
+
from .hybrid_retriever import HybridCandidate, hybrid_retrieve
|
|
4
|
+
from .query_parser import ParsedQuery, parse_query
|
|
5
|
+
from .semantic_retriever import SemanticCandidate, SemanticRetriever
|
|
6
|
+
from .symbolic_retriever import SymbolicCandidate, symbolic_retrieve
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"HybridCandidate",
|
|
10
|
+
"ParsedQuery",
|
|
11
|
+
"SemanticCandidate",
|
|
12
|
+
"SemanticRetriever",
|
|
13
|
+
"SymbolicCandidate",
|
|
14
|
+
"hybrid_retrieve",
|
|
15
|
+
"parse_query",
|
|
16
|
+
"symbolic_retrieve",
|
|
17
|
+
]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""In-memory retrieval cache."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(slots=True)
|
|
9
|
+
class RetrievalCache:
|
|
10
|
+
_data: dict[str, tuple[str, ...]]
|
|
11
|
+
|
|
12
|
+
def __init__(self) -> None:
|
|
13
|
+
self._data = {}
|
|
14
|
+
|
|
15
|
+
def get(self, key: str) -> tuple[str, ...] | None:
|
|
16
|
+
return self._data.get(key)
|
|
17
|
+
|
|
18
|
+
def set(self, key: str, value: tuple[str, ...]) -> None:
|
|
19
|
+
self._data[key] = value
|
|
20
|
+
|
|
21
|
+
def clear(self) -> None:
|
|
22
|
+
self._data.clear()
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Hybrid symbolic + semantic retrieval pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
import networkx as nx
|
|
9
|
+
|
|
10
|
+
from .ranking import HybridScore, combine_score
|
|
11
|
+
from .semantic_retriever import SemanticRetriever
|
|
12
|
+
from .symbolic_retriever import symbolic_retrieve
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True, slots=True)
class HybridCandidate:
    """A ranked retrieval result produced by hybrid_retrieve()."""

    node_id: str
    score: float
    # Human-readable breakdown of the component signals behind the score.
    rationale: str


@dataclass(frozen=True, slots=True)
class HybridDiagnostics:
    """Intermediate candidate sets exposed by collect_hybrid_diagnostics()."""

    query_terms: tuple[str, ...]
    symbolic_candidates: tuple[str, ...]
    semantic_candidates: tuple[str, ...]
    # Sorted union of all signal sources (symbolic, semantic, git, coverage).
    merged_candidates: tuple[str, ...]


@dataclass(frozen=True, slots=True)
class _SemanticAggregate:
    """Best semantic evidence accumulated for one canonical node."""

    node_id: str
    # Combined aggregate score (text similarity + repeat-hit/lexical/path bonuses).
    score: float
    # Number of raw semantic hits that mapped to this canonical node.
    hit_count: int
    best_text_score: float
    lexical_overlap: int
    path_relevance: float
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
_STOPWORDS = {
|
|
41
|
+
"how",
|
|
42
|
+
"does",
|
|
43
|
+
"when",
|
|
44
|
+
"what",
|
|
45
|
+
"why",
|
|
46
|
+
"where",
|
|
47
|
+
"which",
|
|
48
|
+
"the",
|
|
49
|
+
"this",
|
|
50
|
+
"that",
|
|
51
|
+
"into",
|
|
52
|
+
"from",
|
|
53
|
+
"with",
|
|
54
|
+
"files",
|
|
55
|
+
"file",
|
|
56
|
+
"used",
|
|
57
|
+
"using",
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _query_terms(query: str) -> tuple[str, ...]:
|
|
62
|
+
terms = {
|
|
63
|
+
part
|
|
64
|
+
for token in re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*", query.lower())
|
|
65
|
+
for part in token.split("_")
|
|
66
|
+
if len(part) >= 3 and part not in _STOPWORDS
|
|
67
|
+
}
|
|
68
|
+
return tuple(sorted(terms))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _node_path(node_id: str, attrs: dict) -> str:
|
|
72
|
+
path = str(attrs.get("path") or attrs.get("file") or "")
|
|
73
|
+
if path:
|
|
74
|
+
return path
|
|
75
|
+
if node_id.startswith("file:"):
|
|
76
|
+
return node_id[5:]
|
|
77
|
+
if node_id.startswith(("function:", "class:")):
|
|
78
|
+
return node_id.split(":", 1)[1].split("::", 1)[0]
|
|
79
|
+
return ""
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _canonical_semantic_node_id(graph: nx.DiGraph, node_id: str, attrs: dict) -> str:
|
|
83
|
+
path = _node_path(node_id, attrs)
|
|
84
|
+
file_node_id = f"file:{path}" if path else ""
|
|
85
|
+
if file_node_id and graph.has_node(file_node_id):
|
|
86
|
+
return file_node_id
|
|
87
|
+
return node_id
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _lexical_overlap(node_id: str, attrs: dict, query_terms: tuple[str, ...]) -> int:
|
|
91
|
+
if not query_terms:
|
|
92
|
+
return 0
|
|
93
|
+
label = str(attrs.get("label", ""))
|
|
94
|
+
path = _node_path(node_id, attrs)
|
|
95
|
+
haystack = f"{node_id} {label} {path}".lower()
|
|
96
|
+
return sum(1 for term in query_terms if term in haystack)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _path_relevance(node_id: str, attrs: dict, query_terms: tuple[str, ...]) -> float:
|
|
100
|
+
if not query_terms:
|
|
101
|
+
return 0.0
|
|
102
|
+
path = _node_path(node_id, attrs).lower()
|
|
103
|
+
if not path:
|
|
104
|
+
return 0.0
|
|
105
|
+
parts = {part for part in re.split(r"[^a-zA-Z0-9_]+", path) if part}
|
|
106
|
+
overlap = sum(1 for term in query_terms if term in path)
|
|
107
|
+
exact_parts = sum(1 for term in query_terms if term in parts)
|
|
108
|
+
return overlap * 0.08 + exact_parts * 0.05
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _semantic_node_scores(
    graph: nx.DiGraph,
    query: str,
    *,
    top_k: int,
) -> tuple[dict[str, _SemanticAggregate], tuple[str, ...]]:
    """Run semantic retrieval over all graph nodes and aggregate per canonical node.

    Builds one text snippet per node (id + path + label), retrieves an
    oversampled hit list (4x top_k, minimum 24), and folds hits onto their
    canonical (file-level) node. Returns the aggregates keyed by canonical
    node id plus the raw hit node ids in retrieval order.
    """
    query_terms = _query_terms(query)
    entries: list[tuple[str, str]] = []
    # Sort nodes by id so snippet indices are deterministic across runs.
    for node_id, attrs in sorted(graph.nodes(data=True), key=lambda item: item[0]):
        label = str(attrs.get("label", ""))
        path = _node_path(node_id, attrs)
        text = f"{node_id} {path} {label}".strip()
        entries.append((node_id, text))

    if not entries:
        return {}, ()

    retriever = SemanticRetriever([text for _, text in entries])
    # Oversample so multiple raw hits can reinforce the same canonical node.
    semantic_top_k = min(max(top_k * 4, 24), len(entries))
    hits = retriever.retrieve(query, top_k=semantic_top_k)

    aggregates: dict[str, _SemanticAggregate] = {}
    raw_hits: list[str] = []
    for hit in hits:
        source_node_id, _ = entries[hit.idx]
        attrs = graph.nodes[source_node_id]
        # Fold function/class hits onto their file node when it exists.
        target_node_id = _canonical_semantic_node_id(graph, source_node_id, attrs)
        target_attrs = graph.nodes[target_node_id] if graph.has_node(target_node_id) else attrs
        lexical = _lexical_overlap(target_node_id, target_attrs, query_terms)
        path_rel = _path_relevance(target_node_id, target_attrs, query_terms)
        raw_hits.append(source_node_id)

        # Keep the best evidence seen so far for this canonical node.
        existing = aggregates.get(target_node_id)
        hit_count = 1 if existing is None else existing.hit_count + 1
        best_text_score = hit.score if existing is None else max(existing.best_text_score, hit.score)
        best_lexical = lexical if existing is None else max(existing.lexical_overlap, lexical)
        best_path = path_rel if existing is None else max(existing.path_relevance, path_rel)
        # Repeat-hit bonus: +0.03 per extra hit, capped at +0.12; lexical and
        # path bonuses are each capped at +0.08; final score is capped at 1.0.
        bonus = min(0.12, 0.03 * max(hit_count - 1, 0))
        aggregate_score = min(1.0, best_text_score + bonus + min(0.08, best_lexical * 0.02) + min(0.08, best_path))
        aggregates[target_node_id] = _SemanticAggregate(
            node_id=target_node_id,
            score=aggregate_score,
            hit_count=hit_count,
            best_text_score=best_text_score,
            lexical_overlap=best_lexical,
            path_relevance=best_path,
        )

    return aggregates, tuple(raw_hits)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def collect_hybrid_diagnostics(
    graph: nx.DiGraph,
    query: str,
    *,
    git_recency_by_node: dict[str, float] | None = None,
    coverage_risk_by_node: dict[str, float] | None = None,
    max_hops: int = 2,
    top_k: int = 10,
) -> HybridDiagnostics:
    """Run both retrieval legs and report their raw and merged candidate sets.

    Mirrors the candidate gathering performed by hybrid_retrieve() without
    the scoring step, so callers can see where each candidate came from.
    """
    symbolic = symbolic_retrieve(graph, query, max_hops=max_hops)
    semantic, semantic_hits = _semantic_node_scores(graph, query, top_k=top_k)
    git_map = git_recency_by_node or {}
    cov_map = coverage_risk_by_node or {}
    # Merged pool: every node id mentioned by any signal, sorted for determinism.
    merged = sorted(set(c.node_id for c in symbolic) | set(semantic) | set(git_map) | set(cov_map))
    return HybridDiagnostics(
        query_terms=_query_terms(query),
        symbolic_candidates=tuple(c.node_id for c in symbolic),
        semantic_candidates=semantic_hits,
        merged_candidates=tuple(merged),
    )
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def hybrid_retrieve(
    graph: nx.DiGraph,
    query: str,
    *,
    git_recency_by_node: dict[str, float] | None = None,
    coverage_risk_by_node: dict[str, float] | None = None,
    max_hops: int = 2,
    top_k: int = 10,
) -> tuple[HybridCandidate, ...]:
    """Rank retrieval candidates with hybrid weighted scoring.

    Candidates come from the union of four signals: symbolic graph
    traversal, semantic similarity, git recency, and coverage risk. Each
    candidate is scored with combine_score() and the top_k results are
    returned with a rationale string describing the component signals.
    """
    query_terms = _query_terms(query)
    symbolic = symbolic_retrieve(graph, query, max_hops=max_hops)
    sym_distance = {c.node_id: c.distance for c in symbolic}
    semantic, _ = _semantic_node_scores(graph, query, top_k=top_k)

    git_map = git_recency_by_node or {}
    cov_map = coverage_risk_by_node or {}

    # Union of all signal sources; sorted so scoring order is deterministic.
    all_nodes = sorted(set(sym_distance) | set(semantic) | set(git_map) | set(cov_map))
    scored: list[HybridScore] = []

    for node_id in all_nodes:
        # git/coverage keys may name nodes that are not in the graph.
        attrs = graph.nodes[node_id] if graph.has_node(node_id) else {}
        semantic_item = semantic.get(node_id)
        score = combine_score(
            symbolic_distance=sym_distance.get(node_id),
            semantic_score=semantic_item.score if semantic_item else None,
            git_recency=git_map.get(node_id),
            coverage_risk=cov_map.get(node_id),
        )
        scored.append(
            HybridScore(
                node_id=node_id,
                score=score,
                symbolic_distance=sym_distance.get(node_id),
                semantic_score=semantic_item.score if semantic_item else None,
                git_recency=git_map.get(node_id),
                coverage_risk=cov_map.get(node_id),
                lexical_overlap=_lexical_overlap(node_id, attrs, query_terms),
                path_relevance=_path_relevance(node_id, attrs, query_terms),
                semantic_hits=0 if semantic_item is None else semantic_item.hit_count,
            )
        )

    # Descending score first (rounded to damp float noise), then descending
    # lexical/path/semantic tie-breakers, then shortest symbolic distance
    # (absent distances sort last via 9999), then node id for determinism.
    scored.sort(
        key=lambda s: (
            -round(s.score, 10),
            -s.lexical_overlap,
            -round(s.path_relevance, 10),
            -s.semantic_hits,
            s.symbolic_distance if s.symbolic_distance is not None else 9999,
            s.node_id,
        )
    )

    out: list[HybridCandidate] = []
    for item in scored[:top_k]:
        rationale = (
            f"symbolic_distance={item.symbolic_distance}, "
            f"semantic={item.semantic_score}, semantic_hits={item.semantic_hits}, "
            f"lexical_overlap={item.lexical_overlap}, path_relevance={item.path_relevance}, "
            f"git={item.git_recency}, coverage={item.coverage_risk}"
        )
        out.append(HybridCandidate(node_id=item.node_id, score=item.score, rationale=rationale))

    return tuple(out)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Query parsing utilities for retrieval."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True, slots=True)
|
|
10
|
+
class ParsedQuery:
|
|
11
|
+
raw: str
|
|
12
|
+
tokens: tuple[str, ...]
|
|
13
|
+
symbols: tuple[str, ...]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def parse_query(query: str) -> ParsedQuery:
|
|
17
|
+
"""Extract likely symbol tokens from a user query."""
|
|
18
|
+
lowered = query.strip().lower()
|
|
19
|
+
words = tuple(re.findall(r"[a-zA-Z_][a-zA-Z0-9_\.]*", lowered))
|
|
20
|
+
|
|
21
|
+
stop = {
|
|
22
|
+
"why",
|
|
23
|
+
"is",
|
|
24
|
+
"the",
|
|
25
|
+
"a",
|
|
26
|
+
"an",
|
|
27
|
+
"to",
|
|
28
|
+
"in",
|
|
29
|
+
"of",
|
|
30
|
+
"and",
|
|
31
|
+
"or",
|
|
32
|
+
"for",
|
|
33
|
+
"variable",
|
|
34
|
+
"function",
|
|
35
|
+
"class",
|
|
36
|
+
}
|
|
37
|
+
symbols = tuple(w for w in words if w not in stop)
|
|
38
|
+
return ParsedQuery(raw=query, tokens=words, symbols=symbols)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Ranking utilities for hybrid retrieval."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True, slots=True)
class HybridScore:
    """Per-node scoring breakdown used for ranking and tie-breaking."""

    node_id: str
    # Combined weighted score as produced by combine_score().
    score: float
    # Graph hops from the nearest symbolic seed; None when not reached.
    symbolic_distance: int | None
    semantic_score: float | None
    git_recency: float | None
    coverage_risk: float | None
    # Tie-breaker signals; default to "no evidence" values.
    lexical_overlap: int = 0
    path_relevance: float = 0.0
    semantic_hits: int = 0
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def combine_score(
    *,
    symbolic_distance: int | None,
    semantic_score: float | None,
    git_recency: float | None,
    coverage_risk: float | None,
    w_symbolic: float = 0.4,
    w_semantic: float = 0.3,
    w_git: float = 0.2,
    w_coverage: float = 0.1,
) -> float:
    """Combine component signals into a single hybrid score.

    Each missing signal (None) contributes 0. The symbolic component decays
    with graph distance as 1 / (1 + d), so a direct seed match scores 1.0.
    """
    sym = 0.0 if symbolic_distance is None else 1.0 / (1.0 + float(symbolic_distance))
    sem = float(semantic_score) if semantic_score is not None else 0.0
    git = float(git_recency) if git_recency is not None else 0.0
    cov = float(coverage_risk) if coverage_risk is not None else 0.0

    return w_symbolic * sym + w_semantic * sem + w_git * git + w_coverage * cov
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Semantic retriever implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from embeddings.encoder import TextEncoder
|
|
8
|
+
from embeddings.faiss_index import VectorIndex
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True, slots=True)
class SemanticCandidate:
    """One snippet match returned by SemanticRetriever.retrieve()."""

    idx: int      # position of the snippet in the retriever's input list
    text: str     # the matched snippet text
    score: float  # similarity score reported by the vector index
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SemanticRetriever:
    """Semantic retrieval over text snippets."""

    def __init__(self, snippets: list[str]) -> None:
        # Encode every snippet once up front and load the vector index.
        self._snippets = snippets
        self._encoder = TextEncoder()
        self._index = VectorIndex()
        self._index.add(self._encoder.encode(snippets))

    def retrieve(self, query: str, top_k: int = 5) -> tuple[SemanticCandidate, ...]:
        """Return up to *top_k* snippets most similar to *query*."""
        query_vector = self._encoder.encode([query])[0]
        matches = self._index.search(query_vector, top_k=top_k)

        # Drop any indices outside the snippet list (e.g. index padding).
        valid = range(len(self._snippets))
        return tuple(
            SemanticCandidate(idx=match.idx, text=self._snippets[match.idx], score=match.score)
            for match in matches
            if match.idx in valid
        )
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Symbolic retriever over GCIE graphs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections import deque
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
import networkx as nx
|
|
9
|
+
|
|
10
|
+
from .query_parser import ParsedQuery, parse_query
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True, slots=True)
class SymbolicCandidate:
    """A graph node reached by seeded symbolic traversal."""

    node_id: str
    node_type: str  # the node's "type" attribute, or "unknown" when absent
    distance: int   # hops from the nearest matching seed node
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _symbol_matches(text: str, symbol: str) -> bool:
|
|
21
|
+
t = text.lower()
|
|
22
|
+
s = symbol.lower()
|
|
23
|
+
return s == t or s in t
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _seed_nodes(graph: nx.DiGraph, parsed: ParsedQuery) -> list[str]:
    """Nodes whose id, label, or path mentions any parsed query symbol.

    Returns a sorted, deduplicated list so traversal is deterministic.
    """
    matched: set[str] = set()
    for node_id, attrs in graph.nodes(data=True):
        label = str(attrs.get("label", ""))
        path = str(attrs.get("path", attrs.get("file", "")))
        for sym in parsed.symbols:
            if (
                _symbol_matches(node_id, sym)
                or _symbol_matches(label, sym)
                or _symbol_matches(path, sym)
            ):
                matched.add(node_id)
                break
    return sorted(matched)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _bounded_traversal(graph: nx.DiGraph, seeds: list[str], max_hops: int) -> dict[str, int]:
|
|
37
|
+
distances: dict[str, int] = {}
|
|
38
|
+
queue: deque[tuple[str, int]] = deque((seed, 0) for seed in seeds)
|
|
39
|
+
|
|
40
|
+
while queue:
|
|
41
|
+
node, dist = queue.popleft()
|
|
42
|
+
if node in distances and dist >= distances[node]:
|
|
43
|
+
continue
|
|
44
|
+
distances[node] = dist
|
|
45
|
+
|
|
46
|
+
if dist >= max_hops:
|
|
47
|
+
continue
|
|
48
|
+
|
|
49
|
+
neighbors = set(graph.predecessors(node)).union(graph.successors(node))
|
|
50
|
+
for nxt in neighbors:
|
|
51
|
+
queue.append((nxt, dist + 1))
|
|
52
|
+
|
|
53
|
+
return distances
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def symbolic_retrieve(graph: nx.DiGraph, query: str, *, max_hops: int = 2) -> tuple[SymbolicCandidate, ...]:
    """Retrieve symbolic candidates by seeded graph traversal.

    Seeds are nodes lexically matching the query's symbols; candidates are
    every node within *max_hops* of a seed, ordered by distance, then node
    type, then node id for determinism. Returns () when the query yields no
    symbols or no seeds match.
    """
    parsed = parse_query(query)
    if not parsed.symbols:
        return ()

    seeds = _seed_nodes(graph, parsed)
    if not seeds:
        return ()

    candidates = [
        SymbolicCandidate(
            node_id=node_id,
            node_type=str(graph.nodes[node_id].get("type", "unknown")),
            distance=hops,
        )
        for node_id, hops in _bounded_traversal(graph, seeds, max_hops=max_hops).items()
    ]
    candidates.sort(key=lambda c: (c.distance, c.node_type, c.node_id))
    return tuple(candidates)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Filtering and classification helpers for repository scanning."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
# Well-known configuration file names, matched case-insensitively by name.
CONFIG_FILENAMES = {
    "pyproject.toml",
    "requirements.txt",
    "requirements-dev.txt",
    "setup.py",
    "setup.cfg",
    "tox.ini",
    ".env",
}

# Extensions that mark a file as configuration regardless of its name.
CONFIG_EXTENSIONS = {".toml", ".ini", ".cfg", ".yaml", ".yml", ".json"}


def classify_file(relative_path: Path) -> str:
    """Classify file type for downstream indexing priorities.

    Precedence: test > config > source > other.
    """
    name = relative_path.name.lower()
    suffix = relative_path.suffix.lower()
    posix = relative_path.as_posix().lower()

    # Anything inside a tests/ directory or following test naming wins.
    looks_like_test = (
        "/tests/" in f"/{posix}"
        or name.startswith("test_")
        or name.endswith("_test.py")
    )
    if looks_like_test:
        return "test"
    if name in CONFIG_FILENAMES or suffix in CONFIG_EXTENSIONS:
        return "config"
    if suffix in {".py", ".pyi"}:
        return "source"
    return "other"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def should_skip_hidden_file(path: Path, include_hidden: bool) -> bool:
    """Exclude dot-prefixed files unless hidden files are explicitly enabled."""
    if include_hidden:
        return False
    return path.name.startswith(".")
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Data models for repository scanning."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Literal
|
|
8
|
+
|
|
9
|
+
FileKind = Literal["source", "test", "config", "other"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True, slots=True)
|
|
13
|
+
class ScannedFile:
|
|
14
|
+
"""A file discovered by the repository scanner."""
|
|
15
|
+
|
|
16
|
+
path: Path
|
|
17
|
+
relative_path: Path
|
|
18
|
+
size_bytes: int
|
|
19
|
+
suffix: str
|
|
20
|
+
kind: FileKind
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True, slots=True)
|
|
24
|
+
class RepositoryManifest:
|
|
25
|
+
"""Normalized scanner output for downstream indexing."""
|
|
26
|
+
|
|
27
|
+
root: Path
|
|
28
|
+
files: tuple[ScannedFile, ...]
|
|
29
|
+
|
|
30
|
+
@property
|
|
31
|
+
def total_files(self) -> int:
|
|
32
|
+
return len(self.files)
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def source_files(self) -> tuple[ScannedFile, ...]:
|
|
36
|
+
return tuple(f for f in self.files if f.kind == "source")
|
|
37
|
+
|
|
38
|
+
@property
|
|
39
|
+
def test_files(self) -> tuple[ScannedFile, ...]:
|
|
40
|
+
return tuple(f for f in self.files if f.kind == "test")
|
|
41
|
+
|
|
42
|
+
@property
|
|
43
|
+
def config_files(self) -> tuple[ScannedFile, ...]:
|
|
44
|
+
return tuple(f for f in self.files if f.kind == "config")
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Repository scanner implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from config.scanner_config import ScannerConfig
|
|
9
|
+
|
|
10
|
+
from .file_filters import classify_file, should_skip_hidden_file
|
|
11
|
+
from .models import RepositoryManifest, ScannedFile
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _iter_candidate_files(root: Path, config: ScannerConfig):
    """Yield (absolute_path, relative_path, size_bytes) in deterministic order.

    Directories and files are visited in sorted order; excluded directories
    are pruned in place so os.walk never descends into them.
    """
    for current_dir, subdirs, names in os.walk(root, topdown=True):
        # Prune and order the walk in place (effective because topdown=True).
        subdirs[:] = sorted(d for d in subdirs if not config.is_excluded_dir(d))
        for name in sorted(names):
            absolute = Path(current_dir) / name
            relative = absolute.relative_to(root)

            if should_skip_hidden_file(absolute, config.include_hidden):
                continue
            if config.matches_exclude_glob(relative):
                continue
            if not config.allows_extension(absolute):
                continue
            try:
                size = absolute.stat().st_size
            except OSError:
                # File vanished or is unreadable between listing and stat().
                continue
            if size > config.max_file_size_bytes:
                continue
            yield absolute, relative, size
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def scan_repository(root: str | Path, config: ScannerConfig | None = None) -> RepositoryManifest:
    """Scan a repository and return a normalized manifest.

    Uses default scanner settings when *config* is not supplied. The
    returned file list is sorted by POSIX relative path for determinism.
    """
    effective_config = config or ScannerConfig()
    resolved_root = Path(root).resolve()

    scanned = [
        ScannedFile(
            path=absolute,
            relative_path=relative,
            size_bytes=size,
            suffix=absolute.suffix.lower(),
            kind=classify_file(relative),
        )
        for absolute, relative, size in _iter_candidate_files(resolved_root, effective_config)
    ]
    scanned.sort(key=lambda entry: entry.relative_path.as_posix())
    return RepositoryManifest(root=resolved_root, files=tuple(scanned))
|