@pmaddire/gcie 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/AGENT.md +256 -0
  2. package/AGENT_USAGE.md +231 -0
  3. package/ARCHITECTURE.md +151 -0
  4. package/CLAUDE.md +69 -0
  5. package/DEBUGGING_PLAYBOOK.md +160 -0
  6. package/KNOWLEDGE_INDEX.md +154 -0
  7. package/POTENTIAL_UPDATES +130 -0
  8. package/PROJECT.md +141 -0
  9. package/README.md +371 -0
  10. package/REPO_DIGITAL_TWIN.md +98 -0
  11. package/ROADMAP.md +301 -0
  12. package/SETUP_ANY_REPO.md +85 -0
  13. package/bin/gcie-init.js +20 -0
  14. package/bin/gcie.js +45 -0
  15. package/cli/__init__.py +1 -0
  16. package/cli/app.py +163 -0
  17. package/cli/commands/__init__.py +1 -0
  18. package/cli/commands/cache.py +35 -0
  19. package/cli/commands/context.py +2426 -0
  20. package/cli/commands/context_slices.py +617 -0
  21. package/cli/commands/debug.py +24 -0
  22. package/cli/commands/index.py +17 -0
  23. package/cli/commands/query.py +20 -0
  24. package/cli/commands/setup.py +73 -0
  25. package/config/__init__.py +1 -0
  26. package/config/scanner_config.py +82 -0
  27. package/context/__init__.py +1 -0
  28. package/context/architecture_bootstrap.py +170 -0
  29. package/context/architecture_index.py +185 -0
  30. package/context/architecture_parser.py +170 -0
  31. package/context/architecture_slicer.py +308 -0
  32. package/context/context_router.py +70 -0
  33. package/context/fallback_evaluator.py +21 -0
  34. package/coverage_integration/__init__.py +1 -0
  35. package/coverage_integration/coverage_loader.py +55 -0
  36. package/debugging/__init__.py +12 -0
  37. package/debugging/bug_localizer.py +81 -0
  38. package/debugging/execution_path_analyzer.py +42 -0
  39. package/embeddings/__init__.py +6 -0
  40. package/embeddings/encoder.py +45 -0
  41. package/embeddings/faiss_index.py +72 -0
  42. package/git_integration/__init__.py +1 -0
  43. package/git_integration/git_miner.py +78 -0
  44. package/graphs/__init__.py +17 -0
  45. package/graphs/call_graph.py +70 -0
  46. package/graphs/code_graph.py +81 -0
  47. package/graphs/execution_graph.py +35 -0
  48. package/graphs/git_graph.py +43 -0
  49. package/graphs/graph_store.py +25 -0
  50. package/graphs/node_factory.py +21 -0
  51. package/graphs/test_graph.py +65 -0
  52. package/graphs/validators.py +28 -0
  53. package/graphs/variable_graph.py +51 -0
  54. package/knowledge_index/__init__.py +1 -0
  55. package/knowledge_index/index_builder.py +60 -0
  56. package/knowledge_index/models.py +35 -0
  57. package/knowledge_index/query_api.py +38 -0
  58. package/knowledge_index/store.py +23 -0
  59. package/llm_context/__init__.py +6 -0
  60. package/llm_context/context_builder.py +67 -0
  61. package/llm_context/snippet_selector.py +57 -0
  62. package/package.json +14 -0
  63. package/parser/__init__.py +18 -0
  64. package/parser/ast_parser.py +216 -0
  65. package/parser/call_resolver.py +52 -0
  66. package/parser/models.py +75 -0
  67. package/parser/tree_sitter_adapter.py +56 -0
  68. package/parser/variable_extractor.py +31 -0
  69. package/retrieval/__init__.py +17 -0
  70. package/retrieval/cache.py +22 -0
  71. package/retrieval/hybrid_retriever.py +249 -0
  72. package/retrieval/query_parser.py +38 -0
  73. package/retrieval/ranking.py +43 -0
  74. package/retrieval/semantic_retriever.py +39 -0
  75. package/retrieval/symbolic_retriever.py +80 -0
  76. package/scanner/__init__.py +5 -0
  77. package/scanner/file_filters.py +37 -0
  78. package/scanner/models.py +44 -0
  79. package/scanner/repository_scanner.py +55 -0
  80. package/scripts/bootstrap_from_github.ps1 +41 -0
  81. package/tracing/__init__.py +1 -0
  82. package/tracing/runtime_tracer.py +60 -0
@@ -0,0 +1,308 @@
1
+ """Architecture-driven context slicing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ import re
7
+ from pathlib import Path
8
+
9
+ from llm_context.snippet_selector import estimate_tokens
10
+
11
+ from .architecture_index import load_architecture_index
12
+
13
+
14
# Fraction of indexed key files that may be missing from disk before the
# architecture index is treated as stale and slicing bails out
# (see _validate_index / slice_with_architecture).
_MISSING_RATIO_FALLBACK = 0.5

# Query words ignored during tokenization: common English filler plus
# change-request verbs ("fix", "add", "update", ...) that carry no
# subsystem-matching signal.
_STOPWORDS = {
    "for",
    "and",
    "the",
    "with",
    "from",
    "this",
    "that",
    "into",
    "onto",
    "over",
    "under",
    "fix",
    "add",
    "update",
    "refactor",
    "change",
    "when",
    "why",
    "how",
    "use",
    "using",
    "used",
    "make",
    "new",
}

# Tokens that mark a query as architecture-related; such queries may pull in
# core-infrastructure files even when no subsystem matches (_arch_query).
_ARCH_KEYWORDS = {
    "fallback",
    "router",
    "routing",
    "context",
    "slicer",
    "architecture",
    "validation",
    "mode",
    "confidence",
}
52
+
53
+
54
@dataclass
class ArchitectureSliceResult:
    """Outcome of architecture-driven context slicing for a single query."""

    # The original user query, echoed back unchanged.
    query: str
    # Snippet dicts with "node_id", "score", and "content" keys.
    snippets: list[dict]
    # Match confidence in [0, 1]; 0.0 on any error path, 0.25 for the
    # core-infrastructure fallback, otherwise the (possibly penalized) top score.
    confidence: float
    # Up to three {"name": ..., "score": ...} entries for matched subsystems.
    matched_subsystems: list[dict]
    # Indexed files that were absent or unreadable on disk.
    missing_files: list[str]
    # Machine-readable failure label ("index_missing", "no_subsystems",
    # "index_missing_files", "low_match") or None on success.
    error: str | None = None
62
+
63
+
64
def _tokenize(text: str) -> set[str]:
    """Split *text* into lowercase tokens suitable for subsystem matching.

    Splits on whitespace, underscores, and hyphens, strips remaining
    non-alphanumeric characters, and drops stopwords and tokens shorter
    than three characters.
    """
    tokens: set[str] = set()
    for raw in re.split(r"[\s_-]+", text.lower()):
        # The split pattern already removes underscores, so only alphanumeric
        # characters can remain (the original's extra `ch == "_"` check was
        # unreachable and has been dropped).
        token = "".join(ch for ch in raw if ch.isalnum())
        if len(token) >= 3 and token not in _STOPWORDS:
            tokens.add(token)
    return tokens
72
+
73
+
74
+ def _subsystem_blob(subsystem: dict) -> str:
75
+ parts = [subsystem.get("name", ""), subsystem.get("purpose", ""), subsystem.get("status", "")]
76
+ for field in (
77
+ subsystem.get("interfaces", []),
78
+ subsystem.get("depends_on", []),
79
+ subsystem.get("used_by", []),
80
+ subsystem.get("failure_modes", []),
81
+ subsystem.get("notes", []),
82
+ ):
83
+ if field:
84
+ parts.extend(field)
85
+ return " ".join(parts)
86
+
87
+
88
def _score_subsystem(subsystem: dict, query_tokens: set[str]) -> float:
    """Score a subsystem as the fraction of query tokens found in its text blob."""
    if not query_tokens:
        return 0.0
    haystack = _subsystem_blob(subsystem).lower()
    # Substring containment, not word-boundary matching.
    hit_count = sum(1 for token in query_tokens if token in haystack)
    return hit_count / max(len(query_tokens), 1)
94
+
95
+
96
+ def _snippet_from_lines(lines: list[str], max_lines: int) -> str:
97
+ return "\n".join(lines[:max_lines]).strip()
98
+
99
+
100
+ def _collect_snippets(repo_path: Path, files: list[str], max_lines: int = 120) -> tuple[list[dict], list[str]]:
101
+ snippets: list[dict] = []
102
+ missing: list[str] = []
103
+ for rel_path in files:
104
+ file_path = repo_path / rel_path
105
+ if not file_path.exists():
106
+ missing.append(rel_path)
107
+ continue
108
+ try:
109
+ content = file_path.read_text(encoding="utf-8").splitlines()
110
+ except Exception:
111
+ missing.append(rel_path)
112
+ continue
113
+ snippet = _snippet_from_lines(content, max_lines=max_lines)
114
+ if snippet:
115
+ snippets.append(
116
+ {
117
+ "node_id": f"file:{rel_path}",
118
+ "score": 1.0,
119
+ "content": snippet,
120
+ }
121
+ )
122
+ return snippets, missing
123
+
124
+
125
+ def _validate_index(repo_path: Path, index_data: dict) -> tuple[list[dict], list[str], float]:
126
+ missing: list[str] = []
127
+ cleaned: list[dict] = []
128
+ total = 0
129
+
130
+ for subsystem in index_data.get("subsystems", []):
131
+ key_files = subsystem.get("key_files", []) or []
132
+ total += len(key_files)
133
+ valid_files: list[str] = []
134
+ for rel_path in key_files:
135
+ if (repo_path / rel_path).exists():
136
+ valid_files.append(rel_path)
137
+ else:
138
+ missing.append(rel_path)
139
+ cleaned.append({**subsystem, "key_files": valid_files})
140
+
141
+ if total == 0:
142
+ return cleaned, missing, 0.0
143
+
144
+ missing_ratio = len(missing) / total
145
+ return cleaned, missing, missing_ratio
146
+
147
+
148
def _arch_query(query_tokens: set[str]) -> bool:
    """Return True when the query mentions any architecture keyword."""
    return not query_tokens.isdisjoint(_ARCH_KEYWORDS)
150
+
151
+
152
+ def _rank_core_files(core_files: list[str], query_tokens: set[str]) -> list[str]:
153
+ weights = {
154
+ "router": 3,
155
+ "routing": 3,
156
+ "fallback": 3,
157
+ "architecture": 2,
158
+ "slicer": 2,
159
+ "validation": 2,
160
+ "context": 1,
161
+ "mode": 1,
162
+ "confidence": 1,
163
+ }
164
+ ranked = []
165
+ for path in core_files:
166
+ lowered = path.lower()
167
+ score = 0
168
+ for key, weight in weights.items():
169
+ if key in lowered:
170
+ score += weight
171
+ if query_tokens:
172
+ score += sum(1 for token in query_tokens if token in lowered)
173
+ ranked.append((score, path))
174
+ ranked.sort(key=lambda item: item[0], reverse=True)
175
+ return [path for score, path in ranked]
176
+
177
+
178
def _select_core_files(index_data: dict, query_tokens: set[str]) -> list[str]:
    """Rank the index's core-infrastructure files against the query."""
    candidates = index_data.get("core_infrastructure", []) or []
    return _rank_core_files(candidates, query_tokens)
181
+
182
+
183
def slice_with_architecture(repo_path: Path, query: str) -> ArchitectureSliceResult:
    """Build a context slice for *query* from the repo's architecture index.

    Reads ``.gcie/architecture_index.json``, validates its key files against
    disk, scores subsystems against the tokenized query, and collects file
    snippets for the best matches. Error paths set ``error`` on the result
    (``index_missing``, ``no_subsystems``, ``index_missing_files``,
    ``low_match``) so the caller can fall back to normal retrieval.
    """
    index_path = repo_path / ".gcie" / "architecture_index.json"
    index_data = load_architecture_index(index_path)
    if index_data is None:
        # No usable index on disk.
        return ArchitectureSliceResult(
            query=query,
            snippets=[],
            confidence=0.0,
            matched_subsystems=[],
            missing_files=[],
            error="index_missing",
        )

    subsystems, missing_files, missing_ratio = _validate_index(repo_path, index_data)
    if not subsystems:
        return ArchitectureSliceResult(
            query=query,
            snippets=[],
            confidence=0.0,
            matched_subsystems=[],
            missing_files=missing_files,
            error="no_subsystems",
        )

    # Too many indexed files are gone from disk: treat the index as stale.
    if missing_ratio >= _MISSING_RATIO_FALLBACK and missing_files:
        return ArchitectureSliceResult(
            query=query,
            snippets=[],
            confidence=0.0,
            matched_subsystems=[],
            missing_files=missing_files,
            error="index_missing_files",
        )

    query_tokens = _tokenize(query)
    scored = []
    for subsystem in subsystems:
        score = _score_subsystem(subsystem, query_tokens)
        scored.append((score, subsystem))

    scored.sort(key=lambda item: item[0], reverse=True)
    matched = [(score, subsystem) for score, subsystem in scored if score > 0]

    if not matched:
        # Architecture-flavored queries may still be served from the
        # core-infrastructure file list at a fixed low confidence.
        if _arch_query(query_tokens) and index_data.get("core_infrastructure"):
            core_files = _select_core_files(index_data, query_tokens)
            snippets, missing = _collect_snippets(repo_path, core_files)
            missing_files.extend(missing)
            return ArchitectureSliceResult(
                query=query,
                snippets=snippets,
                confidence=0.25,
                matched_subsystems=[],
                missing_files=missing_files,
                error=None,
            )
        return ArchitectureSliceResult(
            query=query,
            snippets=[],
            confidence=0.0,
            matched_subsystems=[],
            missing_files=missing_files,
            error="low_match",
        )

    top_score = matched[0][0]
    # Penalize confidence in proportion to how incomplete the index is.
    if missing_ratio > 0:
        top_score = max(top_score * (1.0 - missing_ratio), 0.0)

    # Take key files from up to the top three matching subsystems.
    selected_subsystems = [subsystem for score, subsystem in matched[:3]]
    selected_files: list[str] = []
    for subsystem in selected_subsystems:
        selected_files.extend(subsystem.get("key_files", []))

    include_core = False
    arch_query = _arch_query(query_tokens)
    if arch_query:
        include_core = True
    # NOTE(review): the next two conditions are subsumed by the one above —
    # any arch query already sets include_core. Possibly leftovers from an
    # earlier, stricter policy; confirm before simplifying.
    if arch_query and top_score <= 0.35:
        include_core = True
    if arch_query and len(selected_subsystems) <= 1:
        include_core = True

    core_files = _select_core_files(index_data, query_tokens)
    if include_core and core_files:
        # Core files go first so they survive downstream budget trimming.
        selected_files = core_files + selected_files

    snippets, missing = _collect_snippets(repo_path, selected_files)
    missing_files.extend(missing)

    # Last resort: if nothing was readable, retry with core files alone.
    if include_core and core_files and not snippets:
        snippets, missing = _collect_snippets(repo_path, core_files)
        missing_files.extend(missing)

    return ArchitectureSliceResult(
        query=query,
        snippets=snippets,
        confidence=top_score,
        matched_subsystems=[
            {"name": subsystem.get("name", ""), "score": score}
            for score, subsystem in matched[:3]
        ],
        missing_files=missing_files,
        error=None,
    )
288
+
289
+
290
def trim_snippets_to_budget(snippets: list[dict], max_total: int) -> list[dict]:
    """Greedily keep snippets whose cumulative token estimate fits *max_total*.

    An over-budget snippet is skipped (not a hard stop), so smaller snippets
    appearing later can still be included.
    """
    kept: list[dict] = []
    budget_used = 0
    for snippet in snippets:
        cost = estimate_tokens(snippet.get("content", ""))
        if budget_used + cost <= max_total:
            kept.append(snippet)
            budget_used += cost
    return kept
300
+
301
+
302
+
303
+
304
+
305
+
306
+
307
+
308
+
@@ -0,0 +1,70 @@
1
+ """Route context requests between architecture-driven and normal modes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Callable
8
+
9
+ from llm_context.snippet_selector import estimate_tokens
10
+
11
+ from .architecture_bootstrap import ensure_initialized
12
+ from .architecture_slicer import slice_with_architecture, trim_snippets_to_budget
13
+ from .fallback_evaluator import should_fallback
14
+
15
+
16
+ NormalRunner = Callable[[], dict]
17
+
18
+
19
def _total_tokens(snippets: list[dict]) -> int:
    """Sum the estimated token counts of all snippet contents."""
    total = 0
    for snippet in snippets:
        total += estimate_tokens(snippet.get("content", ""))
    return total
21
+
22
+
23
+ def _record_fallback(repo_path: Path, reason: str | None, config: dict) -> None:
24
+ if reason is None:
25
+ return
26
+ config_path = repo_path / ".gcie" / "context_config.json"
27
+ config["fallback_reason"] = reason
28
+ try:
29
+ config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
30
+ except Exception:
31
+ return
32
+
33
+
34
def route_context(
    repo: str,
    query: str,
    *,
    intent: str | None,
    max_total: int,
    profile: str | None,
    normal_runner: NormalRunner,
) -> dict:
    """Serve a context request, preferring architecture slicing over normal mode.

    Falls back to ``normal_runner()`` when the architecture slicer is disabled
    in the repo config or when ``should_fallback`` rejects the slice result.
    The fallback reason is persisted via ``_record_fallback`` and echoed in
    the returned payload.

    Args:
        repo: Path to the repository root.
        query: Free-text context query.
        intent: Optional intent label, echoed into the payload.
        max_total: Token budget applied to the returned snippets.
        profile: Optional profile name, echoed into the payload.
        normal_runner: Zero-argument callable producing the normal-mode payload.

    Returns:
        Either the normal runner's payload annotated with ``fallback_reason``,
        or an architecture-mode payload with budget-trimmed snippets.
    """
    repo_path = Path(repo)
    # Loads the .gcie config; presumably creates defaults on first run —
    # see architecture_bootstrap.ensure_initialized for the exact behavior.
    config = ensure_initialized(repo_path)

    if not config.get("architecture_slicer_enabled", True):
        _record_fallback(repo_path, "architecture_disabled", config)
        payload = normal_runner()
        payload["fallback_reason"] = "architecture_disabled"
        return payload

    arch_result = slice_with_architecture(repo_path, query)
    fallback, reason = should_fallback(arch_result, config)
    if fallback:
        _record_fallback(repo_path, reason, config)
        payload = normal_runner()
        payload["fallback_reason"] = reason
        return payload

    trimmed = trim_snippets_to_budget(arch_result.snippets, max_total)
    return {
        "query": arch_result.query,
        "profile": profile,
        "mode": "architecture",
        "intent": intent,
        "confidence": arch_result.confidence,
        "matched_subsystems": arch_result.matched_subsystems,
        "snippets": trimmed,
        "token_estimate": _total_tokens(trimmed),
    }
@@ -0,0 +1,21 @@
1
+ """Evaluate whether to fall back to normal context."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .architecture_slicer import ArchitectureSliceResult
6
+
7
+
8
def should_fallback(result: ArchitectureSliceResult, config: dict) -> tuple[bool, str | None]:
    """Decide whether architecture slicing is insufficient.

    Returns (True, reason) when the caller should run normal retrieval:
    the slicer reported an error, produced no snippets, or scored below the
    configured confidence threshold (when low-confidence fallback is enabled).
    """
    # Hard failures from the slicer always trigger fallback.
    if result.error:
        return True, result.error

    # An empty slice cannot serve as context.
    if not result.snippets:
        return True, "no_snippets"

    # Low-confidence matches fall back only when the config allows it.
    threshold = float(config.get("confidence_threshold", 0.2))
    low_confidence = result.confidence < threshold
    if low_confidence and config.get("fallback_to_normal_on_low_confidence", True):
        return True, "low_confidence"

    return False, None
@@ -0,0 +1 @@
1
+ """Coverage integration package."""
@@ -0,0 +1,55 @@
1
+ """Coverage.py JSON report loader."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+
10
@dataclass(frozen=True, slots=True)
class CoverageFileRecord:
    """Coverage record for a single file."""

    # File path normalized to POSIX separators.
    path: str
    # Line numbers Coverage.py reports as executed.
    executed_lines: tuple[int, ...]
    # Line numbers Coverage.py reports as never executed.
    missing_lines: tuple[int, ...]
    # "percent_covered" value from the per-file summary block.
    percent_covered: float
    # Remaining counters from the per-file summary block.
    num_statements: int
    num_branches: int
    num_partial_branches: int
21
+
22
+
23
@dataclass(frozen=True, slots=True)
class CoverageReport:
    """Loaded coverage report."""

    # Per-file records, sorted by path; empty when no report file exists.
    files: tuple[CoverageFileRecord, ...]
28
+
29
+
30
def load_coverage_json(path: str | Path) -> CoverageReport:
    """Load a Coverage.py JSON report from disk.

    Returns an empty report when the file does not exist; a malformed JSON
    file raises (json.JSONDecodeError propagates to the caller).
    """
    report_path = Path(path)
    if not report_path.exists():
        return CoverageReport(files=())

    payload = json.loads(report_path.read_text(encoding="utf-8"))

    records: list[CoverageFileRecord] = []
    for file_path, entry in payload.get("files", {}).items():
        summary = entry.get("summary", {})
        record = CoverageFileRecord(
            path=Path(file_path).as_posix(),
            executed_lines=tuple(entry.get("executed_lines", [])),
            missing_lines=tuple(entry.get("missing_lines", [])),
            percent_covered=float(summary.get("percent_covered", 0.0)),
            num_statements=int(summary.get("num_statements", 0)),
            num_branches=int(summary.get("num_branches", 0)),
            num_partial_branches=int(summary.get("num_partial_branches", 0)),
        )
        records.append(record)

    # Deterministic ordering regardless of dict insertion order.
    return CoverageReport(files=tuple(sorted(records, key=lambda r: r.path)))
@@ -0,0 +1,12 @@
1
+ """Debugging package."""
2
+
3
+ from .bug_localizer import LocalizedBugReport, localize_bug
4
+ from .execution_path_analyzer import ExecutionPath, neighborhood_path, shortest_path_between
5
+
6
+ __all__ = [
7
+ "ExecutionPath",
8
+ "LocalizedBugReport",
9
+ "localize_bug",
10
+ "neighborhood_path",
11
+ "shortest_path_between",
12
+ ]
@@ -0,0 +1,81 @@
1
+ """Bug localization workflow for GCIE debugging queries."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ import networkx as nx
8
+
9
+ from retrieval.hybrid_retriever import hybrid_retrieve
10
+ from retrieval.query_parser import parse_query
11
+
12
+ from .execution_path_analyzer import neighborhood_path
13
+
14
+
15
@dataclass(frozen=True, slots=True)
class LocalizedBugReport:
    """Structured result of a bug-localization query (see localize_bug)."""

    # Original debugging query text.
    query: str
    # Symbols the query parser extracted from the query.
    target_symbols: tuple[str, ...]
    # Ranked candidates restricted to "function:" nodes.
    relevant_functions: tuple[str, ...]
    # Graph neighborhood around the top-ranked function; empty when none matched.
    call_chain: tuple[str, ...]
    # Function nodes with READS/WRITES/MODIFIES edges onto the target symbols.
    variable_modifications: tuple[str, ...]
    # Full hybrid-retrieval ranking, all node kinds included.
    ranked_candidates: tuple[str, ...]
23
+
24
+
25
+ def _function_nodes_touching_symbol(graph: nx.DiGraph, symbol: str) -> tuple[str, ...]:
26
+ out: set[str] = set()
27
+ needle = symbol.lower()
28
+
29
+ for src, dst, attrs in graph.edges(data=True):
30
+ edge_type = str(attrs.get("type", ""))
31
+ if edge_type not in {"WRITES", "MODIFIES", "READS"}:
32
+ continue
33
+
34
+ if str(dst).lower() == f"variable:{needle}" and str(src).startswith("function:"):
35
+ out.add(str(src))
36
+
37
+ return tuple(sorted(out))
38
+
39
+
40
def localize_bug(
    graph: nx.DiGraph,
    query: str,
    *,
    git_recency_by_node: dict[str, float] | None = None,
    coverage_risk_by_node: dict[str, float] | None = None,
) -> LocalizedBugReport:
    """Localize likely bug sources from a debugging query.

    Pipeline: parse the query for symbols, find function nodes that touch
    those symbols' variable nodes, run hybrid retrieval over the graph
    (optionally biased by git recency and coverage risk), then expand a
    2-hop neighborhood around the top-ranked function node.

    Args:
        graph: Code graph with typed nodes ("function:", "variable:", ...) and edges.
        query: Free-text debugging query.
        git_recency_by_node: Optional per-node recency scores passed to retrieval.
        coverage_risk_by_node: Optional per-node risk scores passed to retrieval.

    Returns:
        A LocalizedBugReport bundling ranked candidates, function matches,
        the call-chain neighborhood, and symbol-touching functions.
    """
    parsed = parse_query(query)
    symbols = parsed.symbols

    # Functions that read/write/modify any queried symbol's variable node.
    variable_mods: set[str] = set()
    for symbol in symbols:
        variable_mods.update(_function_nodes_touching_symbol(graph, symbol))

    hybrid = hybrid_retrieve(
        graph,
        query,
        git_recency_by_node=git_recency_by_node,
        coverage_risk_by_node=coverage_risk_by_node,
        max_hops=3,
        top_k=10,
    )
    ranked = tuple(c.node_id for c in hybrid)

    # Keep retrieval order; filter down to function nodes only.
    relevant_functions = tuple(
        node_id for node_id in ranked if node_id.startswith("function:")
    )

    # Approximate a call chain as the 2-hop neighborhood of the best function.
    chain: tuple[str, ...] = ()
    if relevant_functions:
        seed = relevant_functions[0]
        chain = neighborhood_path(graph, seed=seed, hops=2).nodes

    return LocalizedBugReport(
        query=query,
        target_symbols=symbols,
        relevant_functions=relevant_functions,
        call_chain=chain,
        variable_modifications=tuple(sorted(variable_mods)),
        ranked_candidates=ranked,
    )
@@ -0,0 +1,42 @@
1
+ """Execution path analysis helpers for debugging output."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ import networkx as nx
8
+
9
+
10
@dataclass(frozen=True, slots=True)
class ExecutionPath:
    """A group of graph nodes plus a label for how it was derived."""

    # Node ids: in path order for shortest_path_between, sorted for neighborhood_path.
    nodes: tuple[str, ...]
    # Strategy label, e.g. "shortest_undirected_path" or "neighborhood_hops=N".
    reason: str
+
15
+
16
def shortest_path_between(graph: nx.DiGraph, source: str, target: str) -> ExecutionPath | None:
    """Return the shortest undirected path between two nodes when available.

    Edge direction is deliberately ignored (to_undirected) so caller/callee
    relationships connect in both directions. Returns None instead of raising
    when the nodes are unknown or disconnected.
    """
    try:
        path = nx.shortest_path(graph.to_undirected(), source=source, target=target)
        return ExecutionPath(nodes=tuple(path), reason="shortest_undirected_path")
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # No path / missing endpoint: signal absence rather than propagate.
        return None
23
+
24
+
25
def neighborhood_path(graph: nx.DiGraph, seed: str, hops: int = 2) -> ExecutionPath:
    """Return the bounded BFS neighborhood around *seed*.

    Expands up to *hops* levels in both edge directions (predecessors and
    successors) and returns all reached nodes, including the seed, sorted.
    """
    visited = {seed}
    current = {seed}
    remaining = max(hops, 0)

    while remaining and current:
        expanded: set[str] = set()
        for node in current:
            expanded.update(graph.predecessors(node))
            expanded.update(graph.successors(node))
        # Only genuinely new nodes form the next frontier.
        current = expanded - visited
        visited |= current
        remaining -= 1

    return ExecutionPath(nodes=tuple(sorted(visited)), reason=f"neighborhood_hops={hops}")
@@ -0,0 +1,6 @@
1
+ """Embeddings package."""
2
+
3
+ from .encoder import TextEncoder
4
+ from .faiss_index import SearchHit, VectorIndex
5
+
6
+ __all__ = ["SearchHit", "TextEncoder", "VectorIndex"]
@@ -0,0 +1,45 @@
1
+ """Embedding encoder with SentenceTransformers fallback."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import math
7
+ from typing import Iterable
8
+
9
+
10
+ def _fallback_vector(text: str, dims: int = 64) -> list[float]:
11
+ vec = [0.0] * dims
12
+ tokens = text.lower().split()
13
+ if not tokens:
14
+ return vec
15
+
16
+ for tok in tokens:
17
+ digest = hashlib.sha256(tok.encode("utf-8")).digest()
18
+ idx = int.from_bytes(digest[:4], "big") % dims
19
+ sign = 1.0 if digest[4] % 2 == 0 else -1.0
20
+ vec[idx] += sign
21
+
22
+ norm = math.sqrt(sum(v * v for v in vec)) or 1.0
23
+ return [v / norm for v in vec]
24
+
25
+
26
class TextEncoder:
    """SentenceTransformer-compatible text encoder with deterministic fallback.

    When the optional sentence-transformers package (or the requested model)
    is unavailable, encoding degrades to the hashed fallback vectors.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        self.model_name = model_name
        # Probe the optional dependency; any failure leaves _model as None.
        self._model = None
        try:
            from sentence_transformers import SentenceTransformer  # type: ignore

            self._model = SentenceTransformer(model_name)
        except Exception:
            self._model = None

    def encode(self, texts: Iterable[str]) -> list[list[float]]:
        """Encode *texts* into one embedding vector per input string."""
        items = list(texts)
        if self._model is None:
            # Deterministic hashed embeddings when no model is loaded.
            return [_fallback_vector(text) for text in items]
        raw = self._model.encode(items, normalize_embeddings=True)
        return [[float(value) for value in row] for row in raw]
+ return [_fallback_vector(text) for text in values]