@pmaddire/gcie 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/AGENT.md +256 -0
  2. package/AGENT_USAGE.md +231 -0
  3. package/ARCHITECTURE.md +151 -0
  4. package/CLAUDE.md +69 -0
  5. package/DEBUGGING_PLAYBOOK.md +160 -0
  6. package/KNOWLEDGE_INDEX.md +154 -0
  7. package/POTENTIAL_UPDATES +130 -0
  8. package/PROJECT.md +141 -0
  9. package/README.md +371 -0
  10. package/REPO_DIGITAL_TWIN.md +98 -0
  11. package/ROADMAP.md +301 -0
  12. package/SETUP_ANY_REPO.md +85 -0
  13. package/bin/gcie-init.js +20 -0
  14. package/bin/gcie.js +45 -0
  15. package/cli/__init__.py +1 -0
  16. package/cli/app.py +163 -0
  17. package/cli/commands/__init__.py +1 -0
  18. package/cli/commands/cache.py +35 -0
  19. package/cli/commands/context.py +2426 -0
  20. package/cli/commands/context_slices.py +617 -0
  21. package/cli/commands/debug.py +24 -0
  22. package/cli/commands/index.py +17 -0
  23. package/cli/commands/query.py +20 -0
  24. package/cli/commands/setup.py +73 -0
  25. package/config/__init__.py +1 -0
  26. package/config/scanner_config.py +82 -0
  27. package/context/__init__.py +1 -0
  28. package/context/architecture_bootstrap.py +170 -0
  29. package/context/architecture_index.py +185 -0
  30. package/context/architecture_parser.py +170 -0
  31. package/context/architecture_slicer.py +308 -0
  32. package/context/context_router.py +70 -0
  33. package/context/fallback_evaluator.py +21 -0
  34. package/coverage_integration/__init__.py +1 -0
  35. package/coverage_integration/coverage_loader.py +55 -0
  36. package/debugging/__init__.py +12 -0
  37. package/debugging/bug_localizer.py +81 -0
  38. package/debugging/execution_path_analyzer.py +42 -0
  39. package/embeddings/__init__.py +6 -0
  40. package/embeddings/encoder.py +45 -0
  41. package/embeddings/faiss_index.py +72 -0
  42. package/git_integration/__init__.py +1 -0
  43. package/git_integration/git_miner.py +78 -0
  44. package/graphs/__init__.py +17 -0
  45. package/graphs/call_graph.py +70 -0
  46. package/graphs/code_graph.py +81 -0
  47. package/graphs/execution_graph.py +35 -0
  48. package/graphs/git_graph.py +43 -0
  49. package/graphs/graph_store.py +25 -0
  50. package/graphs/node_factory.py +21 -0
  51. package/graphs/test_graph.py +65 -0
  52. package/graphs/validators.py +28 -0
  53. package/graphs/variable_graph.py +51 -0
  54. package/knowledge_index/__init__.py +1 -0
  55. package/knowledge_index/index_builder.py +60 -0
  56. package/knowledge_index/models.py +35 -0
  57. package/knowledge_index/query_api.py +38 -0
  58. package/knowledge_index/store.py +23 -0
  59. package/llm_context/__init__.py +6 -0
  60. package/llm_context/context_builder.py +67 -0
  61. package/llm_context/snippet_selector.py +57 -0
  62. package/package.json +14 -0
  63. package/parser/__init__.py +18 -0
  64. package/parser/ast_parser.py +216 -0
  65. package/parser/call_resolver.py +52 -0
  66. package/parser/models.py +75 -0
  67. package/parser/tree_sitter_adapter.py +56 -0
  68. package/parser/variable_extractor.py +31 -0
  69. package/retrieval/__init__.py +17 -0
  70. package/retrieval/cache.py +22 -0
  71. package/retrieval/hybrid_retriever.py +249 -0
  72. package/retrieval/query_parser.py +38 -0
  73. package/retrieval/ranking.py +43 -0
  74. package/retrieval/semantic_retriever.py +39 -0
  75. package/retrieval/symbolic_retriever.py +80 -0
  76. package/scanner/__init__.py +5 -0
  77. package/scanner/file_filters.py +37 -0
  78. package/scanner/models.py +44 -0
  79. package/scanner/repository_scanner.py +55 -0
  80. package/scripts/bootstrap_from_github.ps1 +41 -0
  81. package/tracing/__init__.py +1 -0
  82. package/tracing/runtime_tracer.py +60 -0
@@ -0,0 +1,31 @@
1
+ """Variable dependency extraction helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from .models import FunctionEntry, ModuleParseResult
8
+
9
+
10
+ @dataclass(frozen=True, slots=True)
11
+ class VariableDependency:
12
+ """Represents variable usage by a function."""
13
+
14
+ function_name: str
15
+ variable_name: str
16
+ access_type: str # READS | WRITES | MODIFIES
17
+
18
+
19
+ def extract_variable_dependencies(module: ModuleParseResult) -> tuple[VariableDependency, ...]:
20
+ """Extract variable read/write/modifies relationships from parsed functions."""
21
+ out: list[VariableDependency] = []
22
+
23
+ for fn in module.functions:
24
+ for name in sorted(set(fn.variables_read)):
25
+ out.append(VariableDependency(function_name=fn.name, variable_name=name, access_type="READS"))
26
+
27
+ for name in sorted(set(fn.variables_written)):
28
+ out.append(VariableDependency(function_name=fn.name, variable_name=name, access_type="WRITES"))
29
+ out.append(VariableDependency(function_name=fn.name, variable_name=name, access_type="MODIFIES"))
30
+
31
+ return tuple(out)
@@ -0,0 +1,17 @@
1
+ """Retrieval package."""
2
+
3
+ from .hybrid_retriever import HybridCandidate, hybrid_retrieve
4
+ from .query_parser import ParsedQuery, parse_query
5
+ from .semantic_retriever import SemanticCandidate, SemanticRetriever
6
+ from .symbolic_retriever import SymbolicCandidate, symbolic_retrieve
7
+
8
+ __all__ = [
9
+ "HybridCandidate",
10
+ "ParsedQuery",
11
+ "SemanticCandidate",
12
+ "SemanticRetriever",
13
+ "SymbolicCandidate",
14
+ "hybrid_retrieve",
15
+ "parse_query",
16
+ "symbolic_retrieve",
17
+ ]
@@ -0,0 +1,22 @@
1
+ """In-memory retrieval cache."""
2
+
3
from __future__ import annotations

from dataclasses import dataclass, field
6
+
7
+
8
+ @dataclass(slots=True)
9
+ class RetrievalCache:
10
+ _data: dict[str, tuple[str, ...]]
11
+
12
+ def __init__(self) -> None:
13
+ self._data = {}
14
+
15
+ def get(self, key: str) -> tuple[str, ...] | None:
16
+ return self._data.get(key)
17
+
18
+ def set(self, key: str, value: tuple[str, ...]) -> None:
19
+ self._data[key] = value
20
+
21
+ def clear(self) -> None:
22
+ self._data.clear()
@@ -0,0 +1,249 @@
1
+ """Hybrid symbolic + semantic retrieval pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ import re
7
+
8
+ import networkx as nx
9
+
10
+ from .ranking import HybridScore, combine_score
11
+ from .semantic_retriever import SemanticRetriever
12
+ from .symbolic_retriever import symbolic_retrieve
13
+
14
+
15
@dataclass(frozen=True, slots=True)
class HybridCandidate:
    """A ranked retrieval result returned by hybrid_retrieve."""

    node_id: str    # graph node identifier
    score: float    # combined hybrid score (higher is better)
    rationale: str  # human-readable breakdown of the component signals


@dataclass(frozen=True, slots=True)
class HybridDiagnostics:
    """Intermediate candidate sets exposed for inspection/debugging."""

    query_terms: tuple[str, ...]          # normalized terms extracted from the query
    symbolic_candidates: tuple[str, ...]  # node ids found by symbolic graph traversal
    semantic_candidates: tuple[str, ...]  # raw node ids hit by the semantic retriever
    merged_candidates: tuple[str, ...]    # sorted union of all candidate sources


@dataclass(frozen=True, slots=True)
class _SemanticAggregate:
    """Per-node aggregation of one or more semantic retriever hits."""

    node_id: str            # canonical (usually file-level) node id
    score: float            # blended semantic score, capped at 1.0
    hit_count: int          # number of raw hits folded into this node
    best_text_score: float  # best raw similarity score among the hits
    lexical_overlap: int    # best query-term overlap with id/label/path
    path_relevance: float   # best path-based relevance bonus
38
+
39
+
40
+ _STOPWORDS = {
41
+ "how",
42
+ "does",
43
+ "when",
44
+ "what",
45
+ "why",
46
+ "where",
47
+ "which",
48
+ "the",
49
+ "this",
50
+ "that",
51
+ "into",
52
+ "from",
53
+ "with",
54
+ "files",
55
+ "file",
56
+ "used",
57
+ "using",
58
+ }
59
+
60
+
61
+ def _query_terms(query: str) -> tuple[str, ...]:
62
+ terms = {
63
+ part
64
+ for token in re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*", query.lower())
65
+ for part in token.split("_")
66
+ if len(part) >= 3 and part not in _STOPWORDS
67
+ }
68
+ return tuple(sorted(terms))
69
+
70
+
71
+ def _node_path(node_id: str, attrs: dict) -> str:
72
+ path = str(attrs.get("path") or attrs.get("file") or "")
73
+ if path:
74
+ return path
75
+ if node_id.startswith("file:"):
76
+ return node_id[5:]
77
+ if node_id.startswith(("function:", "class:")):
78
+ return node_id.split(":", 1)[1].split("::", 1)[0]
79
+ return ""
80
+
81
+
82
+ def _canonical_semantic_node_id(graph: nx.DiGraph, node_id: str, attrs: dict) -> str:
83
+ path = _node_path(node_id, attrs)
84
+ file_node_id = f"file:{path}" if path else ""
85
+ if file_node_id and graph.has_node(file_node_id):
86
+ return file_node_id
87
+ return node_id
88
+
89
+
90
+ def _lexical_overlap(node_id: str, attrs: dict, query_terms: tuple[str, ...]) -> int:
91
+ if not query_terms:
92
+ return 0
93
+ label = str(attrs.get("label", ""))
94
+ path = _node_path(node_id, attrs)
95
+ haystack = f"{node_id} {label} {path}".lower()
96
+ return sum(1 for term in query_terms if term in haystack)
97
+
98
+
99
+ def _path_relevance(node_id: str, attrs: dict, query_terms: tuple[str, ...]) -> float:
100
+ if not query_terms:
101
+ return 0.0
102
+ path = _node_path(node_id, attrs).lower()
103
+ if not path:
104
+ return 0.0
105
+ parts = {part for part in re.split(r"[^a-zA-Z0-9_]+", path) if part}
106
+ overlap = sum(1 for term in query_terms if term in path)
107
+ exact_parts = sum(1 for term in query_terms if term in parts)
108
+ return overlap * 0.08 + exact_parts * 0.05
109
+
110
+
111
def _semantic_node_scores(
    graph: nx.DiGraph,
    query: str,
    *,
    top_k: int,
) -> tuple[dict[str, _SemanticAggregate], tuple[str, ...]]:
    """Score graph nodes semantically against *query*.

    Builds one text entry per node (id + path + label), retrieves the most
    similar entries, then folds hits onto canonical (file-level) nodes.
    Returns the per-node aggregates and the raw hit node ids in rank order.
    """
    query_terms = _query_terms(query)
    entries: list[tuple[str, str]] = []
    # Deterministic node order so retriever indices are stable across runs.
    for node_id, attrs in sorted(graph.nodes(data=True), key=lambda item: item[0]):
        label = str(attrs.get("label", ""))
        path = _node_path(node_id, attrs)
        text = f"{node_id} {path} {label}".strip()
        entries.append((node_id, text))

    if not entries:
        return {}, ()

    retriever = SemanticRetriever([text for _, text in entries])
    # Over-fetch (4x requested, at least 24) so the aggregation step has
    # enough raw hits to merge onto canonical nodes.
    semantic_top_k = min(max(top_k * 4, 24), len(entries))
    hits = retriever.retrieve(query, top_k=semantic_top_k)

    aggregates: dict[str, _SemanticAggregate] = {}
    raw_hits: list[str] = []
    for hit in hits:
        source_node_id, _ = entries[hit.idx]
        attrs = graph.nodes[source_node_id]
        # Collapse function/class hits onto their containing file node when present.
        target_node_id = _canonical_semantic_node_id(graph, source_node_id, attrs)
        target_attrs = graph.nodes[target_node_id] if graph.has_node(target_node_id) else attrs
        lexical = _lexical_overlap(target_node_id, target_attrs, query_terms)
        path_rel = _path_relevance(target_node_id, target_attrs, query_terms)
        raw_hits.append(source_node_id)

        # Keep the best of each signal seen so far for this canonical node;
        # each repeated hit adds a 0.03 bonus, capped at 0.12 total.
        existing = aggregates.get(target_node_id)
        hit_count = 1 if existing is None else existing.hit_count + 1
        best_text_score = hit.score if existing is None else max(existing.best_text_score, hit.score)
        best_lexical = lexical if existing is None else max(existing.lexical_overlap, lexical)
        best_path = path_rel if existing is None else max(existing.path_relevance, path_rel)
        bonus = min(0.12, 0.03 * max(hit_count - 1, 0))
        # Lexical and path contributions are each capped at 0.08; the final
        # blended score is capped at 1.0.
        aggregate_score = min(1.0, best_text_score + bonus + min(0.08, best_lexical * 0.02) + min(0.08, best_path))
        aggregates[target_node_id] = _SemanticAggregate(
            node_id=target_node_id,
            score=aggregate_score,
            hit_count=hit_count,
            best_text_score=best_text_score,
            lexical_overlap=best_lexical,
            path_relevance=best_path,
        )

    return aggregates, tuple(raw_hits)
160
+
161
+
162
def collect_hybrid_diagnostics(
    graph: nx.DiGraph,
    query: str,
    *,
    git_recency_by_node: dict[str, float] | None = None,
    coverage_risk_by_node: dict[str, float] | None = None,
    max_hops: int = 2,
    top_k: int = 10,
) -> HybridDiagnostics:
    """Run both retrieval stages and report their candidate sets.

    Useful for inspecting why a node did (or did not) surface: returns the
    query terms plus the symbolic, semantic, and merged candidate id sets.
    """
    sym_candidates = symbolic_retrieve(graph, query, max_hops=max_hops)
    sem_scores, sem_hit_ids = _semantic_node_scores(graph, query, top_k=top_k)
    # The merged universe is every node surfaced by any signal source.
    universe: set[str] = {cand.node_id for cand in sym_candidates}
    universe.update(sem_scores)
    universe.update(git_recency_by_node or {})
    universe.update(coverage_risk_by_node or {})
    return HybridDiagnostics(
        query_terms=_query_terms(query),
        symbolic_candidates=tuple(cand.node_id for cand in sym_candidates),
        semantic_candidates=sem_hit_ids,
        merged_candidates=tuple(sorted(universe)),
    )
182
+
183
+
184
def hybrid_retrieve(
    graph: nx.DiGraph,
    query: str,
    *,
    git_recency_by_node: dict[str, float] | None = None,
    coverage_risk_by_node: dict[str, float] | None = None,
    max_hops: int = 2,
    top_k: int = 10,
) -> tuple[HybridCandidate, ...]:
    """Rank retrieval candidates with hybrid weighted scoring.

    Combines four per-node signals — symbolic graph distance, semantic
    similarity, git recency, and coverage risk — via combine_score, then
    orders ties deterministically. Returns the top_k candidates, each with
    a rationale string listing its component signals.
    """
    query_terms = _query_terms(query)
    symbolic = symbolic_retrieve(graph, query, max_hops=max_hops)
    sym_distance = {c.node_id: c.distance for c in symbolic}
    semantic, _ = _semantic_node_scores(graph, query, top_k=top_k)

    git_map = git_recency_by_node or {}
    cov_map = coverage_risk_by_node or {}

    # Candidate universe: any node surfaced by at least one signal source.
    all_nodes = sorted(set(sym_distance) | set(semantic) | set(git_map) | set(cov_map))
    scored: list[HybridScore] = []

    for node_id in all_nodes:
        # Nodes supplied only via the git/coverage maps may be absent from the graph.
        attrs = graph.nodes[node_id] if graph.has_node(node_id) else {}
        semantic_item = semantic.get(node_id)
        score = combine_score(
            symbolic_distance=sym_distance.get(node_id),
            semantic_score=semantic_item.score if semantic_item else None,
            git_recency=git_map.get(node_id),
            coverage_risk=cov_map.get(node_id),
        )
        scored.append(
            HybridScore(
                node_id=node_id,
                score=score,
                symbolic_distance=sym_distance.get(node_id),
                semantic_score=semantic_item.score if semantic_item else None,
                git_recency=git_map.get(node_id),
                coverage_risk=cov_map.get(node_id),
                lexical_overlap=_lexical_overlap(node_id, attrs, query_terms),
                path_relevance=_path_relevance(node_id, attrs, query_terms),
                semantic_hits=0 if semantic_item is None else semantic_item.hit_count,
            )
        )

    # Sort by descending score, then descending lexical/path/hit signals,
    # then ascending symbolic distance (9999 when absent), then node id.
    # NOTE(review): round(..., 10) presumably coarsens floats so near-equal
    # scores fall through to the deterministic tie-breakers — confirm intent.
    scored.sort(
        key=lambda s: (
            -round(s.score, 10),
            -s.lexical_overlap,
            -round(s.path_relevance, 10),
            -s.semantic_hits,
            s.symbolic_distance if s.symbolic_distance is not None else 9999,
            s.node_id,
        )
    )

    out: list[HybridCandidate] = []
    for item in scored[:top_k]:
        rationale = (
            f"symbolic_distance={item.symbolic_distance}, "
            f"semantic={item.semantic_score}, semantic_hits={item.semantic_hits}, "
            f"lexical_overlap={item.lexical_overlap}, path_relevance={item.path_relevance}, "
            f"git={item.git_recency}, coverage={item.coverage_risk}"
        )
        out.append(HybridCandidate(node_id=item.node_id, score=item.score, rationale=rationale))

    return tuple(out)
@@ -0,0 +1,38 @@
1
+ """Query parsing utilities for retrieval."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+
8
+
9
+ @dataclass(frozen=True, slots=True)
10
+ class ParsedQuery:
11
+ raw: str
12
+ tokens: tuple[str, ...]
13
+ symbols: tuple[str, ...]
14
+
15
+
16
+ def parse_query(query: str) -> ParsedQuery:
17
+ """Extract likely symbol tokens from a user query."""
18
+ lowered = query.strip().lower()
19
+ words = tuple(re.findall(r"[a-zA-Z_][a-zA-Z0-9_\.]*", lowered))
20
+
21
+ stop = {
22
+ "why",
23
+ "is",
24
+ "the",
25
+ "a",
26
+ "an",
27
+ "to",
28
+ "in",
29
+ "of",
30
+ "and",
31
+ "or",
32
+ "for",
33
+ "variable",
34
+ "function",
35
+ "class",
36
+ }
37
+ symbols = tuple(w for w in words if w not in stop)
38
+ return ParsedQuery(raw=query, tokens=words, symbols=symbols)
@@ -0,0 +1,43 @@
1
+ """Ranking utilities for hybrid retrieval."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+
8
@dataclass(frozen=True, slots=True)
class HybridScore:
    """Full per-node scoring record used for ranking and tie-breaking."""

    node_id: str
    score: float                   # combined weighted score from combine_score
    symbolic_distance: int | None  # hop distance from a symbolic seed, if any
    semantic_score: float | None   # aggregated semantic similarity, if any
    git_recency: float | None      # caller-supplied git recency signal
    coverage_risk: float | None    # caller-supplied coverage risk signal
    lexical_overlap: int = 0       # query-term overlap with id/label/path
    path_relevance: float = 0.0    # path-based relevance bonus
    semantic_hits: int = 0         # number of raw semantic hits for the node
19
+
20
+
21
def combine_score(
    *,
    symbolic_distance: int | None,
    semantic_score: float | None,
    git_recency: float | None,
    coverage_risk: float | None,
    w_symbolic: float = 0.4,
    w_semantic: float = 0.3,
    w_git: float = 0.2,
    w_coverage: float = 0.1,
) -> float:
    """Blend the individual retrieval signals into one weighted score.

    Missing signals (None) contribute zero. Symbolic distance is converted
    to a closeness value 1/(1+d), so nearer nodes score higher.
    """
    closeness = 0.0 if symbolic_distance is None else 1.0 / (1.0 + float(symbolic_distance))
    weighted = (
        (w_symbolic, closeness),
        (w_semantic, 0.0 if semantic_score is None else float(semantic_score)),
        (w_git, 0.0 if git_recency is None else float(git_recency)),
        (w_coverage, 0.0 if coverage_risk is None else float(coverage_risk)),
    )
    return sum(weight * value for weight, value in weighted)
@@ -0,0 +1,39 @@
1
+ """Semantic retriever implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from embeddings.encoder import TextEncoder
8
+ from embeddings.faiss_index import VectorIndex
9
+
10
+
11
@dataclass(frozen=True, slots=True)
class SemanticCandidate:
    """One snippet returned by SemanticRetriever.retrieve."""

    idx: int      # index of the snippet in the retriever's snippet list
    text: str     # the snippet text itself
    score: float  # similarity score reported by the vector index
16
+
17
+
18
class SemanticRetriever:
    """Semantic retrieval over text snippets."""

    def __init__(self, snippets: list[str]) -> None:
        # Encode every snippet up front and load the vectors into the index.
        self._snippets = snippets
        self._encoder = TextEncoder()
        self._index = VectorIndex()
        self._index.add(self._encoder.encode(snippets))

    def retrieve(self, query: str, top_k: int = 5) -> tuple[SemanticCandidate, ...]:
        """Return up to *top_k* snippets ranked by similarity to *query*.

        Hits whose index falls outside the snippet list are dropped.
        """
        query_vector = self._encoder.encode([query])[0]
        matches = self._index.search(query_vector, top_k=top_k)
        return tuple(
            SemanticCandidate(idx=m.idx, text=self._snippets[m.idx], score=m.score)
            for m in matches
            if 0 <= m.idx < len(self._snippets)
        )
@@ -0,0 +1,80 @@
1
+ """Symbolic retriever over GCIE graphs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import deque
6
+ from dataclasses import dataclass
7
+
8
+ import networkx as nx
9
+
10
+ from .query_parser import ParsedQuery, parse_query
11
+
12
+
13
@dataclass(frozen=True, slots=True)
class SymbolicCandidate:
    """A graph node reached by traversal from a query-matched seed."""

    node_id: str    # graph node identifier
    node_type: str  # node "type" attribute, "unknown" when missing
    distance: int   # hops from the nearest seed node (0 for seeds)
18
+
19
+
20
+ def _symbol_matches(text: str, symbol: str) -> bool:
21
+ t = text.lower()
22
+ s = symbol.lower()
23
+ return s == t or s in t
24
+
25
+
26
def _seed_nodes(graph: nx.DiGraph, parsed: ParsedQuery) -> list[str]:
    """Return sorted node ids whose id, label, or path mentions a query symbol."""
    matched: set[str] = set()
    for node_id, attrs in graph.nodes(data=True):
        label = str(attrs.get("label", ""))
        path = str(attrs.get("path", attrs.get("file", "")))
        for sym in parsed.symbols:
            if _symbol_matches(node_id, sym) or _symbol_matches(label, sym) or _symbol_matches(path, sym):
                matched.add(node_id)
                break
    return sorted(matched)
34
+
35
+
36
+ def _bounded_traversal(graph: nx.DiGraph, seeds: list[str], max_hops: int) -> dict[str, int]:
37
+ distances: dict[str, int] = {}
38
+ queue: deque[tuple[str, int]] = deque((seed, 0) for seed in seeds)
39
+
40
+ while queue:
41
+ node, dist = queue.popleft()
42
+ if node in distances and dist >= distances[node]:
43
+ continue
44
+ distances[node] = dist
45
+
46
+ if dist >= max_hops:
47
+ continue
48
+
49
+ neighbors = set(graph.predecessors(node)).union(graph.successors(node))
50
+ for nxt in neighbors:
51
+ queue.append((nxt, dist + 1))
52
+
53
+ return distances
54
+
55
+
56
def symbolic_retrieve(graph: nx.DiGraph, query: str, *, max_hops: int = 2) -> tuple[SymbolicCandidate, ...]:
    """Retrieve symbolic candidates by seeded graph traversal.

    Parses the query into symbols, seeds the traversal at matching nodes,
    and returns every node within *max_hops*, sorted by distance, type,
    then id. Empty symbol or seed sets yield an empty result.
    """
    parsed = parse_query(query)
    if not parsed.symbols:
        return ()

    seeds = _seed_nodes(graph, parsed)
    if not seeds:
        return ()

    candidates = [
        SymbolicCandidate(
            node_id=node_id,
            node_type=str(graph.nodes[node_id].get("type", "unknown")),
            distance=hops,
        )
        for node_id, hops in _bounded_traversal(graph, seeds, max_hops=max_hops).items()
    ]
    candidates.sort(key=lambda cand: (cand.distance, cand.node_type, cand.node_id))
    return tuple(candidates)
@@ -0,0 +1,5 @@
1
+ """Repository scanning package for GCIE."""
2
+
3
+ from .repository_scanner import scan_repository
4
+
5
+ __all__ = ["scan_repository"]
@@ -0,0 +1,37 @@
1
+ """Filtering and classification helpers for repository scanning."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
# File names treated as configuration regardless of extension.
CONFIG_FILENAMES = {
    "pyproject.toml",
    "requirements.txt",
    "requirements-dev.txt",
    "setup.py",
    "setup.cfg",
    "tox.ini",
    ".env",
}

# Extensions treated as configuration files.
CONFIG_EXTENSIONS = {".toml", ".ini", ".cfg", ".yaml", ".yml", ".json"}


def classify_file(relative_path: Path) -> str:
    """Classify file type for downstream indexing priorities.

    Priority order is test > config > source > other, so e.g. a YAML file
    under a tests/ directory is reported as "test".
    """
    name = relative_path.name.lower()
    suffix = relative_path.suffix.lower()
    rel = relative_path.as_posix().lower()

    # Prepend "/" so a top-level "tests/..." path also matches "/tests/".
    in_tests_dir = "/tests/" in f"/{rel}"
    if in_tests_dir or name.startswith("test_") or name.endswith("_test.py"):
        return "test"
    if name in CONFIG_FILENAMES or suffix in CONFIG_EXTENSIONS:
        return "config"
    if suffix in {".py", ".pyi"}:
        return "source"
    return "other"
33
+
34
+
35
def should_skip_hidden_file(path: Path, include_hidden: bool) -> bool:
    """Exclude dot-prefixed files unless hidden files are explicitly enabled."""
    if include_hidden:
        return False
    return path.name.startswith(".")
@@ -0,0 +1,44 @@
1
+ """Data models for repository scanning."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Literal
8
+
9
+ FileKind = Literal["source", "test", "config", "other"]
10
+
11
+
12
+ @dataclass(frozen=True, slots=True)
13
+ class ScannedFile:
14
+ """A file discovered by the repository scanner."""
15
+
16
+ path: Path
17
+ relative_path: Path
18
+ size_bytes: int
19
+ suffix: str
20
+ kind: FileKind
21
+
22
+
23
+ @dataclass(frozen=True, slots=True)
24
+ class RepositoryManifest:
25
+ """Normalized scanner output for downstream indexing."""
26
+
27
+ root: Path
28
+ files: tuple[ScannedFile, ...]
29
+
30
+ @property
31
+ def total_files(self) -> int:
32
+ return len(self.files)
33
+
34
+ @property
35
+ def source_files(self) -> tuple[ScannedFile, ...]:
36
+ return tuple(f for f in self.files if f.kind == "source")
37
+
38
+ @property
39
+ def test_files(self) -> tuple[ScannedFile, ...]:
40
+ return tuple(f for f in self.files if f.kind == "test")
41
+
42
+ @property
43
+ def config_files(self) -> tuple[ScannedFile, ...]:
44
+ return tuple(f for f in self.files if f.kind == "config")
@@ -0,0 +1,55 @@
1
+ """Repository scanner implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ from config.scanner_config import ScannerConfig
9
+
10
+ from .file_filters import classify_file, should_skip_hidden_file
11
+ from .models import RepositoryManifest, ScannedFile
12
+
13
+
14
def _iter_candidate_files(root: Path, config: ScannerConfig):
    """Yield (path, relative_path, size_bytes) triples in deterministic order.

    Directory and file names are sorted so repeated scans of the same tree
    produce identical output; excluded directories are pruned in place so
    os.walk never descends into them. The stat() call is last, and files
    that cannot be stat'ed are silently skipped.
    """
    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        dirnames[:] = sorted(name for name in dirnames if not config.is_excluded_dir(name))
        for filename in sorted(filenames):
            full = Path(dirpath) / filename
            rel = full.relative_to(root)

            if should_skip_hidden_file(full, config.include_hidden):
                continue
            if config.matches_exclude_glob(rel):
                continue
            if not config.allows_extension(full):
                continue
            try:
                size = full.stat().st_size
            except OSError:
                # Unreadable or vanished files are dropped, not fatal.
                continue
            if size > config.max_file_size_bytes:
                continue
            yield full, rel, size
35
+
36
+
37
def scan_repository(root: str | Path, config: ScannerConfig | None = None) -> RepositoryManifest:
    """Scan a repository and return a normalized manifest.

    Falls back to a default ScannerConfig when none is supplied; the
    resulting file tuple is sorted by POSIX-style relative path.
    """
    effective = config or ScannerConfig()
    base = Path(root).resolve()

    scanned = [
        ScannedFile(
            path=full,
            relative_path=rel,
            size_bytes=size,
            suffix=full.suffix.lower(),
            kind=classify_file(rel),
        )
        for full, rel, size in _iter_candidate_files(base, effective)
    ]
    scanned.sort(key=lambda entry: entry.relative_path.as_posix())
    return RepositoryManifest(root=base, files=tuple(scanned))