@pmaddire/gcie 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/AGENT.md +256 -0
  2. package/AGENT_USAGE.md +231 -0
  3. package/ARCHITECTURE.md +151 -0
  4. package/CLAUDE.md +69 -0
  5. package/DEBUGGING_PLAYBOOK.md +160 -0
  6. package/KNOWLEDGE_INDEX.md +154 -0
  7. package/POTENTIAL_UPDATES +130 -0
  8. package/PROJECT.md +141 -0
  9. package/README.md +371 -0
  10. package/REPO_DIGITAL_TWIN.md +98 -0
  11. package/ROADMAP.md +301 -0
  12. package/SETUP_ANY_REPO.md +85 -0
  13. package/bin/gcie-init.js +20 -0
  14. package/bin/gcie.js +45 -0
  15. package/cli/__init__.py +1 -0
  16. package/cli/app.py +163 -0
  17. package/cli/commands/__init__.py +1 -0
  18. package/cli/commands/cache.py +35 -0
  19. package/cli/commands/context.py +2426 -0
  20. package/cli/commands/context_slices.py +617 -0
  21. package/cli/commands/debug.py +24 -0
  22. package/cli/commands/index.py +17 -0
  23. package/cli/commands/query.py +20 -0
  24. package/cli/commands/setup.py +73 -0
  25. package/config/__init__.py +1 -0
  26. package/config/scanner_config.py +82 -0
  27. package/context/__init__.py +1 -0
  28. package/context/architecture_bootstrap.py +170 -0
  29. package/context/architecture_index.py +185 -0
  30. package/context/architecture_parser.py +170 -0
  31. package/context/architecture_slicer.py +308 -0
  32. package/context/context_router.py +70 -0
  33. package/context/fallback_evaluator.py +21 -0
  34. package/coverage_integration/__init__.py +1 -0
  35. package/coverage_integration/coverage_loader.py +55 -0
  36. package/debugging/__init__.py +12 -0
  37. package/debugging/bug_localizer.py +81 -0
  38. package/debugging/execution_path_analyzer.py +42 -0
  39. package/embeddings/__init__.py +6 -0
  40. package/embeddings/encoder.py +45 -0
  41. package/embeddings/faiss_index.py +72 -0
  42. package/git_integration/__init__.py +1 -0
  43. package/git_integration/git_miner.py +78 -0
  44. package/graphs/__init__.py +17 -0
  45. package/graphs/call_graph.py +70 -0
  46. package/graphs/code_graph.py +81 -0
  47. package/graphs/execution_graph.py +35 -0
  48. package/graphs/git_graph.py +43 -0
  49. package/graphs/graph_store.py +25 -0
  50. package/graphs/node_factory.py +21 -0
  51. package/graphs/test_graph.py +65 -0
  52. package/graphs/validators.py +28 -0
  53. package/graphs/variable_graph.py +51 -0
  54. package/knowledge_index/__init__.py +1 -0
  55. package/knowledge_index/index_builder.py +60 -0
  56. package/knowledge_index/models.py +35 -0
  57. package/knowledge_index/query_api.py +38 -0
  58. package/knowledge_index/store.py +23 -0
  59. package/llm_context/__init__.py +6 -0
  60. package/llm_context/context_builder.py +67 -0
  61. package/llm_context/snippet_selector.py +57 -0
  62. package/package.json +14 -0
  63. package/parser/__init__.py +18 -0
  64. package/parser/ast_parser.py +216 -0
  65. package/parser/call_resolver.py +52 -0
  66. package/parser/models.py +75 -0
  67. package/parser/tree_sitter_adapter.py +56 -0
  68. package/parser/variable_extractor.py +31 -0
  69. package/retrieval/__init__.py +17 -0
  70. package/retrieval/cache.py +22 -0
  71. package/retrieval/hybrid_retriever.py +249 -0
  72. package/retrieval/query_parser.py +38 -0
  73. package/retrieval/ranking.py +43 -0
  74. package/retrieval/semantic_retriever.py +39 -0
  75. package/retrieval/symbolic_retriever.py +80 -0
  76. package/scanner/__init__.py +5 -0
  77. package/scanner/file_filters.py +37 -0
  78. package/scanner/models.py +44 -0
  79. package/scanner/repository_scanner.py +55 -0
  80. package/scripts/bootstrap_from_github.ps1 +41 -0
  81. package/tracing/__init__.py +1 -0
  82. package/tracing/runtime_tracer.py +60 -0
package/README.md ADDED
@@ -0,0 +1,371 @@
1
+ # GraphCode Intelligence Engine (GCIE)
2
+
3
+ GCIE is a graph-first code intelligence engine that minimizes LLM prompt context.
4
+
5
+ It is designed for coding-agent workflows where we want to retrieve the smallest
6
+ useful set of code and operational context instead of reading whole files or
7
+ whole directories into the model.
8
+
9
+ ## How It Works
10
+
11
+ GCIE builds a retrieval-oriented view of a repository and then composes context
12
+ from several signals:
13
+
14
+ 1. Repository scan
15
+ - discovers source files, frontend files, config files, and selected docs
16
+ 2. Graph and index construction
17
+ - structure and relationship data
18
+ - semantic search index
19
+ - architecture-oriented metadata where available
20
+ 3. Multi-channel retrieval
21
+ - lexical filename/path/content matching
22
+ - semantic vector matching
23
+ - query expansion for code and system terms
24
+ - adjacency/support-file recovery
25
+ 4. Fusion and reranking
26
+ - merges candidates with stable deterministic ordering
27
+ - boosts exact file mentions, wiring files, and intent-relevant code
28
+ 5. Context packing
29
+ - returns compact snippets or file-level context depending on the task
30
+ - preserves important support files when confidence would otherwise be weak
31
+ 6. Fallback
32
+ - if the optimized path looks insufficient, GCIE can recover extra files via
33
+ a broader fallback search instead of silently returning thin context
34
+
35
+ The practical goal is simple: return the implementation file, the wiring file,
36
+ and the nearest supporting files that explain behavior, while avoiding the token
37
+ cost of sending full repo surfaces to the model.
38
+
39
+ ## Quick Start
40
+
41
+ 1. Create venv: `python -m venv .venv`
42
+ 2. Install deps as needed (networkx, GitPython, typer):
43
+ `.venv\\Scripts\\python.exe -m pip install networkx GitPython typer`
44
+ 3. Run tests: `.venv\\Scripts\\python.exe -m unittest`
45
+ 4. CLI help: `.venv\\Scripts\\python.exe -m cli.app --help`
46
+
47
+ ## Easiest Setup In Any Repo
48
+
49
+ Use this when you want a fast drop-in setup for coding agents.
50
+
51
+ 1. Install GCIE CLI in the target repo (via your preferred method: npm link, local wrapper, or direct Python module).
52
+ 2. Copy [AGENT_USAGE.md](AGENT_USAGE.md) into the target repo root.
53
+ 3. Run one index pass:
54
+ - `gcie.cmd index .`
55
+ 4. Start using adaptive retrieval immediately:
56
+ - `gcie.cmd context . "<task>" --intent edit --budget auto`
57
+
58
+ No heavy upfront tuning is required. The workflow starts with portable defaults and adds repo-local overrides only after the same kind of retrieval miss recurs.
59
+
60
+ One-command repo bootstrap:
61
+ - `gcie.cmd setup .`
62
+
63
+ This creates `.gcie` architecture tracking files, copies portable agent workflow docs, and runs an initial index pass.
64
+
65
+ ## Canonical Retrieval Protocol (2026-03)
66
+
67
+ Default protocol is now adaptive by task family:
68
+
69
+ 1. `plain-context-first` for most tasks
70
+ 2. `slicer-first` only where architecture/routed multi-hop families benchmark better
71
+ 3. `direct-file-check` (`rg`) whenever must-have coverage is uncertain
72
+
73
+ Key rule: one mode does not fit all families. Mode routing is part of retrieval quality.
74
+
75
+ ## Latest Protocol Benchmark Snapshot
76
+
77
+ Current protocol performance target: `78.9%` average token savings while preserving high accuracy.
78
+
79
+ From an external 50-query mixed-layer benchmark:
80
+
81
+ - Stable plain-context baseline:
82
+ - `1501.3` avg tokens
83
+ - `78.6%` savings
84
+ - `100%` accuracy
85
+ - `100%` full-hit
86
+ - Naive slicer-first:
87
+ - `1979.9` avg tokens
88
+ - `72.4%` savings
89
+ - `100%` accuracy
90
+ - `100%` full-hit
91
+ - Adapted family-routed protocol:
92
+ - `1372.3` avg tokens
93
+ - `79.5%` savings
94
+ - `100%` accuracy
95
+ - `100%` full-hit
96
+
97
+ Net: adapted protocol preserved full accuracy while reducing average tokens by ~`129` vs stable baseline.
98
+
99
+ ## NPX One-Liner
100
+
101
+ After publishing to npm, users can set up any repo with one command:
102
+
103
+ ```powershell
104
+ npx @pmaddire/gcie@latest
105
+ ```
106
+
107
+ This runs `gcie setup .` in the current repo by default.
108
+
109
+ Optional setup flags are passed through:
110
+
111
+ ```powershell
112
+ npx @pmaddire/gcie@latest --no-index
113
+ npx @pmaddire/gcie@latest --force
114
+ ```
115
+
116
+ ## One-Command GitHub Bootstrap
117
+
118
+ Run this from the target repo to download GCIE from GitHub and set it up automatically:
119
+
120
+ ```powershell
121
+ powershell -ExecutionPolicy Bypass -Command "iwr https://raw.githubusercontent.com/pmaddire/GBCRSS/main/scripts/bootstrap_from_github.ps1 | iex"
122
+ ```
123
+
124
+ What it does:
125
+ - clones `https://github.com/pmaddire/GBCRSS.git`
126
+ - creates a temporary GCIE venv
127
+ - installs minimal deps
128
+ - runs `gcie setup` against your current repo
129
+
130
+ ## In-Depth Setup
131
+
132
+ ### A) Use GCIE directly from this repo
133
+
134
+ 1. Create venv:
135
+ - `python -m venv .venv`
136
+ 2. Install deps:
137
+ - `.venv\\Scripts\\python.exe -m pip install -r requirements.txt`
138
+ - If `requirements.txt` is missing, install minimal deps:
139
+ - `.venv\\Scripts\\python.exe -m pip install networkx GitPython typer`
140
+ 3. Run the CLI:
141
+ - `.venv\\Scripts\\python.exe -m cli.app --help`
142
+
143
+ ### B) Use GCIE from another repo via npm link
144
+
145
+ 1. In the GCIE repo:
146
+ - `npm link`
147
+ 2. In your target repo:
148
+ - `npm link gcie`
149
+ 3. Verify:
150
+ - `gcie --help`
151
+
152
+ ### C) Windows note
153
+
154
+ If PowerShell blocks the shim, use `gcie.cmd` instead of `gcie`.
155
+
156
+ ## NPM Wrapper
157
+
158
+ This repo includes a lightweight npm wrapper so you can run `gcie` like other npm CLIs.
159
+
160
+ 1. In GCIE repo: `npm link`
161
+ 2. In target repo: `gcie --help`
162
+
163
+ Local option:
164
+ - `npm install` then `npx gcie --help`
165
+
166
+ The wrapper prefers `.venv` in the GCIE repo and falls back to system Python.
167
+
168
+ ## Performance Snapshot (AEO benchmark)
169
+
170
+ Two profiles were observed after the update:
171
+
172
+ High-recall profile (recommended):
173
+ - Total GCIE tokens: 5,871
174
+ - No-tool baseline: 23,543
175
+ - Savings: 75.1%
176
+ - Coverage: 5/5 required files for all 3 tasks
177
+
178
+ Low-token profile (aggressive):
179
+ - Total GCIE tokens: 2,709
180
+ - No-tool baseline: 23,543
181
+ - Savings: 88.5%
182
+ - Coverage: incomplete (missed key files)
183
+
184
+ Per-task high-recall results:
185
+ - export_ui: 1,934 vs 5,481 (64.7% saved)
186
+ - blank_canvas: 2,322 vs 13,730 (83.1% saved)
187
+ - refine_patch: 1,615 vs 4,332 (62.7% saved)
188
+
189
+ ## Current Accuracy And Token Snapshot
190
+
191
+ ### Mixed-layer external repo finding
192
+
193
+ In a separate active repo with frontend/backend/build wiring, the newer
194
+ repo-local `gcie context` workflow performed much better when used with:
195
+
196
+ - file-first, symbol-heavy queries
197
+ - `--budget 1200` for cross-layer tasks
198
+ - `rg` verification before edits
199
+
200
+ Observed savings there:
201
+
202
+ - Frontend/API task: about `89.5%`
203
+ - Theme/build task: about `91.9%`
204
+ - Backend/config task: about `78.2%`
205
+ - Average: about `86.5%`
206
+
207
+ Important note:
208
+
209
+ - `--budget auto` was too conservative for those cross-layer tasks
210
+ - `--budget 1200` consistently improved recall without needing broad manual reads
211
+ - `--budget 1500` added more noise without materially improving recall over `--budget 1200`
212
+
213
+ ## Core Commands
214
+
215
+ - `gcie index <path>`
216
+ - `gcie query <file.py> "<question>"`
217
+ - `gcie debug <file.py> "<question>"`
218
+ - `gcie context <repo|file> "<task>" --budget auto --intent <edit|debug|refactor|explore>`
219
+ - `gcie context-slices <repo> "<task>" --intent <edit|debug|refactor|explore> [--profile recall|low] [--stage-a 400] [--stage-b 800] [--max-total 1200] [--pin frontend/src/App.jsx] [--pin-budget 300] [--include-tests]`
220
+
221
+ ## How To Use It
222
+
223
+ ### 1. Index the repo
224
+
225
+ ```
226
+ gcie index .
227
+ ```
228
+
229
+ Re-run indexing after major structural changes.
230
+
231
+ ### 2. Start with plain `context`
232
+
233
+ ```
234
+ gcie context . "<task>" --budget auto --intent <edit|debug|refactor|explore>
235
+ ```
236
+
237
+ Recommended intent guidance:
238
+
239
+ - `edit`: making code changes
240
+ - `debug`: tracing a bug or incorrect behavior
241
+ - `refactor`: changing structure or interfaces
242
+ - `explore`: understanding code without immediate edits
243
+
244
+ ### 3. For cross-layer or wiring-heavy tasks, prefer a file-first query
245
+
246
+ This works better than abstract phrasing:
247
+
248
+ ```
249
+ gcie context . "frontend/src/App.jsx selectedTheme activeJobId /api/convert/start app.py start_convert" --budget 1200 --intent edit
250
+ ```
251
+
252
+ Good query ingredients:
253
+
254
+ - explicit file names
255
+ - endpoint names
256
+ - prop names
257
+ - function names
258
+ - config keys
259
+ - state variables
260
+
261
+ ### 4. Use `context-slices` when you want the recall-first workflow
262
+
263
+ ```
264
+ gcie context-slices . "<task>" --intent <edit|debug|refactor|explore>
265
+ ```
266
+
267
+ Optional flags: `--profile low`, `--include-tests`, `--pin <path>`, `--max-total 1200`.
268
+
269
+ ### 5. Verify before editing
270
+
271
+ GCIE should be treated as context compression, not final truth. For important
272
+ edits, verify the returned context with a targeted local search:
273
+
274
+ ```
275
+ rg -n "<key symbols>" app.py main.py frontend/src/App.jsx
276
+ ```
277
+
278
+ ## Usage Patterns That Work Best
279
+
280
+ ### Simple local tasks
281
+
282
+ Use:
283
+
284
+ ```
285
+ gcie context . "<task>" --budget auto --intent debug
286
+ ```
287
+
288
+ ### Cross-layer frontend/backend tasks
289
+
290
+ Use:
291
+
292
+ ```
293
+ gcie context . "<file-first symbol-rich query>" --budget 1200 --intent edit
294
+ ```
295
+
296
+ Why:
297
+
298
+ - the extra budget improves recall for wiring files
299
+ - file-first phrasing reduces generic entrypoint noise
300
+
301
+ ### High-recall workflows
302
+
303
+ Use:
304
+
305
+ ```
306
+ gcie context-slices . "<task>" --intent edit --pin <expected wiring file>
307
+ ```
308
+
309
+ This is still the safest mode when you already know a few must-have files.
310
+
311
+ ## Agent Workflow
312
+
313
+ For coding agents, the safest practical pattern is:
314
+
315
+ 1. Run GCIE first
316
+ 2. Check that the result includes:
317
+ - the main implementation file
318
+ - the wiring or entry file
319
+ - at least one validation or test surface when relevant
320
+ 3. If a must-have file is missing:
321
+ - rerun with a more file-first query
322
+ - increase budget to `1000` or `1200`
323
+ - or pin the missing file in `context-slices`
324
+ 4. Verify with `rg` before editing
325
+
326
+ This usually gives a much better accuracy/token tradeoff than broad manual file
327
+ reading.
328
+
329
+ ## Cache
330
+
331
+ Repo-wide context is cached to speed up repeated calls.
332
+
333
+ - `gcie cache-warm .`
334
+ - `gcie cache-status .`
335
+ - `gcie cache-clear .`
336
+
337
+ Cache file: `.gcie/cache/context_cache.json` (auto-invalidated on file changes).
338
+
339
+ ## Frontend and Non-Python Files
340
+
341
+ Repo-wide context scans common frontend and config extensions and adds file nodes so
342
+ queries can retrieve non-Python surfaces when relevant.
343
+
344
+ Default extensions include: `.js`, `.jsx`, `.ts`, `.tsx`, `.css`, `.scss`, `.html`, `.vue`,
345
+ plus `.json`, `.yaml`, `.yml`, `.toml`, `.md`, `.txt`.
346
+
347
+ ## Core Capabilities
348
+
349
+ - Repository scanning
350
+ - Graph construction (structure, call, variable, execution, git, test coverage)
351
+ - Symbolic + semantic + hybrid retrieval
352
+ - Bug localization
353
+ - Minimal LLM context building
354
+ - Architecture-aware context routing and fallback
355
+ - Agent-friendly retrieval for edit/debug/refactor workflows
356
+
357
+ ## Publish For NPX
358
+
359
+ From this repo:
360
+
361
+ ```powershell
362
+ npm login
363
+ npm publish --access public
364
+ ```
365
+
366
+ Then users can run:
367
+
368
+ ```powershell
369
+ npx @pmaddire/gcie@latest
370
+ ```
371
+
@@ -0,0 +1,98 @@
1
+ # REPO_DIGITAL_TWIN.md
2
+
3
+ GraphCode Intelligence Engine – Repository Digital Twin
4
+
5
+ ---
6
+
7
+ PURPOSE
8
+
9
+ The Digital Twin is an in-memory representation of the repository including:
10
+
11
+ * Knowledge Index
12
+ * All Graphs (Code Structure, Call, Variable Dependency, Execution Trace, Git History, Test Coverage)
13
+ * File metadata
14
+ * Function and class metadata
15
+
16
+ It allows coding agents to:
17
+
18
+ * Query repository structure without reading files
19
+ * Perform symbolic + semantic retrieval efficiently
20
+ * Trace execution paths and variable dependencies
21
+ * Perform debugging reasoning offline
22
+ * Reduce LLM token usage dramatically
23
+
24
+ ---
25
+
26
+ CONTENT
27
+
28
+ The Digital Twin stores:
29
+
30
+ 1. **Knowledge Index**
31
+
32
+ * Functions
33
+ * Classes
34
+ * Files
35
+ * Variables
36
+ * Dependencies
37
+
38
+ 2. **Graphs**
39
+
40
+ * Code Structure Graph
41
+ * Call Graph
42
+ * Variable Dependency Graph
43
+ * Execution Trace Graph
44
+ * Git History Graph
45
+ * Test Coverage Graph
46
+
47
+ 3. **Execution Metadata**
48
+
49
+ * Line coverage from tests
50
+ * Recent commit timestamps
51
+ * Function call frequency statistics
52
+
53
+ ---
54
+
55
+ USAGE IN PIPELINE
56
+
57
+ 1. When a query arrives:
58
+
59
+ a. Extract target symbols (variables/functions)
60
+ b. Query Digital Twin for candidate nodes
61
+ c. Perform graph traversal
62
+ d. Rank results with embeddings
63
+ e. Return minimal context for LLM
64
+
65
+ 2. Optional: If new files are added, incrementally update the Digital Twin.
66
+
67
+ ---
68
+
69
+ STORAGE FORMAT
70
+
71
+ Initially:
72
+
73
+ * In-memory Python objects (dicts, NetworkX graphs)
74
+
75
+ Later:
76
+
77
+ * Serialized using pickle, JSON, or SQLite for persistence
78
+
79
+ ---
80
+
81
+ BENEFITS
82
+
83
+ * Avoids repeatedly scanning files
84
+ * Reduces LLM token consumption
85
+ * Speeds up debugging queries
86
+ * Supports multi-agent workflows
87
+
88
+ ---
89
+
90
+ EXTENSIONS
91
+
92
+ * Periodically snapshot the twin to disk for long-term agent sessions
93
+ * Integrate with IDE for offline code analysis
94
+ * Cross-language twin using Tree-sitter
95
+
96
+ ---
97
+
98
+ END REPO DIGITAL TWIN