@pmaddire/gcie 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/AGENT.md +256 -0
  2. package/AGENT_USAGE.md +231 -0
  3. package/ARCHITECTURE.md +151 -0
  4. package/CLAUDE.md +69 -0
  5. package/DEBUGGING_PLAYBOOK.md +160 -0
  6. package/KNOWLEDGE_INDEX.md +154 -0
  7. package/POTENTIAL_UPDATES +130 -0
  8. package/PROJECT.md +141 -0
  9. package/README.md +371 -0
  10. package/REPO_DIGITAL_TWIN.md +98 -0
  11. package/ROADMAP.md +301 -0
  12. package/SETUP_ANY_REPO.md +85 -0
  13. package/bin/gcie-init.js +20 -0
  14. package/bin/gcie.js +45 -0
  15. package/cli/__init__.py +1 -0
  16. package/cli/app.py +163 -0
  17. package/cli/commands/__init__.py +1 -0
  18. package/cli/commands/cache.py +35 -0
  19. package/cli/commands/context.py +2426 -0
  20. package/cli/commands/context_slices.py +617 -0
  21. package/cli/commands/debug.py +24 -0
  22. package/cli/commands/index.py +17 -0
  23. package/cli/commands/query.py +20 -0
  24. package/cli/commands/setup.py +73 -0
  25. package/config/__init__.py +1 -0
  26. package/config/scanner_config.py +82 -0
  27. package/context/__init__.py +1 -0
  28. package/context/architecture_bootstrap.py +170 -0
  29. package/context/architecture_index.py +185 -0
  30. package/context/architecture_parser.py +170 -0
  31. package/context/architecture_slicer.py +308 -0
  32. package/context/context_router.py +70 -0
  33. package/context/fallback_evaluator.py +21 -0
  34. package/coverage_integration/__init__.py +1 -0
  35. package/coverage_integration/coverage_loader.py +55 -0
  36. package/debugging/__init__.py +12 -0
  37. package/debugging/bug_localizer.py +81 -0
  38. package/debugging/execution_path_analyzer.py +42 -0
  39. package/embeddings/__init__.py +6 -0
  40. package/embeddings/encoder.py +45 -0
  41. package/embeddings/faiss_index.py +72 -0
  42. package/git_integration/__init__.py +1 -0
  43. package/git_integration/git_miner.py +78 -0
  44. package/graphs/__init__.py +17 -0
  45. package/graphs/call_graph.py +70 -0
  46. package/graphs/code_graph.py +81 -0
  47. package/graphs/execution_graph.py +35 -0
  48. package/graphs/git_graph.py +43 -0
  49. package/graphs/graph_store.py +25 -0
  50. package/graphs/node_factory.py +21 -0
  51. package/graphs/test_graph.py +65 -0
  52. package/graphs/validators.py +28 -0
  53. package/graphs/variable_graph.py +51 -0
  54. package/knowledge_index/__init__.py +1 -0
  55. package/knowledge_index/index_builder.py +60 -0
  56. package/knowledge_index/models.py +35 -0
  57. package/knowledge_index/query_api.py +38 -0
  58. package/knowledge_index/store.py +23 -0
  59. package/llm_context/__init__.py +6 -0
  60. package/llm_context/context_builder.py +67 -0
  61. package/llm_context/snippet_selector.py +57 -0
  62. package/package.json +14 -0
  63. package/parser/__init__.py +18 -0
  64. package/parser/ast_parser.py +216 -0
  65. package/parser/call_resolver.py +52 -0
  66. package/parser/models.py +75 -0
  67. package/parser/tree_sitter_adapter.py +56 -0
  68. package/parser/variable_extractor.py +31 -0
  69. package/retrieval/__init__.py +17 -0
  70. package/retrieval/cache.py +22 -0
  71. package/retrieval/hybrid_retriever.py +249 -0
  72. package/retrieval/query_parser.py +38 -0
  73. package/retrieval/ranking.py +43 -0
  74. package/retrieval/semantic_retriever.py +39 -0
  75. package/retrieval/symbolic_retriever.py +80 -0
  76. package/scanner/__init__.py +5 -0
  77. package/scanner/file_filters.py +37 -0
  78. package/scanner/models.py +44 -0
  79. package/scanner/repository_scanner.py +55 -0
  80. package/scripts/bootstrap_from_github.ps1 +41 -0
  81. package/tracing/__init__.py +1 -0
  82. package/tracing/runtime_tracer.py +60 -0
@@ -0,0 +1,160 @@
1
+ # DEBUGGING_PLAYBOOK.md
2
+
3
+ GraphCode Intelligence Engine Debugging Playbook
4
+
5
+ This document describes how agents should debug problems using the GCIE graph system.
6
+
7
+ Agents must follow this structured debugging workflow instead of guessing.
8
+
9
+ ---
10
+
11
+ DEBUGGING PRINCIPLES
12
+
13
+ Always prioritize structural analysis before semantic guessing.
14
+
15
+ Use graph queries whenever possible.
16
+
17
+ Only use LLM reasoning after symbolic analysis.
18
+
19
+ ---
20
+
21
+ DEBUGGING WORKFLOW
22
+
23
+ When debugging a problem, follow this procedure.
24
+
25
+ Step 1 — Identify Target Symbols
26
+
27
+ Extract relevant symbols from the query.
28
+
29
+ Examples:
30
+
31
+ variable names
32
+ function names
33
+ file names
34
+ modules
35
+
36
+ Example query:
37
+
38
+ "Why is variable diff exploding?"
39
+
40
+ Target symbol:
41
+
42
+ diff
43
+
44
+ ---
45
+
46
+ Step 2 — Query Knowledge Index
47
+
48
+ Use the knowledge index to find:
49
+
50
+ functions that modify the variable
51
+ functions that read the variable
52
+
53
+ Example query:
54
+
55
+ find functions writing variable diff
56
+
57
+ ---
58
+
59
+ Step 3 — Trace Variable Dependencies
60
+
61
+ Use the variable dependency graph.
62
+
63
+ Identify:
64
+
65
+ where the variable is modified
66
+ which functions depend on it
67
+
68
+ ---
69
+
70
+ Step 4 — Trace Call Graph
71
+
72
+ From functions modifying the variable:
73
+
74
+ trace upstream callers
75
+
76
+ trace downstream calls
77
+
78
+ This reveals execution paths that influence the variable.
79
+
80
+ ---
81
+
82
+ Step 5 — Analyze Execution Paths
83
+
84
+ Use the execution trace graph when available.
85
+
86
+ Identify the runtime path leading to the issue.
87
+
88
+ ---
89
+
90
+ Step 6 — Prioritize Suspicious Code
91
+
92
+ Rank candidate functions using:
93
+
94
+ recent git commits
95
+
96
+ low test coverage
97
+
98
+ high code complexity
99
+
100
+ ---
101
+
102
+ Step 7 — Build Minimal Debugging Context
103
+
104
+ Return only the relevant functions:
105
+
106
+ the function modifying the variable
107
+
108
+ its callers
109
+
110
+ any functions it calls
111
+
112
+ ---
113
+
114
+ EXAMPLE DEBUGGING FLOW
115
+
116
+ Query:
117
+
118
+ "Why is diff exploding?"
119
+
120
+ Process:
121
+
122
+ extract symbol → diff
123
+
124
+ query knowledge index → functions modifying diff
125
+
126
+ trace variable graph → compute_diff()
127
+
128
+ trace call graph → update_state() → run_simulation()
129
+
130
+ retrieve minimal code context
131
+
132
+ ---
133
+
134
+ DEBUGGING HEURISTICS
135
+
136
+ Prefer code that:
137
+
138
+ modifies the target variable
139
+
140
+ was recently changed in git
141
+
142
+ has low test coverage
143
+
144
+ appears frequently in execution traces
145
+
146
+ ---
147
+
148
+ DEBUGGING OUTPUT FORMAT
149
+
150
+ When returning debugging results, include:
151
+
152
+ relevant functions
153
+
154
+ call chain
155
+
156
+ variable modifications
157
+
158
+ ---
159
+
160
+ END DEBUGGING PLAYBOOK
@@ -0,0 +1,154 @@
1
+ # KNOWLEDGE_INDEX.md
2
+
3
+ Codebase Knowledge Index Specification
4
+
5
+ ---
6
+
7
+ PURPOSE
8
+
9
+ The Knowledge Index is a structured metadata representation of the repository.
10
+
11
+ It enables fast code queries without requiring an LLM.
12
+
13
+ The index works alongside the graph system.
14
+
15
+ ---
16
+
17
+ INDEX CONTENT
18
+
19
+ The index stores structured metadata for:
20
+
21
+ files
22
+ classes
23
+ functions
24
+ variables
25
+ imports
26
+ dependencies
27
+
28
+ ---
29
+
30
+ FUNCTION ENTRY FORMAT
31
+
32
+ Each function entry should contain:
33
+
34
+ name
35
+ file
36
+ start_line
37
+ end_line
38
+ parameters
39
+ variables_read
40
+ variables_written
41
+ functions_called
42
+ docstring
43
+
44
+ Example:
45
+
46
+ FunctionEntry
47
+
48
+ name: compute_diff
49
+ file: slam/update.py
50
+ start_line: 42
51
+ end_line: 68
52
+ parameters: state, prediction
53
+ variables_read: state
54
+ variables_written: diff
55
+ functions_called: normalize, clip
56
+
57
+ ---
58
+
59
+ CLASS ENTRY FORMAT
60
+
61
+ Each class entry should contain:
62
+
63
+ class name
64
+ file
65
+ methods
66
+ attributes
67
+ base classes
68
+
69
+ ---
70
+
71
+ FILE ENTRY FORMAT
72
+
73
+ Each file entry should contain:
74
+
75
+ file path
76
+ imports
77
+ classes defined
78
+ functions defined
79
+
80
+ ---
81
+
82
+ INDEX STORAGE
83
+
84
+ Initial implementation should store the index in memory.
85
+
86
+ Later versions may persist the index using:
87
+
88
+ JSON
89
+ SQLite
90
+ or a graph database.
91
+
92
+ ---
93
+
94
+ QUERY TYPES
95
+
96
+ The Knowledge Index must support queries such as:
97
+
98
+ find functions modifying variable
99
+
100
+ find functions calling function
101
+
102
+ find files importing module
103
+
104
+ find classes inheriting from class
105
+
106
+ ---
107
+
108
+ INDEX USAGE IN RETRIEVAL PIPELINE
109
+
110
+ Query
111
+
112
+ Extract symbol
113
+
114
+ Search knowledge index
115
+
116
+ Retrieve candidate nodes
117
+
118
+ Graph traversal
119
+
120
+ Semantic ranking
121
+
122
+ ---
123
+
124
+ ADVANTAGES
125
+
126
+ The Knowledge Index allows many queries to be answered without LLM usage.
127
+
128
+ Examples:
129
+
130
+ Where is variable diff modified?
131
+
132
+ Which functions call compute_diff?
133
+
134
+ Which modules depend on slam.update?
135
+
136
+ These queries can be answered directly using the index.
137
+
138
+ ---
139
+
140
+ FUTURE EXTENSIONS
141
+
142
+ Possible improvements:
143
+
144
+ cross-language support
145
+
146
+ dependency metrics
147
+
148
+ code complexity scoring
149
+
150
+ architecture summaries
151
+
152
+ ---
153
+
154
+ END KNOWLEDGE INDEX SPEC
@@ -0,0 +1,130 @@
1
+ # Potential Updates
2
+
3
+ ## Context
4
+
5
+ Current state (from latest validation):
6
+
7
+ - Hybrid workflow can reach very high recall when staged.
8
+ - One-shot/root retrieval is still brittle in mixed-layer repos.
9
+ - The best near-term target is improving efficiency without giving up coverage.
10
+
11
+ ## Candidate Updates
12
+
13
+ ### 1. Pivot + Skeleton Packaging
14
+
15
+ What it is:
16
+ - Keep full content for pivot files (must-have files).
17
+ - Include only compact skeletons/signatures for adjacent support files.
18
+
19
+ Why it helps:
20
+ - Preserves correctness path while reducing token load from neighboring files.
21
+ - Avoids paying full-file cost for files used only as context bridges.
22
+
23
+ Risk:
24
+ - Low to medium. Requires careful control so skeleton files are never used where full content is required.
25
+
26
+ Expected impact:
27
+ - High efficiency gain with low recall risk.
28
+
29
+ ---
30
+
31
+ ### 2. Path-Local First Expansion (Root Queries)
32
+
33
+ What it is:
34
+ - If the query names explicit files, rank same-subtree candidates first.
35
+ - Expand globally only when must-have coverage is still incomplete.
36
+
37
+ Why it helps:
38
+ - Reduces root-scope noise from generic/support files.
39
+ - Aligns ranking to the strongest user-provided signal (explicit paths).
40
+
41
+ Risk:
42
+ - Medium. Needs guardrails for true cross-layer tasks.
43
+
44
+ Expected impact:
45
+ - High recall gain on root queries; moderate efficiency gain.
46
+
47
+ ---
48
+
49
+ ### 3. Dynamic Threshold Relaxation
50
+
51
+ What it is:
52
+ - Start retrieval strict.
53
+ - Relax thresholds only if hit count/coverage is too low.
54
+
55
+ Why it helps:
56
+ - Keeps compact context on easy tasks.
57
+ - Avoids over-expanding by default.
58
+
59
+ Risk:
60
+ - Medium. Can oscillate if thresholds are not stable.
61
+
62
+ Expected impact:
63
+ - Moderate efficiency gain; moderate stability gain.
64
+
65
+ ---
66
+
67
+ ### 4. Intent Pipeline Preset (Single Orchestration Mode)
68
+
69
+ What it is:
70
+ - One orchestrated mode that chooses strategy by intent and task shape.
71
+
72
+ Why it helps:
73
+ - Better UX and fewer manual tuning steps.
74
+
75
+ Risk:
76
+ - Medium to high. More moving parts and easier to regress.
77
+
78
+ Expected impact:
79
+ - High UX gain; uncertain near-term accuracy/efficiency gain.
80
+
81
+ ---
82
+
83
+ ### 5. Memory Budget Cap
84
+
85
+ What it is:
86
+ - Hard cap on memory/history context slice (for example 10%).
87
+
88
+ Why it helps:
89
+ - Prevents historical context from displacing code context.
90
+
91
+ Risk:
92
+ - Low.
93
+
94
+ Expected impact:
95
+ - Small to moderate efficiency gain.
96
+
97
+ ## Recommendation: Add Now
98
+
99
+ ### Add #1 first: Pivot + Skeleton Packaging
100
+
101
+ This is the highest confidence next step right now.
102
+
103
+ Why this makes the most sense immediately:
104
+
105
+ 1. It directly targets the current optimization goal: better efficiency without recall loss.
106
+ 2. It is incremental and low-risk compared with a broader orchestration rewrite.
107
+ 3. It complements the current hybrid workflow (which already identifies must-have pivots).
108
+ 4. It gives measurable gains quickly with a clear A/B test:
109
+ - same must-have coverage
110
+ - lower average tokens
111
+
112
+ ## Suggested Implementation Order
113
+
114
+ 1. Add file role marker in context packaging:
115
+ - `pivot` vs `adjacent_support`.
116
+ 2. For `adjacent_support`, package signatures/symbol blocks first.
117
+ 3. Keep full content for:
118
+ - explicit targets
119
+ - chain middle files
120
+ - files selected by must-have gate.
121
+ 4. Run benchmark comparison:
122
+ - hybrid current vs hybrid + skeleton packaging
123
+ - compare average tokens, savings, accuracy, full-hit rate.
124
+
125
+ ## Success Criteria
126
+
127
+ - No drop in full-hit rate.
128
+ - No drop in average accuracy.
129
+ - Average tokens reduced versus current hybrid workflow baseline.
130
+
package/PROJECT.md ADDED
@@ -0,0 +1,141 @@
1
+ # PROJECT.md
2
+
3
+ ## Project Name
4
+
5
+ GraphCode Intelligence Engine (GCIE)
6
+
7
+ ## System Purpose
8
+
9
+ GCIE is a graph-first code intelligence engine that minimizes LLM context size for large repositories.
10
+ It answers developer queries by retrieving only execution-relevant symbols and code snippets instead of full files.
11
+ Primary success target: produce minimal, high-signal debugging context (for example, tracing why `diff` explodes).
12
+
13
+ ## Architecture Overview
14
+
15
+ GCIE follows a staged pipeline:
16
+
17
+ 1. Repository scanning discovers source files, tests, and metadata.
18
+ 2. Parsing extracts symbols and relationships from source code.
19
+ 3. Graph builders construct specialized graphs.
20
+ 4. Knowledge index stores normalized symbol metadata for fast lookup.
21
+ 5. Retrieval engine performs symbolic traversal first, then semantic ranking.
22
+ 6. Context builder packages a minimal, ordered prompt payload.
23
+ 7. CLI exposes indexing, querying, debugging, and diagnostics workflows.
24
+
25
+ ## Core Subsystems
26
+
27
+ 1. `scanner/`
28
+ Repository scanning, language filtering, include/exclude rules.
29
+
30
+ 2. `parser/`
31
+ AST and Tree-sitter parsing, symbol extraction, normalized intermediate representation.
32
+
33
+ 3. `graphs/`
34
+ Specialized graph construction and unified graph merge using NetworkX.
35
+
36
+ 4. `knowledge_index/`
37
+ In-memory index for files/classes/functions/variables/imports/dependencies with query APIs.
38
+
39
+ 5. `retrieval/`
40
+ Symbolic retriever, semantic retriever, hybrid ranking and candidate consolidation.
41
+
42
+ 6. `embeddings/`
43
+ SentenceTransformers embedding generation and FAISS vector indexing.
44
+
45
+ 7. `debugging/`
46
+ Bug localization and execution-path analysis workflows.
47
+
48
+ 8. `llm_context/`
49
+ Minimal snippet extraction, ordering, deduplication, and context packaging.
50
+
51
+ 9. `cli/`
52
+ Typer-based commands for index/build/query/debug/report operations.
53
+
54
+ 10. `tests/`
55
+ Unit, integration, coverage, and retrieval quality validation.
56
+
57
+ ## Graph Models
58
+
59
+ GCIE maintains these graph models and composes them into a unified knowledge graph:
60
+
61
+ 1. Code Structure Graph
62
+ Nodes: files, modules, classes, functions.
63
+ Edges: `DEFINES`, `CONTAINS`, `IMPORTS`.
64
+
65
+ 2. Call Graph
66
+ Nodes: callable symbols.
67
+ Edges: `CALLS`.
68
+
69
+ 3. Variable Dependency Graph
70
+ Nodes: variables, functions, assignments.
71
+ Edges: `READS`, `WRITES`, `MODIFIES`.
72
+
73
+ 4. Execution Trace Graph
74
+ Nodes: runtime function frames/events.
75
+ Edges: `EXECUTES`, `RETURNS`, temporal path edges.
76
+
77
+ 5. Git History Graph
78
+ Nodes: commits, files, symbols.
79
+ Edges: `CHANGED_IN`, `TOUCHES`.
80
+
81
+ 6. Test Coverage Graph
82
+ Nodes: tests, functions, files.
83
+ Edges: `COVERED_BY`, `ASSERTS_ON`.
84
+
85
+ 7. Unified Knowledge Graph
86
+ Merged, queryable graph used by retrieval and bug localization.
87
+
88
+ ## Retrieval Pipeline
89
+
90
+ 1. Query ingestion
91
+ Normalize query text and identify query intent (`debug`, `trace`, `dependency`, `explain`).
92
+
93
+ 2. Symbol extraction
94
+ Extract candidate symbols (variable/function/class/file/module names).
95
+
96
+ 3. Knowledge index lookup
97
+ Resolve exact/fuzzy symbol matches and seed candidate nodes.
98
+
99
+ 4. Symbolic graph traversal
100
+ Traverse variable/call/structure/trace graphs for execution-relevant neighborhood.
101
+
102
+ 5. Semantic retrieval
103
+ Use embeddings + FAISS to rank semantically similar snippets among symbolic candidates.
104
+
105
+ 6. Hybrid ranking
106
+ Combine symbolic distance, semantic score, git recency, and test coverage risk weighting.
107
+
108
+ 7. Minimal context assembly
109
+ Return only necessary snippets: symbol definition, writes/modifies points, callers/callees, and trace path.
110
+
111
+ 8. Output formatting
112
+ Provide structured debugging payload: relevant functions, call chain, variable modifications, evidence scores.
113
+
114
+ ## GSD Workflow Contract
115
+
116
+ 1. Plan before implementation.
117
+ 2. Execute in phases with atomic tasks.
118
+ 3. Verify each phase before advancing.
119
+ 4. Keep artifacts updated (`PROJECT.md`, `ROADMAP.md`, and phase plans).
120
+ 5. Do not implement large features in a single step.
121
+
122
+ ## Initial Constraints
123
+
124
+ 1. Language/runtime: Python 3.11+.
125
+ 2. Parsers: `ast` first, with Tree-sitter as the extension path.
126
+ 3. Graph engine: NetworkX.
127
+ 4. Semantic layer: SentenceTransformers + FAISS.
128
+ 5. Git analysis: GitPython.
129
+ 6. Execution tracing: `sys.settrace`.
130
+ 7. Coverage: Coverage.py.
131
+ 8. CLI: Typer.
132
+ 9. Retrieval principle: symbolic first, semantic second.
133
+
134
+ ## Success Criteria
135
+
136
+ 1. Repository indexing works across project files.
137
+ 2. Core graph suite is built and queryable.
138
+ 3. Symbolic + semantic hybrid retrieval returns ranked candidates.
139
+ 4. Debug query returns compact execution-relevant context only.
140
+ 5. End-to-end token usage is materially reduced versus full-file prompting.
141
+