onboarding-agent 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onboarding_agent-0.1.0/.claude/settings.local.json +7 -0
- onboarding_agent-0.1.0/.gitignore +8 -0
- onboarding_agent-0.1.0/.mcp.json +8 -0
- onboarding_agent-0.1.0/.python-version +1 -0
- onboarding_agent-0.1.0/PKG-INFO +181 -0
- onboarding_agent-0.1.0/README.md +170 -0
- onboarding_agent-0.1.0/main.py +15 -0
- onboarding_agent-0.1.0/onboarding_agent/__init__.py +0 -0
- onboarding_agent-0.1.0/onboarding_agent/constants.py +30 -0
- onboarding_agent-0.1.0/onboarding_agent/helpers.py +101 -0
- onboarding_agent-0.1.0/onboarding_agent/knowledge_graph.py +139 -0
- onboarding_agent-0.1.0/onboarding_agent/prompts.py +81 -0
- onboarding_agent-0.1.0/onboarding_agent/resources.py +35 -0
- onboarding_agent-0.1.0/onboarding_agent/server.py +34 -0
- onboarding_agent-0.1.0/onboarding_agent/tools/__init__.py +0 -0
- onboarding_agent-0.1.0/onboarding_agent/tools/analysis.py +186 -0
- onboarding_agent-0.1.0/onboarding_agent/tools/git_history.py +123 -0
- onboarding_agent-0.1.0/onboarding_agent/tools/graph.py +140 -0
- onboarding_agent-0.1.0/onboarding_agent/tools/ingest.py +82 -0
- onboarding_agent-0.1.0/onboarding_agent/tools/search.py +200 -0
- onboarding_agent-0.1.0/pyproject.toml +19 -0
- onboarding_agent-0.1.0/uv.lock +677 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: onboarding-agent
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An MCP server that onboards you to any codebase — point it at a repo and ask questions like a senior engineer who knows the project inside out.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Keywords: codebase,developer-tools,knowledge-graph,mcp,onboarding
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: gitpython>=3.1.50
|
|
9
|
+
Requires-Dist: mcp>=1.27.2
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# onboarding-agent
|
|
13
|
+
|
|
14
|
+
An MCP server that onboards you to any codebase. Point it at a repo, and it builds a knowledge graph of the project — files, functions, classes, imports, and their relationships. Then ask questions like you're talking to a senior engineer who knows the project inside out.
|
|
15
|
+
|
|
16
|
+
## What it does
|
|
17
|
+
|
|
18
|
+
- **Ingests any local repo** — crawls the file tree, detects languages and frameworks
|
|
19
|
+
- **Builds a knowledge graph** — maps files, functions, classes, modules, and how they connect via imports
|
|
20
|
+
- **Answers onboarding questions** — "where does auth happen?", "what does this file do?", "who should I ask about the database?"
|
|
21
|
+
- **Analyzes git history** — finds the most-changed files (often the most important), recent activity, and contributors per file
|
|
22
|
+
- **Works 100% locally** — no API keys, no cloud, no data leaves your machine
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
Requires Python 3.12+.
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# With uv (recommended)
|
|
30
|
+
uv pip install git+https://github.com/abab754/onboarding-agent.git
|
|
31
|
+
|
|
32
|
+
# Or clone and install locally
|
|
33
|
+
git clone https://github.com/abab754/onboarding-agent.git
|
|
34
|
+
cd onboarding-agent
|
|
35
|
+
uv sync
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Connect to an MCP client
|
|
39
|
+
|
|
40
|
+
### Claude Desktop
|
|
41
|
+
|
|
42
|
+
Add to `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS; on Windows the file is `%APPDATA%\Claude\claude_desktop_config.json`):
|
|
43
|
+
|
|
44
|
+
```json
|
|
45
|
+
{
|
|
46
|
+
"mcpServers": {
|
|
47
|
+
"onboarding-agent": {
|
|
48
|
+
"command": "uv",
|
|
49
|
+
"args": ["--directory", "/path/to/onboarding-agent", "run", "main.py"]
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Claude Code
|
|
56
|
+
|
|
57
|
+
Add a `.mcp.json` file to your project root (or the repo you want to onboard to):
|
|
58
|
+
|
|
59
|
+
```json
|
|
60
|
+
{
|
|
61
|
+
"mcpServers": {
|
|
62
|
+
"onboarding-agent": {
|
|
63
|
+
"command": "uv",
|
|
64
|
+
"args": ["--directory", "/path/to/onboarding-agent", "run", "main.py"]
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Then restart Claude Code.
|
|
71
|
+
|
|
72
|
+
### Any MCP-compatible client
|
|
73
|
+
|
|
74
|
+
The server uses stdio transport. Run it with:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
uv run main.py
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Any MCP client can connect via stdin/stdout.
|
|
81
|
+
|
|
82
|
+
## Usage examples
|
|
83
|
+
|
|
84
|
+
Once connected, just talk to your AI assistant naturally:
|
|
85
|
+
|
|
86
|
+
**Full onboarding:**
|
|
87
|
+
> "Onboard me to the repo at /Users/me/projects/my-app"
|
|
88
|
+
|
|
89
|
+
**Understand a file:**
|
|
90
|
+
> "Explain what /Users/me/projects/my-app/src/auth.py does"
|
|
91
|
+
|
|
92
|
+
**Find relevant code:**
|
|
93
|
+
> "Where does database configuration happen in /Users/me/projects/my-app?"
|
|
94
|
+
|
|
95
|
+
**Check project activity:**
|
|
96
|
+
> "Who are the main contributors to /Users/me/projects/my-app and what files change the most?"
|
|
97
|
+
|
|
98
|
+
**Freeform questions:**
|
|
99
|
+
> "How is error handling done in this project?"
|
|
100
|
+
|
|
101
|
+
## Tools
|
|
102
|
+
|
|
103
|
+
| Tool | Description |
|
|
104
|
+
|---|---|
|
|
105
|
+
| `ingest_repo` | Crawl a repo and return the file tree |
|
|
106
|
+
| `read_file` | Read a file's contents with metadata |
|
|
107
|
+
| `get_overview` | High-level summary: languages, frameworks, entry points |
|
|
108
|
+
| `explain_file` | File contents + imports, functions, classes |
|
|
109
|
+
| `explain_module` | Directory overview with per-file symbols |
|
|
110
|
+
| `build_knowledge_graph` | Index the repo into a queryable knowledge graph |
|
|
111
|
+
| `query_entities` | Search the graph for files, functions, classes, modules |
|
|
112
|
+
| `query_relationships` | Find how entities connect (imports, contains) |
|
|
113
|
+
| `find_relevant_code` | Search by topic and get ranked results |
|
|
114
|
+
| `get_architecture` | Import graph, module structure, coupling analysis |
|
|
115
|
+
| `ask` | Freeform Q&A — gathers context automatically |
|
|
116
|
+
| `get_git_history` | Recent commits and contributor summary |
|
|
117
|
+
| `get_hot_files` | Most frequently changed files |
|
|
118
|
+
| `get_file_contributors` | Who has worked on a specific file |
|
|
119
|
+
|
|
120
|
+
## Resources
|
|
121
|
+
|
|
122
|
+
| URI | Description |
|
|
123
|
+
|---|---|
|
|
124
|
+
| `repo://overview` | Project summary (after a repo is loaded) |
|
|
125
|
+
| `repo://structure` | File tree |
|
|
126
|
+
| `repo://dependencies` | Import/dependency graph |
|
|
127
|
+
|
|
128
|
+
## Prompts
|
|
129
|
+
|
|
130
|
+
| Prompt | Description |
|
|
131
|
+
|---|---|
|
|
132
|
+
| `onboard` | Full onboarding walkthrough |
|
|
133
|
+
| `explain_this_file` | Deep dive into a specific file |
|
|
134
|
+
| `find_code_for` | Find code related to a topic |
|
|
135
|
+
| `ask_question` | Freeform question answering |
|
|
136
|
+
|
|
137
|
+
## How it works
|
|
138
|
+
|
|
139
|
+
1. You point the server at a repo path
|
|
140
|
+
2. It crawls the file tree, skipping noise directories (`.git`, `node_modules`, etc.)
|
|
141
|
+
3. For Python files, it extracts functions, classes, and import statements
|
|
142
|
+
4. Everything gets stored in a knowledge graph (saved to `.onboarding_agent/graph.json` in the repo)
|
|
143
|
+
5. When you ask a question, it searches the graph, reads relevant files, and bundles the context for the LLM to answer
|
|
144
|
+
|
|
145
|
+
The knowledge graph persists between sessions, so re-analysis is only needed when the code changes.
|
|
146
|
+
|
|
147
|
+
## Development
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
git clone https://github.com/abab754/onboarding-agent.git
|
|
151
|
+
cd onboarding-agent
|
|
152
|
+
uv sync
|
|
153
|
+
|
|
154
|
+
# Run the server
|
|
155
|
+
uv run main.py
|
|
156
|
+
|
|
157
|
+
# Test with MCP Inspector
|
|
158
|
+
npx @modelcontextprotocol/inspector uv run main.py
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Project structure
|
|
162
|
+
|
|
163
|
+
```
|
|
164
|
+
onboarding_agent/
|
|
165
|
+
├── server.py # FastMCP instance and global state
|
|
166
|
+
├── constants.py # Language maps, config signals, skip dirs
|
|
167
|
+
├── helpers.py # File tree building, Python symbol extraction
|
|
168
|
+
├── knowledge_graph.py # KnowledgeGraph class with JSON persistence
|
|
169
|
+
├── resources.py # MCP resources (repo://overview, etc.)
|
|
170
|
+
├── prompts.py # MCP prompt templates
|
|
171
|
+
└── tools/
|
|
172
|
+
├── ingest.py # ingest_repo, read_file
|
|
173
|
+
├── analysis.py # get_overview, explain_file, explain_module
|
|
174
|
+
├── graph.py # build_knowledge_graph, query_entities, query_relationships
|
|
175
|
+
├── search.py # find_relevant_code, get_architecture, ask
|
|
176
|
+
└── git_history.py # get_git_history, get_hot_files, get_file_contributors
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## License
|
|
180
|
+
|
|
181
|
+
MIT
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# onboarding-agent
|
|
2
|
+
|
|
3
|
+
An MCP server that onboards you to any codebase. Point it at a repo, and it builds a knowledge graph of the project — files, functions, classes, imports, and their relationships. Then ask questions like you're talking to a senior engineer who knows the project inside out.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
- **Ingests any local repo** — crawls the file tree, detects languages and frameworks
|
|
8
|
+
- **Builds a knowledge graph** — maps files, functions, classes, modules, and how they connect via imports
|
|
9
|
+
- **Answers onboarding questions** — "where does auth happen?", "what does this file do?", "who should I ask about the database?"
|
|
10
|
+
- **Analyzes git history** — finds the most-changed files (often the most important), recent activity, and contributors per file
|
|
11
|
+
- **Works 100% locally** — no API keys, no cloud, no data leaves your machine
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
Requires Python 3.12+.
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# With uv (recommended)
|
|
19
|
+
uv pip install git+https://github.com/abab754/onboarding-agent.git
|
|
20
|
+
|
|
21
|
+
# Or clone and install locally
|
|
22
|
+
git clone https://github.com/abab754/onboarding-agent.git
|
|
23
|
+
cd onboarding-agent
|
|
24
|
+
uv sync
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Connect to an MCP client
|
|
28
|
+
|
|
29
|
+
### Claude Desktop
|
|
30
|
+
|
|
31
|
+
Add to `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS; on Windows the file is `%APPDATA%\Claude\claude_desktop_config.json`):
|
|
32
|
+
|
|
33
|
+
```json
|
|
34
|
+
{
|
|
35
|
+
"mcpServers": {
|
|
36
|
+
"onboarding-agent": {
|
|
37
|
+
"command": "uv",
|
|
38
|
+
"args": ["--directory", "/path/to/onboarding-agent", "run", "main.py"]
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Claude Code
|
|
45
|
+
|
|
46
|
+
Add a `.mcp.json` file to your project root (or the repo you want to onboard to):
|
|
47
|
+
|
|
48
|
+
```json
|
|
49
|
+
{
|
|
50
|
+
"mcpServers": {
|
|
51
|
+
"onboarding-agent": {
|
|
52
|
+
"command": "uv",
|
|
53
|
+
"args": ["--directory", "/path/to/onboarding-agent", "run", "main.py"]
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Then restart Claude Code.
|
|
60
|
+
|
|
61
|
+
### Any MCP-compatible client
|
|
62
|
+
|
|
63
|
+
The server uses stdio transport. Run it with:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
uv run main.py
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Any MCP client can connect via stdin/stdout.
|
|
70
|
+
|
|
71
|
+
## Usage examples
|
|
72
|
+
|
|
73
|
+
Once connected, just talk to your AI assistant naturally:
|
|
74
|
+
|
|
75
|
+
**Full onboarding:**
|
|
76
|
+
> "Onboard me to the repo at /Users/me/projects/my-app"
|
|
77
|
+
|
|
78
|
+
**Understand a file:**
|
|
79
|
+
> "Explain what /Users/me/projects/my-app/src/auth.py does"
|
|
80
|
+
|
|
81
|
+
**Find relevant code:**
|
|
82
|
+
> "Where does database configuration happen in /Users/me/projects/my-app?"
|
|
83
|
+
|
|
84
|
+
**Check project activity:**
|
|
85
|
+
> "Who are the main contributors to /Users/me/projects/my-app and what files change the most?"
|
|
86
|
+
|
|
87
|
+
**Freeform questions:**
|
|
88
|
+
> "How is error handling done in this project?"
|
|
89
|
+
|
|
90
|
+
## Tools
|
|
91
|
+
|
|
92
|
+
| Tool | Description |
|
|
93
|
+
|---|---|
|
|
94
|
+
| `ingest_repo` | Crawl a repo and return the file tree |
|
|
95
|
+
| `read_file` | Read a file's contents with metadata |
|
|
96
|
+
| `get_overview` | High-level summary: languages, frameworks, entry points |
|
|
97
|
+
| `explain_file` | File contents + imports, functions, classes |
|
|
98
|
+
| `explain_module` | Directory overview with per-file symbols |
|
|
99
|
+
| `build_knowledge_graph` | Index the repo into a queryable knowledge graph |
|
|
100
|
+
| `query_entities` | Search the graph for files, functions, classes, modules |
|
|
101
|
+
| `query_relationships` | Find how entities connect (imports, contains) |
|
|
102
|
+
| `find_relevant_code` | Search by topic and get ranked results |
|
|
103
|
+
| `get_architecture` | Import graph, module structure, coupling analysis |
|
|
104
|
+
| `ask` | Freeform Q&A — gathers context automatically |
|
|
105
|
+
| `get_git_history` | Recent commits and contributor summary |
|
|
106
|
+
| `get_hot_files` | Most frequently changed files |
|
|
107
|
+
| `get_file_contributors` | Who has worked on a specific file |
|
|
108
|
+
|
|
109
|
+
## Resources
|
|
110
|
+
|
|
111
|
+
| URI | Description |
|
|
112
|
+
|---|---|
|
|
113
|
+
| `repo://overview` | Project summary (after a repo is loaded) |
|
|
114
|
+
| `repo://structure` | File tree |
|
|
115
|
+
| `repo://dependencies` | Import/dependency graph |
|
|
116
|
+
|
|
117
|
+
## Prompts
|
|
118
|
+
|
|
119
|
+
| Prompt | Description |
|
|
120
|
+
|---|---|
|
|
121
|
+
| `onboard` | Full onboarding walkthrough |
|
|
122
|
+
| `explain_this_file` | Deep dive into a specific file |
|
|
123
|
+
| `find_code_for` | Find code related to a topic |
|
|
124
|
+
| `ask_question` | Freeform question answering |
|
|
125
|
+
|
|
126
|
+
## How it works
|
|
127
|
+
|
|
128
|
+
1. You point the server at a repo path
|
|
129
|
+
2. It crawls the file tree, skipping noise directories (`.git`, `node_modules`, etc.)
|
|
130
|
+
3. For Python files, it extracts functions, classes, and import statements
|
|
131
|
+
4. Everything gets stored in a knowledge graph (saved to `.onboarding_agent/graph.json` in the repo)
|
|
132
|
+
5. When you ask a question, it searches the graph, reads relevant files, and bundles the context for the LLM to answer
|
|
133
|
+
|
|
134
|
+
The knowledge graph persists between sessions, so re-analysis is only needed when the code changes.
|
|
135
|
+
|
|
136
|
+
## Development
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
git clone https://github.com/abab754/onboarding-agent.git
|
|
140
|
+
cd onboarding-agent
|
|
141
|
+
uv sync
|
|
142
|
+
|
|
143
|
+
# Run the server
|
|
144
|
+
uv run main.py
|
|
145
|
+
|
|
146
|
+
# Test with MCP Inspector
|
|
147
|
+
npx @modelcontextprotocol/inspector uv run main.py
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Project structure
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
onboarding_agent/
|
|
154
|
+
├── server.py # FastMCP instance and global state
|
|
155
|
+
├── constants.py # Language maps, config signals, skip dirs
|
|
156
|
+
├── helpers.py # File tree building, Python symbol extraction
|
|
157
|
+
├── knowledge_graph.py # KnowledgeGraph class with JSON persistence
|
|
158
|
+
├── resources.py # MCP resources (repo://overview, etc.)
|
|
159
|
+
├── prompts.py # MCP prompt templates
|
|
160
|
+
└── tools/
|
|
161
|
+
├── ingest.py # ingest_repo, read_file
|
|
162
|
+
├── analysis.py # get_overview, explain_file, explain_module
|
|
163
|
+
├── graph.py # build_knowledge_graph, query_entities, query_relationships
|
|
164
|
+
├── search.py # find_relevant_code, get_architecture, ask
|
|
165
|
+
└── git_history.py # get_git_history, get_hot_files, get_file_contributors
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## License
|
|
169
|
+
|
|
170
|
+
MIT
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Entry point for the onboarding-agent MCP server."""
|
|
2
|
+
|
|
3
|
+
from onboarding_agent.server import mcp
|
|
4
|
+
|
|
5
|
+
# Import all tool/resource/prompt modules so they register with the mcp instance.
|
|
6
|
+
import onboarding_agent.tools.ingest # noqa: F401
|
|
7
|
+
import onboarding_agent.tools.analysis # noqa: F401
|
|
8
|
+
import onboarding_agent.tools.graph # noqa: F401
|
|
9
|
+
import onboarding_agent.tools.search # noqa: F401
|
|
10
|
+
import onboarding_agent.tools.git_history # noqa: F401
|
|
11
|
+
import onboarding_agent.resources # noqa: F401
|
|
12
|
+
import onboarding_agent.prompts # noqa: F401
|
|
13
|
+
|
|
14
|
+
if __name__ == "__main__":
|
|
15
|
+
mcp.run()
|
|
File without changes
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Directories we never want to crawl — these are noise, not project structure.
# ".onboarding_agent" is included because the README states the knowledge graph
# is persisted to ".onboarding_agent/graph.json" inside the analysed repo;
# without it the tool would index its own cache on every later run.
SKIP_DIRS = {
    ".git",
    ".venv",
    "venv",
    "node_modules",
    "__pycache__",
    ".tox",
    ".mypy_cache",
    ".onboarding_agent",
}

# Maps file extensions (lower-case, with the leading dot) to language names.
EXTENSION_TO_LANGUAGE = {
    ".py": "Python", ".js": "JavaScript", ".ts": "TypeScript",
    ".jsx": "JavaScript (React)", ".tsx": "TypeScript (React)",
    ".java": "Java", ".go": "Go", ".rs": "Rust", ".rb": "Ruby",
    ".cpp": "C++", ".c": "C", ".cs": "C#", ".swift": "Swift",
    ".kt": "Kotlin", ".scala": "Scala", ".php": "PHP",
    ".html": "HTML", ".css": "CSS", ".scss": "SCSS",
    ".sh": "Shell", ".sql": "SQL", ".r": "R",
}

# Config files that reveal which frameworks/tools the project uses.
# Keys are exact file names; values are the human-readable label to report.
CONFIG_SIGNALS = {
    "pyproject.toml": "Python (uv/pip)", "setup.py": "Python (setuptools)",
    "requirements.txt": "Python (pip)", "Pipfile": "Python (pipenv)",
    "package.json": "Node.js", "tsconfig.json": "TypeScript",
    "Cargo.toml": "Rust", "go.mod": "Go", "pom.xml": "Java (Maven)",
    "build.gradle": "Java (Gradle)", "Gemfile": "Ruby",
    "Makefile": "Make", "Dockerfile": "Docker",
    "docker-compose.yml": "Docker Compose", "docker-compose.yaml": "Docker Compose",
    ".eslintrc.json": "ESLint", ".prettierrc": "Prettier",
}

# File names that are treated as likely program entry points.
ENTRY_POINT_NAMES = {
    "main.py", "app.py", "manage.py", "server.py", "cli.py",
    "index.js", "index.ts", "main.go", "main.rs", "App.java",
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Shared helper functions used across multiple tools."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from onboarding_agent.constants import SKIP_DIRS, EXTENSION_TO_LANGUAGE, CONFIG_SIGNALS, ENTRY_POINT_NAMES
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_file_tree(root: Path, max_depth: int = 5) -> dict:
    """Recursively walk a directory and return a nested dict representing the file tree.

    Each directory is a dict with "type": "directory" and "children": {...}.
    Each file is a dict with "type": "file" and "size": <bytes>.

    Args:
        root: Directory to list.
        max_depth: How many further directory levels to descend. When it
            reaches 0, sub-directories are still listed but their "children"
            value is the string "...truncated" instead of a dict.

    Returns:
        The nested tree, or {"error": "permission denied"} when *root* itself
        cannot be listed. Entries whose name is in SKIP_DIRS are omitted, as
        are entries that are neither a regular file nor a directory.
    """
    try:
        # Directories sort before files (False < True); alphabetical within each group.
        entries = sorted(root.iterdir(), key=lambda e: (e.is_file(), e.name))
    except PermissionError:
        return {"error": "permission denied"}

    tree: dict = {}
    for entry in entries:
        if entry.name in SKIP_DIRS:
            continue

        if entry.is_dir():
            if max_depth <= 0:
                tree[entry.name] = {"type": "directory", "children": "...truncated"}
            else:
                tree[entry.name] = {
                    "type": "directory",
                    "children": build_file_tree(entry, max_depth - 1),
                }
        elif entry.is_file():
            try:
                size = entry.stat().st_size
            except OSError:
                # The file vanished or became unreadable between the listing
                # and the stat call; drop this one entry rather than aborting
                # the whole crawl.
                continue
            tree[entry.name] = {"type": "file", "size": size}

    return tree
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def extract_python_symbols(content: str) -> dict:
    """Extract imports, function names, and class names from Python source code.

    Uses simple line-based parsing rather than AST — faster, works on files
    with syntax errors, and good enough for an overview. Because it is
    line-based, any line that merely *starts* with one of the keywords (for
    example inside a docstring) is reported as well.

    Args:
        content: Full text of a Python source file.

    Returns:
        A dict with three lists, in order of appearance:
        "imports" (the stripped import statements), "functions" (names of
        ``def`` and ``async def`` definitions, methods included) and
        "classes" (class names).
    """
    imports: list[str] = []
    functions: list[str] = []
    classes: list[str] = []

    for line in content.splitlines():
        stripped = line.strip()
        if stripped.startswith(("import ", "from ")):
            imports.append(stripped)
        elif stripped.startswith(("def ", "async def ")):
            # Everything after the first "def " is the signature; this covers
            # both the plain and the "async def" spelling.
            signature = stripped.split("def ", 1)[1]
            # Cut at "(" and at "[" so PEP 695 type parameters
            # (``def first[T](...)``) do not leak into the name.
            functions.append(signature.split("(")[0].split("[")[0].strip())
        elif stripped.startswith("class "):
            header = stripped[len("class "):]
            # A class header may end the name with "(", ":" or "[" (type parameters).
            classes.append(header.split("(")[0].split(":")[0].split("[")[0].strip())

    return {"imports": imports, "functions": functions, "classes": classes}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def collect_extensions(tree: dict) -> dict[str, int]:
    """Count how often each file extension occurs anywhere in a file tree.

    Args:
        tree: Mapping of entry name to node dict. Nodes with "type": "file"
            are counted; nodes with "type": "directory" are descended into
            when their "children" value is itself a dict.

    Returns:
        Mapping of lower-cased extension (leading dot included) to the number
        of files carrying it. Files without an extension are not counted, and
        values that are not dicts are ignored.
    """
    tally: dict[str, int] = {}

    for entry_name, node in tree.items():
        if not isinstance(node, dict):
            continue

        kind = node.get("type")

        if kind == "file":
            suffix = Path(entry_name).suffix.lower()
            if suffix:
                tally[suffix] = tally.get(suffix, 0) + 1
            continue

        if kind != "directory":
            continue

        subtree = node.get("children")
        if not isinstance(subtree, dict):
            # A truncated directory stores a placeholder string here.
            continue

        # Fold the sub-directory's counts into the running totals.
        for suffix, count in collect_extensions(subtree).items():
            tally[suffix] = tally.get(suffix, 0) + count

    return tally
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def find_config_files(tree: dict) -> list[str]:
    """List the top-level entries of *tree* whose names are known config files.

    Only the keys of *tree* itself are inspected (no recursion), and a name
    counts as known when it is a key of CONFIG_SIGNALS. Order follows the
    iteration order of *tree*.
    """
    matches: list[str] = []
    for entry_name in tree:
        if entry_name in CONFIG_SIGNALS:
            matches.append(entry_name)
    return matches
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def find_entry_points(tree: dict, root_path: str) -> list[str]:
    """Collect the paths of likely entry-point files anywhere in a file tree.

    Args:
        tree: Mapping of entry name to node dict, as nested "children" dicts.
        root_path: Path prefix for the entries of *tree*; it is joined to
            each name with "/".

    Returns:
        "<root_path>/<name>" for every file whose name is in
        ENTRY_POINT_NAMES, in traversal order. Directories whose "children"
        value is not a dict are not descended into.
    """
    hits: list[str] = []

    for entry_name, node in tree.items():
        if not isinstance(node, dict):
            continue

        kind = node.get("type")
        full_path = f"{root_path}/{entry_name}"

        if kind == "file" and entry_name in ENTRY_POINT_NAMES:
            hits.append(full_path)
        elif kind == "directory":
            subtree = node.get("children")
            if isinstance(subtree, dict):
                hits += find_entry_points(subtree, full_path)

    return hits
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""A simple file-backed knowledge graph for storing codebase entities and relationships.
|
|
2
|
+
|
|
3
|
+
Entities are things like files, functions, classes, and modules.
|
|
4
|
+
Relationships connect them: "file contains function", "file imports module", etc.
|
|
5
|
+
|
|
6
|
+
The graph persists to a JSON file so it survives server restarts.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class KnowledgeGraph:
|
|
14
|
+
"""In-memory knowledge graph with JSON file persistence.
|
|
15
|
+
|
|
16
|
+
Each entity has:
|
|
17
|
+
- id: unique identifier (e.g., file path or "filepath::function_name")
|
|
18
|
+
- type: "file", "function", "class", "module"
|
|
19
|
+
- name: human-readable name
|
|
20
|
+
- metadata: dict of extra info (language, size, etc.)
|
|
21
|
+
|
|
22
|
+
Each relationship has:
|
|
23
|
+
- source: entity id
|
|
24
|
+
- target: entity id
|
|
25
|
+
- type: "contains", "imports", "calls", etc.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(self, storage_path: str | None = None):
|
|
29
|
+
self.entities: dict[str, dict] = {}
|
|
30
|
+
self.relationships: list[dict] = []
|
|
31
|
+
self.storage_path = Path(storage_path) if storage_path else None
|
|
32
|
+
|
|
33
|
+
if self.storage_path and self.storage_path.exists():
|
|
34
|
+
self._load()
|
|
35
|
+
|
|
36
|
+
def add_entity(self, entity_id: str, entity_type: str, name: str, metadata: dict | None = None) -> None:
|
|
37
|
+
"""Add or update an entity in the graph."""
|
|
38
|
+
self.entities[entity_id] = {
|
|
39
|
+
"id": entity_id,
|
|
40
|
+
"type": entity_type,
|
|
41
|
+
"name": name,
|
|
42
|
+
"metadata": metadata or {},
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
def add_relationship(self, source: str, target: str, rel_type: str) -> None:
|
|
46
|
+
"""Add a relationship between two entities. Skips duplicates."""
|
|
47
|
+
rel = {"source": source, "target": target, "type": rel_type}
|
|
48
|
+
if rel not in self.relationships:
|
|
49
|
+
self.relationships.append(rel)
|
|
50
|
+
|
|
51
|
+
def get_entity(self, entity_id: str) -> dict | None:
|
|
52
|
+
"""Look up a single entity by id."""
|
|
53
|
+
return self.entities.get(entity_id)
|
|
54
|
+
|
|
55
|
+
def find_entities(self, entity_type: str | None = None, name_contains: str | None = None) -> list[dict]:
|
|
56
|
+
"""Search entities by type and/or name substring."""
|
|
57
|
+
results = list(self.entities.values())
|
|
58
|
+
if entity_type:
|
|
59
|
+
results = [e for e in results if e["type"] == entity_type]
|
|
60
|
+
if name_contains:
|
|
61
|
+
query = name_contains.lower()
|
|
62
|
+
results = [e for e in results if query in e["name"].lower()]
|
|
63
|
+
return results
|
|
64
|
+
|
|
65
|
+
def find_relationships(
|
|
66
|
+
self,
|
|
67
|
+
source: str | None = None,
|
|
68
|
+
target: str | None = None,
|
|
69
|
+
rel_type: str | None = None,
|
|
70
|
+
) -> list[dict]:
|
|
71
|
+
"""Search relationships by source, target, and/or type."""
|
|
72
|
+
results = self.relationships
|
|
73
|
+
if source:
|
|
74
|
+
results = [r for r in results if r["source"] == source]
|
|
75
|
+
if target:
|
|
76
|
+
results = [r for r in results if r["target"] == target]
|
|
77
|
+
if rel_type:
|
|
78
|
+
results = [r for r in results if r["type"] == rel_type]
|
|
79
|
+
return results
|
|
80
|
+
|
|
81
|
+
def get_neighbors(self, entity_id: str) -> dict:
|
|
82
|
+
"""Find all entities directly connected to the given entity."""
|
|
83
|
+
outgoing = self.find_relationships(source=entity_id)
|
|
84
|
+
incoming = self.find_relationships(target=entity_id)
|
|
85
|
+
|
|
86
|
+
connected_ids = set()
|
|
87
|
+
for r in outgoing:
|
|
88
|
+
connected_ids.add(r["target"])
|
|
89
|
+
for r in incoming:
|
|
90
|
+
connected_ids.add(r["source"])
|
|
91
|
+
|
|
92
|
+
return {
|
|
93
|
+
"entity": self.get_entity(entity_id),
|
|
94
|
+
"outgoing": outgoing,
|
|
95
|
+
"incoming": incoming,
|
|
96
|
+
"neighbors": [self.entities[eid] for eid in connected_ids if eid in self.entities],
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
def clear(self) -> None:
|
|
100
|
+
"""Wipe the graph."""
|
|
101
|
+
self.entities.clear()
|
|
102
|
+
self.relationships.clear()
|
|
103
|
+
|
|
104
|
+
def save(self) -> None:
|
|
105
|
+
"""Persist the graph to disk as JSON."""
|
|
106
|
+
if not self.storage_path:
|
|
107
|
+
return
|
|
108
|
+
self.storage_path.parent.mkdir(parents=True, exist_ok=True)
|
|
109
|
+
data = {"entities": self.entities, "relationships": self.relationships}
|
|
110
|
+
self.storage_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
|
111
|
+
|
|
112
|
+
def _load(self) -> None:
|
|
113
|
+
"""Load graph from disk."""
|
|
114
|
+
try:
|
|
115
|
+
data = json.loads(self.storage_path.read_text(encoding="utf-8"))
|
|
116
|
+
self.entities = data.get("entities", {})
|
|
117
|
+
self.relationships = data.get("relationships", [])
|
|
118
|
+
except (json.JSONDecodeError, KeyError):
|
|
119
|
+
self.entities = {}
|
|
120
|
+
self.relationships = []
|
|
121
|
+
|
|
122
|
+
def stats(self) -> dict:
|
|
123
|
+
"""Return summary stats about the graph."""
|
|
124
|
+
type_counts: dict[str, int] = {}
|
|
125
|
+
for e in self.entities.values():
|
|
126
|
+
t = e["type"]
|
|
127
|
+
type_counts[t] = type_counts.get(t, 0) + 1
|
|
128
|
+
|
|
129
|
+
rel_type_counts: dict[str, int] = {}
|
|
130
|
+
for r in self.relationships:
|
|
131
|
+
t = r["type"]
|
|
132
|
+
rel_type_counts[t] = rel_type_counts.get(t, 0) + 1
|
|
133
|
+
|
|
134
|
+
return {
|
|
135
|
+
"total_entities": len(self.entities),
|
|
136
|
+
"total_relationships": len(self.relationships),
|
|
137
|
+
"entities_by_type": type_counts,
|
|
138
|
+
"relationships_by_type": rel_type_counts,
|
|
139
|
+
}
|