codereview-local 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codereview_local-0.1.0/LICENSE +21 -0
- codereview_local-0.1.0/PKG-INFO +224 -0
- codereview_local-0.1.0/README.md +191 -0
- codereview_local-0.1.0/codereview/__init__.py +0 -0
- codereview_local-0.1.0/codereview/chunker.py +34 -0
- codereview_local-0.1.0/codereview/cli.py +88 -0
- codereview_local-0.1.0/codereview/embedder.py +23 -0
- codereview_local-0.1.0/codereview/retriever.py +29 -0
- codereview_local-0.1.0/codereview/reviewer.py +56 -0
- codereview_local-0.1.0/codereview_local.egg-info/PKG-INFO +224 -0
- codereview_local-0.1.0/codereview_local.egg-info/SOURCES.txt +15 -0
- codereview_local-0.1.0/codereview_local.egg-info/dependency_links.txt +1 -0
- codereview_local-0.1.0/codereview_local.egg-info/entry_points.txt +2 -0
- codereview_local-0.1.0/codereview_local.egg-info/requires.txt +7 -0
- codereview_local-0.1.0/codereview_local.egg-info/top_level.txt +1 -0
- codereview_local-0.1.0/setup.cfg +4 -0
- codereview_local-0.1.0/setup.py +35 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Muhammad-NSQ
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codereview-local
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local RAG-based code review CLI. No API keys. Runs fully on your machine.
|
|
5
|
+
Home-page: https://github.com/Muhammad-NSQ/codereview
|
|
6
|
+
Author: Muhammad
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
12
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: typer
|
|
17
|
+
Requires-Dist: chromadb
|
|
18
|
+
Requires-Dist: sentence-transformers
|
|
19
|
+
Requires-Dist: tree-sitter
|
|
20
|
+
Requires-Dist: tree-sitter-python
|
|
21
|
+
Requires-Dist: requests
|
|
22
|
+
Requires-Dist: torch
|
|
23
|
+
Dynamic: author
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+
# codereview
|
|
35
|
+
|
|
36
|
+
A local, privacy-first code review CLI tool powered by RAG and a local LLM. No API keys. No data leaves your machine.
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install codereview-local
|
|
40
|
+
codereview your_file.py
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## How it works
|
|
46
|
+
|
|
47
|
+
Most code review tools send your code to a remote API. This one runs entirely on your machine.
|
|
48
|
+
|
|
49
|
+
It uses a RAG (Retrieval-Augmented Generation) pipeline to intelligently select the most relevant parts of your code before sending them to a local LLM for review. This means it scales to large codebases without hitting context window limits.
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
your code
|
|
53
|
+
│
|
|
54
|
+
▼
|
|
55
|
+
tree-sitter parses into functions/classes
|
|
56
|
+
│
|
|
57
|
+
▼
|
|
58
|
+
sentence-transformers converts chunks to vectors
|
|
59
|
+
│
|
|
60
|
+
▼
|
|
61
|
+
ChromaDB stores all vectors in memory
|
|
62
|
+
│
|
|
63
|
+
▼
|
|
64
|
+
semantic queries retrieve the most relevant chunks
|
|
65
|
+
("security vulnerabilities", "missing error handling", ...)
|
|
66
|
+
│
|
|
67
|
+
▼
|
|
68
|
+
local LLM reviews only what matters
|
|
69
|
+
│
|
|
70
|
+
▼
|
|
71
|
+
actionable feedback printed to terminal
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Features
|
|
77
|
+
|
|
78
|
+
- **Fully local** — runs on your machine, no API keys, no data sent anywhere
|
|
79
|
+
- **RAG pipeline** — semantic retrieval finds the most relevant code across your entire project
|
|
80
|
+
- **AST-based chunking** — splits by functions and classes using tree-sitter, not arbitrary character counts
|
|
81
|
+
- **Multi-query retrieval** — five semantic queries cast different nets across your codebase
|
|
82
|
+
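- **Any file type** — a single file of any text type can be reviewed (Python, JavaScript, JSX, and anything else); only Python is split into function/class chunks, other files typically fall back to one whole-file chunk, and directory mode currently indexes `*.py` files only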
|
|
83
|
+
- **Directory support** — review an entire project at once
|
|
84
|
+
- **Streaming output** — see the review as it generates, token by token
|
|
85
|
+
- **GPU accelerated** — embedding model uses CUDA automatically if available
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Requirements
|
|
90
|
+
|
|
91
|
+
- Python 3.10+
|
|
92
|
+
- [Ollama](https://ollama.com) installed and running
|
|
93
|
+
- A coding model pulled in Ollama
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
ollama pull qwen3-coder:latest
|
|
97
|
+
# or a smaller/faster option:
|
|
98
|
+
ollama pull deepseek-coder:6.7b
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Installation
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
pip install codereview-local
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Or from source:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
git clone https://github.com/Muhammad-NSQ/codereview
|
|
113
|
+
cd codereview
|
|
114
|
+
pip install -e .
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Usage
|
|
120
|
+
|
|
121
|
+
**Review a single file:**
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
codereview path/to/file.py
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Review an entire directory:**
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
codereview path/to/project/
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**Use a different model:**
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
codereview path/to/file.py --model deepseek-coder:6.7b
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Example output
|
|
142
|
+
|
|
143
|
+
```
|
|
144
|
+
$ codereview app/auth.py
|
|
145
|
+
|
|
146
|
+
📂 Indexing app/auth.py...
|
|
147
|
+
3 chunks indexed
|
|
148
|
+
🔎 Running semantic retrieval...
|
|
149
|
+
🤖 Reviewing with LLM...
|
|
150
|
+
|
|
151
|
+
## Critical Security Issues
|
|
152
|
+
|
|
153
|
+
**SQL Injection Vulnerability**
|
|
154
|
+
- Line 3: Direct string concatenation in SQL query
|
|
155
|
+
- Fix: Use parameterized queries: db.query("SELECT * FROM users WHERE id = ?", (id,))
|
|
156
|
+
|
|
157
|
+
**Hardcoded Credentials**
|
|
158
|
+
- Line 2: Database password exposed in plain text
|
|
159
|
+
- Fix: Use environment variables or a secrets manager
|
|
160
|
+
|
|
161
|
+
## Runtime Errors
|
|
162
|
+
|
|
163
|
+
**Division by Zero**
|
|
164
|
+
- Line 12: No check for b == 0 before division
|
|
165
|
+
- Fix: Add validation: if b == 0: raise ValueError("Cannot divide by zero")
|
|
166
|
+
|
|
167
|
+
## Bad Practices
|
|
168
|
+
|
|
169
|
+
**Resource Leak**
|
|
170
|
+
- Line 7: File handle opened but never closed
|
|
171
|
+
- Fix: Use context manager: with open(path) as f:
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## Tech stack
|
|
177
|
+
|
|
178
|
+
| Component | Library | Purpose |
|
|
179
|
+
|---|---|---|
|
|
180
|
+
| CLI | Typer | Command line interface |
|
|
181
|
+
| AST parsing | tree-sitter | Split code by functions/classes |
|
|
182
|
+
| Embeddings | sentence-transformers | Convert code to vectors |
|
|
183
|
+
| Vector DB | ChromaDB | Store and search embeddings |
|
|
184
|
+
| LLM | Ollama | Local language model inference |
|
|
185
|
+
| HTTP | requests | Talk to Ollama API |
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Why RAG for code review?
|
|
190
|
+
|
|
191
|
+
**The naive approach** — dump the entire file into the LLM — breaks on large codebases. A 2000-line file with 80 functions easily exceeds most models' context windows.
|
|
192
|
+
|
|
193
|
+
**The RAG approach** — index everything, retrieve only what's relevant, send a focused context to the LLM. Five semantic queries target different problem categories:
|
|
194
|
+
|
|
195
|
+
- Security vulnerabilities and injection attacks
|
|
196
|
+
- Missing error handling and uncaught exceptions
|
|
197
|
+
- Resource leaks and connection management
|
|
198
|
+
- Bad practices and code smells
|
|
199
|
+
- Input validation and type safety
|
|
200
|
+
|
|
201
|
+
All matching chunks from all files share one ChromaDB collection, so the retrieval competes across your entire codebase — not file by file.
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Project structure
|
|
206
|
+
|
|
207
|
+
```
|
|
208
|
+
codereview/
|
|
209
|
+
├── codereview/
|
|
210
|
+
│ ├── __init__.py
|
|
211
|
+
│ ├── chunker.py # tree-sitter AST parsing
|
|
212
|
+
│ ├── embedder.py # sentence-transformers embeddings
|
|
213
|
+
│ ├── retriever.py # ChromaDB storage and retrieval
|
|
214
|
+
│ ├── reviewer.py # Ollama LLM integration
|
|
215
|
+
│ └── cli.py # Typer CLI and pipeline orchestration
|
|
216
|
+
├── main.py
|
|
217
|
+
└── setup.py
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Author
|
|
223
|
+
|
|
224
|
+
Muhammad — [GitHub](https://github.com/Muhammad-NSQ)
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# codereview
|
|
2
|
+
|
|
3
|
+
A local, privacy-first code review CLI tool powered by RAG and a local LLM. No API keys. No data leaves your machine.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install codereview-local
|
|
7
|
+
codereview your_file.py
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## How it works
|
|
13
|
+
|
|
14
|
+
Most code review tools send your code to a remote API. This one runs entirely on your machine.
|
|
15
|
+
|
|
16
|
+
It uses a RAG (Retrieval-Augmented Generation) pipeline to intelligently select the most relevant parts of your code before sending them to a local LLM for review. This means it scales to large codebases without hitting context window limits.
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
your code
|
|
20
|
+
│
|
|
21
|
+
▼
|
|
22
|
+
tree-sitter parses into functions/classes
|
|
23
|
+
│
|
|
24
|
+
▼
|
|
25
|
+
sentence-transformers converts chunks to vectors
|
|
26
|
+
│
|
|
27
|
+
▼
|
|
28
|
+
ChromaDB stores all vectors in memory
|
|
29
|
+
│
|
|
30
|
+
▼
|
|
31
|
+
semantic queries retrieve the most relevant chunks
|
|
32
|
+
("security vulnerabilities", "missing error handling", ...)
|
|
33
|
+
│
|
|
34
|
+
▼
|
|
35
|
+
local LLM reviews only what matters
|
|
36
|
+
│
|
|
37
|
+
▼
|
|
38
|
+
actionable feedback printed to terminal
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Features
|
|
44
|
+
|
|
45
|
+
- **Fully local** — runs on your machine, no API keys, no data sent anywhere
|
|
46
|
+
- **RAG pipeline** — semantic retrieval finds the most relevant code across your entire project
|
|
47
|
+
- **AST-based chunking** — splits by functions and classes using tree-sitter, not arbitrary character counts
|
|
48
|
+
- **Multi-query retrieval** — five semantic queries cast different nets across your codebase
|
|
49
|
+
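- **Any file type** — a single file of any text type can be reviewed (Python, JavaScript, JSX, and anything else); only Python is split into function/class chunks, other files typically fall back to one whole-file chunk, and directory mode currently indexes `*.py` files only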
|
|
50
|
+
- **Directory support** — review an entire project at once
|
|
51
|
+
- **Streaming output** — see the review as it generates, token by token
|
|
52
|
+
- **GPU accelerated** — embedding model uses CUDA automatically if available
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Requirements
|
|
57
|
+
|
|
58
|
+
- Python 3.10+
|
|
59
|
+
- [Ollama](https://ollama.com) installed and running
|
|
60
|
+
- A coding model pulled in Ollama
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
ollama pull qwen3-coder:latest
|
|
64
|
+
# or a smaller/faster option:
|
|
65
|
+
ollama pull deepseek-coder:6.7b
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Installation
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install codereview-local
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Or from source:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
git clone https://github.com/Muhammad-NSQ/codereview
|
|
80
|
+
cd codereview
|
|
81
|
+
pip install -e .
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Usage
|
|
87
|
+
|
|
88
|
+
**Review a single file:**
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
codereview path/to/file.py
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Review an entire directory:**
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
codereview path/to/project/
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
**Use a different model:**
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
codereview path/to/file.py --model deepseek-coder:6.7b
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Example output
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
$ codereview app/auth.py
|
|
112
|
+
|
|
113
|
+
📂 Indexing app/auth.py...
|
|
114
|
+
3 chunks indexed
|
|
115
|
+
🔎 Running semantic retrieval...
|
|
116
|
+
🤖 Reviewing with LLM...
|
|
117
|
+
|
|
118
|
+
## Critical Security Issues
|
|
119
|
+
|
|
120
|
+
**SQL Injection Vulnerability**
|
|
121
|
+
- Line 3: Direct string concatenation in SQL query
|
|
122
|
+
- Fix: Use parameterized queries: db.query("SELECT * FROM users WHERE id = ?", (id,))
|
|
123
|
+
|
|
124
|
+
**Hardcoded Credentials**
|
|
125
|
+
- Line 2: Database password exposed in plain text
|
|
126
|
+
- Fix: Use environment variables or a secrets manager
|
|
127
|
+
|
|
128
|
+
## Runtime Errors
|
|
129
|
+
|
|
130
|
+
**Division by Zero**
|
|
131
|
+
- Line 12: No check for b == 0 before division
|
|
132
|
+
- Fix: Add validation: if b == 0: raise ValueError("Cannot divide by zero")
|
|
133
|
+
|
|
134
|
+
## Bad Practices
|
|
135
|
+
|
|
136
|
+
**Resource Leak**
|
|
137
|
+
- Line 7: File handle opened but never closed
|
|
138
|
+
- Fix: Use context manager: with open(path) as f:
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Tech stack
|
|
144
|
+
|
|
145
|
+
| Component | Library | Purpose |
|
|
146
|
+
|---|---|---|
|
|
147
|
+
| CLI | Typer | Command line interface |
|
|
148
|
+
| AST parsing | tree-sitter | Split code by functions/classes |
|
|
149
|
+
| Embeddings | sentence-transformers | Convert code to vectors |
|
|
150
|
+
| Vector DB | ChromaDB | Store and search embeddings |
|
|
151
|
+
| LLM | Ollama | Local language model inference |
|
|
152
|
+
| HTTP | requests | Talk to Ollama API |
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Why RAG for code review?
|
|
157
|
+
|
|
158
|
+
**The naive approach** — dump the entire file into the LLM — breaks on large codebases. A 2000-line file with 80 functions easily exceeds most models' context windows.
|
|
159
|
+
|
|
160
|
+
**The RAG approach** — index everything, retrieve only what's relevant, send a focused context to the LLM. Five semantic queries target different problem categories:
|
|
161
|
+
|
|
162
|
+
- Security vulnerabilities and injection attacks
|
|
163
|
+
- Missing error handling and uncaught exceptions
|
|
164
|
+
- Resource leaks and connection management
|
|
165
|
+
- Bad practices and code smells
|
|
166
|
+
- Input validation and type safety
|
|
167
|
+
|
|
168
|
+
All matching chunks from all files share one ChromaDB collection, so the retrieval competes across your entire codebase — not file by file.
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Project structure
|
|
173
|
+
|
|
174
|
+
```
|
|
175
|
+
codereview/
|
|
176
|
+
├── codereview/
|
|
177
|
+
│ ├── __init__.py
|
|
178
|
+
│ ├── chunker.py # tree-sitter AST parsing
|
|
179
|
+
│ ├── embedder.py # sentence-transformers embeddings
|
|
180
|
+
│ ├── retriever.py # ChromaDB storage and retrieval
|
|
181
|
+
│ ├── reviewer.py # Ollama LLM integration
|
|
182
|
+
│ └── cli.py # Typer CLI and pipeline orchestration
|
|
183
|
+
├── main.py
|
|
184
|
+
└── setup.py
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Author
|
|
190
|
+
|
|
191
|
+
Muhammad — [GitHub](https://github.com/Muhammad-NSQ)
|
|
File without changes
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import tree_sitter_python as tspython
|
|
2
|
+
from tree_sitter import Language, Parser
|
|
3
|
+
|
|
4
|
+
PY_LANGUAGE = Language(tspython.language())
parser = Parser(PY_LANGUAGE)

# Top-level node types that become their own chunk.  In the tree-sitter
# Python grammar a decorated function/class is wrapped in a
# ``decorated_definition`` node, so it must be listed explicitly or decorated
# definitions would be skipped.
_CHUNK_NODE_TYPES = (
    "function_definition",
    "class_definition",
    "decorated_definition",
)


def chunk_code(source_code: str, file_path: str) -> list[dict]:
    """Parse Python code and split it into top-level function/class chunks.

    Args:
        source_code: Full text of the file to chunk.
        file_path: Path stored in every chunk's ``"file"`` entry.

    Returns:
        One dict per top-level function or class (decorators included) with
        the keys ``"text"``, ``"file"``, ``"start_line"``, ``"end_line"``
        (both 1-based) and ``"type"``.  If no function or class is found, a
        single chunk of type ``"module"`` covering the whole file is returned.
    """
    # tree-sitter reports byte offsets into the UTF-8 buffer it parsed, so the
    # chunk text has to be sliced from the bytes, not from the str: the two
    # diverge as soon as the source contains a non-ASCII character.
    source_bytes = source_code.encode("utf8")
    tree = parser.parse(source_bytes)

    chunks = []
    for node in tree.root_node.children:
        if node.type not in _CHUNK_NODE_TYPES:
            continue

        # Report the wrapped definition's type for decorated nodes so "type"
        # stays "function_definition" / "class_definition".
        definition = node
        if node.type == "decorated_definition":
            definition = node.child_by_field_name("definition") or node

        chunks.append({
            "text": source_bytes[node.start_byte:node.end_byte].decode("utf8"),
            "file": file_path,
            "start_line": node.start_point[0] + 1,
            "end_line": node.end_point[0] + 1,
            "type": definition.type,
        })

    # If no functions/classes found, treat whole file as one chunk
    if not chunks:
        chunks.append({
            "text": source_code,
            "file": file_path,
            "start_line": 1,
            "end_line": source_code.count("\n") + 1,
            "type": "module",
        })

    return chunks
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from codereview.chunker import chunk_code
|
|
4
|
+
from codereview.embedder import embed_chunks, embed_query
|
|
5
|
+
from codereview.retriever import store_chunks, retrieve_chunks, get_or_create_collection
|
|
6
|
+
from codereview.reviewer import review_chunks
|
|
7
|
+
|
|
8
|
+
# Typer application object; commands are registered on it with @app.command().
app = typer.Typer()

# Name of the single collection that every indexed file is stored in.
COLLECTION = "project_review"

# One retrieval query per problem category; each is embedded and run against
# the shared collection by run_review().
REVIEW_QUERIES = [
    "security vulnerabilities SQL injection hardcoded credentials exposed secrets",
    "missing error handling no try except uncaught exceptions crashes",
    "resource leaks file handles not closed database connections not closed",
    "bad practices code smells inefficient logic poor structure",
    "input validation missing type checking no sanitization",
]
|
|
19
|
+
|
|
20
|
+
def index_file(file_path: str) -> int:
    """Chunk and embed a file and store it in the shared collection.

    Args:
        file_path: Path of the source file to index.

    Returns:
        The number of chunks produced for (and stored from) this file.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding.
    # Undecodable bytes are replaced so that one oddly encoded file does not
    # abort a whole directory run.
    source_code = Path(file_path).read_text(encoding="utf-8", errors="replace")
    chunks = embed_chunks(chunk_code(source_code, file_path))
    store_chunks(chunks, COLLECTION)
    return len(chunks)
|
|
27
|
+
|
|
28
|
+
def run_review(n_results: int = 10, model: str | None = None) -> str:
    """Query the shared collection with semantic queries and review results.

    Args:
        n_results: Number of chunks requested per query.
        model: Ollama model name passed on to ``review_chunks``.  ``None``
            leaves ``review_chunks`` on its own default model.

    Returns:
        The string returned by ``review_chunks``, or ``"No chunks retrieved."``
        when no query returned a document.
    """
    seen_ids = set()
    all_documents = []

    for query in REVIEW_QUERIES:
        query_embedding = embed_query(query)
        results = retrieve_chunks(query_embedding, n_results=n_results, collection_name=COLLECTION)
        # The same chunk can match several queries; keep its first occurrence.
        for doc, id_ in zip(results["documents"][0], results["ids"][0]):
            if id_ not in seen_ids:
                seen_ids.add(id_)
                all_documents.append(doc)

    if not all_documents:
        return "No chunks retrieved."

    if model is None:
        return review_chunks(all_documents)
    return review_chunks(all_documents, model=model)


@app.command()
def review(
    path: str = typer.Argument(..., help="File or directory to review"),
    model: str = typer.Option("qwen3-coder:latest", help="Ollama model to use"),
):
    """Review code using local LLM and RAG."""
    # The docstring above is left as-is because Typer uses it as help text.
    p = Path(path)

    # reset collection for each run
    # NOTE(review): this builds its own chromadb client; whether it shares
    # state with the client in codereview.retriever is not visible here.
    try:
        import chromadb
        from chromadb.config import Settings
        client = chromadb.Client(Settings(anonymized_telemetry=False))
        client.delete_collection(COLLECTION)
    except Exception:
        # Deliberately best-effort: a missing collection is the normal case.
        pass

    if p.is_file():
        typer.echo(f"📂 Indexing {p}...")
        n = index_file(str(p))
        typer.echo(f" {n} chunks indexed")

    elif p.is_dir():
        # Directory mode only picks up Python files.
        files = list(p.rglob("*.py"))
        typer.echo(f"📂 Indexing {len(files)} files...")
        total = 0
        for f in files:
            n = index_file(str(f))
            total += n
            typer.echo(f" {f} → {n} chunks")
        typer.echo(f" Total: {total} chunks indexed\n")

    else:
        typer.echo(f"Error: {path} is not a valid file or directory")
        raise typer.Exit(1)

    typer.echo("🔎 Running semantic retrieval...")
    typer.echo("🤖 Reviewing with LLM...\n")
    # Forward --model; previously the option was parsed but never used.
    result = run_review(model=model)

    # review_chunks already streams the model output to stdout while it is
    # generated, so echoing a successful result would print the review twice.
    # Only messages that were returned without being streamed are echoed.
    if result == "No chunks retrieved." or result.startswith("Error:"):
        typer.echo(result)


if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import os
import torch

# These variables are set before sentence_transformers is imported below;
# keep this ordering.
# NOTE(review): presumably silences the tokenizers parallelism warning and
# stops huggingface_hub from sending a stored token implicitly — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"

from sentence_transformers import SentenceTransformer

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Module-level embedding model shared by embed_chunks() and embed_query().
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
|
|
10
|
+
|
|
11
|
+
def embed_chunks(chunks: list[dict]) -> list[dict]:
    """Attach an ``"embedding"`` entry to every chunk and return the list.

    The ``"text"`` of all chunks is encoded in one call to the module-level
    model.  The chunk dicts are modified in place; the returned list is the
    same object that was passed in.
    """
    vectors = model.encode(
        [item["text"] for item in chunks],
        show_progress_bar=False,
    )

    for item, vector in zip(chunks, vectors):
        item["embedding"] = vector.tolist()

    return chunks
|
|
20
|
+
|
|
21
|
+
def embed_query(query: str) -> list[float]:
    """Encode a natural-language query with the module-level model.

    Returns the result of ``model.encode(query)`` converted with ``.tolist()``.
    """
    query_vector = model.encode(query)
    return query_vector.tolist()
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import chromadb
|
|
2
|
+
from chromadb.config import Settings
|
|
3
|
+
|
|
4
|
+
# Module-level ChromaDB client, created with anonymized telemetry switched off.
client = chromadb.Client(Settings(anonymized_telemetry=False))


def get_or_create_collection(name: str = "codereview"):
    """Return the collection called *name* from the module-level client.

    Delegates to ``client.get_or_create_collection``.
    """
    collection = client.get_or_create_collection(name=name)
    return collection
|
|
8
|
+
|
|
9
|
+
def store_chunks(chunks: list[dict], collection_name: str = "codereview") -> None:
    """Store embedded chunks in ChromaDB.

    Args:
        chunks: Chunk dicts carrying the keys ``"text"``, ``"file"``,
            ``"start_line"``, ``"end_line"``, ``"type"`` and ``"embedding"``.
        collection_name: Name of the collection to upsert into.

    Each chunk is stored under the id ``"<file>:<start_line>"``, so storing
    the same file again overwrites its earlier entries instead of
    duplicating them.
    """
    if not chunks:
        # Nothing to store; also avoids calling upsert() with empty lists,
        # which ChromaDB is expected to reject.
        return

    collection = get_or_create_collection(collection_name)
    collection.upsert(
        ids=[f"{chunk['file']}:{chunk['start_line']}" for chunk in chunks],
        embeddings=[chunk["embedding"] for chunk in chunks],
        documents=[chunk["text"] for chunk in chunks],
        metadatas=[
            {
                "file": chunk["file"],
                "start_line": chunk["start_line"],
                "end_line": chunk["end_line"],
                "type": chunk["type"],
            }
            for chunk in chunks
        ],
    )
|
|
24
|
+
|
|
25
|
+
def retrieve_chunks(query_embedding: list[float], n_results: int = 5, collection_name: str = "codereview"):
    """Query *collection_name* with one embedding and return the raw result.

    The value returned by ``collection.query`` is passed back unchanged, with
    up to *n_results* matches requested for the single query embedding.
    """
    return get_or_create_collection(collection_name).query(
        query_embeddings=[query_embedding],
        n_results=n_results,
    )
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
|
|
3
|
+
OLLAMA_URL = "http://localhost:11434/api/generate"
DEFAULT_MODEL = "qwen3-coder:latest"


def review_chunks(documents: list[str], model: str = DEFAULT_MODEL) -> str:
    """Send retrieved code chunks to Ollama for review with streaming output.

    Args:
        documents: Code chunks to review; they are joined with a ``---``
            separator and embedded in a single prompt.
        model: Name of the Ollama model to request.

    Returns:
        The complete generated review.  Tokens are also printed to stdout as
        they arrive.  On failure nothing is raised; a string starting with
        ``"Error:"`` is returned instead.
    """
    # Function-scope import, hoisted out of the per-line streaming loop.
    import json

    combined_code = "\n\n---\n\n".join(documents)

    prompt = f"""You are an expert code reviewer. Review the following code and provide specific, actionable feedback.

Focus on:
1. Bugs and potential runtime errors
2. Security issues
3. Bad practices and code smells
4. Performance problems
5. Missing error handling

Be specific — mention line numbers or function names when possible.
Do not praise the code, only give constructive feedback.

Code to review:
{combined_code}

Review:"""

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": True,
    }

    full_response = []

    try:
        with requests.post(OLLAMA_URL, json=payload, stream=True, timeout=120) as response:
            response.raise_for_status()
            # The streamed body is one JSON object per line.
            for line in response.iter_lines():
                if not line:
                    continue
                chunk = json.loads(line)
                if "error" in chunk:
                    # Report an error sent inside the stream instead of
                    # silently returning a truncated review.
                    if full_response:
                        print()
                    return f"Error: {chunk['error']}"
                token = chunk.get("response", "")
                print(token, end="", flush=True)
                full_response.append(token)
                if chunk.get("done", False):
                    break
        print()
        return "".join(full_response)

    except requests.exceptions.ConnectionError:
        return "Error: Ollama is not running. Start it with: ollama serve"
    except requests.exceptions.Timeout:
        return "Error: Ollama timed out. The model may be overloaded."
    except Exception as e:
        return f"Error: {e}"
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codereview-local
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local RAG-based code review CLI. No API keys. Runs fully on your machine.
|
|
5
|
+
Home-page: https://github.com/Muhammad-NSQ/codereview
|
|
6
|
+
Author: Muhammad
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
12
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: typer
|
|
17
|
+
Requires-Dist: chromadb
|
|
18
|
+
Requires-Dist: sentence-transformers
|
|
19
|
+
Requires-Dist: tree-sitter
|
|
20
|
+
Requires-Dist: tree-sitter-python
|
|
21
|
+
Requires-Dist: requests
|
|
22
|
+
Requires-Dist: torch
|
|
23
|
+
Dynamic: author
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+
# codereview
|
|
35
|
+
|
|
36
|
+
A local, privacy-first code review CLI tool powered by RAG and a local LLM. No API keys. No data leaves your machine.
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install codereview-local
|
|
40
|
+
codereview your_file.py
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## How it works
|
|
46
|
+
|
|
47
|
+
Most code review tools send your code to a remote API. This one runs entirely on your machine.
|
|
48
|
+
|
|
49
|
+
It uses a RAG (Retrieval-Augmented Generation) pipeline to intelligently select the most relevant parts of your code before sending them to a local LLM for review. This means it scales to large codebases without hitting context window limits.
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
your code
|
|
53
|
+
│
|
|
54
|
+
▼
|
|
55
|
+
tree-sitter parses into functions/classes
|
|
56
|
+
│
|
|
57
|
+
▼
|
|
58
|
+
sentence-transformers converts chunks to vectors
|
|
59
|
+
│
|
|
60
|
+
▼
|
|
61
|
+
ChromaDB stores all vectors in memory
|
|
62
|
+
│
|
|
63
|
+
▼
|
|
64
|
+
semantic queries retrieve the most relevant chunks
|
|
65
|
+
("security vulnerabilities", "missing error handling", ...)
|
|
66
|
+
│
|
|
67
|
+
▼
|
|
68
|
+
local LLM reviews only what matters
|
|
69
|
+
│
|
|
70
|
+
▼
|
|
71
|
+
actionable feedback printed to terminal
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Features
|
|
77
|
+
|
|
78
|
+
- **Fully local** — runs on your machine, no API keys, no data sent anywhere
|
|
79
|
+
- **RAG pipeline** — semantic retrieval finds the most relevant code across your entire project
|
|
80
|
+
- **AST-based chunking** — splits by functions and classes using tree-sitter, not arbitrary character counts
|
|
81
|
+
- **Multi-query retrieval** — five semantic queries cast different nets across your codebase
|
|
82
|
+
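- **Any file type** — a single file of any text type can be reviewed (Python, JavaScript, JSX, and anything else); only Python is split into function/class chunks, other files typically fall back to one whole-file chunk, and directory mode currently indexes `*.py` files only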
|
|
83
|
+
- **Directory support** — review an entire project at once
|
|
84
|
+
- **Streaming output** — see the review as it generates, token by token
|
|
85
|
+
- **GPU accelerated** — embedding model uses CUDA automatically if available
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Requirements
|
|
90
|
+
|
|
91
|
+
- Python 3.10+
|
|
92
|
+
- [Ollama](https://ollama.com) installed and running
|
|
93
|
+
- A coding model pulled in Ollama
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
ollama pull qwen3-coder:latest
|
|
97
|
+
# or a smaller/faster option:
|
|
98
|
+
ollama pull deepseek-coder:6.7b
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Installation
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
pip install codereview-local
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Or from source:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
git clone https://github.com/Muhammad-NSQ/codereview
|
|
113
|
+
cd codereview
|
|
114
|
+
pip install -e .
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Usage
|
|
120
|
+
|
|
121
|
+
**Review a single file:**
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
codereview path/to/file.py
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Review an entire directory:**
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
codereview path/to/project/
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**Use a different model:**
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
codereview path/to/file.py --model deepseek-coder:6.7b
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Example output
|
|
142
|
+
|
|
143
|
+
```
|
|
144
|
+
$ codereview app/auth.py
|
|
145
|
+
|
|
146
|
+
📂 Indexing app/auth.py...
|
|
147
|
+
3 chunks indexed
|
|
148
|
+
🔎 Running semantic retrieval...
|
|
149
|
+
🤖 Reviewing with LLM...
|
|
150
|
+
|
|
151
|
+
## Critical Security Issues
|
|
152
|
+
|
|
153
|
+
**SQL Injection Vulnerability**
|
|
154
|
+
- Line 3: Direct string concatenation in SQL query
|
|
155
|
+
- Fix: Use parameterized queries: db.query("SELECT * FROM users WHERE id = ?", (id,))
|
|
156
|
+
|
|
157
|
+
**Hardcoded Credentials**
|
|
158
|
+
- Line 2: Database password exposed in plain text
|
|
159
|
+
- Fix: Use environment variables or a secrets manager
|
|
160
|
+
|
|
161
|
+
## Runtime Errors
|
|
162
|
+
|
|
163
|
+
**Division by Zero**
|
|
164
|
+
- Line 12: No check for b == 0 before division
|
|
165
|
+
- Fix: Add validation: if b == 0: raise ValueError("Cannot divide by zero")
|
|
166
|
+
|
|
167
|
+
## Bad Practices
|
|
168
|
+
|
|
169
|
+
**Resource Leak**
|
|
170
|
+
- Line 7: File handle opened but never closed
|
|
171
|
+
- Fix: Use context manager: with open(path) as f:
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## Tech stack
|
|
177
|
+
|
|
178
|
+
| Component | Library | Purpose |
|
|
179
|
+
|---|---|---|
|
|
180
|
+
| CLI | Typer | Command line interface |
|
|
181
|
+
| AST parsing | tree-sitter | Split code by functions/classes |
|
|
182
|
+
| Embeddings | sentence-transformers | Convert code to vectors |
|
|
183
|
+
| Vector DB | ChromaDB | Store and search embeddings |
|
|
184
|
+
| LLM | Ollama | Local language model inference |
|
|
185
|
+
| HTTP | requests | Talk to Ollama API |
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Why RAG for code review?
|
|
190
|
+
|
|
191
|
+
**The naive approach** — dump the entire file into the LLM — breaks on large codebases. A 2000-line file with 80 functions easily exceeds most models' context windows.
|
|
192
|
+
|
|
193
|
+
**The RAG approach** — index everything, retrieve only what's relevant, send a focused context to the LLM. Five semantic queries target different problem categories:
|
|
194
|
+
|
|
195
|
+
- Security vulnerabilities and injection attacks
|
|
196
|
+
- Missing error handling and uncaught exceptions
|
|
197
|
+
- Resource leaks and connection management
|
|
198
|
+
- Bad practices and code smells
|
|
199
|
+
- Input validation and type safety
|
|
200
|
+
|
|
201
|
+
All matching chunks from all files share one ChromaDB collection, so the retrieval competes across your entire codebase — not file by file.
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Project structure
|
|
206
|
+
|
|
207
|
+
```
|
|
208
|
+
codereview/
|
|
209
|
+
├── codereview/
|
|
210
|
+
│ ├── __init__.py
|
|
211
|
+
│ ├── chunker.py # tree-sitter AST parsing
|
|
212
|
+
│ ├── embedder.py # sentence-transformers embeddings
|
|
213
|
+
│ ├── retriever.py # ChromaDB storage and retrieval
|
|
214
|
+
│ ├── reviewer.py # Ollama LLM integration
|
|
215
|
+
│ └── cli.py # Typer CLI and pipeline orchestration
|
|
216
|
+
├── main.py
|
|
217
|
+
└── setup.py
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Author
|
|
223
|
+
|
|
224
|
+
Muhammad — [GitHub](https://github.com/Muhammad-NSQ)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.py
|
|
4
|
+
codereview/__init__.py
|
|
5
|
+
codereview/chunker.py
|
|
6
|
+
codereview/cli.py
|
|
7
|
+
codereview/embedder.py
|
|
8
|
+
codereview/retriever.py
|
|
9
|
+
codereview/reviewer.py
|
|
10
|
+
codereview_local.egg-info/PKG-INFO
|
|
11
|
+
codereview_local.egg-info/SOURCES.txt
|
|
12
|
+
codereview_local.egg-info/dependency_links.txt
|
|
13
|
+
codereview_local.egg-info/entry_points.txt
|
|
14
|
+
codereview_local.egg-info/requires.txt
|
|
15
|
+
codereview_local.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
codereview
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

# Read the README through a context manager so the handle is closed, and with
# an explicit encoding: the file contains emoji and box-drawing characters
# that a non-UTF-8 default locale cannot decode.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="codereview-local",
    version="0.1.0",
    description="Local RAG-based code review CLI. No API keys. Runs fully on your machine.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Muhammad",
    url="https://github.com/Muhammad-NSQ/codereview",
    packages=find_packages(),
    install_requires=[
        "typer",
        "chromadb",
        "sentence-transformers",
        "tree-sitter",
        "tree-sitter-python",
        "requests",
        "torch",
    ],
    entry_points={
        # Installs the `codereview` command, which runs the Typer app.
        "console_scripts": [
            "codereview=codereview.cli:app",
        ],
    },
    python_requires=">=3.10",
    license="MIT",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Software Development :: Quality Assurance",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
)
|