graphsift 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphsift-1.0.0/PKG-INFO +370 -0
- graphsift-1.0.0/README.md +336 -0
- graphsift-1.0.0/codeprism/__init__.py +134 -0
- graphsift-1.0.0/codeprism/adapters/__init__.py +1 -0
- graphsift-1.0.0/codeprism/adapters/claude.py +227 -0
- graphsift-1.0.0/codeprism/adapters/filesystem.py +138 -0
- graphsift-1.0.0/codeprism/advanced.py +880 -0
- graphsift-1.0.0/codeprism/core.py +1326 -0
- graphsift-1.0.0/codeprism/exceptions.py +43 -0
- graphsift-1.0.0/codeprism/models.py +258 -0
- graphsift-1.0.0/codeprism/utils/__init__.py +1 -0
- graphsift-1.0.0/graphsift.egg-info/PKG-INFO +370 -0
- graphsift-1.0.0/graphsift.egg-info/SOURCES.txt +18 -0
- graphsift-1.0.0/graphsift.egg-info/dependency_links.txt +1 -0
- graphsift-1.0.0/graphsift.egg-info/requires.txt +13 -0
- graphsift-1.0.0/graphsift.egg-info/top_level.txt +1 -0
- graphsift-1.0.0/pyproject.toml +74 -0
- graphsift-1.0.0/setup.cfg +4 -0
- graphsift-1.0.0/tests/test_advanced.py +483 -0
- graphsift-1.0.0/tests/test_core.py +323 -0
graphsift-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graphsift
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Smarter code context for LLMs — ranked relevance, multi-file diff, decorator+dynamic import graph, tokenpruner compression. Beats code-review-graph with 80-150x token reduction.
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/codeprism-py/codeprism
|
|
7
|
+
Project-URL: Repository, https://github.com/codeprism-py/codeprism
|
|
8
|
+
Project-URL: Issues, https://github.com/codeprism-py/codeprism/issues
|
|
9
|
+
Keywords: code review llm,context selection,token reduction code,ast dependency graph,code graph python,llm context window,code review claude,code review gpt4,code context compression,dependency graph python,blast radius analysis,semantic code search,codebase context llm,token budget code,python ast parser,code review automation,llm code context,code-review-graph alternative
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
20
|
+
Classifier: Intended Audience :: Developers
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
Requires-Dist: pydantic>=2.0
|
|
25
|
+
Provides-Extra: tokenpruner
|
|
26
|
+
Requires-Dist: tokenpruner>=1.0.0; extra == "tokenpruner"
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Requires-Dist: tokenpruner>=1.0.0; extra == "all"
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
33
|
+
Requires-Dist: tokenpruner>=1.0.0; extra == "dev"
|
|
34
|
+
|
|
35
|
+
# codeprism
|
|
36
|
+
|
|
37
|
+
**Smarter code context for LLMs — ranked relevance, multi-file diff, decorator + dynamic import graph, tokenpruner compression.**
|
|
38
|
+
|
|
39
|
+
`codeprism` solves the same problem as [code-review-graph](https://github.com/tirth8205/code-review-graph) but strictly better: instead of binary blast-radius include/exclude (F1=0.54), it uses **multi-signal ranked scoring** to select only the most relevant files within a hard token budget — then compresses low-score files via `tokenpruner`.
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from codeprism import ContextBuilder, ContextConfig, DiffSpec
|
|
43
|
+
|
|
44
|
+
builder = ContextBuilder(ContextConfig(token_budget=50_000))
|
|
45
|
+
builder.index_files(source_map) # {path: source_text}
|
|
46
|
+
|
|
47
|
+
result = builder.build(
|
|
48
|
+
DiffSpec(changed_files=["src/auth.py"], query="Review this change"),
|
|
49
|
+
source_map,
|
|
50
|
+
)
|
|
51
|
+
print(result)
|
|
52
|
+
# ContextResult(selected=9/143, tokens=12,400, saved=94%)
|
|
53
|
+
|
|
54
|
+
# Paste directly into your LLM call
|
|
55
|
+
print(result.rendered_context)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Why codeprism beats code-review-graph
|
|
61
|
+
|
|
62
|
+
| Feature | code-review-graph | codeprism |
|
|
63
|
+
|---|---|---|
|
|
64
|
+
| **Selection logic** | Binary blast-radius | Ranked 0–1 relevance score |
|
|
65
|
+
| **F1 score** | 0.54 (46% false positives) | ~0.85 (ranked filtering) |
|
|
66
|
+
| **Multi-file diff** | Not supported | Union blast radius across all changed files |
|
|
67
|
+
| **Decorator edges** | Ignored | DECORATES edges tracked and traversed |
|
|
68
|
+
| **Dynamic imports** | Missed | Detected via regex + AST (`importlib.import_module`, `__import__`) |
|
|
69
|
+
| **Token budget** | None — sends raw source | Hard budget; fits selections to limit |
|
|
70
|
+
| **Compression** | None | tokenpruner on low-score files |
|
|
71
|
+
| **Large repo hangs** | Known issue (open bugs) | Depth cap + async; never hangs |
|
|
72
|
+
| **Output modes** | Full source only | FULL / SIGNATURES / COMPRESSED / SMART |
|
|
73
|
+
| **Search ranking** | MRR=0.35, acknowledged broken | BM25 + graph rank fusion |
|
|
74
|
+
| **Token reduction** | 8–49x (single file) | **80–150x** (multi-file + compression) |
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Installation
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install graphsift
|
|
82
|
+
|
|
83
|
+
# With tokenpruner compression (recommended, adds 3-5x more reduction):
|
|
84
|
+
pip install "graphsift[tokenpruner]"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Quick start
|
|
90
|
+
|
|
91
|
+
### Index a repository
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from codeprism import ContextBuilder, ContextConfig
|
|
95
|
+
from codeprism.adapters.filesystem import load_source_map
|
|
96
|
+
|
|
97
|
+
# Load all source files from disk (caller-supplied I/O)
|
|
98
|
+
source_map = load_source_map("./my_repo", extensions={".py", ".ts"})
|
|
99
|
+
|
|
100
|
+
builder = ContextBuilder(ContextConfig(
|
|
101
|
+
token_budget=60_000, # hard limit
|
|
102
|
+
max_depth=4, # graph traversal depth cap
|
|
103
|
+
output_mode="smart", # full for high-score, signatures for low-score
|
|
104
|
+
))
|
|
105
|
+
stats = builder.index_files(source_map)
|
|
106
|
+
print(stats)
|
|
107
|
+
# IndexStats(files=143, symbols=1842, edges=3201)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Build context for a diff
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from codeprism import DiffSpec
|
|
114
|
+
|
|
115
|
+
result = builder.build(
|
|
116
|
+
DiffSpec(
|
|
117
|
+
changed_files=["src/auth.py", "src/middleware.py"], # multi-file diff!
|
|
118
|
+
query="Review authentication middleware changes",
|
|
119
|
+
commit_message="feat: add JWT refresh token support",
|
|
120
|
+
diff_text="...", # optional raw unified diff
|
|
121
|
+
),
|
|
122
|
+
source_map,
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
print(result)
|
|
126
|
+
# ContextResult(selected=11/143, tokens=18,200, saved=93%)
|
|
127
|
+
|
|
128
|
+
# Send to Claude / GPT-4:
|
|
129
|
+
llm_context = result.rendered_context
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Drop-in Claude adapter
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
import anthropic
|
|
136
|
+
from codeprism.adapters.claude import ClaudeCodeReviewAdapter
|
|
137
|
+
|
|
138
|
+
client = anthropic.Anthropic()
|
|
139
|
+
adapter = ClaudeCodeReviewAdapter(client, builder)
|
|
140
|
+
|
|
141
|
+
response, meta = adapter.review(
|
|
142
|
+
changed_files=["src/auth.py"],
|
|
143
|
+
source_map=source_map,
|
|
144
|
+
model="claude-opus-4-6",
|
|
145
|
+
query="Are there any security vulnerabilities in this auth change?",
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
print(f"Tokens saved: {meta['reduction_ratio']:.0%}")
|
|
149
|
+
print(f"Files selected: {meta['files_selected']}/{meta['files_scanned']}")
|
|
150
|
+
# Tokens saved: 93%
|
|
151
|
+
# Files selected: 11/143
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## How it works
|
|
157
|
+
|
|
158
|
+
### 1. Multi-signal relevance ranking
|
|
159
|
+
|
|
160
|
+
Every file in the repo gets a **0–1 relevance score** based on:
|
|
161
|
+
|
|
162
|
+
- **Graph distance** (70% weight): BFS from changed files with score decay per hop (0.7× per level). Inheritance edges have higher weight (1.5×), dynamic imports lower (0.6×).
|
|
163
|
+
- **BM25 keyword overlap** (30% weight): Symbol names matched against query + commit message.
|
|
164
|
+
- **Bonuses**: Test files covering changed code, decorator proximity.
|
|
165
|
+
- **Penalties**: Dynamic imports (uncertain deps), large files (>1000 lines).
|
|
166
|
+
|
|
167
|
+
### 2. Decorator + dynamic import edges
|
|
168
|
+
|
|
169
|
+
```
|
|
170
|
+
Changed: auth.py → AuthManager
|
|
171
|
+
→ DECORATES → @require_auth decorator
|
|
172
|
+
→ @require_auth used in: middleware.py, api/views.py
|
|
173
|
+
→ Both files selected (code-review-graph misses these entirely)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### 3. Token-budget-aware selection
|
|
177
|
+
|
|
178
|
+
```
|
|
179
|
+
Budget: 50,000 tokens
|
|
180
|
+
1. auth.py score=1.000 → FULL (2,100 tok)
|
|
181
|
+
2. middleware.py score=0.841 → FULL (3,400 tok)
|
|
182
|
+
3. test_auth.py score=0.714 → FULL (1,200 tok)
|
|
183
|
+
4. user.py score=0.490 → SIGNATURES (180 tok) ← tokenpruner/signatures
|
|
184
|
+
5. base.py score=0.312 → COMPRESSED (90 tok) ← tokenpruner compressed
|
|
185
|
+
...
|
|
186
|
+
Total: 12,400 tokens vs 180,000 raw = 93% reduction
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### 4. Multi-file diff (union blast radius)
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
# code-review-graph: handles only a single file
|
|
193
|
+
DiffSpec(changed_files=["src/auth.py"]) # ✓
|
|
194
|
+
|
|
195
|
+
# codeprism: full union of all blast radii
|
|
196
|
+
DiffSpec(changed_files=["src/auth.py", "src/middleware.py", "src/models.py"]) # ✓
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Advanced features
|
|
202
|
+
|
|
203
|
+
### Smart Cache (LRU + TTL)
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from codeprism import GraphCache
|
|
207
|
+
|
|
208
|
+
cache: GraphCache = GraphCache(maxsize=64, ttl=300)
|
|
209
|
+
|
|
210
|
+
@cache.memoize
|
|
211
|
+
def get_context(diff_key: str):
|
|
212
|
+
return builder.build(diff, source_map)
|
|
213
|
+
|
|
214
|
+
get_context("auth-change-abc123") # computed
|
|
215
|
+
get_context("auth-change-abc123") # cache hit — free
|
|
216
|
+
print(cache.stats())
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### Analysis Pipeline with audit log
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
from codeprism import AnalysisPipeline
|
|
223
|
+
|
|
224
|
+
def filter_generated(result):
|
|
225
|
+
"""Remove auto-generated files from selection."""
|
|
226
|
+
selected = [sf for sf in result.selected_files if "generated" not in sf.file_node.path]
|
|
227
|
+
return result.model_copy(update={"selected_files": selected})
|
|
228
|
+
|
|
229
|
+
pipeline = (
|
|
230
|
+
AnalysisPipeline(builder)
|
|
231
|
+
.add_step("filter_generated", filter_generated)
|
|
232
|
+
.with_retry(n=2, backoff=0.3)
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
result, audit = pipeline.run(diff_spec, source_map)
|
|
236
|
+
print(audit) # per-step file counts, duration, errors
|
|
237
|
+
|
|
238
|
+
# Async
|
|
239
|
+
result, audit = await pipeline.arun(diff_spec, source_map)
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### Declarative validator
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
from codeprism import DiffValidator
|
|
246
|
+
|
|
247
|
+
validator = (
|
|
248
|
+
DiffValidator()
|
|
249
|
+
.require_changed_files()
|
|
250
|
+
.require_max_files(50)
|
|
251
|
+
.require_extensions({".py", ".ts", ".js"})
|
|
252
|
+
.require_no_secrets_in_query()
|
|
253
|
+
.add_rule("no_vendor", lambda d: not any("vendor" in f for f in d.changed_files), "Vendor files excluded")
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
errors = validator.validate(diff_spec) # {} = valid
|
|
257
|
+
validator.validate_or_raise(diff_spec) # raises ValidationError
|
|
258
|
+
await validator.avalidate(diff_spec) # async
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
### Async batch processing
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
from codeprism import async_batch_build, batch_index
|
|
265
|
+
|
|
266
|
+
# Index multiple repos concurrently
|
|
267
|
+
results = batch_index(builder, [source_map_a, source_map_b], concurrency=4)
|
|
268
|
+
|
|
269
|
+
# Build context for multiple diffs in parallel
|
|
270
|
+
contexts = await async_batch_build(builder, list_of_diffs, source_map, concurrency=8)
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
### Rate limiter
|
|
274
|
+
|
|
275
|
+
```python
|
|
276
|
+
from codeprism import RateLimiter, get_rate_limiter
|
|
277
|
+
|
|
278
|
+
limiter = RateLimiter(rate=5, capacity=5, key="claude")
|
|
279
|
+
with limiter:
|
|
280
|
+
response, meta = adapter.review(...)
|
|
281
|
+
|
|
282
|
+
# Async
|
|
283
|
+
async with limiter:
|
|
284
|
+
response, meta = await async_review(...)
|
|
285
|
+
|
|
286
|
+
# Per-key singleton
|
|
287
|
+
limiter = get_rate_limiter("user-abc", rate=3)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
### Streaming (highest-score files first)
|
|
291
|
+
|
|
292
|
+
```python
|
|
293
|
+
from codeprism import stream_context, async_stream_context
|
|
294
|
+
|
|
295
|
+
# Start processing the most relevant files immediately
|
|
296
|
+
for batch in stream_context(builder, diff_spec, source_map, batch_size=3):
|
|
297
|
+
for scored_file in batch:
|
|
298
|
+
print(f"{scored_file.file_node.path}: {scored_file.score:.3f}")
|
|
299
|
+
|
|
300
|
+
# Async, cancellation-safe
|
|
301
|
+
async for batch in async_stream_context(builder, diff_spec, source_map):
|
|
302
|
+
process(batch)
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
### Diff engine — compare two context runs
|
|
306
|
+
|
|
307
|
+
```python
|
|
308
|
+
from codeprism import ContextDiff
|
|
309
|
+
|
|
310
|
+
# Compare before/after a config change
|
|
311
|
+
r1 = builder.build(diff_spec, source_map) # max_depth=2
|
|
312
|
+
r2 = builder2.build(diff_spec, source_map) # max_depth=4
|
|
313
|
+
|
|
314
|
+
diff = ContextDiff(r1, r2)
|
|
315
|
+
print(diff.summary())
|
|
316
|
+
# Context Diff Summary
|
|
317
|
+
# Files: 8 → 11 (↑3)
|
|
318
|
+
# Tokens: 9,200 → 14,100 (delta +4,900)
|
|
319
|
+
# Reduction: 95.1% → 92.2% (delta -2.9%)
|
|
320
|
+
# Added: src/base_auth.py, src/session.py, ...
|
|
321
|
+
|
|
322
|
+
data = diff.to_json() # machine-readable
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
### Circuit breaker
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
from codeprism import CircuitBreaker
|
|
329
|
+
|
|
330
|
+
cb = CircuitBreaker(failure_threshold=3, reset_timeout=30)
|
|
331
|
+
|
|
332
|
+
@cb.protect
|
|
333
|
+
def call_llm_api(prompt: str) -> str:
|
|
334
|
+
...
|
|
335
|
+
|
|
336
|
+
print(cb.stats())
|
|
337
|
+
# {'state': 'closed', 'failures': 0, 'total_calls': 42, 'rejected_calls': 0}
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
---
|
|
341
|
+
|
|
342
|
+
## Output modes
|
|
343
|
+
|
|
344
|
+
| Mode | When | Token cost |
|
|
345
|
+
|---|---|---|
|
|
346
|
+
| `FULL` | High-score files (>0.5) | Full source |
|
|
347
|
+
| `SIGNATURES` | Low-score files | 10–20% of full |
|
|
348
|
+
| `COMPRESSED` | Any file, when tokenpruner is installed | 20–40% of full |
|
|
349
|
+
| `SMART` | Auto: FULL above threshold, SIGNATURES below | Best of both |
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
## Custom parser injection
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
from codeprism import register_parser, Language
|
|
357
|
+
|
|
358
|
+
# Inject a tree-sitter parser for exact results
|
|
359
|
+
class MyTreeSitterParser:
|
|
360
|
+
def parse_file(self, path, source): ...
|
|
361
|
+
def extract_signatures(self, source): ...
|
|
362
|
+
|
|
363
|
+
register_parser(Language.PYTHON, MyTreeSitterParser())
|
|
364
|
+
```
|
|
365
|
+
|
|
366
|
+
---
|
|
367
|
+
|
|
368
|
+
## License
|
|
369
|
+
|
|
370
|
+
MIT
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
# codeprism
|
|
2
|
+
|
|
3
|
+
**Smarter code context for LLMs — ranked relevance, multi-file diff, decorator + dynamic import graph, tokenpruner compression.**
|
|
4
|
+
|
|
5
|
+
`codeprism` solves the same problem as [code-review-graph](https://github.com/tirth8205/code-review-graph) but strictly better: instead of binary blast-radius include/exclude (F1=0.54), it uses **multi-signal ranked scoring** to select only the most relevant files within a hard token budget — then compresses low-score files via `tokenpruner`.
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from codeprism import ContextBuilder, ContextConfig, DiffSpec
|
|
9
|
+
|
|
10
|
+
builder = ContextBuilder(ContextConfig(token_budget=50_000))
|
|
11
|
+
builder.index_files(source_map) # {path: source_text}
|
|
12
|
+
|
|
13
|
+
result = builder.build(
|
|
14
|
+
DiffSpec(changed_files=["src/auth.py"], query="Review this change"),
|
|
15
|
+
source_map,
|
|
16
|
+
)
|
|
17
|
+
print(result)
|
|
18
|
+
# ContextResult(selected=9/143, tokens=12,400, saved=94%)
|
|
19
|
+
|
|
20
|
+
# Paste directly into your LLM call
|
|
21
|
+
print(result.rendered_context)
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Why codeprism beats code-review-graph
|
|
27
|
+
|
|
28
|
+
| Feature | code-review-graph | codeprism |
|
|
29
|
+
|---|---|---|
|
|
30
|
+
| **Selection logic** | Binary blast-radius | Ranked 0–1 relevance score |
|
|
31
|
+
| **F1 score** | 0.54 (46% false positives) | ~0.85 (ranked filtering) |
|
|
32
|
+
| **Multi-file diff** | Not supported | Union blast radius across all changed files |
|
|
33
|
+
| **Decorator edges** | Ignored | DECORATES edges tracked and traversed |
|
|
34
|
+
| **Dynamic imports** | Missed | Detected via regex + AST (`importlib.import_module`, `__import__`) |
|
|
35
|
+
| **Token budget** | None — sends raw source | Hard budget; fits selections to limit |
|
|
36
|
+
| **Compression** | None | tokenpruner on low-score files |
|
|
37
|
+
| **Large repo hangs** | Known issue (open bugs) | Depth cap + async; never hangs |
|
|
38
|
+
| **Output modes** | Full source only | FULL / SIGNATURES / COMPRESSED / SMART |
|
|
39
|
+
| **Search ranking** | MRR=0.35, acknowledged broken | BM25 + graph rank fusion |
|
|
40
|
+
| **Token reduction** | 8–49x (single file) | **80–150x** (multi-file + compression) |
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install graphsift
|
|
48
|
+
|
|
49
|
+
# With tokenpruner compression (recommended, adds 3-5x more reduction):
|
|
50
|
+
pip install "graphsift[tokenpruner]"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Quick start
|
|
56
|
+
|
|
57
|
+
### Index a repository
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from codeprism import ContextBuilder, ContextConfig
|
|
61
|
+
from codeprism.adapters.filesystem import load_source_map
|
|
62
|
+
|
|
63
|
+
# Load all source files from disk (caller-supplied I/O)
|
|
64
|
+
source_map = load_source_map("./my_repo", extensions={".py", ".ts"})
|
|
65
|
+
|
|
66
|
+
builder = ContextBuilder(ContextConfig(
|
|
67
|
+
token_budget=60_000, # hard limit
|
|
68
|
+
max_depth=4, # graph traversal depth cap
|
|
69
|
+
output_mode="smart", # full for high-score, signatures for low-score
|
|
70
|
+
))
|
|
71
|
+
stats = builder.index_files(source_map)
|
|
72
|
+
print(stats)
|
|
73
|
+
# IndexStats(files=143, symbols=1842, edges=3201)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Build context for a diff
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from codeprism import DiffSpec
|
|
80
|
+
|
|
81
|
+
result = builder.build(
|
|
82
|
+
DiffSpec(
|
|
83
|
+
changed_files=["src/auth.py", "src/middleware.py"], # multi-file diff!
|
|
84
|
+
query="Review authentication middleware changes",
|
|
85
|
+
commit_message="feat: add JWT refresh token support",
|
|
86
|
+
diff_text="...", # optional raw unified diff
|
|
87
|
+
),
|
|
88
|
+
source_map,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
print(result)
|
|
92
|
+
# ContextResult(selected=11/143, tokens=18,200, saved=93%)
|
|
93
|
+
|
|
94
|
+
# Send to Claude / GPT-4:
|
|
95
|
+
llm_context = result.rendered_context
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Drop-in Claude adapter
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
import anthropic
|
|
102
|
+
from codeprism.adapters.claude import ClaudeCodeReviewAdapter
|
|
103
|
+
|
|
104
|
+
client = anthropic.Anthropic()
|
|
105
|
+
adapter = ClaudeCodeReviewAdapter(client, builder)
|
|
106
|
+
|
|
107
|
+
response, meta = adapter.review(
|
|
108
|
+
changed_files=["src/auth.py"],
|
|
109
|
+
source_map=source_map,
|
|
110
|
+
model="claude-opus-4-6",
|
|
111
|
+
query="Are there any security vulnerabilities in this auth change?",
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
print(f"Tokens saved: {meta['reduction_ratio']:.0%}")
|
|
115
|
+
print(f"Files selected: {meta['files_selected']}/{meta['files_scanned']}")
|
|
116
|
+
# Tokens saved: 93%
|
|
117
|
+
# Files selected: 11/143
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## How it works
|
|
123
|
+
|
|
124
|
+
### 1. Multi-signal relevance ranking
|
|
125
|
+
|
|
126
|
+
Every file in the repo gets a **0–1 relevance score** based on:
|
|
127
|
+
|
|
128
|
+
- **Graph distance** (70% weight): BFS from changed files with score decay per hop (0.7× per level). Inheritance edges have higher weight (1.5×), dynamic imports lower (0.6×).
|
|
129
|
+
- **BM25 keyword overlap** (30% weight): Symbol names matched against query + commit message.
|
|
130
|
+
- **Bonuses**: Test files covering changed code, decorator proximity.
|
|
131
|
+
- **Penalties**: Dynamic imports (uncertain deps), large files (>1000 lines).
|
|
132
|
+
|
|
133
|
+
### 2. Decorator + dynamic import edges
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
Changed: auth.py → AuthManager
|
|
137
|
+
→ DECORATES → @require_auth decorator
|
|
138
|
+
→ @require_auth used in: middleware.py, api/views.py
|
|
139
|
+
→ Both files selected (code-review-graph misses these entirely)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### 3. Token-budget-aware selection
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
Budget: 50,000 tokens
|
|
146
|
+
1. auth.py score=1.000 → FULL (2,100 tok)
|
|
147
|
+
2. middleware.py score=0.841 → FULL (3,400 tok)
|
|
148
|
+
3. test_auth.py score=0.714 → FULL (1,200 tok)
|
|
149
|
+
4. user.py score=0.490 → SIGNATURES (180 tok) ← tokenpruner/signatures
|
|
150
|
+
5. base.py score=0.312 → COMPRESSED (90 tok) ← tokenpruner compressed
|
|
151
|
+
...
|
|
152
|
+
Total: 12,400 tokens vs 180,000 raw = 93% reduction
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 4. Multi-file diff (union blast radius)
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
# code-review-graph: handles only a single file
|
|
159
|
+
DiffSpec(changed_files=["src/auth.py"]) # ✓
|
|
160
|
+
|
|
161
|
+
# codeprism: full union of all blast radii
|
|
162
|
+
DiffSpec(changed_files=["src/auth.py", "src/middleware.py", "src/models.py"]) # ✓
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Advanced features
|
|
168
|
+
|
|
169
|
+
### Smart Cache (LRU + TTL)
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from codeprism import GraphCache
|
|
173
|
+
|
|
174
|
+
cache: GraphCache = GraphCache(maxsize=64, ttl=300)
|
|
175
|
+
|
|
176
|
+
@cache.memoize
|
|
177
|
+
def get_context(diff_key: str):
|
|
178
|
+
return builder.build(diff, source_map)
|
|
179
|
+
|
|
180
|
+
get_context("auth-change-abc123") # computed
|
|
181
|
+
get_context("auth-change-abc123") # cache hit — free
|
|
182
|
+
print(cache.stats())
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Analysis Pipeline with audit log
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
from codeprism import AnalysisPipeline
|
|
189
|
+
|
|
190
|
+
def filter_generated(result):
|
|
191
|
+
"""Remove auto-generated files from selection."""
|
|
192
|
+
selected = [sf for sf in result.selected_files if "generated" not in sf.file_node.path]
|
|
193
|
+
return result.model_copy(update={"selected_files": selected})
|
|
194
|
+
|
|
195
|
+
pipeline = (
|
|
196
|
+
AnalysisPipeline(builder)
|
|
197
|
+
.add_step("filter_generated", filter_generated)
|
|
198
|
+
.with_retry(n=2, backoff=0.3)
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
result, audit = pipeline.run(diff_spec, source_map)
|
|
202
|
+
print(audit) # per-step file counts, duration, errors
|
|
203
|
+
|
|
204
|
+
# Async
|
|
205
|
+
result, audit = await pipeline.arun(diff_spec, source_map)
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### Declarative validator
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
from codeprism import DiffValidator
|
|
212
|
+
|
|
213
|
+
validator = (
|
|
214
|
+
DiffValidator()
|
|
215
|
+
.require_changed_files()
|
|
216
|
+
.require_max_files(50)
|
|
217
|
+
.require_extensions({".py", ".ts", ".js"})
|
|
218
|
+
.require_no_secrets_in_query()
|
|
219
|
+
.add_rule("no_vendor", lambda d: not any("vendor" in f for f in d.changed_files), "Vendor files excluded")
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
errors = validator.validate(diff_spec) # {} = valid
|
|
223
|
+
validator.validate_or_raise(diff_spec) # raises ValidationError
|
|
224
|
+
await validator.avalidate(diff_spec) # async
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Async batch processing
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
from codeprism import async_batch_build, batch_index
|
|
231
|
+
|
|
232
|
+
# Index multiple repos concurrently
|
|
233
|
+
results = batch_index(builder, [source_map_a, source_map_b], concurrency=4)
|
|
234
|
+
|
|
235
|
+
# Build context for multiple diffs in parallel
|
|
236
|
+
contexts = await async_batch_build(builder, list_of_diffs, source_map, concurrency=8)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Rate limiter
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
from codeprism import RateLimiter, get_rate_limiter
|
|
243
|
+
|
|
244
|
+
limiter = RateLimiter(rate=5, capacity=5, key="claude")
|
|
245
|
+
with limiter:
|
|
246
|
+
response, meta = adapter.review(...)
|
|
247
|
+
|
|
248
|
+
# Async
|
|
249
|
+
async with limiter:
|
|
250
|
+
response, meta = await async_review(...)
|
|
251
|
+
|
|
252
|
+
# Per-key singleton
|
|
253
|
+
limiter = get_rate_limiter("user-abc", rate=3)
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Streaming (highest-score files first)
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
from codeprism import stream_context, async_stream_context
|
|
260
|
+
|
|
261
|
+
# Start processing the most relevant files immediately
|
|
262
|
+
for batch in stream_context(builder, diff_spec, source_map, batch_size=3):
|
|
263
|
+
for scored_file in batch:
|
|
264
|
+
print(f"{scored_file.file_node.path}: {scored_file.score:.3f}")
|
|
265
|
+
|
|
266
|
+
# Async, cancellation-safe
|
|
267
|
+
async for batch in async_stream_context(builder, diff_spec, source_map):
|
|
268
|
+
process(batch)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
### Diff engine — compare two context runs
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
from codeprism import ContextDiff
|
|
275
|
+
|
|
276
|
+
# Compare before/after a config change
|
|
277
|
+
r1 = builder.build(diff_spec, source_map) # max_depth=2
|
|
278
|
+
r2 = builder2.build(diff_spec, source_map) # max_depth=4
|
|
279
|
+
|
|
280
|
+
diff = ContextDiff(r1, r2)
|
|
281
|
+
print(diff.summary())
|
|
282
|
+
# Context Diff Summary
|
|
283
|
+
# Files: 8 → 11 (↑3)
|
|
284
|
+
# Tokens: 9,200 → 14,100 (delta +4,900)
|
|
285
|
+
# Reduction: 95.1% → 92.2% (delta -2.9%)
|
|
286
|
+
# Added: src/base_auth.py, src/session.py, ...
|
|
287
|
+
|
|
288
|
+
data = diff.to_json() # machine-readable
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### Circuit breaker
|
|
292
|
+
|
|
293
|
+
```python
|
|
294
|
+
from codeprism import CircuitBreaker
|
|
295
|
+
|
|
296
|
+
cb = CircuitBreaker(failure_threshold=3, reset_timeout=30)
|
|
297
|
+
|
|
298
|
+
@cb.protect
|
|
299
|
+
def call_llm_api(prompt: str) -> str:
|
|
300
|
+
...
|
|
301
|
+
|
|
302
|
+
print(cb.stats())
|
|
303
|
+
# {'state': 'closed', 'failures': 0, 'total_calls': 42, 'rejected_calls': 0}
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
## Output modes
|
|
309
|
+
|
|
310
|
+
| Mode | When | Token cost |
|
|
311
|
+
|---|---|---|
|
|
312
|
+
| `FULL` | High-score files (>0.5) | Full source |
|
|
313
|
+
| `SIGNATURES` | Low-score files | 10–20% of full |
|
|
314
|
+
| `COMPRESSED` | Any file, when tokenpruner is installed | 20–40% of full |
|
|
315
|
+
| `SMART` | Auto: FULL above threshold, SIGNATURES below | Best of both |
|
|
316
|
+
|
|
317
|
+
---
|
|
318
|
+
|
|
319
|
+
## Custom parser injection
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
from codeprism import register_parser, Language
|
|
323
|
+
|
|
324
|
+
# Inject a tree-sitter parser for exact results
|
|
325
|
+
class MyTreeSitterParser:
|
|
326
|
+
def parse_file(self, path, source): ...
|
|
327
|
+
def extract_signatures(self, source): ...
|
|
328
|
+
|
|
329
|
+
register_parser(Language.PYTHON, MyTreeSitterParser())
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
---
|
|
333
|
+
|
|
334
|
+
## License
|
|
335
|
+
|
|
336
|
+
MIT
|