codegraph-ai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codegraph/qa.py ADDED
@@ -0,0 +1,471 @@
1
+ """Unified Code Intelligence Q&A — retrieval layer.
2
+
3
+ This module classifies natural-language questions and routes them to the
4
+ appropriate CodeScope retrieval strategies across three knowledge graph layers
5
+ (Structure, Evolution, Intent). It returns structured context — NOT
6
+ LLM-generated answers. The LLM reasoning is the AI assistant's job (e.g.
7
+ Cursor, OpenClaw).
8
+
9
+ Usage (programmatic)::
10
+
11
+ from codegraph.core import CodeScope
12
+ from codegraph.qa import codegraph_query
13
+
14
+ cs = CodeScope("my_db")
15
+ result = codegraph_query(cs, "Who calls jv_free?")
16
+ print(result)
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import re
22
+ import time
23
+ from dataclasses import dataclass, field
24
+ from enum import Enum
25
+ from typing import TYPE_CHECKING
26
+
27
+ if TYPE_CHECKING:
28
+ from codegraph.core import CodeScope
29
+
30
+
31
class QuestionCategory(str, Enum):
    """Retrieval category that a natural-language question is routed to.

    The ``str`` mix-in makes each member compare equal to its plain string
    value, and ``.value`` is what ``RetrievalResult.to_dict`` serialises.
    """

    # Call-graph / dependency questions (callers, hotspots, coupling, ...).
    STRUCTURAL = "structural"
    # Similarity questions; also the default when no pattern matches.
    SEMANTIC = "semantic"
    # Questions about how and when code changed.
    HISTORICAL = "historical"
    # "Why" questions about the motivation behind a change.
    INTENTIONAL = "intentional"
    # Questions that mix structural and historical signals.
    COMBINED = "combined"
37
+
38
+
39
@dataclass
class ContextItem:
    """One evidence record produced by a retrieval strategy."""

    # Evidence kind tag set by the strategy, e.g. "caller", "hotspot", "commit".
    type: str
    # Human-readable one-line summary of the evidence.
    content: str
    # Where the evidence came from: a file path, module pair or "commit:<hash>".
    source: str
    # Strategy-specific ranking value. Different strategies store different
    # quantities here (relevance, risk score, co-change count, timestamp), so
    # values are not comparable across evidence types. 0.0 means "not set".
    score: float = 0.0
47
+
48
+
49
@dataclass
class RetrievalResult:
    """Everything retrieved for one question, ready for rendering or JSON."""

    # The question exactly as the caller supplied it.
    question: str
    # Category chosen by the classifier.
    category: QuestionCategory
    # Evidence in the order the retrieval strategy produced it.
    context_items: list[ContextItem] = field(default_factory=list)
    # Time spent classifying and retrieving, in milliseconds.
    retrieval_time_ms: float = 0.0
    # Entities pulled out of the question ("function", "file_path", "subsystem").
    entities: dict = field(default_factory=dict)

    def to_context_string(self) -> str:
        """Render the result as plain text for injection into an LLM prompt.

        The output is a two-line header and a blank line, then one numbered
        paragraph per evidence item, each followed by a blank line. The
        relevance line is only emitted for items whose score is positive.
        """
        summary = (
            f"Retrieved {len(self.context_items)} evidence items "
            f"in {self.retrieval_time_ms:.0f}ms:"
        )
        out = [f"Question type: {self.category.value}", summary, ""]
        for number, evidence in enumerate(self.context_items, start=1):
            paragraph = [
                f"[{number}] ({evidence.type}) {evidence.content}",
                f" Source: {evidence.source}",
            ]
            if evidence.score > 0:
                paragraph.append(f" Relevance: {evidence.score:.3f}")
            paragraph.append("")
            out.extend(paragraph)
        return "\n".join(out)

    def to_dict(self) -> dict:
        """Return a dict of plain values mirroring this result.

        The category is exported as its string value and every evidence item
        as a flat dict. ``entities`` is passed through by reference.
        """
        payload: dict = {
            "question": self.question,
            "category": self.category.value,
            "retrieval_time_ms": self.retrieval_time_ms,
            "entities": self.entities,
        }
        payload["context_items"] = [
            dict(
                type=evidence.type,
                content=evidence.content,
                source=evidence.source,
                score=evidence.score,
            )
            for evidence in self.context_items
        ]
        return payload
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Question classification
95
+ # ---------------------------------------------------------------------------
96
+
97
# Each table maps a regex (searched against the lower-cased question) to a
# confidence weight in (0, 1].
#
# The CJK entries deliberately carry no ``\b`` anchors. In Python's Unicode
# mode CJK ideographs count as ``\w``, and Chinese text is written without
# spaces, so ``\b调用\b`` only matches when the word is surrounded by
# punctuation or whitespace and never inside a sentence such as
# "谁调用了jv_free". Plain substring patterns are what is intended there.

# Call-graph / dependency vocabulary.
_STRUCTURAL_PATTERNS = [
    (r"\bwho\s+calls?\b", 0.9),
    (r"\bcallers?\s+of\b", 0.9),
    (r"\bdepend(?:s|encies)?\s+on\b", 0.8),
    (r"\bdead\s+code\b", 0.9),
    (r"\bhotspot", 0.9),
    (r"\bcoupling\b", 0.8),
    (r"\bcircular\b", 0.8),
    (r"\bfan.?in\b", 0.8),
    (r"\bfan.?out\b", 0.8),
    (r"\bimpact\b", 0.7),
    (r"调用", 0.8),
    (r"依赖", 0.8),
    (r"热点", 0.9),
    (r"死代码", 0.9),
    (r"耦合", 0.8),
]

# Change-history vocabulary.
_HISTORICAL_PATTERNS = [
    (r"\bhow\s+has\b.*\bchanged\b", 0.9),
    (r"\bwhen\s+was\b.*\b(introduced|added|created|modified)\b", 0.9),
    (r"\bwhat\s+changed\b", 0.9),
    (r"\bwho\s+(modified|changed|wrote|last)\b", 0.9),
    (r"\bhistory\s+of\b", 0.9),
    (r"\bcommit", 0.7),
    (r"\bmodified\b.*\btimes\b", 0.8),
    (r"\brecently\b.*\b(changed|modified)\b", 0.8),
    (r"\bchurn\b", 0.8),
    (r"变更", 0.8),
    (r"历史", 0.8),
    (r"谁.*修改", 0.9),
    (r"什么时候", 0.8),
]

# "Why" / motivation vocabulary.
_INTENTIONAL_PATTERNS = [
    (r"\bwhy\s+(was|were|did|is)\b", 0.95),
    (r"\bmotivat", 0.9),
    (r"\breason\s+for\b", 0.9),
    (r"\bpurpose\s+of\b", 0.8),
    (r"\bwhat\s+was\s+the\s+(reason|motivation|purpose)\b", 0.95),
    (r"\bfix\b.*\b(bug|leak|crash|race|error)\b", 0.7),
    (r"为什么", 0.95),
    (r"原因", 0.8),
    (r"目的", 0.8),
    (r"动机", 0.9),
]

# Questions that mix structural and historical vocabulary.
_COMBINED_PATTERNS = [
    (r"\bvolatile\b.*\bhotspot\b", 0.9),
    (r"\bhigh\b.*\b(coupling|churn)\b.*\b(coupling|churn)\b", 0.9),
    (r"\brecently\b.*\bdead\s+code\b", 0.9),
    (r"\bmost\b.*\b(modified|changed)\b.*\bhotspot\b", 0.9),
    (r"\brisk\b.*\b(churn|change)\b", 0.8),
]

# Similarity / "code about X" vocabulary.
_SEMANTIC_PATTERNS = [
    (r"\bhow\s+does\b.*\bhandle\b", 0.8),
    (r"\bfind\b.*\b(functions?|code)\b.*\brelated\b", 0.8),
    (r"\bsimilar\s+to\b", 0.8),
    (r"\bcode\b.*\babout\b", 0.7),
    (r"相关.*(代码|函数)", 0.8),
    (r"类似", 0.8),
]
160
+
161
+
162
def classify_question(question: str) -> tuple[QuestionCategory, float]:
    """Classify a question into a retrieval category using pattern matching.

    Every pattern table is searched against the lower-cased, stripped
    question. A category's score is the highest weight among *all* of its
    matching patterns, so a weak pattern listed early in a table can no
    longer hide a stronger one listed later.

    Args:
        question: The natural-language question.

    Returns:
        ``(category, score)``. When nothing matches the result is
        ``(QuestionCategory.SEMANTIC, 0.0)``.
    """
    q = question.lower().strip()

    best_cat = QuestionCategory.SEMANTIC
    best_score = 0.0

    # Table order is the tie-break: a later category only wins with a
    # strictly higher score than every category before it.
    tables = (
        (_COMBINED_PATTERNS, QuestionCategory.COMBINED),
        (_INTENTIONAL_PATTERNS, QuestionCategory.INTENTIONAL),
        (_HISTORICAL_PATTERNS, QuestionCategory.HISTORICAL),
        (_STRUCTURAL_PATTERNS, QuestionCategory.STRUCTURAL),
        (_SEMANTIC_PATTERNS, QuestionCategory.SEMANTIC),
    )
    for patterns, cat in tables:
        cat_score = max(
            (
                weight
                for pattern, weight in patterns
                if re.search(pattern, q, re.IGNORECASE)
            ),
            default=0.0,
        )
        if cat_score > best_score:
            best_cat, best_score = cat, cat_score

    return best_cat, best_score
184
+
185
+
186
def extract_entities(question: str) -> dict:
    """Extract function names, file paths, and subsystem references.

    Args:
        question: The natural-language question, in its original casing.

    Returns:
        A dict holding any of the following keys (absent when not found):

        * ``"function"`` - the identifier following a trigger phrase
          ("who calls", "callers of", "impact of", "history of",
          "depends on", "co-changed with"), else a quoted ``name()``, else a
          back-ticked identifier.
        * ``"file_path"`` - the first ``name.ext`` token, extension 1-6
          word characters.
        * ``"subsystem"`` - the first "in X" operand that contains a ``/``
          or is one of a few well-known top-level directory names.
    """
    entities: dict = {}

    # The word after "impact of" is optional: it lets "impact of changing foo"
    # skip the verb while "impact of foo" still yields "foo". The optional
    # group is greedy, so inputs that matched before match the same way.
    func_match = re.search(
        r"(?:calls?|callers?\s+of|impact\s+of(?:\s+\w+)?|history\s+of|"
        r"who\s+calls?|depends?\s+on|co.?changed?\s+(?:of|with))"
        r"\s+[`'\"]?(\w+)[`'\"]?",
        question,
        re.IGNORECASE,
    )
    if func_match:
        entities["function"] = func_match.group(1)
    else:
        # Fallbacks, most specific first: a quoted call like `foo()`, then a
        # bare back-ticked identifier.
        for fallback in (r"[`'\"](\w+)\(\)[`'\"]", r"`(\w+)`"):
            quoted = re.search(fallback, question)
            if quoted:
                entities["function"] = quoted.group(1)
                break

    # The lookahead stops the match from ending in the middle of a longer
    # extension ("main.swift" used to come back as "main.swif").
    file_match = re.search(r"([\w/]+\.\w{1,6})(?!\w)", question)
    if file_match:
        entities["file_path"] = file_match.group(1)

    # Inspect every "in X" rather than only the first, and skip a leading
    # article, so "in the src directory" and "... in this file ... in
    # drivers/net" both resolve.
    known_roots = (
        "src", "vendor", "tests", "kernel", "drivers", "fs", "mm", "net",
    )
    for subsys_match in re.finditer(
        r"\bin\s+(?:the\s+)?[`'\"]?([\w/]+)[`'\"]?",
        question,
        re.IGNORECASE,
    ):
        candidate = subsys_match.group(1)
        if "/" in candidate or candidate in known_roots:
            entities["subsystem"] = candidate
            break

    return entities
228
+
229
+
230
+ # ---------------------------------------------------------------------------
231
+ # Retrieval strategies
232
+ # ---------------------------------------------------------------------------
233
+
234
+
235
def _retrieve_structural(
    cs: "CodeScope", question: str, entities: dict
) -> list[ContextItem]:
    """Collect call-graph and dependency evidence for a structural question.

    Each keyword group found in the lower-cased question triggers one query
    on ``cs``; groups are independent, so one question may yield several
    evidence types. If nothing was collected and a function name is known,
    an impact query is used as a fallback.

    Args:
        cs: Code intelligence backend. This function calls its ``impact``,
            ``hotspots``, ``dead_code``, ``module_coupling`` and
            ``circular_deps`` methods.
        question: The original question (also forwarded to ``cs.impact``).
        entities: Output of ``extract_entities``; reads ``"function"`` and
            ``"subsystem"``.

    Returns:
        Evidence items in the order the branches ran.
    """
    items: list[ContextItem] = []
    q = question.lower()
    func_name = entities.get("function")

    def mentions(*keywords: str) -> bool:
        """Return True when any keyword occurs as a substring of the question."""
        return any(kw in q for kw in keywords)

    # Remembered so the fallback below does not repeat an identical query
    # when this branch already ran and simply came back empty.
    impact_queried = False
    if func_name and mentions("who calls", "callers of", "调用", "impact"):
        impact_queried = True
        results = cs.impact(func_name, question, max_hops=2)
        for r in results[:15]:
            items.append(ContextItem(
                type="caller",
                content=f"{r.name} ({r.file_path}) — "
                        f"hop={r.hop_distance}, relevance={r.relevance:.3f}",
                source=f"{r.file_path}",
                score=r.relevance,
            ))

    # "fan in" / "fanin" spellings are listed because the classifier's
    # ``fan.?in`` pattern accepts them as structural questions too.
    if mentions(
        "hotspot", "热点", "risk", "ranking",
        "fan_in", "fan_out", "fan-in", "fan-out",
        "fan in", "fan out", "fanin", "fanout",
    ):
        for r in cs.hotspots(topk=15):
            items.append(ContextItem(
                type="hotspot",
                content=f"{r.name} ({r.file_path}) — "
                        f"fan_in={r.fan_in}, fan_out={r.fan_out}, "
                        f"risk={r.risk_score:.0f}",
                source=r.file_path,
                score=r.risk_score,
            ))

    if mentions("dead code", "死代码", "unreachable", "unused"):
        results = cs.dead_code()
        subsys = entities.get("subsystem")
        if subsys:
            # Restrict to the requested subtree by path prefix.
            results = [r for r in results if r.file_path.startswith(subsys)]
        for r in results[:20]:
            items.append(ContextItem(
                type="dead_code",
                content=f"{r.name} ({r.file_path}) — {r.reason}",
                source=r.file_path,
            ))

    # "depend" (not "depends") so that "depend on" and "dependencies on",
    # which the classifier routes here, also produce coupling evidence.
    if mentions("coupling", "耦合", "depend", "依赖"):
        for r in cs.module_coupling(topk=10):
            cross_calls = r.calls_a_to_b + r.calls_b_to_a
            items.append(ContextItem(
                type="coupling",
                content=f"{r.module_a} <-> {r.module_b}: "
                        f"{r.calls_a_to_b}+{r.calls_b_to_a} cross-calls",
                source=f"{r.module_a}, {r.module_b}",
                score=float(cross_calls),
            ))

    if mentions("circular", "cycle"):
        for cyc in cs.circular_deps()[:10]:
            items.append(ContextItem(
                type="circular_dep",
                content=" -> ".join(cyc),
                source="file-level imports",
            ))

    # Fallback: no keyword produced evidence but a function is named.
    if not items and func_name and not impact_queried:
        results = cs.impact(func_name, question, max_hops=2)
        for r in results[:10]:
            items.append(ContextItem(
                type="caller",
                content=f"{r.name} ({r.file_path}) — relevance={r.relevance:.3f}",
                source=r.file_path,
                score=r.relevance,
            ))

    return items
312
+
313
+
314
def _retrieve_semantic(
    cs: "CodeScope", question: str, entities: dict
) -> list[ContextItem]:
    """Collect similarity evidence for a semantic question.

    When a function name is known, ``cs.similar`` is asked for similar
    functions, scoped to the extracted subsystem ("" when none). If that
    produces nothing, or no function is known, ``cs.intent_search`` is run on
    the raw question: each hit yields a commit item, followed by an item
    listing up to five modified functions when there are any.
    """
    target = entities.get("function")
    scope = entities.get("subsystem", "")

    evidence: list[ContextItem] = []
    if target:
        for match in cs.similar(target, scope, topk=10):
            evidence.append(ContextItem(
                type="similar_function",
                content=f"{match.name} — {match.signature} ({match.file_path})",
                source=match.file_path,
                score=match.score,
            ))
    if evidence:
        return evidence

    for hit in cs.intent_search(question, topk=10):
        touched = ", ".join(hit.functions_modified[:5])
        origin = f"commit:{hit.commit_hash[:8]}"
        evidence.append(ContextItem(
            type="semantic_match",
            content=f"Commit {hit.commit_hash[:8]}: {hit.message[:100]}",
            source=origin,
            score=hit.similarity_score,
        ))
        if not touched:
            continue
        evidence.append(ContextItem(
            type="modified_functions",
            content=f"Functions: {touched}",
            source=origin,
            score=hit.similarity_score,
        ))

    return evidence
350
+
351
+
352
def _retrieve_historical(
    cs: "CodeScope", question: str, entities: dict
) -> list[ContextItem]:
    """Collect change-history evidence for a historical question.

    With a known function name this returns its change attribution (up to
    15 commits) followed by the functions that co-changed with it (up to
    10). Without one it falls back to ``cs.intent_search`` on the raw
    question.

    Args:
        cs: Code intelligence backend. This function calls its
            ``change_attribution``, ``co_change`` and ``intent_search``
            methods.
        question: The original question.
        entities: Output of ``extract_entities``; reads ``"function"`` and
            ``"file_path"``.

    Returns:
        Evidence items. ``score`` is not a relevance here: attribution items
        carry the commit timestamp and co-change items the co-change count.
    """
    items: list[ContextItem] = []
    func_name = entities.get("function")
    file_path = entities.get("file_path")

    if func_name:
        for a in cs.change_attribution(func_name, file_path, limit=15):
            items.append(ContextItem(
                type="commit",
                content=f"[{a.change_type}] {a.author}: {a.message[:120]}",
                source=f"commit:{a.commit_hash[:8]}",
                # NOTE(review): the raw timestamp is stored as the score, so
                # to_context_string() prints it as "Relevance" - confirm this
                # is intended.
                score=float(a.timestamp),
            ))

        for c in cs.co_change(func_name, file_path, min_commits=1, topk=10):
            shared = ", ".join(h[:8] for h in c.shared_commits[:3])
            items.append(ContextItem(
                type="co_change",
                content=f"{c.function_name} ({c.file_path}) — "
                        f"co-changed {c.co_change_count}x in "
                        f"{shared}",
                source=c.file_path,
                score=float(c.co_change_count),
            ))
    else:
        # The original also built an unused string of modified function
        # names here; it was never emitted, so it has been removed.
        for r in cs.intent_search(question, topk=10):
            items.append(ContextItem(
                type="commit",
                content=f"{r.commit_hash[:8]}: {r.message[:120]}",
                source=f"commit:{r.commit_hash[:8]}",
                score=r.similarity_score,
            ))

    return items
391
+
392
+
393
def _retrieve_intentional(
    cs: "CodeScope", question: str, entities: dict
) -> list[ContextItem]:
    """Collect motivation evidence for a "why" question.

    ``cs.intent_search`` is always run on the raw question: each hit yields
    a commit item, followed by an item listing up to five modified functions
    when the hit has any. When a function name was extracted, the first five
    entries of its change attribution are appended afterwards (unscored).
    """
    evidence: list[ContextItem] = []

    for hit in cs.intent_search(question, topk=10):
        origin = f"commit:{hit.commit_hash[:8]}"
        evidence.append(ContextItem(
            type="intent_match",
            content=f"Commit {hit.commit_hash[:8]}: {hit.message[:150]}",
            source=origin,
            score=hit.similarity_score,
        ))
        if not hit.functions_modified:
            continue
        touched = ", ".join(hit.functions_modified[:5])
        evidence.append(ContextItem(
            type="modified_functions",
            content=f"Modified: {touched}",
            source=origin,
            score=hit.similarity_score,
        ))

    target = entities.get("function")
    if not target:
        return evidence

    history = cs.change_attribution(target, entities.get("file_path"))
    evidence.extend(
        ContextItem(
            type="attribution",
            content=f"[{entry.change_type}] {entry.author}: {entry.message[:120]}",
            source=f"commit:{entry.commit_hash[:8]}",
        )
        for entry in history[:5]
    )
    return evidence
426
+
427
+
428
def _retrieve_combined(
    cs: "CodeScope", question: str, entities: dict
) -> list[ContextItem]:
    """Collect structural evidence followed by historical evidence.

    The semantic strategy is consulted only when both of those come back
    empty.
    """
    gathered: list[ContextItem] = [
        *_retrieve_structural(cs, question, entities),
        *_retrieve_historical(cs, question, entities),
    ]
    if gathered:
        return gathered
    return list(_retrieve_semantic(cs, question, entities))
437
+
438
+
439
# Dispatch table: one retrieval strategy per question category. Every
# strategy takes (cs, question, entities) and returns a list of ContextItem.
_STRATEGY_MAP = {
    QuestionCategory.STRUCTURAL: _retrieve_structural,
    QuestionCategory.SEMANTIC: _retrieve_semantic,
    QuestionCategory.HISTORICAL: _retrieve_historical,
    QuestionCategory.INTENTIONAL: _retrieve_intentional,
    QuestionCategory.COMBINED: _retrieve_combined,
}
446
+
447
+
448
+ # ---------------------------------------------------------------------------
449
+ # Public API
450
+ # ---------------------------------------------------------------------------
451
+
452
+
453
def codegraph_query(cs: "CodeScope", question: str) -> RetrievalResult:
    """Unified entry point: classify, retrieve, return structured context.

    Args:
        cs: Code intelligence backend handed to the selected strategy.
        question: The natural-language question.

    Returns:
        A ``RetrievalResult`` holding the chosen category, the extracted
        entities, the evidence items, and the time spent on classification
        plus retrieval, in milliseconds.
    """
    # perf_counter() is monotonic, so the measured duration cannot go
    # negative or jump when the system clock is adjusted mid-query.
    started = time.perf_counter()

    # The classifier's confidence is not used for routing.
    category, _confidence = classify_question(question)
    entities = extract_entities(question)

    strategy = _STRATEGY_MAP[category]
    context_items = strategy(cs, question, entities)

    elapsed_ms = (time.perf_counter() - started) * 1000

    return RetrievalResult(
        question=question,
        category=category,
        context_items=context_items,
        retrieval_time_ms=elapsed_ms,
        entities=entities,
    )
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: codegraph-ai
3
+ Version: 0.1.0
4
+ Summary: Hybrid graph + vector code intelligence powered by NeuG and zvec
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: neug
7
+ Requires-Dist: zvec
8
+ Requires-Dist: tree-sitter-language-pack
9
+ Requires-Dist: sentence-transformers
10
+ Requires-Dist: numpy
11
+ Provides-Extra: server
12
+ Requires-Dist: fastmcp; extra == "server"
13
+ Provides-Extra: dev
14
+ Requires-Dist: pytest>=7.0; extra == "dev"
@@ -0,0 +1,18 @@
1
+ codegraph/__init__.py,sha256=qIBQEFex1NisDhc5DBNwFkj29HNoGQNCaGOEH_Ozwxo,76
2
+ codegraph/__main__.py,sha256=f1U6TofPM03KHXoVidcWv24Y8wwXb_L-CYNEW9iu2D0,95
3
+ codegraph/analyzer.py,sha256=x093iNVrTCtba-MvlZEFkzHmotm_2uh_8N92T0SCcw8,15400
4
+ codegraph/cli.py,sha256=keim_D2fHntw29jxKn_KFnAHHjRXN2qHEIAG1KGHSpg,16772
5
+ codegraph/core.py,sha256=I_XvWPtl7u5Du8ke6byMdvJsLh23PMHiQPdcfjsgRJ4,134160
6
+ codegraph/mcp_server.py,sha256=OCqJx2tsynrvixMwL1p26lbwHSX7ODQJ4zpGQizAfqM,19511
7
+ codegraph/models.py,sha256=g9id4c1LuoQDoA5fzTLf091vQLTANIAvoPCovf1kFJU,6814
8
+ codegraph/qa.py,sha256=mktPfvmO984Ao9vWWwCCFv9o5QCc6xQcj2awf6HHCT4,15371
9
+ codegraph/adapters/__init__.py,sha256=DBKvBU_xkVHlqxzvg6Ea-5h3BH5NsYCDe2tHMEYRrkI,75
10
+ codegraph/adapters/base.py,sha256=O2EP59EC-VwWTKUKngdBUih317mfvIiF57yNhupoaEY,1160
11
+ codegraph/adapters/c_adapter.py,sha256=-_LJ0_wLHzVPsPUFtJXA1uTlSMoyo3PBTdgbGTxQepc,17413
12
+ codegraph/adapters/js_adapter.py,sha256=iIuBlybfhl2PUBmxWUFHnf6eVwVHuGHALPS18NmJIXk,19489
13
+ codegraph/adapters/python_adapter.py,sha256=b5mB9Ixy7jDHOrIGKLtabUYjPhhQI9JAakPq8y4i8nc,12497
14
+ codegraph_ai-0.1.0.dist-info/METADATA,sha256=WDvTYj9n7J4KfnMPq0pjHXC9JC1QtH16HKG9VsrbLG0,421
15
+ codegraph_ai-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
16
+ codegraph_ai-0.1.0.dist-info/entry_points.txt,sha256=3f2dJK7oR3dBzP21qRk_KuQa6Li8MVbVXeKcx3UjQ6c,49
17
+ codegraph_ai-0.1.0.dist-info/top_level.txt,sha256=RqBj9sPbifZTb9aeHtnbxTgKJvfHIQdmYE1Brv8Wdkg,10
18
+ codegraph_ai-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ codegraph = codegraph.cli:main
@@ -0,0 +1 @@
1
+ codegraph