codecompass-mcp 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codecompass_mcp-2.0.0.dist-info/METADATA +368 -0
- codecompass_mcp-2.0.0.dist-info/RECORD +28 -0
- codecompass_mcp-2.0.0.dist-info/WHEEL +5 -0
- codecompass_mcp-2.0.0.dist-info/entry_points.txt +3 -0
- codecompass_mcp-2.0.0.dist-info/licenses/LICENSE +21 -0
- codecompass_mcp-2.0.0.dist-info/top_level.txt +6 -0
- config.py +16 -0
- graph/__init__.py +0 -0
- graph/cli.py +13 -0
- graph/code_graph_client.py +485 -0
- graph/code_query_cli.py +504 -0
- graph/mcp_server.py +280 -0
- graph/setup.py +255 -0
- ingestion/__init__.py +0 -0
- ingestion/chunker.py +70 -0
- ingestion/code_normalizer.py +158 -0
- ingestion/code_parser.py +709 -0
- ingestion/entity_resolver.py +179 -0
- ingestion/file_watcher.py +165 -0
- ingestion/graph_writer.py +17 -0
- ingestion/hierarchy_builder.py +148 -0
- ingestion/reader_agent.py +135 -0
- main.py +306 -0
- models/__init__.py +0 -0
- models/code_types.py +35 -0
- models/types.py +45 -0
- utils/__init__.py +0 -0
- utils/formatting.py +24 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""Haiku-powered normalization pass for raw code triples.
|
|
2
|
+
|
|
3
|
+
Tree-sitter extraction is syntactic — it knows `db.connect()` is a call
|
|
4
|
+
but not that `db` refers to a `DatabaseClient`. This module sends batches
|
|
5
|
+
of raw triples to Claude Haiku to:
|
|
6
|
+
|
|
7
|
+
1. Resolve ambiguous/aliased entity names to their canonical form.
|
|
8
|
+
2. Reclassify relation types where the syntactic guess was wrong.
|
|
9
|
+
|
|
10
|
+
Only entity names, entity types, and relation types are sent — raw source code never
|
|
11
|
+
leaves the machine.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import anthropic
|
|
20
|
+
|
|
21
|
+
from config import anthropic_api_key
|
|
22
|
+
from models.code_types import CodeTriple
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# Constants
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
BATCH_SIZE = 75  # triples per Haiku call — keeps prompts under ~2k tokens
# Number of *additional* attempts after the first one: each batch is tried
# at most MAX_RETRIES + 1 times before the original triples are kept.
MAX_RETRIES = 2
|
|
30
|
+
|
|
31
|
+
_SYSTEM_PROMPT = """\
|
|
32
|
+
You are a code knowledge graph normalizer.
|
|
33
|
+
|
|
34
|
+
You receive a JSON array of code triples extracted by a syntax parser.
|
|
35
|
+
Each triple has: from_entity, from_type, relation_type, to_entity, to_type.
|
|
36
|
+
|
|
37
|
+
Your job:
|
|
38
|
+
1. Resolve aliased or abbreviated entity names to their full canonical form
|
|
39
|
+
when the alias is obvious from context (e.g. "db" → "DatabaseClient" if
|
|
40
|
+
another triple clarifies this). Do NOT guess — leave the name unchanged
|
|
41
|
+
if you are not certain.
|
|
42
|
+
2. Correct the relation_type if the parser clearly mis-classified it.
|
|
43
|
+
Allowed types: CALLS, IMPORTS, INHERITS, DEFINED_IN, STYLES, HAS_CLASS,
|
|
44
|
+
POSTS_TO, INCLUDES, USED_BY, OVERRIDES, RAISES, RETURNS_TYPE.
|
|
45
|
+
3. Correct entity types if obviously wrong.
|
|
46
|
+
Allowed types: function, class, module, css_selector, html_element,
|
|
47
|
+
scss_mixin, scss_variable, endpoint, css_class, file.
|
|
48
|
+
|
|
49
|
+
Return the same JSON array with corrections applied.
|
|
50
|
+
Do NOT add, remove, or reorder triples.
|
|
51
|
+
Do NOT include any text outside the JSON array.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize_triples(triples: list[CodeTriple], progress: bool = False) -> list[CodeTriple]:
    """Normalize every triple by sending it through Haiku in batches.

    The input is cut into chunks of at most BATCH_SIZE triples, each chunk is
    handed to ``_normalize_batch``, and the per-chunk results are concatenated
    in input order. ``_normalize_batch`` returns the untouched chunk when its
    attempts are exhausted, so a failed chunk passes through unchanged.

    Args:
        triples: Raw triples to normalize.
        progress: When true, show a ``tqdm`` progress bar over the batches if
            ``tqdm`` can be imported; otherwise iterate silently.

    Returns:
        A new list with the result of every batch. An empty input returns an
        empty list before any API client is created.
    """
    if not triples:
        return []

    haiku = anthropic.Anthropic(api_key=anthropic_api_key())
    chunks = _split_into_batches(triples, BATCH_SIZE)

    # tqdm is optional: without it (or without progress=True) iterate the
    # plain list of chunks.
    iterable = chunks
    if progress:
        try:
            from tqdm import tqdm
            iterable = tqdm(chunks, desc="Normalizing batches", unit="batch")
        except ImportError:
            iterable = chunks

    return [
        triple
        for chunk in iterable
        for triple in _normalize_batch(haiku, chunk)
    ]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# Internal helpers
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
def _normalize_batch(client: anthropic.Anthropic, batch: list[CodeTriple]) -> list[CodeTriple]:
    """Send one batch to Haiku and return corrected triples.

    The batch is serialized to JSON and sent as the user message. A reply is
    only accepted when it is a JSON array with exactly one entry per input
    triple: corrections are merged back by position, so an array of a
    different length could attach a correction to the wrong triple. Any other
    reply counts as a failed attempt. After ``MAX_RETRIES + 1`` failed
    attempts the original batch is returned so the pipeline keeps going.

    Args:
        client: Anthropic client used for the request.
        batch: Triples to normalize in a single request.

    Returns:
        Corrected triples in the same order as ``batch``, or ``batch`` itself
        if no attempt produced a usable reply.

    Note:
        Only JSON decoding errors, ``KeyError`` and ``IndexError`` are treated
        as a failed attempt; other exceptions (for example errors raised by
        the API client) propagate to the caller.
    """
    payload = json.dumps(_triples_to_dicts(batch), ensure_ascii=False)

    for _attempt in range(MAX_RETRIES + 1):
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                system=_SYSTEM_PROMPT,
                messages=[{"role": "user", "content": payload}],
            )
            # The first content block may not be a text block; treat that as
            # an unusable reply instead of raising AttributeError.
            reply = getattr(response.content[0], "text", None)
            if not isinstance(reply, str):
                continue
            corrected_dicts = json.loads(_strip_code_fence(reply))
            # Reject valid JSON of the wrong shape (null, object, number) and
            # arrays whose length differs from the batch.
            if not isinstance(corrected_dicts, list) or len(corrected_dicts) != len(batch):
                continue
            return _dicts_to_triples(corrected_dicts, batch)
        except (json.JSONDecodeError, KeyError, IndexError):
            continue

    # Return originals rather than crashing the pipeline.
    return batch


def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if present."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line (it may carry a language tag such as
    # ```json) and then the closing fence.
    _, _, body = stripped.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _split_into_batches(triples: list[CodeTriple], size: int) -> list[list[CodeTriple]]:
|
|
117
|
+
return [triples[i:i + size] for i in range(0, len(triples), size)]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _triples_to_dicts(triples: list[CodeTriple]) -> list[dict[str, Any]]:
|
|
121
|
+
return [
|
|
122
|
+
{
|
|
123
|
+
"from_entity": t.from_entity,
|
|
124
|
+
"from_type": t.from_type,
|
|
125
|
+
"relation_type": t.relation_type,
|
|
126
|
+
"to_entity": t.to_entity,
|
|
127
|
+
"to_type": t.to_type,
|
|
128
|
+
}
|
|
129
|
+
for t in triples
|
|
130
|
+
]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _dicts_to_triples(corrected: list[dict], originals: list[CodeTriple]) -> list[CodeTriple]:
|
|
134
|
+
"""Merge corrected dict fields back into the original CodeTriple objects.
|
|
135
|
+
|
|
136
|
+
Preserves source_file and line_number (which Haiku doesn't see) from
|
|
137
|
+
the originals. Falls back to the original triple if a corrected entry
|
|
138
|
+
is malformed.
|
|
139
|
+
"""
|
|
140
|
+
result: list[CodeTriple] = []
|
|
141
|
+
for i, original in enumerate(originals):
|
|
142
|
+
if i >= len(corrected):
|
|
143
|
+
result.append(original)
|
|
144
|
+
continue
|
|
145
|
+
patch = corrected[i]
|
|
146
|
+
try:
|
|
147
|
+
result.append(CodeTriple(
|
|
148
|
+
from_entity=patch.get("from_entity", original.from_entity),
|
|
149
|
+
from_type=patch.get("from_type", original.from_type),
|
|
150
|
+
relation_type=patch.get("relation_type", original.relation_type),
|
|
151
|
+
to_entity=patch.get("to_entity", original.to_entity),
|
|
152
|
+
to_type=patch.get("to_type", original.to_type),
|
|
153
|
+
source_file=original.source_file,
|
|
154
|
+
line_number=original.line_number,
|
|
155
|
+
))
|
|
156
|
+
except (TypeError, AttributeError):
|
|
157
|
+
result.append(original)
|
|
158
|
+
return result
|