codecompass-mcp 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,158 @@
1
+ """Haiku-powered normalization pass for raw code triples.
2
+
3
+ Tree-sitter extraction is syntactic — it knows `db.connect()` is a call
4
+ but not that `db` refers to a `DatabaseClient`. This module sends batches
5
+ of raw triples to Claude Haiku to:
6
+
7
+ 1. Resolve ambiguous/aliased entity names to their canonical form.
8
+ 2. Reclassify relation types where the syntactic guess was wrong.
9
+
10
+ Only entity names, entity types, and relation types are sent — raw source
11
+ code never leaves the machine.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ from typing import Any
18
+
19
+ import anthropic
20
+
21
+ from config import anthropic_api_key
22
+ from models.code_types import CodeTriple
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Constants
26
+ # ---------------------------------------------------------------------------
27
+
28
# Maximum number of triples placed in a single request; normalize_triples
# slices its input into chunks of this size.
BATCH_SIZE = 75 # triples per Haiku call — keeps prompts under ~2k tokens
# Number of *extra* attempts made for a batch whose reply cannot be used,
# i.e. each batch is tried at most MAX_RETRIES + 1 times.
MAX_RETRIES = 2

# System prompt sent with every batch. It describes the five fields of each
# triple, lists the relation and entity types the model may use, and asks for
# the same JSON array back with corrections applied and nothing else.
_SYSTEM_PROMPT = """\
You are a code knowledge graph normalizer.

You receive a JSON array of code triples extracted by a syntax parser.
Each triple has: from_entity, from_type, relation_type, to_entity, to_type.

Your job:
1. Resolve aliased or abbreviated entity names to their full canonical form
when the alias is obvious from context (e.g. "db" → "DatabaseClient" if
another triple clarifies this). Do NOT guess — leave the name unchanged
if you are not certain.
2. Correct the relation_type if the parser clearly mis-classified it.
Allowed types: CALLS, IMPORTS, INHERITS, DEFINED_IN, STYLES, HAS_CLASS,
POSTS_TO, INCLUDES, USED_BY, OVERRIDES, RAISES, RETURNS_TYPE.
3. Correct entity types if obviously wrong.
Allowed types: function, class, module, css_selector, html_element,
scss_mixin, scss_variable, endpoint, css_class, file.

Return the same JSON array with corrections applied.
Do NOT add, remove, or reorder triples.
Do NOT include any text outside the JSON array.
"""
53
+
54
+
55
def normalize_triples(triples: list[CodeTriple], progress: bool = False) -> list[CodeTriple]:
    """Normalize every triple by sending them to Haiku in fixed-size batches.

    Args:
        triples: Raw triples to normalize. An empty list short-circuits and
            no client is created.
        progress: When true, show a tqdm progress bar over the batches if
            tqdm can be imported; otherwise iterate silently.

    Returns:
        A new list holding the result of each batch, concatenated in batch
        order.
    """
    if not triples:
        return []

    client = anthropic.Anthropic(api_key=anthropic_api_key())
    chunks = _split_into_batches(triples, BATCH_SIZE)

    # Default to plain iteration; upgrade to a progress bar only on request
    # and only when the optional tqdm dependency is installed.
    iterable = chunks
    if progress:
        try:
            from tqdm import tqdm
            iterable = tqdm(chunks, desc="Normalizing batches", unit="batch")
        except ImportError:
            iterable = chunks

    results: list[CodeTriple] = []
    for chunk in iterable:
        results += _normalize_batch(client, chunk)
    return results
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Internal helpers
87
+ # ---------------------------------------------------------------------------
88
+
89
def _normalize_batch(client: anthropic.Anthropic, batch: list[CodeTriple]) -> list[CodeTriple]:
    """Send one batch to Haiku and return corrected triples.

    The batch is serialized to a JSON array of dicts and sent as the user
    message. The reply must contain a JSON array with exactly one entry per
    input triple; text around the array (for example a Markdown code fence)
    is tolerated.

    A reply that cannot be used — empty content, a non-text first block,
    invalid JSON, a non-array value, or an array of the wrong length — counts
    as a failed attempt. The request is made at most MAX_RETRIES + 1 times;
    if every attempt fails the original batch is returned unchanged so the
    pipeline keeps running.

    Args:
        client: Anthropic client used to issue the request.
        batch: Triples to normalize in this request.

    Returns:
        The corrected triples, or ``batch`` itself when no usable reply was
        obtained.

    Raises:
        Exception: Errors raised by ``client.messages.create`` itself are not
            handled here and propagate to the caller.
    """
    payload = json.dumps(_triples_to_dicts(batch), ensure_ascii=False)

    for _attempt in range(MAX_RETRIES + 1):
        response = client.messages.create(
            model="claude-haiku-4-5",
            max_tokens=4096,
            system=_SYSTEM_PROMPT,
            messages=[{"role": "user", "content": payload}],
        )
        try:
            corrected_dicts = _parse_json_array(response.content[0].text)
            # Corrections are merged back by position, so an array with
            # added or dropped entries cannot be aligned with the batch
            # and would attach corrections to the wrong triples.
            if len(corrected_dicts) != len(batch):
                raise ValueError("reply length does not match batch length")
            return _dicts_to_triples(corrected_dicts, batch)
        except (ValueError, KeyError, IndexError, AttributeError, TypeError):
            # json.JSONDecodeError is a ValueError. AttributeError/TypeError
            # cover a first content block without usable ``text``.
            continue

    # Every attempt failed: return originals rather than crash the pipeline.
    return batch


def _parse_json_array(text: str) -> list[Any]:
    """Decode the JSON array in a model reply, ignoring surrounding text.

    Tries the whole reply first, then falls back to the span from the first
    ``[`` to the last ``]``. Raises ValueError if no JSON array is found.
    """
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start = text.find("[")
        end = text.rfind("]")
        if start == -1 or end < start:
            raise
        parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, list):
        raise ValueError("model reply is not a JSON array")
    return parsed
114
+
115
+
116
def _split_into_batches(triples: list[CodeTriple], size: int) -> list[list[CodeTriple]]:
    """Cut ``triples`` into consecutive slices of at most ``size`` items.

    Order is preserved; only the final slice may be shorter than ``size``.
    An empty input produces an empty list of batches.
    """
    batches: list[list[CodeTriple]] = []
    for start in range(0, len(triples), size):
        stop = start + size
        batches.append(triples[start:stop])
    return batches
118
+
119
+
120
def _triples_to_dicts(triples: list[CodeTriple]) -> list[dict[str, Any]]:
    """Project each triple onto the five fields that are sent to the model.

    Only the entity names, entity types and relation type are copied; any
    other attribute of a triple is left out of the payload.
    """
    exported = ("from_entity", "from_type", "relation_type", "to_entity", "to_type")
    return [{name: getattr(triple, name) for name in exported} for triple in triples]
131
+
132
+
133
def _dicts_to_triples(corrected: list[dict], originals: list[CodeTriple]) -> list[CodeTriple]:
    """Merge corrected dict fields back into the original CodeTriple objects.

    Entries are matched to originals by position. For each of the five
    model-visible fields the corrected value is used only when it is a
    non-blank string; a missing, null, non-string or blank value keeps the
    original's value, so a bad reply cannot put junk into the graph.
    ``source_file`` and ``line_number`` (which Haiku doesn't see) are always
    taken from the original.

    The original triple is kept as-is when there is no corrected entry at its
    position, when the entry is not a dict, or when building the new
    CodeTriple fails.

    Args:
        corrected: Decoded model reply, one dict per triple.
        originals: The triples that were sent, in the same order.

    Returns:
        A list with exactly one triple per entry in ``originals``.
    """
    field_names = ("from_entity", "from_type", "relation_type", "to_entity", "to_type")
    result: list[CodeTriple] = []
    for i, original in enumerate(originals):
        if i >= len(corrected):
            result.append(original)
            continue
        patch = corrected[i]
        if not isinstance(patch, dict):
            result.append(original)
            continue
        try:
            values = {}
            for name in field_names:
                current = getattr(original, name)
                candidate = patch.get(name, current)
                usable = isinstance(candidate, str) and bool(candidate.strip())
                values[name] = candidate if usable else current
            result.append(CodeTriple(
                **values,
                source_file=original.source_file,
                line_number=original.line_number,
            ))
        except (TypeError, AttributeError, ValueError):
            # ValueError covers constructors that validate their arguments.
            result.append(original)
    return result