codemap-semantic-index 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codemap_semantic_index/__init__.py +13 -0
- codemap_semantic_index/chunker.py +283 -0
- codemap_semantic_index/cli.py +446 -0
- codemap_semantic_index/config.py +174 -0
- codemap_semantic_index/embedding/__init__.py +18 -0
- codemap_semantic_index/embedding/base.py +37 -0
- codemap_semantic_index/embedding/factory.py +46 -0
- codemap_semantic_index/embedding/local.py +83 -0
- codemap_semantic_index/embedding/openai_compat.py +122 -0
- codemap_semantic_index/embedding/presets.py +110 -0
- codemap_semantic_index/indexer.py +149 -0
- codemap_semantic_index/recall_hook.py +193 -0
- codemap_semantic_index/store.py +246 -0
- codemap_semantic_index-0.1.0.dist-info/METADATA +76 -0
- codemap_semantic_index-0.1.0.dist-info/RECORD +17 -0
- codemap_semantic_index-0.1.0.dist-info/WHEEL +4 -0
- codemap_semantic_index-0.1.0.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""codemap-semantic-index — embedding-based semantic recall for codemap.
|
|
2
|
+
|
|
3
|
+
Registers via two entry-point groups:
|
|
4
|
+
|
|
5
|
+
* ``codemap.cli_commands.embed`` → :func:`codemap_semantic_index.cli.register`
|
|
6
|
+
adds the ``codemap embed`` subcommand tree.
|
|
7
|
+
* ``codemap.recall_hooks.semantic`` → :func:`codemap_semantic_index.
|
|
8
|
+
recall_hook.rank` plugs an embedding-based ranker into ``codemap recall``;
|
|
9
|
+
``codemap-aimemory>=0.4.1`` discovers it automatically and RRF-fuses
|
|
10
|
+
the result with token recall + freshness.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
"""Markdown → chunks for the semantic index.
|
|
2
|
+
|
|
3
|
+
Source: ``<project_root>/knowledge-base/{rules,business,modules,cases,
|
|
4
|
+
pitfalls}/*.md`` (written by ``specode-distill`` 3.0+ and ``task-swarm``
|
|
5
|
+
0.6+ — see specode-distill's ``references/doc-template.md`` for the
|
|
6
|
+
human-readable templates these files follow).
|
|
7
|
+
|
|
8
|
+
Algorithm (regex-only — no markdown lib so the chunker stays a
|
|
9
|
+
dependency-free wheel of its own):
|
|
10
|
+
|
|
11
|
+
1. Strip YAML frontmatter (``---`` ... ``---``)
|
|
12
|
+
2. Read the H1 (``# ...``) as the document title
|
|
13
|
+
3. Split the body on ``^## `` headings; each section = ``(h2_title, body)``
|
|
14
|
+
4. Body sections whose token count exceeds ``MAX_TOKENS`` are split with
|
|
15
|
+
a sliding window (``WINDOW_TOKENS`` / ``WINDOW_OVERLAP``)
|
|
16
|
+
5. Each emitted chunk's text is prefixed with the title path
|
|
17
|
+
``"<knowledge_id> / <h2_title>\\n\\n<body>"`` so embedding models
|
|
18
|
+
anchor on the right doc even when the body is a generic snippet.
|
|
19
|
+
|
|
20
|
+
Token counting is approximate: 1 token ≈ 4 characters for English /
|
|
21
|
+
2 characters for Chinese. The whole pipeline tolerates being slightly
|
|
22
|
+
off — a longer chunk gets one extra sliding-window slice; nothing
|
|
23
|
+
breaks."""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import hashlib
|
|
28
|
+
import re
|
|
29
|
+
from collections.abc import Iterator
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
"MAX_TOKENS",
|
|
35
|
+
"WINDOW_OVERLAP",
|
|
36
|
+
"WINDOW_TOKENS",
|
|
37
|
+
"Chunk",
|
|
38
|
+
"approx_token_count",
|
|
39
|
+
"chunk_knowledge_base",
|
|
40
|
+
"chunk_markdown",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
#: Body sections longer than this get split into sliding windows.
|
|
44
|
+
MAX_TOKENS = 1000
|
|
45
|
+
#: Sliding window size when splitting an over-long section.
|
|
46
|
+
WINDOW_TOKENS = 500
|
|
47
|
+
#: Token overlap between adjacent windows (preserves boundary context).
|
|
48
|
+
WINDOW_OVERLAP = 50
|
|
49
|
+
|
|
50
|
+
#: Categories under ``knowledge-base/`` recognised by specode-distill v3.
|
|
51
|
+
KNOWLEDGE_CATEGORIES: tuple[str, ...] = (
|
|
52
|
+
"rules",
|
|
53
|
+
"business",
|
|
54
|
+
"modules",
|
|
55
|
+
"cases",
|
|
56
|
+
"pitfalls",
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class Chunk:
    """A single piece of text handed to the embedding model.

    ``chunk_id`` is built from the knowledge id, the H2 slug and, for
    windowed sections, a window index. ``text_hash`` is a digest of
    ``text`` that callers can compare to decide whether a chunk needs to
    be re-encoded.
    """

    # Identifier of this chunk (knowledge_id + h2 slug + optional window).
    chunk_id: str
    # Identifier of the knowledge document the chunk belongs to.
    knowledge_id: str
    # One of: rules / business / modules / cases / pitfalls.
    category: str
    # H1 of the markdown document.
    title: str
    # H2 of the section this chunk was cut from ("" for the preamble).
    h2_title: str
    # Prefixed text that is actually fed to the embedder.
    text: str
    # Markdown path, relative to project_root.
    source_md: str
    # Twin yml path under .ai-memory/knowledge/.
    source_yml: str
    # Truncated sha1 of ``text`` — the incremental diff key.
    text_hash: str

    def to_dict(self) -> dict[str, str]:
        """Return the chunk as a plain ``{field_name: value}`` mapping."""
        field_names = (
            "chunk_id",
            "knowledge_id",
            "category",
            "title",
            "h2_title",
            "text",
            "source_md",
            "source_yml",
            "text_hash",
        )
        return {name: getattr(self, name) for name in field_names}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ---------- core algorithm ----------
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Leading YAML frontmatter: a ``---`` line at the very start of the text,
# then (lazily) everything up to and including the next ``---`` line.
# DOTALL lets ``.`` cross newlines.
_FRONTMATTER_RE = re.compile(r"\A---\s*\n.*?\n---\s*\n", re.DOTALL)
# An H1 line: a single ``#`` followed by whitespace; group 1 is the title.
# MULTILINE makes ``^`` / ``$`` match at every line boundary.
_H1_RE = re.compile(r"^#\s+(.+?)\s*$", re.MULTILINE)
# Only ``## ...`` lines split sections: ``##\s+`` requires whitespace right
# after the two hashes, so a ``### ...`` (H3) line never matches and stays
# inside its parent H2 section. Group 1 is the heading text.
_H2_SPLIT_RE = re.compile(r"^##\s+(.+?)\s*$", re.MULTILINE)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _strip_frontmatter(text: str) -> str:
    """Return ``text`` without a leading ``---`` ... ``---`` frontmatter block.

    Text that does not start with a frontmatter block is returned as is.
    """
    # The pattern is anchored with ``\A``, so a match can only sit at the
    # very start of the text; cutting at its end is the whole job.
    leading = _FRONTMATTER_RE.match(text)
    if leading is None:
        return text
    return text[leading.end() :]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _extract_h1(text: str) -> str:
    """Return the text of the first ``# ...`` line, or ``""`` if none exists."""
    heading = _H1_RE.search(text)
    if heading is None:
        return ""
    return heading.group(1).strip()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _split_h2_sections(body: str) -> list[tuple[str, str]]:
    """Cut ``body`` at its H2 headings.

    Returns ``[(h2_title, section_body), ...]`` in document order. Text
    that precedes the first H2 is reported with an empty title, i.e. as
    ``("", preamble)``. Sections whose body is empty after stripping
    whitespace are left out.
    """
    headings = list(_H2_SPLIT_RE.finditer(body))
    if not headings:
        # No H2 at all: the whole body is a single untitled section.
        whole = body.strip()
        return [("", whole)] if whole else []

    sections: list[tuple[str, str]] = []

    lead_in = body[: headings[0].start()].strip()
    if lead_in:
        sections.append(("", lead_in))

    # A section spans from the end of its heading line to the start of the
    # next heading, or to the end of the body for the last one.
    stops = [nxt.start() for nxt in headings[1:]]
    stops.append(len(body))
    for heading, stop in zip(headings, stops):
        content = body[heading.end() : stop].strip()
        if not content:
            continue
        sections.append((heading.group(1).strip(), content))
    return sections
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def approx_token_count(text: str) -> int:
    """Estimate the token count of ``text`` without a tokenizer.

    Characters in the ``一``..``鿿`` range are counted at two per token and
    every other character at four per token. The result is never below 1,
    even for empty input.
    """
    cjk_chars = len([ch for ch in text if "一" <= ch <= "鿿"])
    other_chars = len(text) - cjk_chars
    estimate = cjk_chars // 2 + other_chars // 4
    return estimate if estimate > 1 else 1
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _sliding_split(text: str) -> Iterator[str]:
    """Yield overlapping character windows covering ``text``.

    Window and overlap sizes are ``WINDOW_TOKENS`` / ``WINDOW_OVERLAP``
    converted to characters: 2 characters per token when more than half of
    the text falls in the ``一``..``鿿`` range, 4 characters per token
    otherwise. Iteration stops with the first window that reaches the end
    of the text; empty text yields nothing.
    """
    total = len(text)
    cjk_chars = sum(1 for ch in text if "一" <= ch <= "鿿")
    chars_per_token = 2 if cjk_chars / max(1, total) > 0.5 else 4

    window_chars = WINDOW_TOKENS * chars_per_token
    overlap_chars = WINDOW_OVERLAP * chars_per_token
    # Always advance by at least one character so the loop terminates.
    stride = max(1, window_chars - overlap_chars)

    start = 0
    while start < total:
        stop = start + window_chars
        yield text[start:stop]
        if stop >= total:
            break
        start += stride
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def chunk_markdown(
    md_text: str,
    *,
    knowledge_id: str,
    category: str,
    source_md: str,
    source_yml: str,
) -> list[Chunk]:
    """Turn one md document into a list of :class:`Chunk` ready for
    embedding.

    The frontmatter and the H1 line are removed, the remainder is split at
    H2 headings, and every section whose estimated token count exceeds
    ``MAX_TOKENS`` is further cut into sliding windows.

    Chunk ids have the form ``<knowledge_id>::<h2_slug>`` (plus
    ``::w<index>`` for windowed sections). When several headings in the
    same document produce the same slug, the second and later ones get a
    ``-2``, ``-3``, ... suffix so that every chunk id within the document
    is unique; ids of documents without repeated headings are unaffected.

    :param md_text: raw markdown text of the document.
    :param knowledge_id: identifier used as the chunk id / text prefix.
    :param category: knowledge-base category stored on every chunk.
    :param source_md: markdown path stored on every chunk.
    :param source_yml: twin yml path stored on every chunk.
    :returns: chunks in document order; empty if the document has no body.
    """
    stripped = _strip_frontmatter(md_text)
    title = _extract_h1(stripped)
    # Remove the H1 line itself before sectioning so the preamble doesn't
    # carry the heading text twice.
    if title:
        stripped = _H1_RE.sub("", stripped, count=1).lstrip("\n")

    out: list[Chunk] = []
    used_slugs: set[str] = set()
    for h2_title, section_body in _split_h2_sections(stripped):
        base_slug = _slug(h2_title) if h2_title else "_preamble"
        # Two headings can slugify identically (same title, or titles that
        # differ only in punctuation / case). Probe for a free suffix so
        # their chunk ids do not collide.
        h2_slug = base_slug
        repeat = 1
        while h2_slug in used_slugs:
            repeat += 1
            h2_slug = f"{base_slug}-{repeat}"
        used_slugs.add(h2_slug)

        section_id = f"{knowledge_id}::{h2_slug}"
        if approx_token_count(section_body) <= MAX_TOKENS:
            pieces = [(section_id, section_body)]
        else:
            # Over-long → sliding-window split
            pieces = [
                (f"{section_id}::w{w_idx}", window)
                for w_idx, window in enumerate(_sliding_split(section_body))
            ]

        out.extend(
            _build_chunk(
                chunk_id=chunk_id,
                knowledge_id=knowledge_id,
                category=category,
                title=title,
                h2_title=h2_title,
                section_body=piece_body,
                source_md=source_md,
                source_yml=source_yml,
            )
            for chunk_id, piece_body in pieces
        )
    return out
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
_SLUG_RE = re.compile(r"[^a-z0-9一-鿿]+")
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _slug(text: str) -> str:
|
|
218
|
+
return _SLUG_RE.sub("-", text.lower()).strip("-") or "section"
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _build_chunk(
    *,
    chunk_id: str,
    knowledge_id: str,
    category: str,
    title: str,
    h2_title: str,
    section_body: str,
    source_md: str,
    source_yml: str,
) -> Chunk:
    """Assemble a :class:`Chunk` from one section body (or one window).

    The embedded text is the section body preceded by a title path
    (``"<knowledge_id> / <h2_title>"``, or just ``knowledge_id`` when the
    section has no H2 title) and a blank line. ``text_hash`` is the first
    16 hex characters of the sha1 of that text, UTF-8 encoded.
    """
    # The title path anchors the embedding on "which doc / which section"
    # even when the body on its own is a generic sentence.
    if h2_title:
        title_path = f"{knowledge_id} / {h2_title}"
    else:
        title_path = knowledge_id
    text = f"{title_path}\n\n{section_body}"

    digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False)
    return Chunk(
        chunk_id=chunk_id,
        knowledge_id=knowledge_id,
        category=category,
        title=title,
        h2_title=h2_title,
        text=text,
        source_md=source_md,
        source_yml=source_yml,
        text_hash=digest.hexdigest()[:16],
    )
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ---------- knowledge-base traversal ----------
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def chunk_knowledge_base(project_root: Path) -> list[Chunk]:
    """Walk ``<project_root>/knowledge-base/{5 categories}/*.md`` and
    chunk every file.

    Missing directories and files that cannot be read or decoded as UTF-8
    are skipped silently (consistent with the rest of codemap's "missing
    inputs degrade gracefully" stance).

    :param project_root: directory that contains ``knowledge-base/``.
    :returns: all chunks, ordered by category (in ``KNOWLEDGE_CATEGORIES``
        order) and then by sorted file path; empty when there is no
        ``knowledge-base`` directory.
    """
    kb_root = project_root / "knowledge-base"
    if not kb_root.is_dir():
        return []
    out: list[Chunk] = []
    for category in KNOWLEDGE_CATEGORIES:
        cat_dir = kb_root / category
        if not cat_dir.is_dir():
            continue
        for md_file in sorted(cat_dir.glob("*.md")):
            try:
                md_text = md_file.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                # UnicodeDecodeError is a ValueError, not an OSError: one
                # badly encoded file must not abort the whole walk.
                continue
            # The file stem doubles as the knowledge id.
            knowledge_id = md_file.stem
            source_md = str(md_file.relative_to(project_root))
            source_yml = f".ai-memory/knowledge/{category}/{knowledge_id}.yml"
            out.extend(
                chunk_markdown(
                    md_text,
                    knowledge_id=knowledge_id,
                    category=category,
                    source_md=source_md,
                    source_yml=source_yml,
                )
            )
    return out
|