codemap-semantic-index 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ """codemap-semantic-index — embedding-based semantic recall for codemap.
2
+
3
+ Registers via two entry-point groups:
4
+
5
+ * ``codemap.cli_commands.embed`` → :func:`codemap_semantic_index.cli.register`
6
+ adds the ``codemap embed`` subcommand tree.
7
+ * ``codemap.recall_hooks.semantic`` → :func:`codemap_semantic_index.
8
+ recall_hook.rank` plugs an embedding-based ranker into ``codemap recall``;
9
+ ``codemap-aimemory>=0.4.1`` discovers it automatically and RRF-fuses
10
+ the result with token recall + freshness.
11
+ """
12
+
13
+ __version__ = "0.1.0"
@@ -0,0 +1,283 @@
1
+ """Markdown → chunks for the semantic index.
2
+
3
+ Source: ``<project_root>/knowledge-base/{rules,business,modules,cases,
4
+ pitfalls}/*.md`` (written by ``specode-distill`` 3.0+ and ``task-swarm``
5
+ 0.6+ — see specode-distill's ``references/doc-template.md`` for the
6
+ human-readable templates these files follow).
7
+
8
+ Algorithm (regex-only — no markdown lib so the chunker stays a
9
+ dependency-free wheel of its own):
10
+
11
+ 1. Strip YAML frontmatter (``---`` ... ``---``)
12
+ 2. Read the H1 (``# ...``) as the document title
13
+ 3. Split the body on ``^## `` headings; each section = ``(h2_title, body)``
14
+ 4. Body sections whose token count exceeds ``MAX_TOKENS`` are split with
15
+ a sliding window (``WINDOW_TOKENS`` / ``WINDOW_OVERLAP``)
16
+ 5. Each emitted chunk's text is prefixed with the title path
17
+ ``"<knowledge_id> / <h2_title>\\n\\n<body>"`` so embedding models
18
+ anchor on the right doc even when the body is a generic snippet.
19
+
20
+ Token counting is approximate: 1 token ≈ 4 characters for English /
21
+ 2 characters for Chinese. The whole pipeline tolerates being slightly
22
+ off — a longer chunk gets one extra sliding-window slice; nothing
23
+ breaks."""
24
+
25
+ from __future__ import annotations
26
+
27
+ import hashlib
28
+ import re
29
+ from collections.abc import Iterator
30
+ from dataclasses import dataclass
31
+ from pathlib import Path
32
+
33
# Public API of this module.
__all__ = [
    "MAX_TOKENS",
    "WINDOW_OVERLAP",
    "WINDOW_TOKENS",
    "Chunk",
    "approx_token_count",
    "chunk_knowledge_base",
    "chunk_markdown",
]

#: Body sections longer than this get split into sliding windows.
#: Measured in approximate tokens (see ``approx_token_count``), not characters.
MAX_TOKENS = 1000
#: Sliding window size, in approximate tokens, when splitting an over-long
#: section.
WINDOW_TOKENS = 500
#: Token overlap between adjacent windows (preserves boundary context).
WINDOW_OVERLAP = 50

#: Categories under ``knowledge-base/`` recognised by spec-distill v3.
#: ``chunk_knowledge_base`` visits the sub-directories in this order.
KNOWLEDGE_CATEGORIES: tuple[str, ...] = (
    "rules",
    "business",
    "modules",
    "cases",
    "pitfalls",
)
58
+
59
+
60
@dataclass
class Chunk:
    """A single piece of text handed to the embedding model.

    ``chunk_id`` is meant to stay the same from one run to the next
    (knowledge_id + H2 slug + optional window index), which lets an
    incremental embedder compare ``text_hash`` values and re-encode only
    the chunks whose ``text`` actually changed.
    """

    chunk_id: str
    knowledge_id: str
    category: str  # rules / business / modules / cases / pitfalls
    title: str  # H1 of the markdown document
    h2_title: str  # H2 of the section the chunk was cut from ("" for preamble)
    text: str  # title-path prefix + body; this is what gets embedded
    source_md: str  # markdown path, relative to project_root
    source_yml: str  # twin yml path under .ai-memory/knowledge/
    text_hash: str  # sha1-derived digest of ``text``; the incremental diff key

    def to_dict(self) -> dict[str, str]:
        """Return the nine fields as a plain ``dict`` in declaration order."""
        field_names = (
            "chunk_id",
            "knowledge_id",
            "category",
            "title",
            "h2_title",
            "text",
            "source_md",
            "source_yml",
            "text_hash",
        )
        return {name: getattr(self, name) for name in field_names}
91
+
92
+
93
+ # ---------- core algorithm ----------
94
+
95
+
96
# Leading YAML frontmatter: an opening ``---`` line at the very start of the
# text, an optional body, then a closing ``---`` line.  The closing line may
# be terminated by a newline *or* be the last thing in the file, and the body
# may be empty (``---`` directly followed by ``---``).
_FRONTMATTER_RE = re.compile(r"\A---\s*\n(?:.*?\n)?---\s*(?:\n|\Z)", re.DOTALL)
# First-level heading ("# Title").  ``[^\S\n]+`` means "whitespace other than
# a newline": the gap between the marker and the title must stay on the
# heading's own line, so a bare "#" line cannot swallow the following line as
# its title.
_H1_RE = re.compile(r"^#[^\S\n]+(.+?)\s*$", re.MULTILINE)
# Second-level heading ("## Title") only.  Deeper headings ("### ...") do not
# match because the character after "##" must be whitespace, so an H3 stays
# inside the body of its parent H2 section.
_H2_SPLIT_RE = re.compile(r"^##[^\S\n]+(.+?)\s*$", re.MULTILINE)
101
+
102
+
103
def _strip_frontmatter(text: str) -> str:
    """Return *text* with a leading YAML frontmatter block removed, if present."""
    # The pattern is anchored at the start of the string, so dropping the
    # match is the same as slicing past its end.
    head = _FRONTMATTER_RE.match(text)
    if head is None:
        return text
    return text[head.end():]
105
+
106
+
107
def _extract_h1(text: str) -> str:
    """Return the text of the first ``# ...`` heading in *text*, or ``""``."""
    found = _H1_RE.search(text)
    if found is None:
        return ""
    return found.group(1).strip()
110
+
111
+
112
# A fence line of a fenced code block: up to three spaces of indentation, then
# a run of three or more backticks or tildes.  Group 1 is the fence marker,
# group 2 the rest of the line (info string when opening, blank when closing).
_FENCE_LINE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})([^\n]*)$", re.MULTILINE)


def _fenced_spans(body: str) -> list[tuple[int, int]]:
    """Return ``(start, end)`` offsets of each properly closed fenced code block."""
    spans: list[tuple[int, int]] = []
    open_marker = ""
    open_start = 0
    for m in _FENCE_LINE_RE.finditer(body):
        marker, rest = m.group(1), m.group(2)
        if not open_marker:
            # A backtick fence may not carry backticks in its info string;
            # such a line is inline code, not the start of a block.
            if marker[0] == "`" and "`" in rest:
                continue
            open_marker, open_start = marker, m.start()
            continue
        # A closing fence uses the same character, is at least as long as the
        # opening one, and has nothing but whitespace after it.
        if marker[0] == open_marker[0] and len(marker) >= len(open_marker) and not rest.strip():
            spans.append((open_start, m.end()))
            open_marker = ""
    # A fence that is never closed is deliberately NOT recorded: a stray
    # "```" in a sloppy document must not switch off sectioning for
    # everything that follows it.
    return spans


def _split_h2_sections(body: str) -> list[tuple[str, str]]:
    """Return ``[(h2_title, section_body), ...]``.

    Content before the first H2 lands as ``("", preamble)``; sections
    without a body are dropped.  A ``## ...`` line that sits inside a
    properly closed fenced code block (backtick or tilde fences) is part of
    the code sample, not a heading, so it does not start a new section.
    """
    fenced = _fenced_spans(body)
    # Keep only the H2 matches that are real headings, i.e. outside any fence.
    matches = [
        m
        for m in _H2_SPLIT_RE.finditer(body)
        if not any(start <= m.start() < end for start, end in fenced)
    ]
    if not matches:
        stripped = body.strip()
        return [("", stripped)] if stripped else []

    out: list[tuple[str, str]] = []
    # Preamble (text before first H2).
    preamble = body[: matches[0].start()].strip()
    if preamble:
        out.append(("", preamble))
    for i, m in enumerate(matches):
        # A section runs from the end of its heading line to the start of the
        # next real heading (or to the end of the document).
        end = matches[i + 1].start() if i + 1 < len(matches) else len(body)
        section_body = body[m.end() : end].strip()
        if section_body:
            out.append((m.group(1).strip(), section_body))
    return out
133
+
134
+
135
def approx_token_count(text: str) -> int:
    """Estimate how many tokens *text* is worth.

    Characters in the range 一–鿿 count as CJK and are charged one token
    per two characters; every other character is charged one token per four.
    Each class is floor-divided separately and the result is never below 1,
    so even an empty string reports one token.
    """
    cjk_chars = 0
    for ch in text:
        if "一" <= ch <= "鿿":
            cjk_chars += 1
    other_chars = len(text) - cjk_chars
    estimate = cjk_chars // 2 + other_chars // 4
    return estimate if estimate > 1 else 1
141
+
142
+
143
def _sliding_split(
    text: str,
    *,
    window_tokens: int | None = None,
    overlap_tokens: int | None = None,
) -> Iterator[str]:
    """Split an over-long section into overlapping character windows.

    Token sizes are converted to character counts with a single ratio chosen
    for the whole section: 2 characters per token when more than half of the
    characters fall in the CJK range 一–鿿, otherwise 4.  A mostly-CJK section
    therefore gets windows *half* as long in characters as a mostly-ASCII
    one, mirroring the per-character cost used by the token estimator.  The
    ratio is a binary switch, not a blend, so windows of mixed-script text
    are only roughly ``window_tokens`` tokens long.

    Args:
        text: The section body to split.  An empty string yields nothing.
        window_tokens: Window size in approximate tokens; defaults to the
            module-level ``WINDOW_TOKENS``.
        overlap_tokens: Approximate tokens shared by adjacent windows;
            defaults to the module-level ``WINDOW_OVERLAP``.

    Yields:
        Consecutive slices of *text*.  Iteration stops after the first
        window that reaches the end of the text.

    Raises:
        ValueError: When iteration starts, if ``window_tokens`` is below 1
            or ``overlap_tokens`` is negative.
    """
    if window_tokens is None:
        window_tokens = WINDOW_TOKENS
    if overlap_tokens is None:
        overlap_tokens = WINDOW_OVERLAP
    if window_tokens < 1 or overlap_tokens < 0:
        raise ValueError("window_tokens must be >= 1 and overlap_tokens must be >= 0")

    total = len(text)
    cjk = sum(1 for ch in text if "一" <= ch <= "鿿")
    chars_per_token = 2 if cjk / max(1, total) > 0.5 else 4
    window_chars = window_tokens * chars_per_token
    # Always advance by at least one character, even if the overlap is as
    # large as the window, so the loop is guaranteed to terminate.
    step = max(1, window_chars - overlap_tokens * chars_per_token)

    start = 0
    while start < total:
        yield text[start : start + window_chars]
        if start + window_chars >= total:
            # This window already covered the tail; a further one would be
            # pure overlap.
            return
        start += step
160
+
161
+
162
def chunk_markdown(
    md_text: str,
    *,
    knowledge_id: str,
    category: str,
    source_md: str,
    source_yml: str,
) -> list[Chunk]:
    """Turn one md document into a list of :class:`Chunk` ready for
    embedding.

    The frontmatter and the H1 line are removed, the remainder is split on
    H2 headings, and every section becomes one chunk, or several
    sliding-window chunks when its estimated token count exceeds
    ``MAX_TOKENS``.

    Chunk ids have the form ``<knowledge_id>::<slug>`` (plus ``::w<N>`` for
    windows), where the slug is derived from the H2 title and is
    ``_preamble`` for text before the first H2.  When two sections of the
    same document produce the same slug, the second and later ones get a
    ``-2``, ``-3``, ... suffix so that ids stay unique within the document;
    the first occurrence keeps the plain slug.  The suffix follows document
    order, so reordering identically-titled sections changes their ids.

    Args:
        md_text: Raw markdown, optionally starting with YAML frontmatter.
        knowledge_id: Identifier used in chunk ids and in the text prefix.
        category: Stored on each chunk unchanged.
        source_md: Stored on each chunk unchanged.
        source_yml: Stored on each chunk unchanged.

    Returns:
        The chunks in document order; empty when the document has no body.
    """
    stripped = _strip_frontmatter(md_text)
    title = _extract_h1(stripped)
    # Remove the H1 line itself before sectioning so the preamble doesn't
    # carry the heading text twice.
    if title:
        stripped = _H1_RE.sub("", stripped, count=1).lstrip("\n")

    out: list[Chunk] = []
    used_slugs: set[str] = set()
    for h2_title, section_body in _split_h2_sections(stripped):
        base_slug = _slug(h2_title) if h2_title else "_preamble"
        # Disambiguate repeated slugs; otherwise two sections would share a
        # chunk_id and overwrite each other in anything keyed by it.
        h2_slug = base_slug
        suffix = 2
        while h2_slug in used_slugs:
            h2_slug = f"{base_slug}-{suffix}"
            suffix += 1
        used_slugs.add(h2_slug)

        if approx_token_count(section_body) <= MAX_TOKENS:
            pieces = [(f"{knowledge_id}::{h2_slug}", section_body)]
        else:
            # Over-long → sliding-window split
            pieces = [
                (f"{knowledge_id}::{h2_slug}::w{w_idx}", window)
                for w_idx, window in enumerate(_sliding_split(section_body))
            ]

        out.extend(
            _build_chunk(
                chunk_id=chunk_id,
                knowledge_id=knowledge_id,
                category=category,
                title=title,
                h2_title=h2_title,
                section_body=piece,
                source_md=source_md,
                source_yml=source_yml,
            )
            for chunk_id, piece in pieces
        )
    return out
212
+
213
+
214
# One or more consecutive characters that are not an ASCII lowercase letter,
# an ASCII digit, or a character in the range 一–鿿.
_SLUG_RE = re.compile(r"[^a-z0-9一-鿿]+")


def _slug(text: str) -> str:
    """Reduce a heading to an id-friendly slug.

    The text is lower-cased, every run of characters outside the allowed set
    becomes a single ``-``, and leading/trailing dashes are trimmed.  When
    nothing is left, the fallback ``"section"`` is returned.
    """
    dashed = _SLUG_RE.sub("-", text.lower())
    trimmed = dashed.strip("-")
    if not trimmed:
        return "section"
    return trimmed
219
+
220
+
221
def _build_chunk(
    *,
    chunk_id: str,
    knowledge_id: str,
    category: str,
    title: str,
    h2_title: str,
    section_body: str,
    source_md: str,
    source_yml: str,
) -> Chunk:
    """Assemble a :class:`Chunk` from a section body and its metadata.

    The embedded text is the section body preceded by a title path
    (``"<knowledge_id> / <h2_title>"``, or just ``knowledge_id`` when there
    is no H2 title) and a blank line, so the embedding keeps a "which doc /
    which section" anchor even for a generic body.  ``text_hash`` is the
    first 16 hex characters of the SHA-1 of that text encoded as UTF-8.
    """
    if h2_title:
        anchor = f"{knowledge_id} / {h2_title}"
    else:
        anchor = knowledge_id
    text = anchor + "\n\n" + section_body
    # The digest is a change-detection key, not a security measure.
    digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False)
    return Chunk(
        chunk_id=chunk_id,
        knowledge_id=knowledge_id,
        category=category,
        title=title,
        h2_title=h2_title,
        text=text,
        source_md=source_md,
        source_yml=source_yml,
        text_hash=digest.hexdigest()[:16],
    )
248
+
249
+
250
+ # ---------- knowledge-base traversal ----------
251
+
252
+
253
def chunk_knowledge_base(project_root: Path) -> list[Chunk]:
    """Walk ``<project_root>/knowledge-base/{5 categories}/*.md`` and
    chunk every file.

    Missing dirs / files are silently tolerated (consistent with the rest
    of codemap's "missing inputs degrade gracefully" stance).  The same
    applies to a file that cannot be read or is not valid UTF-8: it is
    skipped and the walk continues.

    Args:
        project_root: Directory that contains ``knowledge-base/``.

    Returns:
        All chunks, grouped by category in ``KNOWLEDGE_CATEGORIES`` order
        and by file name (sorted) within a category.
    """
    kb_root = project_root / "knowledge-base"
    if not kb_root.is_dir():
        return []
    out: list[Chunk] = []
    for category in KNOWLEDGE_CATEGORIES:
        cat_dir = kb_root / category
        if not cat_dir.is_dir():
            continue
        # Sorted so the output order does not depend on directory order.
        for md_file in sorted(cat_dir.glob("*.md")):
            try:
                # "utf-8-sig" decodes plain UTF-8 identically but drops a
                # leading byte-order mark, which would otherwise sit in
                # front of the frontmatter / H1 and stop the start-anchored
                # patterns from matching.
                md_text = md_file.read_text(encoding="utf-8-sig")
            except (OSError, UnicodeDecodeError):
                # UnicodeDecodeError is a ValueError, not an OSError, so it
                # has to be named explicitly for a badly encoded file to be
                # skipped instead of aborting the whole walk.
                continue
            knowledge_id = md_file.stem
            # NOTE(review): str() uses the OS-native separator, whereas
            # source_yml below always uses "/" — confirm what consumers of
            # source_md expect on Windows.
            source_md = str(md_file.relative_to(project_root))
            source_yml = f".ai-memory/knowledge/{category}/{knowledge_id}.yml"
            out.extend(
                chunk_markdown(
                    md_text,
                    knowledge_id=knowledge_id,
                    category=category,
                    source_md=source_md,
                    source_yml=source_yml,
                )
            )
    return out