codemap-semantic-index 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. codemap_semantic_index-0.1.0/.gitignore +43 -0
  2. codemap_semantic_index-0.1.0/CHANGELOG.md +86 -0
  3. codemap_semantic_index-0.1.0/PKG-INFO +76 -0
  4. codemap_semantic_index-0.1.0/README.md +52 -0
  5. codemap_semantic_index-0.1.0/pyproject.toml +46 -0
  6. codemap_semantic_index-0.1.0/src/codemap_semantic_index/__init__.py +13 -0
  7. codemap_semantic_index-0.1.0/src/codemap_semantic_index/chunker.py +283 -0
  8. codemap_semantic_index-0.1.0/src/codemap_semantic_index/cli.py +446 -0
  9. codemap_semantic_index-0.1.0/src/codemap_semantic_index/config.py +174 -0
  10. codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/__init__.py +18 -0
  11. codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/base.py +37 -0
  12. codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/factory.py +46 -0
  13. codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/local.py +83 -0
  14. codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/openai_compat.py +122 -0
  15. codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/presets.py +110 -0
  16. codemap_semantic_index-0.1.0/src/codemap_semantic_index/indexer.py +149 -0
  17. codemap_semantic_index-0.1.0/src/codemap_semantic_index/recall_hook.py +193 -0
  18. codemap_semantic_index-0.1.0/src/codemap_semantic_index/store.py +246 -0
  19. codemap_semantic_index-0.1.0/tests/__init__.py +0 -0
  20. codemap_semantic_index-0.1.0/tests/test_chunker.py +144 -0
  21. codemap_semantic_index-0.1.0/tests/test_cli.py +169 -0
  22. codemap_semantic_index-0.1.0/tests/test_config.py +102 -0
  23. codemap_semantic_index-0.1.0/tests/test_indexer.py +170 -0
  24. codemap_semantic_index-0.1.0/tests/test_openai_compat.py +177 -0
  25. codemap_semantic_index-0.1.0/tests/test_recall_hook.py +163 -0
  26. codemap_semantic_index-0.1.0/tests/test_store.py +133 -0
@@ -0,0 +1,43 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Build artifacts
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ *.egg
11
+ .eggs/
12
+
13
+ # Test / coverage
14
+ .pytest_cache/
15
+ .coverage
16
+ .coverage.*
17
+ htmlcov/
18
+ coverage.xml
19
+ .tox/
20
+ .mypy_cache/
21
+ .ruff_cache/
22
+ .benchmarks/
23
+
24
+ # Virtualenv
25
+ .venv/
26
+ venv/
27
+ env/
28
+
29
+ # uv / pdm lockfiles (commit uv.lock once we settle)
30
+ # uv.lock
31
+
32
+ # IDE
33
+ .idea/
34
+ .vscode/
35
+ *.swp
36
+ *.swo
37
+
38
+ # OS
39
+ .DS_Store
40
+ Thumbs.db
41
+
42
+ # CodeMap own index when dogfooding
43
+ .codemap/
@@ -0,0 +1,86 @@
1
+ # Changelog — codemap-semantic-index
2
+
3
+ This plugin's version is **independent** of `codemap-core` lockstep —
4
+ it's an opt-in semantic ranker, not part of the L1 indexing core.
5
+
6
+ ## 0.1.0 (2026-06-27)
7
+
8
+ First release. Closes AI-EDS roadmap **P1-3**.
9
+
10
+ ### What it does
11
+
12
+ Registers an embedding-based ranker into `codemap recall` via the
13
+ `codemap.recall_hooks` entry-point group (introduced in `codemap-core`
14
+ 0.4.1). `codemap-aimemory` RRF-fuses our ranking with its token
15
+ ranking and multiplies by freshness (P4-2), so installing this plugin
16
+ upgrades recall from token-only to hybrid semantic + token + freshness
17
+ with zero user code change.
18
+
19
+ ### Modules
20
+
21
+ - `chunker` — markdown → chunks. Splits on `##` headings; over-long
22
+ sections re-split with sliding windows (500 tokens / 50 overlap).
23
+ Every chunk text is prefixed `"<knowledge_id> / <h2_title>\n\n..."`
24
+ so embeddings have an anchor to the source doc.
25
+ - `store` — atomic on-disk store under `.ai-memory/_semantic/`:
26
+ `chunks.json` (model-independent metadata) + `vectors.npy`
27
+ (model-specific 1024-dim float32) + `model_id.txt` (active
28
+ backend fingerprint) + `manifest.json` (text_hash → chunk_id for
29
+ incremental).
30
+ - `config` — `~/.config/codemap/embedding.yaml` reader/writer; chmod 600.
31
+ - `embedding/local.py` — sentence-transformers wrapper, default
32
+ `Qwen/Qwen3-Embedding-0.6B` (1024-dim, 32k context, same-source as
33
+ Qwen cloud text-embedding-v3). Lazy-imports `sentence_transformers`
34
+ so plain `--help` doesn't pay torch boot cost.
35
+ - `embedding/openai_compat.py` — `POST {base_url}/embeddings` over
36
+ httpx. Handles 4 preset providers (Qwen / OpenAI / Zhipu / Voyage)
37
+ + custom (self-hosted vLLM / Ollama / TEI / Jina).
38
+ - `indexer` — `rebuild_index` (full) + `incremental_index` (hash-diff;
39
+ only re-encode chunks whose text changed); refuses on model mismatch.
40
+ - `recall_hook` — entry-point function. Loads the on-disk store, encodes
41
+ the query, computes cosine similarities (vectors are L2-normalised, so a
42
+ plain dot product suffices), aggregates chunks → knowledge_id (best chunk wins),
43
+ returns hook-contract-shaped candidates with freshness already
44
+ computed. Failure modes (no store / model mismatch / network down)
45
+ all silently return `[]` so recall never crashes.
46
+
47
+ ### CLI — `codemap embed`
48
+
49
+ 11 sub-commands organised in two groups:
50
+
51
+ - `codemap embed [--rebuild | --incremental | --dry-run | --project P]`
52
+ — main embed pipeline; default is incremental.
53
+ - `codemap embed install [<model_id>]` — interactive picker (3 preset
54
+ candidates + custom) or direct install.
55
+ - `codemap embed list` — show locally downloaded HF models, mark active.
56
+ - `codemap embed use <model_id>` — switch active local model; prints
57
+ rebuild hint.
58
+ - `codemap embed backend set [--provider P --api-key K --base-url U
59
+ --model M --dimensions N]` — configure local or cloud backend.
60
+ Interactive picker when no `--provider`. Auto-fills base_url / model /
61
+ dimensions from preset.
62
+ - `codemap embed backend show` — print effective config (api key masked).
63
+ - `codemap embed backend reset` — back to local defaults.
64
+ - `codemap embed backend path` — print config file location.
65
+
66
+ ### Dependencies
67
+
68
+ - `codemap-core>=0.4.1` (entry-point group)
69
+ - `codemap-aimemory>=0.4.1` (freshness + recall infrastructure reused)
70
+ - `numpy>=1.24` (vector math)
71
+ - `httpx>=0.27` (cloud backend HTTP)
72
+ - `pyyaml>=6.0`, `typer>=0.12`
73
+ - `sentence-transformers>=3.0` — default install includes this (pulls
74
+ torch ~200MB) so `codemap embed install` works out of the box
75
+
76
+ ### Tests
77
+
78
+ 66 unit tests covering chunker (10) + store (12) + config (9) +
79
+ openai_compat backend (8) + indexer (8) + recall_hook (8) + cli (11).
80
+ All deterministic — no network, no real embedding model download in
81
+ tests (uses a hashing fake backend that produces stable 4-dim unit
82
+ vectors).
83
+
84
+ ### Design doc
85
+
86
+ `Obsidian/Notes/07-Ideas/AI-Enterprise-Delivery-System/2026-06-27-p1-3-codemap-semantic-index-设计方案.md`
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.4
2
+ Name: codemap-semantic-index
3
+ Version: 0.1.0
4
+ Summary: Embedding-based semantic recall hook for CodeMap — registers into `codemap recall` via the recall_hooks entry-point group and adds vector search over .ai-memory/knowledge/*.yml
5
+ Project-URL: Homepage, https://github.com/qxbyte/codemap
6
+ Author: CodeMap Contributors
7
+ License: MIT
8
+ Keywords: ai-memory,codemap,embedding,rag,semantic-search
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Topic :: Software Development
12
+ Requires-Python: >=3.11
13
+ Requires-Dist: codemap-aimemory>=0.4.1
14
+ Requires-Dist: codemap-core>=0.4.1
15
+ Requires-Dist: httpx>=0.27
16
+ Requires-Dist: numpy>=1.24
17
+ Requires-Dist: pyyaml>=6.0
18
+ Requires-Dist: sentence-transformers>=3.0
19
+ Requires-Dist: typer>=0.12
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest-cov>=6.0; extra == 'dev'
22
+ Requires-Dist: pytest>=8.0; extra == 'dev'
23
+ Description-Content-Type: text/markdown
24
+
25
+ # codemap-semantic-index
26
+
27
+ Embedding-based semantic recall plugin for [codemap](https://github.com/qxbyte/codemap).
28
+
29
+ Registers an embedding ranker into `codemap recall` via the `codemap.recall_hooks` entry-point group (introduced in `codemap-core` 0.4.1). `codemap-aimemory` automatically RRF-fuses the embedding ranking with its token ranking and applies freshness decay (P4-2).
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pipx inject codemap codemap-semantic-index
35
+ # pulls sentence-transformers + torch (~200MB)
36
+
37
+ # Pick + download a local model (1.2GB default)
38
+ codemap embed install # interactive picker
39
+ codemap embed install BAAI/bge-m3 # direct
40
+
41
+ # First embed (writes <project>/.ai-memory/_semantic/)
42
+ codemap embed
43
+
44
+ # Now `codemap recall` does double-path (token + embedding) + RRF + freshness
45
+ codemap recall '<query>' --with-content
46
+ ```
47
+
48
+ ## Default model
49
+
50
+ `Qwen/Qwen3-Embedding-0.6B` (1024 dim, 32k context, 1.2GB). Same-source training as the Qwen cloud `text-embedding-v3`, so switching to cloud preserves recall "feel".
51
+
52
+ ## Cloud backend (any OpenAI-compatible embedding API)
53
+
54
+ ```bash
55
+ codemap embed backend set # interactive picker (qwen / openai / zhipu / voyage / custom)
56
+ codemap embed backend show
57
+ codemap embed backend reset # back to local
58
+ ```
59
+
60
+ Four preset providers + `custom` for self-hosted vLLM / Ollama / TEI. Config persists to `~/.config/codemap/embedding.yaml` (chmod 600).
61
+
62
+ ## Storage
63
+
64
+ ```
65
+ <project_root>/.ai-memory/_semantic/
66
+ ├── chunks.json chunked text + metadata (model-independent)
67
+ ├── vectors.npy (n_chunks, 1024) float32 (model-specific)
68
+ ├── model_id.txt active backend fingerprint
69
+ └── manifest.json text_hash → chunk_id (drives incremental)
70
+ ```
71
+
72
+ Switching models requires `codemap embed --rebuild` (different vector spaces are not comparable).
73
+
74
+ ## Design doc
75
+
76
+ `Obsidian/Notes/07-Ideas/AI-Enterprise-Delivery-System/2026-06-27-p1-3-codemap-semantic-index-设计方案.md` — full spec including chunker / RRF / freshness integration.
@@ -0,0 +1,52 @@
1
+ # codemap-semantic-index
2
+
3
+ Embedding-based semantic recall plugin for [codemap](https://github.com/qxbyte/codemap).
4
+
5
+ Registers an embedding ranker into `codemap recall` via the `codemap.recall_hooks` entry-point group (introduced in `codemap-core` 0.4.1). `codemap-aimemory` automatically RRF-fuses the embedding ranking with its token ranking and applies freshness decay (P4-2).
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pipx inject codemap codemap-semantic-index
11
+ # pulls sentence-transformers + torch (~200MB)
12
+
13
+ # Pick + download a local model (1.2GB default)
14
+ codemap embed install # interactive picker
15
+ codemap embed install BAAI/bge-m3 # direct
16
+
17
+ # First embed (writes <project>/.ai-memory/_semantic/)
18
+ codemap embed
19
+
20
+ # Now `codemap recall` does double-path (token + embedding) + RRF + freshness
21
+ codemap recall '<query>' --with-content
22
+ ```
23
+
24
+ ## Default model
25
+
26
+ `Qwen/Qwen3-Embedding-0.6B` (1024 dim, 32k context, 1.2GB). Same-source training as the Qwen cloud `text-embedding-v3`, so switching to cloud preserves recall "feel".
27
+
28
+ ## Cloud backend (any OpenAI-compatible embedding API)
29
+
30
+ ```bash
31
+ codemap embed backend set # interactive picker (qwen / openai / zhipu / voyage / custom)
32
+ codemap embed backend show
33
+ codemap embed backend reset # back to local
34
+ ```
35
+
36
+ Four preset providers + `custom` for self-hosted vLLM / Ollama / TEI. Config persists to `~/.config/codemap/embedding.yaml` (chmod 600).
37
+
38
+ ## Storage
39
+
40
+ ```
41
+ <project_root>/.ai-memory/_semantic/
42
+ ├── chunks.json chunked text + metadata (model-independent)
43
+ ├── vectors.npy (n_chunks, 1024) float32 (model-specific)
44
+ ├── model_id.txt active backend fingerprint
45
+ └── manifest.json text_hash → chunk_id (drives incremental)
46
+ ```
47
+
48
+ Switching models requires `codemap embed --rebuild` (different vector spaces are not comparable).
49
+
50
+ ## Design doc
51
+
52
+ `Obsidian/Notes/07-Ideas/AI-Enterprise-Delivery-System/2026-06-27-p1-3-codemap-semantic-index-设计方案.md` — full spec including chunker / RRF / freshness integration.
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.21"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "codemap-semantic-index"
7
+ version = "0.1.0"
8
+ description = "Embedding-based semantic recall hook for CodeMap — registers into `codemap recall` via the recall_hooks entry-point group and adds vector search over .ai-memory/knowledge/*.yml"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "CodeMap Contributors" }]
13
+ keywords = ["codemap", "embedding", "semantic-search", "ai-memory", "rag"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Programming Language :: Python :: 3",
17
+ "Topic :: Software Development",
18
+ ]
19
+ dependencies = [
20
+ "codemap-core>=0.4.1",
21
+ "codemap-aimemory>=0.4.1",
22
+ "numpy>=1.24",
23
+ "httpx>=0.27",
24
+ "pyyaml>=6.0",
25
+ "typer>=0.12",
26
+ # sentence-transformers (and torch) is the default local backend; pulled
27
+ # by default so `codemap embed install` works out of the box. Users who
28
+ # only want the cloud backend can drop torch with --no-deps gymnastics
29
+ # at their own risk.
30
+ "sentence-transformers>=3.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = ["pytest>=8.0", "pytest-cov>=6.0"]
35
+
36
+ [project.entry-points."codemap.cli_commands"]
37
+ embed = "codemap_semantic_index.cli:register"
38
+
39
+ [project.entry-points."codemap.recall_hooks"]
40
+ semantic = "codemap_semantic_index.recall_hook:rank"
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/qxbyte/codemap"
44
+
45
+ [tool.hatch.build.targets.wheel]
46
+ packages = ["src/codemap_semantic_index"]
@@ -0,0 +1,13 @@
1
+ """codemap-semantic-index — embedding-based semantic recall for codemap.
2
+
3
+ Registers via two entry-point groups:
4
+
5
+ * ``codemap.cli_commands.embed`` → :func:`codemap_semantic_index.cli.register`
6
+ adds the ``codemap embed`` subcommand tree.
7
+ * ``codemap.recall_hooks.semantic`` → :func:`codemap_semantic_index.
8
+ recall_hook.rank` plugs an embedding-based ranker into ``codemap recall``;
9
+ ``codemap-aimemory>=0.4.1`` discovers it automatically and RRF-fuses
10
+ the result with token recall + freshness.
11
+ """
12
+
13
+ __version__ = "0.1.0"
@@ -0,0 +1,283 @@
1
+ """Markdown → chunks for the semantic index.
2
+
3
+ Source: ``<project_root>/knowledge-base/{rules,business,modules,cases,
4
+ pitfalls}/*.md`` (written by ``specode-distill`` 3.0+ and ``task-swarm``
5
+ 0.6+ — see specode-distill's ``references/doc-template.md`` for the
6
+ human-readable templates these files follow).
7
+
8
+ Algorithm (regex-only — no markdown lib so the chunker stays a
9
+ dependency-free wheel of its own):
10
+
11
+ 1. Strip YAML frontmatter (``---`` ... ``---``)
12
+ 2. Read the H1 (``# ...``) as the document title
13
+ 3. Split the body on ``^## `` headings; each section = ``(h2_title, body)``
14
+ 4. Body sections whose token count exceeds ``MAX_TOKENS`` are split with
15
+ a sliding window (``WINDOW_TOKENS`` / ``WINDOW_OVERLAP``)
16
+ 5. Each emitted chunk's text is prefixed with the title path
17
+ ``"<knowledge_id> / <h2_title>\\n\\n<body>"`` so embedding models
18
+ anchor on the right doc even when the body is a generic snippet.
19
+
20
+ Token counting is approximate: 1 token ≈ 4 characters for English /
21
+ 2 characters for Chinese. The whole pipeline tolerates being slightly
22
+ off — a longer chunk gets one extra sliding-window slice; nothing
23
+ breaks."""
24
+
25
+ from __future__ import annotations
26
+
27
+ import hashlib
28
+ import re
29
+ from collections.abc import Iterator
30
+ from dataclasses import dataclass
31
+ from pathlib import Path
32
+
33
+ __all__ = [
34
+ "MAX_TOKENS",
35
+ "WINDOW_OVERLAP",
36
+ "WINDOW_TOKENS",
37
+ "Chunk",
38
+ "approx_token_count",
39
+ "chunk_knowledge_base",
40
+ "chunk_markdown",
41
+ ]
42
+
43
+ #: Body sections longer than this get split into sliding windows.
44
+ MAX_TOKENS = 1000
45
+ #: Sliding window size when splitting an over-long section.
46
+ WINDOW_TOKENS = 500
47
+ #: Token overlap between adjacent windows (preserves boundary context).
48
+ WINDOW_OVERLAP = 50
49
+
50
+ #: Categories under ``knowledge-base/`` recognised by specode-distill 3.0+.
51
+ KNOWLEDGE_CATEGORIES: tuple[str, ...] = (
52
+ "rules",
53
+ "business",
54
+ "modules",
55
+ "cases",
56
+ "pitfalls",
57
+ )
58
+
59
+
60
@dataclass
class Chunk:
    """A single embeddable slice of a knowledge document.

    The ``chunk_id`` is composed from the knowledge id, the slug of the H2
    section and, for over-long sections, a window index. ``text_hash`` is
    derived from ``text`` so callers can detect which chunks changed.
    """

    chunk_id: str  # "<knowledge_id>::<h2 slug>[::w<index>]"
    knowledge_id: str  # stem of the source markdown file
    category: str  # rules / business / modules / cases / pitfalls
    title: str  # H1 of the markdown document ("" when absent)
    h2_title: str  # H2 of the originating section ("" for the preamble)
    text: str  # prefixed text handed to the embedder
    source_md: str  # markdown path relative to project_root
    source_yml: str  # twin yml path under .ai-memory/knowledge/
    text_hash: str  # truncated sha1 of ``text`` — incremental diff key

    def to_dict(self) -> dict[str, str]:
        """Return the chunk as a plain ``{field name: value}`` mapping."""
        keys = (
            "chunk_id",
            "knowledge_id",
            "category",
            "title",
            "h2_title",
            "text",
            "source_md",
            "source_yml",
            "text_hash",
        )
        return {key: getattr(self, key) for key in keys}
91
+
92
+
93
+ # ---------- core algorithm ----------
94
+
95
+
96
# Leading YAML frontmatter: an opening ``---`` line at the very start of the
# document and everything up to (and including) the closing ``---`` line.
# DOTALL lets ``.`` span line breaks.
_FRONTMATTER_RE = re.compile(r"\A---\s*\n.*?\n---\s*\n", re.DOTALL)
# ATX headings. The gap between the ``#`` markers and the title must be
# *horizontal* whitespace (``[^\S\r\n]`` = any whitespace except a line
# break). A plain ``\s+`` also matches ``\n``, which made a bare ``#`` or
# ``##`` line swallow the following line as its title.
_H1_RE = re.compile(r"^#[^\S\r\n]+(.+?)\s*$", re.MULTILINE)
# Sections are split at H2 depth only. ``###`` and deeper never match because
# the character right after ``##`` has to be whitespace, so sub-headings stay
# inside their parent section.
_H2_SPLIT_RE = re.compile(r"^##[^\S\r\n]+(.+?)\s*$", re.MULTILINE)
+
102
+
103
def _strip_frontmatter(text: str) -> str:
    """Return ``text`` without its leading YAML frontmatter block, if any."""
    # The pattern is anchored with ``\A``, so a match can only sit at offset 0
    # and dropping the matched prefix equals a single substitution.
    leading = _FRONTMATTER_RE.match(text)
    return text[leading.end() :] if leading else text
105
+
106
+
107
def _extract_h1(text: str) -> str:
    """Return the title of the first H1 heading in ``text`` or ``""``."""
    found = _H1_RE.search(text)
    if found is None:
        return ""
    return found.group(1).strip()
110
+
111
+
112
def _split_h2_sections(body: str) -> list[tuple[str, str]]:
    """Cut ``body`` into ``(h2_title, section_body)`` pairs.

    Text ahead of the first H2 (or the whole body when there is no H2) is
    reported with an empty title. Sections whose body is blank after
    stripping are omitted.
    """
    headings = list(_H2_SPLIT_RE.finditer(body))
    first_heading_at = headings[0].start() if headings else len(body)

    sections: list[tuple[str, str]] = []
    lead_in = body[:first_heading_at].strip()
    if lead_in:
        sections.append(("", lead_in))

    # Each section runs from the end of its heading line to the start of the
    # next heading; the last one runs to the end of the body.
    stops = [nxt.start() for nxt in headings[1:]] + [len(body)]
    for heading, stop in zip(headings, stops):
        content = body[heading.end() : stop].strip()
        if content:
            sections.append((heading.group(1).strip(), content))
    return sections
133
+
134
+
135
def approx_token_count(text: str) -> int:
    """Estimate the token count of ``text`` without a tokenizer.

    Characters in the CJK range checked below count as one token per two
    characters; everything else counts as one token per four characters.
    The result is never less than 1, even for empty input.
    """
    cjk_chars = 0
    for ch in text:
        if "一" <= ch <= "鿿":
            cjk_chars += 1
    other_chars = len(text) - cjk_chars
    estimate = cjk_chars // 2 + other_chars // 4
    return estimate if estimate > 1 else 1
141
+
142
+
143
def _sliding_split(text: str) -> Iterator[str]:
    """Yield overlapping character windows covering ``text``.

    The window is sized in characters from ``WINDOW_TOKENS`` using a single
    chars-per-token factor for the whole section: 2 when more than half of
    the characters are CJK, otherwise 4. A mostly-CJK section therefore gets
    windows half as long (in characters) as a mostly-ASCII one. Consecutive
    windows share ``WINDOW_OVERLAP`` tokens' worth of characters. Empty input
    yields nothing.
    """
    total = len(text)
    cjk_chars = sum(1 for ch in text if "一" <= ch <= "鿿")
    mostly_cjk = cjk_chars / max(1, total) > 0.5
    chars_per_token = 2 if mostly_cjk else 4

    window_chars = WINDOW_TOKENS * chars_per_token
    # Never step by less than one character, or the loop could not advance.
    stride = max(1, window_chars - WINDOW_OVERLAP * chars_per_token)

    start = 0
    while start < total:
        yield text[start : start + window_chars]
        if start + window_chars >= total:
            # This window already reached the end of the text.
            break
        start += stride
160
+
161
+
162
def chunk_markdown(
    md_text: str,
    *,
    knowledge_id: str,
    category: str,
    source_md: str,
    source_yml: str,
) -> list[Chunk]:
    """Turn one markdown document into :class:`Chunk` objects for embedding.

    Frontmatter is stripped, the first H1 becomes the document title and is
    removed from the body, and the remainder is split on H2 headings.
    Sections estimated at more than ``MAX_TOKENS`` tokens are further cut
    into sliding windows.

    Chunk ids are ``"<knowledge_id>::<slug>"`` (plus ``"::w<index>"`` for
    windows). When several sections of the same document map to the same
    slug — repeated headings, headings that differ only in punctuation, or
    the ``"section"`` fallback — later occurrences get a numeric suffix
    (``-2``, ``-3``, ...) in document order so every chunk id stays unique.
    Documents without such collisions produce exactly the ids they did
    before.
    """
    stripped = _strip_frontmatter(md_text)
    title = _extract_h1(stripped)
    # Remove the H1 line itself before sectioning so the preamble doesn't
    # carry the heading text twice.
    if title:
        stripped = _H1_RE.sub("", stripped, count=1).lstrip("\n")

    out: list[Chunk] = []
    used_slugs: set[str] = set()
    for h2_title, section_body in _split_h2_sections(stripped):
        base_slug = _slug(h2_title) if h2_title else "_preamble"
        # Disambiguate colliding slugs deterministically; looping (rather
        # than appending a plain counter) also avoids clashing with a real
        # heading that happens to slug to e.g. "example-2".
        h2_slug = base_slug
        suffix = 2
        while h2_slug in used_slugs:
            h2_slug = f"{base_slug}-{suffix}"
            suffix += 1
        used_slugs.add(h2_slug)

        if approx_token_count(section_body) <= MAX_TOKENS:
            pieces = [(f"{knowledge_id}::{h2_slug}", section_body)]
        else:
            # Over-long → sliding-window split
            pieces = [
                (f"{knowledge_id}::{h2_slug}::w{w_idx}", window)
                for w_idx, window in enumerate(_sliding_split(section_body))
            ]

        out.extend(
            _build_chunk(
                chunk_id=chunk_id,
                knowledge_id=knowledge_id,
                category=category,
                title=title,
                h2_title=h2_title,
                section_body=piece,
                source_md=source_md,
                source_yml=source_yml,
            )
            for chunk_id, piece in pieces
        )
    return out
212
+
213
+
214
# Runs of anything other than lowercase ASCII letters, digits or CJK
# ideographs collapse into a single separator.
_SLUG_RE = re.compile(r"[^a-z0-9一-鿿]+")


def _slug(text: str) -> str:
    """Lower-case ``text`` and reduce it to a ``-``-separated slug.

    Falls back to ``"section"`` when nothing slug-worthy remains.
    """
    collapsed = _SLUG_RE.sub("-", text.lower())
    trimmed = collapsed.strip("-")
    return trimmed if trimmed else "section"
219
+
220
+
221
def _build_chunk(
    *,
    chunk_id: str,
    knowledge_id: str,
    category: str,
    title: str,
    h2_title: str,
    section_body: str,
    source_md: str,
    source_yml: str,
) -> Chunk:
    """Assemble a :class:`Chunk`, deriving its embed text and text hash.

    The embed text is the section body preceded by an anchor line —
    ``"<knowledge_id> / <h2_title>"``, or just the knowledge id for the
    untitled preamble — so the embedding keeps the "which doc / which
    section" context even for generic bodies. ``text_hash`` is the first
    16 hex digits of the SHA-1 of that text.
    """
    anchor = knowledge_id
    if h2_title:
        anchor = f"{knowledge_id} / {h2_title}"
    embed_text = f"{anchor}\n\n{section_body}"

    # SHA-1 is used purely as a change-detection fingerprint here.
    digest = hashlib.sha1(embed_text.encode("utf-8"), usedforsecurity=False)

    return Chunk(
        chunk_id=chunk_id,
        knowledge_id=knowledge_id,
        category=category,
        title=title,
        h2_title=h2_title,
        text=embed_text,
        source_md=source_md,
        source_yml=source_yml,
        text_hash=digest.hexdigest()[:16],
    )
248
+
249
+
250
+ # ---------- knowledge-base traversal ----------
251
+
252
+
253
def chunk_knowledge_base(project_root: Path) -> list[Chunk]:
    """Chunk every ``<project_root>/knowledge-base/<category>/*.md`` file.

    Categories are visited in ``KNOWLEDGE_CATEGORIES`` order and files in
    sorted order within each category. The file stem is used as the
    knowledge id.

    Missing directories and unreadable files are skipped rather than
    raised. "Unreadable" includes files that are not valid UTF-8: one badly
    encoded document must not abort chunking of the whole knowledge base.

    Returns:
        All chunks from all readable files; ``[]`` when there is no
        ``knowledge-base`` directory.
    """
    kb_root = project_root / "knowledge-base"
    if not kb_root.is_dir():
        return []
    out: list[Chunk] = []
    for category in KNOWLEDGE_CATEGORIES:
        cat_dir = kb_root / category
        if not cat_dir.is_dir():
            continue
        for md_file in sorted(cat_dir.glob("*.md")):
            try:
                md_text = md_file.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                # UnicodeDecodeError is a ValueError, not an OSError, so it
                # has to be listed explicitly to keep the walk best-effort.
                continue
            knowledge_id = md_file.stem
            source_md = str(md_file.relative_to(project_root))
            source_yml = f".ai-memory/knowledge/{category}/{knowledge_id}.yml"
            out.extend(
                chunk_markdown(
                    md_text,
                    knowledge_id=knowledge_id,
                    category=category,
                    source_md=source_md,
                    source_yml=source_yml,
                )
            )
    return out