codemap-semantic-index 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codemap_semantic_index-0.1.0/.gitignore +43 -0
- codemap_semantic_index-0.1.0/CHANGELOG.md +86 -0
- codemap_semantic_index-0.1.0/PKG-INFO +76 -0
- codemap_semantic_index-0.1.0/README.md +52 -0
- codemap_semantic_index-0.1.0/pyproject.toml +46 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/__init__.py +13 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/chunker.py +283 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/cli.py +446 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/config.py +174 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/__init__.py +18 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/base.py +37 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/factory.py +46 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/local.py +83 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/openai_compat.py +122 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/embedding/presets.py +110 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/indexer.py +149 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/recall_hook.py +193 -0
- codemap_semantic_index-0.1.0/src/codemap_semantic_index/store.py +246 -0
- codemap_semantic_index-0.1.0/tests/__init__.py +0 -0
- codemap_semantic_index-0.1.0/tests/test_chunker.py +144 -0
- codemap_semantic_index-0.1.0/tests/test_cli.py +169 -0
- codemap_semantic_index-0.1.0/tests/test_config.py +102 -0
- codemap_semantic_index-0.1.0/tests/test_indexer.py +170 -0
- codemap_semantic_index-0.1.0/tests/test_openai_compat.py +177 -0
- codemap_semantic_index-0.1.0/tests/test_recall_hook.py +163 -0
- codemap_semantic_index-0.1.0/tests/test_store.py +133 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Build artifacts
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
*.egg-info/
|
|
10
|
+
*.egg
|
|
11
|
+
.eggs/
|
|
12
|
+
|
|
13
|
+
# Test / coverage
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
.coverage
|
|
16
|
+
.coverage.*
|
|
17
|
+
htmlcov/
|
|
18
|
+
coverage.xml
|
|
19
|
+
.tox/
|
|
20
|
+
.mypy_cache/
|
|
21
|
+
.ruff_cache/
|
|
22
|
+
.benchmarks/
|
|
23
|
+
|
|
24
|
+
# Virtualenv
|
|
25
|
+
.venv/
|
|
26
|
+
venv/
|
|
27
|
+
env/
|
|
28
|
+
|
|
29
|
+
# uv / pdm lockfiles (commit uv.lock once we settle)
|
|
30
|
+
# uv.lock
|
|
31
|
+
|
|
32
|
+
# IDE
|
|
33
|
+
.idea/
|
|
34
|
+
.vscode/
|
|
35
|
+
*.swp
|
|
36
|
+
*.swo
|
|
37
|
+
|
|
38
|
+
# OS
|
|
39
|
+
.DS_Store
|
|
40
|
+
Thumbs.db
|
|
41
|
+
|
|
42
|
+
# CodeMap own index when dogfooding
|
|
43
|
+
.codemap/
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Changelog — codemap-semantic-index
|
|
2
|
+
|
|
3
|
+
This plugin's version is **independent** of `codemap-core` lockstep —
|
|
4
|
+
it's an opt-in semantic ranker, not part of the L1 indexing core.
|
|
5
|
+
|
|
6
|
+
## 0.1.0 (2026-06-27)
|
|
7
|
+
|
|
8
|
+
First release. Closes AI-EDS roadmap **P1-3**.
|
|
9
|
+
|
|
10
|
+
### What it does
|
|
11
|
+
|
|
12
|
+
Registers an embedding-based ranker into `codemap recall` via the
|
|
13
|
+
`codemap.recall_hooks` entry-point group (introduced in `codemap-core`
|
|
14
|
+
0.4.1). `codemap-aimemory` RRF-fuses our ranking with its token
|
|
15
|
+
ranking and multiplies by freshness (P4-2), so installing this plugin
|
|
16
|
+
upgrades recall from token-only to hybrid semantic + token + freshness
|
|
17
|
+
with zero user code change.
|
|
18
|
+
|
|
19
|
+
### Modules
|
|
20
|
+
|
|
21
|
+
- `chunker` — markdown → chunks. Splits on `##` headings; over-long
|
|
22
|
+
sections re-split with sliding windows (500 tokens / 50 overlap).
|
|
23
|
+
Every chunk text is prefixed `"<knowledge_id> / <h2_title>\n\n..."`
|
|
24
|
+
so embeddings have an anchor to the source doc.
|
|
25
|
+
- `store` — atomic on-disk store under `.ai-memory/_semantic/`:
|
|
26
|
+
`chunks.json` (model-independent metadata) + `vectors.npy`
|
|
27
|
+
(model-specific 1024-dim float32) + `model_id.txt` (active
|
|
28
|
+
backend fingerprint) + `manifest.json` (text_hash → chunk_id for
|
|
29
|
+
incremental).
|
|
30
|
+
- `config` — `~/.config/codemap/embedding.yaml` reader/writer; chmod 600.
|
|
31
|
+
- `embedding/local.py` — sentence-transformers wrapper, default
|
|
32
|
+
`Qwen/Qwen3-Embedding-0.6B` (1024-dim, 32k context, same-source as
|
|
33
|
+
Qwen cloud text-embedding-v3). Lazy-imports `sentence_transformers`
|
|
34
|
+
so plain `--help` doesn't pay torch boot cost.
|
|
35
|
+
- `embedding/openai_compat.py` — `POST {base_url}/embeddings` over
|
|
36
|
+
httpx. Handles 4 preset providers (Qwen / OpenAI / Zhipu / Voyage)
|
|
37
|
+
+ custom (self-hosted vLLM / Ollama / TEI / Jina).
|
|
38
|
+
- `indexer` — `rebuild_index` (full) + `incremental_index` (hash-diff;
|
|
39
|
+
only re-encode chunks whose text changed); refuses on model mismatch.
|
|
40
|
+
- `recall_hook` — entry-point function. Loads the on-disk store, encodes
|
|
41
|
+
the query, computes cosine similarities (vectors are L2-normalised so
|
|
42
|
+
a plain dot-product suffices), aggregates chunks → knowledge_id (best chunk wins),
|
|
43
|
+
returns hook-contract-shaped candidates with freshness already
|
|
44
|
+
computed. Failure modes (no store / model mismatch / network down)
|
|
45
|
+
all silently return `[]` so recall never crashes.
|
|
46
|
+
|
|
47
|
+
### CLI — `codemap embed`
|
|
48
|
+
|
|
49
|
+
11 sub-commands organised in two groups:
|
|
50
|
+
|
|
51
|
+
- `codemap embed [--rebuild | --incremental | --dry-run | --project P]`
|
|
52
|
+
— main embed pipeline; default is incremental.
|
|
53
|
+
- `codemap embed install [<model_id>]` — interactive picker (3 preset
|
|
54
|
+
candidates + custom) or direct install.
|
|
55
|
+
- `codemap embed list` — show locally downloaded HF models, mark active.
|
|
56
|
+
- `codemap embed use <model_id>` — switch active local model; prints
|
|
57
|
+
rebuild hint.
|
|
58
|
+
- `codemap embed backend set [--provider P --api-key K --base-url U
|
|
59
|
+
--model M --dimensions N]` — configure local or cloud backend.
|
|
60
|
+
Interactive picker when no `--provider`. Auto-fills base_url / model /
|
|
61
|
+
dimensions from preset.
|
|
62
|
+
- `codemap embed backend show` — print effective config (api key masked).
|
|
63
|
+
- `codemap embed backend reset` — back to local defaults.
|
|
64
|
+
- `codemap embed backend path` — print config file location.
|
|
65
|
+
|
|
66
|
+
### Dependencies
|
|
67
|
+
|
|
68
|
+
- `codemap-core>=0.4.1` (entry-point group)
|
|
69
|
+
- `codemap-aimemory>=0.4.1` (freshness + recall infrastructure reused)
|
|
70
|
+
- `numpy>=1.24` (vector math)
|
|
71
|
+
- `httpx>=0.27` (cloud backend HTTP)
|
|
72
|
+
- `pyyaml>=6.0`, `typer>=0.12`
|
|
73
|
+
- `sentence-transformers>=3.0` — default install includes this (pulls
|
|
74
|
+
torch ~200MB) so `codemap embed install` works out of the box
|
|
75
|
+
|
|
76
|
+
### Tests
|
|
77
|
+
|
|
78
|
+
66 unit tests covering chunker (10) + store (12) + config (9) +
|
|
79
|
+
openai_compat backend (8) + indexer (8) + recall_hook (8) + cli (11).
|
|
80
|
+
All deterministic — no network, no real embedding model download in
|
|
81
|
+
tests (uses a hashing fake backend that produces stable 4-dim unit
|
|
82
|
+
vectors).
|
|
83
|
+
|
|
84
|
+
### Design doc
|
|
85
|
+
|
|
86
|
+
`Obsidian/Notes/07-Ideas/AI-Enterprise-Delivery-System/2026-06-27-p1-3-codemap-semantic-index-设计方案.md`
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codemap-semantic-index
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Embedding-based semantic recall hook for CodeMap — registers into `codemap recall` via the recall_hooks entry-point group and adds vector search over .ai-memory/knowledge/*.yml
|
|
5
|
+
Project-URL: Homepage, https://github.com/qxbyte/codemap
|
|
6
|
+
Author: CodeMap Contributors
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: ai-memory,codemap,embedding,rag,semantic-search
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Topic :: Software Development
|
|
12
|
+
Requires-Python: >=3.11
|
|
13
|
+
Requires-Dist: codemap-aimemory>=0.4.1
|
|
14
|
+
Requires-Dist: codemap-core>=0.4.1
|
|
15
|
+
Requires-Dist: httpx>=0.27
|
|
16
|
+
Requires-Dist: numpy>=1.24
|
|
17
|
+
Requires-Dist: pyyaml>=6.0
|
|
18
|
+
Requires-Dist: sentence-transformers>=3.0
|
|
19
|
+
Requires-Dist: typer>=0.12
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest-cov>=6.0; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# codemap-semantic-index
|
|
26
|
+
|
|
27
|
+
Embedding-based semantic recall plugin for [codemap](https://github.com/qxbyte/codemap).
|
|
28
|
+
|
|
29
|
+
Registers an embedding ranker into `codemap recall` via the `codemap.recall_hooks` entry-point group (introduced in `codemap-core` 0.4.1). `codemap-aimemory` automatically RRF-fuses the embedding ranking with its token ranking and applies freshness decay (P4-2).
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pipx inject codemap codemap-semantic-index
|
|
35
|
+
# pulls sentence-transformers + torch (~200MB)
|
|
36
|
+
|
|
37
|
+
# Pick + download a local model (1.2GB default)
|
|
38
|
+
codemap embed install # interactive picker
|
|
39
|
+
codemap embed install BAAI/bge-m3 # direct
|
|
40
|
+
|
|
41
|
+
# First embed (writes <project>/.ai-memory/_semantic/)
|
|
42
|
+
codemap embed
|
|
43
|
+
|
|
44
|
+
# Now `codemap recall` does double-path (token + embedding) + RRF + freshness
|
|
45
|
+
codemap recall '<query>' --with-content
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Default model
|
|
49
|
+
|
|
50
|
+
`Qwen/Qwen3-Embedding-0.6B` (1024 dim, 32k context, 1.2GB). Same-source training as the Qwen cloud `text-embedding-v3`, so switching to cloud preserves recall "feel".
|
|
51
|
+
|
|
52
|
+
## Cloud backend (any OpenAI-compatible embedding API)
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
codemap embed backend set # interactive picker (qwen / openai / zhipu / voyage / custom)
|
|
56
|
+
codemap embed backend show
|
|
57
|
+
codemap embed backend reset # back to local
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Four preset providers + `custom` for self-hosted vLLM / Ollama / TEI. Config persists to `~/.config/codemap/embedding.yaml` (chmod 600).
|
|
61
|
+
|
|
62
|
+
## Storage
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
<project_root>/.ai-memory/_semantic/
|
|
66
|
+
├── chunks.json chunked text + metadata (model-independent)
|
|
67
|
+
├── vectors.npy (n_chunks, 1024) float32 (model-specific)
|
|
68
|
+
├── model_id.txt active backend fingerprint
|
|
69
|
+
└── manifest.json text_hash → chunk_id (drives incremental)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Switching models requires `codemap embed --rebuild` (different vector spaces are not comparable).
|
|
73
|
+
|
|
74
|
+
## Design doc
|
|
75
|
+
|
|
76
|
+
`Obsidian/Notes/07-Ideas/AI-Enterprise-Delivery-System/2026-06-27-p1-3-codemap-semantic-index-设计方案.md` — full spec including chunker / RRF / freshness integration.
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# codemap-semantic-index
|
|
2
|
+
|
|
3
|
+
Embedding-based semantic recall plugin for [codemap](https://github.com/qxbyte/codemap).
|
|
4
|
+
|
|
5
|
+
Registers an embedding ranker into `codemap recall` via the `codemap.recall_hooks` entry-point group (introduced in `codemap-core` 0.4.1). `codemap-aimemory` automatically RRF-fuses the embedding ranking with its token ranking and applies freshness decay (P4-2).
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pipx inject codemap codemap-semantic-index
|
|
11
|
+
# pulls sentence-transformers + torch (~200MB)
|
|
12
|
+
|
|
13
|
+
# Pick + download a local model (1.2GB default)
|
|
14
|
+
codemap embed install # interactive picker
|
|
15
|
+
codemap embed install BAAI/bge-m3 # direct
|
|
16
|
+
|
|
17
|
+
# First embed (writes <project>/.ai-memory/_semantic/)
|
|
18
|
+
codemap embed
|
|
19
|
+
|
|
20
|
+
# Now `codemap recall` does double-path (token + embedding) + RRF + freshness
|
|
21
|
+
codemap recall '<query>' --with-content
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Default model
|
|
25
|
+
|
|
26
|
+
`Qwen/Qwen3-Embedding-0.6B` (1024 dim, 32k context, 1.2GB). Same-source training as the Qwen cloud `text-embedding-v3`, so switching to cloud preserves recall "feel".
|
|
27
|
+
|
|
28
|
+
## Cloud backend (any OpenAI-compatible embedding API)
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
codemap embed backend set # interactive picker (qwen / openai / zhipu / voyage / custom)
|
|
32
|
+
codemap embed backend show
|
|
33
|
+
codemap embed backend reset # back to local
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Four preset providers + `custom` for self-hosted vLLM / Ollama / TEI. Config persists to `~/.config/codemap/embedding.yaml` (chmod 600).
|
|
37
|
+
|
|
38
|
+
## Storage
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
<project_root>/.ai-memory/_semantic/
|
|
42
|
+
├── chunks.json chunked text + metadata (model-independent)
|
|
43
|
+
├── vectors.npy (n_chunks, 1024) float32 (model-specific)
|
|
44
|
+
├── model_id.txt active backend fingerprint
|
|
45
|
+
└── manifest.json text_hash → chunk_id (drives incremental)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Switching models requires `codemap embed --rebuild` (different vector spaces are not comparable).
|
|
49
|
+
|
|
50
|
+
## Design doc
|
|
51
|
+
|
|
52
|
+
`Obsidian/Notes/07-Ideas/AI-Enterprise-Delivery-System/2026-06-27-p1-3-codemap-semantic-index-设计方案.md` — full spec including chunker / RRF / freshness integration.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.21"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "codemap-semantic-index"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Embedding-based semantic recall hook for CodeMap — registers into `codemap recall` via the recall_hooks entry-point group and adds vector search over .ai-memory/knowledge/*.yml"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "CodeMap Contributors" }]
|
|
13
|
+
keywords = ["codemap", "embedding", "semantic-search", "ai-memory", "rag"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Topic :: Software Development",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"codemap-core>=0.4.1",
|
|
21
|
+
"codemap-aimemory>=0.4.1",
|
|
22
|
+
"numpy>=1.24",
|
|
23
|
+
"httpx>=0.27",
|
|
24
|
+
"pyyaml>=6.0",
|
|
25
|
+
"typer>=0.12",
|
|
26
|
+
# sentence-transformers (and torch) is the default local backend; pulled
|
|
27
|
+
# by default so `codemap embed install` works out of the box. Users who
|
|
28
|
+
# only want the cloud backend can drop torch with --no-deps gymnastics
|
|
29
|
+
# at their own risk.
|
|
30
|
+
"sentence-transformers>=3.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
dev = ["pytest>=8.0", "pytest-cov>=6.0"]
|
|
35
|
+
|
|
36
|
+
[project.entry-points."codemap.cli_commands"]
|
|
37
|
+
embed = "codemap_semantic_index.cli:register"
|
|
38
|
+
|
|
39
|
+
[project.entry-points."codemap.recall_hooks"]
|
|
40
|
+
semantic = "codemap_semantic_index.recall_hook:rank"
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://github.com/qxbyte/codemap"
|
|
44
|
+
|
|
45
|
+
[tool.hatch.build.targets.wheel]
|
|
46
|
+
packages = ["src/codemap_semantic_index"]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""codemap-semantic-index — embedding-based semantic recall for codemap.
|
|
2
|
+
|
|
3
|
+
Registers via two entry-point groups:
|
|
4
|
+
|
|
5
|
+
* ``codemap.cli_commands.embed`` → :func:`codemap_semantic_index.cli.register`
|
|
6
|
+
adds the ``codemap embed`` subcommand tree.
|
|
7
|
+
* ``codemap.recall_hooks.semantic`` → :func:`codemap_semantic_index.
|
|
8
|
+
recall_hook.rank` plugs an embedding-based ranker into ``codemap recall``;
|
|
9
|
+
``codemap-aimemory>=0.4.1`` discovers it automatically and RRF-fuses
|
|
10
|
+
the result with token recall + freshness.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
"""Markdown → chunks for the semantic index.
|
|
2
|
+
|
|
3
|
+
Source: ``<project_root>/knowledge-base/{rules,business,modules,cases,
|
|
4
|
+
pitfalls}/*.md`` (written by ``specode-distill`` 3.0+ and ``task-swarm``
|
|
5
|
+
0.6+ — see specode-distill's ``references/doc-template.md`` for the
|
|
6
|
+
human-readable templates these files follow).
|
|
7
|
+
|
|
8
|
+
Algorithm (regex-only — no markdown lib so the chunker stays a
|
|
9
|
+
dependency-free wheel of its own):
|
|
10
|
+
|
|
11
|
+
1. Strip YAML frontmatter (``---`` ... ``---``)
|
|
12
|
+
2. Read the H1 (``# ...``) as the document title
|
|
13
|
+
3. Split the body on ``^## `` headings; each section = ``(h2_title, body)``
|
|
14
|
+
4. Body sections whose token count exceeds ``MAX_TOKENS`` are split with
|
|
15
|
+
a sliding window (``WINDOW_TOKENS`` / ``WINDOW_OVERLAP``)
|
|
16
|
+
5. Each emitted chunk's text is prefixed with the title path
|
|
17
|
+
``"<knowledge_id> / <h2_title>\\n\\n<body>"`` so embedding models
|
|
18
|
+
anchor on the right doc even when the body is a generic snippet.
|
|
19
|
+
|
|
20
|
+
Token counting is approximate: 1 token ≈ 4 characters for English /
|
|
21
|
+
2 characters for Chinese. The whole pipeline tolerates being slightly
|
|
22
|
+
off — a longer chunk gets one extra sliding-window slice; nothing
|
|
23
|
+
breaks."""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import hashlib
|
|
28
|
+
import re
|
|
29
|
+
from collections.abc import Iterator
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
"MAX_TOKENS",
|
|
35
|
+
"WINDOW_OVERLAP",
|
|
36
|
+
"WINDOW_TOKENS",
|
|
37
|
+
"Chunk",
|
|
38
|
+
"approx_token_count",
|
|
39
|
+
"chunk_knowledge_base",
|
|
40
|
+
"chunk_markdown",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
#: Body sections longer than this get split into sliding windows.
|
|
44
|
+
MAX_TOKENS = 1000
|
|
45
|
+
#: Sliding window size when splitting an over-long section.
|
|
46
|
+
WINDOW_TOKENS = 500
|
|
47
|
+
#: Token overlap between adjacent windows (preserves boundary context).
|
|
48
|
+
WINDOW_OVERLAP = 50
|
|
49
|
+
|
|
50
|
+
#: Categories under ``knowledge-base/`` recognised by specode-distill v3.
|
|
51
|
+
KNOWLEDGE_CATEGORIES: tuple[str, ...] = (
|
|
52
|
+
"rules",
|
|
53
|
+
"business",
|
|
54
|
+
"modules",
|
|
55
|
+
"cases",
|
|
56
|
+
"pitfalls",
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class Chunk:
    """A single piece of text handed to the embedding model.

    The ``chunk_id`` (knowledge_id + h2 slug + optional window index) does
    not change between runs, which lets incremental embedding compare
    hashes and re-encode only those chunks whose ``text`` differs.
    """

    chunk_id: str
    knowledge_id: str
    category: str  # one of: rules / business / modules / cases / pitfalls
    title: str  # H1 heading of the markdown document
    h2_title: str  # H2 heading of the section the chunk was cut from
    text: str  # title-prefixed text given to the embedder
    source_md: str  # markdown path, relative to project_root
    source_yml: str  # matching yml path under .ai-memory/knowledge/
    text_hash: str  # sha1 of ``text``; the key used for incremental diffing

    def to_dict(self) -> dict[str, str]:
        """Return the chunk as a plain ``{field_name: value}`` mapping."""
        keys = (
            "chunk_id",
            "knowledge_id",
            "category",
            "title",
            "h2_title",
            "text",
            "source_md",
            "source_yml",
            "text_hash",
        )
        return {key: getattr(self, key) for key in keys}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ---------- core algorithm ----------
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
_FRONTMATTER_RE = re.compile(r"\A---\s*\n.*?\n---\s*\n", re.DOTALL)
# Heading patterns. The gap after the "#" marks is matched with ``[^\S\n]+``
# (any whitespace except a newline): with a plain ``\s+`` an empty heading
# line such as a bare "#" would swallow the *next* line as its title.
_H1_RE = re.compile(r"^#[^\S\n]+(.+?)\s*$", re.MULTILINE)
# Sections are split on H2 ("## ...") only. "### ..." and deeper do not
# match because "##" must be followed by whitespace, so H3+ headings stay
# inside their parent section.
_H2_SPLIT_RE = re.compile(r"^##[^\S\n]+(.+?)\s*$", re.MULTILINE)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _strip_frontmatter(text: str) -> str:
    """Return ``text`` without its leading ``---`` ... ``---`` frontmatter block, if any."""
    # The pattern is anchored at the very start of the text, so a single
    # match attempt at position 0 is all that is needed.
    found = _FRONTMATTER_RE.match(text)
    if found is None:
        return text
    return text[found.end() :]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _extract_h1(text: str) -> str:
    """Return the title of the first H1 heading in ``text``, or ``""`` if there is none."""
    heading = _H1_RE.search(text)
    if heading is None:
        return ""
    return heading.group(1).strip()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _split_h2_sections(body: str) -> list[tuple[str, str]]:
    """Cut ``body`` into ``[(h2_title, section_body), ...]`` pairs.

    Text that precedes the first H2 is returned under the empty title
    ``""``. Any section (or preamble) whose body is blank after stripping
    is left out of the result.
    """
    headings = list(_H2_SPLIT_RE.finditer(body))
    first_heading_at = headings[0].start() if headings else len(body)

    sections: list[tuple[str, str]] = []
    lead_in = body[:first_heading_at].strip()
    if lead_in:
        sections.append(("", lead_in))

    # Each section runs from the end of its heading line up to the start of
    # the next heading, or to the end of the text for the last one.
    stops = [h.start() for h in headings[1:]] + [len(body)]
    for heading, stop in zip(headings, stops):
        content = body[heading.end() : stop].strip()
        if content:
            sections.append((heading.group(1).strip(), content))
    return sections
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def approx_token_count(text: str) -> int:
    """Estimate the token count of ``text`` without running a tokenizer.

    Characters in the range U+4E00..U+9FFF are counted at two per token and
    every other character at four per token. The result is never below 1,
    so an empty string still reports one token.
    """
    cjk_chars = 0
    for ch in text:
        if "一" <= ch <= "鿿":
            cjk_chars += 1
    non_cjk_chars = len(text) - cjk_chars
    estimate = cjk_chars // 2 + non_cjk_chars // 4
    return estimate if estimate > 1 else 1
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _sliding_split(text: str) -> Iterator[str]:
    """Yield overlapping windows of roughly ``WINDOW_TOKENS`` tokens each.

    Window sizes are measured in characters, derived from the same ratios
    as the token estimator: text in which more than half of the characters
    fall in U+4E00..U+9FFF is budgeted at 2 characters per token, anything
    else at 4. A CJK-majority section therefore gets windows *half* as many
    characters long as an ASCII one for the same token budget.

    Consecutive windows share ``WINDOW_OVERLAP`` tokens' worth of
    characters. Nothing is yielded for empty ``text``.
    """
    cjk_chars = sum(1 for ch in text if "一" <= ch <= "鿿")
    cjk_ratio = cjk_chars / max(1, len(text))
    chars_per_token = 2 if cjk_ratio > 0.5 else 4
    window_chars = WINDOW_TOKENS * chars_per_token
    overlap_chars = WINDOW_OVERLAP * chars_per_token
    # Guard against a non-positive step if the overlap were ever configured
    # to be as large as the window.
    step = max(1, window_chars - overlap_chars)
    for start in range(0, len(text), step):
        yield text[start : start + window_chars]
        # Stop once a window reaches the end of the text; a further step
        # would only emit a slice already contained in this window.
        if start + window_chars >= len(text):
            return
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def chunk_markdown(
    md_text: str,
    *,
    knowledge_id: str,
    category: str,
    source_md: str,
    source_yml: str,
) -> list[Chunk]:
    """Turn one markdown document into a list of :class:`Chunk` objects.

    The frontmatter and the H1 line are removed, the remainder is split on
    H2 headings, and any section whose estimated token count exceeds
    ``MAX_TOKENS`` is further cut into sliding windows.

    Args:
        md_text: Raw markdown text of the document.
        knowledge_id: Identifier of the document; first component of every
            ``chunk_id`` and of the text prefix.
        category: Knowledge-base category, copied onto each chunk.
        source_md: Path of the markdown file, copied onto each chunk.
        source_yml: Path of the twin yml file, copied onto each chunk.

    Returns:
        Chunks in document order. IDs have the form
        ``<knowledge_id>::<h2_slug>`` or, for windowed sections,
        ``<knowledge_id>::<h2_slug>::w<index>``. When two sections of the
        same document yield the same slug, later ones get a ``-2``, ``-3``,
        ... suffix so every ``chunk_id`` within a document is unique.
    """
    stripped = _strip_frontmatter(md_text)
    title = _extract_h1(stripped)
    # Remove the H1 line itself before sectioning so the preamble doesn't
    # carry the heading text twice.
    if title:
        stripped = _H1_RE.sub("", stripped, count=1).lstrip("\n")

    out: list[Chunk] = []
    used_slugs: set[str] = set()
    for h2_title, section_body in _split_h2_sections(stripped):
        base_slug = _slug(h2_title) if h2_title else "_preamble"
        # Repeated headings (or different headings that slugify alike)
        # would otherwise share a chunk_id. The first occurrence keeps the
        # plain slug; later ones are numbered.
        h2_slug = base_slug
        suffix = 1
        while h2_slug in used_slugs:
            suffix += 1
            h2_slug = f"{base_slug}-{suffix}"
        used_slugs.add(h2_slug)

        if approx_token_count(section_body) <= MAX_TOKENS:
            pieces = [(f"{knowledge_id}::{h2_slug}", section_body)]
        else:
            # Over-long → sliding-window split
            pieces = [
                (f"{knowledge_id}::{h2_slug}::w{w_idx}", window)
                for w_idx, window in enumerate(_sliding_split(section_body))
            ]
        out.extend(
            _build_chunk(
                chunk_id=chunk_id,
                knowledge_id=knowledge_id,
                category=category,
                title=title,
                h2_title=h2_title,
                section_body=piece,
                source_md=source_md,
                source_yml=source_yml,
            )
            for chunk_id, piece in pieces
        )
    return out
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
_SLUG_RE = re.compile(r"[^a-z0-9一-鿿]+")


def _slug(text: str) -> str:
    """Reduce a heading to a lowercase, dash-separated identifier.

    Each run of characters other than ``a-z``, ``0-9`` and U+4E00..U+9FFF
    becomes a single ``-``; leading and trailing dashes are trimmed. If
    nothing is left, the fallback ``"section"`` is returned.
    """
    collapsed = _SLUG_RE.sub("-", text.lower())
    trimmed = collapsed.strip("-")
    if not trimmed:
        return "section"
    return trimmed
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _build_chunk(
    *,
    chunk_id: str,
    knowledge_id: str,
    category: str,
    title: str,
    h2_title: str,
    section_body: str,
    source_md: str,
    source_yml: str,
) -> Chunk:
    """Assemble a :class:`Chunk`, deriving its prefixed text and text hash."""
    # Lead the text with "<knowledge_id> / <h2_title>" (or just the
    # knowledge_id for the preamble) so the embedding keeps a "which doc /
    # which section" anchor even when the body is a generic sentence.
    if h2_title:
        heading_path = f"{knowledge_id} / {h2_title}"
    else:
        heading_path = knowledge_id
    text = f"{heading_path}\n\n{section_body}"
    # sha1 serves purely as a change-detection fingerprint here; only the
    # first 16 hex characters are kept.
    digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False)
    return Chunk(
        chunk_id=chunk_id,
        knowledge_id=knowledge_id,
        category=category,
        title=title,
        h2_title=h2_title,
        text=text,
        source_md=source_md,
        source_yml=source_yml,
        text_hash=digest.hexdigest()[:16],
    )
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ---------- knowledge-base traversal ----------
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def chunk_knowledge_base(project_root: Path) -> list[Chunk]:
    """Chunk every ``knowledge-base/<category>/*.md`` file under a project.

    Categories are visited in ``KNOWLEDGE_CATEGORIES`` order and files
    within a category in sorted path order, so the output order is
    deterministic for a given tree.

    Missing directories and unreadable files are skipped rather than
    raised, in line with codemap's "missing inputs degrade gracefully"
    stance. "Unreadable" covers both I/O failures and files that are not
    valid UTF-8.

    Args:
        project_root: Directory that contains ``knowledge-base/``.

    Returns:
        All chunks from all readable files; ``[]`` when there is no
        ``knowledge-base`` directory.
    """
    kb_root = project_root / "knowledge-base"
    if not kb_root.is_dir():
        return []
    out: list[Chunk] = []
    for category in KNOWLEDGE_CATEGORIES:
        cat_dir = kb_root / category
        if not cat_dir.is_dir():
            continue
        for md_file in sorted(cat_dir.glob("*.md")):
            try:
                md_text = md_file.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                # UnicodeDecodeError is a ValueError, not an OSError: without
                # it one badly-encoded file would abort the whole walk.
                continue
            # The file stem doubles as the knowledge id and names the twin
            # yml under .ai-memory/knowledge/<category>/.
            knowledge_id = md_file.stem
            source_md = str(md_file.relative_to(project_root))
            source_yml = f".ai-memory/knowledge/{category}/{knowledge_id}.yml"
            out.extend(
                chunk_markdown(
                    md_text,
                    knowledge_id=knowledge_id,
                    category=category,
                    source_md=source_md,
                    source_yml=source_yml,
                )
            )
    return out
|