@naveenadi/mnemonic 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +93 -0
- package/SKILL.md +158 -0
- package/bin/mne +8 -0
- package/dist/chunker/index.d.ts +8 -0
- package/dist/chunker/index.d.ts.map +1 -0
- package/dist/chunker/index.js +157 -0
- package/dist/chunker/index.js.map +1 -0
- package/dist/cli/commands.d.ts +2 -0
- package/dist/cli/commands.d.ts.map +1 -0
- package/dist/cli/commands.js +793 -0
- package/dist/cli/commands.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +7 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/mne.d.ts +3 -0
- package/dist/cli/mne.d.ts.map +1 -0
- package/dist/cli/mne.js +7 -0
- package/dist/cli/mne.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +15 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/factory.d.ts +15 -0
- package/dist/llm/factory.d.ts.map +1 -0
- package/dist/llm/factory.js +60 -0
- package/dist/llm/factory.js.map +1 -0
- package/dist/llm/index.d.ts +19 -0
- package/dist/llm/index.d.ts.map +1 -0
- package/dist/llm/index.js +18 -0
- package/dist/llm/index.js.map +1 -0
- package/dist/llm/node-llama-cpp.d.ts +30 -0
- package/dist/llm/node-llama-cpp.d.ts.map +1 -0
- package/dist/llm/node-llama-cpp.js +143 -0
- package/dist/llm/node-llama-cpp.js.map +1 -0
- package/dist/llm/ollama.d.ts +19 -0
- package/dist/llm/ollama.d.ts.map +1 -0
- package/dist/llm/ollama.js +99 -0
- package/dist/llm/ollama.js.map +1 -0
- package/dist/mcp/server.d.ts +2 -0
- package/dist/mcp/server.d.ts.map +1 -0
- package/dist/mcp/server.js +360 -0
- package/dist/mcp/server.js.map +1 -0
- package/dist/pi-extension/index.d.ts +3 -0
- package/dist/pi-extension/index.d.ts.map +1 -0
- package/dist/pi-extension/index.js +193 -0
- package/dist/pi-extension/index.js.map +1 -0
- package/dist/search/fts.d.ts +17 -0
- package/dist/search/fts.d.ts.map +1 -0
- package/dist/search/fts.js +110 -0
- package/dist/search/fts.js.map +1 -0
- package/dist/search/fusion.d.ts +33 -0
- package/dist/search/fusion.d.ts.map +1 -0
- package/dist/search/fusion.js +97 -0
- package/dist/search/fusion.js.map +1 -0
- package/dist/search/pipeline.d.ts +34 -0
- package/dist/search/pipeline.d.ts.map +1 -0
- package/dist/search/pipeline.js +216 -0
- package/dist/search/pipeline.js.map +1 -0
- package/dist/search/vector.d.ts +20 -0
- package/dist/search/vector.d.ts.map +1 -0
- package/dist/search/vector.js +70 -0
- package/dist/search/vector.js.map +1 -0
- package/dist/store/collections.d.ts +23 -0
- package/dist/store/collections.d.ts.map +1 -0
- package/dist/store/collections.js +99 -0
- package/dist/store/collections.js.map +1 -0
- package/dist/store/database.d.ts +22 -0
- package/dist/store/database.d.ts.map +1 -0
- package/dist/store/database.js +209 -0
- package/dist/store/database.js.map +1 -0
- package/dist/store/documents.d.ts +68 -0
- package/dist/store/documents.d.ts.map +1 -0
- package/dist/store/documents.js +273 -0
- package/dist/store/documents.js.map +1 -0
- package/dist/types.d.ts +193 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +58 -0
package/README.md
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# mnemonic
|
|
2
|
+
|
|
3
|
+
On-device hybrid search for markdown knowledge bases. BM25 + vector + LLM reranking with link graphs, time decay, and HyDE. Designed for the [pi](https://github.com/earendil-works/pi) coding agent.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
npm install -g @naveenadi/mnemonic
|
|
7
|
+
mne init
|
|
8
|
+
mne collection add ~/notes --name notes
|
|
9
|
+
mne index
|
|
10
|
+
mne embed
|
|
11
|
+
mne query "what was the Q4 planning discussion"
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Features
|
|
15
|
+
|
|
16
|
+
- **Hybrid search** — BM25 (FTS5) + Vector embeddings + RRF fusion
|
|
17
|
+
- **Structured queries** — `intent:`, `lex:`, `vec:`, `hyde:` fields for deliberate retrieval
|
|
18
|
+
- **Query expansion** — LLM generates alternative phrasings for better recall
|
|
19
|
+
- **HyDE** — Hypothetical Document Embeddings
|
|
20
|
+
- **LLM reranking** — Cross-encoder re-ranks top candidates with position-aware blending
|
|
21
|
+
- **Link graph** — Wikilinks, backlinks, orphan detection, link boosting
|
|
22
|
+
- **Time decay** — Exponential recency weighting (favor recent notes)
|
|
23
|
+
- **Tagging** — Manual + frontmatter auto-parse
|
|
24
|
+
- **Context tree** — Hierarchical metadata (`mne://` virtual paths)
|
|
25
|
+
- **Smart chunking** — Markdown heading-aware boundaries
|
|
26
|
+
- **Dual LLM backend** — Ollama (default) or node-llama-cpp (self-contained GGUF models)
|
|
27
|
+
|
|
28
|
+
## Pi Integration
|
|
29
|
+
|
|
30
|
+
Three layers:
|
|
31
|
+
|
|
32
|
+
| Layer | What | How |
|
|
33
|
+
|---|---|---|
|
|
34
|
+
| **Pi Skill** | `SKILL.md` | Bash commands via `mne search`, `mne query`, `mne get` |
|
|
35
|
+
| **MCP Server** | `mne mcp` | Stdio + HTTP, typed tools: `query`, `get`, `multi_get`, `status` |
|
|
36
|
+
| **Pi Extension** | `src/pi-extension/index.ts` | 4 custom tools registered with `pi.registerTool()` |
|
|
37
|
+
|
|
38
|
+
Configure MCP in `~/.pi/agent/mcp.json`:
|
|
39
|
+
|
|
40
|
+
```json
|
|
41
|
+
{
|
|
42
|
+
"mcpServers": {
|
|
43
|
+
"mnemonic": {
|
|
44
|
+
"command": "mne",
|
|
45
|
+
"args": ["mcp"],
|
|
46
|
+
"lifecycle": "keep-alive"
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Architecture
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
Core SDK (@naveenadi/mnemonic)
|
|
56
|
+
Store (SQLite FTS5 + vec) | Search Pipeline | Chunker
|
|
57
|
+
LLM Backend (Ollama <-> node-llama-cpp)
|
|
58
|
+
Link Graph | Time Decay | HyDE
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
Query ──► HyDE ──► Query Expansion ──► BM25 + Vector (per variant)
|
|
63
|
+
│
|
|
64
|
+
└──► RRF Fusion ──► Reranking ──► Time Decay ──► Link Boost ──► Results
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## CLI Commands
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
mne init Initialize index
|
|
71
|
+
mne collection add <dir> Add a collection
|
|
72
|
+
mne collection list List collections
|
|
73
|
+
mne index Index all collections
|
|
74
|
+
mne embed Generate vector embeddings
|
|
75
|
+
mne search <query> BM25 full-text search
|
|
76
|
+
mne vsearch <query> Vector semantic search
|
|
77
|
+
mne query <query> Hybrid search (BM25 + vector + reranking)
|
|
78
|
+
mne get <#docid|path> Retrieve a document
|
|
79
|
+
mne multi-get <pattern> Batch retrieve
|
|
80
|
+
mne ls [collection] List files
|
|
81
|
+
mne status Show index status
|
|
82
|
+
mne doctor Diagnostic checks
|
|
83
|
+
mne context add <path> <txt> Add context metadata
|
|
84
|
+
mne tag <#docid> <tag> Add a tag
|
|
85
|
+
mne links <#docid> Show outgoing links
|
|
86
|
+
mne backlinks <#docid> Show incoming links
|
|
87
|
+
mne orphans Find orphan documents
|
|
88
|
+
mne mcp Start MCP server
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## License
|
|
92
|
+
|
|
93
|
+
MIT
|
package/SKILL.md
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: mnemonic
|
|
3
|
+
description: Search local markdown knowledge bases, notes, docs, and wikis with mnemonic. Hybrid BM25 + vector + LLM reranking. Use when users ask to find notes, retrieve documents, or answer from indexed markdown.
|
|
4
|
+
license: MIT
|
|
5
|
+
compatibility: Requires mnemonic CLI. Install via `npm install -g @naveenadi/mnemonic`. Initialize with `mne init`.
|
|
6
|
+
metadata:
|
|
7
|
+
version: "0.1.0"
|
|
8
|
+
allowed-tools: Bash(mne:*)
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# mnemonic — On-Device Hybrid Search
|
|
12
|
+
|
|
13
|
+
## How search works
|
|
14
|
+
|
|
15
|
+
mnemonic searches local markdown collections: notes, docs, wikis, and project
|
|
16
|
+
knowledge bases. Use it before web search when the answer may already be in
|
|
17
|
+
indexed local files.
|
|
18
|
+
|
|
19
|
+
The workflow is always:
|
|
20
|
+
|
|
21
|
+
1. Check what's indexed with `mne status`.
|
|
22
|
+
2. Search for candidate documents.
|
|
23
|
+
3. Retrieve the full source with `mne get` or `mne multi-get`.
|
|
24
|
+
4. Answer from retrieved text, citing paths or docids.
|
|
25
|
+
|
|
26
|
+
Do not answer from snippets alone when the user needs facts, decisions, quotes,
|
|
27
|
+
or nuance. Snippets are only leads.
|
|
28
|
+
|
|
29
|
+
Typical loop:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
mne status
|
|
33
|
+
mne query "merchant reality support interviews" -n 5
|
|
34
|
+
# leads: #abc123 concepts/customer-proximity.md; #def432 sources/merchant-call.md
|
|
35
|
+
mne get "#abc123" --no-line-numbers
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Pick the right search mode
|
|
39
|
+
|
|
40
|
+
Use **BM25 lexical search** when you know exact words, titles, names, code
|
|
41
|
+
symbols, or rare phrases:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
mne search "cockpit OKR Goodhart" -n 10
|
|
45
|
+
mne search '"AI Before Headcount"' -c concepts -n 5
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Use **`mne query` with structured fields** when the user describes an idea
|
|
49
|
+
indirectly, uses different wording than the source, or needs conceptual recall.
|
|
50
|
+
Write the fields yourself rather than leaning on query expansion. Combine exact
|
|
51
|
+
anchors with semantic recall:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
mne query $'intent: Find the concept note about metrics as instruments without letting OKRs replace judgment.\nlex: cockpit instruments OKR Goodhart metrics judgment\nvec: data informed not metric driven product judgment\nhyde: A concept note says metrics are useful like cockpit instruments, but leaders should remain data-informed rather than metric-driven because OKRs and dashboards can Goodhart product judgment.'
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Structured query fields (you author each one):
|
|
58
|
+
|
|
59
|
+
- `intent:` states what you are trying to find **and what to avoid**. Always
|
|
60
|
+
supply this.
|
|
61
|
+
- `lex:` exact terms, aliases, titles, code symbols, and rare words.
|
|
62
|
+
- `vec:` paraphrases the idea in natural language.
|
|
63
|
+
- `hyde:` a hypothetical document that would answer the request.
|
|
64
|
+
|
|
65
|
+
You do not need all four every time, but always supply at least `intent:` plus
|
|
66
|
+
one of `lex:`/`vec:`.
|
|
67
|
+
|
|
68
|
+
## Retrieve sources
|
|
69
|
+
|
|
70
|
+
Search results include docids like `#abc123` and `mne://` paths. Fetch them:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
mne get "#abc123"
|
|
74
|
+
mne get mne://concepts/ai-before-headcount.md
|
|
75
|
+
mne multi-get "#abc123,#def432" --json
|
|
76
|
+
mne get "#abc123:120:40" # 40 lines starting at line 120
|
|
77
|
+
mne get mne://concepts/note.md -l 80 --from 200
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Output is line-numbered by default and carries the docid:
|
|
81
|
+
|
|
82
|
+
```text
|
|
83
|
+
mne://concepts/note.md #abc123
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
1: # Metrics as instruments
|
|
87
|
+
2:
|
|
88
|
+
3: Treat dashboards like cockpit instruments...
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Cite the docid and exact line numbers in your answer. Pass `--no-line-numbers`
|
|
92
|
+
only when you need raw content to copy verbatim.
|
|
93
|
+
|
|
94
|
+
## Discover what is indexed
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
mne collection list
|
|
98
|
+
mne ls
|
|
99
|
+
mne status
|
|
100
|
+
mne ls concepts # List files in a collection
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Add collection filters to scope searches:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
mne search "headcount autonomous agents" -c concepts -n 10
|
|
107
|
+
mne query "merchant support product reality" -c concepts -c sources -n 10
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Omit `-c` to search all default-included collections.
|
|
111
|
+
|
|
112
|
+
## Link graph
|
|
113
|
+
|
|
114
|
+
mnemonic tracks wikilinks (`[[target]]`) and markdown links between documents:
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
mne links "#abc123" # Outgoing links
|
|
118
|
+
mne backlinks "#abc123" # Incoming links (what links to this)
|
|
119
|
+
mne orphans # Documents with no links at all
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Use the link graph to discover related context:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
mne query "deployment" --boost-links # Boost results with many backlinks
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Setup and maintenance
|
|
129
|
+
|
|
130
|
+
Only mutate indexes when the user asked for setup or maintenance.
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
npm install -g @naveenadi/mnemonic
|
|
134
|
+
mne init
|
|
135
|
+
mne collection add ~/notes --name notes
|
|
136
|
+
mne collection add ~/Documents --name docs --mask "**/*.md"
|
|
137
|
+
mne index
|
|
138
|
+
mne embed
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Health and diagnostics:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
mne doctor
|
|
145
|
+
mne status
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Pitfalls
|
|
149
|
+
|
|
150
|
+
- **Do not stop at snippets.** Fetch documents before making claims.
|
|
151
|
+
- **Do not slice files with `sed`/`head`/`tail`.** Use the `path:from:count`
|
|
152
|
+
suffix or `--from`/`-l` flags.
|
|
153
|
+
- **Do not lean on auto query expansion.** Write `intent:`/`lex:`/`vec:`/`hyde:`
|
|
154
|
+
yourself when you know the domain context.
|
|
155
|
+
- **Do not overuse semantic search.** If you know exact titles or terms,
|
|
156
|
+
`mne search` (BM25) is faster and often better.
|
|
157
|
+
- **Do not mutate indexes casually.** `mne index` and `mne embed` change local
|
|
158
|
+
state and can be expensive.
|
package/bin/mne
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { ChunkRecord } from '../types.js';
|
|
2
|
+
/** Chunk markdown content into ~900-token pieces with smart boundaries */
|
|
3
|
+
export declare function chunkMarkdown(content: string, docid: string, options?: {
|
|
4
|
+
chunkStrategy?: 'regex' | 'auto';
|
|
5
|
+
}): ChunkRecord[];
|
|
6
|
+
/** Format a chunk for embedding: "title | text", or "title > heading | text" when the chunk has a heading that differs from the title */
|
|
7
|
+
export declare function formatChunkForEmbedding(chunk: ChunkRecord, title: string): string;
|
|
8
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/chunker/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AA2I/C,0EAA0E;AAC1E,wBAAgB,aAAa,CAC3B,OAAO,EAAE,MAAM,EACf,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE;IAAE,aAAa,CAAC,EAAE,OAAO,GAAG,MAAM,CAAA;CAAE,GAC7C,WAAW,EAAE,CA0Df;AAED,mDAAmD;AACnD,wBAAgB,uBAAuB,CACrC,KAAK,EAAE,WAAW,EAClB,KAAK,EAAE,MAAM,GACZ,MAAM,CAKR"}
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
/**
 * Break-point scores for markdown headings, keyed by the literal line prefix
 * (the hashes plus the trailing space). A higher score marks a better place
 * to cut a chunk.
 */
const HEADING_SCORES = {
    '# ': 100, // H1
    '## ': 90, // H2
    '### ': 80, // H3
    '#### ': 70, // H4
    '##### ': 60, // H5
    '###### ': 50, // H6
};
// Scores for the non-heading break candidates.
const CODE_BLOCK_SCORE = 80; // line that opens a fenced code block
const HR_SCORE = 60; // horizontal rule (--- or ***)
const BLANK_LINE_SCORE = 20; // empty or whitespace-only line
const LIST_ITEM_SCORE = 5; // bullet or numbered list item
const LINE_BREAK_SCORE = 1; // any non-empty line
/**
 * Detect all break points in markdown content with scores.
 *
 * The content is split on '\n' and walked line by line. Each candidate is
 * recorded as `{ pos, score, line }`, where `pos` is the character offset of
 * the start of the line within `content` and `line` is the zero-based line
 * index. One line may contribute several entries (for example a heading also
 * gets the generic line-break entry); they all share the same `pos`.
 *
 * Fenced code blocks (lines starting with ```) are handled specially:
 * - the line that OPENS a fence is a break candidate scored CODE_BLOCK_SCORE,
 *   so a chunk may end right before a code block;
 * - lines inside the fence and the closing fence line produce no candidates,
 *   so a code block is never split at one of its own lines.
 *
 * @param {string} content Markdown text to scan.
 * @returns {{pos: number, score: number, line: number}[]} Candidates in document order.
 */
function findBreakPoints(content) {
    const points = [];
    const lines = content.split('\n');
    let inCodeBlock = false;
    let linePos = 0;
    for (let lineNum = 0; lineNum < lines.length; lineNum++) {
        const line = lines[lineNum];
        if (/^```/.test(line)) {
            // Only the opening fence is a candidate: cutting at the closing
            // fence would leave a dangling ``` at the top of the next chunk.
            if (!inCodeBlock) {
                points.push({ pos: linePos, score: CODE_BLOCK_SCORE, line: lineNum });
            }
            inCodeBlock = !inCodeBlock;
        }
        else if (!inCodeBlock) {
            // Heading prefixes are mutually exclusive, so at most one matches.
            for (const [prefix, score] of Object.entries(HEADING_SCORES)) {
                if (line.startsWith(prefix)) {
                    points.push({ pos: linePos, score, line: lineNum });
                    break;
                }
            }
            // Horizontal rules
            if (/^(---|\*\*\*)\s*$/.test(line)) {
                points.push({ pos: linePos, score: HR_SCORE, line: lineNum });
            }
            // Blank line
            if (/^\s*$/.test(line)) {
                points.push({ pos: linePos, score: BLANK_LINE_SCORE, line: lineNum });
            }
            // List items
            if (/^(\s*[-*]\s|\s*\d+[.)]\s)/.test(line)) {
                points.push({ pos: linePos, score: LIST_ITEM_SCORE, line: lineNum });
            }
            // Every non-empty line is at least a minimal candidate.
            if (line.length > 0) {
                points.push({ pos: linePos, score: LINE_BREAK_SCORE, line: lineNum });
            }
        }
        // +1 accounts for the '\n' removed by split().
        linePos += line.length + 1;
    }
    return points;
}
|
|
60
|
+
/**
 * Find the best break point at or before a target position.
 *
 * Candidates are the break points whose `pos` lies in
 * `[max(0, targetPos - windowSize), targetPos]`, excluding position 0.
 * Each candidate's score is scaled by a squared-distance decay,
 * `1 - (distance / windowSize)^2 * 0.7`, so a candidate at the far edge of the
 * window keeps 30% of its base score. The highest weighted score wins; on a
 * tie the earliest candidate in `breakPoints` order is kept.
 *
 * @param {{pos: number, score: number}[]} breakPoints Candidates to choose from.
 * @param {number} targetPos Position the break should be close to (upper bound).
 * @param {number} windowSize How far before `targetPos` to look, in the same
 *   units as `pos`.
 * @returns {number|null} The chosen `pos`, or null when no candidate is in range.
 */
function findBestBreak(breakPoints, targetPos, windowSize) {
    const windowStart = Math.max(0, targetPos - windowSize);
    let bestPos = null;
    let bestScore = -1;
    for (const bp of breakPoints) {
        // The window never extends past targetPos; position 0 is excluded.
        if (bp.pos <= 0 || bp.pos < windowStart || bp.pos > targetPos) {
            continue;
        }
        const distance = targetPos - bp.pos;
        // Guard the division: with a zero-width window the only possible
        // candidate sits exactly on the target and must not decay to NaN.
        const ratio = windowSize > 0 ? distance / windowSize : 0;
        const weighted = bp.score * (1 - ratio ** 2 * 0.7);
        if (weighted > bestScore) {
            bestScore = weighted;
            bestPos = bp.pos;
        }
    }
    return bestPos;
}
|
|
81
|
+
/**
 * Get the nearest heading context for a position.
 *
 * Scans `content` from the start through the end of the line that contains
 * `pos` and returns the text of the last markdown heading (1-6 leading `#`
 * followed by a space or tab) found there. Including the line at `pos` means
 * a chunk that starts on a heading is attributed to that heading, and
 * position 0 can resolve to a title on the first line.
 *
 * Lines inside fenced code blocks (delimited by lines starting with ```) are
 * ignored, so a `# comment` in a code sample is not taken for a heading.
 *
 * @param {string} content Full markdown text.
 * @param {number} pos Character offset into `content`.
 * @returns {string} Trimmed heading text, or '' when no heading applies.
 */
function getHeadingForPosition(content, pos) {
    // Extend the scanned prefix to the end of the line containing pos.
    const lineEnd = content.indexOf('\n', pos);
    const scanned = lineEnd === -1 ? content : content.slice(0, lineEnd);
    let inCodeBlock = false;
    let lastHeading = '';
    for (const line of scanned.split('\n')) {
        if (/^```/.test(line)) {
            inCodeBlock = !inCodeBlock;
            continue;
        }
        if (inCodeBlock) {
            continue;
        }
        // [ \t]+ keeps the match on one line; the trailing \s* absorbs a '\r'
        // left over from CRLF line endings.
        const match = /^#{1,6}[ \t]+(.+?)\s*$/.exec(line);
        if (match) {
            lastHeading = match[1].trim();
        }
    }
    return lastHeading;
}
|
|
91
|
+
// Chunk sizing, expressed in estimated tokens.
const CHUNK_TARGET_TOKENS = 900;
const CHUNK_OVERLAP = 0.15; // 15% overlap
const OVERLAP_TOKENS = Math.round(CHUNK_TARGET_TOKENS * CHUNK_OVERLAP);
const WINDOW_SIZE = 200; // tokens to search for break point
/**
 * Rough token count estimate: one token per four characters, rounded up.
 *
 * @param {string} text Text to measure.
 * @returns {number} Estimated token count (0 for an empty string).
 */
function estimateTokens(text) {
    const charsPerToken = 4;
    const { length } = text;
    return Math.ceil(length / charsPerToken);
}
|
|
99
|
+
/**
 * Chunk markdown content into ~900-token pieces with smart boundaries.
 *
 * Documents whose estimated size is at most 110% of CHUNK_TARGET_TOKENS are
 * returned as a single chunk with the content untouched. Larger documents are
 * cut repeatedly: each cut aims for CHUNK_TARGET_TOKENS (converted to
 * characters at 4 chars per token) past the current start and is moved back
 * to the best-scoring break point within WINDOW_SIZE tokens before that
 * target. When no break point is in range the cut is made at the target.
 *
 * Every chunk is `{ hash, seq, pos, content, heading }`: `hash` is `docid`,
 * `seq` is the chunk's index in the result, `pos` is the character offset of
 * the chunk start in `content`, and `heading` comes from
 * getHeadingForPosition(content, pos). Non-final chunks are trimmed; the
 * final chunk keeps its text as-is. Chunks that contain only whitespace are
 * not emitted.
 *
 * NOTE: chunks are contiguous. OVERLAP_TOKENS is defined at file level but no
 * overlap is applied between consecutive chunks.
 *
 * @param {string} content Markdown text to split.
 * @param {string} docid Identifier stored as `hash` on every chunk.
 * @param {object} [options] Accepted for interface compatibility; not read by
 *   this implementation.
 * @returns {object[]} Chunks in document order.
 */
export function chunkMarkdown(content, docid, options) {
    const chunks = [];
    if (estimateTokens(content) <= CHUNK_TARGET_TOKENS * 1.1) {
        // Small document — single chunk
        chunks.push({
            hash: docid,
            seq: 0,
            pos: 0,
            content,
            heading: getHeadingForPosition(content, 0) || '',
        });
        return chunks;
    }
    const breakPoints = findBreakPoints(content);
    // Break-point positions are character offsets, so both token-based
    // settings are converted to characters before use.
    const targetChars = CHUNK_TARGET_TOKENS * 4;
    const windowChars = WINDOW_SIZE * 4;
    let startPos = 0;
    while (startPos < content.length) {
        const chunkEnd = Math.min(startPos + targetChars, content.length);
        const isFinal = chunkEnd >= content.length;
        // windowChars < targetChars, so a chosen break is always past startPos
        // and the loop is guaranteed to advance.
        const endPos = isFinal
            ? content.length
            : findBestBreak(breakPoints, chunkEnd, windowChars) ?? chunkEnd;
        const text = isFinal
            ? content.slice(startPos)
            : content.slice(startPos, endPos).trim();
        // Skip whitespace-only spans rather than emitting empty chunks.
        if (text.trim().length > 0) {
            chunks.push({
                hash: docid,
                seq: chunks.length,
                pos: startPos,
                content: text,
                heading: getHeadingForPosition(content, startPos),
            });
        }
        startPos = endPos;
    }
    return chunks;
}
|
|
150
|
+
/**
 * Build the text that is embedded for a chunk.
 *
 * The result is "title | content". When the chunk carries a non-empty heading
 * that is different from the title, the heading is added as a breadcrumb:
 * "title > heading | content".
 *
 * @param {{heading?: string, content: string}} chunk Chunk to format.
 * @param {string} title Document title.
 * @returns {string} Text to embed.
 */
export function formatChunkForEmbedding(chunk, title) {
    const { heading, content } = chunk;
    const showHeading = Boolean(heading) && heading !== title;
    const prefix = showHeading ? `${title} > ${heading}` : `${title}`;
    return `${prefix} | ${content}`;
}
|
|
157
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/chunker/index.ts"],"names":[],"mappings":"AAWA,MAAM,cAAc,GAA2B;IAC7C,IAAI,EAAE,GAAG,EAAE,KAAK;IAChB,KAAK,EAAE,EAAE,EAAE,KAAK;IAChB,MAAM,EAAE,EAAE,EAAE,KAAK;IACjB,OAAO,EAAE,EAAE,EAAE,KAAK;IAClB,QAAQ,EAAE,EAAE,EAAE,KAAK;IACnB,SAAS,EAAE,EAAE,EAAE,KAAK;CACrB,CAAC;AAEF,MAAM,gBAAgB,GAAG,EAAE,CAAC;AAC5B,MAAM,QAAQ,GAAG,EAAE,CAAC;AACpB,MAAM,gBAAgB,GAAG,EAAE,CAAC;AAC5B,MAAM,eAAe,GAAG,CAAC,CAAC;AAC1B,MAAM,gBAAgB,GAAG,CAAC,CAAC;AAE3B,8DAA8D;AAC9D,SAAS,eAAe,CAAC,OAAe;IACtC,MAAM,MAAM,GAAiB,EAAE,CAAC;IAChC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAClC,IAAI,WAAW,GAAG,KAAK,CAAC;IACxB,IAAI,OAAO,GAAG,CAAC,CAAC;IAEhB,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,GAAG,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;QACxD,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC;QAE5B,0BAA0B;QAC1B,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACtB,WAAW,GAAG,CAAC,WAAW,CAAC;YAC3B,OAAO,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;YAC3B,SAAS;QACX,CAAC;QAED,kCAAkC;QAClC,IAAI,WAAW,EAAE,CAAC;YAChB,OAAO,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;YAC3B,SAAS;QACX,CAAC;QAED,yBAAyB;QACzB,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,CAAC;YAC7D,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC5B,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;gBACpD,MAAM;YACR,CAAC;QACH,CAAC;QAED,mBAAmB;QACnB,IAAI,mBAAmB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;YACvD,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;QAChE,CAAC;QAED,aAAa;QACb,IAAI,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,gBAAgB,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;QACxE,CAAC;QAED,aAAa;QACb,IAAI,2BAA2B,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC3C,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,eAAe,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,+CAA+C;QAC/C,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpB,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,E
AAE,OAAO,EAAE,KAAK,EAAE,gBAAgB,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;QACxE,CAAC;QAED,OAAO,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;IAC7B,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,uDAAuD;AACvD,SAAS,aAAa,CACpB,WAAyB,EACzB,SAAiB,EACjB,UAAkB;IAElB,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,GAAG,UAAU,CAAC,CAAC;IACxD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,SAAS,GAAG,GAAG,CAAC,CAAC;IAEvD,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,CACnC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,GAAG,IAAI,WAAW,IAAI,EAAE,CAAC,GAAG,IAAI,SAAS,IAAI,EAAE,CAAC,GAAG,GAAG,CAAC,CACnE,CAAC;IAEF,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,yEAAyE;IACzE,IAAI,IAAI,GAAsB,IAAI,CAAC;IACnC,IAAI,SAAS,GAAG,CAAC,CAAC,CAAC;IAEnB,KAAK,MAAM,EAAE,IAAI,UAAU,EAAE,CAAC;QAC5B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,SAAS,CAAC,CAAC;QAC9C,MAAM,cAAc,GAAG,CAAC,GAAG,CAAC,QAAQ,GAAG,UAAU,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC;QAC9D,MAAM,UAAU,GAAG,EAAE,CAAC,KAAK,GAAG,cAAc,CAAC;QAE7C,IAAI,UAAU,GAAG,SAAS,EAAE,CAAC;YAC3B,SAAS,GAAG,UAAU,CAAC;YACvB,IAAI,GAAG,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC;AAChC,CAAC;AAED,qDAAqD;AACrD,SAAS,qBAAqB,CAAC,OAAe,EAAE,GAAW;IACzD,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;IACrC,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,qBAAqB,CAAC,CAAC;IACxD,IAAI,WAAW,GAAG,EAAE,CAAC;IAErB,KAAK,MAAM,KAAK,IAAI,QAAQ,EAAE,CAAC;QAC7B,WAAW,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAChC,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC;AAED,MAAM,mBAAmB,GAAG,GAAG,CAAC;AAChC,MAAM,aAAa,GAAG,IAAI,CAAC,CAAC,cAAc;AAC1C,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,GAAG,aAAa,CAAC,CAAC;AACvE,MAAM,WAAW,GAAG,GAAG,CAAC,CAAC,mCAAmC;AAE5D,qDAAqD;AACrD,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACpC,CAAC;AAED,0EAA0E;AAC1E,MAAM,UAAU,aAAa,CAC3B,OAAe,EACf,KAAa,EACb,OAA8C;IAE9C,MAAM,MAAM,GAAkB,EAAE,CAAC;IACjC,MAAM,WAAW,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAC7C,MAAM,WAAW,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;IAC5C,MAAM,WAAW,GAAG,mBAAmB,GAAG,CAAC,CAAC;IAC5C,M
AAM,YAAY,GAAG,cAAc,GAAG,CAAC,CAAC;IAExC,IAAI,WAAW,IAAI,mBAAmB,GAAG,GAAG,EAAE,CAAC;QAC7C,gCAAgC;QAChC,MAAM,OAAO,GAAG,qBAAqB,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACxD,MAAM,CAAC,IAAI,CAAC;YACV,IAAI,EAAE,KAAK;YACX,GAAG,EAAE,CAAC;YACN,GAAG,EAAE,CAAC;YACN,OAAO;YACP,OAAO;SACR,CAAC,CAAC;QACH,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,GAAG,GAAG,CAAC,CAAC;IAEZ,OAAO,QAAQ,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;QACjC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,GAAG,WAAW,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QAElE,IAAI,QAAQ,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;YAC/B,cAAc;YACd,MAAM,OAAO,GAAG,qBAAqB,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;YACzD,MAAM,CAAC,IAAI,CAAC;gBACV,IAAI,EAAE,KAAK;gBACX,GAAG;gBACH,GAAG,EAAE,QAAQ;gBACb,OAAO,EAAE,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC;gBAChC,OAAO;aACR,CAAC,CAAC;YACH,MAAM;QACR,CAAC;QAED,wCAAwC;QACxC,MAAM,QAAQ,GAAG,aAAa,CAAC,WAAW,EAAE,QAAQ,EAAE,WAAW,CAAC,CAAC;QAEnE,MAAM,MAAM,GAAG,QAAQ,IAAI,QAAQ,CAAC;QACpC,MAAM,OAAO,GAAG,qBAAqB,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;QAEzD,MAAM,CAAC,IAAI,CAAC;YACV,IAAI,EAAE,KAAK;YACX,GAAG;YACH,GAAG,EAAE,QAAQ;YACb,OAAO,EAAE,OAAO,CAAC,KAAK,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE;YAC/C,OAAO;SACR,CAAC,CAAC;QAEH,QAAQ,GAAG,MAAM,CAAC;QAClB,GAAG,EAAE,CAAC;IACR,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,mDAAmD;AACnD,MAAM,UAAU,uBAAuB,CACrC,KAAkB,EAClB,KAAa;IAEb,IAAI,KAAK,CAAC,OAAO,IAAI,KAAK,CAAC,OAAO,KAAK,KAAK,EAAE,CAAC;QAC7C,OAAO,GAAG,KAAK,MAAM,KAAK,CAAC,OAAO,MAAM,KAAK,CAAC,OAAO,EAAE,CAAC;IAC1D,CAAC;IACD,OAAO,GAAG,KAAK,MAAM,KAAK,CAAC,OAAO,EAAE,CAAC;AACvC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"commands.d.ts","sourceRoot":"","sources":["../../src/cli/commands.ts"],"names":[],"mappings":"AAuBA,wBAAsB,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,iBA4DxC"}
|