@abhishekmcp/notes 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ /**
2
+ * Hand-rolled BERT-uncased WordPiece tokenizer — pure JS, no dependencies.
3
+ * Mirrors google-research/bert `tokenization.py` (BasicTokenizer +
4
+ * WordpieceTokenizer) closely enough to feed all-MiniLM-L6-v2. We hand-roll it
5
+ * for the same reason we hand-roll the frontmatter parser: avoid a native /
6
+ * audit-heavy dependency (@huggingface/tokenizers is a Rust binding).
7
+ */
8
// Literal text of the special tokens looked up in the vocabulary, keyed by role.
const SPECIAL = {
    unk: "[UNK]",
    cls: "[CLS]",
    sep: "[SEP]",
    pad: "[PAD]",
};
9
/**
 * Reports whether a Unicode code point lies in one of the CJK ideograph
 * ranges listed below (the same ranges as BERT's `_is_chinese_char`).
 *
 * @param {number} cp Unicode code point.
 * @returns {boolean} `true` when `cp` falls inside any listed range (bounds inclusive).
 */
function isCjk(cp) {
    // Inclusive [low, high] code point ranges.
    const ranges = [
        [0x4e00, 0x9fff],
        [0x3400, 0x4dbf],
        [0x20000, 0x2a6df],
        [0x2a700, 0x2b73f],
        [0x2b740, 0x2b81f],
        [0x2b820, 0x2ceaf],
        [0xf900, 0xfaff],
        [0x2f800, 0x2fa1f],
    ];
    return ranges.some(([low, high]) => cp >= low && cp <= high);
}
20
/**
 * Punctuation test in the style of BERT's `_is_punctuation`: every ASCII
 * character that is neither alphanumeric nor whitespace/control counts
 * (so `$`, `+`, `^` … are included), plus anything matching Unicode `\p{P}`.
 *
 * @param {string} ch Character to classify; only its first code point is
 *   checked against the ASCII ranges, while the `\p{P}` test sees the whole string.
 * @returns {boolean} `true` when `ch` is treated as punctuation.
 */
function isPunct(ch) {
    const code = ch.codePointAt(0);
    // Inclusive ASCII ranges: !../  :..@  [..`  {..~
    const asciiRanges = [
        [33, 47],
        [58, 64],
        [91, 96],
        [123, 126],
    ];
    for (const [low, high] of asciiRanges) {
        if (code >= low && code <= high) {
            return true;
        }
    }
    return /\p{P}/u.test(ch);
}
31
/**
 * BERT-uncased WordPiece tokenizer built from the text of a `vocab.txt` file.
 *
 * Pipeline: `normalize` → `basicTokenize` → `wordpiece` → `encode`.
 * Uses the module-level `SPECIAL`, `isCjk` and `isPunct` definitions.
 */
export class WordPieceTokenizer {
    /** Map from token text to id (the token's zero-based line index in vocab.txt). */
    vocab;
    /** Id of `[UNK]`. */
    unkId;
    /** Id of `[CLS]`. */
    clsId;
    /** Id of `[SEP]`. */
    sepId;
    /** Id of `[PAD]`, or 0 when the vocabulary does not contain it. */
    padId;

    /**
     * @param {string} vocabText Contents of vocab.txt: one token per line (LF or CRLF).
     * @throws {Error} If `[UNK]`, `[CLS]` or `[SEP]` is not in the vocabulary.
     */
    constructor(vocabText) {
        // vocab.txt is one token per line; the line number is the id. Tokens are
        // not trimmed, so a whitespace-like token keeps its identity.
        const lines = vocabText.split(/\r?\n/).map((line) => line.replace(/\r$/, ""));
        // Only blank lines at the very end of the file are dropped (they come from
        // the final newline). A blank line followed by a real token still consumes
        // an id, so every later token keeps its line-number id.
        let count = lines.length;
        while (count > 0 && lines[count - 1] === "") {
            count--;
        }
        this.vocab = new Map();
        for (let id = 0; id < count; id++) {
            this.vocab.set(lines[id], id);
        }
        this.unkId = this.vocab.get(SPECIAL.unk);
        this.clsId = this.vocab.get(SPECIAL.cls);
        this.sepId = this.vocab.get(SPECIAL.sep);
        this.padId = this.vocab.get(SPECIAL.pad) ?? 0;
        if (this.unkId === undefined || this.clsId === undefined || this.sepId === undefined) {
            throw new Error("vocab.txt is missing required special tokens ([UNK]/[CLS]/[SEP]).");
        }
    }

    /**
     * Normalize: strip invalid and control/format characters, pad CJK with
     * spaces, lowercase, remove accents (NFD, then drop `Mn` marks).
     *
     * @param {string} text Raw input text.
     * @returns {string} Normalized text; whitespace is preserved for the caller to split on.
     */
    normalize(text) {
        // Unicode categories Cc (control) and Cf (format), as in BERT `_is_control`.
        // Covers C0/C1 controls and DEL as well as soft hyphen, zero-width
        // space/joiners and BOM, which would otherwise stay inside a word and make
        // it unmatchable. Non-global regex, so `.test` carries no state.
        const control = /[\p{Cc}\p{Cf}]/u;
        let out = "";
        for (const ch of text) {
            const cp = ch.codePointAt(0);
            if (cp === 0 || cp === 0xfffd) {
                continue; // invalid
            }
            // Tab, LF and CR are category Cc but act as whitespace, so keep them.
            const isWhitespaceControl = cp === 9 || cp === 10 || cp === 13;
            if (!isWhitespaceControl && control.test(ch)) {
                continue; // control / format chars
            }
            out += isCjk(cp) ? ` ${ch} ` : ch;
        }
        return out
            .toLowerCase()
            .normalize("NFD")
            .replace(/\p{Mn}/gu, "");
    }

    /**
     * Whitespace + punctuation split into basic tokens (BERT BasicTokenizer).
     *
     * @param {string} text Raw input text (normalized internally).
     * @returns {string[]} Words, with every punctuation character as its own token.
     */
    basicTokenize(text) {
        const tokens = [];
        for (const word of this.normalize(text).split(/\s+/)) {
            if (!word) {
                continue; // leading/trailing whitespace yields empty pieces
            }
            let cur = "";
            for (const ch of word) {
                if (isPunct(ch)) {
                    if (cur) {
                        tokens.push(cur);
                        cur = "";
                    }
                    tokens.push(ch);
                } else {
                    cur += ch;
                }
            }
            if (cur) {
                tokens.push(cur);
            }
        }
        return tokens;
    }

    /**
     * Greedy longest-match-first WordPiece split of a single word.
     *
     * @param {string} word One basic token.
     * @returns {string[]} Vocabulary pieces (continuations carry a `##` prefix), or
     *   `["[UNK]"]` when the word is longer than 200 UTF-16 code units or any
     *   part of it cannot be matched.
     */
    wordpiece(word) {
        if (word.length > 200) {
            return [SPECIAL.unk]; // BERT max_input_chars_per_word
        }
        const out = [];
        let start = 0;
        while (start < word.length) {
            let end = word.length;
            let found = null;
            // Shrink the candidate from the right until it is in the vocabulary.
            while (start < end) {
                const sub = (start > 0 ? "##" : "") + word.slice(start, end);
                if (this.vocab.has(sub)) {
                    found = sub;
                    break;
                }
                end--;
            }
            if (found === null) {
                return [SPECIAL.unk]; // any unmatchable piece → whole word UNK
            }
            out.push(found);
            start = end;
        }
        return out;
    }

    /**
     * Encode text into `[CLS] … [SEP]` token ids plus an all-ones attention
     * mask. One text is encoded at a time (batch 1), so no padding is added.
     *
     * @param {string} text Text to encode.
     * @param {number} [maxLen=256] Maximum sequence length including `[CLS]` and
     *   `[SEP]`; at most `maxLen - 2` body tokens are kept. Values below 2 still
     *   yield the two special tokens.
     * @returns {{ inputIds: number[], attentionMask: number[] }}
     */
    encode(text, maxLen = 256) {
        // Integer budget with the same rounding as `Array.prototype.slice`
        // (truncate; NaN → 0), so the early exit matches slicing the full list.
        const budget = Math.max(0, Math.trunc(maxLen - 2) || 0);
        const body = [];
        for (const tok of this.basicTokenize(text)) {
            if (body.length >= budget) {
                break; // budget filled: skip WordPiece work for the rest of the text
            }
            for (const piece of this.wordpiece(tok)) {
                if (body.length >= budget) {
                    break;
                }
                body.push(this.vocab.get(piece) ?? this.unkId);
            }
        }
        const inputIds = [this.clsId, ...body, this.sepId];
        return { inputIds, attentionMask: inputIds.map(() => 1) };
    }
}
140
+ //# sourceMappingURL=tokenizer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../src/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,MAAM,OAAO,GAAG,EAAE,GAAG,EAAE,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,GAAG,EAAE,OAAO,EAAW,CAAC;AAEpF,iFAAiF;AACjF,SAAS,KAAK,CAAC,EAAU;IACvB,OAAO,CACL,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC;QAC9B,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC;QAC9B,CAAC,EAAE,IAAI,OAAO,IAAI,EAAE,IAAI,OAAO,CAAC;QAChC,CAAC,EAAE,IAAI,OAAO,IAAI,EAAE,IAAI,OAAO,CAAC;QAChC,CAAC,EAAE,IAAI,OAAO,IAAI,EAAE,IAAI,OAAO,CAAC;QAChC,CAAC,EAAE,IAAI,OAAO,IAAI,EAAE,IAAI,OAAO,CAAC;QAChC,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC;QAC9B,CAAC,EAAE,IAAI,OAAO,IAAI,EAAE,IAAI,OAAO,CAAC,CACjC,CAAC;AACJ,CAAC;AAED,oFAAoF;AACpF,SAAS,OAAO,CAAC,EAAU;IACzB,MAAM,EAAE,GAAG,EAAE,CAAC,WAAW,CAAC,CAAC,CAAE,CAAC;IAC9B,IACE,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACtB,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACtB,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACtB,CAAC,EAAE,IAAI,GAAG,IAAI,EAAE,IAAI,GAAG,CAAC,EACxB,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IACD,OAAO,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AAC3B,CAAC;AAED,MAAM,OAAO,kBAAkB;IACrB,KAAK,CAAsB;IAC1B,KAAK,CAAS;IACd,KAAK,CAAS;IACd,KAAK,CAAS;IACd,KAAK,CAAS;IAEvB,YAAY,SAAiB;QAC3B,IAAI,CAAC,KAAK,GAAG,IAAI,GAAG,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACvC,IAAI,CAAC,GAAG,CAAC,CAAC;QACV,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,0EAA0E;YAC1E,yEAAyE;YACzE,0EAA0E;YAC1E,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YACpC,IAAI,GAAG,KAAK,EAAE,IAAI,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;gBAAE,MAAM;YAC/C,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;YACvB,CAAC,EAAE,CAAC;QACN,CAAC;QACD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,GAAG,CAAE,CAAC;QAC1C,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,GAAG,CAAE,CAAC;QAC1C,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,GAAG,CAAE,CAAC;QAC1C,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,
CAAC,CAAC;QAC9C,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,EAAE,CAAC;YACrF,MAAM,IAAI,KAAK,CAAC,mEAAmE,CAAC,CAAC;QACvF,CAAC;IACH,CAAC;IAED,mFAAmF;IAC3E,SAAS,CAAC,IAAY;QAC5B,IAAI,GAAG,GAAG,EAAE,CAAC;QACb,KAAK,MAAM,EAAE,IAAI,IAAI,EAAE,CAAC;YACtB,MAAM,EAAE,GAAG,EAAE,CAAC,WAAW,CAAC,CAAC,CAAE,CAAC;YAC9B,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,KAAK,MAAM;gBAAE,SAAS,CAAC,UAAU;YACnD,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,EAAE;gBAAE,SAAS,CAAC,gBAAgB;YAC7E,GAAG,IAAI,KAAK,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;QACpC,CAAC;QACD,OAAO,GAAG;aACP,WAAW,EAAE;aACb,SAAS,CAAC,KAAK,CAAC;aAChB,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;IAC7B,CAAC;IAED,8EAA8E;IACtE,aAAa,CAAC,IAAY;QAChC,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;YACrD,IAAI,CAAC,IAAI;gBAAE,SAAS;YACpB,IAAI,GAAG,GAAG,EAAE,CAAC;YACb,KAAK,MAAM,EAAE,IAAI,IAAI,EAAE,CAAC;gBACtB,IAAI,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC;oBAChB,IAAI,GAAG,EAAE,CAAC;wBACR,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;wBACjB,GAAG,GAAG,EAAE,CAAC;oBACX,CAAC;oBACD,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBAClB,CAAC;qBAAM,CAAC;oBACN,GAAG,IAAI,EAAE,CAAC;gBACZ,CAAC;YACH,CAAC;YACD,IAAI,GAAG;gBAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC5B,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,mEAAmE;IAC3D,SAAS,CAAC,IAAY;QAC5B,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG;YAAE,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,gCAAgC;QAC7E,MAAM,GAAG,GAAa,EAAE,CAAC;QACzB,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC3B,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;YACtB,IAAI,KAAK,GAAkB,IAAI,CAAC;YAChC,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;gBACnB,MAAM,GAAG,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;gBAC7D,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;oBACxB,KAAK,GAAG,GAAG,CAAC;oBACZ,MAAM;gBACR,CAAC;gBACD,GAAG,EAAE,CAAC;YACR,CAAC;YACD,IAAI,KAAK,KAAK,IAAI;gBAAE,OAAO,CAAC,OAA
O,CAAC,GAAG,CAAC,CAAC,CAAC,yCAAyC;YACnF,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAChB,KAAK,GAAG,GAAG,CAAC;QACd,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IAED;;;;OAIG;IACH,MAAM,CAAC,IAAY,EAAE,MAAM,GAAG,GAAG;QAC/B,MAAM,QAAQ,GAAa,EAAE,CAAC;QAC9B,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;YAC3C,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC;gBACxC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC;YACrD,CAAC;QACH,CAAC;QACD,MAAM,IAAI,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;QACxD,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE,GAAG,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;QACnD,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IAC5D,CAAC;CACF"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@abhishekmcp/notes",
3
- "version": "0.2.0",
3
+ "version": "0.4.0",
4
4
  "description": "MCP server for local markdown notes — ranked search, tags, todos, and a wiki-link knowledge graph, from any MCP client.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -44,6 +44,7 @@
44
44
  "dependencies": {
45
45
  "@modelcontextprotocol/sdk": "^1.29.0",
46
46
  "minisearch": "^7.1.0",
47
+ "onnxruntime-web": "^1.27.0",
47
48
  "zod": "^3.23.8"
48
49
  },
49
50
  "devDependencies": {