@cvfile/embed 0.1.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +56 -25
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +7 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +56 -25
- package/dist/index.js.map +1 -1
- package/package.json +16 -7
package/dist/index.cjs
CHANGED
|
@@ -3,25 +3,31 @@
|
|
|
3
3
|
var transformers = require('@huggingface/transformers');
|
|
4
4
|
|
|
5
5
|
// src/chunk.ts
|
|
6
|
+
var encoder = new TextEncoder();
|
|
7
|
+
var decoder = new TextDecoder();
|
|
6
8
|
var HEADING = /^(#{1,6})\s+(.+?)\s*$/;
|
|
7
9
|
function chunkMarkdown(markdown, opts = {}) {
|
|
8
10
|
const mode = opts.mode ?? "section";
|
|
11
|
+
const bytes = encoder.encode(markdown);
|
|
9
12
|
if (mode === "document") {
|
|
10
|
-
return [
|
|
13
|
+
return [documentChunk(bytes)];
|
|
11
14
|
}
|
|
12
15
|
if (mode === "paragraph") {
|
|
13
|
-
return paragraphChunks(
|
|
16
|
+
return paragraphChunks(bytes);
|
|
14
17
|
}
|
|
15
|
-
return sectionChunks(
|
|
18
|
+
return sectionChunks(bytes);
|
|
16
19
|
}
|
|
17
|
-
function
|
|
18
|
-
|
|
20
|
+
function documentChunk(bytes) {
|
|
21
|
+
return { id: "document", textOffset: 0, textLength: bytes.byteLength, text: sliceText(bytes, 0, bytes.byteLength) };
|
|
22
|
+
}
|
|
23
|
+
function sectionChunks(bytes) {
|
|
24
|
+
const lines = splitWithByteOffsets(bytes);
|
|
19
25
|
const sections = [];
|
|
20
26
|
let current = null;
|
|
21
27
|
const ids = /* @__PURE__ */ new Set();
|
|
22
28
|
function flush(end) {
|
|
23
29
|
if (!current) return;
|
|
24
|
-
const text =
|
|
30
|
+
const text = sliceText(bytes, current.start, end);
|
|
25
31
|
if (text.trim().length === 0) {
|
|
26
32
|
current = null;
|
|
27
33
|
return;
|
|
@@ -31,59 +37,84 @@ function sectionChunks(markdown) {
|
|
|
31
37
|
}
|
|
32
38
|
for (const line of lines) {
|
|
33
39
|
const match = HEADING.exec(line.text);
|
|
40
|
+
const lineEnd = line.offset + line.byteLength;
|
|
34
41
|
if (match) {
|
|
35
42
|
flush(line.offset);
|
|
36
43
|
const id = uniqueId(slugify(match[2] ?? `section-${sections.length + 1}`), ids);
|
|
37
44
|
ids.add(id);
|
|
38
|
-
current = { id, start: line.offset, end:
|
|
45
|
+
current = { id, start: line.offset, end: lineEnd };
|
|
39
46
|
continue;
|
|
40
47
|
}
|
|
41
48
|
if (current === null) {
|
|
42
49
|
const id = uniqueId("preamble", ids);
|
|
43
50
|
ids.add(id);
|
|
44
|
-
current = { id, start: line.offset, end:
|
|
51
|
+
current = { id, start: line.offset, end: lineEnd };
|
|
45
52
|
} else {
|
|
46
|
-
current.end =
|
|
53
|
+
current.end = lineEnd;
|
|
47
54
|
}
|
|
48
55
|
}
|
|
49
|
-
flush(
|
|
56
|
+
flush(bytes.byteLength);
|
|
50
57
|
if (sections.length === 0) {
|
|
51
|
-
return [
|
|
58
|
+
return [documentChunk(bytes)];
|
|
52
59
|
}
|
|
53
60
|
return sections;
|
|
54
61
|
}
|
|
55
|
-
function paragraphChunks(
|
|
62
|
+
function paragraphChunks(bytes) {
|
|
56
63
|
const out = [];
|
|
57
64
|
const ids = /* @__PURE__ */ new Set();
|
|
65
|
+
const separator = encoder.encode("\n\n");
|
|
58
66
|
let cursor = 0;
|
|
59
67
|
let i = 0;
|
|
60
|
-
while (cursor <
|
|
61
|
-
let end =
|
|
62
|
-
if (end === -1) end =
|
|
63
|
-
const text =
|
|
68
|
+
while (cursor < bytes.byteLength) {
|
|
69
|
+
let end = indexOfBytes(bytes, separator, cursor);
|
|
70
|
+
if (end === -1) end = bytes.byteLength;
|
|
71
|
+
const text = sliceText(bytes, cursor, end);
|
|
64
72
|
if (text.trim().length > 0) {
|
|
65
73
|
const id = uniqueId(slugify(text.split("\n")[0] ?? `p-${i}`), ids);
|
|
66
74
|
ids.add(id);
|
|
67
|
-
out.push({ id, textOffset: cursor, textLength:
|
|
75
|
+
out.push({ id, textOffset: cursor, textLength: end - cursor, text });
|
|
68
76
|
i += 1;
|
|
69
77
|
}
|
|
70
|
-
cursor = end +
|
|
78
|
+
cursor = end + separator.byteLength;
|
|
71
79
|
}
|
|
72
80
|
if (out.length === 0) {
|
|
73
|
-
return [
|
|
81
|
+
return [documentChunk(bytes)];
|
|
74
82
|
}
|
|
75
83
|
return out;
|
|
76
84
|
}
|
|
77
|
-
function
|
|
85
|
+
function sliceText(bytes, start, end) {
|
|
86
|
+
return decoder.decode(bytes.subarray(start, end));
|
|
87
|
+
}
|
|
88
|
+
function splitWithByteOffsets(bytes) {
|
|
89
|
+
const newline = 10;
|
|
78
90
|
const lines = [];
|
|
79
|
-
let
|
|
80
|
-
for (
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
91
|
+
let start = 0;
|
|
92
|
+
for (let i = 0; i < bytes.byteLength; i += 1) {
|
|
93
|
+
if (bytes[i] === newline) {
|
|
94
|
+
const byteLength = i - start + 1;
|
|
95
|
+
lines.push({ text: sliceText(bytes, start, i + 1), offset: start, byteLength });
|
|
96
|
+
start = i + 1;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
if (start < bytes.byteLength) {
|
|
100
|
+
lines.push({ text: sliceText(bytes, start, bytes.byteLength), offset: start, byteLength: bytes.byteLength - start });
|
|
84
101
|
}
|
|
85
102
|
return lines;
|
|
86
103
|
}
|
|
104
|
+
function indexOfBytes(haystack, needle, from) {
|
|
105
|
+
const last = haystack.byteLength - needle.byteLength;
|
|
106
|
+
for (let i = from; i <= last; i += 1) {
|
|
107
|
+
let matched = true;
|
|
108
|
+
for (let j = 0; j < needle.byteLength; j += 1) {
|
|
109
|
+
if (haystack[i + j] !== needle[j]) {
|
|
110
|
+
matched = false;
|
|
111
|
+
break;
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
if (matched) return i;
|
|
115
|
+
}
|
|
116
|
+
return -1;
|
|
117
|
+
}
|
|
87
118
|
function slugify(s) {
|
|
88
119
|
return s.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 64) || "section";
|
|
89
120
|
}
|
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/chunk.ts","../src/transformers-backend.ts","../src/types.ts","../src/embed.ts","../src/search.ts","../src/huggingface-backend.ts"],"names":["env","pipeline"],"mappings":";;;;;AAsBA,IAAM,OAAA,GAAU,uBAAA;AAET,SAAS,aAAA,CAAc,QAAA,EAAkB,IAAA,GAAqB,EAAC,EAAoB;AACxF,EAAA,MAAM,IAAA,GAAO,KAAK,IAAA,IAAQ,SAAA;AAC1B,EAAA,IAAI,SAAS,UAAA,EAAY;AACvB,IAAA,OAAO,CAAC,EAAE,EAAA,EAAI,UAAA,EAAY,UAAA,EAAY,CAAA,EAAG,UAAA,EAAY,QAAA,CAAS,MAAA,EAAQ,IAAA,EAAM,QAAA,EAAU,CAAA;AAAA,EACxF;AACA,EAAA,IAAI,SAAS,WAAA,EAAa;AACxB,IAAA,OAAO,gBAAgB,QAAQ,CAAA;AAAA,EACjC;AACA,EAAA,OAAO,cAAc,QAAQ,CAAA;AAC/B;AAEA,SAAS,cAAc,QAAA,EAAmC;AACxD,EAAA,MAAM,KAAA,GAAQ,iBAAiB,QAAQ,CAAA;AACvC,EAAA,MAAM,WAA4B,EAAC;AACnC,EAAA,IAAI,OAAA,GAA6D,IAAA;AACjE,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAY;AAE5B,EAAA,SAAS,MAAM,GAAA,EAAmB;AAChC,IAAA,IAAI,CAAC,OAAA,EAAS;AACd,IAAA,MAAM,IAAA,GAAO,QAAA,CAAS,KAAA,CAAM,OAAA,CAAQ,OAAO,GAAG,CAAA;AAC9C,IAAA,IAAI,IAAA,CAAK,IAAA,EAAK,CAAE,MAAA,KAAW,CAAA,EAAG;AAC5B,MAAA,OAAA,GAAU,IAAA;AACV,MAAA;AAAA,IACF;AACA,IAAA,QAAA,CAAS,IAAA,CAAK,EAAE,EAAA,EAAI,OAAA,CAAQ,EAAA,EAAI,UAAA,EAAY,OAAA,CAAQ,KAAA,EAAO,UAAA,EAAY,GAAA,GAAM,OAAA,CAAQ,KAAA,EAAO,MAAM,CAAA;AAClG,IAAA,OAAA,GAAU,IAAA;AAAA,EACZ;AAEA,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,MAAM,KAAA,GAAQ,OAAA,CAAQ,IAAA,CAAK,IAAA,CAAK,IAAI,CAAA;AACpC,IAAA,IAAI,KAAA,EAAO;AACT,MAAA,KAAA,CAAM,KAAK,MAAM,CAAA;AACjB,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,OAAA,CAAQ,KAAA,CAAM,CAAC,CAAA,IAAK,CAAA,QAAA,EAAW,QAAA,CAAS,MAAA,GAAS,CAAC,CAAA,CAAE,CAAA,EAAG,GAAG,CAAA;AAC9E,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,OAAA,GAAU,EAAE,EAAA,EAAI,KAAA,EAAO,IAAA,CAAK,MAAA,EAAQ,KAAK,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,IAAA,CAAK,MAAA,EAAO;AACxE,MAAA;AAAA,IACF;AACA,IAAA,IAAI,YAAY,IAAA,EAAM;AACpB,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,UAAA,EAAY,GAAG,CAAA;AACnC,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,OAAA,GAAU,EAAE,EAAA,EAAI,KAAA,EAAO,IAAA,CAAK,MAAA,EAAQ,KAAK,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,IAAA,CAAK,MAAA,EAAO;AAAA,IAC1E,CAAA,MAAO;AACL,MAAA,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,IAAA,CAAK,MAAA;AAAA,IACxC;AAAA,EACF;AACA,EAAA,KAAA,CAAM,SAAS,MAAM,CAAA;AAErB,EAAA,IAAI,QAAA,CAAS,WAAW,CAAA,EAAG;AACzB,IAAA,OAAO,CAAC,EAAE,EAAA,EAAI,UAAA,EAAY,UAAA,EAAY,CAAA,EAAG,UAAA,EAAY,QAAA,CAAS,MAAA,EAAQ,IAAA,EAAM,QAAA,EAAU,CAAA;AAAA,EACxF;AACA,EAAA,OAAO,QAAA;AACT;AAEA,SAAS,gBAAgB,QAAA,EAAmC;AAC1D,EAAA,MAAM,MAAuB,EAAC;AAC9B,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAY;AAC5B,EAAA,IAAI,MAAA,GAAS,CAAA;AACb,EAAA,IAAI,CAAA,GAAI,CAAA;AACR,EAAA,OAAO,MAAA,GAAS,SAAS,MAAA,EAAQ;AAC/B,IAAA,IAAI,GAAA,GAAM,QAAA,CAAS,OAAA,CAAQ,MAAA,EAAQ,MAAM,CAAA;AACzC,IAAA,IAAI,GAAA,KAAQ,EAAA,EAAI,GAAA,GAAM,QAAA,CAAS,MAAA;AAC/B,IAAA,MAAM,IAAA,GAAO,QAAA,CAAS,KAAA,CAAM,MAAA,EAAQ,GAAG,CAAA;AACvC,IAAA,IAAI,IAAA,CAAK,IAAA,EAAK,CAAE,MAAA,GAAS,CAAA,EAAG;AAC1B,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,OAAA,CAAQ,IAAA,CAAK,KAAA,CAAM,IAAI,CAAA,CAAE,CAAC,CAAA,IAAK,CAAA,EAAA,EAAK,CAAC,CAAA,CAAE,GAAG,GAAG,CAAA;AACjE,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,GAAA,CAAI,IAAA,CAAK,EAAE,EAAA,EAAI,UAAA,EAAY,QAAQ,UAAA,EAAY,IAAA,CAAK,MAAA,EAAQ,IAAA,EAAM,CAAA;AAClE,MAAA,CAAA,IAAK,CAAA;AAAA,IACP;AACA,IAAA,MAAA,GAAS,GAAA,GAAM,CAAA;AAAA,EACjB;AACA,EAAA,IAAI,GAAA,CAAI,WAAW,CAAA,EAAG;AACpB,IAAA,OAAO,CAAC,EAAE,EAAA,EAAI,UAAA,EAAY,UAAA,EAAY,CAAA,EAAG,UAAA,EAAY,QAAA,CAAS,MAAA,EAAQ,IAAA,EAAM,QAAA,EAAU,CAAA;AAAA,EACxF;AACA,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,iBAAiB,CAAA,EAA+C;AACvE,EAAA,MAAM,QAA4C,EAAC;AACnD,EAAA,IAAI,MAAA,GAAS,CAAA;AACb,EAAA,KAAA,MAAW,IAAA,IAAQ,CAAA,CAAE,KAAA,CAAM,IAAI,CAAA,EAAG;AAChC,IAAA,MAAM,SAAS,IAAA,IAAQ,MAAA,GAAS,KAAK,MAAA,GAAS,CAAA,CAAE,SAAS,IAAA,GAAO,EAAA,CAAA;AAChE,IAAA,KAAA,CAAM,IAAA,CAAK,EAAE,IAAA,EAAM,MAAA,EAAQ,QAAQ,CAAA;AACnC,IAAA,MAAA,IAAU,MAAA,CAAO,MAAA;AAAA,EACnB;AACA,EAAA,OAAO,KAAA;AACT;AAEA,SAAS,QAAQ,CAAA,EAAmB;AAClC,EAAA,OACE,CAAA,CACG,WAAA,EAAY,CACZ,OAAA,CAAQ,eAAe,GAAG,CAAA,CAC1B,OAAA,CAAQ,UAAA,EAAY,EAAE,CAAA,CACtB,KAAA,CAAM,CAAA,EAAG,EAAE,CAAA,IAAK,SAAA;AAEvB;AAEA,SAAS,QAAA,CAAS,MAAc,KAAA,EAA4B;AAC1D,EAAA,IAAI,CAAC,KAAA,CAAM,GAAA,CAAI,IAAI,GAAG,OAAO,IAAA;AAC7B,EAAA,IAAI,CAAA,GAAI,CAAA;AACR,EAAA,OAAO,KAAA,CAAM,IAAI,CAAA,EAAG,IAAI,IAAI,CAAC,CAAA,CAAE,GAAG,CAAA,IAAK,CAAA;AACvC,EAAA,OAAO,CAAA,EAAG,IAAI,CAAA,CAAA,EAAI,CAAC,CAAA,CAAA;AACrB;AC3GO,SAAS,0BAA0B,IAAA,EAAoD;AAC5F,EAAA,IAAI,IAAA,CAAK,sBAAsB,KAAA,EAAO;AACpC,IAAAA,gBAAA,CAAI,iBAAA,GAAoB,KAAA;AAAA,EAC1B;AAEA,EAAA,IAAI,eAAA,GAA2C,IAAA;AAC/C,EAAA,IAAI,iBAAA,GAAmC,IAAA;AAEvC,EAAA,SAAS,WAAA,GAAgC;AACvC,IAAA,IAAI,CAAC,eAAA,EAAiB;AACpB,MAAA,MAAM,SAAA,GAAqC,EAAE,KAAA,EAAO,IAAA,CAAK,SAAS,MAAA,EAAO;AACzE,MAAA,IAAI,IAAA,CAAK,MAAA,KAAW,MAAA,EAAW,SAAA,CAAU,SAAS,IAAA,CAAK,MAAA;AACvD,MAAA,eAAA,GAAkBC,qBAAA,CAAS,oBAAA,EAAsB,IAAA,CAAK,KAAA,EAAO,SAAkB,CAAA;AAAA,IACjF;AACA,IAAA,OAAO,eAAA;AAAA,EACT;AAEA,EAAA,MAAM,OAAA,GAA4B;AAAA,IAChC,OAAO,IAAA,CAAK,KAAA;AAAA,IACZ,aAAA,EAAe,KAAK,aAAA,IAAiB,MAAA;AAAA,IACrC,MAAA,EAAQ,KAAK,MAAA,IAAU,QAAA;AAAA,IACvB,UAAA,EAAY,IAAA;AAAA,IACZ,MAAM,MAAM,KAAA,EAA2C;AACrD,MAAA,IAAI,KAAA,CAAM,WAAW,CAAA,EAAG;AACtB,QAAA,OAAO,EAAE,OAAA,EAAS,IAAI,SAAA,EAAW,IAAA,CAAK,aAAa,CAAA,EAAE;AAAA,MACvD;AACA,MAAA,MAAM,IAAA,GAAQ,MAAM,WAAA,EAAY;AAIhC,MAAA,MAAM,MAAA,GAAS,MAAM,IAAA,CAAK,KAAA,EAAO,EAAE,OAAA,EAAS,MAAA,EAAQ,SAAA,EAAW,IAAA,EAAM,CAAA;AACrE,MAAA,MAAM,OAAO,MAAA,CAAO,IAAA;AACpB,MAAA,MAAM,YAAY,MAAA,CAAO,IAAA,CAAK,MAAA,CAAO,IAAA,CAAK,SAAS,CAAC,CAAA;AACpD,MAAA,IAAI,iBAAA,KAAsB,IAAA,IAAQ,iBAAA,KAAsB,SAAA,EAAW;AACjE,QAAA,MAAM,IAAI,KAAA,CAAM,CAAA,0CAAA,EAA6C,iBAAiB,CAAA,MAAA,EAAS,SAAS,CAAA,CAAE,CAAA;AAAA,MACpG;AACA,MAAA,iBAAA,GAAoB,SAAA;AACpB,MAAA,MAAM,UAA0B,EAAC;AACjC,MAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,KAAA,CAAM,MAAA,EAAQ,KAAK,CAAA,EAAG;AACxC,QAAA,OAAA,CAAQ,IAAA,CAAK,IAAI,YAAA,CAAa,IAAA,CAAK,MAAA,EAAQ,IAAA,CAAK,UAAA,GAAa,CAAA,GAAI,SAAA,GAAY,CAAA,EAAG,SAAS,CAAA,CAAE,OAAO,CAAA;AAAA,MACpG;AACA,MAAA,OAAO,EAAE,SAAS,SAAA,EAAU;AAAA,IAC9B;AAAA,GACF;AACA,EAAA,IAAI,IAAA,CAAK,SAAA,KAAc,MAAA,EAAW,OAAA,CAAQ,YAAY,IAAA,CAAK,SAAA;AAC3D,EAAA,OAAO,OAAA;AACT;;;ACvCO,IAAM,aAAA,GAAgB;AACtB,IAAM,uBAAA,GAA0B;;;ACNvC,eAAsB,KAAA,CAAM,QAAA,EAAkB,IAAA,GAAqB,EAAC,EAA+B;AACjG,EAAA,MAAM,MAAA,GAAS,cAAc,QAAA,EAAU,EAAE,MAAM,IAAA,CAAK,QAAA,IAAY,WAAW,CAAA;AAC3E,EAAA,MAAM,OAAA,GACJ,KAAK,OAAA,IACL,yBAAA;AAAA,IACE,KAAK,aAAA,KAAkB,MAAA,GACnB,EAAE,KAAA,EAAO,KAAK,KAAA,IAAS,aAAA,EAAe,aAAA,EAAe,IAAA,CAAK,eAAc,GACxE,EAAE,KAAA,EAAO,IAAA,CAAK,SAAS,aAAA;AAAc,GAC3C;AAEF,EAAA,MAAM,MAAA,GAAS,MAAM,OAAA,CAAQ,KAAA,CAAM,MAAA,CAAO,IAAI,CAAC,CAAA,KAAM,CAAA,CAAE,IAAI,CAAC,CAAA;AAC5D,EAAA,IAAI,MAAA,CAAO,OAAA,CAAQ,MAAA,KAAW,MAAA,CAAO,MAAA,EAAQ;AAC3C,IAAA,MAAM,IAAI,MAAM,CAAA,iBAAA,EAAoB,MAAA,CAAO,QAAQ,MAAM,CAAA,aAAA,EAAgB,MAAA,CAAO,MAAM,CAAA,OAAA,CAAS,CAAA;AAAA,EACjG;AAEA,EAAA,MAAM,eAAA,GAAoC,MAAA,CAAO,GAAA,CAAI,CAAC,GAAG,CAAA,MAAO;AAAA,IAC9D,IAAI,CAAA,CAAE,EAAA;AAAA,IACN,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,MAAA,EAAQ,MAAA,CAAO,OAAA,CAAQ,CAAC;AAAA,GAC1B,CAAE,CAAA;AAEF,EAAA,MAAM,KAAA,GAAwB;AAAA,IAC5B,OAAO,OAAA,CAAQ,KAAA;AAAA,IACf,eAAe,OAAA,CAAQ,aAAA;AAAA,IACvB,WAAW,MAAA,CAAO,SAAA;AAAA,IAClB,QAAQ,OAAA,CAAQ,MAAA;AAAA,IAChB,YAAY,OAAA,CAAQ,UAAA;AAAA,IACpB,QAAA,EAAU,KAAK,QAAA,IAAY,SAAA;AAAA,IAC3B,MAAA,EAAQ;AAAA,GACV;AAEA,EAAA,OAAO,EAAE,aAAA,EAAe,CAAA,EAAG,MAAA,EAAQ,CAAC,KAAK,CAAA,EAAE;AAC7C;;;AC/BO,SAAS,cAAA,CACd,OAAA,EACA,WAAA,EACA,IAAA,GAAsB,EAAC,EACV;AACb,EAAA,MAAM,KAAA,GAAQ,SAAA,CAAU,OAAA,EAAS,IAAA,CAAK,KAAK,CAAA;AAC3C,EAAA,IAAI,CAAC,KAAA,EAAO,MAAM,IAAI,MAAM,mCAAmC,CAAA;AAC/D,EAAA,IAAI,WAAA,CAAY,MAAA,KAAW,KAAA,CAAM,SAAA,EAAW;AAC1C,IAAA,MAAM,IAAI,KAAA,CAAM,CAAA,uBAAA,EAA0B,WAAA,CAAY,MAAM,CAAA,sBAAA,EAAyB,KAAA,CAAM,KAAK,CAAA,EAAA,EAAK,KAAA,CAAM,SAAS,CAAA,CAAA,CAAG,CAAA;AAAA,EACzH;AAEA,EAAA,MAAM,CAAA,GAAI,KAAK,CAAA,IAAK,CAAA;AACpB,EAAA,MAAM,MAAA,GAAsB,KAAA,CAAM,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,MAAO;AAAA,IACnD,YAAY,KAAA,CAAM,KAAA;AAAA,IAClB,SAAS,CAAA,CAAE,EAAA;AAAA,IACX,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,OAAO,UAAA,CAAW,WAAA,EAAa,CAAA,CAAE,MAAA,EAAQ,MAAM,MAAM;AAAA,GACvD,CAAE,CAAA;AAGF,EAAA,MAAM,KAAA,GAAQ,KAAA,CAAM,MAAA,KAAW,WAAA,GAAc,CAAA,GAAI,EAAA;AACjD,EAAA,MAAA,CAAO,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,SAAS,CAAA,CAAE,KAAA,GAAQ,EAAE,KAAA,CAAM,CAAA;AACjD,EAAA,OAAO,MAAA,CAAO,KAAA,CAAM,CAAA,EAAG,CAAC,CAAA;AAC1B;AAEA,SAAS,SAAA,CAAU,SAA4B,SAAA,EAAgD;AAC7F,EAAA,IAAI,SAAA,SAAkB,OAAA,CAAQ,MAAA,CAAO,KAAK,CAAC,CAAA,KAAM,CAAA,CAAE,KAAA,KAAU,SAAS,CAAA;AACtE,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAC,CAAA;AACzB;AAEA,SAAS,UAAA,CAAW,CAAA,EAAiB,CAAA,EAAiB,MAAA,EAAiC;AACrF,EAAA,IAAI,WAAW,WAAA,EAAa;AAC1B,IAAA,IAAI,GAAA,GAAM,CAAA;AACV,IAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,KAAK,CAAA,EAAG;AACpC,MAAA,MAAM,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,GAAK,EAAE,CAAC,CAAA;AACrB,MAAA,GAAA,IAAO,CAAA,GAAI,CAAA;AAAA,IACb;AACA,IAAA,OAAO,IAAA,CAAK,KAAK,GAAG,CAAA;AAAA,EACtB;AACA,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,GAAA,IAAO,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACxD,EAAA,IAAI,MAAA,KAAW,OAAO,OAAO,GAAA;AAE7B,EAAA,IAAI,EAAA,GAAK,CAAA;AACT,EAAA,IAAI,EAAA,GAAK,CAAA;AACT,EAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,KAAK,CAAA,EAAG;AACpC,IAAA,EAAA,IAAM,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACjB,IAAA,EAAA,IAAM,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AAAA,EACnB;AACA,EAAA,MAAM,QAAQ,IAAA,CAAK,IAAA,CAAK,EAAE,CAAA,GAAI,IAAA,CAAK,KAAK,EAAE,CAAA;AAC1C,EAAA,OAAO,KAAA,KAAU,CAAA,GAAI,CAAA,GAAI,GAAA,GAAM,KAAA;AACjC;;;AClDA,IAAM,gBAAA,GAAmB,mDAAA;AAElB,SAAS,yBAAyB,IAAA,EAAmD;AAC1F,EAAA,MAAM,QAAQ,IAAA,CAAK,KAAA,IAAS,QAAQ,GAAA,CAAI,QAAA,IAAY,QAAQ,GAAA,CAAI,iBAAA;AAChE,EAAA,IAAI,CAAC,KAAA,EAAO;AACV,IAAA,MAAM,IAAI,MAAM,0EAA0E,CAAA;AAAA,EAC5F;AACA,EAAA,MAAM,OAAA,GAAU,KAAK,OAAA,IAAW,gBAAA;AAChC,EAAA,MAAM,MAAM,CAAA,EAAG,OAAO,IAAI,SAAA,CAAU,IAAA,CAAK,KAAK,CAAC,CAAA,4BAAA,CAAA;AAE/C,EAAA,MAAM,OAAA,GAA4B;AAAA,IAChC,OAAO,IAAA,CAAK,KAAA;AAAA,IACZ,aAAA,EAAe,KAAK,aAAA,IAAiB,MAAA;AAAA,IACrC,MAAA,EAAQ,KAAK,MAAA,IAAU,QAAA;AAAA,IACvB,UAAA,EAAY,IAAA;AAAA,IACZ,MAAM,MAAM,KAAA,EAA2C;AACrD,MAAA,IAAI,KAAA,CAAM,WAAW,CAAA,EAAG;AACtB,QAAA,OAAO,EAAE,OAAA,EAAS,IAAI,SAAA,EAAW,IAAA,CAAK,aAAa,CAAA,EAAE;AAAA,MACvD;AACA,MAAA,MAAM,IAAA,GAAO,IAAA,CAAK,SAAA,CAAU,EAAE,MAAA,EAAQ,KAAA,EAAO,OAAA,EAAS,EAAE,cAAA,EAAgB,IAAA,EAAK,EAAG,CAAA;AAChF,MAAA,MAAM,GAAA,GAAM,MAAM,KAAA,CAAM,GAAA,EAAK;AAAA,QAC3B,MAAA,EAAQ,MAAA;AAAA,QACR,OAAA,EAAS;AAAA,UACP,cAAA,EAAgB,kBAAA;AAAA,UAChB,aAAA,EAAe,UAAU,KAAK,CAAA;AAAA,SAChC;AAAA,QACA;AAAA,OACD,CAAA;AACD,MAAA,IAAI,CAAC,IAAI,EAAA,EAAI;AACX,QAAA,MAAM,SAAS,MAAM,GAAA,CAAI,MAAK,CAAE,KAAA,CAAM,MAAM,EAAE,CAAA;AAC9C,QAAA,MAAM,IAAI,KAAA,CAAM,CAAA,iBAAA,EAAoB,GAAA,CAAI,MAAM,CAAA,KAAA,EAAQ,IAAA,CAAK,KAAK,CAAA,EAAA,EAAK,MAAA,IAAU,GAAA,CAAI,UAAU,CAAA,CAAE,CAAA;AAAA,MACjG;AACA,MAAA,MAAM,GAAA,GAAO,MAAM,GAAA,CAAI,IAAA,EAAK;AAC5B,MAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,GAAA,EAAK,KAAA,CAAM,MAAM,CAAA;AAChD,MAAA,MAAM,YAAY,MAAA,CAAO,CAAC,CAAA,EAAG,MAAA,IAAU,KAAK,SAAA,IAAa,CAAA;AACzD,MAAA,MAAM,OAAA,GAAU,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,KAAM,UAAU,YAAA,CAAa,IAAA,CAAK,CAAC,CAAC,CAAC,CAAA;AACjE,MAAA,OAAO,EAAE,SAAS,SAAA,EAAU;AAAA,IAC9B;AAAA,GACF;AACA,EAAA,IAAI,IAAA,CAAK,SAAA,KAAc,MAAA,EAAW,OAAA,CAAQ,YAAY,IAAA,CAAK,SAAA;AAC3D,EAAA,OAAO,OAAA;AACT;AAWA,SAAS,eAAA,CAAgB,KAAc,aAAA,EAAmC;AACxE,EAAA,IAAI,CAAC,KAAA,CAAM,OAAA,CAAQ,GAAG,CAAA,EAAG;AACvB,IAAA,MAAM,IAAI,MAAM,2CAA2C,CAAA;AAAA,EAC7D;AACA,EAAA,IAAI,GAAA,CAAI,MAAA,KAAW,CAAA,EAAG,OAAO,EAAC;AAE9B,EAAA,MAAM,KAAA,GAAQ,IAAI,CAAC,CAAA;AACnB,EAAA,IAAI,OAAO,UAAU,QAAA,EAAU;AAC7B,IAAA,IAAI,kBAAkB,CAAA,EAAG;AACvB,MAAA,MAAM,IAAI,KAAA,CAAM,CAAA,yCAAA,EAA4C,aAAa,CAAA,CAAE,CAAA;AAAA,IAC7E;AACA,IAAA,OAAO,CAAC,GAAe,CAAA;AAAA,EACzB;AACA,EAAA,IAAI,KAAA,CAAM,OAAA,CAAQ,KAAK,CAAA,KAAM,KAAA,CAAM,MAAA,KAAW,CAAA,IAAK,OAAO,KAAA,CAAM,CAAC,CAAA,KAAM,QAAA,CAAA,EAAW;AAChF,IAAA,OAAO,GAAA;AAAA,EACT;AACA,EAAA,IAAI,KAAA,CAAM,QAAQ,KAAK,CAAA,IAAK,MAAM,OAAA,CAAQ,KAAA,CAAM,CAAC,CAAC,CAAA,EAAG;AAEnD,IAAA,OAAQ,GAAA,CAAqB,IAAI,QAAQ,CAAA;AAAA,EAC3C;AACA,EAAA,MAAM,IAAI,MAAM,+CAA+C,CAAA;AACjE;AAEA,SAAS,SAAS,MAAA,EAA8B;AAC9C,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,EAAC;AACjC,EAAA,MAAM,GAAA,GAAM,MAAA,CAAO,CAAC,CAAA,CAAG,MAAA;AACvB,EAAA,MAAM,MAAM,IAAI,KAAA,CAAc,GAAG,CAAA,CAAE,KAAK,CAAC,CAAA;AACzC,EAAA,KAAA,MAAW,KAAK,MAAA,EAAQ;AACtB,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,GAAA,EAAK,CAAA,IAAK,GAAG,GAAA,CAAI,CAAC,CAAA,IAAM,CAAA,CAAE,CAAC,CAAA;AAAA,EACjD;AACA,EAAA,KAAA,IAAS,CAAA,GAAI,GAAG,CAAA,GAAI,GAAA,EAAK,KAAK,CAAA,EAAG,GAAA,CAAI,CAAC,CAAA,IAAM,MAAA,CAAO,MAAA;AACnD,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,UAAU,CAAA,EAA+B;AAChD,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,GAAA,IAAO,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACxD,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,IAAA,CAAK,GAAG,CAAA;AAC1B,EAAA,IAAI,IAAA,KAAS,GAAG,OAAO,CAAA;AACvB,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,CAAA,CAAE,CAAC,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,GAAK,IAAA;AACrD,EAAA,OAAO,CAAA;AACT","file":"index.cjs","sourcesContent":["/**\n * Section-based markdown chunker.\n *\n * Splits a markdown document on ATX headings (`#`, `##`, ...). Each chunk\n * carries the byte offset and length into the original UTF-8 source so a\n * downstream consumer can map a vector hit back to the exact substring\n * without re-tokenising. Pre-heading content becomes a \"preamble\" chunk.\n */\n\nexport type ChunkingMode = 'document' | 'section' | 'paragraph';\n\nexport interface MarkdownChunk {\n id: string;\n textOffset: number;\n textLength: number;\n text: string;\n}\n\nexport interface ChunkOptions {\n mode?: ChunkingMode;\n}\n\nconst HEADING = /^(#{1,6})\\s+(.+?)\\s*$/;\n\nexport function chunkMarkdown(markdown: string, opts: ChunkOptions = {}): MarkdownChunk[] {\n const mode = opts.mode ?? 'section';\n if (mode === 'document') {\n return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }];\n }\n if (mode === 'paragraph') {\n return paragraphChunks(markdown);\n }\n return sectionChunks(markdown);\n}\n\nfunction sectionChunks(markdown: string): MarkdownChunk[] {\n const lines = splitWithOffsets(markdown);\n const sections: MarkdownChunk[] = [];\n let current: { id: string; start: number; end: number } | null = null;\n const ids = new Set<string>();\n\n function flush(end: number): void {\n if (!current) return;\n const text = markdown.slice(current.start, end);\n if (text.trim().length === 0) {\n current = null;\n return;\n }\n sections.push({ id: current.id, textOffset: current.start, textLength: end - current.start, text });\n current = null;\n }\n\n for (const line of lines) {\n const match = HEADING.exec(line.text);\n if (match) {\n flush(line.offset);\n const id = uniqueId(slugify(match[2] ?? `section-${sections.length + 1}`), ids);\n ids.add(id);\n current = { id, start: line.offset, end: line.offset + line.text.length };\n continue;\n }\n if (current === null) {\n const id = uniqueId('preamble', ids);\n ids.add(id);\n current = { id, start: line.offset, end: line.offset + line.text.length };\n } else {\n current.end = line.offset + line.text.length;\n }\n }\n flush(markdown.length);\n\n if (sections.length === 0) {\n return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }];\n }\n return sections;\n}\n\nfunction paragraphChunks(markdown: string): MarkdownChunk[] {\n const out: MarkdownChunk[] = [];\n const ids = new Set<string>();\n let cursor = 0;\n let i = 0;\n while (cursor < markdown.length) {\n let end = markdown.indexOf('\\n\\n', cursor);\n if (end === -1) end = markdown.length;\n const text = markdown.slice(cursor, end);\n if (text.trim().length > 0) {\n const id = uniqueId(slugify(text.split('\\n')[0] ?? `p-${i}`), ids);\n ids.add(id);\n out.push({ id, textOffset: cursor, textLength: text.length, text });\n i += 1;\n }\n cursor = end + 2;\n }\n if (out.length === 0) {\n return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }];\n }\n return out;\n}\n\nfunction splitWithOffsets(s: string): { text: string; offset: number }[] {\n const lines: { text: string; offset: number }[] = [];\n let offset = 0;\n for (const line of s.split('\\n')) {\n const withNl = line + (offset + line.length < s.length ? '\\n' : '');\n lines.push({ text: withNl, offset });\n offset += withNl.length;\n }\n return lines;\n}\n\nfunction slugify(s: string): string {\n return (\n s\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 64) || 'section'\n );\n}\n\nfunction uniqueId(base: string, taken: Set<string>): string {\n if (!taken.has(base)) return base;\n let n = 2;\n while (taken.has(`${base}-${n}`)) n += 1;\n return `${base}-${n}`;\n}\n","/**\n * transformers.js backend (browser + Node + Bun via @huggingface/transformers).\n *\n * Loads the model lazily and reuses the pipeline across calls. Models are\n * cached on disk by transformers.js itself (HF cache layout).\n */\n\nimport { pipeline, env } from '@huggingface/transformers';\nimport type { EmbeddingBackend, EmbeddingBackendOptions, EmbeddingMatrix } from './types.js';\n\ninterface TransformersBackendOptions extends EmbeddingBackendOptions {\n /** Override Xenova quantisation. Defaults to fp32 for closest fidelity. */\n dtype?: 'fp32' | 'fp16' | 'q8' | 'q4';\n /** Force a backend device (e.g. 'cpu', 'gpu', 'wasm'). Defaults to auto. */\n device?: 'cpu' | 'gpu' | 'wasm' | 'webgpu';\n /** Allow remote model downloads. true by default. */\n allowRemoteModels?: boolean;\n}\n\nexport function createTransformersBackend(opts: TransformersBackendOptions): EmbeddingBackend {\n if (opts.allowRemoteModels === false) {\n env.allowRemoteModels = false;\n }\n\n let pipelinePromise: Promise<unknown> | null = null;\n let resolvedDimension: number | null = null;\n\n function getPipeline(): Promise<unknown> {\n if (!pipelinePromise) {\n const modelOpts: Record<string, unknown> = { dtype: opts.dtype ?? 'fp32' };\n if (opts.device !== undefined) modelOpts.device = opts.device;\n pipelinePromise = pipeline('feature-extraction', opts.model, modelOpts as never);\n }\n return pipelinePromise;\n }\n\n const backend: EmbeddingBackend = {\n model: opts.model,\n modelRevision: opts.modelRevision ?? 'main',\n metric: opts.metric ?? 'cosine',\n normalized: true,\n async embed(texts: string[]): Promise<EmbeddingMatrix> {\n if (texts.length === 0) {\n return { vectors: [], dimension: opts.dimension ?? 0 };\n }\n const pipe = (await getPipeline()) as (\n texts: string[],\n opts: { pooling: 'mean'; normalize: boolean },\n ) => Promise<{ data: Float32Array; dims: number[] }>;\n const tensor = await pipe(texts, { pooling: 'mean', normalize: true });\n const data = tensor.data;\n const dimension = tensor.dims[tensor.dims.length - 1] as number;\n if (resolvedDimension !== null && resolvedDimension !== dimension) {\n throw new Error(`Model emitted inconsistent dimension: had ${resolvedDimension}, now ${dimension}`);\n }\n resolvedDimension = dimension;\n const vectors: Float32Array[] = [];\n for (let i = 0; i < texts.length; i += 1) {\n vectors.push(new Float32Array(data.buffer, data.byteOffset + i * dimension * 4, dimension).slice());\n }\n return { vectors, dimension };\n },\n };\n if (opts.dimension !== undefined) backend.dimension = opts.dimension;\n return backend;\n}\n","import type { EmbeddingMetric } from '@cvfile/sdk';\n\nexport interface EmbeddingMatrix {\n vectors: Float32Array[];\n dimension: number;\n}\n\nexport interface EmbeddingBackend {\n model: string;\n modelRevision: string;\n /** Optional pre-declared dimension; the backend may override after first call. */\n dimension?: number;\n metric: EmbeddingMetric;\n normalized: boolean;\n embed(texts: string[]): Promise<EmbeddingMatrix>;\n}\n\nexport interface EmbeddingBackendOptions {\n model: string;\n modelRevision?: string;\n /** Optional pre-known dimension (e.g. 1024 for bge-m3). */\n dimension?: number;\n metric?: EmbeddingMetric;\n}\n\n/** Recommended default per spec §5: BAAI BGE-M3, MIT, multilingual, 1024-dim. */\nexport const DEFAULT_MODEL = 'Xenova/bge-m3';\nexport const DEFAULT_MODEL_DIMENSION = 1024;\n","/**\n * High-level embed() API: markdown in, EmbeddingsPayload out, ready to drop\n * into pack({ embeddings: ... }).\n */\n\nimport type { EmbeddingChunk, EmbeddingSpace, EmbeddingsPayload } from '@cvfile/sdk';\nimport { chunkMarkdown, type ChunkingMode } from './chunk.js';\nimport { createTransformersBackend } from './transformers-backend.js';\nimport { DEFAULT_MODEL, type EmbeddingBackend } from './types.js';\n\nexport interface EmbedOptions {\n /** HF model id; defaults to BGE-M3 (Xenova/bge-m3). */\n model?: string;\n /** Pinned model revision; recorded in the payload for reproducibility. */\n modelRevision?: string;\n /** Chunking strategy; default 'section'. */\n chunking?: ChunkingMode;\n /** Bring-your-own backend (e.g. an OpenAI/Voyage adapter). */\n backend?: EmbeddingBackend;\n}\n\nexport async function embed(markdown: string, opts: EmbedOptions = {}): Promise<EmbeddingsPayload> {\n const chunks = chunkMarkdown(markdown, { mode: opts.chunking ?? 'section' });\n const backend =\n opts.backend ??\n createTransformersBackend(\n opts.modelRevision !== undefined\n ? { model: opts.model ?? DEFAULT_MODEL, modelRevision: opts.modelRevision }\n : { model: opts.model ?? DEFAULT_MODEL },\n );\n\n const matrix = await backend.embed(chunks.map((c) => c.text));\n if (matrix.vectors.length !== chunks.length) {\n throw new Error(`Backend returned ${matrix.vectors.length} vectors for ${chunks.length} chunks`);\n }\n\n const embeddingChunks: EmbeddingChunk[] = chunks.map((c, i) => ({\n id: c.id,\n textOffset: c.textOffset,\n textLength: c.textLength,\n vector: matrix.vectors[i]!,\n }));\n\n const space: EmbeddingSpace = {\n model: backend.model,\n modelRevision: backend.modelRevision,\n dimension: matrix.dimension,\n metric: backend.metric,\n normalized: backend.normalized,\n chunking: opts.chunking ?? 'section',\n chunks: embeddingChunks,\n };\n\n return { formatVersion: 1, spaces: [space] };\n}\n","/**\n * Pure similarity search over an EmbeddingsPayload. Vector input is the\n * caller's responsibility (encode the query with the same model that was\n * used to populate the space, ideally pulled from space.model).\n */\n\nimport type { EmbeddingMetric, EmbeddingSpace, EmbeddingsPayload } from '@cvfile/sdk';\n\nexport interface SearchHit {\n spaceModel: string;\n chunkId: string;\n textOffset: number;\n textLength: number;\n score: number;\n}\n\nexport interface SearchOptions {\n /** Restrict to a specific embedding space; default = first space. */\n model?: string;\n /** Top-k results; default 5. */\n k?: number;\n}\n\nexport function searchSemantic(\n payload: EmbeddingsPayload,\n queryVector: Float32Array,\n opts: SearchOptions = {},\n): SearchHit[] {\n const space = pickSpace(payload, opts.model);\n if (!space) throw new Error('No matching embedding space found');\n if (queryVector.length !== space.dimension) {\n throw new Error(`Query vector dimension ${queryVector.length} does not match space ${space.model} (${space.dimension})`);\n }\n\n const k = opts.k ?? 5;\n const scored: SearchHit[] = space.chunks.map((c) => ({\n spaceModel: space.model,\n chunkId: c.id,\n textOffset: c.textOffset,\n textLength: c.textLength,\n score: similarity(queryVector, c.vector, space.metric),\n }));\n\n // Higher score = better for cosine/dot; lower for euclidean.\n const order = space.metric === 'euclidean' ? 1 : -1;\n scored.sort((a, b) => order * (a.score - b.score));\n return scored.slice(0, k);\n}\n\nfunction pickSpace(payload: EmbeddingsPayload, modelHint?: string): EmbeddingSpace | undefined {\n if (modelHint) return payload.spaces.find((s) => s.model === modelHint);\n return payload.spaces[0];\n}\n\nfunction similarity(a: Float32Array, b: Float32Array, metric: EmbeddingMetric): number {\n if (metric === 'euclidean') {\n let sum = 0;\n for (let i = 0; i < a.length; i += 1) {\n const d = a[i]! - b[i]!;\n sum += d * d;\n }\n return Math.sqrt(sum);\n }\n let dot = 0;\n for (let i = 0; i < a.length; i += 1) dot += a[i]! * b[i]!;\n if (metric === 'dot') return dot;\n // cosine: assume normalized vectors when produced by our backend\n let na = 0;\n let nb = 0;\n for (let i = 0; i < a.length; i += 1) {\n na += a[i]! * a[i]!;\n nb += b[i]! * b[i]!;\n }\n const denom = Math.sqrt(na) * Math.sqrt(nb);\n return denom === 0 ? 0 : dot / denom;\n}\n","/**\n * Hugging Face Inference API backend.\n *\n * Calls https://router.huggingface.co/hf-inference/models/<model>/pipeline/feature-extraction\n * with HF_TOKEN. For sentence-transformers models (BGE-M3, MiniLM, etc.) the\n * response is already mean-pooled per input — one vector per text. We\n * normalise client-side so cosine math is consistent across backends.\n */\n\nimport type { EmbeddingBackend, EmbeddingMatrix } from './types.js';\nimport type { EmbeddingMetric } from '@cvfile/sdk';\n\nexport interface HuggingFaceBackendOptions {\n model: string;\n /** HF token. Defaults to `process.env.HF_TOKEN`. */\n token?: string;\n /** Pinned revision; recorded in the payload. Default 'main'. */\n modelRevision?: string;\n /** Pre-known dimension. Optional; inferred from first response otherwise. */\n dimension?: number;\n metric?: EmbeddingMetric;\n /** Override base URL (e.g. for self-hosted TEI). */\n baseUrl?: string;\n}\n\nconst DEFAULT_BASE_URL = 'https://router.huggingface.co/hf-inference/models';\n\nexport function createHuggingFaceBackend(opts: HuggingFaceBackendOptions): EmbeddingBackend {\n const token = opts.token ?? process.env.HF_TOKEN ?? process.env.HUGGINGFACE_TOKEN;\n if (!token) {\n throw new Error('HF_TOKEN (or HUGGINGFACE_TOKEN) is required for the Hugging Face backend');\n }\n const baseUrl = opts.baseUrl ?? DEFAULT_BASE_URL;\n const url = `${baseUrl}/${encodeURI(opts.model)}/pipeline/feature-extraction`;\n\n const backend: EmbeddingBackend = {\n model: opts.model,\n modelRevision: opts.modelRevision ?? 'main',\n metric: opts.metric ?? 'cosine',\n normalized: true,\n async embed(texts: string[]): Promise<EmbeddingMatrix> {\n if (texts.length === 0) {\n return { vectors: [], dimension: opts.dimension ?? 0 };\n }\n const body = JSON.stringify({ inputs: texts, options: { wait_for_model: true } });\n const res = await fetch(url, {\n method: 'POST',\n headers: {\n 'content-type': 'application/json',\n authorization: `Bearer ${token}`,\n },\n body,\n });\n if (!res.ok) {\n const detail = await res.text().catch(() => '');\n throw new Error(`HF Inference API ${res.status} for ${opts.model}: ${detail || res.statusText}`);\n }\n const raw = (await res.json()) as unknown;\n const matrix = parseHfResponse(raw, texts.length);\n const dimension = matrix[0]?.length ?? opts.dimension ?? 0;\n const vectors = matrix.map((v) => normalize(Float32Array.from(v)));\n return { vectors, dimension };\n },\n };\n if (opts.dimension !== undefined) backend.dimension = opts.dimension;\n return backend;\n}\n\n/**\n * Coerce the variety of shapes the HF Inference API returns into a flat\n * `number[][]`: one mean-pooled vector per input.\n *\n * Observed shapes:\n * - sentence-transformers (BGE-M3, MiniLM): `[[...vec], [...vec]]`\n * - feature-extraction without pooling: `[[[...token0], [...token1], ...], ...]`\n * - single-input convenience form: `[...vec]`\n */\nfunction parseHfResponse(raw: unknown, expectedCount: number): number[][] {\n if (!Array.isArray(raw)) {\n throw new Error('HF Inference API: expected array response');\n }\n if (raw.length === 0) return [];\n\n const first = raw[0];\n if (typeof first === 'number') {\n if (expectedCount !== 1) {\n throw new Error(`HF Inference API: got 1 vector, expected ${expectedCount}`);\n }\n return [raw as number[]];\n }\n if (Array.isArray(first) && (first.length === 0 || typeof first[0] === 'number')) {\n return raw as number[][];\n }\n if (Array.isArray(first) && Array.isArray(first[0])) {\n // token-level embeddings: mean-pool per input\n return (raw as number[][][]).map(meanPool);\n }\n throw new Error('HF Inference API: unrecognised response shape');\n}\n\nfunction meanPool(tokens: number[][]): number[] {\n if (tokens.length === 0) return [];\n const dim = tokens[0]!.length;\n const out = new Array<number>(dim).fill(0);\n for (const t of tokens) {\n for (let i = 0; i < dim; i += 1) out[i]! += t[i]!;\n }\n for (let i = 0; i < dim; i += 1) out[i]! /= tokens.length;\n return out;\n}\n\nfunction normalize(v: Float32Array): Float32Array {\n let sum = 0;\n for (let i = 0; i < v.length; i += 1) sum += v[i]! * v[i]!;\n const norm = Math.sqrt(sum);\n if (norm === 0) return v;\n for (let i = 0; i < v.length; i += 1) v[i] = v[i]! / norm;\n return v;\n}\n"]}
|
|
1
|
+
{"version":3,"sources":["../src/chunk.ts","../src/transformers-backend.ts","../src/types.ts","../src/embed.ts","../src/search.ts","../src/huggingface-backend.ts"],"names":["env","pipeline"],"mappings":";;;;;AAgBA,IAAM,OAAA,GAAU,IAAI,WAAA,EAAY;AAChC,IAAM,OAAA,GAAU,IAAI,WAAA,EAAY;AAehC,IAAM,OAAA,GAAU,uBAAA;AAST,SAAS,aAAA,CAAc,QAAA,EAAkB,IAAA,GAAqB,EAAC,EAAoB;AACxF,EAAA,MAAM,IAAA,GAAO,KAAK,IAAA,IAAQ,SAAA;AAC1B,EAAA,MAAM,KAAA,GAAQ,OAAA,CAAQ,MAAA,CAAO,QAAQ,CAAA;AACrC,EAAA,IAAI,SAAS,UAAA,EAAY;AACvB,IAAA,OAAO,CAAC,aAAA,CAAc,KAAK,CAAC,CAAA;AAAA,EAC9B;AACA,EAAA,IAAI,SAAS,WAAA,EAAa;AACxB,IAAA,OAAO,gBAAgB,KAAK,CAAA;AAAA,EAC9B;AACA,EAAA,OAAO,cAAc,KAAK,CAAA;AAC5B;AAEA,SAAS,cAAc,KAAA,EAAkC;AACvD,EAAA,OAAO,EAAE,EAAA,EAAI,UAAA,EAAY,UAAA,EAAY,GAAG,UAAA,EAAY,KAAA,CAAM,UAAA,EAAY,IAAA,EAAM,SAAA,CAAU,KAAA,EAAO,CAAA,EAAG,KAAA,CAAM,UAAU,CAAA,EAAE;AACpH;AAEA,SAAS,cAAc,KAAA,EAAoC;AACzD,EAAA,MAAM,KAAA,GAAQ,qBAAqB,KAAK,CAAA;AACxC,EAAA,MAAM,WAA4B,EAAC;AACnC,EAAA,IAAI,OAAA,GAA6D,IAAA;AACjE,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAY;AAE5B,EAAA,SAAS,MAAM,GAAA,EAAmB;AAChC,IAAA,IAAI,CAAC,OAAA,EAAS;AACd,IAAA,MAAM,IAAA,GAAO,SAAA,CAAU,KAAA,EAAO,OAAA,CAAQ,OAAO,GAAG,CAAA;AAChD,IAAA,IAAI,IAAA,CAAK,IAAA,EAAK,CAAE,MAAA,KAAW,CAAA,EAAG;AAC5B,MAAA,OAAA,GAAU,IAAA;AACV,MAAA;AAAA,IACF;AACA,IAAA,QAAA,CAAS,IAAA,CAAK,EAAE,EAAA,EAAI,OAAA,CAAQ,EAAA,EAAI,UAAA,EAAY,OAAA,CAAQ,KAAA,EAAO,UAAA,EAAY,GAAA,GAAM,OAAA,CAAQ,KAAA,EAAO,MAAM,CAAA;AAClG,IAAA,OAAA,GAAU,IAAA;AAAA,EACZ;AAEA,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,MAAM,KAAA,GAAQ,OAAA,CAAQ,IAAA,CAAK,IAAA,CAAK,IAAI,CAAA;AACpC,IAAA,MAAM,OAAA,GAAU,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,UAAA;AACnC,IAAA,IAAI,KAAA,EAAO;AACT,MAAA,KAAA,CAAM,KAAK,MAAM,CAAA;AACjB,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,OAAA,CAAQ,KAAA,CAAM,CAAC,CAAA,IAAK,CAAA,QAAA,EAAW,QAAA,CAAS,MAAA,GAAS,CAAC,CAAA,CAAE,CAAA,EAAG,GAAG,CAAA;AAC9E,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,OAAA,GAAU,EAAE,EAAA,EAAI,KAAA,EAAO,IAAA,CAAK,MAAA,EAAQ,KAAK,OAAA,EAAQ;AACjD,MAAA;AAAA,IACF;AACA,IAAA,IAAI,YAAY,IAAA,EAAM;AACpB,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,UAAA,EAAY,GAAG,CAAA;AACnC,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,OAAA,GAAU,EAAE,EAAA,EAAI,KAAA,EAAO,IAAA,CAAK,MAAA,EAAQ,KAAK,OAAA,EAAQ;AAAA,IACnD,CAAA,MAAO;AACL,MAAA,OAAA,CAAQ,GAAA,GAAM,OAAA;AAAA,IAChB;AAAA,EACF;AACA,EAAA,KAAA,CAAM,MAAM,UAAU,CAAA;AAEtB,EAAA,IAAI,QAAA,CAAS,WAAW,CAAA,EAAG;AACzB,IAAA,OAAO,CAAC,aAAA,CAAc,KAAK,CAAC,CAAA;AAAA,EAC9B;AACA,EAAA,OAAO,QAAA;AACT;AAEA,SAAS,gBAAgB,KAAA,EAAoC;AAC3D,EAAA,MAAM,MAAuB,EAAC;AAC9B,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAY;AAC5B,EAAA,MAAM,SAAA,GAAY,OAAA,CAAQ,MAAA,CAAO,MAAM,CAAA;AACvC,EAAA,IAAI,MAAA,GAAS,CAAA;AACb,EAAA,IAAI,CAAA,GAAI,CAAA;AACR,EAAA,OAAO,MAAA,GAAS,MAAM,UAAA,EAAY;AAChC,IAAA,IAAI,GAAA,GAAM,YAAA,CAAa,KAAA,EAAO,SAAA,EAAW,MAAM,CAAA;AAC/C,IAAA,IAAI,GAAA,KAAQ,EAAA,EAAI,GAAA,GAAM,KAAA,CAAM,UAAA;AAC5B,IAAA,MAAM,IAAA,GAAO,SAAA,CAAU,KAAA,EAAO,MAAA,EAAQ,GAAG,CAAA;AACzC,IAAA,IAAI,IAAA,CAAK,IAAA,EAAK,CAAE,MAAA,GAAS,CAAA,EAAG;AAC1B,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,OAAA,CAAQ,IAAA,CAAK,KAAA,CAAM,IAAI,CAAA,CAAE,CAAC,CAAA,IAAK,CAAA,EAAA,EAAK,CAAC,CAAA,CAAE,GAAG,GAAG,CAAA;AACjE,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,GAAA,CAAI,IAAA,CAAK,EAAE,EAAA,EAAI,UAAA,EAAY,QAAQ,UAAA,EAAY,GAAA,GAAM,MAAA,EAAQ,IAAA,EAAM,CAAA;AACnE,MAAA,CAAA,IAAK,CAAA;AAAA,IACP;AACA,IAAA,MAAA,GAAS,MAAM,SAAA,CAAU,UAAA;AAAA,EAC3B;AACA,EAAA,IAAI,GAAA,CAAI,WAAW,CAAA,EAAG;AACpB,IAAA,OAAO,CAAC,aAAA,CAAc,KAAK,CAAC,CAAA;AAAA,EAC9B;AACA,EAAA,OAAO,GAAA;AACT;AAGA,SAAS,SAAA,CAAU,KAAA,EAAmB,KAAA,EAAe,GAAA,EAAqB;AACxE,EAAA,OAAO,QAAQ,MAAA,CAAO,KAAA,CAAM,QAAA,CAAS,KAAA,EAAO,GAAG,CAAC,CAAA;AAClD;AAGA,SAAS,qBAAqB,KAAA,EAA+B;AAC3D,EAAA,MAAM,OAAA,GAAU,EAAA;AAChB,EAAA,MAAM,QAAoB,EAAC;AAC3B,EAAA,IAAI,KAAA,GAAQ,CAAA;AACZ,EAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,KAAA,CAAM,UAAA,EAAY,KAAK,CAAA,EAAG;AAC5C,IAAA,IAAI,KAAA,CAAM,CAAC,CAAA,KAAM,OAAA,EAAS;AACxB,MAAA,MAAM,UAAA,GAAa,IAAI,KAAA,GAAQ,CAAA;AAC/B,MAAA,KAAA,CAAM,IAAA,CAAK,EAAE,IAAA,EAAM,SAAA,CAAU,KAAA,EAAO,KAAA,EAAO,CAAA,GAAI,CAAC,CAAA,EAAG,MAAA,EAAQ,KAAA,EAAO,UAAA,EAAY,CAAA;AAC9E,MAAA,KAAA,GAAQ,CAAA,GAAI,CAAA;AAAA,IACd;AAAA,EACF;AACA,EAAA,IAAI,KAAA,GAAQ,MAAM,UAAA,EAAY;AAC5B,IAAA,KAAA,CAAM,IAAA,CAAK,EAAE,IAAA,EAAM,SAAA,CAAU,OAAO,KAAA,EAAO,KAAA,CAAM,UAAU,CAAA,EAAG,QAAQ,KAAA,EAAO,UAAA,EAAY,KAAA,CAAM,UAAA,GAAa,OAAO,CAAA;AAAA,EACrH;AACA,EAAA,OAAO,KAAA;AACT;AAGA,SAAS,YAAA,CAAa,QAAA,EAAsB,MAAA,EAAoB,IAAA,EAAsB;AACpF,EAAA,MAAM,IAAA,GAAO,QAAA,CAAS,UAAA,GAAa,MAAA,CAAO,UAAA;AAC1C,EAAA,KAAA,IAAS,CAAA,GAAI,IAAA,EAAM,CAAA,IAAK,IAAA,EAAM,KAAK,CAAA,EAAG;AACpC,IAAA,IAAI,OAAA,GAAU,IAAA;AACd,IAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,MAAA,CAAO,UAAA,EAAY,KAAK,CAAA,EAAG;AAC7C,MAAA,IAAI,SAAS,CAAA,GAAI,CAAC,CAAA,KAAM,MAAA,CAAO,CAAC,CAAA,EAAG;AACjC,QAAA,OAAA,GAAU,KAAA;AACV,QAAA;AAAA,MACF;AAAA,IACF;AACA,IAAA,IAAI,SAAS,OAAO,CAAA;AAAA,EACtB;AACA,EAAA,OAAO,EAAA;AACT;AAEA,SAAS,QAAQ,CAAA,EAAmB;AAClC,EAAA,OACE,CAAA,CACG,WAAA,EAAY,CACZ,OAAA,CAAQ,eAAe,GAAG,CAAA,CAC1B,OAAA,CAAQ,UAAA,EAAY,EAAE,CAAA,CACtB,KAAA,CAAM,CAAA,EAAG,EAAE,CAAA,IAAK,SAAA;AAEvB;AAEA,SAAS,QAAA,CAAS,MAAc,KAAA,EAA4B;AAC1D,EAAA,IAAI,CAAC,KAAA,CAAM,GAAA,CAAI,IAAI,GAAG,OAAO,IAAA;AAC7B,EAAA,IAAI,CAAA,GAAI,CAAA;AACR,EAAA,OAAO,KAAA,CAAM,IAAI,CAAA,EAAG,IAAI,IAAI,CAAC,CAAA,CAAE,GAAG,CAAA,IAAK,CAAA;AACvC,EAAA,OAAO,CAAA,EAAG,IAAI,CAAA,CAAA,EAAI,CAAC,CAAA,CAAA;AACrB;AC/JO,SAAS,0BAA0B,IAAA,EAAoD;AAC5F,EAAA,IAAI,IAAA,CAAK,sBAAsB,KAAA,EAAO;AACpC,IAAAA,gBAAA,CAAI,iBAAA,GAAoB,KAAA;AAAA,EAC1B;AAEA,EAAA,IAAI,eAAA,GAA2C,IAAA;AAC/C,EAAA,IAAI,iBAAA,GAAmC,IAAA;AAEvC,EAAA,SAAS,WAAA,GAAgC;AACvC,IAAA,IAAI,CAAC,eAAA,EAAiB;AACpB,MAAA,MAAM,SAAA,GAAqC,EAAE,KAAA,EAAO,IAAA,CAAK,SAAS,MAAA,EAAO;AACzE,MAAA,IAAI,IAAA,CAAK,MAAA,KAAW,MAAA,EAAW,SAAA,CAAU,SAAS,IAAA,CAAK,MAAA;AACvD,MAAA,eAAA,GAAkBC,qBAAA,CAAS,oBAAA,EAAsB,IAAA,CAAK,KAAA,EAAO,SAAkB,CAAA;AAAA,IACjF;AACA,IAAA,OAAO,eAAA;AAAA,EACT;AAEA,EAAA,MAAM,OAAA,GAA4B;AAAA,IAChC,OAAO,IAAA,CAAK,KAAA;AAAA,IACZ,aAAA,EAAe,KAAK,aAAA,IAAiB,MAAA;AAAA,IACrC,MAAA,EAAQ,KAAK,MAAA,IAAU,QAAA;AAAA,IACvB,UAAA,EAAY,IAAA;AAAA,IACZ,MAAM,MAAM,KAAA,EAA2C;AACrD,MAAA,IAAI,KAAA,CAAM,WAAW,CAAA,EAAG;AACtB,QAAA,OAAO,EAAE,OAAA,EAAS,IAAI,SAAA,EAAW,IAAA,CAAK,aAAa,CAAA,EAAE;AAAA,MACvD;AACA,MAAA,MAAM,IAAA,GAAQ,MAAM,WAAA,EAAY;AAIhC,MAAA,MAAM,MAAA,GAAS,MAAM,IAAA,CAAK,KAAA,EAAO,EAAE,OAAA,EAAS,MAAA,EAAQ,SAAA,EAAW,IAAA,EAAM,CAAA;AACrE,MAAA,MAAM,OAAO,MAAA,CAAO,IAAA;AACpB,MAAA,MAAM,YAAY,MAAA,CAAO,IAAA,CAAK,MAAA,CAAO,IAAA,CAAK,SAAS,CAAC,CAAA;AACpD,MAAA,IAAI,iBAAA,KAAsB,IAAA,IAAQ,iBAAA,KAAsB,SAAA,EAAW;AACjE,QAAA,MAAM,IAAI,KAAA,CAAM,CAAA,0CAAA,EAA6C,iBAAiB,CAAA,MAAA,EAAS,SAAS,CAAA,CAAE,CAAA;AAAA,MACpG;AACA,MAAA,iBAAA,GAAoB,SAAA;AACpB,MAAA,MAAM,UAA0B,EAAC;AACjC,MAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,KAAA,CAAM,MAAA,EAAQ,KAAK,CAAA,EAAG;AACxC,QAAA,OAAA,CAAQ,IAAA,CAAK,IAAI,YAAA,CAAa,IAAA,CAAK,MAAA,EAAQ,IAAA,CAAK,UAAA,GAAa,CAAA,GAAI,SAAA,GAAY,CAAA,EAAG,SAAS,CAAA,CAAE,OAAO,CAAA;AAAA,MACpG;AACA,MAAA,OAAO,EAAE,SAAS,SAAA,EAAU;AAAA,IAC9B;AAAA,GACF;AACA,EAAA,IAAI,IAAA,CAAK,SAAA,KAAc,MAAA,EAAW,OAAA,CAAQ,YAAY,IAAA,CAAK,SAAA;AAC3D,EAAA,OAAO,OAAA;AACT;;;ACvCO,IAAM,aAAA,GAAgB;AACtB,IAAM,uBAAA,GAA0B;;;ACNvC,eAAsB,KAAA,CAAM,QAAA,EAAkB,IAAA,GAAqB,EAAC,EAA+B;AACjG,EAAA,MAAM,MAAA,GAAS,cAAc,QAAA,EAAU,EAAE,MAAM,IAAA,CAAK,QAAA,IAAY,WAAW,CAAA;AAC3E,EAAA,MAAM,OAAA,GACJ,KAAK,OAAA,IACL,yBAAA;AAAA,IACE,KAAK,aAAA,KAAkB,MAAA,GACnB,EAAE,KAAA,EAAO,KAAK,KAAA,IAAS,aAAA,EAAe,aAAA,EAAe,IAAA,CAAK,eAAc,GACxE,EAAE,KAAA,EAAO,IAAA,CAAK,SAAS,aAAA;AAAc,GAC3C;AAEF,EAAA,MAAM,MAAA,GAAS,MAAM,OAAA,CAAQ,KAAA,CAAM,MAAA,CAAO,IAAI,CAAC,CAAA,KAAM,CAAA,CAAE,IAAI,CAAC,CAAA;AAC5D,EAAA,IAAI,MAAA,CAAO,OAAA,CAAQ,MAAA,KAAW,MAAA,CAAO,MAAA,EAAQ;AAC3C,IAAA,MAAM,IAAI,MAAM,CAAA,iBAAA,EAAoB,MAAA,CAAO,QAAQ,MAAM,CAAA,aAAA,EAAgB,MAAA,CAAO,MAAM,CAAA,OAAA,CAAS,CAAA;AAAA,EACjG;AAEA,EAAA,MAAM,eAAA,GAAoC,MAAA,CAAO,GAAA,CAAI,CAAC,GAAG,CAAA,MAAO;AAAA,IAC9D,IAAI,CAAA,CAAE,EAAA;AAAA,IACN,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,MAAA,EAAQ,MAAA,CAAO,OAAA,CAAQ,CAAC;AAAA,GAC1B,CAAE,CAAA;AAEF,EAAA,MAAM,KAAA,GAAwB;AAAA,IAC5B,OAAO,OAAA,CAAQ,KAAA;AAAA,IACf,eAAe,OAAA,CAAQ,aAAA;AAAA,IACvB,WAAW,MAAA,CAAO,SAAA;AAAA,IAClB,QAAQ,OAAA,CAAQ,MAAA;AAAA,IAChB,YAAY,OAAA,CAAQ,UAAA;AAAA,IACpB,QAAA,EAAU,KAAK,QAAA,IAAY,SAAA;AAAA,IAC3B,MAAA,EAAQ;AAAA,GACV;AAEA,EAAA,OAAO,EAAE,aAAA,EAAe,CAAA,EAAG,MAAA,EAAQ,CAAC,KAAK,CAAA,EAAE;AAC7C;;;AC/BO,SAAS,cAAA,CACd,OAAA,EACA,WAAA,EACA,IAAA,GAAsB,EAAC,EACV;AACb,EAAA,MAAM,KAAA,GAAQ,SAAA,CAAU,OAAA,EAAS,IAAA,CAAK,KAAK,CAAA;AAC3C,EAAA,IAAI,CAAC,KAAA,EAAO,MAAM,IAAI,MAAM,mCAAmC,CAAA;AAC/D,EAAA,IAAI,WAAA,CAAY,MAAA,KAAW,KAAA,CAAM,SAAA,EAAW;AAC1C,IAAA,MAAM,IAAI,KAAA,CAAM,CAAA,uBAAA,EAA0B,WAAA,CAAY,MAAM,CAAA,sBAAA,EAAyB,KAAA,CAAM,KAAK,CAAA,EAAA,EAAK,KAAA,CAAM,SAAS,CAAA,CAAA,CAAG,CAAA;AAAA,EACzH;AAEA,EAAA,MAAM,CAAA,GAAI,KAAK,CAAA,IAAK,CAAA;AACpB,EAAA,MAAM,MAAA,GAAsB,KAAA,CAAM,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,MAAO;AAAA,IACnD,YAAY,KAAA,CAAM,KAAA;AAAA,IAClB,SAAS,CAAA,CAAE,EAAA;AAAA,IACX,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,OAAO,UAAA,CAAW,WAAA,EAAa,CAAA,CAAE,MAAA,EAAQ,MAAM,MAAM;AAAA,GACvD,CAAE,CAAA;AAGF,EAAA,MAAM,KAAA,GAAQ,KAAA,CAAM,MAAA,KAAW,WAAA,GAAc,CAAA,GAAI,EAAA;AACjD,EAAA,MAAA,CAAO,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,SAAS,CAAA,CAAE,KAAA,GAAQ,EAAE,KAAA,CAAM,CAAA;AACjD,EAAA,OAAO,MAAA,CAAO,KAAA,CAAM,CAAA,EAAG,CAAC,CAAA;AAC1B;AAEA,SAAS,SAAA,CAAU,SAA4B,SAAA,EAAgD;AAC7F,EAAA,IAAI,SAAA,SAAkB,OAAA,CAAQ,MAAA,CAAO,KAAK,CAAC,CAAA,KAAM,CAAA,CAAE,KAAA,KAAU,SAAS,CAAA;AACtE,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAC,CAAA;AACzB;AAEA,SAAS,UAAA,CAAW,CAAA,EAAiB,CAAA,EAAiB,MAAA,EAAiC;AACrF,EAAA,IAAI,WAAW,WAAA,EAAa;AAC1B,IAAA,IAAI,GAAA,GAAM,CAAA;AACV,IAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,KAAK,CAAA,EAAG;AACpC,MAAA,MAAM,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,GAAK,EAAE,CAAC,CAAA;AACrB,MAAA,GAAA,IAAO,CAAA,GAAI,CAAA;AAAA,IACb;AACA,IAAA,OAAO,IAAA,CAAK,KAAK,GAAG,CAAA;AAAA,EACtB;AACA,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,GAAA,IAAO,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACxD,EAAA,IAAI,MAAA,KAAW,OAAO,OAAO,GAAA;AAE7B,EAAA,IAAI,EAAA,GAAK,CAAA;AACT,EAAA,IAAI,EAAA,GAAK,CAAA;AACT,EAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,KAAK,CAAA,EAAG;AACpC,IAAA,EAAA,IAAM,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACjB,IAAA,EAAA,IAAM,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AAAA,EACnB;AACA,EAAA,MAAM,QAAQ,IAAA,CAAK,IAAA,CAAK,EAAE,CAAA,GAAI,IAAA,CAAK,KAAK,EAAE,CAAA;AAC1C,EAAA,OAAO,KAAA,KAAU,CAAA,GAAI,CAAA,GAAI,GAAA,GAAM,KAAA;AACjC;;;AClDA,IAAM,gBAAA,GAAmB,mDAAA;AAElB,SAAS,yBAAyB,IAAA,EAAmD;AAC1F,EAAA,MAAM,QAAQ,IAAA,CAAK,KAAA,IAAS,QAAQ,GAAA,CAAI,QAAA,IAAY,QAAQ,GAAA,CAAI,iBAAA;AAChE,EAAA,IAAI,CAAC,KAAA,EAAO;AACV,IAAA,MAAM,IAAI,MAAM,0EAA0E,CAAA;AAAA,EAC5F;AACA,EAAA,MAAM,OAAA,GAAU,KAAK,OAAA,IAAW,gBAAA;AAChC,EAAA,MAAM,MAAM,CAAA,EAAG,OAAO,IAAI,SAAA,CAAU,IAAA,CAAK,KAAK,CAAC,CAAA,4BAAA,CAAA;AAE/C,EAAA,MAAM,OAAA,GAA4B;AAAA,IAChC,OAAO,IAAA,CAAK,KAAA;AAAA,IACZ,aAAA,EAAe,KAAK,aAAA,IAAiB,MAAA;AAAA,IACrC,MAAA,EAAQ,KAAK,MAAA,IAAU,QAAA;AAAA,IACvB,UAAA,EAAY,IAAA;AAAA,IACZ,MAAM,MAAM,KAAA,EAA2C;AACrD,MAAA,IAAI,KAAA,CAAM,WAAW,CAAA,EAAG;AACtB,QAAA,OAAO,EAAE,OAAA,EAAS,IAAI,SAAA,EAAW,IAAA,CAAK,aAAa,CAAA,EAAE;AAAA,MACvD;AACA,MAAA,MAAM,IAAA,GAAO,IAAA,CAAK,SAAA,CAAU,EAAE,MAAA,EAAQ,KAAA,EAAO,OAAA,EAAS,EAAE,cAAA,EAAgB,IAAA,EAAK,EAAG,CAAA;AAChF,MAAA,MAAM,GAAA,GAAM,MAAM,KAAA,CAAM,GAAA,EAAK;AAAA,QAC3B,MAAA,EAAQ,MAAA;AAAA,QACR,OAAA,EAAS;AAAA,UACP,cAAA,EAAgB,kBAAA;AAAA,UAChB,aAAA,EAAe,UAAU,KAAK,CAAA;AAAA,SAChC;AAAA,QACA;AAAA,OACD,CAAA;AACD,MAAA,IAAI,CAAC,IAAI,EAAA,EAAI;AACX,QAAA,MAAM,SAAS,MAAM,GAAA,CAAI,MAAK,CAAE,KAAA,CAAM,MAAM,EAAE,CAAA;AAC9C,QAAA,MAAM,IAAI,KAAA,CAAM,CAAA,iBAAA,EAAoB,GAAA,CAAI,MAAM,CAAA,KAAA,EAAQ,IAAA,CAAK,KAAK,CAAA,EAAA,EAAK,MAAA,IAAU,GAAA,CAAI,UAAU,CAAA,CAAE,CAAA;AAAA,MACjG;AACA,MAAA,MAAM,GAAA,GAAO,MAAM,GAAA,CAAI,IAAA,EAAK;AAC5B,MAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,GAAA,EAAK,KAAA,CAAM,MAAM,CAAA;AAChD,MAAA,MAAM,YAAY,MAAA,CAAO,CAAC,CAAA,EAAG,MAAA,IAAU,KAAK,SAAA,IAAa,CAAA;AACzD,MAAA,MAAM,OAAA,GAAU,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,KAAM,UAAU,YAAA,CAAa,IAAA,CAAK,CAAC,CAAC,CAAC,CAAA;AACjE,MAAA,OAAO,EAAE,SAAS,SAAA,EAAU;AAAA,IAC9B;AAAA,GACF;AACA,EAAA,IAAI,IAAA,CAAK,SAAA,KAAc,MAAA,EAAW,OAAA,CAAQ,YAAY,IAAA,CAAK,SAAA;AAC3D,EAAA,OAAO,OAAA;AACT;AAWA,SAAS,eAAA,CAAgB,KAAc,aAAA,EAAmC;AACxE,EAAA,IAAI,CAAC,KAAA,CAAM,OAAA,CAAQ,GAAG,CAAA,EAAG;AACvB,IAAA,MAAM,IAAI,MAAM,2CAA2C,CAAA;AAAA,EAC7D;AACA,EAAA,IAAI,GAAA,CAAI,MAAA,KAAW,CAAA,EAAG,OAAO,EAAC;AAE9B,EAAA,MAAM,KAAA,GAAQ,IAAI,CAAC,CAAA;AACnB,EAAA,IAAI,OAAO,UAAU,QAAA,EAAU;AAC7B,IAAA,IAAI,kBAAkB,CAAA,EAAG;AACvB,MAAA,MAAM,IAAI,KAAA,CAAM,CAAA,yCAAA,EAA4C,aAAa,CAAA,CAAE,CAAA;AAAA,IAC7E;AACA,IAAA,OAAO,CAAC,GAAe,CAAA;AAAA,EACzB;AACA,EAAA,IAAI,KAAA,CAAM,OAAA,CAAQ,KAAK,CAAA,KAAM,KAAA,CAAM,MAAA,KAAW,CAAA,IAAK,OAAO,KAAA,CAAM,CAAC,CAAA,KAAM,QAAA,CAAA,EAAW;AAChF,IAAA,OAAO,GAAA;AAAA,EACT;AACA,EAAA,IAAI,KAAA,CAAM,QAAQ,KAAK,CAAA,IAAK,MAAM,OAAA,CAAQ,KAAA,CAAM,CAAC,CAAC,CAAA,EAAG;AAEnD,IAAA,OAAQ,GAAA,CAAqB,IAAI,QAAQ,CAAA;AAAA,EAC3C;AACA,EAAA,MAAM,IAAI,MAAM,+CAA+C,CAAA;AACjE;AAEA,SAAS,SAAS,MAAA,EAA8B;AAC9C,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,EAAC;AACjC,EAAA,MAAM,GAAA,GAAM,MAAA,CAAO,CAAC,CAAA,CAAG,MAAA;AACvB,EAAA,MAAM,MAAM,IAAI,KAAA,CAAc,GAAG,CAAA,CAAE,KAAK,CAAC,CAAA;AACzC,EAAA,KAAA,MAAW,KAAK,MAAA,EAAQ;AACtB,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,GAAA,EAAK,CAAA,IAAK,GAAG,GAAA,CAAI,CAAC,CAAA,IAAM,CAAA,CAAE,CAAC,CAAA;AAAA,EACjD;AACA,EAAA,KAAA,IAAS,CAAA,GAAI,GAAG,CAAA,GAAI,GAAA,EAAK,KAAK,CAAA,EAAG,GAAA,CAAI,CAAC,CAAA,IAAM,MAAA,CAAO,MAAA;AACnD,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,UAAU,CAAA,EAA+B;AAChD,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,GAAA,IAAO,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACxD,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,IAAA,CAAK,GAAG,CAAA;AAC1B,EAAA,IAAI,IAAA,KAAS,GAAG,OAAO,CAAA;AACvB,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,CAAA,CAAE,CAAC,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,GAAK,IAAA;AACrD,EAAA,OAAO,CAAA;AACT","file":"index.cjs","sourcesContent":["/**\n * Section-based markdown chunker.\n *\n * Splits a markdown document on ATX headings (`#`, `##`, ...). Each chunk\n * carries the byte offset and length into the original UTF-8 source so a\n * downstream consumer can map a vector hit back to the exact substring\n * without re-tokenising. Pre-heading content becomes a \"preamble\" chunk.\n *\n * Per spec §5.1, `textOffset`/`textLength` are UTF-8 *byte* offsets into the\n * markdown source. We encode the document once with `TextEncoder`, track a\n * byte cursor while iterating lines (counting the trailing `\\n` byte), and\n * derive each chunk's `text` by decoding the corresponding byte slice. This\n * keeps the offsets in agreement with the Go and Python SDKs for any\n * non-ASCII résumé.\n */\n\nconst encoder = new TextEncoder();\nconst decoder = new TextDecoder();\n\nexport type ChunkingMode = 'document' | 'section' | 'paragraph';\n\nexport interface MarkdownChunk {\n id: string;\n textOffset: number;\n textLength: number;\n text: string;\n}\n\nexport interface ChunkOptions {\n mode?: ChunkingMode;\n}\n\nconst HEADING = /^(#{1,6})\\s+(.+?)\\s*$/;\n\n/** A source line plus its UTF-8 byte offset and byte length (including any trailing `\\n`). */\ninterface ByteLine {\n text: string;\n offset: number;\n byteLength: number;\n}\n\nexport function chunkMarkdown(markdown: string, opts: ChunkOptions = {}): MarkdownChunk[] {\n const mode = opts.mode ?? 'section';\n const bytes = encoder.encode(markdown);\n if (mode === 'document') {\n return [documentChunk(bytes)];\n }\n if (mode === 'paragraph') {\n return paragraphChunks(bytes);\n }\n return sectionChunks(bytes);\n}\n\nfunction documentChunk(bytes: Uint8Array): MarkdownChunk {\n return { id: 'document', textOffset: 0, textLength: bytes.byteLength, text: sliceText(bytes, 0, bytes.byteLength) };\n}\n\nfunction sectionChunks(bytes: Uint8Array): MarkdownChunk[] {\n const lines = splitWithByteOffsets(bytes);\n const sections: MarkdownChunk[] = [];\n let current: { id: string; start: number; end: number } | null = null;\n const ids = new Set<string>();\n\n function flush(end: number): void {\n if (!current) return;\n const text = sliceText(bytes, current.start, end);\n if (text.trim().length === 0) {\n current = null;\n return;\n }\n sections.push({ id: current.id, textOffset: current.start, textLength: end - current.start, text });\n current = null;\n }\n\n for (const line of lines) {\n const match = HEADING.exec(line.text);\n const lineEnd = line.offset + line.byteLength;\n if (match) {\n flush(line.offset);\n const id = uniqueId(slugify(match[2] ?? `section-${sections.length + 1}`), ids);\n ids.add(id);\n current = { id, start: line.offset, end: lineEnd };\n continue;\n }\n if (current === null) {\n const id = uniqueId('preamble', ids);\n ids.add(id);\n current = { id, start: line.offset, end: lineEnd };\n } else {\n current.end = lineEnd;\n }\n }\n flush(bytes.byteLength);\n\n if (sections.length === 0) {\n return [documentChunk(bytes)];\n }\n return sections;\n}\n\nfunction paragraphChunks(bytes: Uint8Array): MarkdownChunk[] {\n const out: MarkdownChunk[] = [];\n const ids = new Set<string>();\n const separator = encoder.encode('\\n\\n');\n let cursor = 0;\n let i = 0;\n while (cursor < bytes.byteLength) {\n let end = indexOfBytes(bytes, separator, cursor);\n if (end === -1) end = bytes.byteLength;\n const text = sliceText(bytes, cursor, end);\n if (text.trim().length > 0) {\n const id = uniqueId(slugify(text.split('\\n')[0] ?? `p-${i}`), ids);\n ids.add(id);\n out.push({ id, textOffset: cursor, textLength: end - cursor, text });\n i += 1;\n }\n cursor = end + separator.byteLength;\n }\n if (out.length === 0) {\n return [documentChunk(bytes)];\n }\n return out;\n}\n\n/** Decode the UTF-8 byte slice `[start, end)` back into a string. */\nfunction sliceText(bytes: Uint8Array, start: number, end: number): string {\n return decoder.decode(bytes.subarray(start, end));\n}\n\n/** Split UTF-8 bytes into lines, each tagged with its byte offset and byte length (newline included). */\nfunction splitWithByteOffsets(bytes: Uint8Array): ByteLine[] {\n const newline = 0x0a; // '\\n'\n const lines: ByteLine[] = [];\n let start = 0;\n for (let i = 0; i < bytes.byteLength; i += 1) {\n if (bytes[i] === newline) {\n const byteLength = i - start + 1;\n lines.push({ text: sliceText(bytes, start, i + 1), offset: start, byteLength });\n start = i + 1;\n }\n }\n if (start < bytes.byteLength) {\n lines.push({ text: sliceText(bytes, start, bytes.byteLength), offset: start, byteLength: bytes.byteLength - start });\n }\n return lines;\n}\n\n/** Find the byte index of `needle` in `haystack` at or after `from`, or -1. */\nfunction indexOfBytes(haystack: Uint8Array, needle: Uint8Array, from: number): number {\n const last = haystack.byteLength - needle.byteLength;\n for (let i = from; i <= last; i += 1) {\n let matched = true;\n for (let j = 0; j < needle.byteLength; j += 1) {\n if (haystack[i + j] !== needle[j]) {\n matched = false;\n break;\n }\n }\n if (matched) return i;\n }\n return -1;\n}\n\nfunction slugify(s: string): string {\n return (\n s\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 64) || 'section'\n );\n}\n\nfunction uniqueId(base: string, taken: Set<string>): string {\n if (!taken.has(base)) return base;\n let n = 2;\n while (taken.has(`${base}-${n}`)) n += 1;\n return `${base}-${n}`;\n}\n","/**\n * transformers.js backend (browser + Node + Bun via @huggingface/transformers).\n *\n * Loads the model lazily and reuses the pipeline across calls. Models are\n * cached on disk by transformers.js itself (HF cache layout).\n */\n\nimport { pipeline, env } from '@huggingface/transformers';\nimport type { EmbeddingBackend, EmbeddingBackendOptions, EmbeddingMatrix } from './types.js';\n\ninterface TransformersBackendOptions extends EmbeddingBackendOptions {\n /** Override Xenova quantisation. Defaults to fp32 for closest fidelity. */\n dtype?: 'fp32' | 'fp16' | 'q8' | 'q4';\n /** Force a backend device (e.g. 'cpu', 'gpu', 'wasm'). Defaults to auto. */\n device?: 'cpu' | 'gpu' | 'wasm' | 'webgpu';\n /** Allow remote model downloads. true by default. */\n allowRemoteModels?: boolean;\n}\n\nexport function createTransformersBackend(opts: TransformersBackendOptions): EmbeddingBackend {\n if (opts.allowRemoteModels === false) {\n env.allowRemoteModels = false;\n }\n\n let pipelinePromise: Promise<unknown> | null = null;\n let resolvedDimension: number | null = null;\n\n function getPipeline(): Promise<unknown> {\n if (!pipelinePromise) {\n const modelOpts: Record<string, unknown> = { dtype: opts.dtype ?? 'fp32' };\n if (opts.device !== undefined) modelOpts.device = opts.device;\n pipelinePromise = pipeline('feature-extraction', opts.model, modelOpts as never);\n }\n return pipelinePromise;\n }\n\n const backend: EmbeddingBackend = {\n model: opts.model,\n modelRevision: opts.modelRevision ?? 'main',\n metric: opts.metric ?? 'cosine',\n normalized: true,\n async embed(texts: string[]): Promise<EmbeddingMatrix> {\n if (texts.length === 0) {\n return { vectors: [], dimension: opts.dimension ?? 0 };\n }\n const pipe = (await getPipeline()) as (\n texts: string[],\n opts: { pooling: 'mean'; normalize: boolean },\n ) => Promise<{ data: Float32Array; dims: number[] }>;\n const tensor = await pipe(texts, { pooling: 'mean', normalize: true });\n const data = tensor.data;\n const dimension = tensor.dims[tensor.dims.length - 1] as number;\n if (resolvedDimension !== null && resolvedDimension !== dimension) {\n throw new Error(`Model emitted inconsistent dimension: had ${resolvedDimension}, now ${dimension}`);\n }\n resolvedDimension = dimension;\n const vectors: Float32Array[] = [];\n for (let i = 0; i < texts.length; i += 1) {\n vectors.push(new Float32Array(data.buffer, data.byteOffset + i * dimension * 4, dimension).slice());\n }\n return { vectors, dimension };\n },\n };\n if (opts.dimension !== undefined) backend.dimension = opts.dimension;\n return backend;\n}\n","import type { EmbeddingMetric } from '@cvfile/sdk';\n\nexport interface EmbeddingMatrix {\n vectors: Float32Array[];\n dimension: number;\n}\n\nexport interface EmbeddingBackend {\n model: string;\n modelRevision: string;\n /** Optional pre-declared dimension; the backend may override after first call. */\n dimension?: number;\n metric: EmbeddingMetric;\n normalized: boolean;\n embed(texts: string[]): Promise<EmbeddingMatrix>;\n}\n\nexport interface EmbeddingBackendOptions {\n model: string;\n modelRevision?: string;\n /** Optional pre-known dimension (e.g. 1024 for bge-m3). */\n dimension?: number;\n metric?: EmbeddingMetric;\n}\n\n/** Recommended default per spec §5: BAAI BGE-M3, MIT, multilingual, 1024-dim. */\nexport const DEFAULT_MODEL = 'Xenova/bge-m3';\nexport const DEFAULT_MODEL_DIMENSION = 1024;\n","/**\n * High-level embed() API: markdown in, EmbeddingsPayload out, ready to drop\n * into pack({ embeddings: ... }).\n */\n\nimport type { EmbeddingChunk, EmbeddingSpace, EmbeddingsPayload } from '@cvfile/sdk';\nimport { chunkMarkdown, type ChunkingMode } from './chunk.js';\nimport { createTransformersBackend } from './transformers-backend.js';\nimport { DEFAULT_MODEL, type EmbeddingBackend } from './types.js';\n\nexport interface EmbedOptions {\n /** HF model id; defaults to BGE-M3 (Xenova/bge-m3). */\n model?: string;\n /** Pinned model revision; recorded in the payload for reproducibility. */\n modelRevision?: string;\n /** Chunking strategy; default 'section'. */\n chunking?: ChunkingMode;\n /** Bring-your-own backend (e.g. an OpenAI/Voyage adapter). */\n backend?: EmbeddingBackend;\n}\n\nexport async function embed(markdown: string, opts: EmbedOptions = {}): Promise<EmbeddingsPayload> {\n const chunks = chunkMarkdown(markdown, { mode: opts.chunking ?? 'section' });\n const backend =\n opts.backend ??\n createTransformersBackend(\n opts.modelRevision !== undefined\n ? { model: opts.model ?? DEFAULT_MODEL, modelRevision: opts.modelRevision }\n : { model: opts.model ?? DEFAULT_MODEL },\n );\n\n const matrix = await backend.embed(chunks.map((c) => c.text));\n if (matrix.vectors.length !== chunks.length) {\n throw new Error(`Backend returned ${matrix.vectors.length} vectors for ${chunks.length} chunks`);\n }\n\n const embeddingChunks: EmbeddingChunk[] = chunks.map((c, i) => ({\n id: c.id,\n textOffset: c.textOffset,\n textLength: c.textLength,\n vector: matrix.vectors[i]!,\n }));\n\n const space: EmbeddingSpace = {\n model: backend.model,\n modelRevision: backend.modelRevision,\n dimension: matrix.dimension,\n metric: backend.metric,\n normalized: backend.normalized,\n chunking: opts.chunking ?? 'section',\n chunks: embeddingChunks,\n };\n\n return { formatVersion: 1, spaces: [space] };\n}\n","/**\n * Pure similarity search over an EmbeddingsPayload. Vector input is the\n * caller's responsibility (encode the query with the same model that was\n * used to populate the space, ideally pulled from space.model).\n */\n\nimport type { EmbeddingMetric, EmbeddingSpace, EmbeddingsPayload } from '@cvfile/sdk';\n\nexport interface SearchHit {\n spaceModel: string;\n chunkId: string;\n textOffset: number;\n textLength: number;\n score: number;\n}\n\nexport interface SearchOptions {\n /** Restrict to a specific embedding space; default = first space. */\n model?: string;\n /** Top-k results; default 5. */\n k?: number;\n}\n\nexport function searchSemantic(\n payload: EmbeddingsPayload,\n queryVector: Float32Array,\n opts: SearchOptions = {},\n): SearchHit[] {\n const space = pickSpace(payload, opts.model);\n if (!space) throw new Error('No matching embedding space found');\n if (queryVector.length !== space.dimension) {\n throw new Error(`Query vector dimension ${queryVector.length} does not match space ${space.model} (${space.dimension})`);\n }\n\n const k = opts.k ?? 5;\n const scored: SearchHit[] = space.chunks.map((c) => ({\n spaceModel: space.model,\n chunkId: c.id,\n textOffset: c.textOffset,\n textLength: c.textLength,\n score: similarity(queryVector, c.vector, space.metric),\n }));\n\n // Higher score = better for cosine/dot; lower for euclidean.\n const order = space.metric === 'euclidean' ? 1 : -1;\n scored.sort((a, b) => order * (a.score - b.score));\n return scored.slice(0, k);\n}\n\nfunction pickSpace(payload: EmbeddingsPayload, modelHint?: string): EmbeddingSpace | undefined {\n if (modelHint) return payload.spaces.find((s) => s.model === modelHint);\n return payload.spaces[0];\n}\n\nfunction similarity(a: Float32Array, b: Float32Array, metric: EmbeddingMetric): number {\n if (metric === 'euclidean') {\n let sum = 0;\n for (let i = 0; i < a.length; i += 1) {\n const d = a[i]! - b[i]!;\n sum += d * d;\n }\n return Math.sqrt(sum);\n }\n let dot = 0;\n for (let i = 0; i < a.length; i += 1) dot += a[i]! * b[i]!;\n if (metric === 'dot') return dot;\n // cosine: assume normalized vectors when produced by our backend\n let na = 0;\n let nb = 0;\n for (let i = 0; i < a.length; i += 1) {\n na += a[i]! * a[i]!;\n nb += b[i]! * b[i]!;\n }\n const denom = Math.sqrt(na) * Math.sqrt(nb);\n return denom === 0 ? 0 : dot / denom;\n}\n","/**\n * Hugging Face Inference API backend.\n *\n * Calls https://router.huggingface.co/hf-inference/models/<model>/pipeline/feature-extraction\n * with HF_TOKEN. For sentence-transformers models (BGE-M3, MiniLM, etc.) the\n * response is already mean-pooled per input — one vector per text. We\n * normalise client-side so cosine math is consistent across backends.\n */\n\nimport type { EmbeddingBackend, EmbeddingMatrix } from './types.js';\nimport type { EmbeddingMetric } from '@cvfile/sdk';\n\nexport interface HuggingFaceBackendOptions {\n model: string;\n /** HF token. Defaults to `process.env.HF_TOKEN`. */\n token?: string;\n /** Pinned revision; recorded in the payload. Default 'main'. */\n modelRevision?: string;\n /** Pre-known dimension. Optional; inferred from first response otherwise. */\n dimension?: number;\n metric?: EmbeddingMetric;\n /** Override base URL (e.g. for self-hosted TEI). */\n baseUrl?: string;\n}\n\nconst DEFAULT_BASE_URL = 'https://router.huggingface.co/hf-inference/models';\n\nexport function createHuggingFaceBackend(opts: HuggingFaceBackendOptions): EmbeddingBackend {\n const token = opts.token ?? process.env.HF_TOKEN ?? process.env.HUGGINGFACE_TOKEN;\n if (!token) {\n throw new Error('HF_TOKEN (or HUGGINGFACE_TOKEN) is required for the Hugging Face backend');\n }\n const baseUrl = opts.baseUrl ?? DEFAULT_BASE_URL;\n const url = `${baseUrl}/${encodeURI(opts.model)}/pipeline/feature-extraction`;\n\n const backend: EmbeddingBackend = {\n model: opts.model,\n modelRevision: opts.modelRevision ?? 'main',\n metric: opts.metric ?? 'cosine',\n normalized: true,\n async embed(texts: string[]): Promise<EmbeddingMatrix> {\n if (texts.length === 0) {\n return { vectors: [], dimension: opts.dimension ?? 0 };\n }\n const body = JSON.stringify({ inputs: texts, options: { wait_for_model: true } });\n const res = await fetch(url, {\n method: 'POST',\n headers: {\n 'content-type': 'application/json',\n authorization: `Bearer ${token}`,\n },\n body,\n });\n if (!res.ok) {\n const detail = await res.text().catch(() => '');\n throw new Error(`HF Inference API ${res.status} for ${opts.model}: ${detail || res.statusText}`);\n }\n const raw = (await res.json()) as unknown;\n const matrix = parseHfResponse(raw, texts.length);\n const dimension = matrix[0]?.length ?? opts.dimension ?? 0;\n const vectors = matrix.map((v) => normalize(Float32Array.from(v)));\n return { vectors, dimension };\n },\n };\n if (opts.dimension !== undefined) backend.dimension = opts.dimension;\n return backend;\n}\n\n/**\n * Coerce the variety of shapes the HF Inference API returns into a flat\n * `number[][]`: one mean-pooled vector per input.\n *\n * Observed shapes:\n * - sentence-transformers (BGE-M3, MiniLM): `[[...vec], [...vec]]`\n * - feature-extraction without pooling: `[[[...token0], [...token1], ...], ...]`\n * - single-input convenience form: `[...vec]`\n */\nfunction parseHfResponse(raw: unknown, expectedCount: number): number[][] {\n if (!Array.isArray(raw)) {\n throw new Error('HF Inference API: expected array response');\n }\n if (raw.length === 0) return [];\n\n const first = raw[0];\n if (typeof first === 'number') {\n if (expectedCount !== 1) {\n throw new Error(`HF Inference API: got 1 vector, expected ${expectedCount}`);\n }\n return [raw as number[]];\n }\n if (Array.isArray(first) && (first.length === 0 || typeof first[0] === 'number')) {\n return raw as number[][];\n }\n if (Array.isArray(first) && Array.isArray(first[0])) {\n // token-level embeddings: mean-pool per input\n return (raw as number[][][]).map(meanPool);\n }\n throw new Error('HF Inference API: unrecognised response shape');\n}\n\nfunction meanPool(tokens: number[][]): number[] {\n if (tokens.length === 0) return [];\n const dim = tokens[0]!.length;\n const out = new Array<number>(dim).fill(0);\n for (const t of tokens) {\n for (let i = 0; i < dim; i += 1) out[i]! += t[i]!;\n }\n for (let i = 0; i < dim; i += 1) out[i]! /= tokens.length;\n return out;\n}\n\nfunction normalize(v: Float32Array): Float32Array {\n let sum = 0;\n for (let i = 0; i < v.length; i += 1) sum += v[i]! * v[i]!;\n const norm = Math.sqrt(sum);\n if (norm === 0) return v;\n for (let i = 0; i < v.length; i += 1) v[i] = v[i]! / norm;\n return v;\n}\n"]}
|
package/dist/index.d.cts
CHANGED
|
@@ -7,6 +7,13 @@ import { EmbeddingMetric, EmbeddingsPayload } from '@cvfile/sdk';
|
|
|
7
7
|
* carries the byte offset and length into the original UTF-8 source so a
|
|
8
8
|
* downstream consumer can map a vector hit back to the exact substring
|
|
9
9
|
* without re-tokenising. Pre-heading content becomes a "preamble" chunk.
|
|
10
|
+
*
|
|
11
|
+
* Per spec §5.1, `textOffset`/`textLength` are UTF-8 *byte* offsets into the
|
|
12
|
+
* markdown source. We encode the document once with `TextEncoder`, track a
|
|
13
|
+
* byte cursor while iterating lines (counting the trailing `\n` byte), and
|
|
14
|
+
* derive each chunk's `text` by decoding the corresponding byte slice. This
|
|
15
|
+
* keeps the offsets in agreement with the Go and Python SDKs for any
|
|
16
|
+
* non-ASCII résumé.
|
|
10
17
|
*/
|
|
11
18
|
type ChunkingMode = 'document' | 'section' | 'paragraph';
|
|
12
19
|
interface MarkdownChunk {
|
package/dist/index.d.ts
CHANGED
|
@@ -7,6 +7,13 @@ import { EmbeddingMetric, EmbeddingsPayload } from '@cvfile/sdk';
|
|
|
7
7
|
* carries the byte offset and length into the original UTF-8 source so a
|
|
8
8
|
* downstream consumer can map a vector hit back to the exact substring
|
|
9
9
|
* without re-tokenising. Pre-heading content becomes a "preamble" chunk.
|
|
10
|
+
*
|
|
11
|
+
* Per spec §5.1, `textOffset`/`textLength` are UTF-8 *byte* offsets into the
|
|
12
|
+
* markdown source. We encode the document once with `TextEncoder`, track a
|
|
13
|
+
* byte cursor while iterating lines (counting the trailing `\n` byte), and
|
|
14
|
+
* derive each chunk's `text` by decoding the corresponding byte slice. This
|
|
15
|
+
* keeps the offsets in agreement with the Go and Python SDKs for any
|
|
16
|
+
* non-ASCII résumé.
|
|
10
17
|
*/
|
|
11
18
|
type ChunkingMode = 'document' | 'section' | 'paragraph';
|
|
12
19
|
interface MarkdownChunk {
|
package/dist/index.js
CHANGED
|
@@ -1,25 +1,31 @@
|
|
|
1
1
|
import { env, pipeline } from '@huggingface/transformers';
|
|
2
2
|
|
|
3
3
|
// src/chunk.ts
|
|
4
|
+
var encoder = new TextEncoder();
|
|
5
|
+
var decoder = new TextDecoder();
|
|
4
6
|
var HEADING = /^(#{1,6})\s+(.+?)\s*$/;
|
|
5
7
|
function chunkMarkdown(markdown, opts = {}) {
|
|
6
8
|
const mode = opts.mode ?? "section";
|
|
9
|
+
const bytes = encoder.encode(markdown);
|
|
7
10
|
if (mode === "document") {
|
|
8
|
-
return [
|
|
11
|
+
return [documentChunk(bytes)];
|
|
9
12
|
}
|
|
10
13
|
if (mode === "paragraph") {
|
|
11
|
-
return paragraphChunks(
|
|
14
|
+
return paragraphChunks(bytes);
|
|
12
15
|
}
|
|
13
|
-
return sectionChunks(
|
|
16
|
+
return sectionChunks(bytes);
|
|
14
17
|
}
|
|
15
|
-
function
|
|
16
|
-
|
|
18
|
+
function documentChunk(bytes) {
|
|
19
|
+
return { id: "document", textOffset: 0, textLength: bytes.byteLength, text: sliceText(bytes, 0, bytes.byteLength) };
|
|
20
|
+
}
|
|
21
|
+
function sectionChunks(bytes) {
|
|
22
|
+
const lines = splitWithByteOffsets(bytes);
|
|
17
23
|
const sections = [];
|
|
18
24
|
let current = null;
|
|
19
25
|
const ids = /* @__PURE__ */ new Set();
|
|
20
26
|
function flush(end) {
|
|
21
27
|
if (!current) return;
|
|
22
|
-
const text =
|
|
28
|
+
const text = sliceText(bytes, current.start, end);
|
|
23
29
|
if (text.trim().length === 0) {
|
|
24
30
|
current = null;
|
|
25
31
|
return;
|
|
@@ -29,59 +35,84 @@ function sectionChunks(markdown) {
|
|
|
29
35
|
}
|
|
30
36
|
for (const line of lines) {
|
|
31
37
|
const match = HEADING.exec(line.text);
|
|
38
|
+
const lineEnd = line.offset + line.byteLength;
|
|
32
39
|
if (match) {
|
|
33
40
|
flush(line.offset);
|
|
34
41
|
const id = uniqueId(slugify(match[2] ?? `section-${sections.length + 1}`), ids);
|
|
35
42
|
ids.add(id);
|
|
36
|
-
current = { id, start: line.offset, end:
|
|
43
|
+
current = { id, start: line.offset, end: lineEnd };
|
|
37
44
|
continue;
|
|
38
45
|
}
|
|
39
46
|
if (current === null) {
|
|
40
47
|
const id = uniqueId("preamble", ids);
|
|
41
48
|
ids.add(id);
|
|
42
|
-
current = { id, start: line.offset, end:
|
|
49
|
+
current = { id, start: line.offset, end: lineEnd };
|
|
43
50
|
} else {
|
|
44
|
-
current.end =
|
|
51
|
+
current.end = lineEnd;
|
|
45
52
|
}
|
|
46
53
|
}
|
|
47
|
-
flush(
|
|
54
|
+
flush(bytes.byteLength);
|
|
48
55
|
if (sections.length === 0) {
|
|
49
|
-
return [
|
|
56
|
+
return [documentChunk(bytes)];
|
|
50
57
|
}
|
|
51
58
|
return sections;
|
|
52
59
|
}
|
|
53
|
-
function paragraphChunks(
|
|
60
|
+
function paragraphChunks(bytes) {
|
|
54
61
|
const out = [];
|
|
55
62
|
const ids = /* @__PURE__ */ new Set();
|
|
63
|
+
const separator = encoder.encode("\n\n");
|
|
56
64
|
let cursor = 0;
|
|
57
65
|
let i = 0;
|
|
58
|
-
while (cursor <
|
|
59
|
-
let end =
|
|
60
|
-
if (end === -1) end =
|
|
61
|
-
const text =
|
|
66
|
+
while (cursor < bytes.byteLength) {
|
|
67
|
+
let end = indexOfBytes(bytes, separator, cursor);
|
|
68
|
+
if (end === -1) end = bytes.byteLength;
|
|
69
|
+
const text = sliceText(bytes, cursor, end);
|
|
62
70
|
if (text.trim().length > 0) {
|
|
63
71
|
const id = uniqueId(slugify(text.split("\n")[0] ?? `p-${i}`), ids);
|
|
64
72
|
ids.add(id);
|
|
65
|
-
out.push({ id, textOffset: cursor, textLength:
|
|
73
|
+
out.push({ id, textOffset: cursor, textLength: end - cursor, text });
|
|
66
74
|
i += 1;
|
|
67
75
|
}
|
|
68
|
-
cursor = end +
|
|
76
|
+
cursor = end + separator.byteLength;
|
|
69
77
|
}
|
|
70
78
|
if (out.length === 0) {
|
|
71
|
-
return [
|
|
79
|
+
return [documentChunk(bytes)];
|
|
72
80
|
}
|
|
73
81
|
return out;
|
|
74
82
|
}
|
|
75
|
-
function
|
|
83
|
+
function sliceText(bytes, start, end) {
|
|
84
|
+
return decoder.decode(bytes.subarray(start, end));
|
|
85
|
+
}
|
|
86
|
+
function splitWithByteOffsets(bytes) {
|
|
87
|
+
const newline = 10;
|
|
76
88
|
const lines = [];
|
|
77
|
-
let
|
|
78
|
-
for (
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
89
|
+
let start = 0;
|
|
90
|
+
for (let i = 0; i < bytes.byteLength; i += 1) {
|
|
91
|
+
if (bytes[i] === newline) {
|
|
92
|
+
const byteLength = i - start + 1;
|
|
93
|
+
lines.push({ text: sliceText(bytes, start, i + 1), offset: start, byteLength });
|
|
94
|
+
start = i + 1;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
if (start < bytes.byteLength) {
|
|
98
|
+
lines.push({ text: sliceText(bytes, start, bytes.byteLength), offset: start, byteLength: bytes.byteLength - start });
|
|
82
99
|
}
|
|
83
100
|
return lines;
|
|
84
101
|
}
|
|
102
|
+
function indexOfBytes(haystack, needle, from) {
|
|
103
|
+
const last = haystack.byteLength - needle.byteLength;
|
|
104
|
+
for (let i = from; i <= last; i += 1) {
|
|
105
|
+
let matched = true;
|
|
106
|
+
for (let j = 0; j < needle.byteLength; j += 1) {
|
|
107
|
+
if (haystack[i + j] !== needle[j]) {
|
|
108
|
+
matched = false;
|
|
109
|
+
break;
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
if (matched) return i;
|
|
113
|
+
}
|
|
114
|
+
return -1;
|
|
115
|
+
}
|
|
85
116
|
function slugify(s) {
|
|
86
117
|
return s.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 64) || "section";
|
|
87
118
|
}
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/chunk.ts","../src/transformers-backend.ts","../src/types.ts","../src/embed.ts","../src/search.ts","../src/huggingface-backend.ts"],"names":[],"mappings":";;;AAsBA,IAAM,OAAA,GAAU,uBAAA;AAET,SAAS,aAAA,CAAc,QAAA,EAAkB,IAAA,GAAqB,EAAC,EAAoB;AACxF,EAAA,MAAM,IAAA,GAAO,KAAK,IAAA,IAAQ,SAAA;AAC1B,EAAA,IAAI,SAAS,UAAA,EAAY;AACvB,IAAA,OAAO,CAAC,EAAE,EAAA,EAAI,UAAA,EAAY,UAAA,EAAY,CAAA,EAAG,UAAA,EAAY,QAAA,CAAS,MAAA,EAAQ,IAAA,EAAM,QAAA,EAAU,CAAA;AAAA,EACxF;AACA,EAAA,IAAI,SAAS,WAAA,EAAa;AACxB,IAAA,OAAO,gBAAgB,QAAQ,CAAA;AAAA,EACjC;AACA,EAAA,OAAO,cAAc,QAAQ,CAAA;AAC/B;AAEA,SAAS,cAAc,QAAA,EAAmC;AACxD,EAAA,MAAM,KAAA,GAAQ,iBAAiB,QAAQ,CAAA;AACvC,EAAA,MAAM,WAA4B,EAAC;AACnC,EAAA,IAAI,OAAA,GAA6D,IAAA;AACjE,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAY;AAE5B,EAAA,SAAS,MAAM,GAAA,EAAmB;AAChC,IAAA,IAAI,CAAC,OAAA,EAAS;AACd,IAAA,MAAM,IAAA,GAAO,QAAA,CAAS,KAAA,CAAM,OAAA,CAAQ,OAAO,GAAG,CAAA;AAC9C,IAAA,IAAI,IAAA,CAAK,IAAA,EAAK,CAAE,MAAA,KAAW,CAAA,EAAG;AAC5B,MAAA,OAAA,GAAU,IAAA;AACV,MAAA;AAAA,IACF;AACA,IAAA,QAAA,CAAS,IAAA,CAAK,EAAE,EAAA,EAAI,OAAA,CAAQ,EAAA,EAAI,UAAA,EAAY,OAAA,CAAQ,KAAA,EAAO,UAAA,EAAY,GAAA,GAAM,OAAA,CAAQ,KAAA,EAAO,MAAM,CAAA;AAClG,IAAA,OAAA,GAAU,IAAA;AAAA,EACZ;AAEA,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,MAAM,KAAA,GAAQ,OAAA,CAAQ,IAAA,CAAK,IAAA,CAAK,IAAI,CAAA;AACpC,IAAA,IAAI,KAAA,EAAO;AACT,MAAA,KAAA,CAAM,KAAK,MAAM,CAAA;AACjB,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,OAAA,CAAQ,KAAA,CAAM,CAAC,CAAA,IAAK,CAAA,QAAA,EAAW,QAAA,CAAS,MAAA,GAAS,CAAC,CAAA,CAAE,CAAA,EAAG,GAAG,CAAA;AAC9E,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,OAAA,GAAU,EAAE,EAAA,EAAI,KAAA,EAAO,IAAA,CAAK,MAAA,EAAQ,KAAK,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,IAAA,CAAK,MAAA,EAAO;AACxE,MAAA;AAAA,IACF;AACA,IAAA,IAAI,YAAY,IAAA,EAAM;AACpB,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,UAAA,EAAY,GAAG,CAAA;AACnC,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,OAAA,GAAU,EAAE,EAAA,EAAI,KAAA,EAAO,IAAA,CAAK,MAAA,EAAQ,KAAK,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,IAAA,CAAK,MAAA,EAAO;AAAA,IAC1E,CAAA,MAAO;AACL,MAAA,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,IAAA,CAAK,MAAA;AAAA,IACxC;AAAA,EACF;AACA,EAAA,KAAA,CAAM,SAAS,MAAM,CAAA;AAErB,EAAA,IAAI,QAAA,CAAS,WAAW,CAAA,EAAG;AACzB,IAAA,OAAO,CAAC,EAAE,EAAA,EAAI,UAAA,EAAY,UAAA,EAAY,CAAA,EAAG,UAAA,EAAY,QAAA,CAAS,MAAA,EAAQ,IAAA,EAAM,QAAA,EAAU,CAAA;AAAA,EACxF;AACA,EAAA,OAAO,QAAA;AACT;AAEA,SAAS,gBAAgB,QAAA,EAAmC;AAC1D,EAAA,MAAM,MAAuB,EAAC;AAC9B,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAY;AAC5B,EAAA,IAAI,MAAA,GAAS,CAAA;AACb,EAAA,IAAI,CAAA,GAAI,CAAA;AACR,EAAA,OAAO,MAAA,GAAS,SAAS,MAAA,EAAQ;AAC/B,IAAA,IAAI,GAAA,GAAM,QAAA,CAAS,OAAA,CAAQ,MAAA,EAAQ,MAAM,CAAA;AACzC,IAAA,IAAI,GAAA,KAAQ,EAAA,EAAI,GAAA,GAAM,QAAA,CAAS,MAAA;AAC/B,IAAA,MAAM,IAAA,GAAO,QAAA,CAAS,KAAA,CAAM,MAAA,EAAQ,GAAG,CAAA;AACvC,IAAA,IAAI,IAAA,CAAK,IAAA,EAAK,CAAE,MAAA,GAAS,CAAA,EAAG;AAC1B,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,OAAA,CAAQ,IAAA,CAAK,KAAA,CAAM,IAAI,CAAA,CAAE,CAAC,CAAA,IAAK,CAAA,EAAA,EAAK,CAAC,CAAA,CAAE,GAAG,GAAG,CAAA;AACjE,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,GAAA,CAAI,IAAA,CAAK,EAAE,EAAA,EAAI,UAAA,EAAY,QAAQ,UAAA,EAAY,IAAA,CAAK,MAAA,EAAQ,IAAA,EAAM,CAAA;AAClE,MAAA,CAAA,IAAK,CAAA;AAAA,IACP;AACA,IAAA,MAAA,GAAS,GAAA,GAAM,CAAA;AAAA,EACjB;AACA,EAAA,IAAI,GAAA,CAAI,WAAW,CAAA,EAAG;AACpB,IAAA,OAAO,CAAC,EAAE,EAAA,EAAI,UAAA,EAAY,UAAA,EAAY,CAAA,EAAG,UAAA,EAAY,QAAA,CAAS,MAAA,EAAQ,IAAA,EAAM,QAAA,EAAU,CAAA;AAAA,EACxF;AACA,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,iBAAiB,CAAA,EAA+C;AACvE,EAAA,MAAM,QAA4C,EAAC;AACnD,EAAA,IAAI,MAAA,GAAS,CAAA;AACb,EAAA,KAAA,MAAW,IAAA,IAAQ,CAAA,CAAE,KAAA,CAAM,IAAI,CAAA,EAAG;AAChC,IAAA,MAAM,SAAS,IAAA,IAAQ,MAAA,GAAS,KAAK,MAAA,GAAS,CAAA,CAAE,SAAS,IAAA,GAAO,EAAA,CAAA;AAChE,IAAA,KAAA,CAAM,IAAA,CAAK,EAAE,IAAA,EAAM,MAAA,EAAQ,QAAQ,CAAA;AACnC,IAAA,MAAA,IAAU,MAAA,CAAO,MAAA;AAAA,EACnB;AACA,EAAA,OAAO,KAAA;AACT;AAEA,SAAS,QAAQ,CAAA,EAAmB;AAClC,EAAA,OACE,CAAA,CACG,WAAA,EAAY,CACZ,OAAA,CAAQ,eAAe,GAAG,CAAA,CAC1B,OAAA,CAAQ,UAAA,EAAY,EAAE,CAAA,CACtB,KAAA,CAAM,CAAA,EAAG,EAAE,CAAA,IAAK,SAAA;AAEvB;AAEA,SAAS,QAAA,CAAS,MAAc,KAAA,EAA4B;AAC1D,EAAA,IAAI,CAAC,KAAA,CAAM,GAAA,CAAI,IAAI,GAAG,OAAO,IAAA;AAC7B,EAAA,IAAI,CAAA,GAAI,CAAA;AACR,EAAA,OAAO,KAAA,CAAM,IAAI,CAAA,EAAG,IAAI,IAAI,CAAC,CAAA,CAAE,GAAG,CAAA,IAAK,CAAA;AACvC,EAAA,OAAO,CAAA,EAAG,IAAI,CAAA,CAAA,EAAI,CAAC,CAAA,CAAA;AACrB;AC3GO,SAAS,0BAA0B,IAAA,EAAoD;AAC5F,EAAA,IAAI,IAAA,CAAK,sBAAsB,KAAA,EAAO;AACpC,IAAA,GAAA,CAAI,iBAAA,GAAoB,KAAA;AAAA,EAC1B;AAEA,EAAA,IAAI,eAAA,GAA2C,IAAA;AAC/C,EAAA,IAAI,iBAAA,GAAmC,IAAA;AAEvC,EAAA,SAAS,WAAA,GAAgC;AACvC,IAAA,IAAI,CAAC,eAAA,EAAiB;AACpB,MAAA,MAAM,SAAA,GAAqC,EAAE,KAAA,EAAO,IAAA,CAAK,SAAS,MAAA,EAAO;AACzE,MAAA,IAAI,IAAA,CAAK,MAAA,KAAW,MAAA,EAAW,SAAA,CAAU,SAAS,IAAA,CAAK,MAAA;AACvD,MAAA,eAAA,GAAkB,QAAA,CAAS,oBAAA,EAAsB,IAAA,CAAK,KAAA,EAAO,SAAkB,CAAA;AAAA,IACjF;AACA,IAAA,OAAO,eAAA;AAAA,EACT;AAEA,EAAA,MAAM,OAAA,GAA4B;AAAA,IAChC,OAAO,IAAA,CAAK,KAAA;AAAA,IACZ,aAAA,EAAe,KAAK,aAAA,IAAiB,MAAA;AAAA,IACrC,MAAA,EAAQ,KAAK,MAAA,IAAU,QAAA;AAAA,IACvB,UAAA,EAAY,IAAA;AAAA,IACZ,MAAM,MAAM,KAAA,EAA2C;AACrD,MAAA,IAAI,KAAA,CAAM,WAAW,CAAA,EAAG;AACtB,QAAA,OAAO,EAAE,OAAA,EAAS,IAAI,SAAA,EAAW,IAAA,CAAK,aAAa,CAAA,EAAE;AAAA,MACvD;AACA,MAAA,MAAM,IAAA,GAAQ,MAAM,WAAA,EAAY;AAIhC,MAAA,MAAM,MAAA,GAAS,MAAM,IAAA,CAAK,KAAA,EAAO,EAAE,OAAA,EAAS,MAAA,EAAQ,SAAA,EAAW,IAAA,EAAM,CAAA;AACrE,MAAA,MAAM,OAAO,MAAA,CAAO,IAAA;AACpB,MAAA,MAAM,YAAY,MAAA,CAAO,IAAA,CAAK,MAAA,CAAO,IAAA,CAAK,SAAS,CAAC,CAAA;AACpD,MAAA,IAAI,iBAAA,KAAsB,IAAA,IAAQ,iBAAA,KAAsB,SAAA,EAAW;AACjE,QAAA,MAAM,IAAI,KAAA,CAAM,CAAA,0CAAA,EAA6C,iBAAiB,CAAA,MAAA,EAAS,SAAS,CAAA,CAAE,CAAA;AAAA,MACpG;AACA,MAAA,iBAAA,GAAoB,SAAA;AACpB,MAAA,MAAM,UAA0B,EAAC;AACjC,MAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,KAAA,CAAM,MAAA,EAAQ,KAAK,CAAA,EAAG;AACxC,QAAA,OAAA,CAAQ,IAAA,CAAK,IAAI,YAAA,CAAa,IAAA,CAAK,MAAA,EAAQ,IAAA,CAAK,UAAA,GAAa,CAAA,GAAI,SAAA,GAAY,CAAA,EAAG,SAAS,CAAA,CAAE,OAAO,CAAA;AAAA,MACpG;AACA,MAAA,OAAO,EAAE,SAAS,SAAA,EAAU;AAAA,IAC9B;AAAA,GACF;AACA,EAAA,IAAI,IAAA,CAAK,SAAA,KAAc,MAAA,EAAW,OAAA,CAAQ,YAAY,IAAA,CAAK,SAAA;AAC3D,EAAA,OAAO,OAAA;AACT;;;ACvCO,IAAM,aAAA,GAAgB;AACtB,IAAM,uBAAA,GAA0B;;;ACNvC,eAAsB,KAAA,CAAM,QAAA,EAAkB,IAAA,GAAqB,EAAC,EAA+B;AACjG,EAAA,MAAM,MAAA,GAAS,cAAc,QAAA,EAAU,EAAE,MAAM,IAAA,CAAK,QAAA,IAAY,WAAW,CAAA;AAC3E,EAAA,MAAM,OAAA,GACJ,KAAK,OAAA,IACL,yBAAA;AAAA,IACE,KAAK,aAAA,KAAkB,MAAA,GACnB,EAAE,KAAA,EAAO,KAAK,KAAA,IAAS,aAAA,EAAe,aAAA,EAAe,IAAA,CAAK,eAAc,GACxE,EAAE,KAAA,EAAO,IAAA,CAAK,SAAS,aAAA;AAAc,GAC3C;AAEF,EAAA,MAAM,MAAA,GAAS,MAAM,OAAA,CAAQ,KAAA,CAAM,MAAA,CAAO,IAAI,CAAC,CAAA,KAAM,CAAA,CAAE,IAAI,CAAC,CAAA;AAC5D,EAAA,IAAI,MAAA,CAAO,OAAA,CAAQ,MAAA,KAAW,MAAA,CAAO,MAAA,EAAQ;AAC3C,IAAA,MAAM,IAAI,MAAM,CAAA,iBAAA,EAAoB,MAAA,CAAO,QAAQ,MAAM,CAAA,aAAA,EAAgB,MAAA,CAAO,MAAM,CAAA,OAAA,CAAS,CAAA;AAAA,EACjG;AAEA,EAAA,MAAM,eAAA,GAAoC,MAAA,CAAO,GAAA,CAAI,CAAC,GAAG,CAAA,MAAO;AAAA,IAC9D,IAAI,CAAA,CAAE,EAAA;AAAA,IACN,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,MAAA,EAAQ,MAAA,CAAO,OAAA,CAAQ,CAAC;AAAA,GAC1B,CAAE,CAAA;AAEF,EAAA,MAAM,KAAA,GAAwB;AAAA,IAC5B,OAAO,OAAA,CAAQ,KAAA;AAAA,IACf,eAAe,OAAA,CAAQ,aAAA;AAAA,IACvB,WAAW,MAAA,CAAO,SAAA;AAAA,IAClB,QAAQ,OAAA,CAAQ,MAAA;AAAA,IAChB,YAAY,OAAA,CAAQ,UAAA;AAAA,IACpB,QAAA,EAAU,KAAK,QAAA,IAAY,SAAA;AAAA,IAC3B,MAAA,EAAQ;AAAA,GACV;AAEA,EAAA,OAAO,EAAE,aAAA,EAAe,CAAA,EAAG,MAAA,EAAQ,CAAC,KAAK,CAAA,EAAE;AAC7C;;;AC/BO,SAAS,cAAA,CACd,OAAA,EACA,WAAA,EACA,IAAA,GAAsB,EAAC,EACV;AACb,EAAA,MAAM,KAAA,GAAQ,SAAA,CAAU,OAAA,EAAS,IAAA,CAAK,KAAK,CAAA;AAC3C,EAAA,IAAI,CAAC,KAAA,EAAO,MAAM,IAAI,MAAM,mCAAmC,CAAA;AAC/D,EAAA,IAAI,WAAA,CAAY,MAAA,KAAW,KAAA,CAAM,SAAA,EAAW;AAC1C,IAAA,MAAM,IAAI,KAAA,CAAM,CAAA,uBAAA,EAA0B,WAAA,CAAY,MAAM,CAAA,sBAAA,EAAyB,KAAA,CAAM,KAAK,CAAA,EAAA,EAAK,KAAA,CAAM,SAAS,CAAA,CAAA,CAAG,CAAA;AAAA,EACzH;AAEA,EAAA,MAAM,CAAA,GAAI,KAAK,CAAA,IAAK,CAAA;AACpB,EAAA,MAAM,MAAA,GAAsB,KAAA,CAAM,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,MAAO;AAAA,IACnD,YAAY,KAAA,CAAM,KAAA;AAAA,IAClB,SAAS,CAAA,CAAE,EAAA;AAAA,IACX,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,OAAO,UAAA,CAAW,WAAA,EAAa,CAAA,CAAE,MAAA,EAAQ,MAAM,MAAM;AAAA,GACvD,CAAE,CAAA;AAGF,EAAA,MAAM,KAAA,GAAQ,KAAA,CAAM,MAAA,KAAW,WAAA,GAAc,CAAA,GAAI,EAAA;AACjD,EAAA,MAAA,CAAO,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,SAAS,CAAA,CAAE,KAAA,GAAQ,EAAE,KAAA,CAAM,CAAA;AACjD,EAAA,OAAO,MAAA,CAAO,KAAA,CAAM,CAAA,EAAG,CAAC,CAAA;AAC1B;AAEA,SAAS,SAAA,CAAU,SAA4B,SAAA,EAAgD;AAC7F,EAAA,IAAI,SAAA,SAAkB,OAAA,CAAQ,MAAA,CAAO,KAAK,CAAC,CAAA,KAAM,CAAA,CAAE,KAAA,KAAU,SAAS,CAAA;AACtE,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAC,CAAA;AACzB;AAEA,SAAS,UAAA,CAAW,CAAA,EAAiB,CAAA,EAAiB,MAAA,EAAiC;AACrF,EAAA,IAAI,WAAW,WAAA,EAAa;AAC1B,IAAA,IAAI,GAAA,GAAM,CAAA;AACV,IAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,KAAK,CAAA,EAAG;AACpC,MAAA,MAAM,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,GAAK,EAAE,CAAC,CAAA;AACrB,MAAA,GAAA,IAAO,CAAA,GAAI,CAAA;AAAA,IACb;AACA,IAAA,OAAO,IAAA,CAAK,KAAK,GAAG,CAAA;AAAA,EACtB;AACA,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,GAAA,IAAO,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACxD,EAAA,IAAI,MAAA,KAAW,OAAO,OAAO,GAAA;AAE7B,EAAA,IAAI,EAAA,GAAK,CAAA;AACT,EAAA,IAAI,EAAA,GAAK,CAAA;AACT,EAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,KAAK,CAAA,EAAG;AACpC,IAAA,EAAA,IAAM,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACjB,IAAA,EAAA,IAAM,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AAAA,EACnB;AACA,EAAA,MAAM,QAAQ,IAAA,CAAK,IAAA,CAAK,EAAE,CAAA,GAAI,IAAA,CAAK,KAAK,EAAE,CAAA;AAC1C,EAAA,OAAO,KAAA,KAAU,CAAA,GAAI,CAAA,GAAI,GAAA,GAAM,KAAA;AACjC;;;AClDA,IAAM,gBAAA,GAAmB,mDAAA;AAElB,SAAS,yBAAyB,IAAA,EAAmD;AAC1F,EAAA,MAAM,QAAQ,IAAA,CAAK,KAAA,IAAS,QAAQ,GAAA,CAAI,QAAA,IAAY,QAAQ,GAAA,CAAI,iBAAA;AAChE,EAAA,IAAI,CAAC,KAAA,EAAO;AACV,IAAA,MAAM,IAAI,MAAM,0EAA0E,CAAA;AAAA,EAC5F;AACA,EAAA,MAAM,OAAA,GAAU,KAAK,OAAA,IAAW,gBAAA;AAChC,EAAA,MAAM,MAAM,CAAA,EAAG,OAAO,IAAI,SAAA,CAAU,IAAA,CAAK,KAAK,CAAC,CAAA,4BAAA,CAAA;AAE/C,EAAA,MAAM,OAAA,GAA4B;AAAA,IAChC,OAAO,IAAA,CAAK,KAAA;AAAA,IACZ,aAAA,EAAe,KAAK,aAAA,IAAiB,MAAA;AAAA,IACrC,MAAA,EAAQ,KAAK,MAAA,IAAU,QAAA;AAAA,IACvB,UAAA,EAAY,IAAA;AAAA,IACZ,MAAM,MAAM,KAAA,EAA2C;AACrD,MAAA,IAAI,KAAA,CAAM,WAAW,CAAA,EAAG;AACtB,QAAA,OAAO,EAAE,OAAA,EAAS,IAAI,SAAA,EAAW,IAAA,CAAK,aAAa,CAAA,EAAE;AAAA,MACvD;AACA,MAAA,MAAM,IAAA,GAAO,IAAA,CAAK,SAAA,CAAU,EAAE,MAAA,EAAQ,KAAA,EAAO,OAAA,EAAS,EAAE,cAAA,EAAgB,IAAA,EAAK,EAAG,CAAA;AAChF,MAAA,MAAM,GAAA,GAAM,MAAM,KAAA,CAAM,GAAA,EAAK;AAAA,QAC3B,MAAA,EAAQ,MAAA;AAAA,QACR,OAAA,EAAS;AAAA,UACP,cAAA,EAAgB,kBAAA;AAAA,UAChB,aAAA,EAAe,UAAU,KAAK,CAAA;AAAA,SAChC;AAAA,QACA;AAAA,OACD,CAAA;AACD,MAAA,IAAI,CAAC,IAAI,EAAA,EAAI;AACX,QAAA,MAAM,SAAS,MAAM,GAAA,CAAI,MAAK,CAAE,KAAA,CAAM,MAAM,EAAE,CAAA;AAC9C,QAAA,MAAM,IAAI,KAAA,CAAM,CAAA,iBAAA,EAAoB,GAAA,CAAI,MAAM,CAAA,KAAA,EAAQ,IAAA,CAAK,KAAK,CAAA,EAAA,EAAK,MAAA,IAAU,GAAA,CAAI,UAAU,CAAA,CAAE,CAAA;AAAA,MACjG;AACA,MAAA,MAAM,GAAA,GAAO,MAAM,GAAA,CAAI,IAAA,EAAK;AAC5B,MAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,GAAA,EAAK,KAAA,CAAM,MAAM,CAAA;AAChD,MAAA,MAAM,YAAY,MAAA,CAAO,CAAC,CAAA,EAAG,MAAA,IAAU,KAAK,SAAA,IAAa,CAAA;AACzD,MAAA,MAAM,OAAA,GAAU,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,KAAM,UAAU,YAAA,CAAa,IAAA,CAAK,CAAC,CAAC,CAAC,CAAA;AACjE,MAAA,OAAO,EAAE,SAAS,SAAA,EAAU;AAAA,IAC9B;AAAA,GACF;AACA,EAAA,IAAI,IAAA,CAAK,SAAA,KAAc,MAAA,EAAW,OAAA,CAAQ,YAAY,IAAA,CAAK,SAAA;AAC3D,EAAA,OAAO,OAAA;AACT;AAWA,SAAS,eAAA,CAAgB,KAAc,aAAA,EAAmC;AACxE,EAAA,IAAI,CAAC,KAAA,CAAM,OAAA,CAAQ,GAAG,CAAA,EAAG;AACvB,IAAA,MAAM,IAAI,MAAM,2CAA2C,CAAA;AAAA,EAC7D;AACA,EAAA,IAAI,GAAA,CAAI,MAAA,KAAW,CAAA,EAAG,OAAO,EAAC;AAE9B,EAAA,MAAM,KAAA,GAAQ,IAAI,CAAC,CAAA;AACnB,EAAA,IAAI,OAAO,UAAU,QAAA,EAAU;AAC7B,IAAA,IAAI,kBAAkB,CAAA,EAAG;AACvB,MAAA,MAAM,IAAI,KAAA,CAAM,CAAA,yCAAA,EAA4C,aAAa,CAAA,CAAE,CAAA;AAAA,IAC7E;AACA,IAAA,OAAO,CAAC,GAAe,CAAA;AAAA,EACzB;AACA,EAAA,IAAI,KAAA,CAAM,OAAA,CAAQ,KAAK,CAAA,KAAM,KAAA,CAAM,MAAA,KAAW,CAAA,IAAK,OAAO,KAAA,CAAM,CAAC,CAAA,KAAM,QAAA,CAAA,EAAW;AAChF,IAAA,OAAO,GAAA;AAAA,EACT;AACA,EAAA,IAAI,KAAA,CAAM,QAAQ,KAAK,CAAA,IAAK,MAAM,OAAA,CAAQ,KAAA,CAAM,CAAC,CAAC,CAAA,EAAG;AAEnD,IAAA,OAAQ,GAAA,CAAqB,IAAI,QAAQ,CAAA;AAAA,EAC3C;AACA,EAAA,MAAM,IAAI,MAAM,+CAA+C,CAAA;AACjE;AAEA,SAAS,SAAS,MAAA,EAA8B;AAC9C,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,EAAC;AACjC,EAAA,MAAM,GAAA,GAAM,MAAA,CAAO,CAAC,CAAA,CAAG,MAAA;AACvB,EAAA,MAAM,MAAM,IAAI,KAAA,CAAc,GAAG,CAAA,CAAE,KAAK,CAAC,CAAA;AACzC,EAAA,KAAA,MAAW,KAAK,MAAA,EAAQ;AACtB,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,GAAA,EAAK,CAAA,IAAK,GAAG,GAAA,CAAI,CAAC,CAAA,IAAM,CAAA,CAAE,CAAC,CAAA;AAAA,EACjD;AACA,EAAA,KAAA,IAAS,CAAA,GAAI,GAAG,CAAA,GAAI,GAAA,EAAK,KAAK,CAAA,EAAG,GAAA,CAAI,CAAC,CAAA,IAAM,MAAA,CAAO,MAAA;AACnD,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,UAAU,CAAA,EAA+B;AAChD,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,GAAA,IAAO,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACxD,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,IAAA,CAAK,GAAG,CAAA;AAC1B,EAAA,IAAI,IAAA,KAAS,GAAG,OAAO,CAAA;AACvB,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,CAAA,CAAE,CAAC,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,GAAK,IAAA;AACrD,EAAA,OAAO,CAAA;AACT","file":"index.js","sourcesContent":["/**\n * Section-based markdown chunker.\n *\n * Splits a markdown document on ATX headings (`#`, `##`, ...). Each chunk\n * carries the byte offset and length into the original UTF-8 source so a\n * downstream consumer can map a vector hit back to the exact substring\n * without re-tokenising. Pre-heading content becomes a \"preamble\" chunk.\n */\n\nexport type ChunkingMode = 'document' | 'section' | 'paragraph';\n\nexport interface MarkdownChunk {\n id: string;\n textOffset: number;\n textLength: number;\n text: string;\n}\n\nexport interface ChunkOptions {\n mode?: ChunkingMode;\n}\n\nconst HEADING = /^(#{1,6})\\s+(.+?)\\s*$/;\n\nexport function chunkMarkdown(markdown: string, opts: ChunkOptions = {}): MarkdownChunk[] {\n const mode = opts.mode ?? 'section';\n if (mode === 'document') {\n return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }];\n }\n if (mode === 'paragraph') {\n return paragraphChunks(markdown);\n }\n return sectionChunks(markdown);\n}\n\nfunction sectionChunks(markdown: string): MarkdownChunk[] {\n const lines = splitWithOffsets(markdown);\n const sections: MarkdownChunk[] = [];\n let current: { id: string; start: number; end: number } | null = null;\n const ids = new Set<string>();\n\n function flush(end: number): void {\n if (!current) return;\n const text = markdown.slice(current.start, end);\n if (text.trim().length === 0) {\n current = null;\n return;\n }\n sections.push({ id: current.id, textOffset: current.start, textLength: end - current.start, text });\n current = null;\n }\n\n for (const line of lines) {\n const match = HEADING.exec(line.text);\n if (match) {\n flush(line.offset);\n const id = uniqueId(slugify(match[2] ?? `section-${sections.length + 1}`), ids);\n ids.add(id);\n current = { id, start: line.offset, end: line.offset + line.text.length };\n continue;\n }\n if (current === null) {\n const id = uniqueId('preamble', ids);\n ids.add(id);\n current = { id, start: line.offset, end: line.offset + line.text.length };\n } else {\n current.end = line.offset + line.text.length;\n }\n }\n flush(markdown.length);\n\n if (sections.length === 0) {\n return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }];\n }\n return sections;\n}\n\nfunction paragraphChunks(markdown: string): MarkdownChunk[] {\n const out: MarkdownChunk[] = [];\n const ids = new Set<string>();\n let cursor = 0;\n let i = 0;\n while (cursor < markdown.length) {\n let end = markdown.indexOf('\\n\\n', cursor);\n if (end === -1) end = markdown.length;\n const text = markdown.slice(cursor, end);\n if (text.trim().length > 0) {\n const id = uniqueId(slugify(text.split('\\n')[0] ?? `p-${i}`), ids);\n ids.add(id);\n out.push({ id, textOffset: cursor, textLength: text.length, text });\n i += 1;\n }\n cursor = end + 2;\n }\n if (out.length === 0) {\n return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }];\n }\n return out;\n}\n\nfunction splitWithOffsets(s: string): { text: string; offset: number }[] {\n const lines: { text: string; offset: number }[] = [];\n let offset = 0;\n for (const line of s.split('\\n')) {\n const withNl = line + (offset + line.length < s.length ? '\\n' : '');\n lines.push({ text: withNl, offset });\n offset += withNl.length;\n }\n return lines;\n}\n\nfunction slugify(s: string): string {\n return (\n s\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 64) || 'section'\n );\n}\n\nfunction uniqueId(base: string, taken: Set<string>): string {\n if (!taken.has(base)) return base;\n let n = 2;\n while (taken.has(`${base}-${n}`)) n += 1;\n return `${base}-${n}`;\n}\n","/**\n * transformers.js backend (browser + Node + Bun via @huggingface/transformers).\n *\n * Loads the model lazily and reuses the pipeline across calls. Models are\n * cached on disk by transformers.js itself (HF cache layout).\n */\n\nimport { pipeline, env } from '@huggingface/transformers';\nimport type { EmbeddingBackend, EmbeddingBackendOptions, EmbeddingMatrix } from './types.js';\n\ninterface TransformersBackendOptions extends EmbeddingBackendOptions {\n /** Override Xenova quantisation. Defaults to fp32 for closest fidelity. */\n dtype?: 'fp32' | 'fp16' | 'q8' | 'q4';\n /** Force a backend device (e.g. 'cpu', 'gpu', 'wasm'). Defaults to auto. */\n device?: 'cpu' | 'gpu' | 'wasm' | 'webgpu';\n /** Allow remote model downloads. true by default. */\n allowRemoteModels?: boolean;\n}\n\nexport function createTransformersBackend(opts: TransformersBackendOptions): EmbeddingBackend {\n if (opts.allowRemoteModels === false) {\n env.allowRemoteModels = false;\n }\n\n let pipelinePromise: Promise<unknown> | null = null;\n let resolvedDimension: number | null = null;\n\n function getPipeline(): Promise<unknown> {\n if (!pipelinePromise) {\n const modelOpts: Record<string, unknown> = { dtype: opts.dtype ?? 'fp32' };\n if (opts.device !== undefined) modelOpts.device = opts.device;\n pipelinePromise = pipeline('feature-extraction', opts.model, modelOpts as never);\n }\n return pipelinePromise;\n }\n\n const backend: EmbeddingBackend = {\n model: opts.model,\n modelRevision: opts.modelRevision ?? 'main',\n metric: opts.metric ?? 'cosine',\n normalized: true,\n async embed(texts: string[]): Promise<EmbeddingMatrix> {\n if (texts.length === 0) {\n return { vectors: [], dimension: opts.dimension ?? 0 };\n }\n const pipe = (await getPipeline()) as (\n texts: string[],\n opts: { pooling: 'mean'; normalize: boolean },\n ) => Promise<{ data: Float32Array; dims: number[] }>;\n const tensor = await pipe(texts, { pooling: 'mean', normalize: true });\n const data = tensor.data;\n const dimension = tensor.dims[tensor.dims.length - 1] as number;\n if (resolvedDimension !== null && resolvedDimension !== dimension) {\n throw new Error(`Model emitted inconsistent dimension: had ${resolvedDimension}, now ${dimension}`);\n }\n resolvedDimension = dimension;\n const vectors: Float32Array[] = [];\n for (let i = 0; i < texts.length; i += 1) {\n vectors.push(new Float32Array(data.buffer, data.byteOffset + i * dimension * 4, dimension).slice());\n }\n return { vectors, dimension };\n },\n };\n if (opts.dimension !== undefined) backend.dimension = opts.dimension;\n return backend;\n}\n","import type { EmbeddingMetric } from '@cvfile/sdk';\n\nexport interface EmbeddingMatrix {\n vectors: Float32Array[];\n dimension: number;\n}\n\nexport interface EmbeddingBackend {\n model: string;\n modelRevision: string;\n /** Optional pre-declared dimension; the backend may override after first call. */\n dimension?: number;\n metric: EmbeddingMetric;\n normalized: boolean;\n embed(texts: string[]): Promise<EmbeddingMatrix>;\n}\n\nexport interface EmbeddingBackendOptions {\n model: string;\n modelRevision?: string;\n /** Optional pre-known dimension (e.g. 1024 for bge-m3). */\n dimension?: number;\n metric?: EmbeddingMetric;\n}\n\n/** Recommended default per spec §5: BAAI BGE-M3, MIT, multilingual, 1024-dim. */\nexport const DEFAULT_MODEL = 'Xenova/bge-m3';\nexport const DEFAULT_MODEL_DIMENSION = 1024;\n","/**\n * High-level embed() API: markdown in, EmbeddingsPayload out, ready to drop\n * into pack({ embeddings: ... }).\n */\n\nimport type { EmbeddingChunk, EmbeddingSpace, EmbeddingsPayload } from '@cvfile/sdk';\nimport { chunkMarkdown, type ChunkingMode } from './chunk.js';\nimport { createTransformersBackend } from './transformers-backend.js';\nimport { DEFAULT_MODEL, type EmbeddingBackend } from './types.js';\n\nexport interface EmbedOptions {\n /** HF model id; defaults to BGE-M3 (Xenova/bge-m3). */\n model?: string;\n /** Pinned model revision; recorded in the payload for reproducibility. */\n modelRevision?: string;\n /** Chunking strategy; default 'section'. */\n chunking?: ChunkingMode;\n /** Bring-your-own backend (e.g. an OpenAI/Voyage adapter). */\n backend?: EmbeddingBackend;\n}\n\nexport async function embed(markdown: string, opts: EmbedOptions = {}): Promise<EmbeddingsPayload> {\n const chunks = chunkMarkdown(markdown, { mode: opts.chunking ?? 'section' });\n const backend =\n opts.backend ??\n createTransformersBackend(\n opts.modelRevision !== undefined\n ? { model: opts.model ?? DEFAULT_MODEL, modelRevision: opts.modelRevision }\n : { model: opts.model ?? DEFAULT_MODEL },\n );\n\n const matrix = await backend.embed(chunks.map((c) => c.text));\n if (matrix.vectors.length !== chunks.length) {\n throw new Error(`Backend returned ${matrix.vectors.length} vectors for ${chunks.length} chunks`);\n }\n\n const embeddingChunks: EmbeddingChunk[] = chunks.map((c, i) => ({\n id: c.id,\n textOffset: c.textOffset,\n textLength: c.textLength,\n vector: matrix.vectors[i]!,\n }));\n\n const space: EmbeddingSpace = {\n model: backend.model,\n modelRevision: backend.modelRevision,\n dimension: matrix.dimension,\n metric: backend.metric,\n normalized: backend.normalized,\n chunking: opts.chunking ?? 'section',\n chunks: embeddingChunks,\n };\n\n return { formatVersion: 1, spaces: [space] };\n}\n","/**\n * Pure similarity search over an EmbeddingsPayload. Vector input is the\n * caller's responsibility (encode the query with the same model that was\n * used to populate the space, ideally pulled from space.model).\n */\n\nimport type { EmbeddingMetric, EmbeddingSpace, EmbeddingsPayload } from '@cvfile/sdk';\n\nexport interface SearchHit {\n spaceModel: string;\n chunkId: string;\n textOffset: number;\n textLength: number;\n score: number;\n}\n\nexport interface SearchOptions {\n /** Restrict to a specific embedding space; default = first space. */\n model?: string;\n /** Top-k results; default 5. */\n k?: number;\n}\n\nexport function searchSemantic(\n payload: EmbeddingsPayload,\n queryVector: Float32Array,\n opts: SearchOptions = {},\n): SearchHit[] {\n const space = pickSpace(payload, opts.model);\n if (!space) throw new Error('No matching embedding space found');\n if (queryVector.length !== space.dimension) {\n throw new Error(`Query vector dimension ${queryVector.length} does not match space ${space.model} (${space.dimension})`);\n }\n\n const k = opts.k ?? 5;\n const scored: SearchHit[] = space.chunks.map((c) => ({\n spaceModel: space.model,\n chunkId: c.id,\n textOffset: c.textOffset,\n textLength: c.textLength,\n score: similarity(queryVector, c.vector, space.metric),\n }));\n\n // Higher score = better for cosine/dot; lower for euclidean.\n const order = space.metric === 'euclidean' ? 1 : -1;\n scored.sort((a, b) => order * (a.score - b.score));\n return scored.slice(0, k);\n}\n\nfunction pickSpace(payload: EmbeddingsPayload, modelHint?: string): EmbeddingSpace | undefined {\n if (modelHint) return payload.spaces.find((s) => s.model === modelHint);\n return payload.spaces[0];\n}\n\nfunction similarity(a: Float32Array, b: Float32Array, metric: EmbeddingMetric): number {\n if (metric === 'euclidean') {\n let sum = 0;\n for (let i = 0; i < a.length; i += 1) {\n const d = a[i]! - b[i]!;\n sum += d * d;\n }\n return Math.sqrt(sum);\n }\n let dot = 0;\n for (let i = 0; i < a.length; i += 1) dot += a[i]! * b[i]!;\n if (metric === 'dot') return dot;\n // cosine: assume normalized vectors when produced by our backend\n let na = 0;\n let nb = 0;\n for (let i = 0; i < a.length; i += 1) {\n na += a[i]! * a[i]!;\n nb += b[i]! * b[i]!;\n }\n const denom = Math.sqrt(na) * Math.sqrt(nb);\n return denom === 0 ? 0 : dot / denom;\n}\n","/**\n * Hugging Face Inference API backend.\n *\n * Calls https://router.huggingface.co/hf-inference/models/<model>/pipeline/feature-extraction\n * with HF_TOKEN. For sentence-transformers models (BGE-M3, MiniLM, etc.) the\n * response is already mean-pooled per input — one vector per text. We\n * normalise client-side so cosine math is consistent across backends.\n */\n\nimport type { EmbeddingBackend, EmbeddingMatrix } from './types.js';\nimport type { EmbeddingMetric } from '@cvfile/sdk';\n\nexport interface HuggingFaceBackendOptions {\n model: string;\n /** HF token. Defaults to `process.env.HF_TOKEN`. */\n token?: string;\n /** Pinned revision; recorded in the payload. Default 'main'. */\n modelRevision?: string;\n /** Pre-known dimension. Optional; inferred from first response otherwise. */\n dimension?: number;\n metric?: EmbeddingMetric;\n /** Override base URL (e.g. for self-hosted TEI). */\n baseUrl?: string;\n}\n\nconst DEFAULT_BASE_URL = 'https://router.huggingface.co/hf-inference/models';\n\nexport function createHuggingFaceBackend(opts: HuggingFaceBackendOptions): EmbeddingBackend {\n const token = opts.token ?? process.env.HF_TOKEN ?? process.env.HUGGINGFACE_TOKEN;\n if (!token) {\n throw new Error('HF_TOKEN (or HUGGINGFACE_TOKEN) is required for the Hugging Face backend');\n }\n const baseUrl = opts.baseUrl ?? DEFAULT_BASE_URL;\n const url = `${baseUrl}/${encodeURI(opts.model)}/pipeline/feature-extraction`;\n\n const backend: EmbeddingBackend = {\n model: opts.model,\n modelRevision: opts.modelRevision ?? 'main',\n metric: opts.metric ?? 'cosine',\n normalized: true,\n async embed(texts: string[]): Promise<EmbeddingMatrix> {\n if (texts.length === 0) {\n return { vectors: [], dimension: opts.dimension ?? 0 };\n }\n const body = JSON.stringify({ inputs: texts, options: { wait_for_model: true } });\n const res = await fetch(url, {\n method: 'POST',\n headers: {\n 'content-type': 'application/json',\n authorization: `Bearer ${token}`,\n },\n body,\n });\n if (!res.ok) {\n const detail = await res.text().catch(() => '');\n throw new Error(`HF Inference API ${res.status} for ${opts.model}: ${detail || res.statusText}`);\n }\n const raw = (await res.json()) as unknown;\n const matrix = parseHfResponse(raw, texts.length);\n const dimension = matrix[0]?.length ?? opts.dimension ?? 0;\n const vectors = matrix.map((v) => normalize(Float32Array.from(v)));\n return { vectors, dimension };\n },\n };\n if (opts.dimension !== undefined) backend.dimension = opts.dimension;\n return backend;\n}\n\n/**\n * Coerce the variety of shapes the HF Inference API returns into a flat\n * `number[][]`: one mean-pooled vector per input.\n *\n * Observed shapes:\n * - sentence-transformers (BGE-M3, MiniLM): `[[...vec], [...vec]]`\n * - feature-extraction without pooling: `[[[...token0], [...token1], ...], ...]`\n * - single-input convenience form: `[...vec]`\n */\nfunction parseHfResponse(raw: unknown, expectedCount: number): number[][] {\n if (!Array.isArray(raw)) {\n throw new Error('HF Inference API: expected array response');\n }\n if (raw.length === 0) return [];\n\n const first = raw[0];\n if (typeof first === 'number') {\n if (expectedCount !== 1) {\n throw new Error(`HF Inference API: got 1 vector, expected ${expectedCount}`);\n }\n return [raw as number[]];\n }\n if (Array.isArray(first) && (first.length === 0 || typeof first[0] === 'number')) {\n return raw as number[][];\n }\n if (Array.isArray(first) && Array.isArray(first[0])) {\n // token-level embeddings: mean-pool per input\n return (raw as number[][][]).map(meanPool);\n }\n throw new Error('HF Inference API: unrecognised response shape');\n}\n\nfunction meanPool(tokens: number[][]): number[] {\n if (tokens.length === 0) return [];\n const dim = tokens[0]!.length;\n const out = new Array<number>(dim).fill(0);\n for (const t of tokens) {\n for (let i = 0; i < dim; i += 1) out[i]! += t[i]!;\n }\n for (let i = 0; i < dim; i += 1) out[i]! /= tokens.length;\n return out;\n}\n\nfunction normalize(v: Float32Array): Float32Array {\n let sum = 0;\n for (let i = 0; i < v.length; i += 1) sum += v[i]! * v[i]!;\n const norm = Math.sqrt(sum);\n if (norm === 0) return v;\n for (let i = 0; i < v.length; i += 1) v[i] = v[i]! / norm;\n return v;\n}\n"]}
|
|
1
|
+
{"version":3,"sources":["../src/chunk.ts","../src/transformers-backend.ts","../src/types.ts","../src/embed.ts","../src/search.ts","../src/huggingface-backend.ts"],"names":[],"mappings":";;;AAgBA,IAAM,OAAA,GAAU,IAAI,WAAA,EAAY;AAChC,IAAM,OAAA,GAAU,IAAI,WAAA,EAAY;AAehC,IAAM,OAAA,GAAU,uBAAA;AAST,SAAS,aAAA,CAAc,QAAA,EAAkB,IAAA,GAAqB,EAAC,EAAoB;AACxF,EAAA,MAAM,IAAA,GAAO,KAAK,IAAA,IAAQ,SAAA;AAC1B,EAAA,MAAM,KAAA,GAAQ,OAAA,CAAQ,MAAA,CAAO,QAAQ,CAAA;AACrC,EAAA,IAAI,SAAS,UAAA,EAAY;AACvB,IAAA,OAAO,CAAC,aAAA,CAAc,KAAK,CAAC,CAAA;AAAA,EAC9B;AACA,EAAA,IAAI,SAAS,WAAA,EAAa;AACxB,IAAA,OAAO,gBAAgB,KAAK,CAAA;AAAA,EAC9B;AACA,EAAA,OAAO,cAAc,KAAK,CAAA;AAC5B;AAEA,SAAS,cAAc,KAAA,EAAkC;AACvD,EAAA,OAAO,EAAE,EAAA,EAAI,UAAA,EAAY,UAAA,EAAY,GAAG,UAAA,EAAY,KAAA,CAAM,UAAA,EAAY,IAAA,EAAM,SAAA,CAAU,KAAA,EAAO,CAAA,EAAG,KAAA,CAAM,UAAU,CAAA,EAAE;AACpH;AAEA,SAAS,cAAc,KAAA,EAAoC;AACzD,EAAA,MAAM,KAAA,GAAQ,qBAAqB,KAAK,CAAA;AACxC,EAAA,MAAM,WAA4B,EAAC;AACnC,EAAA,IAAI,OAAA,GAA6D,IAAA;AACjE,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAY;AAE5B,EAAA,SAAS,MAAM,GAAA,EAAmB;AAChC,IAAA,IAAI,CAAC,OAAA,EAAS;AACd,IAAA,MAAM,IAAA,GAAO,SAAA,CAAU,KAAA,EAAO,OAAA,CAAQ,OAAO,GAAG,CAAA;AAChD,IAAA,IAAI,IAAA,CAAK,IAAA,EAAK,CAAE,MAAA,KAAW,CAAA,EAAG;AAC5B,MAAA,OAAA,GAAU,IAAA;AACV,MAAA;AAAA,IACF;AACA,IAAA,QAAA,CAAS,IAAA,CAAK,EAAE,EAAA,EAAI,OAAA,CAAQ,EAAA,EAAI,UAAA,EAAY,OAAA,CAAQ,KAAA,EAAO,UAAA,EAAY,GAAA,GAAM,OAAA,CAAQ,KAAA,EAAO,MAAM,CAAA;AAClG,IAAA,OAAA,GAAU,IAAA;AAAA,EACZ;AAEA,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,MAAM,KAAA,GAAQ,OAAA,CAAQ,IAAA,CAAK,IAAA,CAAK,IAAI,CAAA;AACpC,IAAA,MAAM,OAAA,GAAU,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,UAAA;AACnC,IAAA,IAAI,KAAA,EAAO;AACT,MAAA,KAAA,CAAM,KAAK,MAAM,CAAA;AACjB,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,OAAA,CAAQ,KAAA,CAAM,CAAC,CAAA,IAAK,CAAA,QAAA,EAAW,QAAA,CAAS,MAAA,GAAS,CAAC,CAAA,CAAE,CAAA,EAAG,GAAG,CAAA;AAC9E,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,OAAA,GAAU,EAAE,EAAA,EAAI,KAAA,EAAO,IAAA,CAAK,MAAA,EAAQ,KAAK,OAAA,EAAQ;AACjD,MAAA;AAAA,IACF;AACA,IAAA,IAAI,YAAY,IAAA,EAAM;AACpB,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,UAAA,EAAY,GAAG,CAAA;AACnC,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,OAAA,GAAU,EAAE,EAAA,EAAI,KAAA,EAAO,IAAA,CAAK,MAAA,EAAQ,KAAK,OAAA,EAAQ;AAAA,IACnD,CAAA,MAAO;AACL,MAAA,OAAA,CAAQ,GAAA,GAAM,OAAA;AAAA,IAChB;AAAA,EACF;AACA,EAAA,KAAA,CAAM,MAAM,UAAU,CAAA;AAEtB,EAAA,IAAI,QAAA,CAAS,WAAW,CAAA,EAAG;AACzB,IAAA,OAAO,CAAC,aAAA,CAAc,KAAK,CAAC,CAAA;AAAA,EAC9B;AACA,EAAA,OAAO,QAAA;AACT;AAEA,SAAS,gBAAgB,KAAA,EAAoC;AAC3D,EAAA,MAAM,MAAuB,EAAC;AAC9B,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAY;AAC5B,EAAA,MAAM,SAAA,GAAY,OAAA,CAAQ,MAAA,CAAO,MAAM,CAAA;AACvC,EAAA,IAAI,MAAA,GAAS,CAAA;AACb,EAAA,IAAI,CAAA,GAAI,CAAA;AACR,EAAA,OAAO,MAAA,GAAS,MAAM,UAAA,EAAY;AAChC,IAAA,IAAI,GAAA,GAAM,YAAA,CAAa,KAAA,EAAO,SAAA,EAAW,MAAM,CAAA;AAC/C,IAAA,IAAI,GAAA,KAAQ,EAAA,EAAI,GAAA,GAAM,KAAA,CAAM,UAAA;AAC5B,IAAA,MAAM,IAAA,GAAO,SAAA,CAAU,KAAA,EAAO,MAAA,EAAQ,GAAG,CAAA;AACzC,IAAA,IAAI,IAAA,CAAK,IAAA,EAAK,CAAE,MAAA,GAAS,CAAA,EAAG;AAC1B,MAAA,MAAM,EAAA,GAAK,QAAA,CAAS,OAAA,CAAQ,IAAA,CAAK,KAAA,CAAM,IAAI,CAAA,CAAE,CAAC,CAAA,IAAK,CAAA,EAAA,EAAK,CAAC,CAAA,CAAE,GAAG,GAAG,CAAA;AACjE,MAAA,GAAA,CAAI,IAAI,EAAE,CAAA;AACV,MAAA,GAAA,CAAI,IAAA,CAAK,EAAE,EAAA,EAAI,UAAA,EAAY,QAAQ,UAAA,EAAY,GAAA,GAAM,MAAA,EAAQ,IAAA,EAAM,CAAA;AACnE,MAAA,CAAA,IAAK,CAAA;AAAA,IACP;AACA,IAAA,MAAA,GAAS,MAAM,SAAA,CAAU,UAAA;AAAA,EAC3B;AACA,EAAA,IAAI,GAAA,CAAI,WAAW,CAAA,EAAG;AACpB,IAAA,OAAO,CAAC,aAAA,CAAc,KAAK,CAAC,CAAA;AAAA,EAC9B;AACA,EAAA,OAAO,GAAA;AACT;AAGA,SAAS,SAAA,CAAU,KAAA,EAAmB,KAAA,EAAe,GAAA,EAAqB;AACxE,EAAA,OAAO,QAAQ,MAAA,CAAO,KAAA,CAAM,QAAA,CAAS,KAAA,EAAO,GAAG,CAAC,CAAA;AAClD;AAGA,SAAS,qBAAqB,KAAA,EAA+B;AAC3D,EAAA,MAAM,OAAA,GAAU,EAAA;AAChB,EAAA,MAAM,QAAoB,EAAC;AAC3B,EAAA,IAAI,KAAA,GAAQ,CAAA;AACZ,EAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,KAAA,CAAM,UAAA,EAAY,KAAK,CAAA,EAAG;AAC5C,IAAA,IAAI,KAAA,CAAM,CAAC,CAAA,KAAM,OAAA,EAAS;AACxB,MAAA,MAAM,UAAA,GAAa,IAAI,KAAA,GAAQ,CAAA;AAC/B,MAAA,KAAA,CAAM,IAAA,CAAK,EAAE,IAAA,EAAM,SAAA,CAAU,KAAA,EAAO,KAAA,EAAO,CAAA,GAAI,CAAC,CAAA,EAAG,MAAA,EAAQ,KAAA,EAAO,UAAA,EAAY,CAAA;AAC9E,MAAA,KAAA,GAAQ,CAAA,GAAI,CAAA;AAAA,IACd;AAAA,EACF;AACA,EAAA,IAAI,KAAA,GAAQ,MAAM,UAAA,EAAY;AAC5B,IAAA,KAAA,CAAM,IAAA,CAAK,EAAE,IAAA,EAAM,SAAA,CAAU,OAAO,KAAA,EAAO,KAAA,CAAM,UAAU,CAAA,EAAG,QAAQ,KAAA,EAAO,UAAA,EAAY,KAAA,CAAM,UAAA,GAAa,OAAO,CAAA;AAAA,EACrH;AACA,EAAA,OAAO,KAAA;AACT;AAGA,SAAS,YAAA,CAAa,QAAA,EAAsB,MAAA,EAAoB,IAAA,EAAsB;AACpF,EAAA,MAAM,IAAA,GAAO,QAAA,CAAS,UAAA,GAAa,MAAA,CAAO,UAAA;AAC1C,EAAA,KAAA,IAAS,CAAA,GAAI,IAAA,EAAM,CAAA,IAAK,IAAA,EAAM,KAAK,CAAA,EAAG;AACpC,IAAA,IAAI,OAAA,GAAU,IAAA;AACd,IAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,MAAA,CAAO,UAAA,EAAY,KAAK,CAAA,EAAG;AAC7C,MAAA,IAAI,SAAS,CAAA,GAAI,CAAC,CAAA,KAAM,MAAA,CAAO,CAAC,CAAA,EAAG;AACjC,QAAA,OAAA,GAAU,KAAA;AACV,QAAA;AAAA,MACF;AAAA,IACF;AACA,IAAA,IAAI,SAAS,OAAO,CAAA;AAAA,EACtB;AACA,EAAA,OAAO,EAAA;AACT;AAEA,SAAS,QAAQ,CAAA,EAAmB;AAClC,EAAA,OACE,CAAA,CACG,WAAA,EAAY,CACZ,OAAA,CAAQ,eAAe,GAAG,CAAA,CAC1B,OAAA,CAAQ,UAAA,EAAY,EAAE,CAAA,CACtB,KAAA,CAAM,CAAA,EAAG,EAAE,CAAA,IAAK,SAAA;AAEvB;AAEA,SAAS,QAAA,CAAS,MAAc,KAAA,EAA4B;AAC1D,EAAA,IAAI,CAAC,KAAA,CAAM,GAAA,CAAI,IAAI,GAAG,OAAO,IAAA;AAC7B,EAAA,IAAI,CAAA,GAAI,CAAA;AACR,EAAA,OAAO,KAAA,CAAM,IAAI,CAAA,EAAG,IAAI,IAAI,CAAC,CAAA,CAAE,GAAG,CAAA,IAAK,CAAA;AACvC,EAAA,OAAO,CAAA,EAAG,IAAI,CAAA,CAAA,EAAI,CAAC,CAAA,CAAA;AACrB;AC/JO,SAAS,0BAA0B,IAAA,EAAoD;AAC5F,EAAA,IAAI,IAAA,CAAK,sBAAsB,KAAA,EAAO;AACpC,IAAA,GAAA,CAAI,iBAAA,GAAoB,KAAA;AAAA,EAC1B;AAEA,EAAA,IAAI,eAAA,GAA2C,IAAA;AAC/C,EAAA,IAAI,iBAAA,GAAmC,IAAA;AAEvC,EAAA,SAAS,WAAA,GAAgC;AACvC,IAAA,IAAI,CAAC,eAAA,EAAiB;AACpB,MAAA,MAAM,SAAA,GAAqC,EAAE,KAAA,EAAO,IAAA,CAAK,SAAS,MAAA,EAAO;AACzE,MAAA,IAAI,IAAA,CAAK,MAAA,KAAW,MAAA,EAAW,SAAA,CAAU,SAAS,IAAA,CAAK,MAAA;AACvD,MAAA,eAAA,GAAkB,QAAA,CAAS,oBAAA,EAAsB,IAAA,CAAK,KAAA,EAAO,SAAkB,CAAA;AAAA,IACjF;AACA,IAAA,OAAO,eAAA;AAAA,EACT;AAEA,EAAA,MAAM,OAAA,GAA4B;AAAA,IAChC,OAAO,IAAA,CAAK,KAAA;AAAA,IACZ,aAAA,EAAe,KAAK,aAAA,IAAiB,MAAA;AAAA,IACrC,MAAA,EAAQ,KAAK,MAAA,IAAU,QAAA;AAAA,IACvB,UAAA,EAAY,IAAA;AAAA,IACZ,MAAM,MAAM,KAAA,EAA2C;AACrD,MAAA,IAAI,KAAA,CAAM,WAAW,CAAA,EAAG;AACtB,QAAA,OAAO,EAAE,OAAA,EAAS,IAAI,SAAA,EAAW,IAAA,CAAK,aAAa,CAAA,EAAE;AAAA,MACvD;AACA,MAAA,MAAM,IAAA,GAAQ,MAAM,WAAA,EAAY;AAIhC,MAAA,MAAM,MAAA,GAAS,MAAM,IAAA,CAAK,KAAA,EAAO,EAAE,OAAA,EAAS,MAAA,EAAQ,SAAA,EAAW,IAAA,EAAM,CAAA;AACrE,MAAA,MAAM,OAAO,MAAA,CAAO,IAAA;AACpB,MAAA,MAAM,YAAY,MAAA,CAAO,IAAA,CAAK,MAAA,CAAO,IAAA,CAAK,SAAS,CAAC,CAAA;AACpD,MAAA,IAAI,iBAAA,KAAsB,IAAA,IAAQ,iBAAA,KAAsB,SAAA,EAAW;AACjE,QAAA,MAAM,IAAI,KAAA,CAAM,CAAA,0CAAA,EAA6C,iBAAiB,CAAA,MAAA,EAAS,SAAS,CAAA,CAAE,CAAA;AAAA,MACpG;AACA,MAAA,iBAAA,GAAoB,SAAA;AACpB,MAAA,MAAM,UAA0B,EAAC;AACjC,MAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,KAAA,CAAM,MAAA,EAAQ,KAAK,CAAA,EAAG;AACxC,QAAA,OAAA,CAAQ,IAAA,CAAK,IAAI,YAAA,CAAa,IAAA,CAAK,MAAA,EAAQ,IAAA,CAAK,UAAA,GAAa,CAAA,GAAI,SAAA,GAAY,CAAA,EAAG,SAAS,CAAA,CAAE,OAAO,CAAA;AAAA,MACpG;AACA,MAAA,OAAO,EAAE,SAAS,SAAA,EAAU;AAAA,IAC9B;AAAA,GACF;AACA,EAAA,IAAI,IAAA,CAAK,SAAA,KAAc,MAAA,EAAW,OAAA,CAAQ,YAAY,IAAA,CAAK,SAAA;AAC3D,EAAA,OAAO,OAAA;AACT;;;ACvCO,IAAM,aAAA,GAAgB;AACtB,IAAM,uBAAA,GAA0B;;;ACNvC,eAAsB,KAAA,CAAM,QAAA,EAAkB,IAAA,GAAqB,EAAC,EAA+B;AACjG,EAAA,MAAM,MAAA,GAAS,cAAc,QAAA,EAAU,EAAE,MAAM,IAAA,CAAK,QAAA,IAAY,WAAW,CAAA;AAC3E,EAAA,MAAM,OAAA,GACJ,KAAK,OAAA,IACL,yBAAA;AAAA,IACE,KAAK,aAAA,KAAkB,MAAA,GACnB,EAAE,KAAA,EAAO,KAAK,KAAA,IAAS,aAAA,EAAe,aAAA,EAAe,IAAA,CAAK,eAAc,GACxE,EAAE,KAAA,EAAO,IAAA,CAAK,SAAS,aAAA;AAAc,GAC3C;AAEF,EAAA,MAAM,MAAA,GAAS,MAAM,OAAA,CAAQ,KAAA,CAAM,MAAA,CAAO,IAAI,CAAC,CAAA,KAAM,CAAA,CAAE,IAAI,CAAC,CAAA;AAC5D,EAAA,IAAI,MAAA,CAAO,OAAA,CAAQ,MAAA,KAAW,MAAA,CAAO,MAAA,EAAQ;AAC3C,IAAA,MAAM,IAAI,MAAM,CAAA,iBAAA,EAAoB,MAAA,CAAO,QAAQ,MAAM,CAAA,aAAA,EAAgB,MAAA,CAAO,MAAM,CAAA,OAAA,CAAS,CAAA;AAAA,EACjG;AAEA,EAAA,MAAM,eAAA,GAAoC,MAAA,CAAO,GAAA,CAAI,CAAC,GAAG,CAAA,MAAO;AAAA,IAC9D,IAAI,CAAA,CAAE,EAAA;AAAA,IACN,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,MAAA,EAAQ,MAAA,CAAO,OAAA,CAAQ,CAAC;AAAA,GAC1B,CAAE,CAAA;AAEF,EAAA,MAAM,KAAA,GAAwB;AAAA,IAC5B,OAAO,OAAA,CAAQ,KAAA;AAAA,IACf,eAAe,OAAA,CAAQ,aAAA;AAAA,IACvB,WAAW,MAAA,CAAO,SAAA;AAAA,IAClB,QAAQ,OAAA,CAAQ,MAAA;AAAA,IAChB,YAAY,OAAA,CAAQ,UAAA;AAAA,IACpB,QAAA,EAAU,KAAK,QAAA,IAAY,SAAA;AAAA,IAC3B,MAAA,EAAQ;AAAA,GACV;AAEA,EAAA,OAAO,EAAE,aAAA,EAAe,CAAA,EAAG,MAAA,EAAQ,CAAC,KAAK,CAAA,EAAE;AAC7C;;;AC/BO,SAAS,cAAA,CACd,OAAA,EACA,WAAA,EACA,IAAA,GAAsB,EAAC,EACV;AACb,EAAA,MAAM,KAAA,GAAQ,SAAA,CAAU,OAAA,EAAS,IAAA,CAAK,KAAK,CAAA;AAC3C,EAAA,IAAI,CAAC,KAAA,EAAO,MAAM,IAAI,MAAM,mCAAmC,CAAA;AAC/D,EAAA,IAAI,WAAA,CAAY,MAAA,KAAW,KAAA,CAAM,SAAA,EAAW;AAC1C,IAAA,MAAM,IAAI,KAAA,CAAM,CAAA,uBAAA,EAA0B,WAAA,CAAY,MAAM,CAAA,sBAAA,EAAyB,KAAA,CAAM,KAAK,CAAA,EAAA,EAAK,KAAA,CAAM,SAAS,CAAA,CAAA,CAAG,CAAA;AAAA,EACzH;AAEA,EAAA,MAAM,CAAA,GAAI,KAAK,CAAA,IAAK,CAAA;AACpB,EAAA,MAAM,MAAA,GAAsB,KAAA,CAAM,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,MAAO;AAAA,IACnD,YAAY,KAAA,CAAM,KAAA;AAAA,IAClB,SAAS,CAAA,CAAE,EAAA;AAAA,IACX,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,OAAO,UAAA,CAAW,WAAA,EAAa,CAAA,CAAE,MAAA,EAAQ,MAAM,MAAM;AAAA,GACvD,CAAE,CAAA;AAGF,EAAA,MAAM,KAAA,GAAQ,KAAA,CAAM,MAAA,KAAW,WAAA,GAAc,CAAA,GAAI,EAAA;AACjD,EAAA,MAAA,CAAO,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,SAAS,CAAA,CAAE,KAAA,GAAQ,EAAE,KAAA,CAAM,CAAA;AACjD,EAAA,OAAO,MAAA,CAAO,KAAA,CAAM,CAAA,EAAG,CAAC,CAAA;AAC1B;AAEA,SAAS,SAAA,CAAU,SAA4B,SAAA,EAAgD;AAC7F,EAAA,IAAI,SAAA,SAAkB,OAAA,CAAQ,MAAA,CAAO,KAAK,CAAC,CAAA,KAAM,CAAA,CAAE,KAAA,KAAU,SAAS,CAAA;AACtE,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAC,CAAA;AACzB;AAEA,SAAS,UAAA,CAAW,CAAA,EAAiB,CAAA,EAAiB,MAAA,EAAiC;AACrF,EAAA,IAAI,WAAW,WAAA,EAAa;AAC1B,IAAA,IAAI,GAAA,GAAM,CAAA;AACV,IAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,KAAK,CAAA,EAAG;AACpC,MAAA,MAAM,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,GAAK,EAAE,CAAC,CAAA;AACrB,MAAA,GAAA,IAAO,CAAA,GAAI,CAAA;AAAA,IACb;AACA,IAAA,OAAO,IAAA,CAAK,KAAK,GAAG,CAAA;AAAA,EACtB;AACA,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,GAAA,IAAO,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACxD,EAAA,IAAI,MAAA,KAAW,OAAO,OAAO,GAAA;AAE7B,EAAA,IAAI,EAAA,GAAK,CAAA;AACT,EAAA,IAAI,EAAA,GAAK,CAAA;AACT,EAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,KAAK,CAAA,EAAG;AACpC,IAAA,EAAA,IAAM,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACjB,IAAA,EAAA,IAAM,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AAAA,EACnB;AACA,EAAA,MAAM,QAAQ,IAAA,CAAK,IAAA,CAAK,EAAE,CAAA,GAAI,IAAA,CAAK,KAAK,EAAE,CAAA;AAC1C,EAAA,OAAO,KAAA,KAAU,CAAA,GAAI,CAAA,GAAI,GAAA,GAAM,KAAA;AACjC;;;AClDA,IAAM,gBAAA,GAAmB,mDAAA;AAElB,SAAS,yBAAyB,IAAA,EAAmD;AAC1F,EAAA,MAAM,QAAQ,IAAA,CAAK,KAAA,IAAS,QAAQ,GAAA,CAAI,QAAA,IAAY,QAAQ,GAAA,CAAI,iBAAA;AAChE,EAAA,IAAI,CAAC,KAAA,EAAO;AACV,IAAA,MAAM,IAAI,MAAM,0EAA0E,CAAA;AAAA,EAC5F;AACA,EAAA,MAAM,OAAA,GAAU,KAAK,OAAA,IAAW,gBAAA;AAChC,EAAA,MAAM,MAAM,CAAA,EAAG,OAAO,IAAI,SAAA,CAAU,IAAA,CAAK,KAAK,CAAC,CAAA,4BAAA,CAAA;AAE/C,EAAA,MAAM,OAAA,GAA4B;AAAA,IAChC,OAAO,IAAA,CAAK,KAAA;AAAA,IACZ,aAAA,EAAe,KAAK,aAAA,IAAiB,MAAA;AAAA,IACrC,MAAA,EAAQ,KAAK,MAAA,IAAU,QAAA;AAAA,IACvB,UAAA,EAAY,IAAA;AAAA,IACZ,MAAM,MAAM,KAAA,EAA2C;AACrD,MAAA,IAAI,KAAA,CAAM,WAAW,CAAA,EAAG;AACtB,QAAA,OAAO,EAAE,OAAA,EAAS,IAAI,SAAA,EAAW,IAAA,CAAK,aAAa,CAAA,EAAE;AAAA,MACvD;AACA,MAAA,MAAM,IAAA,GAAO,IAAA,CAAK,SAAA,CAAU,EAAE,MAAA,EAAQ,KAAA,EAAO,OAAA,EAAS,EAAE,cAAA,EAAgB,IAAA,EAAK,EAAG,CAAA;AAChF,MAAA,MAAM,GAAA,GAAM,MAAM,KAAA,CAAM,GAAA,EAAK;AAAA,QAC3B,MAAA,EAAQ,MAAA;AAAA,QACR,OAAA,EAAS;AAAA,UACP,cAAA,EAAgB,kBAAA;AAAA,UAChB,aAAA,EAAe,UAAU,KAAK,CAAA;AAAA,SAChC;AAAA,QACA;AAAA,OACD,CAAA;AACD,MAAA,IAAI,CAAC,IAAI,EAAA,EAAI;AACX,QAAA,MAAM,SAAS,MAAM,GAAA,CAAI,MAAK,CAAE,KAAA,CAAM,MAAM,EAAE,CAAA;AAC9C,QAAA,MAAM,IAAI,KAAA,CAAM,CAAA,iBAAA,EAAoB,GAAA,CAAI,MAAM,CAAA,KAAA,EAAQ,IAAA,CAAK,KAAK,CAAA,EAAA,EAAK,MAAA,IAAU,GAAA,CAAI,UAAU,CAAA,CAAE,CAAA;AAAA,MACjG;AACA,MAAA,MAAM,GAAA,GAAO,MAAM,GAAA,CAAI,IAAA,EAAK;AAC5B,MAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,GAAA,EAAK,KAAA,CAAM,MAAM,CAAA;AAChD,MAAA,MAAM,YAAY,MAAA,CAAO,CAAC,CAAA,EAAG,MAAA,IAAU,KAAK,SAAA,IAAa,CAAA;AACzD,MAAA,MAAM,OAAA,GAAU,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,KAAM,UAAU,YAAA,CAAa,IAAA,CAAK,CAAC,CAAC,CAAC,CAAA;AACjE,MAAA,OAAO,EAAE,SAAS,SAAA,EAAU;AAAA,IAC9B;AAAA,GACF;AACA,EAAA,IAAI,IAAA,CAAK,SAAA,KAAc,MAAA,EAAW,OAAA,CAAQ,YAAY,IAAA,CAAK,SAAA;AAC3D,EAAA,OAAO,OAAA;AACT;AAWA,SAAS,eAAA,CAAgB,KAAc,aAAA,EAAmC;AACxE,EAAA,IAAI,CAAC,KAAA,CAAM,OAAA,CAAQ,GAAG,CAAA,EAAG;AACvB,IAAA,MAAM,IAAI,MAAM,2CAA2C,CAAA;AAAA,EAC7D;AACA,EAAA,IAAI,GAAA,CAAI,MAAA,KAAW,CAAA,EAAG,OAAO,EAAC;AAE9B,EAAA,MAAM,KAAA,GAAQ,IAAI,CAAC,CAAA;AACnB,EAAA,IAAI,OAAO,UAAU,QAAA,EAAU;AAC7B,IAAA,IAAI,kBAAkB,CAAA,EAAG;AACvB,MAAA,MAAM,IAAI,KAAA,CAAM,CAAA,yCAAA,EAA4C,aAAa,CAAA,CAAE,CAAA;AAAA,IAC7E;AACA,IAAA,OAAO,CAAC,GAAe,CAAA;AAAA,EACzB;AACA,EAAA,IAAI,KAAA,CAAM,OAAA,CAAQ,KAAK,CAAA,KAAM,KAAA,CAAM,MAAA,KAAW,CAAA,IAAK,OAAO,KAAA,CAAM,CAAC,CAAA,KAAM,QAAA,CAAA,EAAW;AAChF,IAAA,OAAO,GAAA;AAAA,EACT;AACA,EAAA,IAAI,KAAA,CAAM,QAAQ,KAAK,CAAA,IAAK,MAAM,OAAA,CAAQ,KAAA,CAAM,CAAC,CAAC,CAAA,EAAG;AAEnD,IAAA,OAAQ,GAAA,CAAqB,IAAI,QAAQ,CAAA;AAAA,EAC3C;AACA,EAAA,MAAM,IAAI,MAAM,+CAA+C,CAAA;AACjE;AAEA,SAAS,SAAS,MAAA,EAA8B;AAC9C,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,EAAC;AACjC,EAAA,MAAM,GAAA,GAAM,MAAA,CAAO,CAAC,CAAA,CAAG,MAAA;AACvB,EAAA,MAAM,MAAM,IAAI,KAAA,CAAc,GAAG,CAAA,CAAE,KAAK,CAAC,CAAA;AACzC,EAAA,KAAA,MAAW,KAAK,MAAA,EAAQ;AACtB,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,GAAA,EAAK,CAAA,IAAK,GAAG,GAAA,CAAI,CAAC,CAAA,IAAM,CAAA,CAAE,CAAC,CAAA;AAAA,EACjD;AACA,EAAA,KAAA,IAAS,CAAA,GAAI,GAAG,CAAA,GAAI,GAAA,EAAK,KAAK,CAAA,EAAG,GAAA,CAAI,CAAC,CAAA,IAAM,MAAA,CAAO,MAAA;AACnD,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,UAAU,CAAA,EAA+B;AAChD,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,GAAA,IAAO,CAAA,CAAE,CAAC,CAAA,GAAK,CAAA,CAAE,CAAC,CAAA;AACxD,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,IAAA,CAAK,GAAG,CAAA;AAC1B,EAAA,IAAI,IAAA,KAAS,GAAG,OAAO,CAAA;AACvB,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,CAAA,CAAE,MAAA,EAAQ,CAAA,IAAK,CAAA,EAAG,CAAA,CAAE,CAAC,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,GAAK,IAAA;AACrD,EAAA,OAAO,CAAA;AACT","file":"index.js","sourcesContent":["/**\n * Section-based markdown chunker.\n *\n * Splits a markdown document on ATX headings (`#`, `##`, ...). Each chunk\n * carries the byte offset and length into the original UTF-8 source so a\n * downstream consumer can map a vector hit back to the exact substring\n * without re-tokenising. Pre-heading content becomes a \"preamble\" chunk.\n *\n * Per spec §5.1, `textOffset`/`textLength` are UTF-8 *byte* offsets into the\n * markdown source. We encode the document once with `TextEncoder`, track a\n * byte cursor while iterating lines (counting the trailing `\\n` byte), and\n * derive each chunk's `text` by decoding the corresponding byte slice. This\n * keeps the offsets in agreement with the Go and Python SDKs for any\n * non-ASCII résumé.\n */\n\nconst encoder = new TextEncoder();\nconst decoder = new TextDecoder();\n\nexport type ChunkingMode = 'document' | 'section' | 'paragraph';\n\nexport interface MarkdownChunk {\n id: string;\n textOffset: number;\n textLength: number;\n text: string;\n}\n\nexport interface ChunkOptions {\n mode?: ChunkingMode;\n}\n\nconst HEADING = /^(#{1,6})\\s+(.+?)\\s*$/;\n\n/** A source line plus its UTF-8 byte offset and byte length (including any trailing `\\n`). */\ninterface ByteLine {\n text: string;\n offset: number;\n byteLength: number;\n}\n\nexport function chunkMarkdown(markdown: string, opts: ChunkOptions = {}): MarkdownChunk[] {\n const mode = opts.mode ?? 'section';\n const bytes = encoder.encode(markdown);\n if (mode === 'document') {\n return [documentChunk(bytes)];\n }\n if (mode === 'paragraph') {\n return paragraphChunks(bytes);\n }\n return sectionChunks(bytes);\n}\n\nfunction documentChunk(bytes: Uint8Array): MarkdownChunk {\n return { id: 'document', textOffset: 0, textLength: bytes.byteLength, text: sliceText(bytes, 0, bytes.byteLength) };\n}\n\nfunction sectionChunks(bytes: Uint8Array): MarkdownChunk[] {\n const lines = splitWithByteOffsets(bytes);\n const sections: MarkdownChunk[] = [];\n let current: { id: string; start: number; end: number } | null = null;\n const ids = new Set<string>();\n\n function flush(end: number): void {\n if (!current) return;\n const text = sliceText(bytes, current.start, end);\n if (text.trim().length === 0) {\n current = null;\n return;\n }\n sections.push({ id: current.id, textOffset: current.start, textLength: end - current.start, text });\n current = null;\n }\n\n for (const line of lines) {\n const match = HEADING.exec(line.text);\n const lineEnd = line.offset + line.byteLength;\n if (match) {\n flush(line.offset);\n const id = uniqueId(slugify(match[2] ?? `section-${sections.length + 1}`), ids);\n ids.add(id);\n current = { id, start: line.offset, end: lineEnd };\n continue;\n }\n if (current === null) {\n const id = uniqueId('preamble', ids);\n ids.add(id);\n current = { id, start: line.offset, end: lineEnd };\n } else {\n current.end = lineEnd;\n }\n }\n flush(bytes.byteLength);\n\n if (sections.length === 0) {\n return [documentChunk(bytes)];\n }\n return sections;\n}\n\nfunction paragraphChunks(bytes: Uint8Array): MarkdownChunk[] {\n const out: MarkdownChunk[] = [];\n const ids = new Set<string>();\n const separator = encoder.encode('\\n\\n');\n let cursor = 0;\n let i = 0;\n while (cursor < bytes.byteLength) {\n let end = indexOfBytes(bytes, separator, cursor);\n if (end === -1) end = bytes.byteLength;\n const text = sliceText(bytes, cursor, end);\n if (text.trim().length > 0) {\n const id = uniqueId(slugify(text.split('\\n')[0] ?? `p-${i}`), ids);\n ids.add(id);\n out.push({ id, textOffset: cursor, textLength: end - cursor, text });\n i += 1;\n }\n cursor = end + separator.byteLength;\n }\n if (out.length === 0) {\n return [documentChunk(bytes)];\n }\n return out;\n}\n\n/** Decode the UTF-8 byte slice `[start, end)` back into a string. */\nfunction sliceText(bytes: Uint8Array, start: number, end: number): string {\n return decoder.decode(bytes.subarray(start, end));\n}\n\n/** Split UTF-8 bytes into lines, each tagged with its byte offset and byte length (newline included). */\nfunction splitWithByteOffsets(bytes: Uint8Array): ByteLine[] {\n const newline = 0x0a; // '\\n'\n const lines: ByteLine[] = [];\n let start = 0;\n for (let i = 0; i < bytes.byteLength; i += 1) {\n if (bytes[i] === newline) {\n const byteLength = i - start + 1;\n lines.push({ text: sliceText(bytes, start, i + 1), offset: start, byteLength });\n start = i + 1;\n }\n }\n if (start < bytes.byteLength) {\n lines.push({ text: sliceText(bytes, start, bytes.byteLength), offset: start, byteLength: bytes.byteLength - start });\n }\n return lines;\n}\n\n/** Find the byte index of `needle` in `haystack` at or after `from`, or -1. */\nfunction indexOfBytes(haystack: Uint8Array, needle: Uint8Array, from: number): number {\n const last = haystack.byteLength - needle.byteLength;\n for (let i = from; i <= last; i += 1) {\n let matched = true;\n for (let j = 0; j < needle.byteLength; j += 1) {\n if (haystack[i + j] !== needle[j]) {\n matched = false;\n break;\n }\n }\n if (matched) return i;\n }\n return -1;\n}\n\nfunction slugify(s: string): string {\n return (\n s\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 64) || 'section'\n );\n}\n\nfunction uniqueId(base: string, taken: Set<string>): string {\n if (!taken.has(base)) return base;\n let n = 2;\n while (taken.has(`${base}-${n}`)) n += 1;\n return `${base}-${n}`;\n}\n","/**\n * transformers.js backend (browser + Node + Bun via @huggingface/transformers).\n *\n * Loads the model lazily and reuses the pipeline across calls. Models are\n * cached on disk by transformers.js itself (HF cache layout).\n */\n\nimport { pipeline, env } from '@huggingface/transformers';\nimport type { EmbeddingBackend, EmbeddingBackendOptions, EmbeddingMatrix } from './types.js';\n\ninterface TransformersBackendOptions extends EmbeddingBackendOptions {\n /** Override Xenova quantisation. Defaults to fp32 for closest fidelity. */\n dtype?: 'fp32' | 'fp16' | 'q8' | 'q4';\n /** Force a backend device (e.g. 'cpu', 'gpu', 'wasm'). Defaults to auto. */\n device?: 'cpu' | 'gpu' | 'wasm' | 'webgpu';\n /** Allow remote model downloads. true by default. */\n allowRemoteModels?: boolean;\n}\n\nexport function createTransformersBackend(opts: TransformersBackendOptions): EmbeddingBackend {\n if (opts.allowRemoteModels === false) {\n env.allowRemoteModels = false;\n }\n\n let pipelinePromise: Promise<unknown> | null = null;\n let resolvedDimension: number | null = null;\n\n function getPipeline(): Promise<unknown> {\n if (!pipelinePromise) {\n const modelOpts: Record<string, unknown> = { dtype: opts.dtype ?? 'fp32' };\n if (opts.device !== undefined) modelOpts.device = opts.device;\n pipelinePromise = pipeline('feature-extraction', opts.model, modelOpts as never);\n }\n return pipelinePromise;\n }\n\n const backend: EmbeddingBackend = {\n model: opts.model,\n modelRevision: opts.modelRevision ?? 'main',\n metric: opts.metric ?? 'cosine',\n normalized: true,\n async embed(texts: string[]): Promise<EmbeddingMatrix> {\n if (texts.length === 0) {\n return { vectors: [], dimension: opts.dimension ?? 0 };\n }\n const pipe = (await getPipeline()) as (\n texts: string[],\n opts: { pooling: 'mean'; normalize: boolean },\n ) => Promise<{ data: Float32Array; dims: number[] }>;\n const tensor = await pipe(texts, { pooling: 'mean', normalize: true });\n const data = tensor.data;\n const dimension = tensor.dims[tensor.dims.length - 1] as number;\n if (resolvedDimension !== null && resolvedDimension !== dimension) {\n throw new Error(`Model emitted inconsistent dimension: had ${resolvedDimension}, now ${dimension}`);\n }\n resolvedDimension = dimension;\n const vectors: Float32Array[] = [];\n for (let i = 0; i < texts.length; i += 1) {\n vectors.push(new Float32Array(data.buffer, data.byteOffset + i * dimension * 4, dimension).slice());\n }\n return { vectors, dimension };\n },\n };\n if (opts.dimension !== undefined) backend.dimension = opts.dimension;\n return backend;\n}\n","import type { EmbeddingMetric } from '@cvfile/sdk';\n\nexport interface EmbeddingMatrix {\n vectors: Float32Array[];\n dimension: number;\n}\n\nexport interface EmbeddingBackend {\n model: string;\n modelRevision: string;\n /** Optional pre-declared dimension; the backend may override after first call. */\n dimension?: number;\n metric: EmbeddingMetric;\n normalized: boolean;\n embed(texts: string[]): Promise<EmbeddingMatrix>;\n}\n\nexport interface EmbeddingBackendOptions {\n model: string;\n modelRevision?: string;\n /** Optional pre-known dimension (e.g. 1024 for bge-m3). */\n dimension?: number;\n metric?: EmbeddingMetric;\n}\n\n/** Recommended default per spec §5: BAAI BGE-M3, MIT, multilingual, 1024-dim. */\nexport const DEFAULT_MODEL = 'Xenova/bge-m3';\nexport const DEFAULT_MODEL_DIMENSION = 1024;\n","/**\n * High-level embed() API: markdown in, EmbeddingsPayload out, ready to drop\n * into pack({ embeddings: ... }).\n */\n\nimport type { EmbeddingChunk, EmbeddingSpace, EmbeddingsPayload } from '@cvfile/sdk';\nimport { chunkMarkdown, type ChunkingMode } from './chunk.js';\nimport { createTransformersBackend } from './transformers-backend.js';\nimport { DEFAULT_MODEL, type EmbeddingBackend } from './types.js';\n\nexport interface EmbedOptions {\n /** HF model id; defaults to BGE-M3 (Xenova/bge-m3). */\n model?: string;\n /** Pinned model revision; recorded in the payload for reproducibility. */\n modelRevision?: string;\n /** Chunking strategy; default 'section'. */\n chunking?: ChunkingMode;\n /** Bring-your-own backend (e.g. an OpenAI/Voyage adapter). */\n backend?: EmbeddingBackend;\n}\n\nexport async function embed(markdown: string, opts: EmbedOptions = {}): Promise<EmbeddingsPayload> {\n const chunks = chunkMarkdown(markdown, { mode: opts.chunking ?? 'section' });\n const backend =\n opts.backend ??\n createTransformersBackend(\n opts.modelRevision !== undefined\n ? { model: opts.model ?? DEFAULT_MODEL, modelRevision: opts.modelRevision }\n : { model: opts.model ?? DEFAULT_MODEL },\n );\n\n const matrix = await backend.embed(chunks.map((c) => c.text));\n if (matrix.vectors.length !== chunks.length) {\n throw new Error(`Backend returned ${matrix.vectors.length} vectors for ${chunks.length} chunks`);\n }\n\n const embeddingChunks: EmbeddingChunk[] = chunks.map((c, i) => ({\n id: c.id,\n textOffset: c.textOffset,\n textLength: c.textLength,\n vector: matrix.vectors[i]!,\n }));\n\n const space: EmbeddingSpace = {\n model: backend.model,\n modelRevision: backend.modelRevision,\n dimension: matrix.dimension,\n metric: backend.metric,\n normalized: backend.normalized,\n chunking: opts.chunking ?? 'section',\n chunks: embeddingChunks,\n };\n\n return { formatVersion: 1, spaces: [space] };\n}\n","/**\n * Pure similarity search over an EmbeddingsPayload. Vector input is the\n * caller's responsibility (encode the query with the same model that was\n * used to populate the space, ideally pulled from space.model).\n */\n\nimport type { EmbeddingMetric, EmbeddingSpace, EmbeddingsPayload } from '@cvfile/sdk';\n\nexport interface SearchHit {\n spaceModel: string;\n chunkId: string;\n textOffset: number;\n textLength: number;\n score: number;\n}\n\nexport interface SearchOptions {\n /** Restrict to a specific embedding space; default = first space. */\n model?: string;\n /** Top-k results; default 5. */\n k?: number;\n}\n\nexport function searchSemantic(\n payload: EmbeddingsPayload,\n queryVector: Float32Array,\n opts: SearchOptions = {},\n): SearchHit[] {\n const space = pickSpace(payload, opts.model);\n if (!space) throw new Error('No matching embedding space found');\n if (queryVector.length !== space.dimension) {\n throw new Error(`Query vector dimension ${queryVector.length} does not match space ${space.model} (${space.dimension})`);\n }\n\n const k = opts.k ?? 5;\n const scored: SearchHit[] = space.chunks.map((c) => ({\n spaceModel: space.model,\n chunkId: c.id,\n textOffset: c.textOffset,\n textLength: c.textLength,\n score: similarity(queryVector, c.vector, space.metric),\n }));\n\n // Higher score = better for cosine/dot; lower for euclidean.\n const order = space.metric === 'euclidean' ? 1 : -1;\n scored.sort((a, b) => order * (a.score - b.score));\n return scored.slice(0, k);\n}\n\nfunction pickSpace(payload: EmbeddingsPayload, modelHint?: string): EmbeddingSpace | undefined {\n if (modelHint) return payload.spaces.find((s) => s.model === modelHint);\n return payload.spaces[0];\n}\n\nfunction similarity(a: Float32Array, b: Float32Array, metric: EmbeddingMetric): number {\n if (metric === 'euclidean') {\n let sum = 0;\n for (let i = 0; i < a.length; i += 1) {\n const d = a[i]! - b[i]!;\n sum += d * d;\n }\n return Math.sqrt(sum);\n }\n let dot = 0;\n for (let i = 0; i < a.length; i += 1) dot += a[i]! * b[i]!;\n if (metric === 'dot') return dot;\n // cosine: assume normalized vectors when produced by our backend\n let na = 0;\n let nb = 0;\n for (let i = 0; i < a.length; i += 1) {\n na += a[i]! * a[i]!;\n nb += b[i]! * b[i]!;\n }\n const denom = Math.sqrt(na) * Math.sqrt(nb);\n return denom === 0 ? 0 : dot / denom;\n}\n","/**\n * Hugging Face Inference API backend.\n *\n * Calls https://router.huggingface.co/hf-inference/models/<model>/pipeline/feature-extraction\n * with HF_TOKEN. For sentence-transformers models (BGE-M3, MiniLM, etc.) the\n * response is already mean-pooled per input — one vector per text. We\n * normalise client-side so cosine math is consistent across backends.\n */\n\nimport type { EmbeddingBackend, EmbeddingMatrix } from './types.js';\nimport type { EmbeddingMetric } from '@cvfile/sdk';\n\nexport interface HuggingFaceBackendOptions {\n model: string;\n /** HF token. Defaults to `process.env.HF_TOKEN`. */\n token?: string;\n /** Pinned revision; recorded in the payload. Default 'main'. */\n modelRevision?: string;\n /** Pre-known dimension. Optional; inferred from first response otherwise. */\n dimension?: number;\n metric?: EmbeddingMetric;\n /** Override base URL (e.g. for self-hosted TEI). */\n baseUrl?: string;\n}\n\nconst DEFAULT_BASE_URL = 'https://router.huggingface.co/hf-inference/models';\n\nexport function createHuggingFaceBackend(opts: HuggingFaceBackendOptions): EmbeddingBackend {\n const token = opts.token ?? process.env.HF_TOKEN ?? process.env.HUGGINGFACE_TOKEN;\n if (!token) {\n throw new Error('HF_TOKEN (or HUGGINGFACE_TOKEN) is required for the Hugging Face backend');\n }\n const baseUrl = opts.baseUrl ?? DEFAULT_BASE_URL;\n const url = `${baseUrl}/${encodeURI(opts.model)}/pipeline/feature-extraction`;\n\n const backend: EmbeddingBackend = {\n model: opts.model,\n modelRevision: opts.modelRevision ?? 'main',\n metric: opts.metric ?? 'cosine',\n normalized: true,\n async embed(texts: string[]): Promise<EmbeddingMatrix> {\n if (texts.length === 0) {\n return { vectors: [], dimension: opts.dimension ?? 0 };\n }\n const body = JSON.stringify({ inputs: texts, options: { wait_for_model: true } });\n const res = await fetch(url, {\n method: 'POST',\n headers: {\n 'content-type': 'application/json',\n authorization: `Bearer ${token}`,\n },\n body,\n });\n if (!res.ok) {\n const detail = await res.text().catch(() => '');\n throw new Error(`HF Inference API ${res.status} for ${opts.model}: ${detail || res.statusText}`);\n }\n const raw = (await res.json()) as unknown;\n const matrix = parseHfResponse(raw, texts.length);\n const dimension = matrix[0]?.length ?? opts.dimension ?? 0;\n const vectors = matrix.map((v) => normalize(Float32Array.from(v)));\n return { vectors, dimension };\n },\n };\n if (opts.dimension !== undefined) backend.dimension = opts.dimension;\n return backend;\n}\n\n/**\n * Coerce the variety of shapes the HF Inference API returns into a flat\n * `number[][]`: one mean-pooled vector per input.\n *\n * Observed shapes:\n * - sentence-transformers (BGE-M3, MiniLM): `[[...vec], [...vec]]`\n * - feature-extraction without pooling: `[[[...token0], [...token1], ...], ...]`\n * - single-input convenience form: `[...vec]`\n */\nfunction parseHfResponse(raw: unknown, expectedCount: number): number[][] {\n if (!Array.isArray(raw)) {\n throw new Error('HF Inference API: expected array response');\n }\n if (raw.length === 0) return [];\n\n const first = raw[0];\n if (typeof first === 'number') {\n if (expectedCount !== 1) {\n throw new Error(`HF Inference API: got 1 vector, expected ${expectedCount}`);\n }\n return [raw as number[]];\n }\n if (Array.isArray(first) && (first.length === 0 || typeof first[0] === 'number')) {\n return raw as number[][];\n }\n if (Array.isArray(first) && Array.isArray(first[0])) {\n // token-level embeddings: mean-pool per input\n return (raw as number[][][]).map(meanPool);\n }\n throw new Error('HF Inference API: unrecognised response shape');\n}\n\nfunction meanPool(tokens: number[][]): number[] {\n if (tokens.length === 0) return [];\n const dim = tokens[0]!.length;\n const out = new Array<number>(dim).fill(0);\n for (const t of tokens) {\n for (let i = 0; i < dim; i += 1) out[i]! += t[i]!;\n }\n for (let i = 0; i < dim; i += 1) out[i]! /= tokens.length;\n return out;\n}\n\nfunction normalize(v: Float32Array): Float32Array {\n let sum = 0;\n for (let i = 0; i < v.length; i += 1) sum += v[i]! * v[i]!;\n const norm = Math.sqrt(sum);\n if (norm === 0) return v;\n for (let i = 0; i < v.length; i += 1) v[i] = v[i]! / norm;\n return v;\n}\n"]}
|
package/package.json
CHANGED
|
@@ -1,23 +1,31 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cvfile/embed",
|
|
3
|
-
"version": "0.1
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Open-source embedding generation for the .cv format. BGE-M3 by default; pluggable.",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"homepage": "https://cvfile.org",
|
|
7
7
|
"repository": {
|
|
8
8
|
"type": "git",
|
|
9
|
-
"url": "https://github.com/cvfile/cv",
|
|
9
|
+
"url": "git+https://github.com/cvfile/cv.git",
|
|
10
10
|
"directory": "packages/embed-js"
|
|
11
11
|
},
|
|
12
|
+
"publishConfig": {
|
|
13
|
+
"access": "public"
|
|
14
|
+
},
|
|
12
15
|
"type": "module",
|
|
13
16
|
"main": "./dist/index.cjs",
|
|
14
17
|
"module": "./dist/index.js",
|
|
15
|
-
"types": "./dist/index.d.
|
|
18
|
+
"types": "./dist/index.d.cts",
|
|
16
19
|
"exports": {
|
|
17
20
|
".": {
|
|
18
|
-
"
|
|
19
|
-
|
|
20
|
-
|
|
21
|
+
"import": {
|
|
22
|
+
"types": "./dist/index.d.ts",
|
|
23
|
+
"default": "./dist/index.js"
|
|
24
|
+
},
|
|
25
|
+
"require": {
|
|
26
|
+
"types": "./dist/index.d.cts",
|
|
27
|
+
"default": "./dist/index.cjs"
|
|
28
|
+
}
|
|
21
29
|
}
|
|
22
30
|
},
|
|
23
31
|
"files": [
|
|
@@ -28,7 +36,7 @@
|
|
|
28
36
|
"sideEffects": false,
|
|
29
37
|
"dependencies": {
|
|
30
38
|
"@huggingface/transformers": "^3.0.0",
|
|
31
|
-
"@cvfile/sdk": "0.1
|
|
39
|
+
"@cvfile/sdk": "^0.3.1"
|
|
32
40
|
},
|
|
33
41
|
"devDependencies": {
|
|
34
42
|
"@types/node": "^22.0.0",
|
|
@@ -41,6 +49,7 @@
|
|
|
41
49
|
"test": "vitest run",
|
|
42
50
|
"test:watch": "vitest",
|
|
43
51
|
"typecheck": "tsc --noEmit",
|
|
52
|
+
"lint": "eslint .",
|
|
44
53
|
"clean": "rm -rf dist .turbo"
|
|
45
54
|
}
|
|
46
55
|
}
|