@chatbot-packages/rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunking/index.d.ts +51 -0
- package/dist/chunking/index.js +248 -0
- package/dist/chunking/index.js.map +1 -0
- package/dist/embeddings/index.d.ts +103 -0
- package/dist/embeddings/index.js +195 -0
- package/dist/embeddings/index.js.map +1 -0
- package/dist/extractors/index.d.ts +95 -0
- package/dist/extractors/index.js +343 -0
- package/dist/extractors/index.js.map +1 -0
- package/dist/index.d.ts +78 -0
- package/dist/index.js +1576 -0
- package/dist/index.js.map +1 -0
- package/dist/retrieval/index.d.ts +65 -0
- package/dist/retrieval/index.js +144 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-CjnplPJD.d.ts +242 -0
- package/dist/vectorstore/index.d.ts +109 -0
- package/dist/vectorstore/index.js +422 -0
- package/dist/vectorstore/index.js.map +1 -0
- package/package.json +83 -0
package/dist/chunking/index.d.ts
@@ -0,0 +1,51 @@
+import { e as ChunkingOptions, j as ExtractedDocument, d as ChunkResult } from '../types-CjnplPJD.js';
+
+/**
+ * Header-Aware Chunker
+ *
+ * Splits documents into chunks while respecting heading boundaries.
+ * Ported from the Python implementation in Gen21AIHelpAndQNA.
+ */
+
+interface HeaderAwareChunkerOptions extends ChunkingOptions {
+    /** Document ID to associate chunks with */
+    documentId?: string;
+}
+declare class HeaderAwareChunker {
+    private options;
+    constructor(options?: ChunkingOptions);
+    /**
+     * Chunk a document into smaller pieces
+     */
+    chunk(document: ExtractedDocument, documentId?: string): ChunkResult;
+    /**
+     * Split content by heading boundaries
+     */
+    private splitBySections;
+    /**
+     * Parse a heading line
+     */
+    private parseHeading;
+    /**
+     * Chunk a single section
+     */
+    private chunkSection;
+    /**
+     * Split a large paragraph into sentence-based chunks
+     */
+    private splitLargeParagraph;
+    /**
+     * Get overlap text from previous chunk
+     */
+    private getOverlapText;
+    /**
+     * Create a chunk object
+     */
+    private createChunk;
+    /**
+     * Estimate token count (rough approximation: ~4 chars per token)
+     */
+    private estimateTokens;
+}
+
+export { HeaderAwareChunker, type HeaderAwareChunkerOptions };

package/dist/chunking/index.js
@@ -0,0 +1,248 @@
+// src/chunking/header-aware-chunker.ts
+import { generateId } from "@chatbot-packages/utils";
+var HeaderAwareChunker = class {
+  options;
+  constructor(options) {
+    this.options = {
+      chunkSize: 512,
+      chunkOverlap: 50,
+      minChunkSize: 100,
+      maxChunkSize: 800,
+      respectHeadings: true,
+      splitOnHeadings: [1, 2],
+      ...options
+    };
+  }
+  /**
+   * Chunk a document into smaller pieces
+   */
+  chunk(document, documentId) {
+    const docId = documentId || generateId("doc");
+    const { content } = document;
+    if (!content.trim()) {
+      return {
+        chunks: [],
+        stats: { totalChunks: 0, avgChunkSize: 0, minChunkSize: 0, maxChunkSize: 0 }
+      };
+    }
+    const chunks = [];
+    const sections = this.splitBySections(content);
+    for (const section of sections) {
+      const sectionChunks = this.chunkSection(section, docId, document.path);
+      chunks.push(...sectionChunks);
+    }
+    const sizes = chunks.map((c) => c.text.length);
+    const stats = {
+      totalChunks: chunks.length,
+      avgChunkSize: sizes.length > 0 ? Math.round(sizes.reduce((a, b) => a + b, 0) / sizes.length) : 0,
+      minChunkSize: sizes.length > 0 ? Math.min(...sizes) : 0,
+      maxChunkSize: sizes.length > 0 ? Math.max(...sizes) : 0
+    };
+    return { chunks, stats };
+  }
+  /**
+   * Split content by heading boundaries
+   */
+  splitBySections(content) {
+    const lines = content.split("\n");
+    const sections = [];
+    let currentContext = { sectionPath: "" };
+    let currentText = [];
+    for (const line of lines) {
+      const heading = this.parseHeading(line);
+      if (heading && this.options.splitOnHeadings.includes(heading.level)) {
+        if (currentText.length > 0) {
+          sections.push({
+            text: currentText.join("\n").trim(),
+            context: { ...currentContext }
+          });
+          currentText = [];
+        }
+        if (heading.level === 1) {
+          currentContext = {
+            h1: heading.text,
+            sectionPath: heading.text
+          };
+        } else if (heading.level === 2) {
+          currentContext = {
+            ...currentContext,
+            h2: heading.text,
+            h3: void 0,
+            sectionPath: currentContext.h1 ? `${currentContext.h1} > ${heading.text}` : heading.text
+          };
+        } else if (heading.level === 3) {
+          currentContext = {
+            ...currentContext,
+            h3: heading.text,
+            sectionPath: currentContext.h2 ? `${currentContext.sectionPath} > ${heading.text}` : heading.text
+          };
+        }
+        currentText.push(line);
+      } else {
+        currentText.push(line);
+      }
+    }
+    if (currentText.length > 0) {
+      sections.push({
+        text: currentText.join("\n").trim(),
+        context: { ...currentContext }
+      });
+    }
+    return sections;
+  }
+  /**
+   * Parse a heading line
+   */
+  parseHeading(line) {
+    const match = line.match(/^(#{1,6})\s+(.+)$/);
+    if (match) {
+      return {
+        level: match[1].length,
+        text: match[2].trim()
+      };
+    }
+    return null;
+  }
+  /**
+   * Chunk a single section
+   */
+  chunkSection(section, documentId, sourcePath) {
+    const { text, context } = section;
+    const chunks = [];
+    if (this.estimateTokens(text) <= this.options.maxChunkSize) {
+      if (this.estimateTokens(text) >= this.options.minChunkSize) {
+        chunks.push(this.createChunk(text, context, documentId, sourcePath, 0));
+      }
+      return chunks;
+    }
+    const paragraphs = text.split(/\n\n+/);
+    let currentChunk = [];
+    let currentTokens = 0;
+    for (const para of paragraphs) {
+      const paraTokens = this.estimateTokens(para);
+      if (paraTokens > this.options.maxChunkSize) {
+        if (currentChunk.length > 0) {
+          chunks.push(
+            this.createChunk(
+              currentChunk.join("\n\n"),
+              context,
+              documentId,
+              sourcePath,
+              chunks.length
+            )
+          );
+          currentChunk = [];
+          currentTokens = 0;
+        }
+        const sentenceChunks = this.splitLargeParagraph(para);
+        for (const sentenceChunk of sentenceChunks) {
+          chunks.push(
+            this.createChunk(sentenceChunk, context, documentId, sourcePath, chunks.length)
+          );
+        }
+        continue;
+      }
+      if (currentTokens + paraTokens > this.options.chunkSize) {
+        if (currentChunk.length > 0) {
+          chunks.push(
+            this.createChunk(
+              currentChunk.join("\n\n"),
+              context,
+              documentId,
+              sourcePath,
+              chunks.length
+            )
+          );
+          const overlapText = this.getOverlapText(currentChunk);
+          currentChunk = overlapText ? [overlapText, para] : [para];
+          currentTokens = this.estimateTokens(currentChunk.join("\n\n"));
+        } else {
+          currentChunk = [para];
+          currentTokens = paraTokens;
+        }
+      } else {
+        currentChunk.push(para);
+        currentTokens += paraTokens;
+      }
+    }
+    if (currentChunk.length > 0 && currentTokens >= this.options.minChunkSize) {
+      chunks.push(
+        this.createChunk(currentChunk.join("\n\n"), context, documentId, sourcePath, chunks.length)
+      );
+    }
+    return chunks;
+  }
+  /**
+   * Split a large paragraph into sentence-based chunks
+   */
+  splitLargeParagraph(paragraph) {
+    const sentences = paragraph.match(/[^.!?]+[.!?]+/g) || [paragraph];
+    const chunks = [];
+    let currentChunk = [];
+    let currentTokens = 0;
+    for (const sentence of sentences) {
+      const sentenceTokens = this.estimateTokens(sentence);
+      if (currentTokens + sentenceTokens > this.options.chunkSize && currentChunk.length > 0) {
+        chunks.push(currentChunk.join(" ").trim());
+        currentChunk = [];
+        currentTokens = 0;
+      }
+      currentChunk.push(sentence.trim());
+      currentTokens += sentenceTokens;
+    }
+    if (currentChunk.length > 0) {
+      chunks.push(currentChunk.join(" ").trim());
+    }
+    return chunks;
+  }
+  /**
+   * Get overlap text from previous chunk
+   */
+  getOverlapText(previousChunk) {
+    if (!this.options.chunkOverlap || previousChunk.length === 0) {
+      return null;
+    }
+    const reversed = [...previousChunk].reverse();
+    const overlapParts = [];
+    let tokens = 0;
+    for (const part of reversed) {
+      const partTokens = this.estimateTokens(part);
+      if (tokens + partTokens > this.options.chunkOverlap) {
+        break;
+      }
+      overlapParts.unshift(part);
+      tokens += partTokens;
+    }
+    return overlapParts.length > 0 ? overlapParts.join("\n\n") : null;
+  }
+  /**
+   * Create a chunk object
+   */
+  createChunk(text, context, documentId, sourcePath, index) {
+    const metadata = {
+      sectionPath: context.sectionPath,
+      headingH1: context.h1,
+      headingH2: context.h2,
+      headingH3: context.h3,
+      sourcePath,
+      chunkIndex: index
+    };
+    return {
+      id: generateId("chunk"),
+      documentId,
+      text: text.trim(),
+      metadata,
+      createdAt: /* @__PURE__ */ new Date()
+    };
+  }
+  /**
+   * Estimate token count (rough approximation: ~4 chars per token)
+   */
+  estimateTokens(text) {
+    return Math.ceil(text.length / 4);
+  }
+};
+export {
+  HeaderAwareChunker
+};
+//# sourceMappingURL=index.js.map

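For orientation, a minimal sketch of driving the compiled chunker. The root import path and the two-field document literal are assumptions (the real ExtractedDocument type is bundled in types-CjnplPJD.d.ts and may require more fields); with the default chunkSize of 512 tokens and the ~4-characters-per-token estimate, a chunk tops out around 2048 characters.

```ts
// Sketch only: import path and document shape are assumptions. The compiled
// chunk() reads just `content` and `path` from the document it is given.
import { HeaderAwareChunker } from '@chatbot-packages/rag';

const chunker = new HeaderAwareChunker({ chunkSize: 256, minChunkSize: 10 });

const doc = {
  path: 'docs/getting-started.md',
  content: [
    '# Getting Started',
    '',
    'Install the package first.',
    '',
    '## Configuration',
    '',
    'Set the API key in your environment.',
  ].join('\n'),
} as any; // stand-in for the real ExtractedDocument

const { chunks, stats } = chunker.chunk(doc, 'doc-001');
// stats.totalChunks === 2
// chunks[0].metadata.sectionPath === 'Getting Started'
// chunks[1].metadata.sectionPath === 'Getting Started > Configuration'
```
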
package/dist/chunking/index.js.map
@@ -0,0 +1 @@
+{"version":3,"sources":["../../src/chunking/header-aware-chunker.ts"],"sourcesContent":[…],"mappings":"…","names":[]}

package/dist/embeddings/index.d.ts
@@ -0,0 +1,103 @@
+import { E as EmbeddingBackend, i as EmbeddingResult, h as EmbeddingOptions } from '../types-CjnplPJD.js';
+
+/**
+ * Local Embeddings using @xenova/transformers
+ *
+ * Runs embedding models locally without API calls.
+ * Supports BGE, all-MiniLM, and other sentence-transformer models.
+ */
+
+interface LocalEmbeddingOptions {
+    /** Model name (default: 'Xenova/bge-base-en-v1.5') */
+    model?: string;
+    /** Batch size for processing (default: 32) */
+    batchSize?: number;
+    /** Whether to normalize embeddings (default: true) */
+    normalize?: boolean;
+}
+declare class LocalEmbeddingBackend implements EmbeddingBackend {
+    private model;
+    private batchSize;
+    private normalize;
+    private dimensions;
+    private static MODEL_DIMENSIONS;
+    constructor(options?: LocalEmbeddingOptions);
+    embed(text: string): Promise<EmbeddingResult>;
+    embedBatch(texts: string[]): Promise<EmbeddingResult[]>;
+    getDimensions(): number;
+    getModel(): string;
+}
+/**
+ * Available local models
+ */
+declare const LOCAL_MODELS: {
+    /** BGE Large - Best quality, slower (1024 dims) */
+    BGE_LARGE: string;
+    /** BGE Base - Good balance (768 dims) */
+    BGE_BASE: string;
+    /** BGE Small - Fastest (384 dims) */
+    BGE_SMALL: string;
+    /** MiniLM L6 - Very fast (384 dims) */
+    MINILM_L6: string;
+    /** MiniLM L12 - Good quality (384 dims) */
+    MINILM_L12: string;
+    /** MPNet - High quality (768 dims) */
+    MPNET: string;
+};
+
+/**
+ * OpenAI Embeddings
+ *
+ * Uses OpenAI's embedding API for high-quality embeddings.
+ */
+
+interface OpenAIEmbeddingOptions {
+    /** OpenAI API key */
+    apiKey: string;
+    /** Model name (default: 'text-embedding-3-small') */
+    model?: string;
+    /** Embedding dimensions (for models that support it) */
+    dimensions?: number;
+    /** Batch size for processing (default: 100) */
+    batchSize?: number;
+    /** Base URL for API (for compatible APIs) */
+    baseUrl?: string;
+}
+declare class OpenAIEmbeddingBackend implements EmbeddingBackend {
+    private apiKey;
+    private model;
+    private dimensions;
+    private batchSize;
+    private baseUrl;
+    private static MODEL_DIMENSIONS;
+    constructor(options: OpenAIEmbeddingOptions);
+    embed(text: string): Promise<EmbeddingResult>;
+    embedBatch(texts: string[]): Promise<EmbeddingResult[]>;
+    private callAPI;
+    getDimensions(): number;
+    getModel(): string;
+}
+/**
+ * Available OpenAI embedding models
+ */
+declare const OPENAI_MODELS: {
+    /** text-embedding-3-large - Highest quality (3072 dims, can reduce) */
+    EMBEDDING_3_LARGE: string;
+    /** text-embedding-3-small - Good balance (1536 dims, can reduce) */
+    EMBEDDING_3_SMALL: string;
+    /** text-embedding-ada-002 - Legacy model (1536 dims) */
+    ADA_002: string;
+};
+
+/**
+ * Embedding Backends
+ *
+ * Generate vector embeddings from text using local or cloud models.
+ */
+
+/**
+ * Create an embedding backend based on options
+ */
+declare function createEmbeddingBackend(options: EmbeddingOptions): EmbeddingBackend;
+
+export { LOCAL_MODELS, LocalEmbeddingBackend, type LocalEmbeddingOptions, OPENAI_MODELS, OpenAIEmbeddingBackend, type OpenAIEmbeddingOptions, createEmbeddingBackend };

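The dimensions option declared above is only meaningful for models that support it; the implementation that follows sends it to the API solely for text-embedding-3-* models. A hedged sketch, assuming a root re-export and a real API key:

```ts
import { OpenAIEmbeddingBackend, OPENAI_MODELS } from '@chatbot-packages/rag';

// Request truncated 512-dim vectors from text-embedding-3-small.
const backend = new OpenAIEmbeddingBackend({
  apiKey: process.env.OPENAI_API_KEY ?? '',
  model: OPENAI_MODELS.EMBEDDING_3_SMALL,
  dimensions: 512, // forwarded only for text-embedding-3-* models
});

const { embedding, tokens } = await backend.embed('What is header-aware chunking?');
console.log(backend.getDimensions(), embedding.length); // 512 512
console.log(tokens); // rough length/4 estimate, not the API's token count
```
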
package/dist/embeddings/index.js
@@ -0,0 +1,195 @@
+// src/embeddings/local.ts
+var pipeline = null;
+var embedder = null;
+async function loadPipeline(model) {
+  if (!pipeline) {
+    const transformers = await import("@xenova/transformers");
+    pipeline = transformers.pipeline;
+  }
+  if (!embedder) {
+    console.log(`[LocalEmbeddings] Loading model: ${model}...`);
+    embedder = await pipeline("feature-extraction", model, {
+      quantized: true
+      // Use quantized model for faster inference
+    });
+    console.log(`[LocalEmbeddings] Model loaded successfully`);
+  }
+  return embedder;
+}
+var LocalEmbeddingBackend = class _LocalEmbeddingBackend {
+  model;
+  batchSize;
+  normalize;
+  dimensions;
+  // Model dimension map
+  static MODEL_DIMENSIONS = {
+    "Xenova/bge-large-en-v1.5": 1024,
+    "Xenova/bge-base-en-v1.5": 768,
+    "Xenova/bge-small-en-v1.5": 384,
+    "Xenova/all-MiniLM-L6-v2": 384,
+    "Xenova/all-MiniLM-L12-v2": 384,
+    "Xenova/all-mpnet-base-v2": 768
+  };
+  constructor(options) {
+    this.model = options?.model || "Xenova/bge-base-en-v1.5";
+    this.batchSize = options?.batchSize || 32;
+    this.normalize = options?.normalize ?? true;
+    this.dimensions = _LocalEmbeddingBackend.MODEL_DIMENSIONS[this.model] || 768;
+  }
+  async embed(text) {
+    const embedder2 = await loadPipeline(this.model);
+    const processedText = this.model.includes("bge") ? `Represent this sentence for searching relevant passages: ${text}` : text;
+    const output = await embedder2(processedText, {
+      pooling: "mean",
+      normalize: this.normalize
+    });
+    const embedding = Array.from(output.data);
+    return {
+      embedding,
+      tokens: Math.ceil(text.length / 4)
+      // Rough estimate
+    };
+  }
+  async embedBatch(texts) {
+    const results = [];
+    for (let i = 0; i < texts.length; i += this.batchSize) {
+      const batch = texts.slice(i, i + this.batchSize);
+      const batchResults = await Promise.all(batch.map((text) => this.embed(text)));
+      results.push(...batchResults);
+    }
+    return results;
+  }
+  getDimensions() {
+    return this.dimensions;
+  }
+  getModel() {
+    return this.model;
+  }
+};
+var LOCAL_MODELS = {
+  /** BGE Large - Best quality, slower (1024 dims) */
+  BGE_LARGE: "Xenova/bge-large-en-v1.5",
+  /** BGE Base - Good balance (768 dims) */
+  BGE_BASE: "Xenova/bge-base-en-v1.5",
+  /** BGE Small - Fastest (384 dims) */
+  BGE_SMALL: "Xenova/bge-small-en-v1.5",
+  /** MiniLM L6 - Very fast (384 dims) */
+  MINILM_L6: "Xenova/all-MiniLM-L6-v2",
+  /** MiniLM L12 - Good quality (384 dims) */
+  MINILM_L12: "Xenova/all-MiniLM-L12-v2",
+  /** MPNet - High quality (768 dims) */
+  MPNET: "Xenova/all-mpnet-base-v2"
+};
+
+// src/embeddings/openai.ts
+var OpenAIEmbeddingBackend = class _OpenAIEmbeddingBackend {
+  apiKey;
+  model;
+  dimensions;
+  batchSize;
+  baseUrl;
+  // Model dimension defaults
+  static MODEL_DIMENSIONS = {
+    "text-embedding-3-large": 3072,
+    "text-embedding-3-small": 1536,
+    "text-embedding-ada-002": 1536
+  };
+  constructor(options) {
+    this.apiKey = options.apiKey;
+    this.model = options.model || "text-embedding-3-small";
+    this.dimensions = options.dimensions || _OpenAIEmbeddingBackend.MODEL_DIMENSIONS[this.model] || 1536;
+    this.batchSize = options.batchSize || 100;
+    this.baseUrl = options.baseUrl || "https://api.openai.com/v1";
+  }
+  async embed(text) {
+    const results = await this.embedBatch([text]);
+    return results[0];
+  }
+  async embedBatch(texts) {
+    const allResults = [];
+    for (let i = 0; i < texts.length; i += this.batchSize) {
+      const batch = texts.slice(i, i + this.batchSize);
+      const batchResults = await this.callAPI(batch);
+      allResults.push(...batchResults);
+    }
+    return allResults;
+  }
+  async callAPI(texts) {
+    const body = {
+      model: this.model,
+      input: texts
+    };
+    if (this.model.startsWith("text-embedding-3-") && this.dimensions) {
+      body.dimensions = this.dimensions;
+    }
+    const response = await fetch(`${this.baseUrl}/embeddings`, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        Authorization: `Bearer ${this.apiKey}`
+      },
+      body: JSON.stringify(body)
+    });
+    if (!response.ok) {
+      const error = await response.text();
+      throw new Error(`OpenAI API error: ${response.status} - ${error}`);
+    }
+    const data = await response.json();
+    const sorted = data.data.sort((a, b) => a.index - b.index);
+    return sorted.map((item, i) => ({
+      embedding: item.embedding,
+      tokens: Math.ceil(texts[i].length / 4)
+      // Rough estimate
+    }));
+  }
+  getDimensions() {
+    return this.dimensions;
+  }
+  getModel() {
+    return this.model;
+  }
+};
+var OPENAI_MODELS = {
+  /** text-embedding-3-large - Highest quality (3072 dims, can reduce) */
+  EMBEDDING_3_LARGE: "text-embedding-3-large",
+  /** text-embedding-3-small - Good balance (1536 dims, can reduce) */
+  EMBEDDING_3_SMALL: "text-embedding-3-small",
+  /** text-embedding-ada-002 - Legacy model (1536 dims) */
+  ADA_002: "text-embedding-ada-002"
+};
+
+// src/embeddings/index.ts
+function createEmbeddingBackend(options) {
+  switch (options.provider) {
+    case "local":
+      return new LocalEmbeddingBackend({
+        model: options.model,
+        batchSize: options.batchSize
+      });
+    case "openai":
+      if (!options.apiKey) {
+        throw new Error("OpenAI embedding requires an API key");
+      }
+      return new OpenAIEmbeddingBackend({
+        apiKey: options.apiKey,
+        model: options.model,
+        dimensions: options.dimensions,
+        batchSize: options.batchSize
+      });
+    case "huggingface":
+      return new LocalEmbeddingBackend({
+        model: options.model || "Xenova/all-MiniLM-L6-v2",
+        batchSize: options.batchSize
+      });
+    default:
+      throw new Error(`Unknown embedding provider: ${options.provider}`);
+  }
+}
+export {
+  LOCAL_MODELS,
+  LocalEmbeddingBackend,
+  OPENAI_MODELS,
+  OpenAIEmbeddingBackend,
+  createEmbeddingBackend
+};
+//# sourceMappingURL=index.js.map

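A companion sketch of the factory path (import path again an assumption): provider 'local' runs fully offline via @xenova/transformers, lazily loading the quantized model on the first call, and BGE models automatically receive the retrieval instruction prefix seen in embed() above.

```ts
import { createEmbeddingBackend, LOCAL_MODELS } from '@chatbot-packages/rag';

const backend = createEmbeddingBackend({
  provider: 'local',
  model: LOCAL_MODELS.BGE_SMALL, // 384-dim, fastest BGE variant
});

// First call triggers the one-time model download/load.
const results = await backend.embedBatch([
  'Chunking splits documents at heading boundaries.',
  'Embeddings map text into vectors for similarity search.',
]);
console.log(results.length, results[0].embedding.length); // 2 384
```
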
package/dist/embeddings/index.js.map
@@ -0,0 +1 @@
+{"version":3,"sources":["../../src/embeddings/local.ts","../../src/embeddings/openai.ts","../../src/embeddings/index.ts"],"sourcesContent":[…],"mappings":"…","names":["embedder"]}