@elsium-ai/rag 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunkers.d.ts +19 -0
- package/dist/chunkers.d.ts.map +1 -0
- package/dist/embeddings.d.ts +11 -0
- package/dist/embeddings.d.ts.map +1 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +766 -0
- package/dist/loaders.d.ts +17 -0
- package/dist/loaders.d.ts.map +1 -0
- package/dist/pipeline.d.ts +26 -0
- package/dist/pipeline.d.ts.map +1 -0
- package/dist/types.d.ts +72 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/vectorstore.d.ts +20 -0
- package/dist/vectorstore.d.ts.map +1 -0
- package/package.json +35 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { Chunk, ChunkingConfig, Document } from './types';
|
|
2
|
+
/**
 * Strategy object that splits a loaded document into chunks.
 */
export interface Chunker {
    /**
     * Produces the chunks for `document`.
     *
     * @param document The document whose `content` is split.
     * @returns The chunks derived from the document.
     */
    chunk(document: Document): Array<Chunk>;
}
|
|
5
|
+
/**
 * Creates a chunker that slices document content into fixed-length character windows.
 *
 * @param options.maxChunkSize Maximum number of characters per chunk (the bundled implementation defaults to 512).
 * @param options.overlap Number of characters shared by consecutive chunks (defaults to 0).
 * @throws Error when `overlap` is greater than or equal to `maxChunkSize`.
 */
export declare function fixedSizeChunker(options?: {
    maxChunkSize?: number;
    overlap?: number;
}): Chunker;
|
|
9
|
+
/**
 * Creates a chunker that splits on a list of separators, trying each in turn and
 * falling back to fixed-size slices when no separator yields small enough pieces.
 *
 * @param options.maxChunkSize Maximum number of characters per chunk (defaults to 512).
 * @param options.overlap Character overlap used by the fixed-size fallback (defaults to 0).
 * @param options.separators Separators tried in order; the bundled default is
 *   blank line, newline, ". ", space, then the empty string.
 * @throws Error when `overlap` is greater than or equal to `maxChunkSize`.
 */
export declare function recursiveChunker(options?: {
    maxChunkSize?: number;
    overlap?: number;
    separators?: Array<string>;
}): Chunker;
|
|
14
|
+
/**
 * Creates a chunker that groups whole sentences (split after `.`, `!` or `?`
 * followed by whitespace) into chunks.
 *
 * @param options.maxChunkSize Maximum number of characters per chunk (defaults to 512);
 *   a single sentence longer than this is still emitted as its own chunk.
 * @param options.overlap Number of sentences repeated at the start of the next chunk (defaults to 1).
 */
export declare function sentenceChunker(options?: {
    maxChunkSize?: number;
    overlap?: number;
}): Chunker;
|
|
18
|
+
/**
 * Builds the chunker selected by `config.strategy` ("fixed-size", "recursive" or
 * "sentence"), forwarding `config.maxChunkSize` and `config.overlap` to it.
 */
export declare function getChunker(config: ChunkingConfig): Chunker;
|
|
19
|
+
//# sourceMappingURL=chunkers.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunkers.d.ts","sourceRoot":"","sources":["../src/chunkers.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAA;AAE9D,MAAM,WAAW,OAAO;IACvB,KAAK,CAAC,QAAQ,EAAE,QAAQ,GAAG,KAAK,EAAE,CAAA;CAClC;AAID,wBAAgB,gBAAgB,CAAC,OAAO,CAAC,EAAE;IAC1C,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB,GAAG,OAAO,CA2CV;AAMD,wBAAgB,gBAAgB,CAAC,OAAO,CAAC,EAAE;IAC1C,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,UAAU,CAAC,EAAE,MAAM,EAAE,CAAA;CACrB,GAAG,OAAO,CAmFV;AAID,wBAAgB,eAAe,CAAC,OAAO,CAAC,EAAE;IACzC,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB,GAAG,OAAO,CA6FV;AAID,wBAAgB,UAAU,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAkB1D"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { EmbeddingConfig, EmbeddingVector } from './types';
|
|
2
|
+
/**
 * Source of embedding vectors for text.
 */
export interface EmbeddingProvider {
    /** Provider identifier (the bundled providers use "openai" and "mock"). */
    readonly name: string;
    /** Length of the vectors this provider is configured to produce. */
    readonly dimensions: number;
    /** Embeds a single text. */
    embed(text: string): Promise<EmbeddingVector>;
    /** Embeds several texts; the pipeline pairs result `i` with input `i`. */
    embedBatch(texts: Array<string>): Promise<Array<EmbeddingVector>>;
}
|
|
8
|
+
/**
 * Creates a provider that POSTs to `{baseUrl}/v1/embeddings`.
 * The bundled implementation defaults to model "text-embedding-3-small",
 * base URL "https://api.openai.com", 1536 dimensions and batches of 100 texts,
 * and throws when `config.apiKey` is missing.
 */
export declare function createOpenAIEmbeddings(config: EmbeddingConfig): EmbeddingProvider;
|
|
9
|
+
/**
 * Creates a deterministic provider that makes no network calls: vectors are derived
 * from character codes and scaled to unit length.
 *
 * @param dims Vector length (the bundled implementation defaults to 128).
 */
export declare function createMockEmbeddings(dims?: number): EmbeddingProvider;
|
|
10
|
+
/**
 * Returns the provider selected by `config.provider` ("openai" or "mock");
 * the bundled implementation throws for any other value.
 */
export declare function getEmbeddingProvider(config: EmbeddingConfig): EmbeddingProvider;
|
|
11
|
+
//# sourceMappingURL=embeddings.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embeddings.d.ts","sourceRoot":"","sources":["../src/embeddings.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,SAAS,CAAA;AAE/D,MAAM,WAAW,iBAAiB;IACjC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;IACrB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAA;IAE3B,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAAA;IAC7C,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC,CAAA;CACvD;AAID,wBAAgB,sBAAsB,CAAC,MAAM,EAAE,eAAe,GAAG,iBAAiB,CAyEjF;AAID,wBAAgB,oBAAoB,CAAC,IAAI,SAAM,GAAG,iBAAiB,CAgClE;AAID,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,eAAe,GAAG,iBAAiB,CAa/E"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
// Shared data types.
export type {
    Document,
    DocumentMetadata,
    Chunk,
    ChunkMetadata,
    EmbeddingVector,
    EmbeddedChunk,
    RetrievalResult,
    QueryOptions,
    LoaderType,
    ChunkingStrategy,
    ChunkingConfig,
    EmbeddingConfig,
    VectorStoreConfig,
    RetrievalConfig,
} from './types';

// Document loaders.
export {
    textLoader,
    markdownLoader,
    htmlLoader,
    jsonLoader,
    csvLoader,
    getLoader,
} from './loaders';
export type { DocumentLoader } from './loaders';

// Chunking strategies.
export {
    fixedSizeChunker,
    recursiveChunker,
    sentenceChunker,
    getChunker,
} from './chunkers';
export type { Chunker } from './chunkers';

// Embedding providers.
export {
    createOpenAIEmbeddings,
    createMockEmbeddings,
    getEmbeddingProvider,
} from './embeddings';
export type { EmbeddingProvider } from './embeddings';

// Vector store and reranking helpers.
export {
    createInMemoryStore,
    cosineSimilarity,
    mmrRerank,
} from './vectorstore';
export type { VectorStore } from './vectorstore';

// End-to-end pipeline.
export { rag } from './pipeline';
export type { RAGPipeline, RAGPipelineConfig, IngestResult } from './pipeline';
|
|
12
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,YAAY,EACX,QAAQ,EACR,gBAAgB,EAChB,KAAK,EACL,aAAa,EACb,eAAe,EACf,aAAa,EACb,eAAe,EACf,YAAY,EACZ,UAAU,EACV,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,iBAAiB,EACjB,eAAe,GACf,MAAM,SAAS,CAAA;AAGhB,OAAO,EACN,UAAU,EACV,cAAc,EACd,UAAU,EACV,UAAU,EACV,SAAS,EACT,SAAS,GACT,MAAM,WAAW,CAAA;AAClB,YAAY,EAAE,cAAc,EAAE,MAAM,WAAW,CAAA;AAG/C,OAAO,EACN,gBAAgB,EAChB,gBAAgB,EAChB,eAAe,EACf,UAAU,GACV,MAAM,YAAY,CAAA;AACnB,YAAY,EAAE,OAAO,EAAE,MAAM,YAAY,CAAA;AAGzC,OAAO,EACN,sBAAsB,EACtB,oBAAoB,EACpB,oBAAoB,GACpB,MAAM,cAAc,CAAA;AACrB,YAAY,EAAE,iBAAiB,EAAE,MAAM,cAAc,CAAA;AAGrD,OAAO,EACN,mBAAmB,EACnB,gBAAgB,EAChB,SAAS,GACT,MAAM,eAAe,CAAA;AACtB,YAAY,EAAE,WAAW,EAAE,MAAM,eAAe,CAAA;AAGhD,OAAO,EAAE,GAAG,EAAE,MAAM,YAAY,CAAA;AAChC,YAAY,EAAE,WAAW,EAAE,iBAAiB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,766 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// ../core/src/errors.ts
|
|
3
|
+
/**
 * Error type carrying a machine-readable `code` plus optional provider, HTTP
 * status and retry details. Static factories build the common variants.
 */
class ElsiumError extends Error {
  code;
  provider;
  model;
  statusCode;
  retryable;
  retryAfterMs;
  cause;
  metadata;

  /**
   * @param details Object whose `message` becomes the error message and whose
   *   remaining fields are copied onto the instance.
   */
  constructor(details) {
    super(details.message);
    this.name = "ElsiumError";
    const { code, provider, model, statusCode, retryable, retryAfterMs, cause, metadata } = details;
    Object.assign(this, { code, provider, model, statusCode, retryable, retryAfterMs, cause, metadata });
  }

  /** Plain-object form used by `JSON.stringify`; `cause` is not included. */
  toJSON() {
    const { name, code, message, provider, model, statusCode, retryable, retryAfterMs, metadata } = this;
    return { name, code, message, provider, model, statusCode, retryable, retryAfterMs, metadata };
  }

  /** Generic upstream failure; not retryable unless `opts.retryable` says so. */
  static providerError(message, opts) {
    const details = {
      code: "PROVIDER_ERROR",
      message,
      provider: opts.provider,
      statusCode: opts.statusCode,
      retryable: opts.retryable ?? false,
      cause: opts.cause
    };
    return new ElsiumError(details);
  }

  /** HTTP 429 from `provider`; always retryable. */
  static rateLimit(provider, retryAfterMs) {
    const details = {
      code: "RATE_LIMIT",
      message: `Rate limited by ${provider}`,
      provider,
      statusCode: 429,
      retryable: true,
      retryAfterMs
    };
    return new ElsiumError(details);
  }

  /** HTTP 401 from `provider`; never retryable. */
  static authError(provider) {
    const details = {
      code: "AUTH_ERROR",
      message: `Authentication failed for ${provider}. Check your API key.`,
      provider,
      statusCode: 401,
      retryable: false
    };
    return new ElsiumError(details);
  }

  /** Request to `provider` exceeded `timeoutMs`; retryable. */
  static timeout(provider, timeoutMs) {
    const details = {
      code: "TIMEOUT",
      message: `Request to ${provider} timed out after ${timeoutMs}ms`,
      provider,
      retryable: true
    };
    return new ElsiumError(details);
  }

  /** Invalid input; `metadata` carries caller-supplied context. */
  static validation(message, metadata) {
    const details = {
      code: "VALIDATION_ERROR",
      message,
      retryable: false,
      metadata
    };
    return new ElsiumError(details);
  }

  /** Token budget overrun; both figures are stored in `metadata`. */
  static budgetExceeded(spent, budget) {
    const details = {
      code: "BUDGET_EXCEEDED",
      message: `Token budget exceeded: spent ${spent}, budget ${budget}`,
      retryable: false,
      metadata: { spent, budget }
    };
    return new ElsiumError(details);
  }
}
|
|
91
|
+
// ../core/src/utils.ts
|
|
92
|
+
import { randomBytes } from "crypto";
|
|
93
|
+
/**
 * Returns `bytes` cryptographically random bytes encoded as lowercase hex
 * (two characters per byte).
 */
function cryptoHex(bytes) {
  const buffer = randomBytes(bytes);
  return buffer.toString("hex");
}
|
|
96
|
+
/**
 * Builds an identifier of the form `<prefix>_<base36 timestamp>_<8 hex chars>`.
 *
 * @param prefix Leading tag; defaults to "els".
 */
function generateId(prefix = "els") {
  const stamp = Date.now().toString(36);
  const suffix = cryptoHex(4);
  return "".concat(prefix, "_", stamp, "_", suffix);
}
|
|
101
|
+
// src/loaders.ts
|
|
102
|
+
/** Wraps `content` and `metadata` in a document record with a fresh "doc" id. */
function createDocument(content, metadata) {
  const id = generateId("doc");
  return { id, content, metadata };
}
|
|
109
|
+
/** Loader that stores the content unchanged and tags it as type "text". */
function textLoader() {
  function load(source, content) {
    const metadata = { source, type: "text" };
    return createDocument(content, metadata);
  }
  return { load };
}
|
|
119
|
+
/**
 * Loader that keeps the markdown source as-is and records the first level-1
 * heading (if any) as `title`.
 */
function markdownLoader() {
  function load(source, content) {
    const metadata = {
      source,
      type: "markdown",
      title: extractMarkdownTitle(content)
    };
    return createDocument(content, metadata);
  }
  return { load };
}
|
|
131
|
+
/**
 * Returns the text of the first level-1 ATX heading (`# Title`) in `content`,
 * trimmed, or `undefined` when there is none.
 *
 * The heading text must be on the same line as the `#`: only spaces and tabs are
 * accepted between them, so an empty `#` line can no longer capture the
 * following line as the title.
 */
function extractMarkdownTitle(content) {
  const match = content.match(/^#[ \t]+(\S.*)$/m);
  return match?.[1]?.trim();
}
|
|
135
|
+
/**
 * Loader that stores the tag-stripped text of an HTML page and records the
 * contents of its `<title>` element (if any) as `title`.
 */
function htmlLoader() {
  function load(source, content) {
    const text = stripHtml(content);
    const metadata = {
      source,
      type: "html",
      title: extractHtmlTitle(content)
    };
    return createDocument(text, metadata);
  }
  return { load };
}
|
|
148
|
+
/**
 * Reduces an HTML string to plain text.
 *
 * Steps: drop `<script>` and `<style>` elements with their contents, replace
 * every remaining tag with a space, decode a small set of character entities,
 * then collapse whitespace and trim.
 *
 * NOTE: this is a text extractor, not a sanitizer; the result must not be
 * treated as safe HTML.
 */
function stripHtml(html) {
  const withoutMarkup = html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<[^>]+>/g, " ");
  // `&amp;` is decoded last so that text such as `&amp;lt;` becomes `&lt;`
  // rather than being unescaped twice into `<`.
  const decoded = withoutMarkup
    .replace(/&nbsp;/g, " ")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, "&");
  return decoded.replace(/\s+/g, " ").trim();
}
|
|
151
|
+
/**
 * Returns the trimmed contents of the first `<title>` element in `html`
 * (matched case-insensitively), or `undefined` when there is none.
 */
function extractHtmlTitle(html) {
  const found = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
  if (found === null) {
    return undefined;
  }
  return found[1].trim();
}
|
|
155
|
+
/**
 * Derives the text for one parsed JSON item.
 *
 * - a string is returned unchanged;
 * - a non-null object yields its `contentField` value when that is a string,
 *   otherwise the whole object pretty-printed as JSON;
 * - anything else yields the empty string.
 */
function extractItemText(item, contentField) {
  switch (typeof item) {
    case "string":
      return item;
    case "object": {
      if (item === null) {
        return "";
      }
      const value = item[contentField];
      return typeof value === "string" ? value : JSON.stringify(item, null, 2);
    }
    default:
      return "";
  }
}
|
|
166
|
+
/**
 * Copies the requested fields from a parsed JSON value into a new object.
 * Only a top-level, non-array object contributes fields; any other input
 * yields an empty object. Fields absent from `parsed` are skipped.
 */
function extractMetadataFields(parsed, metadataFields) {
  const picked = {};
  const isRecord = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
  if (!isRecord) {
    return picked;
  }
  for (const field of metadataFields) {
    if (!(field in parsed)) {
      continue;
    }
    picked[field] = parsed[field];
  }
  return picked;
}
|
|
177
|
+
/**
 * Loader for JSON text. The parsed value (or each element, when it is an array)
 * is turned into text via `extractItemText`; empty results are dropped and the
 * rest are joined with a blank line. Requested metadata fields are copied from
 * a top-level object.
 *
 * @param options.contentField Property holding an item's text (defaults to "content").
 * @param options.metadataFields Top-level properties to copy into metadata (defaults to none).
 */
function jsonLoader(options) {
  const contentField = options?.contentField ?? "content";
  const metadataFields = options?.metadataFields ?? [];

  function load(source, content) {
    // JSON.parse errors propagate to the caller unchanged.
    const parsed = JSON.parse(content);
    const texts = [];
    for (const item of Array.isArray(parsed) ? parsed : [parsed]) {
      const text = extractItemText(item, contentField);
      if (text) {
        texts.push(text);
      }
    }
    const metadata = {
      source,
      type: "json",
      ...extractMetadataFields(parsed, metadataFields)
    };
    return createDocument(texts.join("\n\n"), metadata);
  }

  return { load };
}
|
|
196
|
+
/**
 * Loader for CSV text. The first non-blank line is the header; every following
 * non-blank line becomes one text row of `header: value` pairs joined by ", ".
 *
 * @param options.separator Field separator (defaults to ",").
 * @param options.contentColumns Header names to include; defaults to all headers.
 *   Names that are not in the header are ignored, and if none match the raw
 *   row values are joined instead.
 */
function csvLoader(options) {
  const separator = options?.separator ?? ",";
  const contentColumns = options?.contentColumns;

  // Renders one parsed row using the selected column indices.
  function formatRow(row, headers, indices) {
    if (indices.length === 0) {
      return row.join(", ");
    }
    return indices.map((i) => `${headers[i]}: ${row[i] ?? ""}`).join(", ");
  }

  return {
    load(source, content) {
      const [headerLine, ...dataLines] = content
        .split("\n")
        .filter((line) => line.trim().length > 0);
      if (headerLine === undefined) {
        return createDocument("", { source, type: "csv" });
      }
      const headers = parseCSVLine(headerLine, separator);
      const rows = dataLines.map((line) => parseCSVLine(line, separator));
      const indices = [];
      for (const column of contentColumns ?? headers) {
        const position = headers.indexOf(column);
        if (position >= 0) {
          indices.push(position);
        }
      }
      const text = rows.map((row) => formatRow(row, headers, indices)).join("\n");
      return createDocument(text, {
        source,
        type: "csv",
        rowCount: rows.length,
        columns: headers
      });
    }
  };
}
|
|
226
|
+
/**
 * Splits one CSV line into trimmed fields.
 *
 * Double quotes toggle quoted mode, in which the separator is literal; a doubled
 * quote inside a quoted field yields a single quote character. The separator is
 * compared against one character at a time.
 */
function parseCSVLine(line, separator) {
  const fields = [];
  let buffer = "";
  let quoted = false;
  let pos = 0;
  while (pos < line.length) {
    const ch = line[pos];
    if (ch === '"' && quoted && line[pos + 1] === '"') {
      // Escaped quote: consume both characters, emit one.
      buffer += '"';
      pos += 2;
      continue;
    }
    if (ch === '"') {
      quoted = !quoted;
    } else if (!quoted && ch === separator) {
      fields.push(buffer.trim());
      buffer = "";
    } else {
      buffer += ch;
    }
    pos += 1;
  }
  fields.push(buffer.trim());
  return fields;
}
|
|
249
|
+
/**
 * Returns a loader with default options for `type` ("text", "markdown", "html",
 * "json" or "csv"); any other value yields `undefined`.
 */
function getLoader(type) {
  if (type === "text") {
    return textLoader();
  }
  if (type === "markdown") {
    return markdownLoader();
  }
  if (type === "html") {
    return htmlLoader();
  }
  if (type === "json") {
    return jsonLoader();
  }
  if (type === "csv") {
    return csvLoader();
  }
  return undefined;
}
|
|
263
|
+
// src/chunkers.ts
|
|
264
|
+
/**
 * Creates a chunker that cuts content into windows of at most `maxChunkSize`
 * characters, each starting `overlap` characters before the previous window ended.
 *
 * @param options.maxChunkSize Window length in characters (defaults to 512).
 * @param options.overlap Characters shared by consecutive windows (defaults to 0).
 * @throws Error when `overlap` is greater than or equal to `maxChunkSize`.
 */
function fixedSizeChunker(options) {
  const windowSize = options?.maxChunkSize ?? 512;
  const overlap = options?.overlap ?? 0;
  if (overlap >= windowSize) {
    throw new Error("overlap must be less than maxChunkSize");
  }

  return {
    chunk(document) {
      const text = document.content;
      const pieces = [];
      if (text.length === 0) {
        return pieces;
      }
      let begin = 0;
      for (let index = 0; begin < text.length; index++) {
        const end = Math.min(begin + windowSize, text.length);
        const slice = text.slice(begin, end);
        pieces.push({
          id: generateId("chk"),
          content: slice,
          documentId: document.id,
          index,
          metadata: {
            startChar: begin,
            endChar: end,
            // Rough estimate: four characters per token.
            tokenEstimate: Math.ceil(slice.length / 4)
          }
        });
        if (end === text.length) {
          break;
        }
        begin = end - overlap;
      }
      return pieces;
    }
  };
}
|
|
303
|
+
// Separators tried in order by the recursive chunker: blank line, newline,
// sentence end, space, and finally the empty string (no further splitting).
var DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""];
|
|
307
|
+
/**
 * Creates a chunker that splits text on a list of separators, trying each in
 * turn, and falls back to fixed-size slices when no separator produces pieces
 * that fit.
 *
 * Adjacent pieces are merged back together (with their separator) while the
 * result stays within `maxChunkSize`. Every part carries the offset at which it
 * starts in the document, so `startChar`/`endChar` stay exact even when the
 * fixed-size fallback produces overlapping parts.
 *
 * @param options.maxChunkSize Maximum characters per chunk (defaults to 512).
 * @param options.overlap Character overlap used by the fixed-size fallback (defaults to 0).
 * @param options.separators Separators tried in order (defaults to DEFAULT_SEPARATORS).
 * @throws Error when `overlap` is greater than or equal to `maxChunkSize`.
 */
function recursiveChunker(options) {
  const maxSize = options?.maxChunkSize ?? 512;
  const overlap = options?.overlap ?? 0;
  const separators = options?.separators ?? DEFAULT_SEPARATORS;
  if (overlap >= maxSize) {
    throw new Error("overlap must be less than maxChunkSize");
  }

  // Last resort: overlapping fixed-size windows. `base` is the offset of `text`
  // within the document.
  function fixedSizeSplit(text, base) {
    const parts = [];
    const step = maxSize - overlap;
    for (let i = 0; i < text.length; i += step) {
      parts.push({ text: text.slice(i, i + maxSize), start: base + i });
      // Once a window reaches the end, any further window would only repeat
      // text already covered by this one.
      if (i + maxSize >= text.length) {
        break;
      }
    }
    return parts;
  }

  // Greedily re-joins consecutive splits while they fit; oversized splits are
  // handed to the next separator.
  function mergeSplits(splits, separator, sepIndex, base) {
    const result = [];
    let current = "";
    let currentStart = base;
    let cursor = base;
    for (const split of splits) {
      const splitStart = cursor;
      cursor += split.length + separator.length;
      const candidate = current ? current + separator + split : split;
      if (candidate.length <= maxSize) {
        if (!current) {
          currentStart = splitStart;
        }
        current = candidate;
        continue;
      }
      if (current) {
        result.push({ text: current, start: currentStart });
      }
      if (split.length > maxSize) {
        result.push(...splitRecursive(split, sepIndex + 1, splitStart));
        current = "";
      } else {
        current = split;
        currentStart = splitStart;
      }
    }
    if (current) {
      result.push({ text: current, start: currentStart });
    }
    return result;
  }

  // Splits `text` (located at offset `base`) using separators[sepIndex] onward.
  function splitRecursive(text, sepIndex, base) {
    if (text.length <= maxSize) {
      return [{ text, start: base }];
    }
    if (sepIndex >= separators.length) {
      return fixedSizeSplit(text, base);
    }
    const separator = separators[sepIndex];
    const splits = separator === "" ? [text] : text.split(separator);
    return mergeSplits(splits, separator, sepIndex, base);
  }

  return {
    chunk(document) {
      // An empty document produces no chunks, matching the other chunkers.
      if (document.content.length === 0) {
        return [];
      }
      return splitRecursive(document.content, 0, 0).map(({ text, start }, index) => ({
        id: generateId("chk"),
        content: text,
        documentId: document.id,
        index,
        metadata: {
          startChar: start,
          endChar: start + text.length,
          // Rough estimate: four characters per token.
          tokenEstimate: Math.ceil(text.length / 4)
        }
      }));
    }
  };
}
|
|
378
|
+
/**
 * Creates a chunker that groups whole sentences into chunks of at most
 * `maxChunkSize` characters, repeating the last `overlap` sentences of a chunk
 * at the start of the next one.
 *
 * Sentences are tracked by index and document offset rather than by text, so
 * repeated sentences cannot stall the loop or duplicate a chunk, and
 * `startChar`/`endChar` are the real span of the group in the document.
 * A single sentence longer than `maxChunkSize` is emitted as its own chunk.
 *
 * @param options.maxChunkSize Maximum characters per chunk (defaults to 512).
 * @param options.overlap Number of sentences shared with the next chunk (defaults to 1).
 */
function sentenceChunker(options) {
  const maxSize = options?.maxChunkSize ?? 512;
  const overlapSentences = options?.overlap ?? 1;

  // Splits after ., ! or ? followed by whitespace and records where each
  // trimmed sentence starts in `text`.
  function locateSentences(text) {
    const located = [];
    let cursor = 0;
    for (const piece of text.split(/(?<=[.!?])\s+/)) {
      const sentence = piece.trim();
      if (sentence.length === 0) {
        continue;
      }
      // Only whitespace lies between `cursor` and the sentence, so the first
      // match at or after `cursor` is the sentence itself.
      const start = text.indexOf(sentence, cursor);
      located.push({ text: sentence, start });
      cursor = start + sentence.length;
    }
    return located;
  }

  // Returns the exclusive end index of the group starting at `startIdx`.
  // The group always contains at least one sentence.
  function gatherGroup(sentences, startIdx) {
    let length = 0;
    let end = startIdx;
    while (end < sentences.length) {
      const joiner = end > startIdx ? 1 : 0;
      const nextLen = length + sentences[end].text.length + joiner;
      if (nextLen > maxSize && end > startIdx) {
        break;
      }
      length = nextLen;
      end++;
    }
    return end;
  }

  // Index at which the next group starts: rewound by the overlap when that
  // still moves past the current group's start, otherwise right after the group.
  function nextStart(groupStart, groupEnd, total) {
    if (overlapSentences <= 0 || groupEnd >= total) {
      return groupEnd;
    }
    const rewound = groupEnd - overlapSentences;
    return rewound > groupStart ? rewound : groupEnd;
  }

  return {
    chunk(document) {
      const sentences = locateSentences(document.content);
      const chunks = [];
      let i = 0;
      while (i < sentences.length) {
        const end = gatherGroup(sentences, i);
        const group = sentences.slice(i, end);
        const content = group.map((s) => s.text).join(" ");
        const first = group[0];
        const last = group[group.length - 1];
        chunks.push({
          id: generateId("chk"),
          content,
          documentId: document.id,
          index: chunks.length,
          metadata: {
            startChar: first.start,
            endChar: last.start + last.text.length,
            // Rough estimate: four characters per token.
            tokenEstimate: Math.ceil(content.length / 4),
            sentenceCount: group.length
          }
        });
        i = nextStart(i, end, sentences.length);
      }
      return chunks;
    }
  };
}
|
|
444
|
+
/**
 * Builds the chunker named by `config.strategy`, passing along
 * `config.maxChunkSize` and `config.overlap`. Unknown strategies yield `undefined`.
 */
function getChunker(config) {
  const { strategy } = config;
  if (strategy === "fixed-size") {
    return fixedSizeChunker({ maxChunkSize: config.maxChunkSize, overlap: config.overlap });
  }
  if (strategy === "recursive") {
    return recursiveChunker({ maxChunkSize: config.maxChunkSize, overlap: config.overlap });
  }
  if (strategy === "sentence") {
    return sentenceChunker({ maxChunkSize: config.maxChunkSize, overlap: config.overlap });
  }
  return undefined;
}
|
|
463
|
+
// src/embeddings.ts
|
|
464
|
+
/**
 * Creates an embedding provider backed by the OpenAI-style
 * `POST {baseUrl}/v1/embeddings` endpoint.
 *
 * @param config.apiKey Bearer token; required.
 * @param config.model Model name (defaults to "text-embedding-3-small").
 * @param config.baseUrl API origin (defaults to "https://api.openai.com").
 * @param config.dimensions Requested vector length (defaults to 1536); sent with
 *   every request when truthy.
 * @param config.batchSize Maximum texts per request in `embedBatch` (defaults to 100).
 * @throws ElsiumError with code "CONFIG_ERROR" when the API key is missing or
 *   `batchSize` is not a positive integer.
 */
function createOpenAIEmbeddings(config) {
  const {
    apiKey,
    model = "text-embedding-3-small",
    baseUrl = "https://api.openai.com",
    dimensions = 1536,
    batchSize = 100
  } = config;
  if (!apiKey) {
    throw new ElsiumError({
      code: "CONFIG_ERROR",
      message: "OpenAI API key is required for embeddings",
      retryable: false
    });
  }
  // A zero, negative or fractional batch size would make embedBatch loop
  // forever or slice overlapping batches, so reject it up front.
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    throw new ElsiumError({
      code: "CONFIG_ERROR",
      message: "batchSize must be a positive integer",
      retryable: false
    });
  }

  // Sends one request and returns the raw vectors ordered by input index.
  async function callAPI(input) {
    const response = await fetch(`${baseUrl}/v1/embeddings`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`
      },
      body: JSON.stringify({
        input,
        model,
        ...dimensions ? { dimensions } : {}
      })
    });
    if (!response.ok) {
      const body = await response.text().catch(() => "Unknown error");
      throw ElsiumError.providerError(`OpenAI embeddings error ${response.status}: ${body}`, {
        provider: "openai",
        statusCode: response.status,
        // Rate limiting (429) is transient, just like server-side failures.
        retryable: response.status === 429 || response.status >= 500
      });
    }
    const json = await response.json();
    return json.data.sort((a, b) => a.index - b.index).map((d) => d.embedding);
  }

  return {
    name: "openai",
    dimensions,
    async embed(text) {
      const [embedding] = await callAPI([text]);
      return { values: embedding, dimensions: embedding.length };
    },
    async embedBatch(texts) {
      const results = [];
      // Batches are sent sequentially so results stay in input order.
      for (let i = 0; i < texts.length; i += batchSize) {
        const batch = texts.slice(i, i + batchSize);
        const embeddings = await callAPI(batch);
        results.push(...embeddings.map((values) => ({
          values,
          dimensions: values.length
        })));
      }
      return results;
    }
  };
}
|
|
524
|
+
/**
 * Creates a deterministic embedding provider that needs no network access.
 * Each character code (divided by 1000) is added to bucket `position % dims`,
 * and the vector is scaled to unit length unless it is all zeros.
 *
 * @param dims Vector length (defaults to 128).
 */
function createMockEmbeddings(dims = 128) {
  const hashEmbed = (text) => {
    const buckets = new Array(dims).fill(0);
    for (let pos = 0; pos < text.length; pos++) {
      buckets[pos % dims] += text.charCodeAt(pos) / 1000;
    }
    let sumOfSquares = 0;
    for (const value of buckets) {
      sumOfSquares += value * value;
    }
    const norm = Math.sqrt(sumOfSquares);
    if (norm > 0) {
      for (let slot = 0; slot < dims; slot++) {
        buckets[slot] /= norm;
      }
    }
    return buckets;
  };
  const toVector = (text) => ({ values: hashEmbed(text), dimensions: dims });

  return {
    name: "mock",
    dimensions: dims,
    async embed(text) {
      return toVector(text);
    },
    async embedBatch(texts) {
      return texts.map((text) => toVector(text));
    }
  };
}
|
|
553
|
+
/**
 * Returns the embedding provider named by `config.provider`.
 *
 * @throws ElsiumError with code "CONFIG_ERROR" for an unrecognized provider.
 */
function getEmbeddingProvider(config) {
  if (config.provider === "openai") {
    return createOpenAIEmbeddings(config);
  }
  if (config.provider === "mock") {
    return createMockEmbeddings(config.dimensions);
  }
  throw new ElsiumError({
    code: "CONFIG_ERROR",
    message: `Unknown embedding provider: ${config.provider}`,
    retryable: false
  });
}
|
|
567
|
+
// src/vectorstore.ts
|
|
568
|
+
/**
 * Cosine similarity of two numeric vectors.
 * Returns 0 when the lengths differ or either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  const size = a.length;
  if (size !== b.length) {
    return 0;
  }
  let dot = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  let k = 0;
  while (k < size) {
    const x = a[k];
    const y = b[k];
    dot += x * y;
    sumSqA += x * x;
    sumSqB += y * y;
    k += 1;
  }
  const normA = Math.sqrt(sumSqA);
  const normB = Math.sqrt(sumSqB);
  return normA === 0 || normB === 0 ? 0 : dot / (normA * normB);
}
|
|
585
|
+
/**
 * Creates a vector store that keeps embedded chunks in a Map keyed by chunk id.
 * When the store grows past `maxChunks`, the earliest-inserted entries are removed.
 *
 * @param options.maxChunks Capacity (defaults to 100000).
 */
function createInMemoryStore(options) {
  const maxChunks = options?.maxChunks ?? 1e5;
  const entries = new Map();

  // Removes entries in insertion order until the store fits its capacity.
  function evictOldest() {
    while (entries.size > maxChunks) {
      const oldestKey = entries.keys().next().value;
      if (oldestKey !== undefined) {
        entries.delete(oldestKey);
      }
    }
  }

  // Result record for one stored chunk; the embedding itself is not exposed.
  function toResult(stored, score) {
    const { id, content, documentId, index, metadata } = stored;
    return {
      chunk: { id, content, documentId, index, metadata },
      score,
      distance: 1 - score
    };
  }

  return {
    name: "in-memory",

    async upsert(chunks) {
      for (const chunk of chunks) {
        entries.set(chunk.id, chunk);
      }
      evictOldest();
    },

    // Scores every stored chunk by cosine similarity and returns the best
    // `topK` (default 5) scoring at least `minScore` (default 0). Only `topK`
    // and `minScore` are read from the options.
    async query(embedding, options2) {
      const topK = options2?.topK ?? 5;
      const minScore = options2?.minScore ?? 0;
      const hits = [];
      for (const stored of entries.values()) {
        const score = cosineSimilarity(embedding.values, stored.embedding.values);
        if (!(score >= minScore)) {
          continue;
        }
        hits.push(toResult(stored, score));
      }
      hits.sort((a, b) => b.score - a.score);
      return hits.slice(0, topK);
    },

    async delete(ids) {
      for (const id of ids) {
        entries.delete(id);
      }
    },

    async clear() {
      entries.clear();
    },

    async count() {
      return entries.size;
    }
  };
}
|
|
636
|
+
/**
 * Looks up the embedding values of the first entry in `results` whose chunk id
 * matches `sel`; returns an empty array when there is no such entry.
 */
function getEmbeddingValues(sel, results) {
  for (const entry of results) {
    if (entry.chunk.id === sel.chunk.id) {
      return entry.embedding.values ?? [];
    }
  }
  return [];
}
|
|
640
|
+
/**
 * Highest cosine similarity between `candidate` and any already-selected entry.
 * Returns negative infinity when `selected` is empty.
 */
function maxSimilarityToSelected(candidate, selected, results) {
  let highest = Number.NEGATIVE_INFINITY;
  for (const chosen of selected) {
    const similarity = cosineSimilarity(
      candidate.embedding.values,
      getEmbeddingValues(chosen, results)
    );
    highest = similarity > highest ? similarity : highest;
  }
  return highest;
}
|
|
650
|
+
/**
 * Returns the index in `remaining` of the entry with the highest MMR score:
 * `lambda * relevance - (1 - lambda) * maxSimilarityToSelected`.
 * Ties keep the earliest index; an empty list yields 0.
 */
function selectBestCandidate(remaining, selected, results, lambda) {
  let winner = 0;
  let winnerScore = Number.NEGATIVE_INFINITY;
  for (const [position, candidate] of remaining.entries()) {
    const relevance = candidate.score;
    const redundancy = maxSimilarityToSelected(candidate, selected, results);
    const blended = lambda * relevance - (1 - lambda) * redundancy;
    if (blended > winnerScore) {
      winnerScore = blended;
      winner = position;
    }
  }
  return winner;
}
|
|
664
|
+
/**
 * Reorders `results` with Maximal Marginal Relevance: the highest-scoring entry
 * is taken first, then entries are picked one at a time by balancing their
 * score against similarity to what has already been selected.
 *
 * @param queryEmbedding Accepted for interface compatibility; not read here
 *   (relevance comes from each result's `score`).
 * @param results Scored entries carrying `chunk`, `score` and `embedding`; not mutated.
 * @param options.topK Maximum number of entries returned (defaults to 5).
 *   A value that is not greater than zero yields an empty list.
 * @param options.lambda Weight of relevance versus diversity (defaults to 0.7).
 */
function mmrRerank(queryEmbedding, results, options) {
  const topK = options?.topK ?? 5;
  const lambda = options?.lambda ?? 0.7;
  // Asking for zero (or fewer) results must not return the top hit.
  if (results.length === 0 || !(topK > 0)) {
    return [];
  }
  const selected = [];
  const remaining = [...results];
  remaining.sort((a, b) => b.score - a.score);
  const first = remaining.shift();
  if (!first) {
    return [];
  }
  selected.push(first);
  while (selected.length < topK && remaining.length > 0) {
    const bestIndex = selectBestCandidate(remaining, selected, results, lambda);
    selected.push(remaining[bestIndex]);
    remaining.splice(bestIndex, 1);
  }
  return selected;
}
|
|
683
|
+
// src/pipeline.ts
|
|
684
|
+
/**
 * Builds a retrieval pipeline backed by the in-memory vector store:
 * load -> chunk -> embed -> store, plus similarity queries.
 *
 * Defaults: loader "text"; chunking `{ strategy: "recursive", maxChunkSize: 512,
 * overlap: 50 }`; retrieval `{ topK: 5, minScore: 0, strategy: "similarity" }`.
 * `query` reads only `topK` and `minScore` from the retrieval config.
 *
 * The returned methods do not depend on `this`, so they can be destructured or
 * passed as callbacks.
 *
 * @param config.embeddings Embedding provider configuration (required).
 * @param config.loader Loader type used by `ingest`.
 * @param config.chunking Chunking configuration.
 * @param config.retrieval Default query options.
 * @throws Error when `config.store` is set; external stores are not implemented.
 */
function rag(config) {
  const loaderType = config.loader ?? "text";
  const chunkingConfig = config.chunking ?? {
    strategy: "recursive",
    maxChunkSize: 512,
    overlap: 50
  };
  const retrievalConfig = config.retrieval ?? {
    topK: 5,
    minScore: 0,
    strategy: "similarity"
  };
  if (config.store) {
    throw new Error("External vector store not yet implemented. Use in-memory store.");
  }
  const loader = getLoader(loaderType);
  const chunker = getChunker(chunkingConfig);
  const embeddingProvider = getEmbeddingProvider(config.embeddings);
  const vectorStore = createInMemoryStore();

  // Embeds all chunk texts in one batch call and pairs vector i with chunk i.
  async function embedChunks(chunks) {
    const texts = chunks.map((c) => c.content);
    const embeddings = await embeddingProvider.embedBatch(texts);
    return chunks.map((chunk, i) => ({
      ...chunk,
      embedding: embeddings[i]
    }));
  }

  // Chunks, embeds and stores an already-loaded document.
  async function ingestDocument(document) {
    const chunks = chunker.chunk(document);
    if (chunks.length === 0) {
      return { documentId: document.id, chunkCount: 0, totalTokens: 0 };
    }
    const embedded = await embedChunks(chunks);
    await vectorStore.upsert(embedded);
    const totalTokens = chunks.reduce((sum, c) => sum + c.metadata.tokenEstimate, 0);
    return {
      documentId: document.id,
      chunkCount: chunks.length,
      totalTokens
    };
  }

  // Loads raw content with the configured loader, then ingests the document.
  async function ingest(source, content) {
    const document = loader.load(source, content);
    return ingestDocument(document);
  }

  return {
    embeddingProvider,
    vectorStore,
    ingest,
    ingestDocument,
    // Embeds the query text and asks the store for the closest chunks;
    // per-call options override the configured retrieval defaults.
    async query(text, options) {
      const queryEmbedding = await embeddingProvider.embed(text);
      return vectorStore.query(queryEmbedding, {
        topK: options?.topK ?? retrievalConfig.topK,
        minScore: options?.minScore ?? retrievalConfig.minScore,
        filter: options?.filter
      });
    },
    async clear() {
      await vectorStore.clear();
    },
    async count() {
      return vectorStore.count();
    }
  };
}
|
|
748
|
+
export {
|
|
749
|
+
textLoader,
|
|
750
|
+
sentenceChunker,
|
|
751
|
+
recursiveChunker,
|
|
752
|
+
rag,
|
|
753
|
+
mmrRerank,
|
|
754
|
+
markdownLoader,
|
|
755
|
+
jsonLoader,
|
|
756
|
+
htmlLoader,
|
|
757
|
+
getLoader,
|
|
758
|
+
getEmbeddingProvider,
|
|
759
|
+
getChunker,
|
|
760
|
+
fixedSizeChunker,
|
|
761
|
+
csvLoader,
|
|
762
|
+
createOpenAIEmbeddings,
|
|
763
|
+
createMockEmbeddings,
|
|
764
|
+
createInMemoryStore,
|
|
765
|
+
cosineSimilarity
|
|
766
|
+
};
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { Document, LoaderType } from './types';
|
|
2
|
+
/**
 * Contract for turning raw content into a {@link Document}.
 */
export interface DocumentLoader {
    /**
     * Produces a document from a source identifier and its raw text content.
     *
     * @param source Identifier of where the content came from.
     * @param content Raw content to load.
     * @returns The resulting document.
     */
    load(source: string, content: string): Document;
}
|
|
5
|
+
/**
 * Factory returning a {@link DocumentLoader}.
 * NOTE(review): presumably intended for plain-text content, judging by the
 * name; the implementation is not visible from this declaration.
 */
export declare function textLoader(): DocumentLoader;
|
|
6
|
+
/**
 * Factory returning a {@link DocumentLoader}.
 * NOTE(review): presumably intended for Markdown content, judging by the
 * name; the implementation is not visible from this declaration.
 */
export declare function markdownLoader(): DocumentLoader;
|
|
7
|
+
/**
 * Factory returning a {@link DocumentLoader}.
 * NOTE(review): presumably intended for HTML content, judging by the name;
 * the implementation is not visible from this declaration.
 */
export declare function htmlLoader(): DocumentLoader;
|
|
8
|
+
/**
 * Factory returning a {@link DocumentLoader}, optionally customised.
 * NOTE(review): presumably `contentField` names the JSON property holding the
 * text and `metadataFields` the properties copied into metadata — confirm
 * against the implementation, which is not visible here.
 *
 * @param options Optional settings; both fields may be omitted.
 */
export declare function jsonLoader(
    options?: {
        contentField?: string;
        metadataFields?: Array<string>;
    }
): DocumentLoader;
|
|
12
|
+
/**
 * Factory returning a {@link DocumentLoader}, optionally customised.
 * NOTE(review): presumably `separator` is the column delimiter and
 * `contentColumns` selects which columns form the document text — confirm
 * against the implementation, which is not visible here.
 *
 * @param options Optional settings; both fields may be omitted.
 */
export declare function csvLoader(
    options?: {
        separator?: string;
        contentColumns?: Array<string>;
    }
): DocumentLoader;
|
|
16
|
+
/**
 * Returns a {@link DocumentLoader} for the given loader type.
 *
 * @param type One of the {@link LoaderType} literals.
 */
export declare function getLoader(type: LoaderType): DocumentLoader;
|
|
17
|
+
//# sourceMappingURL=loaders.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loaders.d.ts","sourceRoot":"","sources":["../src/loaders.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAoB,UAAU,EAAE,MAAM,SAAS,CAAA;AAErE,MAAM,WAAW,cAAc;IAC9B,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,QAAQ,CAAA;CAC/C;AAYD,wBAAgB,UAAU,IAAI,cAAc,CAS3C;AAID,wBAAgB,cAAc,IAAI,cAAc,CAY/C;AASD,wBAAgB,UAAU,IAAI,cAAc,CAa3C;AA8CD,wBAAgB,UAAU,CAAC,OAAO,CAAC,EAAE;IACpC,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;CACzB,GAAG,cAAc,CAkBjB;AAID,wBAAgB,SAAS,CAAC,OAAO,CAAC,EAAE;IACnC,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;CACzB,GAAG,cAAc,CAgCjB;AA+BD,wBAAgB,SAAS,CAAC,IAAI,EAAE,UAAU,GAAG,cAAc,CAa1D"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { type EmbeddingProvider } from './embeddings';
|
|
2
|
+
import type { ChunkingConfig, Document, EmbeddingConfig, LoaderType, QueryOptions, RetrievalConfig, RetrievalResult, VectorStoreConfig } from './types';
|
|
3
|
+
import { type VectorStore } from './vectorstore';
|
|
4
|
+
/**
 * Configuration accepted by `rag()`. Only `embeddings` is required.
 */
export interface RAGPipelineConfig {
    /** Loader used by `ingest`; `rag()` falls back to `"text"` when omitted. */
    loader?: LoaderType;
    /**
     * Chunking settings; `rag()` falls back to the recursive strategy with
     * `maxChunkSize: 512` and `overlap: 50` when omitted.
     */
    chunking?: ChunkingConfig;
    /** Embedding provider settings. */
    embeddings: EmbeddingConfig;
    /**
     * External vector store settings. `rag()` currently throws when this is
     * set, because only the in-memory store is implemented.
     */
    store?: VectorStoreConfig;
    /**
     * Retrieval defaults; `rag()` falls back to `topK: 5`, `minScore: 0`,
     * `strategy: "similarity"` when the whole object is omitted.
     */
    retrieval?: RetrievalConfig;
}
|
|
11
|
+
/**
 * Object returned by `rag()`: ingestion and query operations plus direct
 * access to the underlying embedding provider and vector store.
 */
export interface RAGPipeline {
    /** Loads `content` with the configured loader and ingests the resulting document. */
    ingest(source: string, content: string): Promise<IngestResult>;
    /** Chunks, embeds and stores an already-built document. */
    ingestDocument(document: Document): Promise<IngestResult>;
    /**
     * Embeds `text` and searches the vector store. Values in `options` take
     * precedence over the pipeline's retrieval configuration.
     */
    query(text: string, options?: QueryOptions): Promise<Array<RetrievalResult>>;
    /** Clears the underlying vector store. */
    clear(): Promise<void>;
    /** Resolves to the count reported by the underlying vector store. */
    count(): Promise<number>;
    /** Embedding provider used by this pipeline. */
    readonly embeddingProvider: EmbeddingProvider;
    /** Vector store used by this pipeline. */
    readonly vectorStore: VectorStore;
}
|
|
20
|
+
/**
 * Summary returned after ingesting one document.
 */
export interface IngestResult {
    /** Id of the ingested document. */
    documentId: string;
    /** Number of chunks produced for the document (0 when it yields none). */
    chunkCount: number;
    /** Sum of the chunks' `metadata.tokenEstimate` values. */
    totalTokens: number;
}
|
|
25
|
+
/**
 * Creates a {@link RAGPipeline} from the given configuration.
 *
 * @param config Pipeline configuration; only `embeddings` is required.
 * @throws Error when `config.store` is provided — external vector stores are
 *   not implemented and the pipeline always uses the in-memory store.
 */
export declare function rag(config: RAGPipelineConfig): RAGPipeline;
|
|
26
|
+
//# sourceMappingURL=pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,iBAAiB,EAAwB,MAAM,cAAc,CAAA;AAE3E,OAAO,KAAK,EAEX,cAAc,EACd,QAAQ,EAER,eAAe,EACf,UAAU,EACV,YAAY,EACZ,eAAe,EACf,eAAe,EACf,iBAAiB,EACjB,MAAM,SAAS,CAAA;AAChB,OAAO,EAAE,KAAK,WAAW,EAAuB,MAAM,eAAe,CAAA;AAErE,MAAM,WAAW,iBAAiB;IACjC,MAAM,CAAC,EAAE,UAAU,CAAA;IACnB,QAAQ,CAAC,EAAE,cAAc,CAAA;IACzB,UAAU,EAAE,eAAe,CAAA;IAC3B,KAAK,CAAC,EAAE,iBAAiB,CAAA;IACzB,SAAS,CAAC,EAAE,eAAe,CAAA;CAC3B;AAED,MAAM,WAAW,WAAW;IAC3B,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,CAAA;IAC9D,cAAc,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAAC,YAAY,CAAC,CAAA;IACzD,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC,CAAA;IACvE,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAA;IACtB,KAAK,IAAI,OAAO,CAAC,MAAM,CAAC,CAAA;IACxB,QAAQ,CAAC,iBAAiB,EAAE,iBAAiB,CAAA;IAC7C,QAAQ,CAAC,WAAW,EAAE,WAAW,CAAA;CACjC;AAED,MAAM,WAAW,YAAY;IAC5B,UAAU,EAAE,MAAM,CAAA;IAClB,UAAU,EAAE,MAAM,CAAA;IAClB,WAAW,EAAE,MAAM,CAAA;CACnB;AAED,wBAAgB,GAAG,CAAC,MAAM,EAAE,iBAAiB,GAAG,WAAW,CA8E1D"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
 * A loaded document: an id, its text content, and descriptive metadata.
 */
export interface Document {
    /** Document identifier; echoed back as `IngestResult.documentId`. */
    id: string;
    /** Text content of the document. */
    content: string;
    /** Descriptive metadata attached to the document. */
    metadata: DocumentMetadata;
}
|
|
6
|
+
/**
 * Metadata attached to a document. `source` and `type` are required; any
 * additional keys are allowed through the index signature.
 */
export interface DocumentMetadata {
    source: string;
    type: string;
    title?: string;
    language?: string;
    createdAt?: string;
    /** Arbitrary extra metadata; values must be narrowed before use. */
    [key: string]: unknown;
}
|
|
14
|
+
/**
 * A piece of a document produced by a chunker.
 */
export interface Chunk {
    /** Chunk identifier. */
    id: string;
    /** Text of the chunk; this is what gets embedded during ingestion. */
    content: string;
    /** Id of the document the chunk belongs to. */
    documentId: string;
    /** NOTE(review): presumably the chunk's position within its document — confirm. */
    index: number;
    /** Per-chunk metadata. */
    metadata: ChunkMetadata;
}
|
|
21
|
+
/**
 * Metadata attached to a chunk. Extra keys are allowed through the index
 * signature.
 */
export interface ChunkMetadata {
    /** NOTE(review): presumably a character offset into the source document — confirm. */
    startChar: number;
    /** NOTE(review): presumably a character offset into the source document — confirm. */
    endChar: number;
    /** Estimated token count; summed across chunks into `IngestResult.totalTokens`. */
    tokenEstimate: number;
    /** Arbitrary extra metadata; values must be narrowed before use. */
    [key: string]: unknown;
}
|
|
27
|
+
/**
 * An embedding: the numeric components plus a declared dimension count.
 */
export interface EmbeddingVector {
    /** Vector components. */
    values: Array<number>;
    /** NOTE(review): presumably equals `values.length` — confirm against producers. */
    dimensions: number;
}
|
|
31
|
+
/**
 * A {@link Chunk} together with its embedding; this is the shape accepted by
 * `VectorStore.upsert`.
 */
export interface EmbeddedChunk extends Chunk {
    /** Embedding computed for the chunk's content. */
    embedding: EmbeddingVector;
}
|
|
34
|
+
/**
 * One hit returned by a vector store query.
 */
export interface RetrievalResult {
    /** The matched chunk. */
    chunk: Chunk;
    /** Score of the match. NOTE(review): scale and direction are not visible here — confirm. */
    score: number;
    /** NOTE(review): presumably derived from `score` (e.g. its complement) — confirm. */
    distance: number;
}
|
|
39
|
+
/**
 * Per-query options. In the pipeline's `query`, `topK` and `minScore` fall
 * back to the pipeline's retrieval configuration when omitted.
 */
export interface QueryOptions {
    /** Requested number of results. */
    topK?: number;
    /** Score threshold passed to the store. */
    minScore?: number;
    /** Filter passed through to the store. NOTE(review): matching semantics not visible here. */
    filter?: Record<string, unknown>;
}
|
|
44
|
+
/** String-literal union of the loader kinds accepted by `getLoader` and `RAGPipelineConfig.loader`. */
export type LoaderType = 'text' | 'markdown' | 'html' | 'json' | 'csv';
|
|
45
|
+
/** String-literal union of the chunking strategies selectable through `ChunkingConfig.strategy`. */
export type ChunkingStrategy = 'fixed-size' | 'recursive' | 'sentence';
|
|
46
|
+
/**
 * Settings used to select and configure a chunker. Only `strategy` is
 * required.
 */
export interface ChunkingConfig {
    /** Which chunking strategy to use. */
    strategy: ChunkingStrategy;
    /** NOTE(review): unit (characters vs. tokens) is not visible here — confirm. */
    maxChunkSize?: number;
    /** NOTE(review): unit (characters vs. tokens) is not visible here — confirm. */
    overlap?: number;
    /** NOTE(review): which strategies honour this is not visible here — confirm. */
    separator?: string;
}
|
|
52
|
+
/**
 * Settings for selecting and configuring an embedding provider. Only
 * `provider` is required.
 */
export interface EmbeddingConfig {
    /** Provider name. NOTE(review): accepted values are not visible here. */
    provider: string;
    model?: string;
    apiKey?: string;
    baseUrl?: string;
    dimensions?: number;
    batchSize?: number;
}
|
|
60
|
+
/**
 * Settings for an external vector store. Only `provider` is required.
 * Note that `rag()` currently rejects any pipeline config that sets `store`.
 */
export interface VectorStoreConfig {
    /** Store provider name. NOTE(review): accepted values are not visible here. */
    provider: string;
    connectionString?: string;
    tableName?: string;
    dimensions?: number;
}
|
|
66
|
+
/**
 * Pipeline-level retrieval defaults. Every field is optional.
 */
export interface RetrievalConfig {
    /** Default number of results when a query does not specify `topK`. */
    topK?: number;
    /** Default score threshold when a query does not specify `minScore`. */
    minScore?: number;
    /** Retrieval strategy selector. */
    strategy?: 'similarity' | 'mmr';
    /** NOTE(review): presumably the MMR relevance/diversity trade-off — confirm. */
    mmrLambda?: number;
}
|
|
72
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,QAAQ;IACxB,EAAE,EAAE,MAAM,CAAA;IACV,OAAO,EAAE,MAAM,CAAA;IACf,QAAQ,EAAE,gBAAgB,CAAA;CAC1B;AAED,MAAM,WAAW,gBAAgB;IAChC,MAAM,EAAE,MAAM,CAAA;IACd,IAAI,EAAE,MAAM,CAAA;IACZ,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CACtB;AAID,MAAM,WAAW,KAAK;IACrB,EAAE,EAAE,MAAM,CAAA;IACV,OAAO,EAAE,MAAM,CAAA;IACf,UAAU,EAAE,MAAM,CAAA;IAClB,KAAK,EAAE,MAAM,CAAA;IACb,QAAQ,EAAE,aAAa,CAAA;CACvB;AAED,MAAM,WAAW,aAAa;IAC7B,SAAS,EAAE,MAAM,CAAA;IACjB,OAAO,EAAE,MAAM,CAAA;IACf,aAAa,EAAE,MAAM,CAAA;IACrB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CACtB;AAID,MAAM,WAAW,eAAe;IAC/B,MAAM,EAAE,MAAM,EAAE,CAAA;IAChB,UAAU,EAAE,MAAM,CAAA;CAClB;AAED,MAAM,WAAW,aAAc,SAAQ,KAAK;IAC3C,SAAS,EAAE,eAAe,CAAA;CAC1B;AAID,MAAM,WAAW,eAAe;IAC/B,KAAK,EAAE,KAAK,CAAA;IACZ,KAAK,EAAE,MAAM,CAAA;IACb,QAAQ,EAAE,MAAM,CAAA;CAChB;AAED,MAAM,WAAW,YAAY;IAC5B,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;CAChC;AAID,MAAM,MAAM,UAAU,GAAG,MAAM,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,GAAG,KAAK,CAAA;AAEtE,MAAM,MAAM,gBAAgB,GAAG,YAAY,GAAG,WAAW,GAAG,UAAU,CAAA;AAEtE,MAAM,WAAW,cAAc;IAC9B,QAAQ,EAAE,gBAAgB,CAAA;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AAED,MAAM,WAAW,eAAe;IAC/B,QAAQ,EAAE,MAAM,CAAA;IAChB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AAED,MAAM,WAAW,iBAAiB;IACjC,QAAQ,EAAE,MAAM,CAAA;IAChB,gBAAgB,CAAC,EAAE,MAAM,CAAA;IACzB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,UAAU,CAAC,EAAE,MAAM,CAAA;CACnB;AAED,MAAM,WAAW,eAAe;IAC/B,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,QAAQ,CAAC,EAAE,YAAY,GAAG,KAAK,CAAA;IAC/B,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { EmbeddedChunk, EmbeddingVector, QueryOptions, RetrievalResult } from './types';
|
|
2
|
+
/**
 * Storage and search contract for embedded chunks. Every operation is
 * asynchronous.
 */
export interface VectorStore {
    /** Name of the store implementation. */
    readonly name: string;
    /** Stores the given embedded chunks. */
    upsert(chunks: Array<EmbeddedChunk>): Promise<void>;
    /** Searches the store with a query embedding and optional query options. */
    query(embedding: EmbeddingVector, options?: QueryOptions): Promise<Array<RetrievalResult>>;
    /** Removes entries by id. NOTE(review): presumably chunk ids — confirm. */
    delete(ids: Array<string>): Promise<void>;
    /** Removes everything from the store. */
    clear(): Promise<void>;
    /** Resolves to the number of stored entries. */
    count(): Promise<number>;
}
|
|
10
|
+
/**
 * Returns a similarity score for two numeric vectors.
 * NOTE(review): presumably the cosine of the angle between `a` and `b`;
 * behaviour for mismatched lengths or zero vectors is not visible from this
 * declaration — confirm against the implementation.
 */
export declare function cosineSimilarity(a: number[], b: number[]): number;
|
|
11
|
+
/**
 * Creates a {@link VectorStore}; this is the store `rag()` always uses.
 *
 * @param options Optional settings.
 *   NOTE(review): `maxChunks` presumably caps how many chunks the store
 *   holds; what happens at the cap is not visible here — confirm.
 */
export declare function createInMemoryStore(
    options?: {
        maxChunks?: number;
    }
): VectorStore;
|
|
14
|
+
/**
 * Re-ranks retrieval results that carry their embeddings and returns the
 * selected subset as plain {@link RetrievalResult} items.
 * NOTE(review): presumably implements maximal marginal relevance with
 * `lambda` weighting relevance against diversity — confirm against the
 * implementation.
 *
 * @param queryEmbedding Embedding of the query.
 * @param results Candidate results, each with its chunk embedding attached.
 * @param options Optional `topK` (how many results to select) and `lambda`.
 */
export declare function mmrRerank(
    queryEmbedding: EmbeddingVector,
    results: Array<RetrievalResult & {
        embedding: EmbeddingVector;
    }>,
    options?: {
        topK?: number;
        lambda?: number;
    }
): RetrievalResult[];
|
|
20
|
+
//# sourceMappingURL=vectorstore.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vectorstore.d.ts","sourceRoot":"","sources":["../src/vectorstore.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,eAAe,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,SAAS,CAAA;AAE5F,MAAM,WAAW,WAAW;IAC3B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;IAErB,MAAM,CAAC,MAAM,EAAE,aAAa,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAC9C,KAAK,CAAC,SAAS,EAAE,eAAe,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC,CAAA;IACrF,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IACpC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAA;IACtB,KAAK,IAAI,OAAO,CAAC,MAAM,CAAC,CAAA;CACxB;AAID,wBAAgB,gBAAgB,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC,EAAE,MAAM,EAAE,GAAG,MAAM,CAkBjE;AAID,wBAAgB,mBAAmB,CAAC,OAAO,CAAC,EAAE;IAC7C,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB,GAAG,WAAW,CA2Dd;AAmDD,wBAAgB,SAAS,CACxB,cAAc,EAAE,eAAe,EAC/B,OAAO,EAAE,KAAK,CAAC,eAAe,GAAG;IAAE,SAAS,EAAE,eAAe,CAAA;CAAE,CAAC,EAChE,OAAO,CAAC,EAAE;IAAE,IAAI,CAAC,EAAE,MAAM,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,GAC1C,eAAe,EAAE,CAqBnB"}
|
package/package.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@elsium-ai/rag",
|
|
3
|
+
"version": "0.1.6",
|
|
4
|
+
"description": "RAG pipeline, document processing, embeddings, and vector stores for ElsiumAI",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"author": "Eric Utrera <ebutrera9103@gmail.com>",
|
|
7
|
+
"repository": {
|
|
8
|
+
"type": "git",
|
|
9
|
+
"url": "https://github.com/elsium-ai/elsium-ai",
|
|
10
|
+
"directory": "packages/rag"
|
|
11
|
+
},
|
|
12
|
+
"type": "module",
|
|
13
|
+
"main": "./dist/index.js",
|
|
14
|
+
"types": "./dist/index.d.ts",
|
|
15
|
+
"exports": {
|
|
16
|
+
".": {
|
|
17
|
+
"import": "./dist/index.js",
|
|
18
|
+
"types": "./dist/index.d.ts"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"files": [
|
|
22
|
+
"dist"
|
|
23
|
+
],
|
|
24
|
+
"scripts": {
|
|
25
|
+
"build": "bun build ./src/index.ts --outdir ./dist --target bun && bun x tsc -p tsconfig.build.json --emitDeclarationOnly",
|
|
26
|
+
"dev": "bun --watch src/index.ts"
|
|
27
|
+
},
|
|
28
|
+
"dependencies": {
|
|
29
|
+
"@elsium-ai/core": "workspace:*"
|
|
30
|
+
},
|
|
31
|
+
"devDependencies": {
|
|
32
|
+
"bun-types": "^1.3.0",
|
|
33
|
+
"typescript": "^5.7.0"
|
|
34
|
+
}
|
|
35
|
+
}
|