@vibe-agent-toolkit/rag 0.1.0-rc.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +424 -0
- package/dist/chunking/chunk-by-tokens.d.ts +22 -0
- package/dist/chunking/chunk-by-tokens.d.ts.map +1 -0
- package/dist/chunking/chunk-by-tokens.js +68 -0
- package/dist/chunking/chunk-by-tokens.js.map +1 -0
- package/dist/chunking/chunk-resource.d.ts +46 -0
- package/dist/chunking/chunk-resource.d.ts.map +1 -0
- package/dist/chunking/chunk-resource.js +131 -0
- package/dist/chunking/chunk-resource.js.map +1 -0
- package/dist/chunking/index.d.ts +10 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +9 -0
- package/dist/chunking/index.js.map +1 -0
- package/dist/chunking/types.d.ts +42 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +5 -0
- package/dist/chunking/types.js.map +1 -0
- package/dist/chunking/utils.d.ts +41 -0
- package/dist/chunking/utils.d.ts.map +1 -0
- package/dist/chunking/utils.js +62 -0
- package/dist/chunking/utils.js.map +1 -0
- package/dist/embedding-providers/index.d.ts +8 -0
- package/dist/embedding-providers/index.d.ts.map +1 -0
- package/dist/embedding-providers/index.js +8 -0
- package/dist/embedding-providers/index.js.map +1 -0
- package/dist/embedding-providers/openai-embedding-provider.d.ts +64 -0
- package/dist/embedding-providers/openai-embedding-provider.d.ts.map +1 -0
- package/dist/embedding-providers/openai-embedding-provider.js +92 -0
- package/dist/embedding-providers/openai-embedding-provider.js.map +1 -0
- package/dist/embedding-providers/transformers-embedding-provider.d.ts +62 -0
- package/dist/embedding-providers/transformers-embedding-provider.d.ts.map +1 -0
- package/dist/embedding-providers/transformers-embedding-provider.js +75 -0
- package/dist/embedding-providers/transformers-embedding-provider.js.map +1 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +16 -0
- package/dist/index.js.map +1 -0
- package/dist/interfaces/embedding.d.ts +33 -0
- package/dist/interfaces/embedding.d.ts.map +1 -0
- package/dist/interfaces/embedding.js +7 -0
- package/dist/interfaces/embedding.js.map +1 -0
- package/dist/interfaces/index.d.ts +9 -0
- package/dist/interfaces/index.d.ts.map +1 -0
- package/dist/interfaces/index.js +7 -0
- package/dist/interfaces/index.js.map +1 -0
- package/dist/interfaces/provider.d.ts +150 -0
- package/dist/interfaces/provider.d.ts.map +1 -0
- package/dist/interfaces/provider.js +8 -0
- package/dist/interfaces/provider.js.map +1 -0
- package/dist/interfaces/token-counter.d.ts +29 -0
- package/dist/interfaces/token-counter.d.ts.map +1 -0
- package/dist/interfaces/token-counter.js +7 -0
- package/dist/interfaces/token-counter.js.map +1 -0
- package/dist/schemas/admin.d.ts +82 -0
- package/dist/schemas/admin.d.ts.map +1 -0
- package/dist/schemas/admin.js +34 -0
- package/dist/schemas/admin.js.map +1 -0
- package/dist/schemas/chunk.d.ts +75 -0
- package/dist/schemas/chunk.d.ts.map +1 -0
- package/dist/schemas/chunk.js +39 -0
- package/dist/schemas/chunk.js.map +1 -0
- package/dist/schemas/index.d.ts +9 -0
- package/dist/schemas/index.d.ts.map +1 -0
- package/dist/schemas/index.js +9 -0
- package/dist/schemas/index.js.map +1 -0
- package/dist/schemas/json-schema.d.ts +86 -0
- package/dist/schemas/json-schema.d.ts.map +1 -0
- package/dist/schemas/json-schema.js +55 -0
- package/dist/schemas/json-schema.js.map +1 -0
- package/dist/schemas/query.d.ts +262 -0
- package/dist/schemas/query.d.ts.map +1 -0
- package/dist/schemas/query.js +56 -0
- package/dist/schemas/query.js.map +1 -0
- package/dist/token-counters/approximate-token-counter.d.ts +32 -0
- package/dist/token-counters/approximate-token-counter.d.ts.map +1 -0
- package/dist/token-counters/approximate-token-counter.js +40 -0
- package/dist/token-counters/approximate-token-counter.js.map +1 -0
- package/dist/token-counters/fast-token-counter.d.ts +33 -0
- package/dist/token-counters/fast-token-counter.d.ts.map +1 -0
- package/dist/token-counters/fast-token-counter.js +40 -0
- package/dist/token-counters/fast-token-counter.js.map +1 -0
- package/dist/token-counters/index.d.ts +8 -0
- package/dist/token-counters/index.d.ts.map +1 -0
- package/dist/token-counters/index.js +8 -0
- package/dist/token-counters/index.js.map +1 -0
- package/package.json +53 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Resource chunking
|
|
3
|
+
*
|
|
4
|
+
* Chunks ResourceMetadata using hybrid heading-based + token-aware strategy.
|
|
5
|
+
*/
|
|
6
|
+
import { chunkByTokens } from './chunk-by-tokens.js';
|
|
7
|
+
import { generateChunkId, generateContentHash } from './utils.js';
|
|
8
|
+
/**
 * Chunk a resource using a hybrid heading + token strategy.
 *
 * - When the resource has no headings, the whole content is handed to
 *   `chunkByTokens`.
 * - Otherwise, for every heading the lines between that heading's `line` and
 *   the next heading (or the end of the content) are joined, trimmed and
 *   handed to `chunkByTokens` together with heading metadata
 *   (`headingPath`, `headingLevel`, `startLine`, `endLine`). Sections that are
 *   empty after trimming produce no chunks.
 *
 * NOTE(review): lines located before the first heading's `line` are never
 * sliced into any section, so leading content appears to be dropped when the
 * resource has headings - confirm this is intended.
 * NOTE(review): the slice bounds look like they assume 1-based `heading.line`
 * values (the heading line itself is excluded) - confirm against the producer
 * of `resource.headings`.
 *
 * @param resource - Chunkable resource; `content` and `headings` are read here
 * @param config - Chunking configuration; `tokenCounter.count` is used for the statistics
 * @returns `{ chunks, stats }` where `stats` holds the chunk count and the
 *          average / max / min token counts (all `0` when no chunk was produced)
 */
export function chunkResource(resource, config) {
    const rawChunks = [];
    if (resource.headings.length === 0) {
        // No headings - chunk entire content by tokens
        for (const chunk of chunkByTokens(resource.content, config)) {
            rawChunks.push(chunk);
        }
    }
    else {
        // Extract content between headings
        const lines = resource.content.split('\n');
        for (let i = 0; i < resource.headings.length; i++) {
            const heading = resource.headings[i];
            if (!heading)
                continue;
            const nextHeading = resource.headings[i + 1];
            // Section spans from this heading to the next one (or end of file)
            const startLine = heading.line ?? 0;
            const endLine = nextHeading?.line ? nextHeading.line - 1 : lines.length;
            const sectionContent = lines
                .slice(startLine, endLine)
                .join('\n')
                .trim();
            if (sectionContent.length === 0) {
                continue;
            }
            const metadata = {
                headingPath: buildHeadingPath(resource.headings, i),
                headingLevel: heading.level,
                startLine,
                endLine,
            };
            // Large sections are split further by the token-based chunker
            for (const chunk of chunkByTokens(sectionContent, config, metadata)) {
                rawChunks.push(chunk);
            }
        }
    }
    // Calculate statistics in a single pass. Folding instead of spreading into
    // Math.max/Math.min avoids the argument-count limit on very large inputs
    // and lets the empty case report zeros instead of NaN / -Infinity / Infinity.
    let totalTokens = 0;
    let maxTokens = 0;
    let minTokens = 0;
    rawChunks.forEach((chunk, index) => {
        const tokens = config.tokenCounter.count(chunk.content);
        totalTokens += tokens;
        maxTokens = index === 0 ? tokens : Math.max(maxTokens, tokens);
        minTokens = index === 0 ? tokens : Math.min(minTokens, tokens);
    });
    const stats = {
        totalChunks: rawChunks.length,
        averageTokens: rawChunks.length === 0 ? 0 : totalTokens / rawChunks.length,
        maxTokens,
        minTokens,
    };
    return { chunks: rawChunks, stats };
}
|
|
68
|
+
/**
 * Build the hierarchical path of a heading from a flat, document-ordered
 * heading list.
 *
 * Walks backwards from the current heading and collects only true ancestors:
 * each collected heading must have a strictly smaller level than the most
 * recently collected one. Earlier siblings of a parent are therefore skipped.
 *
 * @param headings - All headings in the resource, each with `level` and `text`
 * @param currentIndex - Index of the heading whose path is requested
 * @returns Path joined with " > " (e.g., "Architecture > RAG Design > Chunking"),
 *          or an empty string when `currentIndex` does not address a heading
 */
function buildHeadingPath(headings, currentIndex) {
    const current = headings[currentIndex];
    if (!current)
        return '';
    const path = [current.text];
    // Level of the closest ancestor found so far; the next ancestor must be
    // strictly shallower than this one.
    let ancestorLevel = current.level;
    // Nothing can sit above a level-1 heading, so stop once it is reached.
    for (let i = currentIndex - 1; i >= 0 && ancestorLevel > 1; i--) {
        const heading = headings[i];
        if (!heading)
            continue;
        if (heading.level < ancestorLevel) {
            path.unshift(heading.text);
            ancestorLevel = heading.level;
        }
    }
    return path.join(' > ');
}
|
|
93
|
+
/**
 * Turn raw chunks into fully populated chunk records.
 *
 * Each record receives an index-based chunk ID, a content hash, the heading
 * metadata carried by the raw chunk, `filePath` and the `tags` / `type` /
 * `title` frontmatter entries of the resource, its embedding, and the IDs of
 * its neighbours for context expansion.
 *
 * `tokenCount` is always emitted as 0 here; the caller is expected to fill it
 * in. A missing entry in `embeddings` results in an empty embedding array.
 *
 * @param rawChunks - Raw chunks from chunkResource
 * @param resource - Source resource (`id`, `filePath`, `frontmatter` are read)
 * @param embeddings - Embedding vectors, positionally aligned with `rawChunks`
 * @param embeddingModel - Name of the model that produced the embeddings
 * @returns One enriched chunk per raw chunk, in the same order
 */
export function enrichChunks(rawChunks, resource, embeddings, embeddingModel) {
    const lastIndex = rawChunks.length - 1;
    const idFor = (position) => generateChunkId(resource.id, position);
    return rawChunks.map((raw, index) => {
        const chunkId = idFor(index);
        const contentHash = generateContentHash(raw.content);
        const vector = embeddings[index] ?? [];
        // First chunk has no predecessor, last chunk has no successor.
        const hasPrevious = index > 0;
        const hasNext = index < lastIndex;
        return {
            chunkId,
            resourceId: resource.id,
            content: raw.content,
            contentHash,
            tokenCount: 0, // Will be set by caller
            headingPath: raw.headingPath,
            headingLevel: raw.headingLevel,
            startLine: raw.startLine,
            endLine: raw.endLine,
            filePath: resource.filePath,
            tags: resource.frontmatter['tags'],
            type: resource.frontmatter['type'],
            title: resource.frontmatter['title'],
            embedding: vector,
            embeddingModel,
            embeddedAt: new Date(),
            previousChunkId: hasPrevious ? idFor(index - 1) : undefined,
            nextChunkId: hasNext ? idFor(index + 1) : undefined,
        };
    });
}
|
|
131
|
+
//# sourceMappingURL=chunk-resource.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunk-resource.js","sourceRoot":"","sources":["../../src/chunking/chunk-resource.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAMH,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AAErD,OAAO,EAAE,eAAe,EAAE,mBAAmB,EAAE,MAAM,YAAY,CAAC;AAelE;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,aAAa,CAC3B,QAA2B,EAC3B,MAAsB;IAEtB,MAAM,SAAS,GAAe,EAAE,CAAC;IAEjC,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACnC,+CAA+C;QAC/C,MAAM,MAAM,GAAG,aAAa,CAAC,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;QACvD,SAAS,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;IAC5B,CAAC;SAAM,CAAC;QACN,mCAAmC;QACnC,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAE3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAClD,MAAM,OAAO,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YACrC,IAAI,CAAC,OAAO;gBAAE,SAAS;YAEvB,MAAM,WAAW,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YAE7C,iEAAiE;YACjE,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;YACpC,MAAM,OAAO,GAAG,WAAW,EAAE,IAAI,CAAC,CAAC,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;YAExE,MAAM,cAAc,GAAG,KAAK;iBACzB,KAAK,CAAC,SAAS,EAAE,OAAO,CAAC;iBACzB,IAAI,CAAC,IAAI,CAAC;iBACV,IAAI,EAAE,CAAC;YAEV,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAChC,SAAS;YACX,CAAC;YAED,iCAAiC;YACjC,MAAM,WAAW,GAAG,gBAAgB,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;YAE3D,yCAAyC;YACzC,MAAM,QAAQ,GAAG;gBACf,WAAW;gBACX,YAAY,EAAE,OAAO,CAAC,KAAK;gBAC3B,SAAS;gBACT,OAAO;aACR,CAAC;YAEF,MAAM,aAAa,GAAG,aAAa,CAAC,cAAc,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;YACtE,SAAS,CAAC,IAAI,CAAC,GAAG,aAAa,CAAC,CAAC;QACnC,CAAC;IACH,CAAC;IAED,uBAAuB;IACvB,MAAM,WAAW,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;IAC/E,MAAM,KAAK,GAAG;QACZ,WAAW,EAAE,SAAS,CAAC,MAAM;QAC7B,aAAa,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,MAAM;QAC5E,SAAS,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,WAAW,CAAC;QACnC,SAAS,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,WAAW,CAAC;KACpC,CAAC;IAEF,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK
,EAAE,CAAC;AACtC,CAAC;AAED;;;;;;GAMG;AACH,SAAS,gBAAgB,CACvB,QAAgD,EAChD,YAAoB;IAEpB,MAAM,OAAO,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC;IACvC,IAAI,CAAC,OAAO;QAAE,OAAO,EAAE,CAAC;IAExB,MAAM,IAAI,GAAa,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAEtC,yCAAyC;IACzC,KAAK,IAAI,CAAC,GAAG,YAAY,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;QAC5B,IAAI,CAAC,OAAO;YAAE,SAAS;QAEvB,IAAI,OAAO,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;YAClC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC3B,IAAI,OAAO,CAAC,KAAK,KAAK,CAAC;gBAAE,MAAM,CAAC,oBAAoB;QACtD,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;AAC1B,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,YAAY,CAC1B,SAAqB,EACrB,QAA2B,EAC3B,UAAsB,EACtB,cAAsB;IAEtB,MAAM,cAAc,GAAe,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,EAAE;QAC9D,MAAM,OAAO,GAAG,eAAe,CAAC,QAAQ,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;QACpD,MAAM,WAAW,GAAG,mBAAmB,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAErD,OAAO;YACL,OAAO;YACP,UAAU,EAAE,QAAQ,CAAC,EAAE;YACvB,OAAO,EAAE,GAAG,CAAC,OAAO;YACpB,WAAW;YACX,UAAU,EAAE,CAAC,EAAE,wBAAwB;YACvC,WAAW,EAAE,GAAG,CAAC,WAAW;YAC5B,YAAY,EAAE,GAAG,CAAC,YAAY;YAC9B,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,OAAO,EAAE,GAAG,CAAC,OAAO;YACpB,QAAQ,EAAE,QAAQ,CAAC,QAAQ;YAC3B,IAAI,EAAE,QAAQ,CAAC,WAAW,CAAC,MAAM,CAAyB;YAC1D,IAAI,EAAE,QAAQ,CAAC,WAAW,CAAC,MAAM,CAAuB;YACxD,KAAK,EAAE,QAAQ,CAAC,WAAW,CAAC,OAAO,CAAuB;YAC1D,SAAS,EAAE,UAAU,CAAC,KAAK,CAAC,IAAI,EAAE;YAClC,cAAc;YACd,UAAU,EAAE,IAAI,IAAI,EAAE;YACtB,eAAe,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,QAAQ,CAAC,EAAE,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;YAChF,WAAW,EACT,KAAK,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,QAAQ,CAAC,EAAE,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,OAAO,cAAc,CAAC;AACxB,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking utilities
|
|
3
|
+
*
|
|
4
|
+
* Hybrid chunking strategy: heading-based + token-aware.
|
|
5
|
+
*/
|
|
6
|
+
export { chunkByTokens } from './chunk-by-tokens.js';
|
|
7
|
+
export { chunkResource, enrichChunks, type ChunkableResource } from './chunk-resource.js';
|
|
8
|
+
export type { ChunkingConfig, RawChunk, ChunkingResult } from './types.js';
|
|
9
|
+
export { calculateEffectiveTarget, generateChunkId, generateContentHash, splitByParagraphs, splitBySentences, } from './utils.js';
|
|
10
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/chunking/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,KAAK,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAC1F,YAAY,EAAE,cAAc,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAC3E,OAAO,EACL,wBAAwB,EACxB,eAAe,EACf,mBAAmB,EACnB,iBAAiB,EACjB,gBAAgB,GACjB,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking utilities
|
|
3
|
+
*
|
|
4
|
+
* Hybrid chunking strategy: heading-based + token-aware.
|
|
5
|
+
*/
|
|
6
|
+
export { chunkByTokens } from './chunk-by-tokens.js';
|
|
7
|
+
export { chunkResource, enrichChunks } from './chunk-resource.js';
|
|
8
|
+
export { calculateEffectiveTarget, generateChunkId, generateContentHash, splitByParagraphs, splitBySentences, } from './utils.js';
|
|
9
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/chunking/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,aAAa,EAAE,YAAY,EAA0B,MAAM,qBAAqB,CAAC;AAE1F,OAAO,EACL,wBAAwB,EACxB,eAAe,EACf,mBAAmB,EACnB,iBAAiB,EACjB,gBAAgB,GACjB,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking types and configuration
|
|
3
|
+
*/
|
|
4
|
+
import type { TokenCounter } from '../interfaces/token-counter.js';
|
|
5
|
+
/**
 * Settings that control how content is split into chunks.
 */
export interface ChunkingConfig {
    /** Ideal chunk size, measured in tokens. */
    targetChunkSize: number;
    /** Hard upper bound: the maximum number of tokens the model accepts. */
    modelTokenLimit: number;
    /** Safety margin (0.8-1.0) applied to compensate for token-estimation error. */
    paddingFactor: number;
    /** Counter used to measure the token length of text. */
    tokenCounter: TokenCounter;
    /** Optional lower bound in tokens, used to avoid producing tiny chunks. */
    minChunkSize?: number;
}
|
|
20
|
+
/**
 * A chunk as produced by the chunker, before resource metadata, IDs and
 * embeddings are attached.
 */
export interface RawChunk {
    /** Text of the chunk. */
    content: string;
    /** Heading hierarchy of the section the chunk came from, e.g. "Architecture > RAG Design". */
    headingPath?: string;
    /** Level of the heading that opens the section. */
    headingLevel?: number;
    /** First line of the source section. NOTE(review): line base (0 or 1) is not visible here - confirm. */
    startLine?: number;
    /** End line of the source section. NOTE(review): inclusive vs. exclusive is not visible here - confirm. */
    endLine?: number;
}
|
|
30
|
+
/**
 * Outcome of chunking a resource: the chunks plus summary statistics.
 */
export interface ChunkingResult {
    /** Chunks in document order. */
    chunks: RawChunk[];
    /** Token statistics computed over `chunks`. */
    stats: {
        /** Number of chunks produced. */
        totalChunks: number;
        /** Mean token count per chunk. */
        averageTokens: number;
        /** Token count of the largest chunk. */
        maxTokens: number;
        /** Token count of the smallest chunk. */
        minTokens: number;
    };
}
|
|
42
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/chunking/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,gCAAgC,CAAC;AAEnE;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,0CAA0C;IAC1C,eAAe,EAAE,MAAM,CAAC;IAExB,+CAA+C;IAC/C,eAAe,EAAE,MAAM,CAAC;IAExB,oEAAoE;IACpE,aAAa,EAAE,MAAM,CAAC;IAEtB,2BAA2B;IAC3B,YAAY,EAAE,YAAY,CAAC;IAE3B,uDAAuD;IACvD,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,QAAQ,EAAE,CAAC;IACnB,KAAK,EAAE;QACL,WAAW,EAAE,MAAM,CAAC;QACpB,aAAa,EAAE,MAAM,CAAC;QACtB,SAAS,EAAE,MAAM,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;CACH"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/chunking/types.ts"],"names":[],"mappings":"AAAA;;GAEG"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking utilities
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Hash chunk content so that changes can be detected later.
 *
 * @param content - Text to hash
 * @returns Hex-encoded SHA-256 digest of `content`
 */
export declare function generateContentHash(content: string): string;
|
|
11
|
+
/**
 * Derive the ID of a chunk from its resource and position.
 *
 * @param resourceId - ID of the resource the chunk belongs to
 * @param chunkIndex - Zero-based position of the chunk within the resource
 * @returns ID of the form `<resourceId>-chunk-<chunkIndex>`
 */
export declare function generateChunkId(resourceId: string, chunkIndex: number): string;
|
|
19
|
+
/**
 * Apply the padding factor to a target chunk size.
 *
 * @param targetSize - Target chunk size in tokens
 * @param paddingFactor - Safety margin (0.8-1.0)
 * @returns `targetSize * paddingFactor`, rounded down to an integer
 */
export declare function calculateEffectiveTarget(targetSize: number, paddingFactor: number): number;
|
|
27
|
+
/**
 * Split text into paragraphs at blank-line boundaries.
 *
 * @param text - Text to split
 * @returns Trimmed, non-empty paragraphs in their original order
 */
export declare function splitByParagraphs(text: string): string[];
|
|
34
|
+
/**
 * Split text into sentences at `.`, `!` and `?`.
 *
 * @param text - Text to split
 * @returns Trimmed, non-empty sentences in their original order
 */
export declare function splitBySentences(text: string): string[];
|
|
41
|
+
//# sourceMappingURL=utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../../src/chunking/utils.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAE3D;AAED;;;;;;GAMG;AACH,wBAAgB,eAAe,CAAC,UAAU,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,MAAM,CAE9E;AAED;;;;;;GAMG;AACH,wBAAgB,wBAAwB,CACtC,UAAU,EAAE,MAAM,EAClB,aAAa,EAAE,MAAM,GACpB,MAAM,CAER;AAED;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAMxD;AAED;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAQvD"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking utilities
|
|
3
|
+
*/
|
|
4
|
+
import { createHash } from 'node:crypto';
|
|
5
|
+
/**
 * Compute a fingerprint of chunk content for change detection.
 *
 * @param content - Content to hash
 * @returns Hex-encoded SHA-256 digest of the content
 */
export function generateContentHash(content) {
    const hasher = createHash('sha256');
    hasher.update(content);
    return hasher.digest('hex');
}
|
|
14
|
+
/**
 * Build the ID of a chunk from its owning resource and its position.
 *
 * @param resourceId - Source resource ID
 * @param chunkIndex - Index of chunk in resource (0-based)
 * @returns ID of the form `<resourceId>-chunk-<chunkIndex>`
 */
export function generateChunkId(resourceId, chunkIndex) {
    return ''.concat(resourceId, '-chunk-', chunkIndex);
}
|
|
24
|
+
/**
 * Shrink the target chunk size by the padding factor.
 *
 * @param targetSize - Target chunk size
 * @param paddingFactor - Padding factor (0.8-1.0)
 * @returns The product of both arguments, rounded down to an integer
 */
export function calculateEffectiveTarget(targetSize, paddingFactor) {
    const padded = targetSize * paddingFactor;
    return Math.floor(padded);
}
|
|
34
|
+
/**
 * Split text into paragraphs.
 *
 * A paragraph boundary is one or more blank lines. Both LF and CRLF line
 * endings are recognised, and lines containing only spaces or tabs count as
 * blank. Single line breaks inside a paragraph are preserved.
 *
 * @param text - Text to split
 * @returns Trimmed, non-empty paragraphs in their original order
 */
export function splitByParagraphs(text) {
    // A line break followed by at least one (possibly whitespace-only) empty line.
    const paragraphBoundary = /\r?\n(?:[ \t]*\r?\n)+/;
    return text
        .split(paragraphBoundary)
        .map((p) => p.trim())
        .filter((p) => p.length > 0);
}
|
|
47
|
+
/**
 * Split text into sentences with a simple punctuation rule.
 *
 * Every run of `.`, `!` or `?` acts as a separator. The separators themselves
 * are not part of the returned sentences, and any `.` splits - including the
 * one inside a decimal number such as "3.14".
 *
 * @param text - Text to split
 * @returns Trimmed, non-empty fragments found between the separators
 */
export function splitBySentences(text) {
    const sentences = [];
    for (const fragment of text.split(/[.!?]+/)) {
        const sentence = fragment.trim();
        if (sentence.length > 0) {
            sentences.push(sentence);
        }
    }
    return sentences;
}
|
|
62
|
+
//# sourceMappingURL=utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.js","sourceRoot":"","sources":["../../src/chunking/utils.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEzC;;;;;GAKG;AACH,MAAM,UAAU,mBAAmB,CAAC,OAAe;IACjD,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC5D,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,eAAe,CAAC,UAAkB,EAAE,UAAkB;IACpE,OAAO,GAAG,UAAU,UAAU,UAAU,EAAE,CAAC;AAC7C,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,wBAAwB,CACtC,UAAkB,EAClB,aAAqB;IAErB,OAAO,IAAI,CAAC,KAAK,CAAC,UAAU,GAAG,aAAa,CAAC,CAAC;AAChD,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAY;IAC5C,kDAAkD;IAClD,OAAO,IAAI;SACR,KAAK,CAAC,OAAO,CAAC;SACd,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACjC,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,4CAA4C;IAC5C,sDAAsD;IACtD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IACvC,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,EAAE,CAAC;IACZ,CAAC;IACD,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AAClE,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding Provider Implementations
|
|
3
|
+
*
|
|
4
|
+
* Pluggable embedding providers for RAG.
|
|
5
|
+
*/
|
|
6
|
+
export { TransformersEmbeddingProvider, type TransformersEmbeddingConfig, } from './transformers-embedding-provider.js';
|
|
7
|
+
export { OpenAIEmbeddingProvider, type OpenAIEmbeddingConfig, } from './openai-embedding-provider.js';
|
|
8
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/embedding-providers/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EACL,6BAA6B,EAC7B,KAAK,2BAA2B,GACjC,MAAM,sCAAsC,CAAC;AAE9C,OAAO,EACL,uBAAuB,EACvB,KAAK,qBAAqB,GAC3B,MAAM,gCAAgC,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding Provider Implementations
|
|
3
|
+
*
|
|
4
|
+
* Pluggable embedding providers for RAG.
|
|
5
|
+
*/
|
|
6
|
+
export { TransformersEmbeddingProvider, } from './transformers-embedding-provider.js';
|
|
7
|
+
export { OpenAIEmbeddingProvider, } from './openai-embedding-provider.js';
|
|
8
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/embedding-providers/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EACL,6BAA6B,GAE9B,MAAM,sCAAsC,CAAC;AAE9C,OAAO,EACL,uBAAuB,GAExB,MAAM,gCAAgC,CAAC"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI Embedding Provider
|
|
3
|
+
*
|
|
4
|
+
* Uses OpenAI API for high-quality embedding generation.
|
|
5
|
+
* Requires API key and internet connection.
|
|
6
|
+
*/
|
|
7
|
+
import type { EmbeddingProvider } from '../interfaces/embedding.js';
|
|
8
|
+
/**
 * Options accepted by OpenAIEmbeddingProvider.
 */
export interface OpenAIEmbeddingConfig {
    /** API key used to authenticate against the OpenAI API. */
    apiKey: string;
    /**
     * Embedding model to use (default: text-embedding-3-small).
     * Supports text-embedding-3-small, text-embedding-3-large, text-embedding-ada-002, etc.
     */
    model?: string;
    /** Custom output dimensions (only for text-embedding-3-* models). */
    dimensions?: number;
}
|
|
19
|
+
/**
 * Embedding provider backed by the OpenAI API.
 *
 * Default model: text-embedding-3-small (1536 dimensions).
 *
 * Benefits:
 * - State-of-art quality
 * - Well-tested, production-ready
 * - Higher dimensions for better accuracy
 *
 * Considerations:
 * - Requires API key
 * - API cost per token
 * - Network latency
 * - Rate limits
 */
export declare class OpenAIEmbeddingProvider implements EmbeddingProvider {
    /** Provider identifier. */
    readonly name = "openai";
    /** Name of the embedding model in use. */
    readonly model: string;
    /** Length of the vectors this provider is configured to produce. */
    readonly dimensions: number;
    /** OpenAI SDK client instance. */
    private readonly client;
    /**
     * Create the provider.
     *
     * @param config - API key plus optional model and dimensions
     * @throws Error when the optional `openai` package cannot be loaded
     */
    constructor(config: OpenAIEmbeddingConfig);
    /**
     * Embed one piece of text.
     *
     * @param text - Text to embed
     * @returns The embedding vector
     * @throws Error when the API response contains no embedding
     */
    embed(text: string): Promise<number[]>;
    /**
     * Embed several texts with a single embeddings request.
     *
     * @param texts - Texts to embed; an empty array yields an empty result
     * @returns One embedding vector per returned item, in response order
     */
    embedBatch(texts: string[]): Promise<number[][]>;
}
|
|
64
|
+
//# sourceMappingURL=openai-embedding-provider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"openai-embedding-provider.d.ts","sourceRoot":"","sources":["../../src/embedding-providers/openai-embedding-provider.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAEpE;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,qBAAqB;IACrB,MAAM,EAAE,MAAM,CAAC;IACf,0IAA0I;IAC1I,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,6DAA6D;IAC7D,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAWD;;;;;;;;;;;;;;;;GAgBG;AACH,qBAAa,uBAAwB,YAAW,iBAAiB;IAC/D,QAAQ,CAAC,IAAI,YAAY;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAE5B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAUrB;IAEF;;;;OAIG;gBACS,MAAM,EAAE,qBAAqB;IAezC;;;;;OAKG;IACG,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAc5C;;;;;;;OAOG;IACG,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CAavD"}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI Embedding Provider
|
|
3
|
+
*
|
|
4
|
+
* Uses OpenAI API for high-quality embedding generation.
|
|
5
|
+
* Requires API key and internet connection.
|
|
6
|
+
*/
|
|
7
|
+
/**
|
|
8
|
+
* Model dimensions map
|
|
9
|
+
*/
|
|
10
|
+
const MODEL_DIMENSIONS = {
|
|
11
|
+
'text-embedding-3-small': 1536,
|
|
12
|
+
'text-embedding-3-large': 3072,
|
|
13
|
+
'text-embedding-ada-002': 1536,
|
|
14
|
+
};
|
|
15
|
+
/**
|
|
16
|
+
* OpenAIEmbeddingProvider
|
|
17
|
+
*
|
|
18
|
+
* Cloud-based embedding generation using OpenAI API.
|
|
19
|
+
* Default model: text-embedding-3-small (1536 dimensions)
|
|
20
|
+
*
|
|
21
|
+
* Benefits:
|
|
22
|
+
* - State-of-art quality
|
|
23
|
+
* - Well-tested, production-ready
|
|
24
|
+
* - Higher dimensions for better accuracy
|
|
25
|
+
*
|
|
26
|
+
* Considerations:
|
|
27
|
+
* - Requires API key
|
|
28
|
+
* - API cost per token
|
|
29
|
+
* - Network latency
|
|
30
|
+
* - Rate limits
|
|
31
|
+
*/
|
|
32
|
+
export class OpenAIEmbeddingProvider {
|
|
33
|
+
name = 'openai';
|
|
34
|
+
model;
|
|
35
|
+
dimensions;
|
|
36
|
+
client;
|
|
37
|
+
/**
|
|
38
|
+
* Create OpenAIEmbeddingProvider
|
|
39
|
+
*
|
|
40
|
+
* @param config - Configuration with API key
|
|
41
|
+
*/
|
|
42
|
+
constructor(config) {
|
|
43
|
+
this.model = config.model ?? 'text-embedding-3-small';
|
|
44
|
+
this.dimensions = config.dimensions ?? MODEL_DIMENSIONS[this.model] ?? 1536;
|
|
45
|
+
// Lazy load OpenAI SDK (optional dependency)
|
|
46
|
+
try {
|
|
47
|
+
const { OpenAI } = require('openai');
|
|
48
|
+
this.client = new OpenAI({ apiKey: config.apiKey });
|
|
49
|
+
}
|
|
50
|
+
catch {
|
|
51
|
+
throw new Error('OpenAI SDK not installed. Install with: bun add openai');
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Embed a single text chunk
|
|
56
|
+
*
|
|
57
|
+
* @param text - Text to embed
|
|
58
|
+
* @returns Vector embedding
|
|
59
|
+
*/
|
|
60
|
+
async embed(text) {
|
|
61
|
+
const response = await this.client.embeddings.create({
|
|
62
|
+
model: this.model,
|
|
63
|
+
input: text,
|
|
64
|
+
dimensions: this.dimensions,
|
|
65
|
+
});
|
|
66
|
+
const firstItem = response.data[0];
|
|
67
|
+
if (!firstItem) {
|
|
68
|
+
throw new Error('OpenAI API returned no embeddings');
|
|
69
|
+
}
|
|
70
|
+
return firstItem.embedding;
|
|
71
|
+
}
|
|
72
|
+
/**
|
|
73
|
+
* Embed multiple text chunks efficiently
|
|
74
|
+
*
|
|
75
|
+
* Uses OpenAI's batch API for better performance.
|
|
76
|
+
*
|
|
77
|
+
* @param texts - Array of texts to embed
|
|
78
|
+
* @returns Array of vector embeddings
|
|
79
|
+
*/
|
|
80
|
+
async embedBatch(texts) {
|
|
81
|
+
if (texts.length === 0) {
|
|
82
|
+
return [];
|
|
83
|
+
}
|
|
84
|
+
const response = await this.client.embeddings.create({
|
|
85
|
+
model: this.model,
|
|
86
|
+
input: texts,
|
|
87
|
+
dimensions: this.dimensions,
|
|
88
|
+
});
|
|
89
|
+
return response.data.map((item) => item.embedding);
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
//# sourceMappingURL=openai-embedding-provider.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"openai-embedding-provider.js","sourceRoot":"","sources":["../../src/embedding-providers/openai-embedding-provider.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAgBH;;GAEG;AACH,MAAM,gBAAgB,GAA2B;IAC/C,wBAAwB,EAAE,IAAI;IAC9B,wBAAwB,EAAE,IAAI;IAC9B,wBAAwB,EAAE,IAAI;CAC/B,CAAC;AAEF;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,OAAO,uBAAuB;IACzB,IAAI,GAAG,QAAQ,CAAC;IAChB,KAAK,CAAS;IACd,UAAU,CAAS;IAEX,MAAM,CAUrB;IAEF;;;;OAIG;IACH,YAAY,MAA6B;QACvC,IAAI,CAAC,KAAK,GAAG,MAAM,CAAC,KAAK,IAAI,wBAAwB,CAAC;QACtD,IAAI,CAAC,UAAU,GAAG,MAAM,CAAC,UAAU,IAAI,gBAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC;QAE5E,6CAA6C;QAC7C,IAAI,CAAC;YACH,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;YACrC,IAAI,CAAC,MAAM,GAAG,IAAI,MAAM,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;QACtD,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CACb,wDAAwD,CACzD,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,KAAK,CAAC,IAAY;QACtB,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;YACnD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,IAAI;YACX,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC,CAAC;QAEH,MAAM,SAAS,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACnC,IAAI,CAAC,SAAS,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACvD,CAAC;QACD,OAAO,SAAS,CAAC,SAAS,CAAC;IAC7B,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,UAAU,CAAC,KAAe;QAC9B,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;YACnD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,KAAK;YACZ,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC,CAAC;QAEH,OAAO,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACrD,CAAC;CACF"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Transformers Embedding Provider
|
|
3
|
+
*
|
|
4
|
+
* Uses @xenova/transformers for local embedding generation.
|
|
5
|
+
* No API key required, runs entirely in Node.js.
|
|
6
|
+
*/
|
|
7
|
+
import type { EmbeddingProvider } from '../interfaces/embedding.js';
|
|
8
|
+
/**
|
|
9
|
+
* Configuration for TransformersEmbeddingProvider
|
|
10
|
+
*/
|
|
11
|
+
export interface TransformersEmbeddingConfig {
|
|
12
|
+
/** Model name; optional (default: Xenova/all-MiniLM-L6-v2) */
|
|
13
|
+
model?: string;
|
|
14
|
+
/** Embedding dimensions; optional (default: 384, the size documented for the default all-MiniLM-L6-v2 model) */
|
|
15
|
+
dimensions?: number;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* TransformersEmbeddingProvider
|
|
19
|
+
*
|
|
20
|
+
* Local embedding generation using transformers.js.
|
|
21
|
+
* Default model: all-MiniLM-L6-v2 (384 dimensions)
|
|
22
|
+
*
|
|
23
|
+
* Benefits:
|
|
24
|
+
* - No API key required
|
|
25
|
+
* - Runs locally in Node.js
|
|
26
|
+
* - Fast inference
|
|
27
|
+
* - Good quality embeddings
|
|
28
|
+
* - No network latency
|
|
29
|
+
*
|
|
30
|
+
* Note: First run downloads model (~20MB for all-MiniLM-L6-v2)
|
|
31
|
+
*/
|
|
32
|
+
export declare class TransformersEmbeddingProvider implements EmbeddingProvider {
|
|
33
|
+
readonly name = "transformers-js";
|
|
34
|
+
readonly model: string;
|
|
35
|
+
readonly dimensions: number;
|
|
36
|
+
private pipelinePromise;
|
|
37
|
+
/**
|
|
38
|
+
* Create a TransformersEmbeddingProvider.
|
|
39
|
+
*
|
|
40
|
+
* @param config - Optional configuration; the argument may be omitted entirely, and each of its fields is optional as well
|
|
41
|
+
*/
|
|
42
|
+
constructor(config?: TransformersEmbeddingConfig);
|
|
43
|
+
/**
|
|
44
|
+
* Get or initialize the embedding pipeline (private helper; NOTE(review): presumably backed by `pipelinePromise` - confirm against the implementation)
|
|
45
|
+
*/
|
|
46
|
+
private getPipeline;
|
|
47
|
+
/**
|
|
48
|
+
* Embed a single text chunk.
|
|
49
|
+
*
|
|
50
|
+
* @param text - Text to embed
|
|
51
|
+
* @returns Promise resolving to the vector embedding (normalized)
|
|
52
|
+
*/
|
|
53
|
+
embed(text: string): Promise<number[]>;
|
|
54
|
+
/**
|
|
55
|
+
* Embed multiple text chunks efficiently.
|
|
56
|
+
*
|
|
57
|
+
* @param texts - Array of texts to embed
|
|
58
|
+
* @returns Promise resolving to an array of vector embeddings (presumably one per input text, in input order - TODO confirm against the implementation)
|
|
59
|
+
*/
|
|
60
|
+
embedBatch(texts: string[]): Promise<number[][]>;
|
|
61
|
+
}
|
|
62
|
+
//# sourceMappingURL=transformers-embedding-provider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transformers-embedding-provider.d.ts","sourceRoot":"","sources":["../../src/embedding-providers/transformers-embedding-provider.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAEpE;;GAEG;AACH,MAAM,WAAW,2BAA2B;IAC1C,oDAAoD;IACpD,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,0CAA0C;IAC1C,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;;;;;;;;GAcG;AACH,qBAAa,6BAA8B,YAAW,iBAAiB;IACrE,QAAQ,CAAC,IAAI,qBAAqB;IAClC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAE5B,OAAO,CAAC,eAAe,CAAiC;IAExD;;;;OAIG;gBACS,MAAM,GAAE,2BAAgC;IAKpD;;OAEG;YACW,WAAW;IAOzB;;;;;OAKG;IACG,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAa5C;;;;;OAKG;IACG,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CASvD"}
|