voctar 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +102 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/src/chunking/index.d.ts +48 -0
- package/dist/src/chunking/index.d.ts.map +1 -0
- package/dist/src/chunking/index.js +123 -0
- package/dist/src/chunking/index.js.map +1 -0
- package/dist/src/chunking/strategies/fixed.d.ts +14 -0
- package/dist/src/chunking/strategies/fixed.d.ts.map +1 -0
- package/dist/src/chunking/strategies/fixed.js +111 -0
- package/dist/src/chunking/strategies/fixed.js.map +1 -0
- package/dist/src/chunking/strategies/paragraph.d.ts +6 -0
- package/dist/src/chunking/strategies/paragraph.d.ts.map +1 -0
- package/dist/src/chunking/strategies/paragraph.js +84 -0
- package/dist/src/chunking/strategies/paragraph.js.map +1 -0
- package/dist/src/chunking/strategies/recursive.d.ts +17 -0
- package/dist/src/chunking/strategies/recursive.d.ts.map +1 -0
- package/dist/src/chunking/strategies/recursive.js +192 -0
- package/dist/src/chunking/strategies/recursive.js.map +1 -0
- package/dist/src/chunking/strategies/semantic.d.ts +96 -0
- package/dist/src/chunking/strategies/semantic.d.ts.map +1 -0
- package/dist/src/chunking/strategies/semantic.js +587 -0
- package/dist/src/chunking/strategies/semantic.js.map +1 -0
- package/dist/src/chunking/strategies/sentence.d.ts +7 -0
- package/dist/src/chunking/strategies/sentence.d.ts.map +1 -0
- package/dist/src/chunking/strategies/sentence.js +116 -0
- package/dist/src/chunking/strategies/sentence.js.map +1 -0
- package/dist/src/chunking/types.d.ts +45 -0
- package/dist/src/chunking/types.d.ts.map +1 -0
- package/dist/src/chunking/types.js +4 -0
- package/dist/src/chunking/types.js.map +1 -0
- package/dist/src/chunking/utils/tokenizer.d.ts +10 -0
- package/dist/src/chunking/utils/tokenizer.d.ts.map +1 -0
- package/dist/src/chunking/utils/tokenizer.js +50 -0
- package/dist/src/chunking/utils/tokenizer.js.map +1 -0
- package/dist/src/providers/embeddings/index.d.ts +3 -0
- package/dist/src/providers/embeddings/index.d.ts.map +1 -0
- package/dist/src/providers/embeddings/index.js +7 -0
- package/dist/src/providers/embeddings/index.js.map +1 -0
- package/dist/src/providers/embeddings/openai.d.ts +21 -0
- package/dist/src/providers/embeddings/openai.d.ts.map +1 -0
- package/dist/src/providers/embeddings/openai.js +86 -0
- package/dist/src/providers/embeddings/openai.js.map +1 -0
- package/dist/src/providers/index.d.ts +3 -0
- package/dist/src/providers/index.d.ts.map +1 -0
- package/dist/src/providers/index.js +20 -0
- package/dist/src/providers/index.js.map +1 -0
- package/dist/src/providers/stores/index.d.ts +6 -0
- package/dist/src/providers/stores/index.d.ts.map +1 -0
- package/dist/src/providers/stores/index.js +11 -0
- package/dist/src/providers/stores/index.js.map +1 -0
- package/dist/src/providers/stores/memory.d.ts +18 -0
- package/dist/src/providers/stores/memory.d.ts.map +1 -0
- package/dist/src/providers/stores/memory.js +169 -0
- package/dist/src/providers/stores/memory.js.map +1 -0
- package/dist/src/providers/stores/qdrant.d.ts +28 -0
- package/dist/src/providers/stores/qdrant.d.ts.map +1 -0
- package/dist/src/providers/stores/qdrant.js +223 -0
- package/dist/src/providers/stores/qdrant.js.map +1 -0
- package/dist/src/providers/stores/sqlite.d.ts +38 -0
- package/dist/src/providers/stores/sqlite.d.ts.map +1 -0
- package/dist/src/providers/stores/sqlite.js +306 -0
- package/dist/src/providers/stores/sqlite.js.map +1 -0
- package/dist/src/types.d.ts +111 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +32 -0
- package/dist/src/types.js.map +1 -0
- package/dist/src/vector.d.ts +74 -0
- package/dist/src/vector.d.ts.map +1 -0
- package/dist/src/vector.js +505 -0
- package/dist/src/vector.js.map +1 -0
- package/docs/API.md +361 -0
- package/docs/CHUNKING.md +280 -0
- package/docs/CUSTOM_PROVIDERS.md +101 -0
- package/docs/README.md +11 -0
- package/docs/STORAGE_BACKENDS.md +189 -0
- package/docs/assets/vectar.png +0 -0
- package/package.json +46 -0
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.SentenceChunkingStrategy = void 0;
|
|
4
|
+
// Sentence-based chunking strategy
|
|
5
|
+
const uuid_1 = require("uuid");
|
|
6
|
+
const tokenizer_1 = require("../utils/tokenizer");
|
|
7
|
+
/**
 * Chunking strategy that packs whole sentences into chunks bounded by a token budget.
 *
 * Uses the module-level `uuid` import for chunk ids and `countTokens` from the
 * tokenizer utility for sizing.
 */
class SentenceChunkingStrategy {
    /**
     * @returns {string} The strategy identifier, `'sentence'`.
     */
    getName() {
        return 'sentence';
    }
    /**
     * Split `text` into sentence-aligned chunks.
     *
     * Sentences are accumulated until adding the next one would exceed the token
     * budget; the accumulated sentences are then emitted as a chunk and the last
     * `overlap` sentences are carried over into the next chunk. A single sentence
     * larger than the budget is still emitted (on its own or after the overlap).
     *
     * `startChar`/`endChar` are derived from the lengths of the re-joined chunk
     * texts, so they describe positions in the normalized, space-joined sentence
     * stream rather than exact offsets into `text`.
     *
     * @param {string} text - Text to chunk.
     * @param {string} documentId - Stored in every chunk's metadata.
     * @param {object} [options] - Chunking options.
     * @param {number} [options.tokenLimit=8192] - Hard cap applied to `maxChunkSize`.
     * @param {number} [options.maxChunkSize=1000] - Token budget per chunk.
     * @param {number} [options.overlap=1] - Number of trailing sentences repeated
     *   at the start of the next chunk; values below 1 mean no overlap.
     * @param {object} [options.metadata] - Extra fields spread into each chunk's
     *   metadata (they override the computed fields of the same name, except
     *   `totalChunks`, which is always rewritten at the end).
     * @returns {Array<object>} Chunks of shape `{ id, text, metadata }`.
     */
    chunk(text, documentId, options = {}) {
        const tokenLimit = options.tokenLimit ?? 8192;
        const maxSize = Math.min(options.maxChunkSize ?? 1000, tokenLimit);
        // Clamp to a non-negative integer. Without this, `slice(-0)` would return the
        // WHOLE array and "no overlap" would silently become "overlap everything".
        const overlap = Math.max(0, Math.floor(options.overlap ?? 1));
        const sentences = this.splitIntoSentences(text);
        const chunks = [];
        // Builds one chunk from a group of sentences and returns its end offset.
        const emitChunk = (group, chunkIndex, startChar) => {
            const chunkText = group.join(' ').trim();
            const endChar = startChar + chunkText.length;
            chunks.push({
                id: (0, uuid_1.v4)(),
                text: chunkText,
                metadata: {
                    documentId,
                    chunkIndex,
                    totalChunks: 0, // Filled in once the final count is known
                    startChar,
                    endChar,
                    sentences: group.length,
                    ...options.metadata,
                },
            });
            return endChar;
        };
        let currentChunk = [];
        let currentTokens = 0;
        let chunkIndex = 0;
        let startChar = 0;
        for (const sentence of sentences) {
            const sentenceTokens = (0, tokenizer_1.countTokens)(sentence);
            const wouldOverflow = currentTokens + sentenceTokens > maxSize;
            if (!wouldOverflow || currentChunk.length === 0) {
                currentChunk.push(sentence);
                currentTokens += sentenceTokens;
                continue;
            }
            const endChar = emitChunk(currentChunk, chunkIndex, startChar);
            const overlapSentences = overlap > 0 ? currentChunk.slice(-overlap) : [];
            const overlapText = overlapSentences.join(' ');
            currentChunk = [...overlapSentences, sentence];
            currentTokens = (0, tokenizer_1.countTokens)(overlapText) + sentenceTokens;
            // The next chunk begins where the carried-over sentences began.
            startChar = endChar - overlapText.length;
            chunkIndex++;
        }
        // Whatever is left becomes the final chunk.
        if (currentChunk.length > 0) {
            emitChunk(currentChunk, chunkIndex, startChar);
        }
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Split text into trimmed sentences on runs of `.`, `!` or `?` followed by whitespace.
     *
     * A small set of abbreviations (Mr., Mrs., Dr., Ms., vs., etc., e.g., i.e.) has
     * its periods removed first so they do not trigger a split; note that the
     * returned sentences therefore contain the abbreviation WITHOUT its periods.
     *
     * @param {string} text - Text to split.
     * @returns {string[]} Non-empty sentences, each keeping its closing punctuation.
     */
    splitIntoSentences(text) {
        const sentences = [];
        const normalized = text
            .replace(/Mr\./g, 'Mr')
            .replace(/Mrs\./g, 'Mrs')
            .replace(/Dr\./g, 'Dr')
            .replace(/Ms\./g, 'Ms')
            .replace(/vs\./g, 'vs')
            .replace(/etc\./g, 'etc')
            .replace(/e\.g\./g, 'eg')
            .replace(/i\.e\./g, 'ie');
        // The capture group keeps the delimiters in the result so the punctuation
        // can be re-attached to the sentence it terminates.
        const parts = normalized.split(/([.!?]+[\s\n]+)/);
        let currentSentence = '';
        for (const part of parts) {
            if (!/[.!?]+[\s\n]+/.test(part)) {
                currentSentence += part;
                continue;
            }
            currentSentence += part.trim();
            const finished = currentSentence.trim();
            if (finished) {
                sentences.push(finished);
            }
            currentSentence = '';
        }
        // Trailing text without closing punctuation + whitespace.
        const remainder = currentSentence.trim();
        if (remainder) {
            sentences.push(remainder);
        }
        return sentences.filter(s => s.length > 0);
    }
}
|
|
115
|
+
exports.SentenceChunkingStrategy = SentenceChunkingStrategy;
|
|
116
|
+
//# sourceMappingURL=sentence.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sentence.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/sentence.ts"],"names":[],"mappings":";;;AAAA,mCAAmC;AACnC,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,wBAAwB;IACnC,OAAO;QACL,OAAO,UAAU,CAAC;IACpB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,iCAAiC;QAEvE,4BAA4B;QAC5B,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAEhD,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;YAC9B,MAAM,cAAc,GAAG,IAAA,uBAAW,EAAC,QAAQ,CAAC,CAAC;YAE7C,+FAA+F;YAC/F,IAAI,aAAa,GAAG,cAAc,GAAG,OAAO,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxE,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;gBAChD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;gBAE7C,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAA,SAAM,GAAE;oBACZ,IAAI,EAAE,SAAS;oBACf,QAAQ,EAAE;wBACR,UAAU;wBACV,UAAU;wBACV,WAAW,EAAE,CAAC,EAAE,wBAAwB;wBACxC,SAAS;wBACT,OAAO;wBACP,SAAS,EAAE,YAAY,CAAC,MAAM;wBAC9B,GAAG,OAAO,CAAC,QAAQ;qBACpB;iBACF,CAAC,CAAC;gBAEH,oCAAoC;gBACpC,MAAM,gBAAgB,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC;gBACtD,YAAY,GAAG,CAAC,GAAG,gBAAgB,EAAE,QAAQ,CAAC,CAAC;gBAC/C,aAAa,GAAG,IAAA,uBAAW,EAAC,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,cAAc,CAAC;gBACzE,SAAS,GAAG,OAAO,GAAG,CAAC,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC;gBAC1D,UAAU,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;gBAC5B,aAAa,IAAI,cAAc,CAAC;YAClC,CAAC;QACH,CAAC;QAED,uCAAuC;QACvC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAChD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI
,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU;oBACV,WAAW,EAAE,CAAC;oBACd,SAAS;oBACT,OAAO;oBACP,SAAS,EAAE,YAAY,CAAC,MAAM;oBAC9B,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;QACL,CAAC;QAED,qBAAqB;QACrB,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACrB,KAAK,CAAC,QAAQ,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,kBAAkB,CAAC,IAAY;QACrC,gEAAgE;QAChE,+BAA+B;QAC/B,MAAM,SAAS,GAAa,EAAE,CAAC;QAE/B,qDAAqD;QACrD,IAAI,UAAU,GAAG,IAAI;aAClB,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC;aACtB,OAAO,CAAC,QAAQ,EAAE,KAAK,CAAC;aACxB,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC;aACtB,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC;aACtB,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC;aACtB,OAAO,CAAC,QAAQ,EAAE,KAAK,CAAC;aACxB,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC;aACxB,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;QAE5B,+BAA+B;QAC/B,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;QAElD,IAAI,eAAe,GAAG,EAAE,CAAC;QAEzB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/B,eAAe,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;gBAC/B,IAAI,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC;oBAC3B,SAAS,CAAC,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,CAAC,CAAC;gBACzC,CAAC;gBACD,eAAe,GAAG,EAAE,CAAC;YACvB,CAAC;iBAAM,CAAC;gBACN,eAAe,IAAI,IAAI,CAAC;YAC1B,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,IAAI,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC;YAC3B,SAAS,CAAC,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,CAAC,CAAC;QACzC,CAAC;QAED,OAAO,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC7C,CAAC;CACF;AA3HD,4DA2HC"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
 * One piece of a chunked document.
 */
export interface Chunk {
    /** Identifier of the chunk (the sentence strategy assigns a UUID v4). */
    id: string;
    /** Text content of the chunk. */
    text: string;
    /** Positional and caller-supplied information about the chunk. */
    metadata: ChunkMetadata;
}
|
|
6
|
+
/**
 * Metadata attached to every chunk. Strategies may add further keys
 * (for example, the sentence strategy adds a `sentences` count).
 */
export interface ChunkMetadata {
    /** Identifier of the document the chunk was cut from. */
    documentId: string;
    /** Zero-based position of the chunk within its document. */
    chunkIndex: number;
    /** Number of chunks produced for the document. */
    totalChunks: number;
    /** Start offset of the chunk; NOTE(review): the sentence strategy derives it from joined sentence lengths, so it may not be an exact offset into the input. */
    startChar: number;
    /** End offset of the chunk (same caveat as `startChar`). */
    endChar: number;
    /** Token count of the chunk, when the producing strategy records it. */
    tokens?: number;
    /** Any additional strategy- or caller-defined field. */
    [key: string]: any;
}
|
|
15
|
+
/**
 * Options accepted by the chunking strategies.
 *
 * Only the fields read by the sentence strategy are described in detail here.
 * NOTE(review): the remaining fields are presumably consumed by the other
 * strategies (fixed / recursive / semantic / paragraph) — confirm against
 * their implementations before documenting them further.
 */
export interface ChunkingOptions {
    /** Name of the strategy to use. */
    strategy?: 'fixed' | 'recursive' | 'semantic' | 'sentence' | 'paragraph';
    /** Maximum chunk size; the sentence strategy measures it in tokens, defaults it to 1000 and caps it at `tokenLimit`. */
    maxChunkSize?: number;
    /** Overlap between consecutive chunks; the sentence strategy counts it in sentences and defaults it to 1. */
    overlap?: number;
    preserveFormatting?: boolean;
    /** Extra fields copied into every chunk's metadata. */
    metadata?: Record<string, any>;
    separator?: string | Array<string>;
    /** Upper bound applied to `maxChunkSize`; the sentence strategy defaults it to 8192. */
    tokenLimit?: number;
    softLimit?: number;
    hardLimit?: number;
    similarityThreshold?: number;
    contentType?: 'conversation' | 'text';
    contextOverlapPercent?: number;
    smartOverlap?: boolean;
    volatilityWindow?: number;
    generateHeaders?: boolean;
    stripNoise?: boolean;
    noisePatterns?: Array<RegExp>;
    addRoleMarkers?: boolean;
    embeddingProvider?: any;
}
|
|
36
|
+
/**
 * Contract implemented by every chunking strategy.
 */
export interface ChunkingStrategy {
    /**
     * Cut `text` into chunks belonging to `documentId`.
     *
     * @param text - Text to chunk.
     * @param documentId - Identifier recorded in each chunk's metadata.
     * @param options - Strategy options.
     * @returns The produced chunks.
     */
    chunk(text: string, documentId: string, options: ChunkingOptions): Array<Chunk>;
    /** Name identifying the strategy (e.g. `'sentence'`). */
    getName(): string;
}
|
|
40
|
+
/**
 * Chunks produced for one document, together with document-level metadata.
 */
export interface DocumentChunkResult {
    /** Identifier of the chunked document. */
    documentId: string;
    /** Chunks belonging to the document. */
    chunks: Array<Chunk>;
    /** Free-form metadata associated with the result. */
    metadata: Record<string, any>;
}
|
|
45
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/chunking/types.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,KAAK;IACpB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,aAAa,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;CACpB;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,EAAE,OAAO,GAAG,WAAW,GAAG,UAAU,GAAG,UAAU,GAAG,WAAW,CAAC;IACzE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC/B,SAAS,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAC9B,UAAU,CAAC,EAAE,MAAM,CAAC;IAGpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,WAAW,CAAC,EAAE,cAAc,GAAG,MAAM,CAAC;IACtC,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAC/B,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,iBAAiB,CAAC,EAAE,GAAG,CAAC;CACzB;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE,CAAC;IAC3E,OAAO,IAAI,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,mBAAmB;IAClC,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAC/B"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/chunking/types.ts"],"names":[],"mappings":";AAAA,gCAAgC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Count the tokens in `text` with tiktoken.
 *
 * @param text - Text to measure.
 * @returns Number of tokens; 0 for empty input.
 */
export declare function countTokens(text: string): number;
|
|
5
|
+
/**
 * Estimate the token count of `text` without tiktoken (fallback method,
 * roughly one token per four characters).
 *
 * @param text - Text to measure.
 * @returns Estimated number of tokens; 0 for empty input.
 */
export declare function estimateTokens(text: string): number;
|
|
10
|
+
//# sourceMappingURL=tokenizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../../../src/chunking/utils/tokenizer.ts"],"names":[],"mappings":"AAoBA;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAahD;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAMnD"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.countTokens = countTokens;
|
|
4
|
+
exports.estimateTokens = estimateTokens;
|
|
5
|
+
// Token counting utility for chunking strategies
|
|
6
|
+
// Uses tiktoken for accurate token counting
|
|
7
|
+
const tiktoken_1 = require("tiktoken");
|
|
8
|
+
// Cache encoding to avoid recreating it
|
|
9
|
+
let cachedEncoding = null;
|
|
10
|
+
/**
 * Return the shared tiktoken encoder, creating it on first use.
 *
 * The encoder is requested for the 'gpt-4' model (cl100k_base, the encoding
 * the text-embedding-3 models use) and memoized in the module-level
 * `cachedEncoding` so it is only built once.
 */
function getEmbeddingEncoding() {
    if (cachedEncoding) {
        return cachedEncoding;
    }
    cachedEncoding = (0, tiktoken_1.encoding_for_model)('gpt-4');
    return cachedEncoding;
}
|
|
22
|
+
/**
 * Count the tokens in `text` using the cached tiktoken encoder.
 *
 * If obtaining the encoder or encoding the text throws, the count falls back
 * to the rough approximation of one token per four characters.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Token count; 0 for empty or missing input.
 */
function countTokens(text) {
    const hasContent = Boolean(text) && text.length !== 0;
    if (!hasContent) {
        return 0;
    }
    let tokenCount;
    try {
        tokenCount = getEmbeddingEncoding().encode(text).length;
    }
    catch (error) {
        // Best-effort fallback: ~4 characters per token for English text.
        tokenCount = Math.ceil(text.length / 4);
    }
    return tokenCount;
}
|
|
39
|
+
/**
 * Approximate the token count of `text` without tiktoken.
 *
 * Uses the rule of thumb of one token per four characters (English text),
 * rounded up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count; 0 for empty or missing input.
 */
function estimateTokens(text) {
    const isEmpty = !text || text.length === 0;
    return isEmpty ? 0 : Math.ceil(text.length / 4);
}
|
|
50
|
+
//# sourceMappingURL=tokenizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../../../src/chunking/utils/tokenizer.ts"],"names":[],"mappings":";;AAuBA,kCAaC;AAMD,wCAMC;AAhDD,iDAAiD;AACjD,4CAA4C;AAC5C,uCAA8C;AAE9C,wCAAwC;AACxC,IAAI,cAAc,GAAiD,IAAI,CAAC;AAExE;;;GAGG;AACH,SAAS,oBAAoB;IAC3B,IAAI,CAAC,cAAc,EAAE,CAAC;QACpB,oEAAoE;QACpE,4DAA4D;QAC5D,cAAc,GAAG,IAAA,6BAAkB,EAAC,OAAO,CAAC,CAAC;IAC/C,CAAC;IACD,OAAO,cAAc,CAAC;AACxB,CAAC;AAED;;GAEG;AACH,SAAgB,WAAW,CAAC,IAAY;IACtC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,CAAC;IACX,CAAC;IAED,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,oBAAoB,EAAE,CAAC;QACxC,OAAO,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;IACtC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,8CAA8C;QAC9C,+DAA+D;QAC/D,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACpC,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAgB,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,CAAC;IACX,CAAC;IACD,+DAA+D;IAC/D,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACpC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/providers/embeddings/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,uBAAuB,EAAE,MAAM,UAAU,CAAC;AACnD,YAAY,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.OpenAIEmbeddingProvider = void 0;
|
|
4
|
+
// Embedding providers export
|
|
5
|
+
var openai_1 = require("./openai");
|
|
6
|
+
Object.defineProperty(exports, "OpenAIEmbeddingProvider", { enumerable: true, get: function () { return openai_1.OpenAIEmbeddingProvider; } });
|
|
7
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/providers/embeddings/index.ts"],"names":[],"mappings":";;;AAAA,6BAA6B;AAC7B,mCAAmD;AAA1C,iHAAA,uBAAuB,OAAA"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { EmbeddingProvider } from '../../types';
|
|
2
|
+
/**
 * Configuration for {@link OpenAIEmbeddingProvider}.
 */
export interface OpenAIEmbeddingConfig {
    /** API key handed to the OpenAI client. */
    apiKey: string;
    /** Embedding model name; the provider defaults to 'text-embedding-3-small'. */
    model?: string;
    /** Requested embedding size; the provider defaults to 1536. */
    dimension?: number;
    /** Retry count handed to the OpenAI client; the provider defaults to 3. */
    maxRetries?: number;
}
|
|
8
|
+
/**
 * Embedding provider that calls the OpenAI embeddings endpoint.
 */
export declare class OpenAIEmbeddingProvider implements EmbeddingProvider {
    private client;
    private model;
    private dimension;
    private maxRetries;
    /** Create a provider from the given configuration. */
    constructor(config: OpenAIEmbeddingConfig);
    /** Embed a single text; failures are rethrown as `VectorEmbeddingError`. */
    embed(text: string): Promise<Array<number>>;
    /** Embed many texts, one vector per input, in input order; requests are sent in batches of 100. */
    embedBatch(texts: Array<string>): Promise<Array<Array<number>>>;
    /** Configured embedding size. */
    getDimension(): number;
    /** Configured model name. */
    getModelName(): string;
    /** Maximum number of input tokens assumed for the configured model. */
    getTokenLimit(): number;
    private normalizeText;
}
|
|
21
|
+
//# sourceMappingURL=openai.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"openai.d.ts","sourceRoot":"","sources":["../../../../src/providers/embeddings/openai.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAGrD,MAAM,WAAW,qBAAqB;IACpC,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,qBAAa,uBAAwB,YAAW,iBAAiB;IAC/D,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,EAAE,qBAAqB;IAUnC,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAkBtC,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;IAoCtD,YAAY,IAAI,MAAM;IAItB,YAAY,IAAI,MAAM;IAItB,aAAa,IAAI,MAAM;IAevB,OAAO,CAAC,aAAa;CAMtB"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.OpenAIEmbeddingProvider = void 0;
|
|
4
|
+
// OpenAI embedding provider
|
|
5
|
+
const openai_1 = require("openai");
|
|
6
|
+
const types_1 = require("../../types");
|
|
7
|
+
/**
 * Embedding provider backed by the OpenAI embeddings endpoint.
 *
 * Relies on the module-level `openai` client import and on `VectorEmbeddingError`
 * from the package's shared types.
 */
class OpenAIEmbeddingProvider {
    /**
     * @param {object} config - Provider configuration.
     * @param {string} config.apiKey - API key handed to the OpenAI client.
     * @param {string} [config.model='text-embedding-3-small'] - Embedding model name.
     * @param {number} [config.dimension=1536] - Requested embedding size.
     * @param {number} [config.maxRetries=3] - Retry count handed to the OpenAI client.
     */
    constructor(config) {
        const maxRetries = config.maxRetries ?? 3;
        this.client = new openai_1.OpenAI({
            apiKey: config.apiKey,
            maxRetries,
        });
        this.model = config.model ?? 'text-embedding-3-small';
        this.dimension = config.dimension ?? 1536;
        this.maxRetries = maxRetries;
    }
    /**
     * Embed a single text.
     *
     * @param {string} text - Text to embed (normalized before sending).
     * @returns {Promise<number[]>} The embedding vector.
     * @throws {VectorEmbeddingError} When the request fails or returns no data.
     */
    async embed(text) {
        try {
            const response = await this.client.embeddings.create(this.buildRequest(this.normalizeText(text)));
            return response.data[0].embedding;
        }
        catch (error) {
            throw new types_1.VectorEmbeddingError(`Failed to generate embedding: ${error instanceof Error ? error.message : 'Unknown error'}`, error instanceof Error ? error : undefined);
        }
    }
    /**
     * Embed many texts, returning one vector per input in input order.
     *
     * Requests are sent sequentially in batches of 100 inputs.
     *
     * @param {string[]} texts - Texts to embed (each normalized before sending).
     * @returns {Promise<number[][]>} Embedding vectors; `[]` for empty input.
     * @throws {VectorEmbeddingError} When any batch request fails.
     */
    async embedBatch(texts) {
        if (texts.length === 0) {
            return [];
        }
        try {
            // OpenAI supports up to 2048 inputs per request, but we'll be conservative
            const batchSize = 100;
            const results = [];
            for (let start = 0; start < texts.length; start += batchSize) {
                const normalized = texts.slice(start, start + batchSize).map(t => this.normalizeText(t));
                const response = await this.client.embeddings.create(this.buildRequest(normalized));
                results.push(...response.data.map(d => d.embedding));
            }
            return results;
        }
        catch (error) {
            throw new types_1.VectorEmbeddingError(`Failed to generate batch embeddings: ${error instanceof Error ? error.message : 'Unknown error'}`, error instanceof Error ? error : undefined);
        }
    }
    /** @returns {number} The configured embedding size. */
    getDimension() {
        return this.dimension;
    }
    /** @returns {string} The configured model name. */
    getModelName() {
        return this.model;
    }
    /**
     * @returns {number} Maximum input tokens assumed for the configured model:
     *   8192 for text-embedding-3 models, 8191 for text-embedding-ada-002,
     *   8192 for anything else.
     */
    getTokenLimit() {
        if (this.model.includes('text-embedding-3')) {
            return 8192;
        }
        if (this.model.includes('text-embedding-ada-002')) {
            return 8191;
        }
        // Default for other models
        return 8192;
    }
    /**
     * Build the payload for `embeddings.create`.
     *
     * text-embedding-ada-002 does not accept the `dimensions` parameter and always
     * returns 1536-dimensional vectors, so the parameter is left out when it would
     * merely restate that size. Any other size is still sent so that a
     * misconfiguration surfaces as an API error instead of a silent mismatch
     * with `getDimension()`.
     *
     * @param {string|string[]} input - Normalized text(s) to embed.
     * @returns {object} Request payload.
     */
    buildRequest(input) {
        const request = { model: this.model, input };
        const isFixedSizeModel = this.model.includes('text-embedding-ada-002');
        if (!(isFixedSizeModel && this.dimension === 1536)) {
            request.dimensions = this.dimension;
        }
        return request;
    }
    /**
     * Trim the text, cap blank-line runs at one blank line, and collapse runs of
     * spaces/tabs into a single space.
     *
     * The space-collapsing pattern deliberately excludes `\n`; matching all
     * whitespace would also swallow the paragraph breaks preserved by the
     * first replacement.
     *
     * @param {string} text - Raw text.
     * @returns {string} Normalized text.
     */
    normalizeText(text) {
        return text
            .trim()
            .replace(/\n{3,}/g, '\n\n') // Replace 3+ newlines with 2
            .replace(/[^\S\n]{2,}/g, ' '); // Replace multiple non-newline whitespace with single space
    }
}
|
|
85
|
+
exports.OpenAIEmbeddingProvider = OpenAIEmbeddingProvider;
|
|
86
|
+
//# sourceMappingURL=openai.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"openai.js","sourceRoot":"","sources":["../../../../src/providers/embeddings/openai.ts"],"names":[],"mappings":";;;AAAA,4BAA4B;AAC5B,mCAAgC;AAEhC,uCAAmD;AASnD,MAAa,uBAAuB;IAMlC,YAAY,MAA6B;QACvC,IAAI,CAAC,MAAM,GAAG,IAAI,eAAM,CAAC;YACvB,MAAM,EAAE,MAAM,CAAC,MAAM;YACrB,UAAU,EAAE,MAAM,CAAC,UAAU,IAAI,CAAC;SACnC,CAAC,CAAC;QACH,IAAI,CAAC,KAAK,GAAG,MAAM,CAAC,KAAK,IAAI,wBAAwB,CAAC;QACtD,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC;QAC1C,IAAI,CAAC,UAAU,GAAG,MAAM,CAAC,UAAU,IAAI,CAAC,CAAC;IAC3C,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,IAAY;QACtB,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;YAC5C,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;gBACnD,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,KAAK,EAAE,UAAU;gBACjB,UAAU,EAAE,IAAI,CAAC,SAAS;aAC3B,CAAC,CAAC;YAEH,OAAO,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QACpC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,4BAAoB,CAC5B,iCAAiC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,EAC3F,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAC3C,CAAC;QACJ,CAAC;IACH,CAAC;IAED,KAAK,CAAC,UAAU,CAAC,KAAe;QAC9B,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,IAAI,CAAC;YACH,2EAA2E;YAC3E,MAAM,SAAS,GAAG,GAAG,CAAC;YACtB,MAAM,OAAO,GAAe,EAAE,CAAC;YAE/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;gBACjD,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC,CAAC;YAC9C,CAAC;YAED,MAAM,OAAO,GAAe,EAAE,CAAC;YAE/B,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;gBAC5B,MAAM,UAAU,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC;gBACzD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;oBACnD,KAAK,EAAE,IAAI,CAAC,KAAK;oBACjB,KAAK,EAAE,UAAU;oBACjB,UAAU,EAAE,IAAI,CAAC,SAAS;iBAC3B,CAAC,CAAC;gBAEH,OAAO,CAAC,IAAI,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC;YACvD,CAAC;YAED,OAAO,OAAO,CAAC;QACjB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,4BAAo
B,CAC5B,wCAAwC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,EAClG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAC3C,CAAC;QACJ,CAAC;IACH,CAAC;IAED,YAAY;QACV,OAAO,IAAI,CAAC,SAAS,CAAC;IACxB,CAAC;IAED,YAAY;QACV,OAAO,IAAI,CAAC,KAAK,CAAC;IACpB,CAAC;IAED,aAAa;QACX,sDAAsD;QACtD,iEAAiE;QACjE,sCAAsC;QACtC,0DAA0D;QAC1D,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CAAC,EAAE,CAAC;YAC5C,OAAO,IAAI,CAAC;QACd,CAAC;QACD,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,wBAAwB,CAAC,EAAE,CAAC;YAClD,OAAO,IAAI,CAAC;QACd,CAAC;QACD,2BAA2B;QAC3B,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,aAAa,CAAC,IAAY;QAChC,OAAO,IAAI;aACR,IAAI,EAAE;aACN,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,6BAA6B;aACxD,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC,4CAA4C;IAC1E,CAAC;CACF;AAnGD,0DAmGC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/providers/index.ts"],"names":[],"mappings":"AACA,cAAc,cAAc,CAAC;AAC7B,cAAc,UAAU,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript-emitted interop helper: exposes property `k` of module `m` on
// object `o` under the name `k2` (which defaults to `k`).
// An already-defined `this.__createBinding` is reused when present.
// When `Object.create` exists, the property's own descriptor is reused unless
// it is missing, is an accessor on a module not flagged `__esModule`, or is a
// writable/configurable data property; in those cases an enumerable getter
// that reads `m[k]` on each access is defined instead.
// Without `Object.create`, the current value is simply copied.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
// TypeScript-emitted helper behind `export * from '...'`: binds every
// enumerable key of module `m` onto `exports` via `__createBinding`, skipping
// "default" and any name `exports` already owns.
// An already-defined `this.__exportStar` is reused when present.
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
// Vector service providers
|
|
18
|
+
__exportStar(require("./embeddings"), exports);
|
|
19
|
+
__exportStar(require("./stores"), exports);
|
|
20
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/providers/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,2BAA2B;AAC3B,+CAA6B;AAC7B,2CAAyB"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { QdrantVectorStoreProvider } from './qdrant';
|
|
2
|
+
export type { QdrantConfig } from './qdrant';
|
|
3
|
+
export { InMemoryVectorStoreProvider } from './memory';
|
|
4
|
+
export { SQLiteVectorStoreProvider } from './sqlite';
|
|
5
|
+
export type { SQLiteConfig } from './sqlite';
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/providers/stores/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,yBAAyB,EAAE,MAAM,UAAU,CAAC;AACrD,YAAY,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAC7C,OAAO,EAAE,2BAA2B,EAAE,MAAM,UAAU,CAAC;AACvD,OAAO,EAAE,yBAAyB,EAAE,MAAM,UAAU,CAAC;AACrD,YAAY,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.SQLiteVectorStoreProvider = exports.InMemoryVectorStoreProvider = exports.QdrantVectorStoreProvider = void 0;
|
|
4
|
+
// Vector store providers export
|
|
5
|
+
var qdrant_1 = require("./qdrant");
|
|
6
|
+
Object.defineProperty(exports, "QdrantVectorStoreProvider", { enumerable: true, get: function () { return qdrant_1.QdrantVectorStoreProvider; } });
|
|
7
|
+
var memory_1 = require("./memory");
|
|
8
|
+
Object.defineProperty(exports, "InMemoryVectorStoreProvider", { enumerable: true, get: function () { return memory_1.InMemoryVectorStoreProvider; } });
|
|
9
|
+
var sqlite_1 = require("./sqlite");
|
|
10
|
+
Object.defineProperty(exports, "SQLiteVectorStoreProvider", { enumerable: true, get: function () { return sqlite_1.SQLiteVectorStoreProvider; } });
|
|
11
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/providers/stores/index.ts"],"names":[],"mappings":";;;AAAA,gCAAgC;AAChC,mCAAqD;AAA5C,mHAAA,yBAAyB,OAAA;AAElC,mCAAuD;AAA9C,qHAAA,2BAA2B,OAAA;AACpC,mCAAqD;AAA5C,mHAAA,yBAAyB,OAAA"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { VectorStoreProvider, VectorPoint, SearchOptions, SearchResult, CollectionConfig } from '../../types';
|
|
2
|
+
/**
 * Vector store provider declared as an in-memory implementation of
 * `VectorStoreProvider`.
 *
 * NOTE(review): the implementation is not visible from this declaration file;
 * the member descriptions below are inferred from the signatures only and
 * should be confirmed against `memory.js`.
 */
export declare class InMemoryVectorStoreProvider implements VectorStoreProvider {
    private collections;
    private collectionConfigs;
    /** Presumably creates the named collection if it does not exist yet — confirm. */
    ensureCollection(name: string, dimension: number, config?: CollectionConfig): Promise<void>;
    /** Presumably inserts or replaces the given points in the collection — confirm. */
    upsert(collection: string, points: Array<VectorPoint>): Promise<void>;
    /** Presumably returns the points most similar to `vector` — confirm. */
    search(collection: string, vector: Array<number>, options?: SearchOptions): Promise<Array<SearchResult>>;
    /** Presumably removes the points with the given ids — confirm. */
    delete(collection: string, ids: Array<string>): Promise<void>;
    /** Presumably removes the whole collection — confirm. */
    deleteCollection(collection: string): Promise<void>;
    /** Presumably returns ids of points matching `filter`, up to `limit` — confirm. */
    getIdsByFilter(collection: string, filter: Record<string, any>, limit?: number): Promise<Array<string>>;
    private calculateSimilarity;
    private cosineSimilarity;
    private euclideanDistance;
    private dotProduct;
    private matchesFilter;
    private buildFilter;
}
|
|
18
|
+
//# sourceMappingURL=memory.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"memory.d.ts","sourceRoot":"","sources":["../../../../src/providers/stores/memory.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,mBAAmB,EAAE,WAAW,EAAE,aAAa,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAOnH,qBAAa,2BAA4B,YAAW,mBAAmB;IACrE,OAAO,CAAC,WAAW,CAA+C;IAClE,OAAO,CAAC,iBAAiB,CAA8D;IAEjF,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAU3F,MAAM,CAAC,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAchE,MAAM,CAAC,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAmDlG,MAAM,CAAC,UAAU,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAWxD,gBAAgB,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKnD,cAAc,CAAC,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,KAAK,GAAE,MAAc,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAqB/G,OAAO,CAAC,mBAAmB;IAiB3B,OAAO,CAAC,gBAAgB;IAOxB,OAAO,CAAC,iBAAiB;IAIzB,OAAO,CAAC,UAAU;IAIlB,OAAO,CAAC,aAAa;IAgCrB,OAAO,CAAC,WAAW;CAYpB"}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.InMemoryVectorStoreProvider = void 0;
|
|
4
|
+
const types_1 = require("../../types");
|
|
5
|
+
/**
 * Vector store provider that keeps every collection in process memory.
 *
 * Each collection is a Map of point id -> stored point, with a sibling entry
 * holding its `{ dimension, distance }` settings. Search is a linear scan, so
 * this provider is intended for small data sets; nothing is persisted.
 */
class InMemoryVectorStoreProvider {
    constructor() {
        // collection name -> Map(point id -> stored point)
        this.collections = new Map();
        // collection name -> { dimension, distance }
        this.collectionConfigs = new Map();
    }
    /**
     * Creates the collection when it does not exist yet. An existing
     * collection and its stored configuration are left untouched.
     *
     * @param {string} name - Collection name.
     * @param {number} dimension - Vector dimension recorded in the collection config.
     * @param {{ distance?: string }} [config] - Optional settings; `distance` defaults to 'cosine'.
     * @returns {Promise<void>}
     */
    async ensureCollection(name, dimension, config) {
        if (!this.collections.has(name)) {
            this.collections.set(name, new Map());
            this.collectionConfigs.set(name, {
                dimension,
                distance: config?.distance ?? 'cosine',
            });
        }
    }
    /**
     * Inserts the given points, overwriting stored points that share an id.
     * The point's `payload.text` is copied to a top-level `text` field
     * ('' when absent) so search results can return it directly.
     *
     * @param {string} collection - Target collection name.
     * @param {Array<object>} points - Points with `id`, `vector` and optional `payload`.
     * @returns {Promise<void>}
     * @throws {VectorStoreError} When the collection does not exist.
     */
    async upsert(collection, points) {
        const store = this.collections.get(collection);
        if (!store) {
            throw new types_1.VectorStoreError(`Collection '${collection}' does not exist`);
        }
        for (const point of points) {
            store.set(point.id, {
                ...point,
                text: point.payload?.text || '',
            });
        }
    }
    /**
     * Scores every stored point against `vector` and returns the best matches.
     *
     * @param {string} collection - Collection to search.
     * @param {number[]} vector - Query vector.
     * @param {object} [options] - `limit` (default 10), `scoreThreshold`
     *   (default 0) and an optional payload `filter`.
     * @returns {Promise<Array<object>>} Results sorted by score, highest first.
     * @throws {VectorSearchError} When the collection (or its config) is missing,
     *   or when scoring fails (e.g. mismatched vector lengths).
     */
    async search(collection, vector, options = {}) {
        try {
            const store = this.collections.get(collection);
            if (!store) {
                throw new types_1.VectorSearchError(`Collection '${collection}' does not exist`);
            }
            const config = this.collectionConfigs.get(collection);
            if (!config) {
                throw new types_1.VectorSearchError(`Collection config not found for '${collection}'`);
            }
            const limit = options.limit ?? 10;
            const scoreThreshold = options.scoreThreshold ?? 0.0;
            // Calculate similarity for all points
            const results = [];
            for (const [id, point] of store.entries()) {
                // Apply filter if provided
                if (options.filter && !this.matchesFilter(point.payload, options.filter)) {
                    continue;
                }
                const score = this.calculateSimilarity(vector, point.vector, config.distance);
                if (score >= scoreThreshold) {
                    const payload = point.payload || {};
                    const system = payload.system || {};
                    results.push({
                        id,
                        text: point.text,
                        score,
                        createdAt: system.createdAt || Date.now(),
                        metadata: payload,
                    });
                }
            }
            // Sort by score descending and limit
            results.sort((a, b) => b.score - a.score);
            return results.slice(0, limit);
        }
        catch (error) {
            // Errors raised above are already VectorSearchErrors with a precise
            // message; wrapping them again would only duplicate the text.
            if (error instanceof types_1.VectorSearchError) {
                throw error;
            }
            throw new types_1.VectorSearchError(`Failed to search in collection '${collection}': ${error instanceof Error ? error.message : 'Unknown error'}`, error instanceof Error ? error : undefined);
        }
    }
    /**
     * Removes the given ids from the collection; unknown ids are ignored.
     *
     * @param {string} collection - Collection name.
     * @param {string[]} ids - Point ids to remove.
     * @returns {Promise<void>}
     * @throws {VectorStoreError} When the collection does not exist.
     */
    async delete(collection, ids) {
        const store = this.collections.get(collection);
        if (!store) {
            throw new types_1.VectorStoreError(`Collection '${collection}' does not exist`);
        }
        for (const id of ids) {
            store.delete(id);
        }
    }
    /**
     * Drops the collection and its configuration. Does nothing when absent.
     *
     * @param {string} collection - Collection name.
     * @returns {Promise<void>}
     */
    async deleteCollection(collection) {
        this.collections.delete(collection);
        this.collectionConfigs.delete(collection);
    }
    /**
     * Collects the ids of points whose payload matches `filter`.
     *
     * @param {string} collection - Collection name.
     * @param {Record<string, any>} filter - Payload filter (see matchesFilter).
     * @param {number} [limit=10000] - Maximum number of ids to return.
     * @returns {Promise<string[]>} Matching ids in store iteration order.
     * @throws {VectorStoreError} When the collection does not exist.
     */
    async getIdsByFilter(collection, filter, limit = 10000) {
        const store = this.collections.get(collection);
        if (!store) {
            throw new types_1.VectorStoreError(`Collection '${collection}' does not exist`);
        }
        const matchingIds = [];
        let count = 0;
        for (const [id, point] of store.entries()) {
            if (count >= limit)
                break;
            if (this.matchesFilter(point.payload, filter)) {
                matchingIds.push(id);
                count++;
            }
        }
        return matchingIds;
    }
    /**
     * Computes a "higher is better" score for two vectors.
     *
     * @param {number[]} vec1 - First vector.
     * @param {number[]} vec2 - Second vector.
     * @param {string} distance - 'cosine', 'euclidean' or 'dot'; any other
     *   value is treated as 'cosine'.
     * @returns {number} Similarity score.
     * @throws {Error} When the vectors differ in length.
     */
    calculateSimilarity(vec1, vec2, distance) {
        if (vec1.length !== vec2.length) {
            throw new Error('Vector dimensions do not match');
        }
        switch (distance) {
            case 'cosine':
                return this.cosineSimilarity(vec1, vec2);
            case 'euclidean':
                // Map a distance in [0, inf) to a similarity in (0, 1].
                return 1 / (1 + this.euclideanDistance(vec1, vec2));
            case 'dot':
                return this.dotProduct(vec1, vec2);
            default:
                return this.cosineSimilarity(vec1, vec2);
        }
    }
    /**
     * Cosine similarity of two equally sized vectors.
     *
     * @param {number[]} vec1 - First vector.
     * @param {number[]} vec2 - Second vector.
     * @returns {number} Dot product divided by the product of magnitudes, or 0
     *   when either vector has zero magnitude.
     */
    cosineSimilarity(vec1, vec2) {
        const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
        const mag1 = Math.sqrt(vec1.reduce((sum, val) => sum + val * val, 0));
        const mag2 = Math.sqrt(vec2.reduce((sum, val) => sum + val * val, 0));
        const denominator = mag1 * mag2;
        // A zero-magnitude vector has no direction; dividing would yield NaN,
        // which fails every threshold comparison and silently drops the point.
        if (denominator === 0) {
            return 0;
        }
        return dotProduct / denominator;
    }
    /**
     * Euclidean (L2) distance between two equally sized vectors.
     *
     * @param {number[]} vec1 - First vector.
     * @param {number[]} vec2 - Second vector.
     * @returns {number} Non-negative distance.
     */
    euclideanDistance(vec1, vec2) {
        return Math.sqrt(vec1.reduce((sum, val, i) => sum + Math.pow(val - vec2[i], 2), 0));
    }
    /**
     * Dot product of two equally sized vectors.
     *
     * @param {number[]} vec1 - First vector.
     * @param {number[]} vec2 - Second vector.
     * @returns {number} Sum of component-wise products.
     */
    dotProduct(vec1, vec2) {
        return vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
    }
    /**
     * Tests a payload against a filter. Keys are normalized by buildFilter and
     * then resolved as dotted paths into the payload. Array filter values match
     * by membership, other values by strict equality.
     *
     * @param {object|undefined} payload - Stored point payload.
     * @param {Record<string, any>} filter - Raw filter as supplied by the caller.
     * @returns {boolean} True when every constraint is satisfied. A missing
     *   payload never matches.
     */
    matchesFilter(payload, filter) {
        if (!payload)
            return false;
        const _filter = this.buildFilter(filter);
        for (const [key, value] of Object.entries(_filter)) {
            // An empty list places no constraint on the field, so skip it before
            // resolving the path; otherwise payloads lacking the field would be
            // rejected while payloads having it would pass.
            if (Array.isArray(value) && value.length === 0) {
                continue;
            }
            const keys = key.split('.');
            let current = payload;
            for (const k of keys) {
                if (current && typeof current === 'object' && k in current) {
                    current = current[k];
                }
                else {
                    return false;
                }
            }
            if (Array.isArray(value)) {
                if (!value.includes(current)) {
                    return false;
                }
            }
            else if (current !== value) {
                return false;
            }
        }
        return true;
    }
    /**
     * Normalizes a caller-supplied filter into payload paths.
     *
     * @param {Record<string, any>} filter - Raw filter.
     * @returns {Record<string, any>} Filter whose keys are dotted payload paths:
     *   keys without a dot are prefixed with 'metadata.', and 'text' is dropped.
     */
    buildFilter(filter) {
        const _filter = {};
        for (const [key, value] of Object.entries(filter)) {
            if (key === 'text')
                continue; // Skip text field as it's the main content
            const parsedKey = key.includes('.') ? key : `metadata.${key}`;
            _filter[parsedKey] = value;
        }
        return _filter;
    }
}
|
|
168
|
+
exports.InMemoryVectorStoreProvider = InMemoryVectorStoreProvider;
|
|
169
|
+
//# sourceMappingURL=memory.js.map
|