voctar 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +102 -0
  3. package/dist/index.d.ts +6 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +29 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/src/chunking/index.d.ts +48 -0
  8. package/dist/src/chunking/index.d.ts.map +1 -0
  9. package/dist/src/chunking/index.js +123 -0
  10. package/dist/src/chunking/index.js.map +1 -0
  11. package/dist/src/chunking/strategies/fixed.d.ts +14 -0
  12. package/dist/src/chunking/strategies/fixed.d.ts.map +1 -0
  13. package/dist/src/chunking/strategies/fixed.js +111 -0
  14. package/dist/src/chunking/strategies/fixed.js.map +1 -0
  15. package/dist/src/chunking/strategies/paragraph.d.ts +6 -0
  16. package/dist/src/chunking/strategies/paragraph.d.ts.map +1 -0
  17. package/dist/src/chunking/strategies/paragraph.js +84 -0
  18. package/dist/src/chunking/strategies/paragraph.js.map +1 -0
  19. package/dist/src/chunking/strategies/recursive.d.ts +17 -0
  20. package/dist/src/chunking/strategies/recursive.d.ts.map +1 -0
  21. package/dist/src/chunking/strategies/recursive.js +192 -0
  22. package/dist/src/chunking/strategies/recursive.js.map +1 -0
  23. package/dist/src/chunking/strategies/semantic.d.ts +96 -0
  24. package/dist/src/chunking/strategies/semantic.d.ts.map +1 -0
  25. package/dist/src/chunking/strategies/semantic.js +587 -0
  26. package/dist/src/chunking/strategies/semantic.js.map +1 -0
  27. package/dist/src/chunking/strategies/sentence.d.ts +7 -0
  28. package/dist/src/chunking/strategies/sentence.d.ts.map +1 -0
  29. package/dist/src/chunking/strategies/sentence.js +116 -0
  30. package/dist/src/chunking/strategies/sentence.js.map +1 -0
  31. package/dist/src/chunking/types.d.ts +45 -0
  32. package/dist/src/chunking/types.d.ts.map +1 -0
  33. package/dist/src/chunking/types.js +4 -0
  34. package/dist/src/chunking/types.js.map +1 -0
  35. package/dist/src/chunking/utils/tokenizer.d.ts +10 -0
  36. package/dist/src/chunking/utils/tokenizer.d.ts.map +1 -0
  37. package/dist/src/chunking/utils/tokenizer.js +50 -0
  38. package/dist/src/chunking/utils/tokenizer.js.map +1 -0
  39. package/dist/src/providers/embeddings/index.d.ts +3 -0
  40. package/dist/src/providers/embeddings/index.d.ts.map +1 -0
  41. package/dist/src/providers/embeddings/index.js +7 -0
  42. package/dist/src/providers/embeddings/index.js.map +1 -0
  43. package/dist/src/providers/embeddings/openai.d.ts +21 -0
  44. package/dist/src/providers/embeddings/openai.d.ts.map +1 -0
  45. package/dist/src/providers/embeddings/openai.js +86 -0
  46. package/dist/src/providers/embeddings/openai.js.map +1 -0
  47. package/dist/src/providers/index.d.ts +3 -0
  48. package/dist/src/providers/index.d.ts.map +1 -0
  49. package/dist/src/providers/index.js +20 -0
  50. package/dist/src/providers/index.js.map +1 -0
  51. package/dist/src/providers/stores/index.d.ts +6 -0
  52. package/dist/src/providers/stores/index.d.ts.map +1 -0
  53. package/dist/src/providers/stores/index.js +11 -0
  54. package/dist/src/providers/stores/index.js.map +1 -0
  55. package/dist/src/providers/stores/memory.d.ts +18 -0
  56. package/dist/src/providers/stores/memory.d.ts.map +1 -0
  57. package/dist/src/providers/stores/memory.js +169 -0
  58. package/dist/src/providers/stores/memory.js.map +1 -0
  59. package/dist/src/providers/stores/qdrant.d.ts +28 -0
  60. package/dist/src/providers/stores/qdrant.d.ts.map +1 -0
  61. package/dist/src/providers/stores/qdrant.js +223 -0
  62. package/dist/src/providers/stores/qdrant.js.map +1 -0
  63. package/dist/src/providers/stores/sqlite.d.ts +38 -0
  64. package/dist/src/providers/stores/sqlite.d.ts.map +1 -0
  65. package/dist/src/providers/stores/sqlite.js +306 -0
  66. package/dist/src/providers/stores/sqlite.js.map +1 -0
  67. package/dist/src/types.d.ts +111 -0
  68. package/dist/src/types.d.ts.map +1 -0
  69. package/dist/src/types.js +32 -0
  70. package/dist/src/types.js.map +1 -0
  71. package/dist/src/vector.d.ts +74 -0
  72. package/dist/src/vector.d.ts.map +1 -0
  73. package/dist/src/vector.js +505 -0
  74. package/dist/src/vector.js.map +1 -0
  75. package/docs/API.md +361 -0
  76. package/docs/CHUNKING.md +280 -0
  77. package/docs/CUSTOM_PROVIDERS.md +101 -0
  78. package/docs/README.md +11 -0
  79. package/docs/STORAGE_BACKENDS.md +189 -0
  80. package/docs/assets/vectar.png +0 -0
  81. package/package.json +46 -0
@@ -0,0 +1,192 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.RecursiveChunkingStrategy = void 0;
4
+ // Recursive text splitting strategy - tries to split on natural boundaries
5
+ const uuid_1 = require("uuid");
6
+ const tokenizer_1 = require("../utils/tokenizer");
7
/**
 * Recursive text-splitting strategy.
 *
 * The text is split on the first separator (in preference order) that occurs
 * in it; pieces are merged back together up to the token budget, and any merged
 * chunk that is still too large is split again using the remaining, finer
 * separators. Text with no usable separator is cut by character position.
 *
 * All sizes are measured in tokens as reported by `countTokens` from the
 * tokenizer utility; the binary searches below assume that adding characters
 * never lowers the token count.
 */
class RecursiveChunkingStrategy {
    /**
     * @returns {string} The identifier of this strategy: `'recursive'`.
     */
    getName() {
        return 'recursive';
    }

    /**
     * Split `text` into chunk objects.
     *
     * @param {string} text - Text to split.
     * @param {string} documentId - Copied into every chunk's metadata.
     * @param {object} [options] - Optional settings read by this method:
     *   `tokenLimit` (default 8192), `maxChunkSize` (default 1000, capped at
     *   `tokenLimit`), `overlap` (default 200, capped at 20% of the chunk
     *   size), `separator` (string or array of strings replacing the default
     *   separator list) and `metadata` (merged last into each chunk's
     *   metadata, so its keys win).
     * @returns {Array<object>} Chunks shaped `{ id, text, metadata }` where
     *   metadata holds `documentId`, `chunkIndex`, `totalChunks`, `startChar`
     *   and `endChar`.
     */
    chunk(text, documentId, options) {
        const opts = options ?? {};
        const tokenLimit = opts.tokenLimit ?? 8192;
        // A budget below one token can never be satisfied, so clamp to 1.
        const maxSize = Math.max(1, Math.min(opts.maxChunkSize ?? 1000, tokenLimit));
        // Overlap shouldn't exceed 20% of maxSize and is never negative.
        const overlap = Math.max(0, Math.min(opts.overlap ?? 200, Math.floor(maxSize * 0.2)));
        // Default separators in order of preference (paragraph > sentence > word > char)
        const defaultSeparators = [
            '\n\n', // Paragraph
            '\n', // Line break
            '. ', // Sentence
            '! ', // Sentence
            '? ', // Sentence
            '; ', // Clause
            ', ', // Phrase
            ' ', // Word
            '', // Character
        ];
        const separators = opts.separator
            ? (Array.isArray(opts.separator) ? opts.separator : [opts.separator])
            : defaultSeparators;
        const chunks = this.recursiveSplit(text, maxSize, overlap, separators);
        const result = [];
        // Every chunk is a (trimmed) substring of `text`, so its offsets are
        // recovered by searching for it instead of subtracting the token
        // overlap from a character position (which mixed units and could
        // produce negative or out-of-range offsets).
        let previousStart = 0;
        for (let index = 0; index < chunks.length; index++) {
            const chunkText = chunks[index];
            // Chunks normally begin at or after the previous chunk's start.
            let startChar = text.indexOf(chunkText, previousStart);
            if (startChar === -1) {
                // A re-split overlapping chunk may begin slightly earlier.
                startChar = text.indexOf(chunkText);
            }
            if (startChar === -1) {
                // Defensive only: not expected because chunks are substrings.
                startChar = previousStart;
            }
            previousStart = startChar;
            result.push({
                id: (0, uuid_1.v4)(),
                text: chunkText,
                metadata: {
                    documentId,
                    chunkIndex: index,
                    totalChunks: chunks.length,
                    startChar,
                    endChar: startChar + chunkText.length,
                    ...opts.metadata,
                },
            });
        }
        return result;
    }

    /**
     * Split `text` on the first separator that occurs in it, recursing with
     * the remaining separators for merged chunks that are still too large.
     *
     * @param {string} text - Text to split.
     * @param {number} maxSize - Maximum chunk size in tokens.
     * @param {number} overlap - Overlap between chunks in tokens.
     * @param {string[]} separators - Separators in order of preference; an
     *   empty string means "split by characters".
     * @returns {string[]} Chunk texts; separator-based chunks are trimmed and
     *   whitespace-only chunks are dropped.
     */
    recursiveSplit(text, maxSize, overlap, separators) {
        // Base case: the text already fits.
        if ((0, tokenizer_1.countTokens)(text) <= maxSize) {
            const trimmed = text.trim();
            return trimmed ? [trimmed] : [];
        }
        for (let i = 0; i < separators.length; i++) {
            const separator = separators[i];
            if (separator === '') {
                // Character-level split as last resort
                return this.splitByCharacters(text, maxSize, overlap);
            }
            if (!text.includes(separator)) {
                continue;
            }
            // Only finer separators are tried on pieces that remain too large.
            const remainingSeparators = separators.slice(i + 1);
            const merged = this.mergeSplits(text.split(separator), separator, maxSize, overlap);
            const finalChunks = [];
            for (const chunk of merged) {
                if ((0, tokenizer_1.countTokens)(chunk) > maxSize) {
                    for (const part of this.recursiveSplit(chunk, maxSize, overlap, remainingSeparators)) {
                        finalChunks.push(part);
                    }
                }
                else {
                    const trimmed = chunk.trim();
                    if (trimmed) {
                        finalChunks.push(trimmed);
                    }
                }
            }
            return finalChunks;
        }
        // No separator applies: fall back to character split
        return this.splitByCharacters(text, maxSize, overlap);
    }

    /**
     * Greedily merge consecutive splits into chunks of at most `maxSize`
     * tokens, starting each new chunk with the overlap tail of the previous
     * one.
     *
     * @param {string[]} splits - Result of `text.split(separator)`.
     * @param {string} separator - Re-attached after every split but the last.
     * @param {number} maxSize - Maximum chunk size in tokens.
     * @param {number} overlap - Overlap in tokens.
     * @returns {string[]} Untrimmed chunks. A chunk may still exceed `maxSize`
     *   when a single piece (plus overlap) is too large; the caller re-splits
     *   those.
     */
    mergeSplits(splits, separator, maxSize, overlap) {
        const chunks = [];
        let currentChunk = '';
        for (let i = 0; i < splits.length; i++) {
            const piece = i < splits.length - 1 ? splits[i] + separator : splits[i];
            const combined = currentChunk + piece;
            if ((0, tokenizer_1.countTokens)(combined) <= maxSize) {
                currentChunk = combined;
            }
            else if (currentChunk) {
                chunks.push(currentChunk);
                // Start the new chunk with the overlap tail of the finished one.
                currentChunk = this.getOverlapText(currentChunk, overlap) + piece;
            }
            else {
                // Single piece is larger than maxSize, keep it anyway (will be split recursively)
                currentChunk = piece;
            }
        }
        if (currentChunk) {
            chunks.push(currentChunk);
        }
        return chunks;
    }

    /**
     * Get the tail of `text` that is approximately `overlapTokens` tokens.
     *
     * @param {string} text - Text to take the tail from.
     * @param {number} overlapTokens - Token budget for the tail.
     * @returns {string} The longest suffix found within the budget; `''` when
     *   no overlap is requested or `text` is empty.
     */
    getOverlapText(text, overlapTokens) {
        // No overlap requested (also covers NaN): previously this fell through
        // to the 10% fallback and produced overlap nobody asked for.
        if (!text || !(overlapTokens > 0)) {
            return '';
        }
        // Binary search for the earliest start index whose suffix fits.
        let start = 0;
        let end = text.length;
        let bestMatch = '';
        while (start < end) {
            const mid = Math.floor((start + end) / 2);
            const candidate = text.slice(mid);
            if ((0, tokenizer_1.countTokens)(candidate) <= overlapTokens) {
                bestMatch = candidate;
                end = mid;
            }
            else {
                start = mid + 1;
            }
        }
        if (bestMatch) {
            return bestMatch;
        }
        // Fallback to last 10%. Guard the zero case: `slice(-0)` is `slice(0)`
        // and would return the entire text as "overlap".
        const fallbackLength = Math.floor(text.length * 0.1);
        return fallbackLength > 0 ? text.slice(-fallbackLength) : '';
    }

    /**
     * Cut `text` into consecutive character ranges of at most `maxSize`
     * tokens, each range starting `overlap` tokens before the previous one
     * ended.
     *
     * @param {string} text - Text to split.
     * @param {number} maxSize - Maximum chunk size in tokens.
     * @param {number} overlap - Overlap in tokens.
     * @returns {string[]} Untrimmed chunks covering the whole text.
     */
    splitByCharacters(text, maxSize, overlap) {
        const chunks = [];
        let start = 0;
        while (start < text.length) {
            const chunkText = this.getTextUpToTokenLimit(text.slice(start), maxSize);
            if (!chunkText) {
                break;
            }
            chunks.push(chunkText);
            const end = start + chunkText.length;
            if (end >= text.length) {
                // The chunk reached the end of the text. Stepping back by the
                // overlap here would re-emit the tail forever.
                break;
            }
            const overlapText = this.getOverlapText(chunkText, overlap);
            // Step back by the overlap but always advance at least one
            // character so the loop is guaranteed to terminate.
            start = Math.max(end - overlapText.length, start + 1);
        }
        return chunks;
    }

    /**
     * Get the longest prefix of `text` that stays within a token limit.
     *
     * @param {string} text - Text to take the prefix from.
     * @param {number} maxTokens - Token budget for the prefix.
     * @returns {string} The whole text when it fits, otherwise the longest
     *   fitting prefix; when not even one character fits, the first code
     *   point, so callers still make progress.
     */
    getTextUpToTokenLimit(text, maxTokens) {
        if ((0, tokenizer_1.countTokens)(text) <= maxTokens) {
            return text;
        }
        // Binary search for the right character position
        let start = 0;
        let end = text.length;
        let bestMatch = '';
        while (start < end) {
            const mid = Math.floor((start + end) / 2);
            const candidate = text.slice(0, mid);
            if ((0, tokenizer_1.countTokens)(candidate) <= maxTokens) {
                bestMatch = candidate;
                start = mid + 1;
            }
            else {
                end = mid;
            }
        }
        if (bestMatch) {
            return bestMatch;
        }
        // Nothing fits. Return the smallest unit (one code point, so surrogate
        // pairs stay intact) instead of 80% of the text, which could exceed the
        // limit by far and returned '' for one-character input.
        const firstCodePoint = text.codePointAt(0);
        return text.slice(0, firstCodePoint > 0xffff ? 2 : 1);
    }
}
191
+ exports.RecursiveChunkingStrategy = RecursiveChunkingStrategy;
192
+ //# sourceMappingURL=recursive.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"recursive.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/recursive.ts"],"names":[],"mappings":";;;AAAA,2EAA2E;AAC3E,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,0CAA0C;QAEvH,iFAAiF;QACjF,MAAM,iBAAiB,GAAG;YACxB,MAAM,EAAG,YAAY;YACrB,IAAI,EAAK,aAAa;YACtB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,SAAS;YAClB,IAAI,EAAK,SAAS;YAClB,GAAG,EAAM,OAAO;YAChB,EAAE,EAAO,YAAY;SACtB,CAAC;QAEF,MAAM,UAAU,GAAG,OAAO,CAAC,SAAS;YAClC,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAC9E,CAAC,CAAC,iBAAiB,CAAC;QAEtB,MAAM,MAAM,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;QAEvE,yCAAyC;QACzC,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,MAAM,MAAM,GAAY,EAAE,CAAC;QAE3B,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,EAAE;YAClC,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU,EAAE,KAAK;oBACjB,WAAW,EAAE,MAAM,CAAC,MAAM;oBAC1B,SAAS;oBACT,OAAO;oBACP,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;YAEH,wCAAwC;YACxC,SAAS,GAAG,OAAO,GAAG,OAAO,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,cAAc,CACpB,IAAY,EACZ,OAAe,EAAE,2BAA2B;IAC5C,OAAe,EAAE,2BAA2B;IAC5C,UAAoB;QAEpB,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,4DAA4D;QAC5D,MAAM,UAAU,GAAG,IAAA,uBAAW,EAAC,IAAI,CAAC,CAAC;QACrC,IAAI,UAAU,IAAI,OAAO,EAAE,CAAC;YAC1B,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC1C,CAAC;QAED,8BAA8B;QAC9B,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;YACnC,IAAI,SAAS,KAAK,EAAE,EA
AE,CAAC;gBACrB,uCAAuC;gBACvC,OAAO,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;YACxD,CAAC;YAED,IAAI,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC7B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBACrC,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;gBAErE,yEAAyE;gBACzE,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;oBAC3B,MAAM,WAAW,GAAG,IAAA,uBAAW,EAAC,KAAK,CAAC,CAAC;oBACvC,IAAI,WAAW,GAAG,OAAO,EAAE,CAAC;wBAC1B,kCAAkC;wBAClC,MAAM,kBAAkB,GAAG,UAAU,CAAC,OAAO,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;wBAC7D,MAAM,mBAAmB,GAAG,UAAU,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;wBACjE,WAAW,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,mBAAmB,CAAC,CAAC,CAAC;oBACzF,CAAC;yBAAM,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;wBACxB,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;oBACjC,CAAC;gBACH,CAAC;gBAED,OAAO,WAAW,CAAC;YACrB,CAAC;QACH,CAAC;QAED,8BAA8B;QAC9B,OAAO,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;IACxD,CAAC;IAEO,WAAW,CACjB,MAAgB,EAChB,SAAiB,EACjB,OAAe,EAAE,2BAA2B;IAC5C,OAAe,CAAC,2BAA2B;;QAE3C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,YAAY,GAAG,EAAE,CAAC;QAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YACxB,MAAM,KAAK,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC;YAChE,MAAM,QAAQ,GAAG,YAAY,GAAG,KAAK,CAAC;YACtC,MAAM,cAAc,GAAG,IAAA,uBAAW,EAAC,QAAQ,CAAC,CAAC;YAE7C,IAAI,cAAc,IAAI,OAAO,EAAE,CAAC;gBAC9B,YAAY,GAAG,QAAQ,CAAC;YAC1B,CAAC;iBAAM,CAAC;gBACN,IAAI,YAAY,EAAE,CAAC;oBACjB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;oBAC1B,2CAA2C;oBAC3C,8DAA8D;oBAC9D,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;oBAC/D,YAAY,GAAG,WAAW,GAAG,KAAK,CAAC;gBACrC,CAAC;qBAAM,CAAC;oBACN,iFAAiF;oBACjF,YAAY,GAAG,KAAK,CAAC;gBACvB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,YAAY,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,IAAY,EAAE,aAAqB;QACxD,6CAA6C;QAC7C,IAAI,KAAK,GAAG,C
AAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,aAAa,EAAE,CAAC;gBAC5B,SAAS,GAAG,SAAS,CAAC;gBACtB,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,uBAAuB;IACzF,CAAC;IAEO,iBAAiB,CAAC,IAAY,EAAE,OAAe,EAAE,OAAe;QACtE,wEAAwE;QACxE,wEAAwE;QACxE,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,CAAC;YACzE,IAAI,CAAC,SAAS;gBAAE,MAAM;YAEtB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACvB,MAAM,WAAW,GAAG,SAAS,CAAC,MAAM,CAAC;YAErC,mCAAmC;YACnC,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAC5D,KAAK,IAAI,WAAW,GAAG,WAAW,CAAC,MAAM,CAAC;YAE1C,IAAI,IAAI,CAAC,MAAM,GAAG,KAAK,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC;gBAC7C,MAAM;YACR,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,qBAAqB,CAAC,IAAY,EAAE,SAAiB;QAC3D,IAAI,IAAA,uBAAW,EAAC,IAAI,CAAC,IAAI,SAAS,EAAE,CAAC;YACnC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACrC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;gBACxB,SAAS,GAAG,SAAS,CAAC;gBACtB,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC
,CAAC,CAAC,kBAAkB;IACtF,CAAC;CACF;AA3ND,8DA2NC"}
@@ -0,0 +1,96 @@
1
+ import type { Chunk, ChunkingOptions, ChunkingStrategy } from '../types';
2
+ import type { EmbeddingProvider } from '../../types';
3
+ interface ChunkHeader { // Header shape that `headerGenerator` in SemanticChunkingOptions resolves to; declared without `export`, so it is local to this module. All fields are optional.
4
+ summary?: string; // optional summary string
5
+ facts?: string[]; // optional list of strings; presumably the dates/numbers/key entities produced by fact extraction — TODO confirm
6
+ refs?: string[]; // optional list of strings; what a "ref" refers to is not visible in this declaration — TODO confirm
7
+ }
8
+ export interface SemanticChunkingOptions extends ChunkingOptions { // Adds the optional settings below on top of the base ChunkingOptions (imported from '../types').
9
+ embeddingProvider?: EmbeddingProvider; // optional EmbeddingProvider (type imported from '../../types'); how it is used is not visible here
10
+ softLimit?: number; // optional number; presumably the soft size limit used by semantic merging — units TODO confirm
11
+ hardLimit?: number; // optional number; presumably the hard size limit used by semantic merging — units TODO confirm
12
+ similarityThreshold?: number; // optional number; presumably a similarity cutoff — valid range TODO confirm
13
+ contentType?: 'conversation' | 'text'; // optional; restricted to these two literals. NOTE(review): the class declares a private detectContentType, so this is presumably auto-detected when omitted — confirm
14
+ contextOverlapPercent?: number; // optional number; scale (0-1 vs 0-100) is not visible here — TODO confirm
15
+ smartOverlap?: boolean; // optional flag; presumably toggles the smart-overlap step — TODO confirm default
16
+ volatilityWindow?: number; // optional number; presumably the window for the topic-volatility calculation — TODO confirm
17
+ generateHeaders?: boolean; // optional flag; presumably enables header generation — TODO confirm default
18
+ headerGenerator?: (text: string) => Promise<ChunkHeader>; // optional async callback: receives a string and resolves to a ChunkHeader
19
+ stripNoise?: boolean; // optional flag; presumably enables noise stripping — TODO confirm default
20
+ noisePatterns?: RegExp[]; // optional list of regular expressions; presumably the patterns treated as noise — TODO confirm
21
+ addRoleMarkers?: boolean; // optional flag; presumably enables adding role markers to text — TODO confirm default
22
+ }
23
+ export declare class SemanticChunkingStrategy implements ChunkingStrategy { // Type declaration only: the two public methods carry signatures, private members are listed by name without types.
24
+ getName(): string; // returns a string; presumably the strategy identifier — the value is not visible in this declaration
25
+ chunk(text: string, documentId: string, options: ChunkingOptions): Chunk[]; // synchronous (returns Chunk[] directly); `options` is typed as the base ChunkingOptions rather than SemanticChunkingOptions
26
+ /**
27
+ * Auto-detect if text is conversational based on role markers
28
+ */
29
+ private detectContentType;
30
+ /**
31
+ * Pre-segment text into atomic units (turns, paragraphs, sentences)
32
+ */
33
+ private preSegment;
34
+ /**
35
+ * Segment conversation into turns (user/agent/tool)
36
+ */
37
+ private segmentConversation;
38
+ /**
39
+ * Segment plain text into paragraphs and sentences
40
+ */
41
+ private segmentText;
42
+ /**
43
+ * Semantic merging with soft/hard limits
44
+ */
45
+ private semanticMerge;
46
+ /**
47
+ * Finalize chunk from atoms
48
+ */
49
+ private finalizeChunk;
50
+ /**
51
+ * Calculate topic volatility (simplified - uses lexical changes)
52
+ */
53
+ private calculateVolatility;
54
+ /**
55
+ * Apply smart overlap between chunks
56
+ */
57
+ private applySmartOverlap;
58
+ /**
59
+ * Add role markers to text
60
+ */
61
+ private addRoleMarkers;
62
+ /**
63
+ * Generate simple header (synchronous version)
64
+ */
65
+ private generateSimpleHeader;
66
+ /**
67
+ * Extract facts from text (dates, numbers, key entities)
68
+ */
69
+ private extractFacts;
70
+ /**
71
+ * Strip noise from text
72
+ */
73
+ private stripNoise;
74
+ /**
75
+ * Compute lexical similarity (Jaccard similarity on words)
76
+ */
77
+ private computeLexicalSimilarity;
78
+ /**
79
+ * Split text into sentences
80
+ */
81
+ private splitIntoSentences;
82
+ /**
83
+ * Estimate token count using accurate tokenizer
84
+ */
85
+ private estimateTokens;
86
+ /**
87
+ * Split an oversized atom (turn, paragraph, or sentence)
88
+ */
89
+ private splitOversizedAtom;
90
+ /**
91
+ * Split an oversized chunk using simple recursive splitting
92
+ */
93
+ private splitOversizedChunk;
94
+ }
95
+ export {};
96
+ //# sourceMappingURL=semantic.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"semantic.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/semantic.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AACzE,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAYrD,UAAU,WAAW;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;CACjB;AASD,MAAM,WAAW,uBAAwB,SAAQ,eAAe;IAE9D,iBAAiB,CAAC,EAAE,iBAAiB,CAAC;IAGtC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IAGnB,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAG7B,WAAW,CAAC,EAAE,cAAc,GAAG,MAAM,CAAC;IAGtC,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAC/B,YAAY,CAAC,EAAE,OAAO,CAAC;IAGvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAG1B,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,eAAe,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC,WAAW,CAAC,CAAC;IAGzD,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IAGzB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC1B;AAED,qBAAa,wBAAyB,YAAW,gBAAgB;IAC/D,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;IAmI1E;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAsBzB;;OAEG;IACH,OAAO,CAAC,UAAU;IAalB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAwE3B;;OAEG;IACH,OAAO,CAAC,WAAW;IAuCnB;;OAEG;IACH,OAAO,CAAC,aAAa;IA6GrB;;OAEG;IACH,OAAO,CAAC,aAAa;IAqBrB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAY3B;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAuCzB;;OAEG;IACH,OAAO,CAAC,cAAc;IAWtB;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAW5B;;OAEG;IACH,OAAO,CAAC,YAAY;IAqBpB;;OAEG;IACH,OAAO,CAAC,UAAU;IAoBlB;;OAEG;IACH,OAAO,CAAC,wBAAwB;IAwBhC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAS1B;;OAEG;IACH,OAAO,CAAC,cAAc;IAItB;;OAEG;IACH,OAAO,CAAC,kBAAkB;IA0C1B;;OAEG;IACH,OAAO,CAAC,mBAAmB;CAoD5B"}