@robthepcguy/rag-vault 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/LICENSE +24 -0
  2. package/README.md +421 -0
  3. package/dist/bin/install-skills.d.ts +20 -0
  4. package/dist/bin/install-skills.d.ts.map +1 -0
  5. package/dist/bin/install-skills.js +196 -0
  6. package/dist/bin/install-skills.js.map +1 -0
  7. package/dist/chunker/index.d.ts +11 -0
  8. package/dist/chunker/index.d.ts.map +1 -0
  9. package/dist/chunker/index.js +6 -0
  10. package/dist/chunker/index.js.map +1 -0
  11. package/dist/chunker/semantic-chunker.d.ts +96 -0
  12. package/dist/chunker/semantic-chunker.d.ts.map +1 -0
  13. package/dist/chunker/semantic-chunker.js +267 -0
  14. package/dist/chunker/semantic-chunker.js.map +1 -0
  15. package/dist/chunker/sentence-splitter.d.ts +16 -0
  16. package/dist/chunker/sentence-splitter.d.ts.map +1 -0
  17. package/dist/chunker/sentence-splitter.js +114 -0
  18. package/dist/chunker/sentence-splitter.js.map +1 -0
  19. package/dist/embedder/index.d.ts +55 -0
  20. package/dist/embedder/index.d.ts.map +1 -0
  21. package/dist/embedder/index.js +146 -0
  22. package/dist/embedder/index.js.map +1 -0
  23. package/dist/errors/index.d.ts +73 -0
  24. package/dist/errors/index.d.ts.map +1 -0
  25. package/dist/errors/index.js +170 -0
  26. package/dist/errors/index.js.map +1 -0
  27. package/dist/index.d.ts +3 -0
  28. package/dist/index.d.ts.map +1 -0
  29. package/dist/index.js +91 -0
  30. package/dist/index.js.map +1 -0
  31. package/dist/parser/html-parser.d.ts +14 -0
  32. package/dist/parser/html-parser.d.ts.map +1 -0
  33. package/dist/parser/html-parser.js +99 -0
  34. package/dist/parser/html-parser.js.map +1 -0
  35. package/dist/parser/index.d.ts +144 -0
  36. package/dist/parser/index.d.ts.map +1 -0
  37. package/dist/parser/index.js +446 -0
  38. package/dist/parser/index.js.map +1 -0
  39. package/dist/parser/pdf-filter.d.ts +89 -0
  40. package/dist/parser/pdf-filter.d.ts.map +1 -0
  41. package/dist/parser/pdf-filter.js +304 -0
  42. package/dist/parser/pdf-filter.js.map +1 -0
  43. package/dist/server/index.d.ts +144 -0
  44. package/dist/server/index.d.ts.map +1 -0
  45. package/dist/server/index.js +518 -0
  46. package/dist/server/index.js.map +1 -0
  47. package/dist/server/raw-data-utils.d.ts +81 -0
  48. package/dist/server/raw-data-utils.d.ts.map +1 -0
  49. package/dist/server/raw-data-utils.js +196 -0
  50. package/dist/server/raw-data-utils.js.map +1 -0
  51. package/dist/server/schemas.d.ts +186 -0
  52. package/dist/server/schemas.d.ts.map +1 -0
  53. package/dist/server/schemas.js +99 -0
  54. package/dist/server/schemas.js.map +1 -0
  55. package/dist/utils/config-parsers.d.ts +14 -0
  56. package/dist/utils/config-parsers.d.ts.map +1 -0
  57. package/dist/utils/config-parsers.js +47 -0
  58. package/dist/utils/config-parsers.js.map +1 -0
  59. package/dist/utils/config.d.ts +37 -0
  60. package/dist/utils/config.d.ts.map +1 -0
  61. package/dist/utils/config.js +52 -0
  62. package/dist/utils/config.js.map +1 -0
  63. package/dist/utils/logger.d.ts +36 -0
  64. package/dist/utils/logger.d.ts.map +1 -0
  65. package/dist/utils/logger.js +64 -0
  66. package/dist/utils/logger.js.map +1 -0
  67. package/dist/utils/math.d.ts +34 -0
  68. package/dist/utils/math.d.ts.map +1 -0
  69. package/dist/utils/math.js +73 -0
  70. package/dist/utils/math.js.map +1 -0
  71. package/dist/utils/process-handlers.d.ts +26 -0
  72. package/dist/utils/process-handlers.d.ts.map +1 -0
  73. package/dist/utils/process-handlers.js +69 -0
  74. package/dist/utils/process-handlers.js.map +1 -0
  75. package/dist/vectordb/index.d.ts +210 -0
  76. package/dist/vectordb/index.d.ts.map +1 -0
  77. package/dist/vectordb/index.js +613 -0
  78. package/dist/vectordb/index.js.map +1 -0
  79. package/dist/web/api-routes.d.ts +9 -0
  80. package/dist/web/api-routes.d.ts.map +1 -0
  81. package/dist/web/api-routes.js +127 -0
  82. package/dist/web/api-routes.js.map +1 -0
  83. package/dist/web/config-routes.d.ts +7 -0
  84. package/dist/web/config-routes.d.ts.map +1 -0
  85. package/dist/web/config-routes.js +54 -0
  86. package/dist/web/config-routes.js.map +1 -0
  87. package/dist/web/database-manager.d.ts +130 -0
  88. package/dist/web/database-manager.d.ts.map +1 -0
  89. package/dist/web/database-manager.js +382 -0
  90. package/dist/web/database-manager.js.map +1 -0
  91. package/dist/web/http-server.d.ts +28 -0
  92. package/dist/web/http-server.d.ts.map +1 -0
  93. package/dist/web/http-server.js +311 -0
  94. package/dist/web/http-server.js.map +1 -0
  95. package/dist/web/index.d.ts +3 -0
  96. package/dist/web/index.d.ts.map +1 -0
  97. package/dist/web/index.js +114 -0
  98. package/dist/web/index.js.map +1 -0
  99. package/dist/web/middleware/async-handler.d.ts +17 -0
  100. package/dist/web/middleware/async-handler.d.ts.map +1 -0
  101. package/dist/web/middleware/async-handler.js +26 -0
  102. package/dist/web/middleware/async-handler.js.map +1 -0
  103. package/dist/web/middleware/auth.d.ts +22 -0
  104. package/dist/web/middleware/auth.d.ts.map +1 -0
  105. package/dist/web/middleware/auth.js +81 -0
  106. package/dist/web/middleware/auth.js.map +1 -0
  107. package/dist/web/middleware/error-handler.d.ts +36 -0
  108. package/dist/web/middleware/error-handler.d.ts.map +1 -0
  109. package/dist/web/middleware/error-handler.js +68 -0
  110. package/dist/web/middleware/error-handler.js.map +1 -0
  111. package/dist/web/middleware/index.d.ts +6 -0
  112. package/dist/web/middleware/index.d.ts.map +1 -0
  113. package/dist/web/middleware/index.js +19 -0
  114. package/dist/web/middleware/index.js.map +1 -0
  115. package/dist/web/middleware/rate-limit.d.ts +38 -0
  116. package/dist/web/middleware/rate-limit.d.ts.map +1 -0
  117. package/dist/web/middleware/rate-limit.js +116 -0
  118. package/dist/web/middleware/rate-limit.js.map +1 -0
  119. package/dist/web/middleware/request-logger.d.ts +52 -0
  120. package/dist/web/middleware/request-logger.d.ts.map +1 -0
  121. package/dist/web/middleware/request-logger.js +74 -0
  122. package/dist/web/middleware/request-logger.js.map +1 -0
  123. package/dist/web/types.d.ts +6 -0
  124. package/dist/web/types.d.ts.map +1 -0
  125. package/dist/web/types.js +4 -0
  126. package/dist/web/types.js.map +1 -0
  127. package/package.json +135 -0
  128. package/skills/rag-vault/SKILL.md +111 -0
  129. package/skills/rag-vault/references/html-ingestion.md +73 -0
  130. package/skills/rag-vault/references/query-optimization.md +57 -0
  131. package/skills/rag-vault/references/result-refinement.md +54 -0
@@ -0,0 +1,6 @@
// Compiled CommonJS barrel for the chunker module (built from src/chunker/index.ts
// per the source map reference below): re-exports SemanticChunker.
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SemanticChunker = void 0;
var semantic_chunker_js_1 = require("./semantic-chunker.js");
// Accessor-based re-export: reads the current binding from the source module on
// each access (TypeScript's compiled equivalent of a live ES-module re-export).
Object.defineProperty(exports, "SemanticChunker", { enumerable: true, get: function () { return semantic_chunker_js_1.SemanticChunker; } });
//# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/chunker/index.ts"],"names":[],"mappings":";;;AAUA,6DAAuD;AAA9C,sHAAA,eAAe,OAAA"}
@@ -0,0 +1,96 @@
// Type declarations for the Max-Min semantic chunker.
// NOTE(review): generated .d.ts (see source map reference below) — presumably
// edits belong in src/chunker/semantic-chunker.ts, not here.
import type { TextChunk } from './index.js';
/**
 * Semantic Chunker configuration
 * Based on paper recommendations: hardThreshold=0.6, initConst=1.5, c=0.9
 */
export interface SemanticChunkerConfig {
    /** Hard threshold for minimum similarity (default: 0.6) */
    hardThreshold: number;
    /** Initial constant for first sentence pair (default: 1.5) */
    initConst: number;
    /** Scaling constant for threshold calculation (default: 0.9) */
    c: number;
    /** Minimum chunk length in characters (default: 50) */
    minChunkLength: number;
}
/**
 * Embedder interface for generating embeddings
 */
export interface EmbedderInterface {
    embedBatch(texts: string[]): Promise<number[][]>;
}
/**
 * Check if a chunk is garbage (should be filtered out)
 *
 * Criteria (language-agnostic):
 * 1. Empty after trimming
 * 2. Contains alphanumeric -> valid content (keep)
 * 3. Only decoration characters (----, ====, etc.) -> garbage
 * 4. Single character repeated >80% of text -> garbage
 *
 * Note: Applied after minChunkLength filter
 *
 * @param text - Chunk text to check
 * @returns true if chunk is garbage and should be removed
 */
export declare function isGarbageChunk(text: string): boolean;
/** Default configuration using the paper-recommended values documented on SemanticChunkerConfig. */
export declare const DEFAULT_SEMANTIC_CHUNKER_CONFIG: SemanticChunkerConfig;
/**
 * Semantic chunker using Max-Min algorithm
 *
 * The algorithm groups consecutive sentences based on semantic similarity:
 * 1. Split text into sentences
 * 2. Generate embeddings for all sentences
 * 3. For each sentence, decide whether to add to current chunk or start new chunk
 * 4. Decision is based on comparing max similarity with new sentence vs min similarity within chunk
 *
 * Key insight: A sentence belongs to a chunk if its maximum similarity to any chunk member
 * is greater than the minimum similarity between existing chunk members (with threshold adjustment)
 */
export declare class SemanticChunker {
    private readonly config;
    constructor(config?: Partial<SemanticChunkerConfig>);
    /**
     * Split text into semantically coherent chunks
     *
     * @param text - The text to chunk
     * @param embedder - Embedder to generate sentence embeddings
     * @returns Array of text chunks
     */
    chunkText(text: string, embedder: EmbedderInterface): Promise<TextChunk[]>;
    /**
     * Group sentences into chunks using Max-Min algorithm
     */
    private groupSentences;
    /**
     * Decide if a sentence should be added to the current chunk
     * Based on Max-Min algorithm from the paper
     */
    private shouldAddToChunk;
    /**
     * Get minimum pairwise similarity within a chunk.
     * Only compares the last WINDOW_SIZE sentences for O(1) complexity.
     * This approximation is valid because recent sentences are most relevant
     * for determining chunk coherence (per Max-Min paper's experimental setup).
     */
    private getMinSimilarity;
    /**
     * Get maximum similarity between a sentence and any sentence in the chunk
     */
    private getMaxSimilarity;
    /**
     * Calculate dynamic threshold based on chunk size
     * threshold = max(c * minSim * sigmoid(|C|), hardThreshold)
     */
    private calculateThreshold;
    /**
     * Sigmoid function
     */
    private sigmoid;
    /**
     * Calculate cosine similarity between two vectors
     * Public for testing - delegates to shared utility
     */
    cosineSimilarity(vec1: number[], vec2: number[]): number;
}
//# sourceMappingURL=semantic-chunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"semantic-chunker.d.ts","sourceRoot":"","sources":["../../src/chunker/semantic-chunker.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAQ3C;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACpC,2DAA2D;IAC3D,aAAa,EAAE,MAAM,CAAA;IACrB,8DAA8D;IAC9D,SAAS,EAAE,MAAM,CAAA;IACjB,gEAAgE;IAChE,CAAC,EAAE,MAAM,CAAA;IACT,uDAAuD;IACvD,cAAc,EAAE,MAAM,CAAA;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAA;CACjD;AAoBD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAmBpD;AAMD,eAAO,MAAM,+BAA+B,EAAE,qBAK7C,CAAA;AAMD;;;;;;;;;;;GAWG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAuB;gBAElC,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM;IAIvD;;;;;;OAMG;IACG,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,iBAAiB,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;IAsChF;;OAEG;IACH,OAAO,CAAC,cAAc;IAmEtB;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IAaxB;;;;;OAKG;IACH,OAAO,CAAC,gBAAgB;IAuBxB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAWxB;;;OAGG;IACH,OAAO,CAAC,kBAAkB;IAM1B;;OAEG;IACH,OAAO,CAAC,OAAO;IAIf;;;OAGG;IACH,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,MAAM;CAGzD"}
@@ -0,0 +1,267 @@
"use strict";
// Semantic Chunker implementation using Max-Min algorithm
// Based on: "Max–Min semantic chunking of documents for RAG application" (Springer, 2025)
Object.defineProperty(exports, "__esModule", { value: true });
exports.SemanticChunker = exports.DEFAULT_SEMANTIC_CHUNKER_CONFIG = void 0;
// Valid before the declaration appears: function declarations are hoisted,
// so isGarbageChunk (defined below) is already bound here.
exports.isGarbageChunk = isGarbageChunk;
const sentence_splitter_js_1 = require("./sentence-splitter.js");
const math_js_1 = require("../utils/math.js");
// ============================================
// Performance Optimization Constants
// ============================================
/**
 * Number of recent sentences to compare in getMinSimilarity.
 * Based on Max-Min paper's experimental conditions (median 5 sentences per chunk).
 * Reduces complexity from O(k²) to O(WINDOW_SIZE²) = O(25) = O(1).
 */
const WINDOW_SIZE = 5;
/**
 * Maximum number of sentences per chunk before forced split.
 * Safety limit to prevent computational explosion on homogeneous documents.
 * Set to 3x the paper's median chunk size for reasonable margin.
 */
const MAX_SENTENCES = 15;
/**
 * Check if a chunk is garbage (should be filtered out)
 *
 * Criteria (language-agnostic):
 * 1. Empty after trimming
 * 2. Contains alphanumeric -> valid content (keep)
 * 3. Only decoration characters (----, ====, etc.) -> garbage
 * 4. Single character repeated >80% of text -> garbage
 *
 * Note: Applied after minChunkLength filter
 *
 * @param text - Chunk text to check
 * @returns true if chunk is garbage and should be removed
 */
function isGarbageChunk(text) {
    const trimmed = text.trim();
    if (trimmed.length === 0)
        return true;
    // If contains any alphanumeric, consider valid content
    if (/[a-zA-Z0-9]/.test(trimmed))
        return false;
    // Decoration line patterns only (----, ====, ****, etc.)
    if (/^[-=_.*#|~`@!%^&*()[\]{}\\/<>:+\s]+$/.test(trimmed))
        return true;
    // Excessive repetition of single character (>80%)
    // for..of iterates Unicode code points, so the ratio denominator must be
    // the code-point count too. Using the UTF-16 length here would undercount
    // the ratio for astral characters (e.g. "💩💩💩💩💩" -> 5/10 = 0.5) and
    // let fully repetitive chunks slip through the filter.
    const charCounts = new Map();
    let totalChars = 0;
    for (const char of trimmed) {
        charCounts.set(char, (charCounts.get(char) ?? 0) + 1);
        totalChars++;
    }
    // Manual max avoids spreading a potentially large iterable into Math.max.
    let maxCount = 0;
    for (const count of charCounts.values()) {
        if (count > maxCount)
            maxCount = count;
    }
    if (maxCount / totalChars > 0.8)
        return true;
    return false;
}
// ============================================
// Default Configuration
// ============================================
// Values follow the Max-Min paper's recommendations (see file header):
// hardThreshold=0.6, initConst=1.5, c=0.9. minChunkLength drops tiny fragments.
exports.DEFAULT_SEMANTIC_CHUNKER_CONFIG = {
    hardThreshold: 0.6,
    initConst: 1.5,
    c: 0.9,
    minChunkLength: 50,
};
// ============================================
// SemanticChunker Class
// ============================================
/**
 * Semantic chunker using Max-Min algorithm
 *
 * The algorithm groups consecutive sentences based on semantic similarity:
 * 1. Split text into sentences
 * 2. Generate embeddings for all sentences
 * 3. For each sentence, decide whether to add to current chunk or start new chunk
 * 4. Decision is based on comparing max similarity with new sentence vs min similarity within chunk
 *
 * Key insight: A sentence belongs to a chunk if its maximum similarity to any chunk member
 * is greater than the minimum similarity between existing chunk members (with threshold adjustment)
 */
class SemanticChunker {
    constructor(config = {}) {
        // Shallow merge: caller overrides win, unspecified keys keep paper defaults.
        this.config = { ...exports.DEFAULT_SEMANTIC_CHUNKER_CONFIG, ...config };
    }
    /**
     * Split text into semantically coherent chunks
     *
     * @param text - The text to chunk
     * @param embedder - Embedder to generate sentence embeddings
     * @returns Array of text chunks
     */
    async chunkText(text, embedder) {
        // Handle empty input
        if (!text || text.trim().length === 0) {
            return [];
        }
        // Split into sentences
        const sentences = (0, sentence_splitter_js_1.splitIntoSentences)(text);
        if (sentences.length === 0) {
            return [];
        }
        // Generate embeddings for all sentences
        const embeddings = await embedder.embedBatch(sentences);
        // Apply Max-Min algorithm to group sentences into chunks
        const sentenceGroups = this.groupSentences(sentences, embeddings);
        // Convert groups to TextChunks
        const chunks = [];
        let chunkIndex = 0;
        for (const group of sentenceGroups) {
            const chunkText = group.join(' ');
            // Filter out chunks that are too short or garbage
            // (indices stay contiguous: filtered-out groups do not consume an index)
            if (chunkText.length >= this.config.minChunkLength && !isGarbageChunk(chunkText)) {
                chunks.push({
                    text: chunkText,
                    index: chunkIndex,
                });
                chunkIndex++;
            }
        }
        return chunks;
    }
    /**
     * Group sentences into chunks using Max-Min algorithm
     */
    groupSentences(sentences, embeddings) {
        if (sentences.length === 0)
            return [];
        if (sentences.length === 1)
            return [[sentences[0] ?? '']];
        const groups = [];
        let currentGroup = [];
        let currentGroupEmbeddings = [];
        for (let i = 0; i < sentences.length; i++) {
            const sentence = sentences[i];
            const embedding = embeddings[i];
            // Defensive skip keeps the two parallel arrays in lockstep; a sentence
            // without an embedding (or vice versa) is dropped from all groups.
            if (!sentence || !embedding)
                continue;
            if (currentGroup.length === 0) {
                // Start new group with first sentence
                currentGroup.push(sentence);
                currentGroupEmbeddings.push(embedding);
            }
            else if (currentGroup.length === 1) {
                // Special case for second sentence (init phase)
                const firstEmbedding = currentGroupEmbeddings[0];
                if (!firstEmbedding)
                    continue;
                const similarity = this.cosineSimilarity(firstEmbedding, embedding);
                // initConst (>1) relaxes the hard threshold for the very first pair,
                // making it easier to seed a two-sentence chunk.
                if (this.config.initConst * similarity > this.config.hardThreshold) {
                    // Add to current group
                    currentGroup.push(sentence);
                    currentGroupEmbeddings.push(embedding);
                }
                else {
                    // Start new group
                    groups.push([...currentGroup]);
                    currentGroup = [sentence];
                    currentGroupEmbeddings = [embedding];
                }
            }
            else {
                // Force split if chunk reaches MAX_SENTENCES (safety limit for performance)
                if (currentGroup.length >= MAX_SENTENCES) {
                    groups.push([...currentGroup]);
                    currentGroup = [sentence];
                    currentGroupEmbeddings = [embedding];
                    continue;
                }
                // Normal case: check if sentence should join current group
                const shouldAdd = this.shouldAddToChunk(embedding, currentGroupEmbeddings);
                if (shouldAdd) {
                    currentGroup.push(sentence);
                    currentGroupEmbeddings.push(embedding);
                }
                else {
                    // Start new group
                    groups.push([...currentGroup]);
                    currentGroup = [sentence];
                    currentGroupEmbeddings = [embedding];
                }
            }
        }
        // Don't forget the last group
        if (currentGroup.length > 0) {
            groups.push(currentGroup);
        }
        return groups;
    }
    /**
     * Decide if a sentence should be added to the current chunk
     * Based on Max-Min algorithm from the paper
     */
    shouldAddToChunk(newEmbedding, chunkEmbeddings) {
        // Calculate min similarity within current chunk
        const minSim = this.getMinSimilarity(chunkEmbeddings);
        // Calculate max similarity between new sentence and chunk
        const maxSim = this.getMaxSimilarity(newEmbedding, chunkEmbeddings);
        // Calculate dynamic threshold
        const threshold = this.calculateThreshold(minSim, chunkEmbeddings.length);
        return maxSim > threshold;
    }
    /**
     * Get minimum pairwise similarity within a chunk.
     * Only compares the last WINDOW_SIZE sentences for O(1) complexity.
     * This approximation is valid because recent sentences are most relevant
     * for determining chunk coherence (per Max-Min paper's experimental setup).
     */
    getMinSimilarity(embeddings) {
        // 0 or 1 sentences: treat the chunk as perfectly coherent.
        if (embeddings.length < 2)
            return 1.0;
        // Only compare the last WINDOW_SIZE embeddings to reduce O(k²) to O(1)
        const startIdx = Math.max(0, embeddings.length - WINDOW_SIZE);
        const windowEmbeddings = embeddings.slice(startIdx);
        let minSim = 1.0;
        for (let i = 0; i < windowEmbeddings.length; i++) {
            for (let j = i + 1; j < windowEmbeddings.length; j++) {
                const embI = windowEmbeddings[i];
                const embJ = windowEmbeddings[j];
                if (!embI || !embJ)
                    continue;
                const sim = this.cosineSimilarity(embI, embJ);
                if (sim < minSim) {
                    minSim = sim;
                }
            }
        }
        return minSim;
    }
    /**
     * Get maximum similarity between a sentence and any sentence in the chunk
     */
    getMaxSimilarity(embedding, chunkEmbeddings) {
        // -1 is the cosine-similarity floor, so any real comparison raises it.
        let maxSim = -1.0;
        for (const chunkEmb of chunkEmbeddings) {
            const sim = this.cosineSimilarity(embedding, chunkEmb);
            if (sim > maxSim) {
                maxSim = sim;
            }
        }
        return maxSim;
    }
    /**
     * Calculate dynamic threshold based on chunk size
     * threshold = max(c * minSim * sigmoid(|C|), hardThreshold)
     */
    calculateThreshold(minSim, chunkSize) {
        const sigmoidValue = this.sigmoid(chunkSize);
        const dynamicThreshold = this.config.c * minSim * sigmoidValue;
        return Math.max(dynamicThreshold, this.config.hardThreshold);
    }
    /**
     * Sigmoid function
     */
    sigmoid(x) {
        return 1 / (1 + Math.exp(-x));
    }
    /**
     * Calculate cosine similarity between two vectors
     * Public for testing - delegates to shared utility
     */
    cosineSimilarity(vec1, vec2) {
        return (0, math_js_1.cosineSimilarity)(vec1, vec2);
    }
}
exports.SemanticChunker = SemanticChunker;
267
+ //# sourceMappingURL=semantic-chunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"semantic-chunker.js","sourceRoot":"","sources":["../../src/chunker/semantic-chunker.ts"],"names":[],"mappings":";AAAA,0DAA0D;AAC1D,0FAA0F;;;AAgE1F,wCAmBC;AAhFD,iEAA2D;AAC3D,8CAA2E;AA4B3E,+CAA+C;AAC/C,qCAAqC;AACrC,+CAA+C;AAE/C;;;;GAIG;AACH,MAAM,WAAW,GAAG,CAAC,CAAA;AAErB;;;;GAIG;AACH,MAAM,aAAa,GAAG,EAAE,CAAA;AAExB;;;;;;;;;;;;;GAaG;AACH,SAAgB,cAAc,CAAC,IAAY;IACzC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;IAC3B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAA;IAErC,uDAAuD;IACvD,IAAI,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,KAAK,CAAA;IAE7C,yDAAyD;IACzD,IAAI,sCAAsC,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAA;IAErE,kDAAkD;IAClD,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAA;IAC5C,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;QAC3B,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IACvD,CAAC;IACD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,CAAA;IACjD,IAAI,QAAQ,GAAG,OAAO,CAAC,MAAM,GAAG,GAAG;QAAE,OAAO,IAAI,CAAA;IAEhD,OAAO,KAAK,CAAA;AACd,CAAC;AAED,+CAA+C;AAC/C,wBAAwB;AACxB,+CAA+C;AAElC,QAAA,+BAA+B,GAA0B;IACpE,aAAa,EAAE,GAAG;IAClB,SAAS,EAAE,GAAG;IACd,CAAC,EAAE,GAAG;IACN,cAAc,EAAE,EAAE;CACnB,CAAA;AAED,+CAA+C;AAC/C,wBAAwB;AACxB,+CAA+C;AAE/C;;;;;;;;;;;GAWG;AACH,MAAa,eAAe;IAG1B,YAAY,SAAyC,EAAE;QACrD,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,uCAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IACjE,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,SAAS,CAAC,IAAY,EAAE,QAA2B;QACvD,qBAAqB;QACrB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtC,OAAO,EAAE,CAAA;QACX,CAAC;QAED,uBAAuB;QACvB,MAAM,SAAS,GAAG,IAAA,yCAAkB,EAAC,IAAI,CAAC,CAAA;QAC1C,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,EAAE,CAAA;QACX,CAAC;QAED,wCAAwC;QACxC,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,SAAS,CAAC,CAAA;QAEvD,yDAAyD;QACzD,MAAM,cAAc,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,UAAU,CAAC,CAAA;QAEjE,+BAA+B;QAC/B,MAAM,MAAM,GAAgB,EAAE,CAAA;QAC9B,IAAI,UAAU,GAAG,CAAC,CAAA;QAElB,KAAK,MAAM,KAAK,IAAI,cAAc,EAAE,CAAC;YACnC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YAEjC,kDAA
kD;YAClD,IAAI,SAAS,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAC,cAAc,IAAI,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,CAAC;gBACjF,MAAM,CAAC,IAAI,CAAC;oBACV,IAAI,EAAE,SAAS;oBACf,KAAK,EAAE,UAAU;iBAClB,CAAC,CAAA;gBACF,UAAU,EAAE,CAAA;YACd,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,SAAmB,EAAE,UAAsB;QAChE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAA;QACrC,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAA;QAEzD,MAAM,MAAM,GAAe,EAAE,CAAA;QAC7B,IAAI,YAAY,GAAa,EAAE,CAAA;QAC/B,IAAI,sBAAsB,GAAe,EAAE,CAAA;QAE3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAA;YAC7B,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAE/B,IAAI,CAAC,QAAQ,IAAI,CAAC,SAAS;gBAAE,SAAQ;YAErC,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC9B,sCAAsC;gBACtC,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;gBAC3B,sBAAsB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;YACxC,CAAC;iBAAM,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACrC,gDAAgD;gBAChD,MAAM,cAAc,GAAG,sBAAsB,CAAC,CAAC,CAAC,CAAA;gBAChD,IAAI,CAAC,cAAc;oBAAE,SAAQ;gBAE7B,MAAM,UAAU,GAAG,IAAI,CAAC,gBAAgB,CAAC,cAAc,EAAE,SAAS,CAAC,CAAA;gBAEnE,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,GAAG,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,aAAa,EAAE,CAAC;oBACnE,uBAAuB;oBACvB,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;oBAC3B,sBAAsB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBACxC,CAAC;qBAAM,CAAC;oBACN,kBAAkB;oBAClB,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,CAAA;oBAC9B,YAAY,GAAG,CAAC,QAAQ,CAAC,CAAA;oBACzB,sBAAsB,GAAG,CAAC,SAAS,CAAC,CAAA;gBACtC,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,4EAA4E;gBAC5E,IAAI,YAAY,CAAC,MAAM,IAAI,aAAa,EAAE,CAAC;oBACzC,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,CAAA;oBAC9B,YAAY,GAAG,CAAC,QAAQ,CAAC,CAAA;oBACzB,sBAAsB,GAAG,CAAC,SAAS,CAAC,CAAA;oBACpC,SAAQ;gBACV,CAAC;gBAED,2DAA2D;gBAC3D,MAAM,SAAS,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,sBAAsB,CAAC,CAAA;gBAE1E,IAAI,SAAS,EAAE,CAAC;oBACd,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;oBAC3B,sBAAsB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBACxC,CAAC;qBAAM,CAAC;oBACN,kBAAkB;oBAClB,MAAM,CAAC,IA
AI,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,CAAA;oBAC9B,YAAY,GAAG,CAAC,QAAQ,CAAC,CAAA;oBACzB,sBAAsB,GAAG,CAAC,SAAS,CAAC,CAAA;gBACtC,CAAC;YACH,CAAC;QACH,CAAC;QAED,8BAA8B;QAC9B,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAA;QAC3B,CAAC;QAED,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;;OAGG;IACK,gBAAgB,CAAC,YAAsB,EAAE,eAA2B;QAC1E,gDAAgD;QAChD,MAAM,MAAM,GAAG,IAAI,CAAC,gBAAgB,CAAC,eAAe,CAAC,CAAA;QAErD,0DAA0D;QAC1D,MAAM,MAAM,GAAG,IAAI,CAAC,gBAAgB,CAAC,YAAY,EAAE,eAAe,CAAC,CAAA;QAEnE,8BAA8B;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,MAAM,EAAE,eAAe,CAAC,MAAM,CAAC,CAAA;QAEzE,OAAO,MAAM,GAAG,SAAS,CAAA;IAC3B,CAAC;IAED;;;;;OAKG;IACK,gBAAgB,CAAC,UAAsB;QAC7C,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,GAAG,CAAA;QAErC,uEAAuE;QACvE,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,CAAC,MAAM,GAAG,WAAW,CAAC,CAAA;QAC7D,MAAM,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAA;QAEnD,IAAI,MAAM,GAAG,GAAG,CAAA;QAChB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACrD,MAAM,IAAI,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAA;gBAChC,MAAM,IAAI,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAA;gBAChC,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI;oBAAE,SAAQ;gBAE5B,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAA;gBAC7C,IAAI,GAAG,GAAG,MAAM,EAAE,CAAC;oBACjB,MAAM,GAAG,GAAG,CAAA;gBACd,CAAC;YACH,CAAC;QACH,CAAC;QACD,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,SAAmB,EAAE,eAA2B;QACvE,IAAI,MAAM,GAAG,CAAC,GAAG,CAAA;QACjB,KAAK,MAAM,QAAQ,IAAI,eAAe,EAAE,CAAC;YACvC,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAA;YACtD,IAAI,GAAG,GAAG,MAAM,EAAE,CAAC;gBACjB,MAAM,GAAG,GAAG,CAAA;YACd,CAAC;QACH,CAAC;QACD,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;;OAGG;IACK,kBAAkB,CAAC,MAAc,EAAE,SAAiB;QAC1D,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAA;QAC5C,MAAM,gBAAgB,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,MAAM,GAAG,YAAY,CAAA;QAC9D,OAAO,IAAI,CAAC,GAAG,CAAC,gBAAgB,EAAE,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,CAAA;IAC9D,CAAC;IAED;;OAEG
;IACK,OAAO,CAAC,CAAS;QACvB,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAC/B,CAAC;IAED;;;OAGG;IACH,gBAAgB,CAAC,IAAc,EAAE,IAAc;QAC7C,OAAO,IAAA,0BAAoB,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;IACzC,CAAC;CACF;AA9MD,0CA8MC"}
@@ -0,0 +1,16 @@
// Type declarations for the Intl.Segmenter-based sentence splitter.
// NOTE(review): generated .d.ts (see source map reference below) — presumably
// edits belong in src/chunker/sentence-splitter.ts, not here.
/**
 * Split text into sentences using Intl.Segmenter
 *
 * Uses the Unicode Text Segmentation standard (UAX #29) via Intl.Segmenter.
 * This provides multilingual support for sentence boundary detection.
 *
 * Note: Intl.Segmenter may split on abbreviations like "Mr." or "e.g."
 * These edge cases are acceptable for semantic chunking as:
 * 1. Short fragments will be grouped with adjacent sentences by similarity
 * 2. Fragments below minChunkLength are filtered out
 *
 * @param text - The text to split into sentences
 * @returns Array of sentences
 */
export declare function splitIntoSentences(text: string): string[];
//# sourceMappingURL=sentence-splitter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sentence-splitter.d.ts","sourceRoot":"","sources":["../../src/chunker/sentence-splitter.ts"],"names":[],"mappings":"AAoFA;;;;;;;;;;;;;GAaG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAwCzD"}
@@ -0,0 +1,114 @@
"use strict";
// Sentence Splitter for Semantic Chunking
// Created: 2025-12-27
// Purpose: Split text into sentences using Intl.Segmenter (Unicode standard)
Object.defineProperty(exports, "__esModule", { value: true });
// Valid before the declaration appears: function declarations are hoisted,
// so splitIntoSentences (defined below) is already bound here.
exports.splitIntoSentences = splitIntoSentences;
// ============================================
// Constants
// ============================================
/**
 * Sentinel wrapped around fenced-code-block indices during processing.
 * NUL (\u0000) delimiters cannot occur in normal text.
 */
const CODE_BLOCK_PLACEHOLDER = '\u0000CODE_BLOCK\u0000';
/**
 * Sentinel wrapped around inline-code indices during processing.
 */
const INLINE_CODE_PLACEHOLDER = '\u0000INLINE_CODE\u0000';
// ============================================
// Helper Functions
// ============================================
/**
 * Swap fenced code blocks (```...```) and inline code (`...`) for unique
 * placeholders so sentence segmentation cannot split inside them.
 *
 * @param text - Raw input text
 * @returns The placeholder-substituted text plus the extracted blocks
 *          (placeholder/content pairs) needed to undo the substitution.
 */
function extractCodeBlocks(text) {
    const blocks = [];
    let working = text;
    let counter = 0;
    // Fenced blocks go first so their backticks are gone before the
    // inline-code pass runs over the substituted text.
    for (const match of text.matchAll(/```[\s\S]*?```/g)) {
        const tag = `${CODE_BLOCK_PLACEHOLDER}${counter}${CODE_BLOCK_PLACEHOLDER}`;
        counter += 1;
        blocks.push({ placeholder: tag, content: match[0] });
        working = working.replace(match[0], tag);
    }
    // Inline spans are matched against the already-substituted text.
    for (const match of working.matchAll(/`[^`]+`/g)) {
        const tag = `${INLINE_CODE_PLACEHOLDER}${counter}${INLINE_CODE_PLACEHOLDER}`;
        counter += 1;
        blocks.push({ placeholder: tag, content: match[0] });
        working = working.replace(match[0], tag);
    }
    return { text: working, blocks };
}
/**
 * Restore code blocks from placeholders
 *
 * Reverses extractCodeBlocks: each placeholder found in a sentence is
 * replaced by the original code content it stands for.
 *
 * @param sentences - Sentences possibly containing placeholders
 * @param blocks - Placeholder/content pairs produced during extraction
 * @returns Sentences with the original code content restored
 */
function restoreCodeBlocks(sentences, blocks) {
    return sentences.map((sentence) => {
        let restored = sentence;
        for (const block of blocks) {
            // Use a replacer function so '$' sequences inside the code content
            // ("$&", "$'", "$1", ...) are inserted literally. A plain string
            // replacement would interpret them as special replacement patterns
            // and corrupt the restored code.
            restored = restored.replace(block.placeholder, () => block.content);
        }
        return restored;
    });
}
// ============================================
// Intl.Segmenter-based splitting
// ============================================
// One locale-agnostic segmenter ('und' = undetermined language) gives general
// Unicode sentence-boundary behavior for all inputs.
const segmenter = new Intl.Segmenter('und', { granularity: 'sentence' });
/**
 * Split text into sentences using Intl.Segmenter
 *
 * Uses the Unicode Text Segmentation standard (UAX #29) via Intl.Segmenter.
 * This provides multilingual support for sentence boundary detection.
 *
 * Note: Intl.Segmenter may split on abbreviations like "Mr." or "e.g."
 * These edge cases are acceptable for semantic chunking as:
 * 1. Short fragments will be grouped with adjacent sentences by similarity
 * 2. Fragments below minChunkLength are filtered out
 *
 * @param text - The text to split into sentences
 * @returns Array of sentences
 */
function splitIntoSentences(text) {
    // Nothing to do for empty or whitespace-only input.
    if (!text || text.trim().length === 0) {
        return [];
    }
    // Shield code spans from the segmenter by swapping them for placeholders.
    const { text: processedText, blocks } = extractCodeBlocks(text);
    // Paragraph boundaries: blank lines, a newline followed by a non-space
    // character, or a newline directly after a NUL placeholder delimiter.
    // biome-ignore lint/suspicious/noControlCharactersInRegex: Intentional use of NULL character as placeholder delimiter
    const paragraphBoundary = /\n{2,}|\n(?=\S)|(?<=\u0000)\n/;
    const collected = [];
    for (const rawParagraph of processedText.split(paragraphBoundary)) {
        const para = rawParagraph.trim();
        if (!para) {
            continue;
        }
        if (/^#{1,6}\s/.test(para)) {
            // Markdown headings are kept whole rather than segmented.
            collected.push(para);
        }
        else {
            // Delegate intra-paragraph boundaries to Intl.Segmenter.
            for (const { segment } of segmenter.segment(para)) {
                const candidate = segment.trim();
                if (candidate) {
                    collected.push(candidate);
                }
            }
        }
    }
    // Put the protected code spans back, then drop anything left empty.
    return restoreCodeBlocks(collected, blocks)
        .map((s) => s.trim())
        .filter((s) => s.length > 0);
}
114
+ //# sourceMappingURL=sentence-splitter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sentence-splitter.js","sourceRoot":"","sources":["../../src/chunker/sentence-splitter.ts"],"names":[],"mappings":";AAAA,0CAA0C;AAC1C,sBAAsB;AACtB,6EAA6E;;AAgG7E,gDAwCC;AAtID,+CAA+C;AAC/C,YAAY;AACZ,+CAA+C;AAE/C;;GAEG;AACH,MAAM,sBAAsB,GAAG,wBAAwB,CAAA;AAEvD;;GAEG;AACH,MAAM,uBAAuB,GAAG,yBAAyB,CAAA;AAWzD,+CAA+C;AAC/C,mBAAmB;AACnB,+CAA+C;AAE/C;;GAEG;AACH,SAAS,iBAAiB,CAAC,IAAY;IACrC,MAAM,MAAM,GAAoB,EAAE,CAAA;IAClC,IAAI,aAAa,GAAG,IAAI,CAAA;IAExB,yCAAyC;IACzC,MAAM,cAAc,GAAG,iBAAiB,CAAA;IACxC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,MAAM,gBAAgB,GAAG,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAA;IACtD,KAAK,MAAM,KAAK,IAAI,gBAAgB,EAAE,CAAC;QACrC,MAAM,WAAW,GAAG,GAAG,sBAAsB,GAAG,KAAK,GAAG,sBAAsB,EAAE,CAAA;QAChF,MAAM,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAA;QAC/C,aAAa,GAAG,aAAa,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,WAAW,CAAC,CAAA;QAC5D,KAAK,EAAE,CAAA;IACT,CAAC;IAED,8BAA8B;IAC9B,MAAM,eAAe,GAAG,UAAU,CAAA;IAClC,MAAM,aAAa,GAAG,aAAa,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAA;IAC7D,KAAK,MAAM,KAAK,IAAI,aAAa,EAAE,CAAC;QAClC,MAAM,WAAW,GAAG,GAAG,uBAAuB,GAAG,KAAK,GAAG,uBAAuB,EAAE,CAAA;QAClF,MAAM,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAA;QAC/C,aAAa,GAAG,aAAa,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,WAAW,CAAC,CAAA;QAC5D,KAAK,EAAE,CAAA;IACT,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,MAAM,EAAE,CAAA;AACxC,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,SAAmB,EAAE,MAAuB;IACrE,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE;QAChC,IAAI,QAAQ,GAAG,QAAQ,CAAA;QACvB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,CAAC,WAAW,EAAE,KAAK,CAAC,OAAO,CAAC,CAAA;QAC/D,CAAC;QACD,OAAO,QAAQ,CAAA;IACjB,CAAC,CAAC,CAAA;AACJ,CAAC;AAED,+CAA+C;AAC/C,iCAAiC;AACjC,+CAA+C;AAE/C,4CAA4C;AAC5C,qEAAqE;AACrE,MAAM,SAAS,GAAG,IAAI,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;AAExE;;;;;;;;;;;;;GAaG;AACH,SAAgB,kBAAkB,CAAC,IAAY;IAC7C,qBAAqB;IACrB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,EAAE,CAAA;IACX,CAAC;IAED
,qDAAqD;IACrD,MAAM,EAAE,IAAI,EAAE,aAAa,EAAE,MAAM,EAAE,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAA;IAE/D,sCAAsC;IACtC,sHAAsH;IACtH,MAAM,UAAU,GAAG,aAAa,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAA;IAEvE,MAAM,SAAS,GAAa,EAAE,CAAA;IAE9B,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,gBAAgB,GAAG,SAAS,CAAC,IAAI,EAAE,CAAA;QACzC,IAAI,CAAC,gBAAgB;YAAE,SAAQ;QAE/B,8DAA8D;QAC9D,IAAI,WAAW,CAAC,IAAI,CAAC,gBAAgB,CAAC,EAAE,CAAC;YACvC,SAAS,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAA;YAChC,SAAQ;QACV,CAAC;QAED,4CAA4C;QAC5C,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAA;QACpD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE,CAAA;YACtC,IAAI,OAAO,EAAE,CAAC;gBACZ,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YACzB,CAAC;QACH,CAAC;IACH,CAAC;IAED,sBAAsB;IACtB,MAAM,iBAAiB,GAAG,iBAAiB,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;IAE9D,kCAAkC;IAClC,OAAO,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAC3E,CAAC"}
@@ -0,0 +1,55 @@
1
+ export { EmbeddingError } from '../errors/index.js';
2
/**
 * Embedder configuration
 *
 * Supplied to the Embedder constructor; controls which model is loaded,
 * how many texts are embedded per batch, and where model files are cached.
 */
export interface EmbedderConfig {
    /** HuggingFace model path passed to Transformers.js */
    modelPath: string;
    /** Number of texts processed per batch in embedBatch() */
    batchSize: number;
    /** Directory where downloaded model files are cached */
    cacheDir: string;
}
13
/**
 * Embedding generation class using Transformers.js
 *
 * Responsibilities:
 * - Generate embedding vectors (dimension depends on model)
 * - Transformers.js wrapper
 * - Batch processing (batch size comes from EmbedderConfig.batchSize)
 *
 * Initialization is lazy: embed() and embedBatch() ensure the model is
 * loaded before use, so an explicit initialize() call is optional.
 */
export declare class Embedder {
    // Underlying Transformers.js model handle, created by initialize().
    private model;
    // In-flight initialization promise — presumably deduplicates concurrent
    // initialization attempts; confirm against the implementation.
    private initPromise;
    // Immutable settings (model path, batch size, cache dir) given at construction.
    private readonly config;
    /**
     * @param config - Model path, batch size, and cache directory settings
     */
    constructor(config: EmbedderConfig);
    /**
     * Get the model name/path
     */
    getModelName(): string;
    /**
     * Initialize Transformers.js model
     */
    initialize(): Promise<void>;
    /**
     * Ensure model is initialized (lazy initialization)
     * This method is called automatically by embed() and embedBatch()
     */
    private ensureInitialized;
    /**
     * Convert single text to embedding vector
     *
     * @param text - Text to embed
     * @returns Embedding vector (dimension depends on model)
     */
    embed(text: string): Promise<number[]>;
    /**
     * Convert multiple texts to embedding vectors with batch processing
     *
     * @param texts - Array of texts
     * @param signal - Optional AbortSignal for cancellation support
     * @returns Array of embedding vectors (dimension depends on model)
     */
    embedBatch(texts: string[], signal?: AbortSignal): Promise<number[][]>;
}
55
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/embedder/index.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAA;AAMnD;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,6BAA6B;IAC7B,SAAS,EAAE,MAAM,CAAA;IACjB,iBAAiB;IACjB,SAAS,EAAE,MAAM,CAAA;IACjB,4BAA4B;IAC5B,QAAQ,EAAE,MAAM,CAAA;CACjB;AAMD;;;;;;;GAOG;AACH,qBAAa,QAAQ;IAEnB,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,WAAW,CAA6B;IAChD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAgB;gBAE3B,MAAM,EAAE,cAAc;IAIlC;;OAEG;IACH,YAAY,IAAI,MAAM;IAItB;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAuBjC;;;OAGG;YACW,iBAAiB;IA+B/B;;;;;OAKG;IACG,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAiC5C;;;;;;OAMG;IACG,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,MAAM,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CAmC7E"}