@robthepcguy/rag-vault 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +24 -0
- package/README.md +421 -0
- package/dist/bin/install-skills.d.ts +20 -0
- package/dist/bin/install-skills.d.ts.map +1 -0
- package/dist/bin/install-skills.js +196 -0
- package/dist/bin/install-skills.js.map +1 -0
- package/dist/chunker/index.d.ts +11 -0
- package/dist/chunker/index.d.ts.map +1 -0
- package/dist/chunker/index.js +6 -0
- package/dist/chunker/index.js.map +1 -0
- package/dist/chunker/semantic-chunker.d.ts +96 -0
- package/dist/chunker/semantic-chunker.d.ts.map +1 -0
- package/dist/chunker/semantic-chunker.js +267 -0
- package/dist/chunker/semantic-chunker.js.map +1 -0
- package/dist/chunker/sentence-splitter.d.ts +16 -0
- package/dist/chunker/sentence-splitter.d.ts.map +1 -0
- package/dist/chunker/sentence-splitter.js +114 -0
- package/dist/chunker/sentence-splitter.js.map +1 -0
- package/dist/embedder/index.d.ts +55 -0
- package/dist/embedder/index.d.ts.map +1 -0
- package/dist/embedder/index.js +146 -0
- package/dist/embedder/index.js.map +1 -0
- package/dist/errors/index.d.ts +73 -0
- package/dist/errors/index.d.ts.map +1 -0
- package/dist/errors/index.js +170 -0
- package/dist/errors/index.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +91 -0
- package/dist/index.js.map +1 -0
- package/dist/parser/html-parser.d.ts +14 -0
- package/dist/parser/html-parser.d.ts.map +1 -0
- package/dist/parser/html-parser.js +99 -0
- package/dist/parser/html-parser.js.map +1 -0
- package/dist/parser/index.d.ts +144 -0
- package/dist/parser/index.d.ts.map +1 -0
- package/dist/parser/index.js +446 -0
- package/dist/parser/index.js.map +1 -0
- package/dist/parser/pdf-filter.d.ts +89 -0
- package/dist/parser/pdf-filter.d.ts.map +1 -0
- package/dist/parser/pdf-filter.js +304 -0
- package/dist/parser/pdf-filter.js.map +1 -0
- package/dist/server/index.d.ts +144 -0
- package/dist/server/index.d.ts.map +1 -0
- package/dist/server/index.js +518 -0
- package/dist/server/index.js.map +1 -0
- package/dist/server/raw-data-utils.d.ts +81 -0
- package/dist/server/raw-data-utils.d.ts.map +1 -0
- package/dist/server/raw-data-utils.js +196 -0
- package/dist/server/raw-data-utils.js.map +1 -0
- package/dist/server/schemas.d.ts +186 -0
- package/dist/server/schemas.d.ts.map +1 -0
- package/dist/server/schemas.js +99 -0
- package/dist/server/schemas.js.map +1 -0
- package/dist/utils/config-parsers.d.ts +14 -0
- package/dist/utils/config-parsers.d.ts.map +1 -0
- package/dist/utils/config-parsers.js +47 -0
- package/dist/utils/config-parsers.js.map +1 -0
- package/dist/utils/config.d.ts +37 -0
- package/dist/utils/config.d.ts.map +1 -0
- package/dist/utils/config.js +52 -0
- package/dist/utils/config.js.map +1 -0
- package/dist/utils/logger.d.ts +36 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +64 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/utils/math.d.ts +34 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +73 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/process-handlers.d.ts +26 -0
- package/dist/utils/process-handlers.d.ts.map +1 -0
- package/dist/utils/process-handlers.js +69 -0
- package/dist/utils/process-handlers.js.map +1 -0
- package/dist/vectordb/index.d.ts +210 -0
- package/dist/vectordb/index.d.ts.map +1 -0
- package/dist/vectordb/index.js +613 -0
- package/dist/vectordb/index.js.map +1 -0
- package/dist/web/api-routes.d.ts +9 -0
- package/dist/web/api-routes.d.ts.map +1 -0
- package/dist/web/api-routes.js +127 -0
- package/dist/web/api-routes.js.map +1 -0
- package/dist/web/config-routes.d.ts +7 -0
- package/dist/web/config-routes.d.ts.map +1 -0
- package/dist/web/config-routes.js +54 -0
- package/dist/web/config-routes.js.map +1 -0
- package/dist/web/database-manager.d.ts +130 -0
- package/dist/web/database-manager.d.ts.map +1 -0
- package/dist/web/database-manager.js +382 -0
- package/dist/web/database-manager.js.map +1 -0
- package/dist/web/http-server.d.ts +28 -0
- package/dist/web/http-server.d.ts.map +1 -0
- package/dist/web/http-server.js +311 -0
- package/dist/web/http-server.js.map +1 -0
- package/dist/web/index.d.ts +3 -0
- package/dist/web/index.d.ts.map +1 -0
- package/dist/web/index.js +114 -0
- package/dist/web/index.js.map +1 -0
- package/dist/web/middleware/async-handler.d.ts +17 -0
- package/dist/web/middleware/async-handler.d.ts.map +1 -0
- package/dist/web/middleware/async-handler.js +26 -0
- package/dist/web/middleware/async-handler.js.map +1 -0
- package/dist/web/middleware/auth.d.ts +22 -0
- package/dist/web/middleware/auth.d.ts.map +1 -0
- package/dist/web/middleware/auth.js +81 -0
- package/dist/web/middleware/auth.js.map +1 -0
- package/dist/web/middleware/error-handler.d.ts +36 -0
- package/dist/web/middleware/error-handler.d.ts.map +1 -0
- package/dist/web/middleware/error-handler.js +68 -0
- package/dist/web/middleware/error-handler.js.map +1 -0
- package/dist/web/middleware/index.d.ts +6 -0
- package/dist/web/middleware/index.d.ts.map +1 -0
- package/dist/web/middleware/index.js +19 -0
- package/dist/web/middleware/index.js.map +1 -0
- package/dist/web/middleware/rate-limit.d.ts +38 -0
- package/dist/web/middleware/rate-limit.d.ts.map +1 -0
- package/dist/web/middleware/rate-limit.js +116 -0
- package/dist/web/middleware/rate-limit.js.map +1 -0
- package/dist/web/middleware/request-logger.d.ts +52 -0
- package/dist/web/middleware/request-logger.d.ts.map +1 -0
- package/dist/web/middleware/request-logger.js +74 -0
- package/dist/web/middleware/request-logger.js.map +1 -0
- package/dist/web/types.d.ts +6 -0
- package/dist/web/types.d.ts.map +1 -0
- package/dist/web/types.js +4 -0
- package/dist/web/types.js.map +1 -0
- package/package.json +135 -0
- package/skills/rag-vault/SKILL.md +111 -0
- package/skills/rag-vault/references/html-ingestion.md +73 -0
- package/skills/rag-vault/references/query-optimization.md +57 -0
- package/skills/rag-vault/references/result-refinement.md +54 -0
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.SemanticChunker = void 0;
|
|
4
|
+
var semantic_chunker_js_1 = require("./semantic-chunker.js");
|
|
5
|
+
Object.defineProperty(exports, "SemanticChunker", { enumerable: true, get: function () { return semantic_chunker_js_1.SemanticChunker; } });
|
|
6
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/chunker/index.ts"],"names":[],"mappings":";;;AAUA,6DAAuD;AAA9C,sHAAA,eAAe,OAAA"}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
// Type declarations for the Max-Min semantic chunker
// (compiled output of src/chunker/semantic-chunker.ts).
import type { TextChunk } from './index.js';
/**
 * Semantic Chunker configuration
 * Based on paper recommendations: hardThreshold=0.6, initConst=1.5, c=0.9
 */
export interface SemanticChunkerConfig {
    /** Hard threshold for minimum similarity (default: 0.6) */
    hardThreshold: number;
    /** Initial constant for first sentence pair (default: 1.5) */
    initConst: number;
    /** Scaling constant for threshold calculation (default: 0.9) */
    c: number;
    /** Minimum chunk length in characters (default: 50) */
    minChunkLength: number;
}
/**
 * Embedder interface for generating embeddings
 * (minimal contract: batch-embed an array of texts into one vector per text)
 */
export interface EmbedderInterface {
    embedBatch(texts: string[]): Promise<number[][]>;
}
/**
 * Check if a chunk is garbage (should be filtered out)
 *
 * Criteria (language-agnostic):
 * 1. Empty after trimming
 * 2. Contains alphanumeric -> valid content (keep)
 * 3. Only decoration characters (----, ====, etc.) -> garbage
 * 4. Single character repeated >80% of text -> garbage
 *
 * Note: Applied after minChunkLength filter
 *
 * @param text - Chunk text to check
 * @returns true if chunk is garbage and should be removed
 */
export declare function isGarbageChunk(text: string): boolean;
/** Paper-recommended defaults — see SemanticChunkerConfig for field meanings. */
export declare const DEFAULT_SEMANTIC_CHUNKER_CONFIG: SemanticChunkerConfig;
/**
 * Semantic chunker using Max-Min algorithm
 *
 * The algorithm groups consecutive sentences based on semantic similarity:
 * 1. Split text into sentences
 * 2. Generate embeddings for all sentences
 * 3. For each sentence, decide whether to add to current chunk or start new chunk
 * 4. Decision is based on comparing max similarity with new sentence vs min similarity within chunk
 *
 * Key insight: A sentence belongs to a chunk if its maximum similarity to any chunk member
 * is greater than the minimum similarity between existing chunk members (with threshold adjustment)
 */
export declare class SemanticChunker {
    /** Effective configuration: defaults shallow-merged with constructor overrides. */
    private readonly config;
    constructor(config?: Partial<SemanticChunkerConfig>);
    /**
     * Split text into semantically coherent chunks
     *
     * @param text - The text to chunk
     * @param embedder - Embedder to generate sentence embeddings
     * @returns Array of text chunks
     */
    chunkText(text: string, embedder: EmbedderInterface): Promise<TextChunk[]>;
    /**
     * Group sentences into chunks using Max-Min algorithm
     */
    private groupSentences;
    /**
     * Decide if a sentence should be added to the current chunk
     * Based on Max-Min algorithm from the paper
     */
    private shouldAddToChunk;
    /**
     * Get minimum pairwise similarity within a chunk.
     * Only compares the last WINDOW_SIZE sentences for O(1) complexity.
     * This approximation is valid because recent sentences are most relevant
     * for determining chunk coherence (per Max-Min paper's experimental setup).
     */
    private getMinSimilarity;
    /**
     * Get maximum similarity between a sentence and any sentence in the chunk
     */
    private getMaxSimilarity;
    /**
     * Calculate dynamic threshold based on chunk size
     * threshold = max(c * minSim * sigmoid(|C|), hardThreshold)
     */
    private calculateThreshold;
    /**
     * Sigmoid function
     */
    private sigmoid;
    /**
     * Calculate cosine similarity between two vectors
     * Public for testing - delegates to shared utility
     */
    cosineSimilarity(vec1: number[], vec2: number[]): number;
}
//# sourceMappingURL=semantic-chunker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"semantic-chunker.d.ts","sourceRoot":"","sources":["../../src/chunker/semantic-chunker.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAQ3C;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACpC,2DAA2D;IAC3D,aAAa,EAAE,MAAM,CAAA;IACrB,8DAA8D;IAC9D,SAAS,EAAE,MAAM,CAAA;IACjB,gEAAgE;IAChE,CAAC,EAAE,MAAM,CAAA;IACT,uDAAuD;IACvD,cAAc,EAAE,MAAM,CAAA;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAA;CACjD;AAoBD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAmBpD;AAMD,eAAO,MAAM,+BAA+B,EAAE,qBAK7C,CAAA;AAMD;;;;;;;;;;;GAWG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAuB;gBAElC,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM;IAIvD;;;;;;OAMG;IACG,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,iBAAiB,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;IAsChF;;OAEG;IACH,OAAO,CAAC,cAAc;IAmEtB;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IAaxB;;;;;OAKG;IACH,OAAO,CAAC,gBAAgB;IAuBxB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAWxB;;;OAGG;IACH,OAAO,CAAC,kBAAkB;IAM1B;;OAEG;IACH,OAAO,CAAC,OAAO;IAIf;;;OAGG;IACH,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,MAAM;CAGzD"}
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Semantic Chunker implementation using Max-Min algorithm
|
|
3
|
+
// Based on: "Max–Min semantic chunking of documents for RAG application" (Springer, 2025)
|
|
4
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
+
exports.SemanticChunker = exports.DEFAULT_SEMANTIC_CHUNKER_CONFIG = void 0;
|
|
6
|
+
exports.isGarbageChunk = isGarbageChunk;
|
|
7
|
+
const sentence_splitter_js_1 = require("./sentence-splitter.js");
|
|
8
|
+
const math_js_1 = require("../utils/math.js");
|
|
9
|
+
// ============================================
|
|
10
|
+
// Performance Optimization Constants
|
|
11
|
+
// ============================================
|
|
12
|
+
/**
|
|
13
|
+
* Number of recent sentences to compare in getMinSimilarity.
|
|
14
|
+
* Based on Max-Min paper's experimental conditions (median 5 sentences per chunk).
|
|
15
|
+
* Reduces complexity from O(k²) to O(WINDOW_SIZE²) = O(25) = O(1).
|
|
16
|
+
*/
|
|
17
|
+
const WINDOW_SIZE = 5;
|
|
18
|
+
/**
|
|
19
|
+
* Maximum number of sentences per chunk before forced split.
|
|
20
|
+
* Safety limit to prevent computational explosion on homogeneous documents.
|
|
21
|
+
* Set to 3x the paper's median chunk size for reasonable margin.
|
|
22
|
+
*/
|
|
23
|
+
const MAX_SENTENCES = 15;
|
|
24
|
+
/**
 * Decide whether a chunk is worthless filler that should be discarded.
 *
 * Language-agnostic heuristics, checked in order:
 * 1. Whitespace-only text -> garbage
 * 2. Any ASCII alphanumeric character -> real content (keep)
 * 3. Pure decoration (----, ====, ****, punctuation/whitespace only) -> garbage
 * 4. One character making up more than 80% of the text -> garbage
 *
 * Note: runs after the minChunkLength filter.
 *
 * @param text - Candidate chunk text
 * @returns true when the chunk should be removed
 */
function isGarbageChunk(text) {
    const content = text.trim();
    if (content.length === 0)
        return true;
    // A single letter or digit is enough evidence of real content.
    if (/[a-zA-Z0-9]/.test(content))
        return false;
    // Lines built entirely from separator/decoration characters.
    if (/^[-=_.*#|~`@!%^&*()[\]{}\\/<>:+\s]+$/.test(content))
        return true;
    // Dominance check: track the most frequent single character while counting.
    const frequency = new Map();
    let dominant = 0;
    for (const ch of content) {
        const seen = (frequency.get(ch) ?? 0) + 1;
        frequency.set(ch, seen);
        if (seen > dominant)
            dominant = seen;
    }
    return dominant / content.length > 0.8;
}
|
|
58
|
+
// ============================================
// Default Configuration
// ============================================
// Paper-recommended defaults; see SemanticChunkerConfig in the .d.ts for
// the meaning of each field.
exports.DEFAULT_SEMANTIC_CHUNKER_CONFIG = {
    hardThreshold: 0.6, // minimum-similarity floor for joining a chunk
    initConst: 1.5, // boost applied when judging the very first sentence pair
    c: 0.9, // scaling constant in the dynamic threshold formula
    minChunkLength: 50, // chunks shorter than this (characters) are dropped
};
|
|
67
|
+
// ============================================
// SemanticChunker Class
// ============================================
/**
 * Semantic chunker using Max-Min algorithm
 *
 * The algorithm groups consecutive sentences based on semantic similarity:
 * 1. Split text into sentences
 * 2. Generate embeddings for all sentences
 * 3. For each sentence, decide whether to add to current chunk or start new chunk
 * 4. Decision is based on comparing max similarity with new sentence vs min similarity within chunk
 *
 * Key insight: A sentence belongs to a chunk if its maximum similarity to any chunk member
 * is greater than the minimum similarity between existing chunk members (with threshold adjustment)
 */
class SemanticChunker {
    /**
     * @param config - Optional partial overrides; anything omitted falls back
     *                 to DEFAULT_SEMANTIC_CHUNKER_CONFIG (shallow merge).
     */
    constructor(config = {}) {
        this.config = { ...exports.DEFAULT_SEMANTIC_CHUNKER_CONFIG, ...config };
    }
    /**
     * Split text into semantically coherent chunks
     *
     * @param text - The text to chunk
     * @param embedder - Embedder to generate sentence embeddings
     * @returns Array of text chunks
     */
    async chunkText(text, embedder) {
        // Handle empty input
        if (!text || text.trim().length === 0) {
            return [];
        }
        // Split into sentences
        const sentences = (0, sentence_splitter_js_1.splitIntoSentences)(text);
        if (sentences.length === 0) {
            return [];
        }
        // Generate embeddings for all sentences (one batch call)
        const embeddings = await embedder.embedBatch(sentences);
        // Apply Max-Min algorithm to group sentences into chunks
        const sentenceGroups = this.groupSentences(sentences, embeddings);
        // Convert groups to TextChunks
        const chunks = [];
        let chunkIndex = 0;
        for (const group of sentenceGroups) {
            const chunkText = group.join(' ');
            // Filter out chunks that are too short or garbage.
            // chunkIndex only advances for kept chunks, so output indices stay contiguous.
            if (chunkText.length >= this.config.minChunkLength && !isGarbageChunk(chunkText)) {
                chunks.push({
                    text: chunkText,
                    index: chunkIndex,
                });
                chunkIndex++;
            }
        }
        return chunks;
    }
    /**
     * Group sentences into chunks using Max-Min algorithm
     */
    groupSentences(sentences, embeddings) {
        if (sentences.length === 0)
            return [];
        if (sentences.length === 1)
            return [[sentences[0] ?? '']];
        const groups = [];
        let currentGroup = [];
        let currentGroupEmbeddings = [];
        for (let i = 0; i < sentences.length; i++) {
            const sentence = sentences[i];
            const embedding = embeddings[i];
            // Defensive: silently skip any sentence whose embedding is missing
            if (!sentence || !embedding)
                continue;
            if (currentGroup.length === 0) {
                // Start new group with first sentence
                currentGroup.push(sentence);
                currentGroupEmbeddings.push(embedding);
            }
            else if (currentGroup.length === 1) {
                // Special case for second sentence (init phase)
                const firstEmbedding = currentGroupEmbeddings[0];
                if (!firstEmbedding)
                    continue;
                const similarity = this.cosineSimilarity(firstEmbedding, embedding);
                // initConst boosts the comparison so the second sentence joins unless
                // clearly dissimilar (with defaults: joins when sim > 0.6 / 1.5 = 0.4)
                if (this.config.initConst * similarity > this.config.hardThreshold) {
                    // Add to current group
                    currentGroup.push(sentence);
                    currentGroupEmbeddings.push(embedding);
                }
                else {
                    // Start new group
                    groups.push([...currentGroup]);
                    currentGroup = [sentence];
                    currentGroupEmbeddings = [embedding];
                }
            }
            else {
                // Force split if chunk reaches MAX_SENTENCES (safety limit for performance)
                if (currentGroup.length >= MAX_SENTENCES) {
                    groups.push([...currentGroup]);
                    currentGroup = [sentence];
                    currentGroupEmbeddings = [embedding];
                    continue;
                }
                // Normal case: check if sentence should join current group
                const shouldAdd = this.shouldAddToChunk(embedding, currentGroupEmbeddings);
                if (shouldAdd) {
                    currentGroup.push(sentence);
                    currentGroupEmbeddings.push(embedding);
                }
                else {
                    // Start new group
                    groups.push([...currentGroup]);
                    currentGroup = [sentence];
                    currentGroupEmbeddings = [embedding];
                }
            }
        }
        // Don't forget the last group
        if (currentGroup.length > 0) {
            groups.push(currentGroup);
        }
        return groups;
    }
    /**
     * Decide if a sentence should be added to the current chunk
     * Based on Max-Min algorithm from the paper
     */
    shouldAddToChunk(newEmbedding, chunkEmbeddings) {
        // Calculate min similarity within current chunk
        const minSim = this.getMinSimilarity(chunkEmbeddings);
        // Calculate max similarity between new sentence and chunk
        const maxSim = this.getMaxSimilarity(newEmbedding, chunkEmbeddings);
        // Calculate dynamic threshold
        const threshold = this.calculateThreshold(minSim, chunkEmbeddings.length);
        return maxSim > threshold;
    }
    /**
     * Get minimum pairwise similarity within a chunk.
     * Only compares the last WINDOW_SIZE sentences for O(1) complexity.
     * This approximation is valid because recent sentences are most relevant
     * for determining chunk coherence (per Max-Min paper's experimental setup).
     */
    getMinSimilarity(embeddings) {
        // Fewer than two members: no pair exists, so report perfect cohesion.
        if (embeddings.length < 2)
            return 1.0;
        // Only compare the last WINDOW_SIZE embeddings to reduce O(k²) to O(1)
        const startIdx = Math.max(0, embeddings.length - WINDOW_SIZE);
        const windowEmbeddings = embeddings.slice(startIdx);
        let minSim = 1.0;
        for (let i = 0; i < windowEmbeddings.length; i++) {
            for (let j = i + 1; j < windowEmbeddings.length; j++) {
                const embI = windowEmbeddings[i];
                const embJ = windowEmbeddings[j];
                if (!embI || !embJ)
                    continue;
                const sim = this.cosineSimilarity(embI, embJ);
                if (sim < minSim) {
                    minSim = sim;
                }
            }
        }
        return minSim;
    }
    /**
     * Get maximum similarity between a sentence and any sentence in the chunk
     * (returns -1.0 if chunkEmbeddings is empty; callers always pass >= 2 members)
     */
    getMaxSimilarity(embedding, chunkEmbeddings) {
        let maxSim = -1.0;
        for (const chunkEmb of chunkEmbeddings) {
            const sim = this.cosineSimilarity(embedding, chunkEmb);
            if (sim > maxSim) {
                maxSim = sim;
            }
        }
        return maxSim;
    }
    /**
     * Calculate dynamic threshold based on chunk size
     * threshold = max(c * minSim * sigmoid(|C|), hardThreshold)
     */
    calculateThreshold(minSim, chunkSize) {
        const sigmoidValue = this.sigmoid(chunkSize);
        const dynamicThreshold = this.config.c * minSim * sigmoidValue;
        // hardThreshold acts as a floor so very incoherent chunks don't accept anything
        return Math.max(dynamicThreshold, this.config.hardThreshold);
    }
    /**
     * Sigmoid function (approaches 1 as chunk size grows, tightening the threshold)
     */
    sigmoid(x) {
        return 1 / (1 + Math.exp(-x));
    }
    /**
     * Calculate cosine similarity between two vectors
     * Public for testing - delegates to shared utility
     */
    cosineSimilarity(vec1, vec2) {
        return (0, math_js_1.cosineSimilarity)(vec1, vec2);
    }
}
|
|
266
|
+
exports.SemanticChunker = SemanticChunker;
|
|
267
|
+
//# sourceMappingURL=semantic-chunker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"semantic-chunker.js","sourceRoot":"","sources":["../../src/chunker/semantic-chunker.ts"],"names":[],"mappings":";AAAA,0DAA0D;AAC1D,0FAA0F;;;AAgE1F,wCAmBC;AAhFD,iEAA2D;AAC3D,8CAA2E;AA4B3E,+CAA+C;AAC/C,qCAAqC;AACrC,+CAA+C;AAE/C;;;;GAIG;AACH,MAAM,WAAW,GAAG,CAAC,CAAA;AAErB;;;;GAIG;AACH,MAAM,aAAa,GAAG,EAAE,CAAA;AAExB;;;;;;;;;;;;;GAaG;AACH,SAAgB,cAAc,CAAC,IAAY;IACzC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;IAC3B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAA;IAErC,uDAAuD;IACvD,IAAI,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,KAAK,CAAA;IAE7C,yDAAyD;IACzD,IAAI,sCAAsC,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAA;IAErE,kDAAkD;IAClD,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAA;IAC5C,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;QAC3B,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IACvD,CAAC;IACD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,CAAA;IACjD,IAAI,QAAQ,GAAG,OAAO,CAAC,MAAM,GAAG,GAAG;QAAE,OAAO,IAAI,CAAA;IAEhD,OAAO,KAAK,CAAA;AACd,CAAC;AAED,+CAA+C;AAC/C,wBAAwB;AACxB,+CAA+C;AAElC,QAAA,+BAA+B,GAA0B;IACpE,aAAa,EAAE,GAAG;IAClB,SAAS,EAAE,GAAG;IACd,CAAC,EAAE,GAAG;IACN,cAAc,EAAE,EAAE;CACnB,CAAA;AAED,+CAA+C;AAC/C,wBAAwB;AACxB,+CAA+C;AAE/C;;;;;;;;;;;GAWG;AACH,MAAa,eAAe;IAG1B,YAAY,SAAyC,EAAE;QACrD,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,uCAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IACjE,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,SAAS,CAAC,IAAY,EAAE,QAA2B;QACvD,qBAAqB;QACrB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtC,OAAO,EAAE,CAAA;QACX,CAAC;QAED,uBAAuB;QACvB,MAAM,SAAS,GAAG,IAAA,yCAAkB,EAAC,IAAI,CAAC,CAAA;QAC1C,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,EAAE,CAAA;QACX,CAAC;QAED,wCAAwC;QACxC,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,SAAS,CAAC,CAAA;QAEvD,yDAAyD;QACzD,MAAM,cAAc,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,UAAU,CAAC,CAAA;QAEjE,+BAA+B;QAC/B,MAAM,MAAM,GAAgB,EAAE,CAAA;QAC9B,IAAI,UAAU,GAAG,CAAC,CAAA;QAElB,KAAK,MAAM,KAAK,IAAI,cAAc,EAAE,CAAC;YACnC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YAEjC,kDAAkD
;YAClD,IAAI,SAAS,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAC,cAAc,IAAI,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,CAAC;gBACjF,MAAM,CAAC,IAAI,CAAC;oBACV,IAAI,EAAE,SAAS;oBACf,KAAK,EAAE,UAAU;iBAClB,CAAC,CAAA;gBACF,UAAU,EAAE,CAAA;YACd,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,SAAmB,EAAE,UAAsB;QAChE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAA;QACrC,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAA;QAEzD,MAAM,MAAM,GAAe,EAAE,CAAA;QAC7B,IAAI,YAAY,GAAa,EAAE,CAAA;QAC/B,IAAI,sBAAsB,GAAe,EAAE,CAAA;QAE3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAA;YAC7B,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAE/B,IAAI,CAAC,QAAQ,IAAI,CAAC,SAAS;gBAAE,SAAQ;YAErC,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC9B,sCAAsC;gBACtC,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;gBAC3B,sBAAsB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;YACxC,CAAC;iBAAM,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACrC,gDAAgD;gBAChD,MAAM,cAAc,GAAG,sBAAsB,CAAC,CAAC,CAAC,CAAA;gBAChD,IAAI,CAAC,cAAc;oBAAE,SAAQ;gBAE7B,MAAM,UAAU,GAAG,IAAI,CAAC,gBAAgB,CAAC,cAAc,EAAE,SAAS,CAAC,CAAA;gBAEnE,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,GAAG,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,aAAa,EAAE,CAAC;oBACnE,uBAAuB;oBACvB,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;oBAC3B,sBAAsB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBACxC,CAAC;qBAAM,CAAC;oBACN,kBAAkB;oBAClB,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,CAAA;oBAC9B,YAAY,GAAG,CAAC,QAAQ,CAAC,CAAA;oBACzB,sBAAsB,GAAG,CAAC,SAAS,CAAC,CAAA;gBACtC,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,4EAA4E;gBAC5E,IAAI,YAAY,CAAC,MAAM,IAAI,aAAa,EAAE,CAAC;oBACzC,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,CAAA;oBAC9B,YAAY,GAAG,CAAC,QAAQ,CAAC,CAAA;oBACzB,sBAAsB,GAAG,CAAC,SAAS,CAAC,CAAA;oBACpC,SAAQ;gBACV,CAAC;gBAED,2DAA2D;gBAC3D,MAAM,SAAS,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,sBAAsB,CAAC,CAAA;gBAE1E,IAAI,SAAS,EAAE,CAAC;oBACd,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;oBAC3B,sBAAsB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBACxC,CAAC;qBAAM,CAAC;oBACN,kBAAkB;oBAClB,MAAM,CAAC,IAAI
,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,CAAA;oBAC9B,YAAY,GAAG,CAAC,QAAQ,CAAC,CAAA;oBACzB,sBAAsB,GAAG,CAAC,SAAS,CAAC,CAAA;gBACtC,CAAC;YACH,CAAC;QACH,CAAC;QAED,8BAA8B;QAC9B,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAA;QAC3B,CAAC;QAED,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;;OAGG;IACK,gBAAgB,CAAC,YAAsB,EAAE,eAA2B;QAC1E,gDAAgD;QAChD,MAAM,MAAM,GAAG,IAAI,CAAC,gBAAgB,CAAC,eAAe,CAAC,CAAA;QAErD,0DAA0D;QAC1D,MAAM,MAAM,GAAG,IAAI,CAAC,gBAAgB,CAAC,YAAY,EAAE,eAAe,CAAC,CAAA;QAEnE,8BAA8B;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,MAAM,EAAE,eAAe,CAAC,MAAM,CAAC,CAAA;QAEzE,OAAO,MAAM,GAAG,SAAS,CAAA;IAC3B,CAAC;IAED;;;;;OAKG;IACK,gBAAgB,CAAC,UAAsB;QAC7C,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,GAAG,CAAA;QAErC,uEAAuE;QACvE,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,CAAC,MAAM,GAAG,WAAW,CAAC,CAAA;QAC7D,MAAM,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAA;QAEnD,IAAI,MAAM,GAAG,GAAG,CAAA;QAChB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACrD,MAAM,IAAI,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAA;gBAChC,MAAM,IAAI,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAA;gBAChC,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI;oBAAE,SAAQ;gBAE5B,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAA;gBAC7C,IAAI,GAAG,GAAG,MAAM,EAAE,CAAC;oBACjB,MAAM,GAAG,GAAG,CAAA;gBACd,CAAC;YACH,CAAC;QACH,CAAC;QACD,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,SAAmB,EAAE,eAA2B;QACvE,IAAI,MAAM,GAAG,CAAC,GAAG,CAAA;QACjB,KAAK,MAAM,QAAQ,IAAI,eAAe,EAAE,CAAC;YACvC,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAA;YACtD,IAAI,GAAG,GAAG,MAAM,EAAE,CAAC;gBACjB,MAAM,GAAG,GAAG,CAAA;YACd,CAAC;QACH,CAAC;QACD,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;;OAGG;IACK,kBAAkB,CAAC,MAAc,EAAE,SAAiB;QAC1D,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAA;QAC5C,MAAM,gBAAgB,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,MAAM,GAAG,YAAY,CAAA;QAC9D,OAAO,IAAI,CAAC,GAAG,CAAC,gBAAgB,EAAE,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,CAAA;IAC9D,CAAC;IAED;;OAEG;I
ACK,OAAO,CAAC,CAAS;QACvB,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAC/B,CAAC;IAED;;;OAGG;IACH,gBAAgB,CAAC,IAAc,EAAE,IAAc;QAC7C,OAAO,IAAA,0BAAoB,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;IACzC,CAAC;CACF;AA9MD,0CA8MC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * Split text into sentences using Intl.Segmenter
 *
 * Uses the Unicode Text Segmentation standard (UAX #29) via Intl.Segmenter.
 * This provides multilingual support for sentence boundary detection.
 *
 * Markdown code spans (fenced ``` blocks and inline backtick code) are masked
 * with placeholders before segmentation and restored afterwards, so code is
 * not broken across sentence boundaries (see sentence-splitter.js helpers).
 *
 * Note: Intl.Segmenter may split on abbreviations like "Mr." or "e.g."
 * These edge cases are acceptable for semantic chunking as:
 * 1. Short fragments will be grouped with adjacent sentences by similarity
 * 2. Fragments below minChunkLength are filtered out
 *
 * @param text - The text to split into sentences
 * @returns Array of sentences
 */
export declare function splitIntoSentences(text: string): string[];
//# sourceMappingURL=sentence-splitter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sentence-splitter.d.ts","sourceRoot":"","sources":["../../src/chunker/sentence-splitter.ts"],"names":[],"mappings":"AAoFA;;;;;;;;;;;;;GAaG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAwCzD"}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Sentence Splitter for Semantic Chunking
|
|
3
|
+
// Created: 2025-12-27
|
|
4
|
+
// Purpose: Split text into sentences using Intl.Segmenter (Unicode standard)
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.splitIntoSentences = splitIntoSentences;
|
|
7
|
+
// ============================================
|
|
8
|
+
// Constants
|
|
9
|
+
// ============================================
|
|
10
|
+
/**
 * Placeholder delimiter for fenced code blocks during processing.
 * NUL (\u0000) cannot occur in ordinary text, so the markers are unambiguous.
 */
const CODE_BLOCK_PLACEHOLDER = '\u0000CODE_BLOCK\u0000';
/**
 * Placeholder delimiter for inline code during processing
 */
const INLINE_CODE_PLACEHOLDER = '\u0000INLINE_CODE\u0000';
// ============================================
// Helper Functions
// ============================================
/**
 * Extract fenced (```...```) and inline (`...`) code spans, replacing each
 * with a unique NUL-delimited placeholder so sentence segmentation cannot
 * split inside code.
 *
 * @param text - Raw input text (may contain markdown code spans)
 * @returns The masked text plus the ordered { placeholder, content } pairs
 *          needed to restore the spans afterwards
 */
function extractCodeBlocks(text) {
    const blocks = [];
    let index = 0;
    // Single-pass masking: the replacer callback substitutes each match at its
    // own position (O(n) overall) instead of re-scanning the whole string with
    // a separate replace() per match, and cannot accidentally target an
    // identical snippet occurring earlier in the text.
    const mask = (input, regex, delimiter) => input.replace(regex, (matched) => {
        const placeholder = `${delimiter}${index}${delimiter}`;
        blocks.push({ placeholder, content: matched });
        index++;
        return placeholder;
    });
    // Fenced blocks first so their backticks never register as inline code.
    const withoutFenced = mask(text, /```[\s\S]*?```/g, CODE_BLOCK_PLACEHOLDER);
    const processedText = mask(withoutFenced, /`[^`]+`/g, INLINE_CODE_PLACEHOLDER);
    return { text: processedText, blocks };
}
|
|
48
|
+
/**
 * Restore code blocks from placeholders.
 *
 * Uses a replacer *function* rather than a replacement string: with a plain
 * string, `$` sequences inside the code content ($&, $', $`, $1, ...) are
 * interpreted as special replacement patterns by String.prototype.replace and
 * would corrupt restored code (e.g. shell/regex snippets containing `$&`).
 * The function form inserts the content verbatim.
 *
 * @param sentences - Sentences possibly containing placeholders
 * @param blocks - { placeholder, content } pairs from extractCodeBlocks
 * @returns Sentences with every placeholder swapped back to its code content
 */
function restoreCodeBlocks(sentences, blocks) {
    return sentences.map((sentence) => {
        let restored = sentence;
        for (const block of blocks) {
            // () => block.content prevents special-pattern expansion.
            restored = restored.replace(block.placeholder, () => block.content);
        }
        return restored;
    });
}
|
|
60
|
+
// ============================================
// Intl.Segmenter-based splitting
// ============================================
// Create segmenters for supported languages
// Using 'und' (undetermined) as fallback for general Unicode support.
// A single module-level instance is created once and reused by every call
// to splitIntoSentences().
const segmenter = new Intl.Segmenter('und', { granularity: 'sentence' });
|
|
66
|
+
/**
 * Split text into sentences using Intl.Segmenter
 *
 * Uses the Unicode Text Segmentation standard (UAX #29) via Intl.Segmenter.
 * This provides multilingual support for sentence boundary detection.
 *
 * Note: Intl.Segmenter may split on abbreviations like "Mr." or "e.g."
 * These edge cases are acceptable for semantic chunking as:
 * 1. Short fragments will be grouped with adjacent sentences by similarity
 * 2. Fragments below minChunkLength are filtered out
 *
 * @param text - The text to split into sentences
 * @returns Array of sentences (empty for empty or whitespace-only input)
 */
function splitIntoSentences(text) {
    // Nothing to do for empty or whitespace-only input.
    if (!text || text.trim().length === 0) {
        return [];
    }
    // Mask code spans so the segmenter never splits inside them.
    const { text: maskedText, blocks } = extractCodeBlocks(text);
    // Paragraph boundaries: blank lines, a newline followed by a non-space
    // character, or a newline directly after a placeholder delimiter.
    // biome-ignore lint/suspicious/noControlCharactersInRegex: Intentional use of NULL character as placeholder delimiter
    const paragraphBoundary = /\n{2,}|\n(?=\S)|(?<=\u0000)\n/;
    const collected = [];
    for (const rawParagraph of maskedText.split(paragraphBoundary)) {
        const paragraph = rawParagraph.trim();
        if (!paragraph) {
            continue;
        }
        // A markdown heading is kept whole as a single "sentence".
        if (/^#{1,6}\s/.test(paragraph)) {
            collected.push(paragraph);
            continue;
        }
        // Delegate sentence boundary detection to Intl.Segmenter.
        for (const { segment } of segmenter.segment(paragraph)) {
            const candidate = segment.trim();
            if (candidate) {
                collected.push(candidate);
            }
        }
    }
    // Put the original code spans back, then drop any empty leftovers.
    return restoreCodeBlocks(collected, blocks)
        .map((s) => s.trim())
        .filter((s) => s.length > 0);
}
|
|
114
|
+
//# sourceMappingURL=sentence-splitter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sentence-splitter.js","sourceRoot":"","sources":["../../src/chunker/sentence-splitter.ts"],"names":[],"mappings":";AAAA,0CAA0C;AAC1C,sBAAsB;AACtB,6EAA6E;;AAgG7E,gDAwCC;AAtID,+CAA+C;AAC/C,YAAY;AACZ,+CAA+C;AAE/C;;GAEG;AACH,MAAM,sBAAsB,GAAG,wBAAwB,CAAA;AAEvD;;GAEG;AACH,MAAM,uBAAuB,GAAG,yBAAyB,CAAA;AAWzD,+CAA+C;AAC/C,mBAAmB;AACnB,+CAA+C;AAE/C;;GAEG;AACH,SAAS,iBAAiB,CAAC,IAAY;IACrC,MAAM,MAAM,GAAoB,EAAE,CAAA;IAClC,IAAI,aAAa,GAAG,IAAI,CAAA;IAExB,yCAAyC;IACzC,MAAM,cAAc,GAAG,iBAAiB,CAAA;IACxC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,MAAM,gBAAgB,GAAG,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAA;IACtD,KAAK,MAAM,KAAK,IAAI,gBAAgB,EAAE,CAAC;QACrC,MAAM,WAAW,GAAG,GAAG,sBAAsB,GAAG,KAAK,GAAG,sBAAsB,EAAE,CAAA;QAChF,MAAM,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAA;QAC/C,aAAa,GAAG,aAAa,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,WAAW,CAAC,CAAA;QAC5D,KAAK,EAAE,CAAA;IACT,CAAC;IAED,8BAA8B;IAC9B,MAAM,eAAe,GAAG,UAAU,CAAA;IAClC,MAAM,aAAa,GAAG,aAAa,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAA;IAC7D,KAAK,MAAM,KAAK,IAAI,aAAa,EAAE,CAAC;QAClC,MAAM,WAAW,GAAG,GAAG,uBAAuB,GAAG,KAAK,GAAG,uBAAuB,EAAE,CAAA;QAClF,MAAM,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAA;QAC/C,aAAa,GAAG,aAAa,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,WAAW,CAAC,CAAA;QAC5D,KAAK,EAAE,CAAA;IACT,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,MAAM,EAAE,CAAA;AACxC,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,SAAmB,EAAE,MAAuB;IACrE,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE;QAChC,IAAI,QAAQ,GAAG,QAAQ,CAAA;QACvB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,CAAC,WAAW,EAAE,KAAK,CAAC,OAAO,CAAC,CAAA;QAC/D,CAAC;QACD,OAAO,QAAQ,CAAA;IACjB,CAAC,CAAC,CAAA;AACJ,CAAC;AAED,+CAA+C;AAC/C,iCAAiC;AACjC,+CAA+C;AAE/C,4CAA4C;AAC5C,qEAAqE;AACrE,MAAM,SAAS,GAAG,IAAI,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;AAExE;;;;;;;;;;;;;GAaG;AACH,SAAgB,kBAAkB,CAAC,IAAY;IAC7C,qBAAqB;IACrB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,EAAE,CAAA;IACX,CAAC;IAED,q
DAAqD;IACrD,MAAM,EAAE,IAAI,EAAE,aAAa,EAAE,MAAM,EAAE,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAA;IAE/D,sCAAsC;IACtC,sHAAsH;IACtH,MAAM,UAAU,GAAG,aAAa,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAA;IAEvE,MAAM,SAAS,GAAa,EAAE,CAAA;IAE9B,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,gBAAgB,GAAG,SAAS,CAAC,IAAI,EAAE,CAAA;QACzC,IAAI,CAAC,gBAAgB;YAAE,SAAQ;QAE/B,8DAA8D;QAC9D,IAAI,WAAW,CAAC,IAAI,CAAC,gBAAgB,CAAC,EAAE,CAAC;YACvC,SAAS,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAA;YAChC,SAAQ;QACV,CAAC;QAED,4CAA4C;QAC5C,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAA;QACpD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE,CAAA;YACtC,IAAI,OAAO,EAAE,CAAC;gBACZ,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YACzB,CAAC;QACH,CAAC;IACH,CAAC;IAED,sBAAsB;IACtB,MAAM,iBAAiB,GAAG,iBAAiB,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;IAE9D,kCAAkC;IAClC,OAAO,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAC3E,CAAC"}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
export { EmbeddingError } from '../errors/index.js';
/**
 * Embedder configuration
 */
export interface EmbedderConfig {
    /** HuggingFace model path */
    modelPath: string;
    /** Number of texts embedded per batch in embedBatch() */
    batchSize: number;
    /** Model cache directory */
    cacheDir: string;
}
/**
 * Embedding generation class using Transformers.js
 *
 * Responsibilities:
 * - Generate embedding vectors (dimension depends on model)
 * - Transformers.js wrapper
 * - Batch processing (batch size taken from EmbedderConfig.batchSize)
 */
export declare class Embedder {
    private model;
    private initPromise;
    private readonly config;
    constructor(config: EmbedderConfig);
    /**
     * Get the model name/path
     */
    getModelName(): string;
    /**
     * Initialize Transformers.js model
     */
    initialize(): Promise<void>;
    /**
     * Ensure model is initialized (lazy initialization)
     * This method is called automatically by embed() and embedBatch()
     */
    private ensureInitialized;
    /**
     * Convert single text to embedding vector
     *
     * @param text - Input text to embed
     * @returns Embedding vector (dimension depends on model)
     */
    embed(text: string): Promise<number[]>;
    /**
     * Convert multiple texts to embedding vectors with batch processing
     *
     * @param texts - Array of texts
     * @param signal - Optional AbortSignal for cancellation support
     * @returns Array of embedding vectors (dimension depends on model)
     */
    embedBatch(texts: string[], signal?: AbortSignal): Promise<number[][]>;
}
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/embedder/index.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAA;AAMnD;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,6BAA6B;IAC7B,SAAS,EAAE,MAAM,CAAA;IACjB,iBAAiB;IACjB,SAAS,EAAE,MAAM,CAAA;IACjB,4BAA4B;IAC5B,QAAQ,EAAE,MAAM,CAAA;CACjB;AAMD;;;;;;;GAOG;AACH,qBAAa,QAAQ;IAEnB,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,WAAW,CAA6B;IAChD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAgB;gBAE3B,MAAM,EAAE,cAAc;IAIlC;;OAEG;IACH,YAAY,IAAI,MAAM;IAItB;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAuBjC;;;OAGG;YACW,iBAAiB;IA+B/B;;;;;OAKG;IACG,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAiC5C;;;;;;OAMG;IACG,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,MAAM,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CAmC7E"}
|