@dakshp1234/langchain-textsplitters 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/semantic_text_splitter.cjs +43 -1
- package/dist/semantic_text_splitter.cjs.map +1 -1
- package/dist/semantic_text_splitter.d.cts +53 -0
- package/dist/semantic_text_splitter.d.cts.map +1 -1
- package/dist/semantic_text_splitter.d.ts +53 -0
- package/dist/semantic_text_splitter.d.ts.map +1 -1
- package/dist/semantic_text_splitter.js +43 -1
- package/dist/semantic_text_splitter.js.map +1 -1
- package/package.json +1 -1
|
@@ -2,7 +2,7 @@ let _langchain_core_documents = require("@langchain/core/documents");
|
|
|
2
2
|
//#region src/semantic_text_splitter.ts
|
|
3
3
|
/**
|
|
4
4
|
* Experimental semantic text splitter based on semantic similarity.
|
|
5
|
-
*
|
|
5
|
+
*
|
|
6
6
|
* Inspired by Greg Kamradt's semantic chunking approach:
|
|
7
7
|
* https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
|
|
8
8
|
*/
|
|
@@ -24,6 +24,12 @@ function cosineSimilarity(a, b) {
|
|
|
24
24
|
if (magnitudeA === 0 || magnitudeB === 0) return 0;
|
|
25
25
|
return dotProduct / (magnitudeA * magnitudeB);
|
|
26
26
|
}
|
|
27
|
+
/**
|
|
28
|
+
* Calculates the cosine distance (1 - cosine similarity) between sequential sentences.
|
|
29
|
+
*
|
|
30
|
+
* @param sentences - An array of sentences that already contain generated embeddings.
|
|
31
|
+
* @returns A tuple where the first element is an array of distances between adjacent sentences, and the second is the original array of sentences.
|
|
32
|
+
*/
|
|
27
33
|
function calculateCosineDistances(sentences) {
|
|
28
34
|
const distances = [];
|
|
29
35
|
for (let i = 0; i < sentences.length - 1; i++) {
|
|
@@ -40,6 +46,10 @@ const BREAKPOINT_DEFAULTS = {
|
|
|
40
46
|
interquartile: 1.5,
|
|
41
47
|
gradient: 95
|
|
42
48
|
};
|
|
49
|
+
/**
|
|
50
|
+
* A Document Transformer that splits texts by comparing the semantic similarity of sequential sentences.
|
|
51
|
+
* It ensures sentences with high contextual relationships remain in the same chunk.
|
|
52
|
+
*/
|
|
43
53
|
var SemanticTextSplitter = class extends _langchain_core_documents.BaseDocumentTransformer {
|
|
44
54
|
embeddings;
|
|
45
55
|
bufferSize;
|
|
@@ -49,6 +59,10 @@ var SemanticTextSplitter = class extends _langchain_core_documents.BaseDocumentT
|
|
|
49
59
|
numberOfChunks;
|
|
50
60
|
sentenceSplitRegex;
|
|
51
61
|
minChunkSize;
|
|
62
|
+
/**
|
|
63
|
+
* Constructs a new SemanticTextSplitter.
|
|
64
|
+
* @param params - The configuration options initializing the text splitter.
|
|
65
|
+
*/
|
|
52
66
|
constructor(params) {
|
|
53
67
|
super();
|
|
54
68
|
this.embeddings = params.embeddings;
|
|
@@ -114,6 +128,12 @@ var SemanticTextSplitter = class extends _langchain_core_documents.BaseDocumentT
|
|
|
114
128
|
getSingleSentencesList(text) {
|
|
115
129
|
return text.split(new RegExp(this.sentenceSplitRegex)).filter((sentence) => sentence.trim().length > 0);
|
|
116
130
|
}
|
|
131
|
+
/**
|
|
132
|
+
* Core method to process a raw string of text and turn it into an array of chunked strings based on semantic proximity.
|
|
133
|
+
*
|
|
134
|
+
* @param text - The raw string input to be chunked.
|
|
135
|
+
* @returns An array of string chunks derived from the original text snippet.
|
|
136
|
+
*/
|
|
117
137
|
async splitText(text) {
|
|
118
138
|
const singleSentencesList = this.getSingleSentencesList(text);
|
|
119
139
|
if (singleSentencesList.length === 1) return singleSentencesList;
|
|
@@ -144,6 +164,14 @@ var SemanticTextSplitter = class extends _langchain_core_documents.BaseDocumentT
|
|
|
144
164
|
}
|
|
145
165
|
return chunks;
|
|
146
166
|
}
|
|
167
|
+
/**
|
|
168
|
+
* Takes raw strings and corresponding optional metadata, sending them through the underlying
|
|
169
|
+
* semantic text splitting process, and forming standard LangChain `Document` objects.
|
|
170
|
+
*
|
|
171
|
+
* @param texts - Array of strings spanning the content to separate out.
|
|
172
|
+
* @param metadatas - Optional array of metadata mappings that align 1:1 with texts.
|
|
173
|
+
* @returns An array of freshly constructed LangChain Document instances containing the split text chunks.
|
|
174
|
+
*/
|
|
147
175
|
async createDocuments(texts, metadatas = []) {
|
|
148
176
|
const _metadatas = metadatas.length > 0 ? metadatas : texts.map(() => ({}));
|
|
149
177
|
const documents = [];
|
|
@@ -163,11 +191,25 @@ var SemanticTextSplitter = class extends _langchain_core_documents.BaseDocumentT
|
|
|
163
191
|
}
|
|
164
192
|
return documents;
|
|
165
193
|
}
|
|
194
|
+
/**
|
|
195
|
+
* Convenience method to take in standardized Documents, extract their text, split the
|
|
196
|
+
* text semantically, and reconstruct them into smaller Documents.
|
|
197
|
+
*
|
|
198
|
+
* @param documents - The original Documents to break down.
|
|
199
|
+
* @returns Semantically chunked Documents inherited from the originals.
|
|
200
|
+
*/
|
|
166
201
|
async splitDocuments(documents) {
|
|
167
202
|
const texts = documents.map((doc) => doc.pageContent);
|
|
168
203
|
const metadatas = documents.map((doc) => doc.metadata);
|
|
169
204
|
return this.createDocuments(texts, metadatas);
|
|
170
205
|
}
|
|
206
|
+
/**
|
|
207
|
+
* Implements the base BaseDocumentTransformer requirement to process documents.
|
|
208
|
+
* Equivalent to `splitDocuments()`.
|
|
209
|
+
*
|
|
210
|
+
* @param documents - The original Documents to transform.
|
|
211
|
+
* @returns Transformed, semantically split Documents.
|
|
212
|
+
*/
|
|
171
213
|
async transformDocuments(documents) {
|
|
172
214
|
return this.splitDocuments(documents);
|
|
173
215
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"semantic_text_splitter.cjs","names":["BaseDocumentTransformer","Document"],"sources":["../src/semantic_text_splitter.ts"],"sourcesContent":["/**\r\n * Experimental semantic text splitter based on semantic similarity.\r\n * \r\n * Inspired by Greg Kamradt's semantic chunking approach:\r\n * https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb\r\n */\r\nimport { Document, BaseDocumentTransformer } from \"@langchain/core/documents\";\r\nimport { Embeddings } from \"@langchain/core/embeddings\";\r\n\r\ninterface SentenceDict {\r\n sentence: string;\r\n index: number;\r\n combinedSentence?: string;\r\n combinedSentenceEmbedding?: number[];\r\n}\r\n\r\nfunction combineSentences(sentences: SentenceDict[], bufferSize: number = 1): SentenceDict[] {\r\n // Go through each sentence dict\r\n for (let i = 0; i < sentences.length; i++) {\r\n const sentence = sentences[i];\r\n \r\n // Create the combined sentence by combining the sentences within the buffer\r\n let combinedSentence = \"\";\r\n \r\n // Add sentences before the current one\r\n for (let j = Math.max(0, i - bufferSize); j < i; j++) {\r\n combinedSentence += sentences[j].sentence + \" \";\r\n }\r\n \r\n // Add the current sentence\r\n combinedSentence += sentence.sentence;\r\n \r\n // Add sentences after the current one\r\n for (let j = i + 1; j <= Math.min(sentences.length - 1, i + bufferSize); j++) {\r\n combinedSentence += \" \" + sentences[j].sentence;\r\n }\r\n \r\n // Assign the combined sentence to the dict\r\n sentence.combinedSentence = combinedSentence.trim();\r\n }\r\n \r\n return sentences;\r\n}\r\n\r\nfunction cosineSimilarity(a: number[], b: number[]): number {\r\n const dotProduct = a.reduce((sum, ai, i) => sum + ai * b[i], 0);\r\n const magnitudeA = Math.sqrt(a.reduce((sum, ai) => sum + ai * ai, 0));\r\n const magnitudeB = Math.sqrt(b.reduce((sum, bi) => sum + bi * bi, 0));\r\n \r\n if (magnitudeA === 0 || magnitudeB === 0) {\r\n return 0;\r\n }\r\n \r\n return dotProduct / (magnitudeA * magnitudeB);\r\n}\r\n\r\nexport function calculateCosineDistances(sentences: SentenceDict[]): [number[], SentenceDict[]] {\r\n const distances: number[] = [];\r\n \r\n for (let i = 0; i < sentences.length - 1; i++) {\r\n const embeddingCurrent = sentences[i].combinedSentenceEmbedding!;\r\n const embeddingNext = sentences[i + 1].combinedSentenceEmbedding!;\r\n \r\n // Calculate cosine similarity\r\n const similarity = cosineSimilarity(embeddingCurrent, embeddingNext);\r\n \r\n // Convert to cosine distance (1 - cosine similarity)\r\n const distance = 1 - similarity;\r\n distances.push(distance);\r\n }\r\n \r\n return [distances, sentences];\r\n}\r\n\r\ntype BreakpointThresholdType = \"percentile\" | \"standard_deviation\" | \"interquartile\" | \"gradient\";\r\n\r\nconst BREAKPOINT_DEFAULTS: Record<BreakpointThresholdType, number> = {\r\n percentile: 95,\r\n standard_deviation: 3,\r\n interquartile: 1.5,\r\n gradient: 95,\r\n};\r\n\r\nexport interface SemanticTextSplitterParams {\r\n embeddings: Embeddings;\r\n bufferSize?: number;\r\n addStartIndex?: boolean;\r\n breakpointThresholdType?: BreakpointThresholdType;\r\n breakpointThresholdAmount?: number;\r\n numberOfChunks?: number;\r\n sentenceSplitRegex?: string;\r\n minChunkSize?: number;\r\n}\r\n\r\nexport class SemanticTextSplitter extends BaseDocumentTransformer {\r\n private embeddings: Embeddings;\r\n private bufferSize: number;\r\n private addStartIndex: boolean;\r\n private breakpointThresholdType: BreakpointThresholdType;\r\n private breakpointThresholdAmount: number;\r\n private numberOfChunks?: number;\r\n private sentenceSplitRegex: string;\r\n private minChunkSize?: number;\r\n\r\n constructor(params: SemanticTextSplitterParams) {\r\n super();\r\n this.embeddings = params.embeddings;\r\n this.bufferSize = params.bufferSize ?? 1;\r\n this.addStartIndex = params.addStartIndex ?? false;\r\n this.breakpointThresholdType = params.breakpointThresholdType ?? \"percentile\";\r\n this.numberOfChunks = params.numberOfChunks;\r\n this.sentenceSplitRegex = params.sentenceSplitRegex ?? \"(?<=[.?!])\\\\s+\";\r\n this.minChunkSize = params.minChunkSize;\r\n \r\n if (params.breakpointThresholdAmount === undefined) {\r\n this.breakpointThresholdAmount = BREAKPOINT_DEFAULTS[this.breakpointThresholdType];\r\n } else {\r\n this.breakpointThresholdAmount = params.breakpointThresholdAmount;\r\n }\r\n }\r\n\r\n private calculateBreakpointThreshold(distances: number[]): [number, number[]] {\r\n if (this.breakpointThresholdType === \"percentile\") {\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const index = Math.ceil((this.breakpointThresholdAmount / 100) * sorted.length) - 1;\r\n return [sorted[Math.max(0, index)], distances];\r\n } else if (this.breakpointThresholdType === \"standard_deviation\") {\r\n const mean = distances.reduce((sum, d) => sum + d, 0) / distances.length;\r\n const variance = distances.reduce((sum, d) => sum + Math.pow(d - mean, 2), 0) / distances.length;\r\n const stdDev = Math.sqrt(variance);\r\n return [mean + this.breakpointThresholdAmount * stdDev, distances];\r\n } else if (this.breakpointThresholdType === \"interquartile\") {\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const q1Index = Math.floor(0.25 * sorted.length);\r\n const q3Index = Math.floor(0.75 * sorted.length);\r\n const q1 = sorted[q1Index];\r\n const q3 = sorted[q3Index];\r\n const iqr = q3 - q1;\r\n const mean = distances.reduce((sum, d) => sum + d, 0) / distances.length;\r\n return [mean + this.breakpointThresholdAmount * iqr, distances];\r\n } else if (this.breakpointThresholdType === \"gradient\") {\r\n const distanceGradient: number[] = [];\r\n for (let i = 0; i < distances.length - 1; i++) {\r\n distanceGradient.push(distances[i + 1] - distances[i]);\r\n }\r\n const sortedGradient = [...distanceGradient].sort((a, b) => a - b);\r\n const index = Math.ceil((this.breakpointThresholdAmount / 100) * sortedGradient.length) - 1;\r\n return [sortedGradient[Math.max(0, index)], distanceGradient];\r\n } else {\r\n throw new Error(`Got unexpected breakpointThresholdType: ${this.breakpointThresholdType}`);\r\n }\r\n }\r\n\r\n private thresholdFromClusters(distances: number[]): number {\r\n if (this.numberOfChunks === undefined) {\r\n throw new Error(\"This should never be called if numberOfChunks is undefined.\");\r\n }\r\n \r\n const x1 = distances.length;\r\n const y1 = 0.0;\r\n const x2 = 1.0;\r\n const y2 = 100.0;\r\n \r\n const x = Math.max(Math.min(this.numberOfChunks, x1), x2);\r\n \r\n let y: number;\r\n if (x2 === x1) {\r\n y = y2;\r\n } else {\r\n y = y1 + ((y2 - y1) / (x2 - x1)) * (x - x1);\r\n }\r\n \r\n y = Math.min(Math.max(y, 0), 100);\r\n \r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const index = Math.ceil((y / 100) * sorted.length) - 1;\r\n return sorted[Math.max(0, index)];\r\n }\r\n\r\n private async calculateSentenceDistances(singleSentencesList: string[]): Promise<[number[], SentenceDict[]]> {\r\n const sentences: SentenceDict[] = singleSentencesList.map((sentence, index) => ({\r\n sentence,\r\n index,\r\n }));\r\n\r\n const sentencesWithCombined = combineSentences(sentences, this.bufferSize);\r\n \r\n const combinedSentences = sentencesWithCombined.map(s => s.combinedSentence!);\r\n const embeddings = await this.embeddings.embedDocuments(combinedSentences);\r\n \r\n for (let i = 0; i < sentencesWithCombined.length; i++) {\r\n sentencesWithCombined[i].combinedSentenceEmbedding = embeddings[i];\r\n }\r\n\r\n return calculateCosineDistances(sentencesWithCombined);\r\n }\r\n\r\n private getSingleSentencesList(text: string): string[] {\r\n return text.split(new RegExp(this.sentenceSplitRegex)).filter(sentence => sentence.trim().length > 0);\r\n }\r\n\r\n async splitText(text: string): Promise<string[]> {\r\n const singleSentencesList = this.getSingleSentencesList(text);\r\n\r\n if (singleSentencesList.length === 1) {\r\n return singleSentencesList;\r\n }\r\n\r\n if (this.breakpointThresholdType === \"gradient\" && singleSentencesList.length === 2) {\r\n return singleSentencesList;\r\n }\r\n\r\n const [distances, sentences] = await this.calculateSentenceDistances(singleSentencesList);\r\n\r\n let breakpointDistanceThreshold: number;\r\n let breakpointArray: number[];\r\n\r\n if (this.numberOfChunks !== undefined) {\r\n breakpointDistanceThreshold = this.thresholdFromClusters(distances);\r\n breakpointArray = distances;\r\n } else {\r\n [breakpointDistanceThreshold, breakpointArray] = this.calculateBreakpointThreshold(distances);\r\n }\r\n\r\n const indicesAboveThresh = breakpointArray\r\n .map((x, i) => ({ value: x, index: i }))\r\n .filter(({ value }) => value > breakpointDistanceThreshold)\r\n .map(({ index }) => index);\r\n\r\n const chunks: string[] = [];\r\n let startIndex = 0;\r\n\r\n for (const index of indicesAboveThresh) {\r\n const endIndex = index;\r\n const group = sentences.slice(startIndex, endIndex + 1);\r\n const combinedText = group.map((d: SentenceDict) => d.sentence).join(\" \");\r\n\r\n if (this.minChunkSize !== undefined && combinedText.length < this.minChunkSize) {\r\n continue;\r\n }\r\n \r\n chunks.push(combinedText);\r\n startIndex = index + 1;\r\n }\r\n\r\n if (startIndex < sentences.length) {\r\n const combinedText = sentences.slice(startIndex).map((d: SentenceDict) => d.sentence).join(\" \");\r\n chunks.push(combinedText);\r\n }\r\n\r\n return chunks;\r\n }\r\n\r\n async createDocuments(\r\n texts: string[],\r\n metadatas: Record<string, any>[] = []\r\n ): Promise<Document[]> {\r\n const _metadatas = metadatas.length > 0 ? metadatas : texts.map(() => (<Record<string, any>>{}));\r\n const documents: Document[] = [];\r\n \r\n for (let i = 0; i < texts.length; i++) {\r\n const text = texts[i];\r\n let startIndex = 0;\r\n \r\n for (const chunk of await this.splitText(text)) {\r\n const metadata = { ..._metadatas[i] };\r\n if (this.addStartIndex) {\r\n metadata.start_index = startIndex;\r\n }\r\n \r\n const newDoc = new Document({\r\n pageContent: chunk,\r\n metadata,\r\n });\r\n \r\n documents.push(newDoc);\r\n startIndex += chunk.length;\r\n }\r\n }\r\n \r\n return documents;\r\n }\r\n\r\n async splitDocuments(documents: Document[]): Promise<Document[]> {\r\n const texts = documents.map(doc => doc.pageContent);\r\n const metadatas = documents.map(doc => doc.metadata);\r\n return this.createDocuments(texts, metadatas);\r\n }\r\n\r\n async transformDocuments(documents: Document[]): Promise<Document[]> {\r\n return this.splitDocuments(documents);\r\n }\r\n}"],"mappings":";;;;;;;;AAgBA,SAAS,iBAAiB,WAA2B,aAAqB,GAAmB;AAE3F,MAAK,IAAI,IAAI,GAAG,IAAI,UAAU,QAAQ,KAAK;EACzC,MAAM,WAAW,UAAU;EAG3B,IAAI,mBAAmB;AAGvB,OAAK,IAAI,IAAI,KAAK,IAAI,GAAG,IAAI,WAAW,EAAE,IAAI,GAAG,IAC/C,qBAAoB,UAAU,GAAG,WAAW;AAI9C,sBAAoB,SAAS;AAG7B,OAAK,IAAI,IAAI,IAAI,GAAG,KAAK,KAAK,IAAI,UAAU,SAAS,GAAG,IAAI,WAAW,EAAE,IACvE,qBAAoB,MAAM,UAAU,GAAG;AAIzC,WAAS,mBAAmB,iBAAiB,MAAM;;AAGrD,QAAO;;AAGT,SAAS,iBAAiB,GAAa,GAAqB;CAC1D,MAAM,aAAa,EAAE,QAAQ,KAAK,IAAI,MAAM,MAAM,KAAK,EAAE,IAAI,EAAE;CAC/D,MAAM,aAAa,KAAK,KAAK,EAAE,QAAQ,KAAK,OAAO,MAAM,KAAK,IAAI,EAAE,CAAC;CACrE,MAAM,aAAa,KAAK,KAAK,EAAE,QAAQ,KAAK,OAAO,MAAM,KAAK,IAAI,EAAE,CAAC;AAErE,KAAI,eAAe,KAAK,eAAe,EACrC,QAAO;AAGT,QAAO,cAAc,aAAa;;AAGpC,SAAgB,yBAAyB,WAAuD;CAC9F,MAAM,YAAsB,EAAE;AAE9B,MAAK,IAAI,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,KAAK;EAC7C,MAAM,mBAAmB,UAAU,GAAG;EACtC,MAAM,gBAAgB,UAAU,IAAI,GAAG;EAMvC,MAAM,WAAW,IAHE,iBAAiB,kBAAkB,cAAc;AAIpE,YAAU,KAAK,SAAS;;AAG1B,QAAO,CAAC,WAAW,UAAU;;AAK/B,MAAM,sBAA+D;CACnE,YAAY;CACZ,oBAAoB;CACpB,eAAe;CACf,UAAU;CACX;AAaD,IAAa,uBAAb,cAA0CA,0BAAAA,wBAAwB;CAChE;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CAEA,YAAY,QAAoC;AAC9C,SAAO;AACP,OAAK,aAAa,OAAO;AACzB,OAAK,aAAa,OAAO,cAAc;AACvC,OAAK,gBAAgB,OAAO,iBAAiB;AAC7C,OAAK,0BAA0B,OAAO,2BAA2B;AACjE,OAAK,iBAAiB,OAAO;AAC7B,OAAK,qBAAqB,OAAO,sBAAsB;AACvD,OAAK,eAAe,OAAO;AAE3B,MAAI,OAAO,8BAA8B,KAAA,EACvC,MAAK,4BAA4B,oBAAoB,KAAK;MAE1D,MAAK,4BAA4B,OAAO;;CAI5C,6BAAqC,WAAyC;AAC5E,MAAI,KAAK,4BAA4B,cAAc;GACjD,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GACnD,MAAM,QAAQ,KAAK,KAAM,KAAK,4BAA4B,MAAO,OAAO,OAAO,GAAG;AAClF,UAAO,CAAC,OAAO,KAAK,IAAI,GAAG,MAAM,GAAG,UAAU;aACrC,KAAK,4BAA4B,sBAAsB;GAChE,MAAM,OAAO,UAAU,QAAQ,KAAK,MAAM,MAAM,GAAG,EAAE,GAAG,UAAU;GAClE,MAAM,WAAW,UAAU,QAAQ,KAAK,MAAM,MAAM,KAAK,IAAI,IAAI,MAAM,EAAE,EAAE,EAAE,GAAG,UAAU;GAC1F,MAAM,SAAS,KAAK,KAAK,SAAS;AAClC,UAAO,CAAC,OAAO,KAAK,4BAA4B,QAAQ,UAAU;aACzD,KAAK,4BAA4B,iBAAiB;GAC3D,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GACnD,MAAM,UAAU,KAAK,MAAM,MAAO,OAAO,OAAO;GAChD,MAAM,UAAU,KAAK,MAAM,MAAO,OAAO,OAAO;GAChD,MAAM,KAAK,OAAO;GAElB,MAAM,MADK,OAAO,WACD;AAEjB,UAAO,CADM,UAAU,QAAQ,KAAK,MAAM,MAAM,GAAG,EAAE,GAAG,UAAU,SACnD,KAAK,4BAA4B,KAAK,UAAU;aACtD,KAAK,4BAA4B,YAAY;GACtD,MAAM,mBAA6B,EAAE;AACrC,QAAK,IAAI,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,IACxC,kBAAiB,KAAK,UAAU,IAAI,KAAK,UAAU,GAAG;GAExD,MAAM,iBAAiB,CAAC,GAAG,iBAAiB,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GAClE,MAAM,QAAQ,KAAK,KAAM,KAAK,4BAA4B,MAAO,eAAe,OAAO,GAAG;AAC1F,UAAO,CAAC,eAAe,KAAK,IAAI,GAAG,MAAM,GAAG,iBAAiB;QAE7D,OAAM,IAAI,MAAM,2CAA2C,KAAK,0BAA0B;;CAI9F,sBAA8B,WAA6B;AACzD,MAAI,KAAK,mBAAmB,KAAA,EAC1B,OAAM,IAAI,MAAM,8DAA8D;EAGhF,MAAM,KAAK,UAAU;EACrB,MAAM,KAAK;EACX,MAAM,KAAK;EACX,MAAM,KAAK;EAEX,MAAM,IAAI,KAAK,IAAI,KAAK,IAAI,KAAK,gBAAgB,GAAG,EAAE,GAAG;EAEzD,IAAI;AACJ,MAAI,OAAO,GACT,KAAI;MAEJ,KAAI,MAAO,KAAK,OAAO,KAAK,OAAQ,IAAI;AAG1C,MAAI,KAAK,IAAI,KAAK,IAAI,GAAG,EAAE,EAAE,IAAI;EAEjC,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;EACnD,MAAM,QAAQ,KAAK,KAAM,IAAI,MAAO,OAAO,OAAO,GAAG;AACrD,SAAO,OAAO,KAAK,IAAI,GAAG,MAAM;;CAGlC,MAAc,2BAA2B,qBAAoE;EAM3G,MAAM,wBAAwB,iBALI,oBAAoB,KAAK,UAAU,WAAW;GAC9E;GACA;GACD,EAAE,EAEuD,KAAK,WAAW;EAE1E,MAAM,oBAAoB,sBAAsB,KAAI,MAAK,EAAE,iBAAkB;EAC7E,MAAM,aAAa,MAAM,KAAK,WAAW,eAAe,kBAAkB;AAE1E,OAAK,IAAI,IAAI,GAAG,IAAI,sBAAsB,QAAQ,IAChD,uBAAsB,GAAG,4BAA4B,WAAW;AAGlE,SAAO,yBAAyB,sBAAsB;;CAGxD,uBAA+B,MAAwB;AACrD,SAAO,KAAK,MAAM,IAAI,OAAO,KAAK,mBAAmB,CAAC,CAAC,QAAO,aAAY,SAAS,MAAM,CAAC,SAAS,EAAE;;CAGvG,MAAM,UAAU,MAAiC;EAC/C,MAAM,sBAAsB,KAAK,uBAAuB,KAAK;AAE7D,MAAI,oBAAoB,WAAW,EACjC,QAAO;AAGT,MAAI,KAAK,4BAA4B,cAAc,oBAAoB,WAAW,EAChF,QAAO;EAGT,MAAM,CAAC,WAAW,aAAa,MAAM,KAAK,2BAA2B,oBAAoB;EAEzF,IAAI;EACJ,IAAI;AAEJ,MAAI,KAAK,mBAAmB,KAAA,GAAW;AACrC,iCAA8B,KAAK,sBAAsB,UAAU;AACnE,qBAAkB;QAElB,EAAC,6BAA6B,mBAAmB,KAAK,6BAA6B,UAAU;EAG/F,MAAM,qBAAqB,gBACxB,KAAK,GAAG,OAAO;GAAE,OAAO;GAAG,OAAO;GAAG,EAAE,CACvC,QAAQ,EAAE,YAAY,QAAQ,4BAA4B,CAC1D,KAAK,EAAE,YAAY,MAAM;EAE5B,MAAM,SAAmB,EAAE;EAC3B,IAAI,aAAa;AAEjB,OAAK,MAAM,SAAS,oBAAoB;GACtC,MAAM,WAAW;GAEjB,MAAM,eADQ,UAAU,MAAM,YAAY,WAAW,EAAE,CAC5B,KAAK,MAAoB,EAAE,SAAS,CAAC,KAAK,IAAI;AAEzE,OAAI,KAAK,iBAAiB,KAAA,KAAa,aAAa,SAAS,KAAK,aAChE;AAGF,UAAO,KAAK,aAAa;AACzB,gBAAa,QAAQ;;AAGvB,MAAI,aAAa,UAAU,QAAQ;GACjC,MAAM,eAAe,UAAU,MAAM,WAAW,CAAC,KAAK,MAAoB,EAAE,SAAS,CAAC,KAAK,IAAI;AAC/F,UAAO,KAAK,aAAa;;AAG3B,SAAO;;CAGT,MAAM,gBACJ,OACA,YAAmC,EAAE,EAChB;EACrB,MAAM,aAAa,UAAU,SAAS,IAAI,YAAY,MAAM,WAAgC,EAAE,EAAE;EAChG,MAAM,YAAwB,EAAE;AAEhC,OAAK,IAAI,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;GACrC,MAAM,OAAO,MAAM;GACnB,IAAI,aAAa;AAEjB,QAAK,MAAM,SAAS,MAAM,KAAK,UAAU,KAAK,EAAE;IAC9C,MAAM,WAAW,EAAE,GAAG,WAAW,IAAI;AACrC,QAAI,KAAK,cACP,UAAS,cAAc;IAGzB,MAAM,SAAS,IAAIC,0BAAAA,SAAS;KAC1B,aAAa;KACb;KACD,CAAC;AAEF,cAAU,KAAK,OAAO;AACtB,kBAAc,MAAM;;;AAIxB,SAAO;;CAGT,MAAM,eAAe,WAA4C;EAC/D,MAAM,QAAQ,UAAU,KAAI,QAAO,IAAI,YAAY;EACnD,MAAM,YAAY,UAAU,KAAI,QAAO,IAAI,SAAS;AACpD,SAAO,KAAK,gBAAgB,OAAO,UAAU;;CAG/C,MAAM,mBAAmB,WAA4C;AACnE,SAAO,KAAK,eAAe,UAAU"}
|
|
1
|
+
{"version":3,"file":"semantic_text_splitter.cjs","names":["BaseDocumentTransformer","Document"],"sources":["../src/semantic_text_splitter.ts"],"sourcesContent":["/**\r\n * Experimental semantic text splitter based on semantic similarity.\r\n *\r\n * Inspired by Greg Kamradt's semantic chunking approach:\r\n * https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb\r\n */\r\nimport { Document, BaseDocumentTransformer } from \"@langchain/core/documents\";\r\nimport { Embeddings } from \"@langchain/core/embeddings\";\r\n\r\ninterface SentenceDict {\r\n sentence: string;\r\n index: number;\r\n combinedSentence?: string;\r\n combinedSentenceEmbedding?: number[];\r\n}\r\n\r\nfunction combineSentences(\r\n sentences: SentenceDict[],\r\n bufferSize: number = 1,\r\n): SentenceDict[] {\r\n // Go through each sentence dict\r\n for (let i = 0; i < sentences.length; i++) {\r\n const sentence = sentences[i];\r\n\r\n // Create the combined sentence by combining the sentences within the buffer\r\n let combinedSentence = \"\";\r\n\r\n // Add sentences before the current one\r\n for (let j = Math.max(0, i - bufferSize); j < i; j++) {\r\n combinedSentence += sentences[j].sentence + \" \";\r\n }\r\n\r\n // Add the current sentence\r\n combinedSentence += sentence.sentence;\r\n\r\n // Add sentences after the current one\r\n for (\r\n let j = i + 1;\r\n j <= Math.min(sentences.length - 1, i + bufferSize);\r\n j++\r\n ) {\r\n combinedSentence += \" \" + sentences[j].sentence;\r\n }\r\n\r\n // Assign the combined sentence to the dict\r\n sentence.combinedSentence = combinedSentence.trim();\r\n }\r\n\r\n return sentences;\r\n}\r\n\r\nfunction cosineSimilarity(a: number[], b: number[]): number {\r\n const dotProduct = a.reduce((sum, ai, i) => sum + ai * b[i], 0);\r\n const magnitudeA = Math.sqrt(a.reduce((sum, ai) => sum + ai * ai, 0));\r\n const magnitudeB = Math.sqrt(b.reduce((sum, bi) => sum + bi * bi, 0));\r\n\r\n if (magnitudeA === 0 || magnitudeB === 0) {\r\n return 0;\r\n }\r\n\r\n return dotProduct / (magnitudeA * magnitudeB);\r\n}\r\n\r\n/**\r\n * Calculates the cosine distance (1 - cosine similarity) between sequential sentences.\r\n *\r\n * @param sentences - An array of sentences that already contain generated embeddings.\r\n * @returns A tuple where the first element is an array of distances between adjacent sentences, and the second is the original array of sentences.\r\n */\r\nexport function calculateCosineDistances(\r\n sentences: SentenceDict[],\r\n): [number[], SentenceDict[]] {\r\n const distances: number[] = [];\r\n\r\n for (let i = 0; i < sentences.length - 1; i++) {\r\n const embeddingCurrent = sentences[i].combinedSentenceEmbedding!;\r\n const embeddingNext = sentences[i + 1].combinedSentenceEmbedding!;\r\n\r\n // Calculate cosine similarity\r\n const similarity = cosineSimilarity(embeddingCurrent, embeddingNext);\r\n\r\n // Convert to cosine distance (1 - cosine similarity)\r\n const distance = 1 - similarity;\r\n distances.push(distance);\r\n }\r\n\r\n return [distances, sentences];\r\n}\r\n\r\ntype BreakpointThresholdType =\r\n | \"percentile\"\r\n | \"standard_deviation\"\r\n | \"interquartile\"\r\n | \"gradient\";\r\n\r\nconst BREAKPOINT_DEFAULTS: Record<BreakpointThresholdType, number> = {\r\n percentile: 95,\r\n standard_deviation: 3,\r\n interquartile: 1.5,\r\n gradient: 95,\r\n};\r\n\r\n/**\r\n * Configuration parameters for the SemanticTextSplitter.\r\n */\r\nexport interface SemanticTextSplitterParams {\r\n /** The embeddings model to calculate string semantic similarities. */\r\n embeddings: Embeddings;\r\n /** The number of contextual surrounding sentences to bundle with the target sentence for embeddings. Default is 1. */\r\n bufferSize?: number;\r\n /** Whether to add the starting index of the chunk to the metadata. Default is false. */\r\n addStartIndex?: boolean;\r\n /** The statistical method used to determine where chunks should be split. Default is \"percentile\". */\r\n breakpointThresholdType?: BreakpointThresholdType;\r\n /** The specific threshold amount corresponding to the algorithm type. Falls back to predefined defaults if not provided. */\r\n breakpointThresholdAmount?: number;\r\n /** The exact number of chunks to produce. Hardcoded clustering thresholds are used if provided. */\r\n numberOfChunks?: number;\r\n /** A regex pattern used to divide the raw text into individual sentences. Default splits at '.', '?', and '!'. */\r\n sentenceSplitRegex?: string;\r\n /** An optional constraint to combine chunks dynamically until they reach this minimum character length. */\r\n minChunkSize?: number;\r\n}\r\n\r\n/**\r\n * A Document Transformer that splits texts by comparing the semantic similarity of sequential sentences.\r\n * It ensures sentences with high contextual relationships remain in the same chunk.\r\n */\r\nexport class SemanticTextSplitter extends BaseDocumentTransformer {\r\n private embeddings: Embeddings;\r\n private bufferSize: number;\r\n private addStartIndex: boolean;\r\n private breakpointThresholdType: BreakpointThresholdType;\r\n private breakpointThresholdAmount: number;\r\n private numberOfChunks?: number;\r\n private sentenceSplitRegex: string;\r\n private minChunkSize?: number;\r\n\r\n /**\r\n * Constructs a new SemanticTextSplitter.\r\n * @param params - The configuration options initializing the text splitter.\r\n */\r\n constructor(params: SemanticTextSplitterParams) {\r\n super();\r\n this.embeddings = params.embeddings;\r\n this.bufferSize = params.bufferSize ?? 1;\r\n this.addStartIndex = params.addStartIndex ?? false;\r\n this.breakpointThresholdType =\r\n params.breakpointThresholdType ?? \"percentile\";\r\n this.numberOfChunks = params.numberOfChunks;\r\n this.sentenceSplitRegex = params.sentenceSplitRegex ?? \"(?<=[.?!])\\\\s+\";\r\n this.minChunkSize = params.minChunkSize;\r\n\r\n if (params.breakpointThresholdAmount === undefined) {\r\n this.breakpointThresholdAmount =\r\n BREAKPOINT_DEFAULTS[this.breakpointThresholdType];\r\n } else {\r\n this.breakpointThresholdAmount = params.breakpointThresholdAmount;\r\n }\r\n }\r\n\r\n private calculateBreakpointThreshold(\r\n distances: number[],\r\n ): [number, number[]] {\r\n if (this.breakpointThresholdType === \"percentile\") {\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const index =\r\n Math.ceil((this.breakpointThresholdAmount / 100) * sorted.length) - 1;\r\n return [sorted[Math.max(0, index)], distances];\r\n } else if (this.breakpointThresholdType === \"standard_deviation\") {\r\n const mean = distances.reduce((sum, d) => sum + d, 0) / distances.length;\r\n const variance =\r\n distances.reduce((sum, d) => sum + Math.pow(d - mean, 2), 0) /\r\n distances.length;\r\n const stdDev = Math.sqrt(variance);\r\n return [mean + this.breakpointThresholdAmount * stdDev, distances];\r\n } else if (this.breakpointThresholdType === \"interquartile\") {\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const q1Index = Math.floor(0.25 * sorted.length);\r\n const q3Index = Math.floor(0.75 * sorted.length);\r\n const q1 = sorted[q1Index];\r\n const q3 = sorted[q3Index];\r\n const iqr = q3 - q1;\r\n const mean = distances.reduce((sum, d) => sum + d, 0) / distances.length;\r\n return [mean + this.breakpointThresholdAmount * iqr, distances];\r\n } else if (this.breakpointThresholdType === \"gradient\") {\r\n const distanceGradient: number[] = [];\r\n for (let i = 0; i < distances.length - 1; i++) {\r\n distanceGradient.push(distances[i + 1] - distances[i]);\r\n }\r\n const sortedGradient = [...distanceGradient].sort((a, b) => a - b);\r\n const index =\r\n Math.ceil(\r\n (this.breakpointThresholdAmount / 100) * sortedGradient.length,\r\n ) - 1;\r\n return [sortedGradient[Math.max(0, index)], distanceGradient];\r\n } else {\r\n throw new Error(\r\n `Got unexpected breakpointThresholdType: ${this.breakpointThresholdType}`,\r\n );\r\n }\r\n }\r\n\r\n private thresholdFromClusters(distances: number[]): number {\r\n if (this.numberOfChunks === undefined) {\r\n throw new Error(\r\n \"This should never be called if numberOfChunks is undefined.\",\r\n );\r\n }\r\n\r\n const x1 = distances.length;\r\n const y1 = 0.0;\r\n const x2 = 1.0;\r\n const y2 = 100.0;\r\n\r\n const x = Math.max(Math.min(this.numberOfChunks, x1), x2);\r\n\r\n let y: number;\r\n if (x2 === x1) {\r\n y = y2;\r\n } else {\r\n y = y1 + ((y2 - y1) / (x2 - x1)) * (x - x1);\r\n }\r\n\r\n y = Math.min(Math.max(y, 0), 100);\r\n\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const index = Math.ceil((y / 100) * sorted.length) - 1;\r\n return sorted[Math.max(0, index)];\r\n }\r\n\r\n private async calculateSentenceDistances(\r\n singleSentencesList: string[],\r\n ): Promise<[number[], SentenceDict[]]> {\r\n const sentences: SentenceDict[] = singleSentencesList.map(\r\n (sentence, index) => ({\r\n sentence,\r\n index,\r\n }),\r\n );\r\n\r\n const sentencesWithCombined = combineSentences(sentences, this.bufferSize);\r\n\r\n const combinedSentences = sentencesWithCombined.map(\r\n (s) => s.combinedSentence!,\r\n );\r\n const embeddings = await this.embeddings.embedDocuments(combinedSentences);\r\n\r\n for (let i = 0; i < sentencesWithCombined.length; i++) {\r\n sentencesWithCombined[i].combinedSentenceEmbedding = embeddings[i];\r\n }\r\n\r\n return calculateCosineDistances(sentencesWithCombined);\r\n }\r\n\r\n private getSingleSentencesList(text: string): string[] {\r\n return text\r\n .split(new RegExp(this.sentenceSplitRegex))\r\n .filter((sentence) => sentence.trim().length > 0);\r\n }\r\n\r\n /**\r\n * Core method to process a raw string of text and turn it into an array of chunked strings based on semantic proximity.\r\n *\r\n * @param text - The raw string input to be chunked.\r\n * @returns An array of string chunks derived from the original text snippet.\r\n */\r\n async splitText(text: string): Promise<string[]> {\r\n const singleSentencesList = this.getSingleSentencesList(text);\r\n\r\n if (singleSentencesList.length === 1) {\r\n return singleSentencesList;\r\n }\r\n\r\n if (\r\n this.breakpointThresholdType === \"gradient\" &&\r\n singleSentencesList.length === 2\r\n ) {\r\n return singleSentencesList;\r\n }\r\n\r\n const [distances, sentences] =\r\n await this.calculateSentenceDistances(singleSentencesList);\r\n\r\n let breakpointDistanceThreshold: number;\r\n let breakpointArray: number[];\r\n\r\n if (this.numberOfChunks !== undefined) {\r\n breakpointDistanceThreshold = this.thresholdFromClusters(distances);\r\n breakpointArray = distances;\r\n } else {\r\n [breakpointDistanceThreshold, breakpointArray] =\r\n this.calculateBreakpointThreshold(distances);\r\n }\r\n\r\n const indicesAboveThresh = breakpointArray\r\n .map((x, i) => ({ value: x, index: i }))\r\n .filter(({ value }) => value > breakpointDistanceThreshold)\r\n .map(({ index }) => index);\r\n\r\n const chunks: string[] = [];\r\n let startIndex = 0;\r\n\r\n for (const index of indicesAboveThresh) {\r\n const endIndex = index;\r\n const group = sentences.slice(startIndex, endIndex + 1);\r\n const combinedText = group.map((d: SentenceDict) => d.sentence).join(\" \");\r\n\r\n if (\r\n this.minChunkSize !== undefined &&\r\n combinedText.length < this.minChunkSize\r\n ) {\r\n continue;\r\n }\r\n\r\n chunks.push(combinedText);\r\n startIndex = index + 1;\r\n }\r\n\r\n if (startIndex < sentences.length) {\r\n const combinedText = sentences\r\n .slice(startIndex)\r\n .map((d: SentenceDict) => d.sentence)\r\n .join(\" \");\r\n chunks.push(combinedText);\r\n }\r\n\r\n return chunks;\r\n }\r\n\r\n /**\r\n * Takes raw strings and corresponding optional metadata, sending them through the underlying\r\n * semantic text splitting process, and forming standard LangChain `Document` objects.\r\n *\r\n * @param texts - Array of strings spanning the content to separate out.\r\n * @param metadatas - Optional array of metadata mappings that align 1:1 with texts.\r\n * @returns An array of freshly constructed LangChain Document instances containing the split text chunks.\r\n */\r\n async createDocuments(\r\n texts: string[],\r\n metadatas: Record<string, any>[] = [],\r\n ): Promise<Document[]> {\r\n const _metadatas =\r\n metadatas.length > 0\r\n ? metadatas\r\n : texts.map(() => <Record<string, any>>{});\r\n const documents: Document[] = [];\r\n\r\n for (let i = 0; i < texts.length; i++) {\r\n const text = texts[i];\r\n let startIndex = 0;\r\n\r\n for (const chunk of await this.splitText(text)) {\r\n const metadata = { ..._metadatas[i] };\r\n if (this.addStartIndex) {\r\n metadata.start_index = startIndex;\r\n }\r\n\r\n const newDoc = new Document({\r\n pageContent: chunk,\r\n metadata,\r\n });\r\n\r\n documents.push(newDoc);\r\n startIndex += chunk.length;\r\n }\r\n }\r\n\r\n return documents;\r\n }\r\n\r\n /**\r\n * Convenience method to take in standardized Documents, extract their text, split the\r\n * text semantically, and reconstruct them into smaller Documents.\r\n *\r\n * @param documents - The original Documents to break down.\r\n * @returns Semantically chunked Documents inherited from the originals.\r\n */\r\n async splitDocuments(documents: Document[]): Promise<Document[]> {\r\n const texts = documents.map((doc) => doc.pageContent);\r\n const metadatas = documents.map((doc) => doc.metadata);\r\n return this.createDocuments(texts, metadatas);\r\n }\r\n\r\n /**\r\n * Implements the base BaseDocumentTransformer requirement to process documents.\r\n * Equivalent to `splitDocuments()`.\r\n *\r\n * @param documents - The original Documents to transform.\r\n * @returns Transformed, semantically split Documents.\r\n */\r\n async transformDocuments(documents: Document[]): Promise<Document[]> {\r\n return this.splitDocuments(documents);\r\n }\r\n}\r\n"],"mappings":";;;;;;;;AAgBA,SAAS,iBACP,WACA,aAAqB,GACL;AAEhB,MAAK,IAAI,IAAI,GAAG,IAAI,UAAU,QAAQ,KAAK;EACzC,MAAM,WAAW,UAAU;EAG3B,IAAI,mBAAmB;AAGvB,OAAK,IAAI,IAAI,KAAK,IAAI,GAAG,IAAI,WAAW,EAAE,IAAI,GAAG,IAC/C,qBAAoB,UAAU,GAAG,WAAW;AAI9C,sBAAoB,SAAS;AAG7B,OACE,IAAI,IAAI,IAAI,GACZ,KAAK,KAAK,IAAI,UAAU,SAAS,GAAG,IAAI,WAAW,EACnD,IAEA,qBAAoB,MAAM,UAAU,GAAG;AAIzC,WAAS,mBAAmB,iBAAiB,MAAM;;AAGrD,QAAO;;AAGT,SAAS,iBAAiB,GAAa,GAAqB;CAC1D,MAAM,aAAa,EAAE,QAAQ,KAAK,IAAI,MAAM,MAAM,KAAK,EAAE,IAAI,EAAE;CAC/D,MAAM,aAAa,KAAK,KAAK,EAAE,QAAQ,KAAK,OAAO,MAAM,KAAK,IAAI,EAAE,CAAC;CACrE,MAAM,aAAa,KAAK,KAAK,EAAE,QAAQ,KAAK,OAAO,MAAM,KAAK,IAAI,EAAE,CAAC;AAErE,KAAI,eAAe,KAAK,eAAe,EACrC,QAAO;AAGT,QAAO,cAAc,aAAa;;;;;;;;AASpC,SAAgB,yBACd,WAC4B;CAC5B,MAAM,YAAsB,EAAE;AAE9B,MAAK,IAAI,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,KAAK;EAC7C,MAAM,mBAAmB,UAAU,GAAG;EACtC,MAAM,gBAAgB,UAAU,IAAI,GAAG;EAMvC,MAAM,WAAW,IAHE,iBAAiB,kBAAkB,cAAc;AAIpE,YAAU,KAAK,SAAS;;AAG1B,QAAO,CAAC,WAAW,UAAU;;AAS/B,MAAM,sBAA+D;CACnE,YAAY;CACZ,oBAAoB;CACpB,eAAe;CACf,UAAU;CACX;;;;;AA4BD,IAAa,uBAAb,cAA0CA,0BAAAA,wBAAwB;CAChE;CACA;CACA;CACA;CACA;CACA;CACA;CACA;;;;;CAMA,YAAY,QAAoC;AAC9C,SAAO;AACP,OAAK,aAAa,OAAO;AACzB,OAAK,aAAa,OAAO,cAAc;AACvC,OAAK,gBAAgB,OAAO,iBAAiB;AAC7C,OAAK,0BACH,OAAO,2BAA2B;AACpC,OAAK,iBAAiB,OAAO;AAC7B,OAAK,qBAAqB,OAAO,sBAAsB;AACvD,OAAK,eAAe,OAAO;AAE3B,MAAI,OAAO,8BAA8B,KAAA,EACvC,MAAK,4BACH,oBAAoB,KAAK;MAE3B,MAAK,4BAA4B,OAAO;;CAI5C,6BACE,WACoB;AACpB,MAAI,KAAK,4BAA4B,cAAc;GACjD,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GACnD,MAAM,QACJ,KAAK,KAAM,KAAK,4BAA4B,MAAO,OAAO,OAAO,GAAG;AACtE,UAAO,CAAC,OAAO,KAAK,IAAI,GAAG,MAAM,GAAG,UAAU;aACrC,KAAK,4BAA4B,sBAAsB;GAChE,MAAM,OAAO,UAAU,QAAQ,KAAK,MAAM,MAAM,GAAG,EAAE,GAAG,UAAU;GAClE,MAAM,WACJ,UAAU,QAAQ,KAAK,MAAM,MAAM,KAAK,IAAI,IAAI,MAAM,EAAE,EAAE,EAAE,GAC5D,UAAU;GACZ,MAAM,SAAS,KAAK,KAAK,SAAS;AAClC,UAAO,CAAC,OAAO,KAAK,4BAA4B,QAAQ,UAAU;aACzD,KAAK,4BAA4B,iBAAiB;GAC3D,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GACnD,MAAM,UAAU,KAAK,MAAM,MAAO,OAAO,OAAO;GAChD,MAAM,UAAU,KAAK,MAAM,MAAO,OAAO,OAAO;GAChD,MAAM,KAAK,OAAO;GAElB,MAAM,MADK,OAAO,WACD;AAEjB,UAAO,CADM,UAAU,QAAQ,KAAK,MAAM,MAAM,GAAG,EAAE,GAAG,UAAU,SACnD,KAAK,4BAA4B,KAAK,UAAU;aACtD,KAAK,4BAA4B,YAAY;GACtD,MAAM,mBAA6B,EAAE;AACrC,QAAK,IAAI,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,IACxC,kBAAiB,KAAK,UAAU,IAAI,KAAK,UAAU,GAAG;GAExD,MAAM,iBAAiB,CAAC,GAAG,iBAAiB,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GAClE,MAAM,QACJ,KAAK,KACF,KAAK,4BAA4B,MAAO,eAAe,OACzD,GAAG;AACN,UAAO,CAAC,eAAe,KAAK,IAAI,GAAG,MAAM,GAAG,iBAAiB;QAE7D,OAAM,IAAI,MACR,2CAA2C,KAAK,0BACjD;;CAIL,sBAA8B,WAA6B;AACzD,MAAI,KAAK,mBAAmB,KAAA,EAC1B,OAAM,IAAI,MACR,8DACD;EAGH,MAAM,KAAK,UAAU;EACrB,MAAM,KAAK;EACX,MAAM,KAAK;EACX,MAAM,KAAK;EAEX,MAAM,IAAI,KAAK,IAAI,KAAK,IAAI,KAAK,gBAAgB,GAAG,EAAE,GAAG;EAEzD,IAAI;AACJ,MAAI,OAAO,GACT,KAAI;MAEJ,KAAI,MAAO,KAAK,OAAO,KAAK,OAAQ,IAAI;AAG1C,MAAI,KAAK,IAAI,KAAK,IAAI,GAAG,EAAE,EAAE,IAAI;EAEjC,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;EACnD,MAAM,QAAQ,KAAK,KAAM,IAAI,MAAO,OAAO,OAAO,GAAG;AACrD,SAAO,OAAO,KAAK,IAAI,GAAG,MAAM;;CAGlC,MAAc,2BACZ,qBACqC;EAQrC,MAAM,wBAAwB,iBAPI,oBAAoB,KACnD,UAAU,WAAW;GACpB;GACA;GACD,EACF,EAEyD,KAAK,WAAW;EAE1E,MAAM,oBAAoB,sBAAsB,KAC7C,MAAM,EAAE,iBACV;EACD,MAAM,aAAa,MAAM,KAAK,WAAW,eAAe,kBAAkB;AAE1E,OAAK,IAAI,IAAI,GAAG,IAAI,sBAAsB,QAAQ,IAChD,uBAAsB,GAAG,4BAA4B,WAAW;AAGlE,SAAO,yBAAyB,sBAAsB;;CAGxD,uBAA+B,MAAwB;AACrD,SAAO,KACJ,MAAM,IAAI,OAAO,KAAK,mBAAmB,CAAC,CAC1C,QAAQ,aAAa,SAAS,MAAM,CAAC,SAAS,EAAE;;;;;;;;CASrD,MAAM,UAAU,MAAiC;EAC/C,MAAM,sBAAsB,KAAK,uBAAuB,KAAK;AAE7D,MAAI,oBAAoB,WAAW,EACjC,QAAO;AAGT,MACE,KAAK,4BAA4B,cACjC,oBAAoB,WAAW,EAE/B,QAAO;EAGT,MAAM,CAAC,WAAW,aAChB,MAAM,KAAK,2BAA2B,oBAAoB;EAE5D,IAAI;EACJ,IAAI;AAEJ,MAAI,KAAK,mBAAmB,KAAA,GAAW;AACrC,iCAA8B,KAAK,sBAAsB,UAAU;AACnE,qBAAkB;QAElB,EAAC,6BAA6B,mBAC5B,KAAK,6BAA6B,UAAU;EAGhD,MAAM,qBAAqB,gBACxB,KAAK,GAAG,OAAO;GAAE,OAAO;GAAG,OAAO;GAAG,EAAE,CACvC,QAAQ,EAAE,YAAY,QAAQ,4BAA4B,CAC1D,KAAK,EAAE,YAAY,MAAM;EAE5B,MAAM,SAAmB,EAAE;EAC3B,IAAI,aAAa;AAEjB,OAAK,MAAM,SAAS,oBAAoB;GACtC,MAAM,WAAW;GAEjB,MAAM,eADQ,UAAU,MAAM,YAAY,WAAW,EAAE,CAC5B,KAAK,MAAoB,EAAE,SAAS,CAAC,KAAK,IAAI;AAEzE,OACE,KAAK,iBAAiB,KAAA,KACtB,aAAa,SAAS,KAAK,aAE3B;AAGF,UAAO,KAAK,aAAa;AACzB,gBAAa,QAAQ;;AAGvB,MAAI,aAAa,UAAU,QAAQ;GACjC,MAAM,eAAe,UAClB,MAAM,WAAW,CACjB,KAAK,MAAoB,EAAE,SAAS,CACpC,KAAK,IAAI;AACZ,UAAO,KAAK,aAAa;;AAG3B,SAAO;;;;;;;;;;CAWT,MAAM,gBACJ,OACA,YAAmC,EAAE,EAChB;EACrB,MAAM,aACJ,UAAU,SAAS,IACf,YACA,MAAM,WAA+B,EAAE,EAAC;EAC9C,MAAM,YAAwB,EAAE;AAEhC,OAAK,IAAI,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;GACrC,MAAM,OAAO,MAAM;GACnB,IAAI,aAAa;AAEjB,QAAK,MAAM,SAAS,MAAM,KAAK,UAAU,KAAK,EAAE;IAC9C,MAAM,WAAW,EAAE,GAAG,WAAW,IAAI;AACrC,QAAI,KAAK,cACP,UAAS,cAAc;IAGzB,MAAM,SAAS,IAAIC,0BAAAA,SAAS;KAC1B,aAAa;KACb;KACD,CAAC;AAEF,cAAU,KAAK,OAAO;AACtB,kBAAc,MAAM;;;AAIxB,SAAO;;;;;;;;;CAUT,MAAM,eAAe,WAA4C;EAC/D,MAAM,QAAQ,UAAU,KAAK,QAAQ,IAAI,YAAY;EACrD,MAAM,YAAY,UAAU,KAAK,QAAQ,IAAI,SAAS;AACtD,SAAO,KAAK,gBAAgB,OAAO,UAAU;;;;;;;;;CAU/C,MAAM,mBAAmB,WAA4C;AACnE,SAAO,KAAK,eAAe,UAAU"}
|
|
@@ -8,18 +8,39 @@ interface SentenceDict {
|
|
|
8
8
|
combinedSentence?: string;
|
|
9
9
|
combinedSentenceEmbedding?: number[];
|
|
10
10
|
}
|
|
11
|
+
/**
|
|
12
|
+
* Calculates the cosine distance (1 - cosine similarity) between sequential sentences.
|
|
13
|
+
*
|
|
14
|
+
* @param sentences - An array of sentences that already contain generated embeddings.
|
|
15
|
+
* @returns A tuple where the first element is an array of distances between adjacent sentences, and the second is the original array of sentences.
|
|
16
|
+
*/
|
|
11
17
|
declare function calculateCosineDistances(sentences: SentenceDict[]): [number[], SentenceDict[]];
|
|
12
18
|
type BreakpointThresholdType = "percentile" | "standard_deviation" | "interquartile" | "gradient";
|
|
19
|
+
/**
|
|
20
|
+
* Configuration parameters for the SemanticTextSplitter.
|
|
21
|
+
*/
|
|
13
22
|
interface SemanticTextSplitterParams {
|
|
23
|
+
/** The embeddings model to calculate string semantic similarities. */
|
|
14
24
|
embeddings: Embeddings;
|
|
25
|
+
/** The number of contextual surrounding sentences to bundle with the target sentence for embeddings. Default is 1. */
|
|
15
26
|
bufferSize?: number;
|
|
27
|
+
/** Whether to add the starting index of the chunk to the metadata. Default is false. */
|
|
16
28
|
addStartIndex?: boolean;
|
|
29
|
+
/** The statistical method used to determine where chunks should be split. Default is "percentile". */
|
|
17
30
|
breakpointThresholdType?: BreakpointThresholdType;
|
|
31
|
+
/** The specific threshold amount corresponding to the algorithm type. Falls back to predefined defaults if not provided. */
|
|
18
32
|
breakpointThresholdAmount?: number;
|
|
33
|
+
/** The exact number of chunks to produce. Hardcoded clustering thresholds are used if provided. */
|
|
19
34
|
numberOfChunks?: number;
|
|
35
|
+
/** A regex pattern used to divide the raw text into individual sentences. Default splits at '.', '?', and '!'. */
|
|
20
36
|
sentenceSplitRegex?: string;
|
|
37
|
+
/** An optional constraint to combine chunks dynamically until they reach this minimum character length. */
|
|
21
38
|
minChunkSize?: number;
|
|
22
39
|
}
|
|
40
|
+
/**
|
|
41
|
+
* A Document Transformer that splits texts by comparing the semantic similarity of sequential sentences.
|
|
42
|
+
* It ensures sentences with high contextual relationships remain in the same chunk.
|
|
43
|
+
*/
|
|
23
44
|
declare class SemanticTextSplitter extends BaseDocumentTransformer {
|
|
24
45
|
private embeddings;
|
|
25
46
|
private bufferSize;
|
|
@@ -29,14 +50,46 @@ declare class SemanticTextSplitter extends BaseDocumentTransformer {
|
|
|
29
50
|
private numberOfChunks?;
|
|
30
51
|
private sentenceSplitRegex;
|
|
31
52
|
private minChunkSize?;
|
|
53
|
+
/**
|
|
54
|
+
* Constructs a new SemanticTextSplitter.
|
|
55
|
+
* @param params - The configuration options initializing the text splitter.
|
|
56
|
+
*/
|
|
32
57
|
constructor(params: SemanticTextSplitterParams);
|
|
33
58
|
private calculateBreakpointThreshold;
|
|
34
59
|
private thresholdFromClusters;
|
|
35
60
|
private calculateSentenceDistances;
|
|
36
61
|
private getSingleSentencesList;
|
|
62
|
+
/**
|
|
63
|
+
* Core method to process a raw string of text and turn it into an array of chunked strings based on semantic proximity.
|
|
64
|
+
*
|
|
65
|
+
* @param text - The raw string input to be chunked.
|
|
66
|
+
* @returns An array of string chunks derived from the original text snippet.
|
|
67
|
+
*/
|
|
37
68
|
splitText(text: string): Promise<string[]>;
|
|
69
|
+
/**
|
|
70
|
+
* Takes raw strings and corresponding optional metadata, sending them through the underlying
|
|
71
|
+
* semantic text splitting process, and forming standard LangChain `Document` objects.
|
|
72
|
+
*
|
|
73
|
+
* @param texts - Array of strings spanning the content to separate out.
|
|
74
|
+
* @param metadatas - Optional array of metadata mappings that align 1:1 with texts.
|
|
75
|
+
* @returns An array of freshly constructed LangChain Document instances containing the split text chunks.
|
|
76
|
+
*/
|
|
38
77
|
createDocuments(texts: string[], metadatas?: Record<string, any>[]): Promise<Document[]>;
|
|
78
|
+
/**
|
|
79
|
+
* Convenience method to take in standardized Documents, extract their text, split the
|
|
80
|
+
* text semantically, and reconstruct them into smaller Documents.
|
|
81
|
+
*
|
|
82
|
+
* @param documents - The original Documents to break down.
|
|
83
|
+
* @returns Semantically chunked Documents inherited from the originals.
|
|
84
|
+
*/
|
|
39
85
|
splitDocuments(documents: Document[]): Promise<Document[]>;
|
|
86
|
+
/**
|
|
87
|
+
* Implements the base BaseDocumentTransformer requirement to process documents.
|
|
88
|
+
* Equivalent to `splitDocuments()`.
|
|
89
|
+
*
|
|
90
|
+
* @param documents - The original Documents to transform.
|
|
91
|
+
* @returns Transformed, semantically split Documents.
|
|
92
|
+
*/
|
|
40
93
|
transformDocuments(documents: Document[]): Promise<Document[]>;
|
|
41
94
|
}
|
|
42
95
|
//#endregion
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"semantic_text_splitter.d.cts","names":[],"sources":["../src/semantic_text_splitter.ts"],"mappings":";;;;UASU,YAAA;EACR,QAAA;EACA,KAAA;EACA,gBAAA;EACA,yBAAA;AAAA;AAAA,
|
|
1
|
+
{"version":3,"file":"semantic_text_splitter.d.cts","names":[],"sources":["../src/semantic_text_splitter.ts"],"mappings":";;;;UASU,YAAA;EACR,QAAA;EACA,KAAA;EACA,gBAAA;EACA,yBAAA;AAAA;AAwDF;;;;;;AAAA,iBAAgB,wBAAA,CACd,SAAA,EAAW,YAAA,gBACC,YAAA;AAAA,KAkBT,uBAAA;;AAFJ;;UAkBgB,0BAAA;EAhBW;EAkB1B,UAAA,EAAY,UAAA;EAFG;EAIf,UAAA;;EAEA,aAAA;EAJA;EAMA,uBAAA,GAA0B,uBAAA;EAJ1B;EAMA,yBAAA;EAFA;EAIA,cAAA;EAFA;EAIA,kBAAA;EAAA;EAEA,YAAA;AAAA;;AAOF;;;cAAa,oBAAA,SAA6B,uBAAA;EAAA,QAChC,UAAA;EAAA,QACA,UAAA;EAAA,QACA,aAAA;EAAA,QACA,uBAAA;EAAA,QACA,yBAAA;EAAA,QACA,cAAA;EAAA,QACA,kBAAA;EAAA,QACA,YAAA;EA+PiD;;;;EAzPzD,WAAA,CAAY,MAAA,EAAQ,0BAAA;EAAA,QAmBZ,4BAAA;EAAA,QA0CA,qBAAA;EAAA,QA4BM,0BAAA;EAAA,QAwBN,sBAAA;EA3HA;;;;;;EAuIF,SAAA,CAAU,IAAA,WAAe,OAAA;EA7HnB;;;;;;;;EAoMN,eAAA,CACJ,KAAA,YACA,SAAA,GAAW,MAAA,kBACV,OAAA,CAAQ,QAAA;EAFT;;;;;;;EAuCI,cAAA,CAAe,SAAA,EAAW,QAAA,KAAa,OAAA,CAAQ,QAAA;EAAR;;;;;;;EAavC,kBAAA,CAAmB,SAAA,EAAW,QAAA,KAAa,OAAA,CAAQ,QAAA;AAAA"}
|
|
@@ -8,18 +8,39 @@ interface SentenceDict {
|
|
|
8
8
|
combinedSentence?: string;
|
|
9
9
|
combinedSentenceEmbedding?: number[];
|
|
10
10
|
}
|
|
11
|
+
/**
|
|
12
|
+
* Calculates the cosine distance (1 - cosine similarity) between sequential sentences.
|
|
13
|
+
*
|
|
14
|
+
* @param sentences - An array of sentences that already contain generated embeddings.
|
|
15
|
+
* @returns A tuple where the first element is an array of distances between adjacent sentences, and the second is the original array of sentences.
|
|
16
|
+
*/
|
|
11
17
|
declare function calculateCosineDistances(sentences: SentenceDict[]): [number[], SentenceDict[]];
|
|
12
18
|
type BreakpointThresholdType = "percentile" | "standard_deviation" | "interquartile" | "gradient";
|
|
19
|
+
/**
|
|
20
|
+
* Configuration parameters for the SemanticTextSplitter.
|
|
21
|
+
*/
|
|
13
22
|
interface SemanticTextSplitterParams {
|
|
23
|
+
/** The embeddings model to calculate string semantic similarities. */
|
|
14
24
|
embeddings: Embeddings;
|
|
25
|
+
/** The number of contextual surrounding sentences to bundle with the target sentence for embeddings. Default is 1. */
|
|
15
26
|
bufferSize?: number;
|
|
27
|
+
/** Whether to add the starting index of the chunk to the metadata. Default is false. */
|
|
16
28
|
addStartIndex?: boolean;
|
|
29
|
+
/** The statistical method used to determine where chunks should be split. Default is "percentile". */
|
|
17
30
|
breakpointThresholdType?: BreakpointThresholdType;
|
|
31
|
+
/** The specific threshold amount corresponding to the algorithm type. Falls back to predefined defaults if not provided. */
|
|
18
32
|
breakpointThresholdAmount?: number;
|
|
33
|
+
/** The exact number of chunks to produce. Hardcoded clustering thresholds are used if provided. */
|
|
19
34
|
numberOfChunks?: number;
|
|
35
|
+
/** A regex pattern used to divide the raw text into individual sentences. Default splits at '.', '?', and '!'. */
|
|
20
36
|
sentenceSplitRegex?: string;
|
|
37
|
+
/** An optional constraint to combine chunks dynamically until they reach this minimum character length. */
|
|
21
38
|
minChunkSize?: number;
|
|
22
39
|
}
|
|
40
|
+
/**
|
|
41
|
+
* A Document Transformer that splits texts by comparing the semantic similarity of sequential sentences.
|
|
42
|
+
* It ensures sentences with high contextual relationships remain in the same chunk.
|
|
43
|
+
*/
|
|
23
44
|
declare class SemanticTextSplitter extends BaseDocumentTransformer {
|
|
24
45
|
private embeddings;
|
|
25
46
|
private bufferSize;
|
|
@@ -29,14 +50,46 @@ declare class SemanticTextSplitter extends BaseDocumentTransformer {
|
|
|
29
50
|
private numberOfChunks?;
|
|
30
51
|
private sentenceSplitRegex;
|
|
31
52
|
private minChunkSize?;
|
|
53
|
+
/**
|
|
54
|
+
* Constructs a new SemanticTextSplitter.
|
|
55
|
+
* @param params - The configuration options initializing the text splitter.
|
|
56
|
+
*/
|
|
32
57
|
constructor(params: SemanticTextSplitterParams);
|
|
33
58
|
private calculateBreakpointThreshold;
|
|
34
59
|
private thresholdFromClusters;
|
|
35
60
|
private calculateSentenceDistances;
|
|
36
61
|
private getSingleSentencesList;
|
|
62
|
+
/**
|
|
63
|
+
* Core method to process a raw string of text and turn it into an array of chunked strings based on semantic proximity.
|
|
64
|
+
*
|
|
65
|
+
* @param text - The raw string input to be chunked.
|
|
66
|
+
* @returns An array of string chunks derived from the original text snippet.
|
|
67
|
+
*/
|
|
37
68
|
splitText(text: string): Promise<string[]>;
|
|
69
|
+
/**
|
|
70
|
+
* Takes raw strings and corresponding optional metadata, sending them through the underlying
|
|
71
|
+
* semantic text splitting process, and forming standard LangChain `Document` objects.
|
|
72
|
+
*
|
|
73
|
+
* @param texts - Array of strings spanning the content to separate out.
|
|
74
|
+
* @param metadatas - Optional array of metadata mappings that align 1:1 with texts.
|
|
75
|
+
* @returns An array of freshly constructed LangChain Document instances containing the split text chunks.
|
|
76
|
+
*/
|
|
38
77
|
createDocuments(texts: string[], metadatas?: Record<string, any>[]): Promise<Document[]>;
|
|
78
|
+
/**
|
|
79
|
+
* Convenience method to take in standardized Documents, extract their text, split the
|
|
80
|
+
* text semantically, and reconstruct them into smaller Documents.
|
|
81
|
+
*
|
|
82
|
+
* @param documents - The original Documents to break down.
|
|
83
|
+
* @returns Semantically chunked Documents inherited from the originals.
|
|
84
|
+
*/
|
|
39
85
|
splitDocuments(documents: Document[]): Promise<Document[]>;
|
|
86
|
+
/**
|
|
87
|
+
* Implements the base BaseDocumentTransformer requirement to process documents.
|
|
88
|
+
* Equivalent to `splitDocuments()`.
|
|
89
|
+
*
|
|
90
|
+
* @param documents - The original Documents to transform.
|
|
91
|
+
* @returns Transformed, semantically split Documents.
|
|
92
|
+
*/
|
|
40
93
|
transformDocuments(documents: Document[]): Promise<Document[]>;
|
|
41
94
|
}
|
|
42
95
|
//#endregion
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"semantic_text_splitter.d.ts","names":[],"sources":["../src/semantic_text_splitter.ts"],"mappings":";;;;UASU,YAAA;EACR,QAAA;EACA,KAAA;EACA,gBAAA;EACA,yBAAA;AAAA;AAAA,
|
|
1
|
+
{"version":3,"file":"semantic_text_splitter.d.ts","names":[],"sources":["../src/semantic_text_splitter.ts"],"mappings":";;;;UASU,YAAA;EACR,QAAA;EACA,KAAA;EACA,gBAAA;EACA,yBAAA;AAAA;AAwDF;;;;;;AAAA,iBAAgB,wBAAA,CACd,SAAA,EAAW,YAAA,gBACC,YAAA;AAAA,KAkBT,uBAAA;;AAFJ;;UAkBgB,0BAAA;EAhBW;EAkB1B,UAAA,EAAY,UAAA;EAFG;EAIf,UAAA;;EAEA,aAAA;EAJA;EAMA,uBAAA,GAA0B,uBAAA;EAJ1B;EAMA,yBAAA;EAFA;EAIA,cAAA;EAFA;EAIA,kBAAA;EAAA;EAEA,YAAA;AAAA;;AAOF;;;cAAa,oBAAA,SAA6B,uBAAA;EAAA,QAChC,UAAA;EAAA,QACA,UAAA;EAAA,QACA,aAAA;EAAA,QACA,uBAAA;EAAA,QACA,yBAAA;EAAA,QACA,cAAA;EAAA,QACA,kBAAA;EAAA,QACA,YAAA;EA+PiD;;;;EAzPzD,WAAA,CAAY,MAAA,EAAQ,0BAAA;EAAA,QAmBZ,4BAAA;EAAA,QA0CA,qBAAA;EAAA,QA4BM,0BAAA;EAAA,QAwBN,sBAAA;EA3HA;;;;;;EAuIF,SAAA,CAAU,IAAA,WAAe,OAAA;EA7HnB;;;;;;;;EAoMN,eAAA,CACJ,KAAA,YACA,SAAA,GAAW,MAAA,kBACV,OAAA,CAAQ,QAAA;EAFT;;;;;;;EAuCI,cAAA,CAAe,SAAA,EAAW,QAAA,KAAa,OAAA,CAAQ,QAAA;EAAR;;;;;;;EAavC,kBAAA,CAAmB,SAAA,EAAW,QAAA,KAAa,OAAA,CAAQ,QAAA;AAAA"}
|
|
@@ -2,7 +2,7 @@ import { BaseDocumentTransformer, Document } from "@langchain/core/documents";
|
|
|
2
2
|
//#region src/semantic_text_splitter.ts
|
|
3
3
|
/**
|
|
4
4
|
* Experimental semantic text splitter based on semantic similarity.
|
|
5
|
-
*
|
|
5
|
+
*
|
|
6
6
|
* Inspired by Greg Kamradt's semantic chunking approach:
|
|
7
7
|
* https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
|
|
8
8
|
*/
|
|
@@ -24,6 +24,12 @@ function cosineSimilarity(a, b) {
|
|
|
24
24
|
if (magnitudeA === 0 || magnitudeB === 0) return 0;
|
|
25
25
|
return dotProduct / (magnitudeA * magnitudeB);
|
|
26
26
|
}
|
|
27
|
+
/**
|
|
28
|
+
* Calculates the cosine distance (1 - cosine similarity) between sequential sentences.
|
|
29
|
+
*
|
|
30
|
+
* @param sentences - An array of sentences that already contain generated embeddings.
|
|
31
|
+
* @returns A tuple where the first element is an array of distances between adjacent sentences, and the second is the original array of sentences.
|
|
32
|
+
*/
|
|
27
33
|
function calculateCosineDistances(sentences) {
|
|
28
34
|
const distances = [];
|
|
29
35
|
for (let i = 0; i < sentences.length - 1; i++) {
|
|
@@ -40,6 +46,10 @@ const BREAKPOINT_DEFAULTS = {
|
|
|
40
46
|
interquartile: 1.5,
|
|
41
47
|
gradient: 95
|
|
42
48
|
};
|
|
49
|
+
/**
|
|
50
|
+
* A Document Transformer that splits texts by comparing the semantic similarity of sequential sentences.
|
|
51
|
+
* It ensures sentences with high contextual relationships remain in the same chunk.
|
|
52
|
+
*/
|
|
43
53
|
var SemanticTextSplitter = class extends BaseDocumentTransformer {
|
|
44
54
|
embeddings;
|
|
45
55
|
bufferSize;
|
|
@@ -49,6 +59,10 @@ var SemanticTextSplitter = class extends BaseDocumentTransformer {
|
|
|
49
59
|
numberOfChunks;
|
|
50
60
|
sentenceSplitRegex;
|
|
51
61
|
minChunkSize;
|
|
62
|
+
/**
|
|
63
|
+
* Constructs a new SemanticTextSplitter.
|
|
64
|
+
* @param params - The configuration options initializing the text splitter.
|
|
65
|
+
*/
|
|
52
66
|
constructor(params) {
|
|
53
67
|
super();
|
|
54
68
|
this.embeddings = params.embeddings;
|
|
@@ -114,6 +128,12 @@ var SemanticTextSplitter = class extends BaseDocumentTransformer {
|
|
|
114
128
|
getSingleSentencesList(text) {
|
|
115
129
|
return text.split(new RegExp(this.sentenceSplitRegex)).filter((sentence) => sentence.trim().length > 0);
|
|
116
130
|
}
|
|
131
|
+
/**
|
|
132
|
+
* Core method to process a raw string of text and turn it into an array of chunked strings based on semantic proximity.
|
|
133
|
+
*
|
|
134
|
+
* @param text - The raw string input to be chunked.
|
|
135
|
+
* @returns An array of string chunks derived from the original text snippet.
|
|
136
|
+
*/
|
|
117
137
|
async splitText(text) {
|
|
118
138
|
const singleSentencesList = this.getSingleSentencesList(text);
|
|
119
139
|
if (singleSentencesList.length === 1) return singleSentencesList;
|
|
@@ -144,6 +164,14 @@ var SemanticTextSplitter = class extends BaseDocumentTransformer {
|
|
|
144
164
|
}
|
|
145
165
|
return chunks;
|
|
146
166
|
}
|
|
167
|
+
/**
|
|
168
|
+
* Takes raw strings and corresponding optional metadata, sending them through the underlying
|
|
169
|
+
* semantic text splitting process, and forming standard LangChain `Document` objects.
|
|
170
|
+
*
|
|
171
|
+
* @param texts - Array of strings spanning the content to separate out.
|
|
172
|
+
* @param metadatas - Optional array of metadata mappings that align 1:1 with texts.
|
|
173
|
+
* @returns An array of freshly constructed LangChain Document instances containing the split text chunks.
|
|
174
|
+
*/
|
|
147
175
|
async createDocuments(texts, metadatas = []) {
|
|
148
176
|
const _metadatas = metadatas.length > 0 ? metadatas : texts.map(() => ({}));
|
|
149
177
|
const documents = [];
|
|
@@ -163,11 +191,25 @@ var SemanticTextSplitter = class extends BaseDocumentTransformer {
|
|
|
163
191
|
}
|
|
164
192
|
return documents;
|
|
165
193
|
}
|
|
194
|
+
/**
|
|
195
|
+
* Convenience method to take in standardized Documents, extract their text, split the
|
|
196
|
+
* text semantically, and reconstruct them into smaller Documents.
|
|
197
|
+
*
|
|
198
|
+
* @param documents - The original Documents to break down.
|
|
199
|
+
* @returns Semantically chunked Documents inherited from the originals.
|
|
200
|
+
*/
|
|
166
201
|
async splitDocuments(documents) {
|
|
167
202
|
const texts = documents.map((doc) => doc.pageContent);
|
|
168
203
|
const metadatas = documents.map((doc) => doc.metadata);
|
|
169
204
|
return this.createDocuments(texts, metadatas);
|
|
170
205
|
}
|
|
206
|
+
/**
|
|
207
|
+
* Implements the base BaseDocumentTransformer requirement to process documents.
|
|
208
|
+
* Equivalent to `splitDocuments()`.
|
|
209
|
+
*
|
|
210
|
+
* @param documents - The original Documents to transform.
|
|
211
|
+
* @returns Transformed, semantically split Documents.
|
|
212
|
+
*/
|
|
171
213
|
async transformDocuments(documents) {
|
|
172
214
|
return this.splitDocuments(documents);
|
|
173
215
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"semantic_text_splitter.js","names":[],"sources":["../src/semantic_text_splitter.ts"],"sourcesContent":["/**\r\n * Experimental semantic text splitter based on semantic similarity.\r\n * \r\n * Inspired by Greg Kamradt's semantic chunking approach:\r\n * https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb\r\n */\r\nimport { Document, BaseDocumentTransformer } from \"@langchain/core/documents\";\r\nimport { Embeddings } from \"@langchain/core/embeddings\";\r\n\r\ninterface SentenceDict {\r\n sentence: string;\r\n index: number;\r\n combinedSentence?: string;\r\n combinedSentenceEmbedding?: number[];\r\n}\r\n\r\nfunction combineSentences(sentences: SentenceDict[], bufferSize: number = 1): SentenceDict[] {\r\n // Go through each sentence dict\r\n for (let i = 0; i < sentences.length; i++) {\r\n const sentence = sentences[i];\r\n \r\n // Create the combined sentence by combining the sentences within the buffer\r\n let combinedSentence = \"\";\r\n \r\n // Add sentences before the current one\r\n for (let j = Math.max(0, i - bufferSize); j < i; j++) {\r\n combinedSentence += sentences[j].sentence + \" \";\r\n }\r\n \r\n // Add the current sentence\r\n combinedSentence += sentence.sentence;\r\n \r\n // Add sentences after the current one\r\n for (let j = i + 1; j <= Math.min(sentences.length - 1, i + bufferSize); j++) {\r\n combinedSentence += \" \" + sentences[j].sentence;\r\n }\r\n \r\n // Assign the combined sentence to the dict\r\n sentence.combinedSentence = combinedSentence.trim();\r\n }\r\n \r\n return sentences;\r\n}\r\n\r\nfunction cosineSimilarity(a: number[], b: number[]): number {\r\n const dotProduct = a.reduce((sum, ai, i) => sum + ai * b[i], 0);\r\n const magnitudeA = Math.sqrt(a.reduce((sum, ai) => sum + ai * ai, 0));\r\n const magnitudeB = Math.sqrt(b.reduce((sum, bi) => sum + bi * bi, 0));\r\n \r\n if (magnitudeA === 0 || magnitudeB === 0) {\r\n return 0;\r\n }\r\n \r\n return dotProduct / (magnitudeA * magnitudeB);\r\n}\r\n\r\nexport function calculateCosineDistances(sentences: SentenceDict[]): [number[], SentenceDict[]] {\r\n const distances: number[] = [];\r\n \r\n for (let i = 0; i < sentences.length - 1; i++) {\r\n const embeddingCurrent = sentences[i].combinedSentenceEmbedding!;\r\n const embeddingNext = sentences[i + 1].combinedSentenceEmbedding!;\r\n \r\n // Calculate cosine similarity\r\n const similarity = cosineSimilarity(embeddingCurrent, embeddingNext);\r\n \r\n // Convert to cosine distance (1 - cosine similarity)\r\n const distance = 1 - similarity;\r\n distances.push(distance);\r\n }\r\n \r\n return [distances, sentences];\r\n}\r\n\r\ntype BreakpointThresholdType = \"percentile\" | \"standard_deviation\" | \"interquartile\" | \"gradient\";\r\n\r\nconst BREAKPOINT_DEFAULTS: Record<BreakpointThresholdType, number> = {\r\n percentile: 95,\r\n standard_deviation: 3,\r\n interquartile: 1.5,\r\n gradient: 95,\r\n};\r\n\r\nexport interface SemanticTextSplitterParams {\r\n embeddings: Embeddings;\r\n bufferSize?: number;\r\n addStartIndex?: boolean;\r\n breakpointThresholdType?: BreakpointThresholdType;\r\n breakpointThresholdAmount?: number;\r\n numberOfChunks?: number;\r\n sentenceSplitRegex?: string;\r\n minChunkSize?: number;\r\n}\r\n\r\nexport class SemanticTextSplitter extends BaseDocumentTransformer {\r\n private embeddings: Embeddings;\r\n private bufferSize: number;\r\n private addStartIndex: boolean;\r\n private breakpointThresholdType: BreakpointThresholdType;\r\n private breakpointThresholdAmount: number;\r\n private numberOfChunks?: number;\r\n private sentenceSplitRegex: string;\r\n private minChunkSize?: number;\r\n\r\n constructor(params: SemanticTextSplitterParams) {\r\n super();\r\n this.embeddings = params.embeddings;\r\n this.bufferSize = params.bufferSize ?? 1;\r\n this.addStartIndex = params.addStartIndex ?? false;\r\n this.breakpointThresholdType = params.breakpointThresholdType ?? \"percentile\";\r\n this.numberOfChunks = params.numberOfChunks;\r\n this.sentenceSplitRegex = params.sentenceSplitRegex ?? \"(?<=[.?!])\\\\s+\";\r\n this.minChunkSize = params.minChunkSize;\r\n \r\n if (params.breakpointThresholdAmount === undefined) {\r\n this.breakpointThresholdAmount = BREAKPOINT_DEFAULTS[this.breakpointThresholdType];\r\n } else {\r\n this.breakpointThresholdAmount = params.breakpointThresholdAmount;\r\n }\r\n }\r\n\r\n private calculateBreakpointThreshold(distances: number[]): [number, number[]] {\r\n if (this.breakpointThresholdType === \"percentile\") {\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const index = Math.ceil((this.breakpointThresholdAmount / 100) * sorted.length) - 1;\r\n return [sorted[Math.max(0, index)], distances];\r\n } else if (this.breakpointThresholdType === \"standard_deviation\") {\r\n const mean = distances.reduce((sum, d) => sum + d, 0) / distances.length;\r\n const variance = distances.reduce((sum, d) => sum + Math.pow(d - mean, 2), 0) / distances.length;\r\n const stdDev = Math.sqrt(variance);\r\n return [mean + this.breakpointThresholdAmount * stdDev, distances];\r\n } else if (this.breakpointThresholdType === \"interquartile\") {\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const q1Index = Math.floor(0.25 * sorted.length);\r\n const q3Index = Math.floor(0.75 * sorted.length);\r\n const q1 = sorted[q1Index];\r\n const q3 = sorted[q3Index];\r\n const iqr = q3 - q1;\r\n const mean = distances.reduce((sum, d) => sum + d, 0) / distances.length;\r\n return [mean + this.breakpointThresholdAmount * iqr, distances];\r\n } else if (this.breakpointThresholdType === \"gradient\") {\r\n const distanceGradient: number[] = [];\r\n for (let i = 0; i < distances.length - 1; i++) {\r\n distanceGradient.push(distances[i + 1] - distances[i]);\r\n }\r\n const sortedGradient = [...distanceGradient].sort((a, b) => a - b);\r\n const index = Math.ceil((this.breakpointThresholdAmount / 100) * sortedGradient.length) - 1;\r\n return [sortedGradient[Math.max(0, index)], distanceGradient];\r\n } else {\r\n throw new Error(`Got unexpected breakpointThresholdType: ${this.breakpointThresholdType}`);\r\n }\r\n }\r\n\r\n private thresholdFromClusters(distances: number[]): number {\r\n if (this.numberOfChunks === undefined) {\r\n throw new Error(\"This should never be called if numberOfChunks is undefined.\");\r\n }\r\n \r\n const x1 = distances.length;\r\n const y1 = 0.0;\r\n const x2 = 1.0;\r\n const y2 = 100.0;\r\n \r\n const x = Math.max(Math.min(this.numberOfChunks, x1), x2);\r\n \r\n let y: number;\r\n if (x2 === x1) {\r\n y = y2;\r\n } else {\r\n y = y1 + ((y2 - y1) / (x2 - x1)) * (x - x1);\r\n }\r\n \r\n y = Math.min(Math.max(y, 0), 100);\r\n \r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const index = Math.ceil((y / 100) * sorted.length) - 1;\r\n return sorted[Math.max(0, index)];\r\n }\r\n\r\n private async calculateSentenceDistances(singleSentencesList: string[]): Promise<[number[], SentenceDict[]]> {\r\n const sentences: SentenceDict[] = singleSentencesList.map((sentence, index) => ({\r\n sentence,\r\n index,\r\n }));\r\n\r\n const sentencesWithCombined = combineSentences(sentences, this.bufferSize);\r\n \r\n const combinedSentences = sentencesWithCombined.map(s => s.combinedSentence!);\r\n const embeddings = await this.embeddings.embedDocuments(combinedSentences);\r\n \r\n for (let i = 0; i < sentencesWithCombined.length; i++) {\r\n sentencesWithCombined[i].combinedSentenceEmbedding = embeddings[i];\r\n }\r\n\r\n return calculateCosineDistances(sentencesWithCombined);\r\n }\r\n\r\n private getSingleSentencesList(text: string): string[] {\r\n return text.split(new RegExp(this.sentenceSplitRegex)).filter(sentence => sentence.trim().length > 0);\r\n }\r\n\r\n async splitText(text: string): Promise<string[]> {\r\n const singleSentencesList = this.getSingleSentencesList(text);\r\n\r\n if (singleSentencesList.length === 1) {\r\n return singleSentencesList;\r\n }\r\n\r\n if (this.breakpointThresholdType === \"gradient\" && singleSentencesList.length === 2) {\r\n return singleSentencesList;\r\n }\r\n\r\n const [distances, sentences] = await this.calculateSentenceDistances(singleSentencesList);\r\n\r\n let breakpointDistanceThreshold: number;\r\n let breakpointArray: number[];\r\n\r\n if (this.numberOfChunks !== undefined) {\r\n breakpointDistanceThreshold = this.thresholdFromClusters(distances);\r\n breakpointArray = distances;\r\n } else {\r\n [breakpointDistanceThreshold, breakpointArray] = this.calculateBreakpointThreshold(distances);\r\n }\r\n\r\n const indicesAboveThresh = breakpointArray\r\n .map((x, i) => ({ value: x, index: i }))\r\n .filter(({ value }) => value > breakpointDistanceThreshold)\r\n .map(({ index }) => index);\r\n\r\n const chunks: string[] = [];\r\n let startIndex = 0;\r\n\r\n for (const index of indicesAboveThresh) {\r\n const endIndex = index;\r\n const group = sentences.slice(startIndex, endIndex + 1);\r\n const combinedText = group.map((d: SentenceDict) => d.sentence).join(\" \");\r\n\r\n if (this.minChunkSize !== undefined && combinedText.length < this.minChunkSize) {\r\n continue;\r\n }\r\n \r\n chunks.push(combinedText);\r\n startIndex = index + 1;\r\n }\r\n\r\n if (startIndex < sentences.length) {\r\n const combinedText = sentences.slice(startIndex).map((d: SentenceDict) => d.sentence).join(\" \");\r\n chunks.push(combinedText);\r\n }\r\n\r\n return chunks;\r\n }\r\n\r\n async createDocuments(\r\n texts: string[],\r\n metadatas: Record<string, any>[] = []\r\n ): Promise<Document[]> {\r\n const _metadatas = metadatas.length > 0 ? metadatas : texts.map(() => (<Record<string, any>>{}));\r\n const documents: Document[] = [];\r\n \r\n for (let i = 0; i < texts.length; i++) {\r\n const text = texts[i];\r\n let startIndex = 0;\r\n \r\n for (const chunk of await this.splitText(text)) {\r\n const metadata = { ..._metadatas[i] };\r\n if (this.addStartIndex) {\r\n metadata.start_index = startIndex;\r\n }\r\n \r\n const newDoc = new Document({\r\n pageContent: chunk,\r\n metadata,\r\n });\r\n \r\n documents.push(newDoc);\r\n startIndex += chunk.length;\r\n }\r\n }\r\n \r\n return documents;\r\n }\r\n\r\n async splitDocuments(documents: Document[]): Promise<Document[]> {\r\n const texts = documents.map(doc => doc.pageContent);\r\n const metadatas = documents.map(doc => doc.metadata);\r\n return this.createDocuments(texts, metadatas);\r\n }\r\n\r\n async transformDocuments(documents: Document[]): Promise<Document[]> {\r\n return this.splitDocuments(documents);\r\n }\r\n}"],"mappings":";;;;;;;;AAgBA,SAAS,iBAAiB,WAA2B,aAAqB,GAAmB;AAE3F,MAAK,IAAI,IAAI,GAAG,IAAI,UAAU,QAAQ,KAAK;EACzC,MAAM,WAAW,UAAU;EAG3B,IAAI,mBAAmB;AAGvB,OAAK,IAAI,IAAI,KAAK,IAAI,GAAG,IAAI,WAAW,EAAE,IAAI,GAAG,IAC/C,qBAAoB,UAAU,GAAG,WAAW;AAI9C,sBAAoB,SAAS;AAG7B,OAAK,IAAI,IAAI,IAAI,GAAG,KAAK,KAAK,IAAI,UAAU,SAAS,GAAG,IAAI,WAAW,EAAE,IACvE,qBAAoB,MAAM,UAAU,GAAG;AAIzC,WAAS,mBAAmB,iBAAiB,MAAM;;AAGrD,QAAO;;AAGT,SAAS,iBAAiB,GAAa,GAAqB;CAC1D,MAAM,aAAa,EAAE,QAAQ,KAAK,IAAI,MAAM,MAAM,KAAK,EAAE,IAAI,EAAE;CAC/D,MAAM,aAAa,KAAK,KAAK,EAAE,QAAQ,KAAK,OAAO,MAAM,KAAK,IAAI,EAAE,CAAC;CACrE,MAAM,aAAa,KAAK,KAAK,EAAE,QAAQ,KAAK,OAAO,MAAM,KAAK,IAAI,EAAE,CAAC;AAErE,KAAI,eAAe,KAAK,eAAe,EACrC,QAAO;AAGT,QAAO,cAAc,aAAa;;AAGpC,SAAgB,yBAAyB,WAAuD;CAC9F,MAAM,YAAsB,EAAE;AAE9B,MAAK,IAAI,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,KAAK;EAC7C,MAAM,mBAAmB,UAAU,GAAG;EACtC,MAAM,gBAAgB,UAAU,IAAI,GAAG;EAMvC,MAAM,WAAW,IAHE,iBAAiB,kBAAkB,cAAc;AAIpE,YAAU,KAAK,SAAS;;AAG1B,QAAO,CAAC,WAAW,UAAU;;AAK/B,MAAM,sBAA+D;CACnE,YAAY;CACZ,oBAAoB;CACpB,eAAe;CACf,UAAU;CACX;AAaD,IAAa,uBAAb,cAA0C,wBAAwB;CAChE;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CAEA,YAAY,QAAoC;AAC9C,SAAO;AACP,OAAK,aAAa,OAAO;AACzB,OAAK,aAAa,OAAO,cAAc;AACvC,OAAK,gBAAgB,OAAO,iBAAiB;AAC7C,OAAK,0BAA0B,OAAO,2BAA2B;AACjE,OAAK,iBAAiB,OAAO;AAC7B,OAAK,qBAAqB,OAAO,sBAAsB;AACvD,OAAK,eAAe,OAAO;AAE3B,MAAI,OAAO,8BAA8B,KAAA,EACvC,MAAK,4BAA4B,oBAAoB,KAAK;MAE1D,MAAK,4BAA4B,OAAO;;CAI5C,6BAAqC,WAAyC;AAC5E,MAAI,KAAK,4BAA4B,cAAc;GACjD,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GACnD,MAAM,QAAQ,KAAK,KAAM,KAAK,4BAA4B,MAAO,OAAO,OAAO,GAAG;AAClF,UAAO,CAAC,OAAO,KAAK,IAAI,GAAG,MAAM,GAAG,UAAU;aACrC,KAAK,4BAA4B,sBAAsB;GAChE,MAAM,OAAO,UAAU,QAAQ,KAAK,MAAM,MAAM,GAAG,EAAE,GAAG,UAAU;GAClE,MAAM,WAAW,UAAU,QAAQ,KAAK,MAAM,MAAM,KAAK,IAAI,IAAI,MAAM,EAAE,EAAE,EAAE,GAAG,UAAU;GAC1F,MAAM,SAAS,KAAK,KAAK,SAAS;AAClC,UAAO,CAAC,OAAO,KAAK,4BAA4B,QAAQ,UAAU;aACzD,KAAK,4BAA4B,iBAAiB;GAC3D,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GACnD,MAAM,UAAU,KAAK,MAAM,MAAO,OAAO,OAAO;GAChD,MAAM,UAAU,KAAK,MAAM,MAAO,OAAO,OAAO;GAChD,MAAM,KAAK,OAAO;GAElB,MAAM,MADK,OAAO,WACD;AAEjB,UAAO,CADM,UAAU,QAAQ,KAAK,MAAM,MAAM,GAAG,EAAE,GAAG,UAAU,SACnD,KAAK,4BAA4B,KAAK,UAAU;aACtD,KAAK,4BAA4B,YAAY;GACtD,MAAM,mBAA6B,EAAE;AACrC,QAAK,IAAI,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,IACxC,kBAAiB,KAAK,UAAU,IAAI,KAAK,UAAU,GAAG;GAExD,MAAM,iBAAiB,CAAC,GAAG,iBAAiB,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GAClE,MAAM,QAAQ,KAAK,KAAM,KAAK,4BAA4B,MAAO,eAAe,OAAO,GAAG;AAC1F,UAAO,CAAC,eAAe,KAAK,IAAI,GAAG,MAAM,GAAG,iBAAiB;QAE7D,OAAM,IAAI,MAAM,2CAA2C,KAAK,0BAA0B;;CAI9F,sBAA8B,WAA6B;AACzD,MAAI,KAAK,mBAAmB,KAAA,EAC1B,OAAM,IAAI,MAAM,8DAA8D;EAGhF,MAAM,KAAK,UAAU;EACrB,MAAM,KAAK;EACX,MAAM,KAAK;EACX,MAAM,KAAK;EAEX,MAAM,IAAI,KAAK,IAAI,KAAK,IAAI,KAAK,gBAAgB,GAAG,EAAE,GAAG;EAEzD,IAAI;AACJ,MAAI,OAAO,GACT,KAAI;MAEJ,KAAI,MAAO,KAAK,OAAO,KAAK,OAAQ,IAAI;AAG1C,MAAI,KAAK,IAAI,KAAK,IAAI,GAAG,EAAE,EAAE,IAAI;EAEjC,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;EACnD,MAAM,QAAQ,KAAK,KAAM,IAAI,MAAO,OAAO,OAAO,GAAG;AACrD,SAAO,OAAO,KAAK,IAAI,GAAG,MAAM;;CAGlC,MAAc,2BAA2B,qBAAoE;EAM3G,MAAM,wBAAwB,iBALI,oBAAoB,KAAK,UAAU,WAAW;GAC9E;GACA;GACD,EAAE,EAEuD,KAAK,WAAW;EAE1E,MAAM,oBAAoB,sBAAsB,KAAI,MAAK,EAAE,iBAAkB;EAC7E,MAAM,aAAa,MAAM,KAAK,WAAW,eAAe,kBAAkB;AAE1E,OAAK,IAAI,IAAI,GAAG,IAAI,sBAAsB,QAAQ,IAChD,uBAAsB,GAAG,4BAA4B,WAAW;AAGlE,SAAO,yBAAyB,sBAAsB;;CAGxD,uBAA+B,MAAwB;AACrD,SAAO,KAAK,MAAM,IAAI,OAAO,KAAK,mBAAmB,CAAC,CAAC,QAAO,aAAY,SAAS,MAAM,CAAC,SAAS,EAAE;;CAGvG,MAAM,UAAU,MAAiC;EAC/C,MAAM,sBAAsB,KAAK,uBAAuB,KAAK;AAE7D,MAAI,oBAAoB,WAAW,EACjC,QAAO;AAGT,MAAI,KAAK,4BAA4B,cAAc,oBAAoB,WAAW,EAChF,QAAO;EAGT,MAAM,CAAC,WAAW,aAAa,MAAM,KAAK,2BAA2B,oBAAoB;EAEzF,IAAI;EACJ,IAAI;AAEJ,MAAI,KAAK,mBAAmB,KAAA,GAAW;AACrC,iCAA8B,KAAK,sBAAsB,UAAU;AACnE,qBAAkB;QAElB,EAAC,6BAA6B,mBAAmB,KAAK,6BAA6B,UAAU;EAG/F,MAAM,qBAAqB,gBACxB,KAAK,GAAG,OAAO;GAAE,OAAO;GAAG,OAAO;GAAG,EAAE,CACvC,QAAQ,EAAE,YAAY,QAAQ,4BAA4B,CAC1D,KAAK,EAAE,YAAY,MAAM;EAE5B,MAAM,SAAmB,EAAE;EAC3B,IAAI,aAAa;AAEjB,OAAK,MAAM,SAAS,oBAAoB;GACtC,MAAM,WAAW;GAEjB,MAAM,eADQ,UAAU,MAAM,YAAY,WAAW,EAAE,CAC5B,KAAK,MAAoB,EAAE,SAAS,CAAC,KAAK,IAAI;AAEzE,OAAI,KAAK,iBAAiB,KAAA,KAAa,aAAa,SAAS,KAAK,aAChE;AAGF,UAAO,KAAK,aAAa;AACzB,gBAAa,QAAQ;;AAGvB,MAAI,aAAa,UAAU,QAAQ;GACjC,MAAM,eAAe,UAAU,MAAM,WAAW,CAAC,KAAK,MAAoB,EAAE,SAAS,CAAC,KAAK,IAAI;AAC/F,UAAO,KAAK,aAAa;;AAG3B,SAAO;;CAGT,MAAM,gBACJ,OACA,YAAmC,EAAE,EAChB;EACrB,MAAM,aAAa,UAAU,SAAS,IAAI,YAAY,MAAM,WAAgC,EAAE,EAAE;EAChG,MAAM,YAAwB,EAAE;AAEhC,OAAK,IAAI,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;GACrC,MAAM,OAAO,MAAM;GACnB,IAAI,aAAa;AAEjB,QAAK,MAAM,SAAS,MAAM,KAAK,UAAU,KAAK,EAAE;IAC9C,MAAM,WAAW,EAAE,GAAG,WAAW,IAAI;AACrC,QAAI,KAAK,cACP,UAAS,cAAc;IAGzB,MAAM,SAAS,IAAI,SAAS;KAC1B,aAAa;KACb;KACD,CAAC;AAEF,cAAU,KAAK,OAAO;AACtB,kBAAc,MAAM;;;AAIxB,SAAO;;CAGT,MAAM,eAAe,WAA4C;EAC/D,MAAM,QAAQ,UAAU,KAAI,QAAO,IAAI,YAAY;EACnD,MAAM,YAAY,UAAU,KAAI,QAAO,IAAI,SAAS;AACpD,SAAO,KAAK,gBAAgB,OAAO,UAAU;;CAG/C,MAAM,mBAAmB,WAA4C;AACnE,SAAO,KAAK,eAAe,UAAU"}
|
|
1
|
+
{"version":3,"file":"semantic_text_splitter.js","names":[],"sources":["../src/semantic_text_splitter.ts"],"sourcesContent":["/**\r\n * Experimental semantic text splitter based on semantic similarity.\r\n *\r\n * Inspired by Greg Kamradt's semantic chunking approach:\r\n * https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb\r\n */\r\nimport { Document, BaseDocumentTransformer } from \"@langchain/core/documents\";\r\nimport { Embeddings } from \"@langchain/core/embeddings\";\r\n\r\ninterface SentenceDict {\r\n sentence: string;\r\n index: number;\r\n combinedSentence?: string;\r\n combinedSentenceEmbedding?: number[];\r\n}\r\n\r\nfunction combineSentences(\r\n sentences: SentenceDict[],\r\n bufferSize: number = 1,\r\n): SentenceDict[] {\r\n // Go through each sentence dict\r\n for (let i = 0; i < sentences.length; i++) {\r\n const sentence = sentences[i];\r\n\r\n // Create the combined sentence by combining the sentences within the buffer\r\n let combinedSentence = \"\";\r\n\r\n // Add sentences before the current one\r\n for (let j = Math.max(0, i - bufferSize); j < i; j++) {\r\n combinedSentence += sentences[j].sentence + \" \";\r\n }\r\n\r\n // Add the current sentence\r\n combinedSentence += sentence.sentence;\r\n\r\n // Add sentences after the current one\r\n for (\r\n let j = i + 1;\r\n j <= Math.min(sentences.length - 1, i + bufferSize);\r\n j++\r\n ) {\r\n combinedSentence += \" \" + sentences[j].sentence;\r\n }\r\n\r\n // Assign the combined sentence to the dict\r\n sentence.combinedSentence = combinedSentence.trim();\r\n }\r\n\r\n return sentences;\r\n}\r\n\r\nfunction cosineSimilarity(a: number[], b: number[]): number {\r\n const dotProduct = a.reduce((sum, ai, i) => sum + ai * b[i], 0);\r\n const magnitudeA = Math.sqrt(a.reduce((sum, ai) => sum + ai * ai, 0));\r\n const magnitudeB = Math.sqrt(b.reduce((sum, bi) => sum + bi * bi, 0));\r\n\r\n if (magnitudeA === 0 || magnitudeB === 0) {\r\n return 0;\r\n }\r\n\r\n return dotProduct / (magnitudeA * magnitudeB);\r\n}\r\n\r\n/**\r\n * Calculates the cosine distance (1 - cosine similarity) between sequential sentences.\r\n *\r\n * @param sentences - An array of sentences that already contain generated embeddings.\r\n * @returns A tuple where the first element is an array of distances between adjacent sentences, and the second is the original array of sentences.\r\n */\r\nexport function calculateCosineDistances(\r\n sentences: SentenceDict[],\r\n): [number[], SentenceDict[]] {\r\n const distances: number[] = [];\r\n\r\n for (let i = 0; i < sentences.length - 1; i++) {\r\n const embeddingCurrent = sentences[i].combinedSentenceEmbedding!;\r\n const embeddingNext = sentences[i + 1].combinedSentenceEmbedding!;\r\n\r\n // Calculate cosine similarity\r\n const similarity = cosineSimilarity(embeddingCurrent, embeddingNext);\r\n\r\n // Convert to cosine distance (1 - cosine similarity)\r\n const distance = 1 - similarity;\r\n distances.push(distance);\r\n }\r\n\r\n return [distances, sentences];\r\n}\r\n\r\ntype BreakpointThresholdType =\r\n | \"percentile\"\r\n | \"standard_deviation\"\r\n | \"interquartile\"\r\n | \"gradient\";\r\n\r\nconst BREAKPOINT_DEFAULTS: Record<BreakpointThresholdType, number> = {\r\n percentile: 95,\r\n standard_deviation: 3,\r\n interquartile: 1.5,\r\n gradient: 95,\r\n};\r\n\r\n/**\r\n * Configuration parameters for the SemanticTextSplitter.\r\n */\r\nexport interface SemanticTextSplitterParams {\r\n /** The embeddings model to calculate string semantic similarities. */\r\n embeddings: Embeddings;\r\n /** The number of contextual surrounding sentences to bundle with the target sentence for embeddings. Default is 1. */\r\n bufferSize?: number;\r\n /** Whether to add the starting index of the chunk to the metadata. Default is false. */\r\n addStartIndex?: boolean;\r\n /** The statistical method used to determine where chunks should be split. Default is \"percentile\". */\r\n breakpointThresholdType?: BreakpointThresholdType;\r\n /** The specific threshold amount corresponding to the algorithm type. Falls back to predefined defaults if not provided. */\r\n breakpointThresholdAmount?: number;\r\n /** The exact number of chunks to produce. Hardcoded clustering thresholds are used if provided. */\r\n numberOfChunks?: number;\r\n /** A regex pattern used to divide the raw text into individual sentences. Default splits at '.', '?', and '!'. */\r\n sentenceSplitRegex?: string;\r\n /** An optional constraint to combine chunks dynamically until they reach this minimum character length. */\r\n minChunkSize?: number;\r\n}\r\n\r\n/**\r\n * A Document Transformer that splits texts by comparing the semantic similarity of sequential sentences.\r\n * It ensures sentences with high contextual relationships remain in the same chunk.\r\n */\r\nexport class SemanticTextSplitter extends BaseDocumentTransformer {\r\n private embeddings: Embeddings;\r\n private bufferSize: number;\r\n private addStartIndex: boolean;\r\n private breakpointThresholdType: BreakpointThresholdType;\r\n private breakpointThresholdAmount: number;\r\n private numberOfChunks?: number;\r\n private sentenceSplitRegex: string;\r\n private minChunkSize?: number;\r\n\r\n /**\r\n * Constructs a new SemanticTextSplitter.\r\n * @param params - The configuration options initializing the text splitter.\r\n */\r\n constructor(params: SemanticTextSplitterParams) {\r\n super();\r\n this.embeddings = params.embeddings;\r\n this.bufferSize = params.bufferSize ?? 1;\r\n this.addStartIndex = params.addStartIndex ?? false;\r\n this.breakpointThresholdType =\r\n params.breakpointThresholdType ?? \"percentile\";\r\n this.numberOfChunks = params.numberOfChunks;\r\n this.sentenceSplitRegex = params.sentenceSplitRegex ?? \"(?<=[.?!])\\\\s+\";\r\n this.minChunkSize = params.minChunkSize;\r\n\r\n if (params.breakpointThresholdAmount === undefined) {\r\n this.breakpointThresholdAmount =\r\n BREAKPOINT_DEFAULTS[this.breakpointThresholdType];\r\n } else {\r\n this.breakpointThresholdAmount = params.breakpointThresholdAmount;\r\n }\r\n }\r\n\r\n private calculateBreakpointThreshold(\r\n distances: number[],\r\n ): [number, number[]] {\r\n if (this.breakpointThresholdType === \"percentile\") {\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const index =\r\n Math.ceil((this.breakpointThresholdAmount / 100) * sorted.length) - 1;\r\n return [sorted[Math.max(0, index)], distances];\r\n } else if (this.breakpointThresholdType === \"standard_deviation\") {\r\n const mean = distances.reduce((sum, d) => sum + d, 0) / distances.length;\r\n const variance =\r\n distances.reduce((sum, d) => sum + Math.pow(d - mean, 2), 0) /\r\n distances.length;\r\n const stdDev = Math.sqrt(variance);\r\n return [mean + this.breakpointThresholdAmount * stdDev, distances];\r\n } else if (this.breakpointThresholdType === \"interquartile\") {\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const q1Index = Math.floor(0.25 * sorted.length);\r\n const q3Index = Math.floor(0.75 * sorted.length);\r\n const q1 = sorted[q1Index];\r\n const q3 = sorted[q3Index];\r\n const iqr = q3 - q1;\r\n const mean = distances.reduce((sum, d) => sum + d, 0) / distances.length;\r\n return [mean + this.breakpointThresholdAmount * iqr, distances];\r\n } else if (this.breakpointThresholdType === \"gradient\") {\r\n const distanceGradient: number[] = [];\r\n for (let i = 0; i < distances.length - 1; i++) {\r\n distanceGradient.push(distances[i + 1] - distances[i]);\r\n }\r\n const sortedGradient = [...distanceGradient].sort((a, b) => a - b);\r\n const index =\r\n Math.ceil(\r\n (this.breakpointThresholdAmount / 100) * sortedGradient.length,\r\n ) - 1;\r\n return [sortedGradient[Math.max(0, index)], distanceGradient];\r\n } else {\r\n throw new Error(\r\n `Got unexpected breakpointThresholdType: ${this.breakpointThresholdType}`,\r\n );\r\n }\r\n }\r\n\r\n private thresholdFromClusters(distances: number[]): number {\r\n if (this.numberOfChunks === undefined) {\r\n throw new Error(\r\n \"This should never be called if numberOfChunks is undefined.\",\r\n );\r\n }\r\n\r\n const x1 = distances.length;\r\n const y1 = 0.0;\r\n const x2 = 1.0;\r\n const y2 = 100.0;\r\n\r\n const x = Math.max(Math.min(this.numberOfChunks, x1), x2);\r\n\r\n let y: number;\r\n if (x2 === x1) {\r\n y = y2;\r\n } else {\r\n y = y1 + ((y2 - y1) / (x2 - x1)) * (x - x1);\r\n }\r\n\r\n y = Math.min(Math.max(y, 0), 100);\r\n\r\n const sorted = [...distances].sort((a, b) => a - b);\r\n const index = Math.ceil((y / 100) * sorted.length) - 1;\r\n return sorted[Math.max(0, index)];\r\n }\r\n\r\n private async calculateSentenceDistances(\r\n singleSentencesList: string[],\r\n ): Promise<[number[], SentenceDict[]]> {\r\n const sentences: SentenceDict[] = singleSentencesList.map(\r\n (sentence, index) => ({\r\n sentence,\r\n index,\r\n }),\r\n );\r\n\r\n const sentencesWithCombined = combineSentences(sentences, this.bufferSize);\r\n\r\n const combinedSentences = sentencesWithCombined.map(\r\n (s) => s.combinedSentence!,\r\n );\r\n const embeddings = await this.embeddings.embedDocuments(combinedSentences);\r\n\r\n for (let i = 0; i < sentencesWithCombined.length; i++) {\r\n sentencesWithCombined[i].combinedSentenceEmbedding = embeddings[i];\r\n }\r\n\r\n return calculateCosineDistances(sentencesWithCombined);\r\n }\r\n\r\n private getSingleSentencesList(text: string): string[] {\r\n return text\r\n .split(new RegExp(this.sentenceSplitRegex))\r\n .filter((sentence) => sentence.trim().length > 0);\r\n }\r\n\r\n /**\r\n * Core method to process a raw string of text and turn it into an array of chunked strings based on semantic proximity.\r\n *\r\n * @param text - The raw string input to be chunked.\r\n * @returns An array of string chunks derived from the original text snippet.\r\n */\r\n async splitText(text: string): Promise<string[]> {\r\n const singleSentencesList = this.getSingleSentencesList(text);\r\n\r\n if (singleSentencesList.length === 1) {\r\n return singleSentencesList;\r\n }\r\n\r\n if (\r\n this.breakpointThresholdType === \"gradient\" &&\r\n singleSentencesList.length === 2\r\n ) {\r\n return singleSentencesList;\r\n }\r\n\r\n const [distances, sentences] =\r\n await this.calculateSentenceDistances(singleSentencesList);\r\n\r\n let breakpointDistanceThreshold: number;\r\n let breakpointArray: number[];\r\n\r\n if (this.numberOfChunks !== undefined) {\r\n breakpointDistanceThreshold = this.thresholdFromClusters(distances);\r\n breakpointArray = distances;\r\n } else {\r\n [breakpointDistanceThreshold, breakpointArray] =\r\n this.calculateBreakpointThreshold(distances);\r\n }\r\n\r\n const indicesAboveThresh = breakpointArray\r\n .map((x, i) => ({ value: x, index: i }))\r\n .filter(({ value }) => value > breakpointDistanceThreshold)\r\n .map(({ index }) => index);\r\n\r\n const chunks: string[] = [];\r\n let startIndex = 0;\r\n\r\n for (const index of indicesAboveThresh) {\r\n const endIndex = index;\r\n const group = sentences.slice(startIndex, endIndex + 1);\r\n const combinedText = group.map((d: SentenceDict) => d.sentence).join(\" \");\r\n\r\n if (\r\n this.minChunkSize !== undefined &&\r\n combinedText.length < this.minChunkSize\r\n ) {\r\n continue;\r\n }\r\n\r\n chunks.push(combinedText);\r\n startIndex = index + 1;\r\n }\r\n\r\n if (startIndex < sentences.length) {\r\n const combinedText = sentences\r\n .slice(startIndex)\r\n .map((d: SentenceDict) => d.sentence)\r\n .join(\" \");\r\n chunks.push(combinedText);\r\n }\r\n\r\n return chunks;\r\n }\r\n\r\n /**\r\n * Takes raw strings and corresponding optional metadata, sending them through the underlying\r\n * semantic text splitting process, and forming standard LangChain `Document` objects.\r\n *\r\n * @param texts - Array of strings spanning the content to separate out.\r\n * @param metadatas - Optional array of metadata mappings that align 1:1 with texts.\r\n * @returns An array of freshly constructed LangChain Document instances containing the split text chunks.\r\n */\r\n async createDocuments(\r\n texts: string[],\r\n metadatas: Record<string, any>[] = [],\r\n ): Promise<Document[]> {\r\n const _metadatas =\r\n metadatas.length > 0\r\n ? metadatas\r\n : texts.map(() => <Record<string, any>>{});\r\n const documents: Document[] = [];\r\n\r\n for (let i = 0; i < texts.length; i++) {\r\n const text = texts[i];\r\n let startIndex = 0;\r\n\r\n for (const chunk of await this.splitText(text)) {\r\n const metadata = { ..._metadatas[i] };\r\n if (this.addStartIndex) {\r\n metadata.start_index = startIndex;\r\n }\r\n\r\n const newDoc = new Document({\r\n pageContent: chunk,\r\n metadata,\r\n });\r\n\r\n documents.push(newDoc);\r\n startIndex += chunk.length;\r\n }\r\n }\r\n\r\n return documents;\r\n }\r\n\r\n /**\r\n * Convenience method to take in standardized Documents, extract their text, split the\r\n * text semantically, and reconstruct them into smaller Documents.\r\n *\r\n * @param documents - The original Documents to break down.\r\n * @returns Semantically chunked Documents inherited from the originals.\r\n */\r\n async splitDocuments(documents: Document[]): Promise<Document[]> {\r\n const texts = documents.map((doc) => doc.pageContent);\r\n const metadatas = documents.map((doc) => doc.metadata);\r\n return this.createDocuments(texts, metadatas);\r\n }\r\n\r\n /**\r\n * Implements the base BaseDocumentTransformer requirement to process documents.\r\n * Equivalent to `splitDocuments()`.\r\n *\r\n * @param documents - The original Documents to transform.\r\n * @returns Transformed, semantically split Documents.\r\n */\r\n async transformDocuments(documents: Document[]): Promise<Document[]> {\r\n return this.splitDocuments(documents);\r\n }\r\n}\r\n"],"mappings":";;;;;;;;AAgBA,SAAS,iBACP,WACA,aAAqB,GACL;AAEhB,MAAK,IAAI,IAAI,GAAG,IAAI,UAAU,QAAQ,KAAK;EACzC,MAAM,WAAW,UAAU;EAG3B,IAAI,mBAAmB;AAGvB,OAAK,IAAI,IAAI,KAAK,IAAI,GAAG,IAAI,WAAW,EAAE,IAAI,GAAG,IAC/C,qBAAoB,UAAU,GAAG,WAAW;AAI9C,sBAAoB,SAAS;AAG7B,OACE,IAAI,IAAI,IAAI,GACZ,KAAK,KAAK,IAAI,UAAU,SAAS,GAAG,IAAI,WAAW,EACnD,IAEA,qBAAoB,MAAM,UAAU,GAAG;AAIzC,WAAS,mBAAmB,iBAAiB,MAAM;;AAGrD,QAAO;;AAGT,SAAS,iBAAiB,GAAa,GAAqB;CAC1D,MAAM,aAAa,EAAE,QAAQ,KAAK,IAAI,MAAM,MAAM,KAAK,EAAE,IAAI,EAAE;CAC/D,MAAM,aAAa,KAAK,KAAK,EAAE,QAAQ,KAAK,OAAO,MAAM,KAAK,IAAI,EAAE,CAAC;CACrE,MAAM,aAAa,KAAK,KAAK,EAAE,QAAQ,KAAK,OAAO,MAAM,KAAK,IAAI,EAAE,CAAC;AAErE,KAAI,eAAe,KAAK,eAAe,EACrC,QAAO;AAGT,QAAO,cAAc,aAAa;;;;;;;;AASpC,SAAgB,yBACd,WAC4B;CAC5B,MAAM,YAAsB,EAAE;AAE9B,MAAK,IAAI,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,KAAK;EAC7C,MAAM,mBAAmB,UAAU,GAAG;EACtC,MAAM,gBAAgB,UAAU,IAAI,GAAG;EAMvC,MAAM,WAAW,IAHE,iBAAiB,kBAAkB,cAAc;AAIpE,YAAU,KAAK,SAAS;;AAG1B,QAAO,CAAC,WAAW,UAAU;;AAS/B,MAAM,sBAA+D;CACnE,YAAY;CACZ,oBAAoB;CACpB,eAAe;CACf,UAAU;CACX;;;;;AA4BD,IAAa,uBAAb,cAA0C,wBAAwB;CAChE;CACA;CACA;CACA;CACA;CACA;CACA;CACA;;;;;CAMA,YAAY,QAAoC;AAC9C,SAAO;AACP,OAAK,aAAa,OAAO;AACzB,OAAK,aAAa,OAAO,cAAc;AACvC,OAAK,gBAAgB,OAAO,iBAAiB;AAC7C,OAAK,0BACH,OAAO,2BAA2B;AACpC,OAAK,iBAAiB,OAAO;AAC7B,OAAK,qBAAqB,OAAO,sBAAsB;AACvD,OAAK,eAAe,OAAO;AAE3B,MAAI,OAAO,8BAA8B,KAAA,EACvC,MAAK,4BACH,oBAAoB,KAAK;MAE3B,MAAK,4BAA4B,OAAO;;CAI5C,6BACE,WACoB;AACpB,MAAI,KAAK,4BAA4B,cAAc;GACjD,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GACnD,MAAM,QACJ,KAAK,KAAM,KAAK,4BAA4B,MAAO,OAAO,OAAO,GAAG;AACtE,UAAO,CAAC,OAAO,KAAK,IAAI,GAAG,MAAM,GAAG,UAAU;aACrC,KAAK,4BAA4B,sBAAsB;GAChE,MAAM,OAAO,UAAU,QAAQ,KAAK,MAAM,MAAM,GAAG,EAAE,GAAG,UAAU;GAClE,MAAM,WACJ,UAAU,QAAQ,KAAK,MAAM,MAAM,KAAK,IAAI,IAAI,MAAM,EAAE,EAAE,EAAE,GAC5D,UAAU;GACZ,MAAM,SAAS,KAAK,KAAK,SAAS;AAClC,UAAO,CAAC,OAAO,KAAK,4BAA4B,QAAQ,UAAU;aACzD,KAAK,4BAA4B,iBAAiB;GAC3D,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GACnD,MAAM,UAAU,KAAK,MAAM,MAAO,OAAO,OAAO;GAChD,MAAM,UAAU,KAAK,MAAM,MAAO,OAAO,OAAO;GAChD,MAAM,KAAK,OAAO;GAElB,MAAM,MADK,OAAO,WACD;AAEjB,UAAO,CADM,UAAU,QAAQ,KAAK,MAAM,MAAM,GAAG,EAAE,GAAG,UAAU,SACnD,KAAK,4BAA4B,KAAK,UAAU;aACtD,KAAK,4BAA4B,YAAY;GACtD,MAAM,mBAA6B,EAAE;AACrC,QAAK,IAAI,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,IACxC,kBAAiB,KAAK,UAAU,IAAI,KAAK,UAAU,GAAG;GAExD,MAAM,iBAAiB,CAAC,GAAG,iBAAiB,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;GAClE,MAAM,QACJ,KAAK,KACF,KAAK,4BAA4B,MAAO,eAAe,OACzD,GAAG;AACN,UAAO,CAAC,eAAe,KAAK,IAAI,GAAG,MAAM,GAAG,iBAAiB;QAE7D,OAAM,IAAI,MACR,2CAA2C,KAAK,0BACjD;;CAIL,sBAA8B,WAA6B;AACzD,MAAI,KAAK,mBAAmB,KAAA,EAC1B,OAAM,IAAI,MACR,8DACD;EAGH,MAAM,KAAK,UAAU;EACrB,MAAM,KAAK;EACX,MAAM,KAAK;EACX,MAAM,KAAK;EAEX,MAAM,IAAI,KAAK,IAAI,KAAK,IAAI,KAAK,gBAAgB,GAAG,EAAE,GAAG;EAEzD,IAAI;AACJ,MAAI,OAAO,GACT,KAAI;MAEJ,KAAI,MAAO,KAAK,OAAO,KAAK,OAAQ,IAAI;AAG1C,MAAI,KAAK,IAAI,KAAK,IAAI,GAAG,EAAE,EAAE,IAAI;EAEjC,MAAM,SAAS,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,MAAM,IAAI,EAAE;EACnD,MAAM,QAAQ,KAAK,KAAM,IAAI,MAAO,OAAO,OAAO,GAAG;AACrD,SAAO,OAAO,KAAK,IAAI,GAAG,MAAM;;CAGlC,MAAc,2BACZ,qBACqC;EAQrC,MAAM,wBAAwB,iBAPI,oBAAoB,KACnD,UAAU,WAAW;GACpB;GACA;GACD,EACF,EAEyD,KAAK,WAAW;EAE1E,MAAM,oBAAoB,sBAAsB,KAC7C,MAAM,EAAE,iBACV;EACD,MAAM,aAAa,MAAM,KAAK,WAAW,eAAe,kBAAkB;AAE1E,OAAK,IAAI,IAAI,GAAG,IAAI,sBAAsB,QAAQ,IAChD,uBAAsB,GAAG,4BAA4B,WAAW;AAGlE,SAAO,yBAAyB,sBAAsB;;CAGxD,uBAA+B,MAAwB;AACrD,SAAO,KACJ,MAAM,IAAI,OAAO,KAAK,mBAAmB,CAAC,CAC1C,QAAQ,aAAa,SAAS,MAAM,CAAC,SAAS,EAAE;;;;;;;;CASrD,MAAM,UAAU,MAAiC;EAC/C,MAAM,sBAAsB,KAAK,uBAAuB,KAAK;AAE7D,MAAI,oBAAoB,WAAW,EACjC,QAAO;AAGT,MACE,KAAK,4BAA4B,cACjC,oBAAoB,WAAW,EAE/B,QAAO;EAGT,MAAM,CAAC,WAAW,aAChB,MAAM,KAAK,2BAA2B,oBAAoB;EAE5D,IAAI;EACJ,IAAI;AAEJ,MAAI,KAAK,mBAAmB,KAAA,GAAW;AACrC,iCAA8B,KAAK,sBAAsB,UAAU;AACnE,qBAAkB;QAElB,EAAC,6BAA6B,mBAC5B,KAAK,6BAA6B,UAAU;EAGhD,MAAM,qBAAqB,gBACxB,KAAK,GAAG,OAAO;GAAE,OAAO;GAAG,OAAO;GAAG,EAAE,CACvC,QAAQ,EAAE,YAAY,QAAQ,4BAA4B,CAC1D,KAAK,EAAE,YAAY,MAAM;EAE5B,MAAM,SAAmB,EAAE;EAC3B,IAAI,aAAa;AAEjB,OAAK,MAAM,SAAS,oBAAoB;GACtC,MAAM,WAAW;GAEjB,MAAM,eADQ,UAAU,MAAM,YAAY,WAAW,EAAE,CAC5B,KAAK,MAAoB,EAAE,SAAS,CAAC,KAAK,IAAI;AAEzE,OACE,KAAK,iBAAiB,KAAA,KACtB,aAAa,SAAS,KAAK,aAE3B;AAGF,UAAO,KAAK,aAAa;AACzB,gBAAa,QAAQ;;AAGvB,MAAI,aAAa,UAAU,QAAQ;GACjC,MAAM,eAAe,UAClB,MAAM,WAAW,CACjB,KAAK,MAAoB,EAAE,SAAS,CACpC,KAAK,IAAI;AACZ,UAAO,KAAK,aAAa;;AAG3B,SAAO;;;;;;;;;;CAWT,MAAM,gBACJ,OACA,YAAmC,EAAE,EAChB;EACrB,MAAM,aACJ,UAAU,SAAS,IACf,YACA,MAAM,WAA+B,EAAE,EAAC;EAC9C,MAAM,YAAwB,EAAE;AAEhC,OAAK,IAAI,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;GACrC,MAAM,OAAO,MAAM;GACnB,IAAI,aAAa;AAEjB,QAAK,MAAM,SAAS,MAAM,KAAK,UAAU,KAAK,EAAE;IAC9C,MAAM,WAAW,EAAE,GAAG,WAAW,IAAI;AACrC,QAAI,KAAK,cACP,UAAS,cAAc;IAGzB,MAAM,SAAS,IAAI,SAAS;KAC1B,aAAa;KACb;KACD,CAAC;AAEF,cAAU,KAAK,OAAO;AACtB,kBAAc,MAAM;;;AAIxB,SAAO;;;;;;;;;CAUT,MAAM,eAAe,WAA4C;EAC/D,MAAM,QAAQ,UAAU,KAAK,QAAQ,IAAI,YAAY;EACrD,MAAM,YAAY,UAAU,KAAK,QAAQ,IAAI,SAAS;AACtD,SAAO,KAAK,gBAAgB,OAAO,UAAU;;;;;;;;;CAU/C,MAAM,mBAAmB,WAA4C;AACnE,SAAO,KAAK,eAAe,UAAU"}
|