@mastra/rag 1.2.2 → 1.2.3-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +22 -0
- package/dist/index.cjs +25 -9
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +25 -9
- package/dist/index.js.map +1 -1
- package/dist/tools/graph-rag.d.ts.map +1 -1
- package/dist/tools/types.d.ts +18 -5
- package/dist/tools/types.d.ts.map +1 -1
- package/dist/tools/vector-query.d.ts.map +1 -1
- package/dist/utils/vector-search.d.ts +6 -7
- package/dist/utils/vector-search.d.ts.map +1 -1
- package/package.json +19 -6
- package/.turbo/turbo-build.log +0 -4
- package/docker-compose.yaml +0 -22
- package/eslint.config.js +0 -6
- package/src/document/document.test.ts +0 -2975
- package/src/document/document.ts +0 -335
- package/src/document/extractors/base.ts +0 -30
- package/src/document/extractors/index.ts +0 -5
- package/src/document/extractors/keywords.test.ts +0 -125
- package/src/document/extractors/keywords.ts +0 -126
- package/src/document/extractors/questions.test.ts +0 -120
- package/src/document/extractors/questions.ts +0 -111
- package/src/document/extractors/summary.test.ts +0 -107
- package/src/document/extractors/summary.ts +0 -122
- package/src/document/extractors/title.test.ts +0 -121
- package/src/document/extractors/title.ts +0 -185
- package/src/document/extractors/types.ts +0 -40
- package/src/document/index.ts +0 -2
- package/src/document/prompts/base.ts +0 -77
- package/src/document/prompts/format.ts +0 -9
- package/src/document/prompts/index.ts +0 -15
- package/src/document/prompts/prompt.ts +0 -60
- package/src/document/prompts/types.ts +0 -29
- package/src/document/schema/index.ts +0 -3
- package/src/document/schema/node.ts +0 -187
- package/src/document/schema/types.ts +0 -40
- package/src/document/transformers/character.ts +0 -267
- package/src/document/transformers/html.ts +0 -346
- package/src/document/transformers/json.ts +0 -536
- package/src/document/transformers/latex.ts +0 -11
- package/src/document/transformers/markdown.ts +0 -239
- package/src/document/transformers/semantic-markdown.ts +0 -227
- package/src/document/transformers/sentence.ts +0 -314
- package/src/document/transformers/text.ts +0 -158
- package/src/document/transformers/token.ts +0 -137
- package/src/document/transformers/transformer.ts +0 -5
- package/src/document/types.ts +0 -145
- package/src/document/validation.ts +0 -158
- package/src/graph-rag/index.test.ts +0 -235
- package/src/graph-rag/index.ts +0 -306
- package/src/index.ts +0 -8
- package/src/rerank/index.test.ts +0 -150
- package/src/rerank/index.ts +0 -198
- package/src/rerank/relevance/cohere/index.ts +0 -56
- package/src/rerank/relevance/index.ts +0 -3
- package/src/rerank/relevance/mastra-agent/index.ts +0 -32
- package/src/rerank/relevance/zeroentropy/index.ts +0 -26
- package/src/tools/README.md +0 -153
- package/src/tools/document-chunker.ts +0 -34
- package/src/tools/graph-rag.test.ts +0 -115
- package/src/tools/graph-rag.ts +0 -154
- package/src/tools/index.ts +0 -3
- package/src/tools/types.ts +0 -110
- package/src/tools/vector-query-database-config.test.ts +0 -190
- package/src/tools/vector-query.test.ts +0 -418
- package/src/tools/vector-query.ts +0 -169
- package/src/utils/convert-sources.ts +0 -43
- package/src/utils/default-settings.ts +0 -38
- package/src/utils/index.ts +0 -3
- package/src/utils/tool-schemas.ts +0 -38
- package/src/utils/vector-prompts.ts +0 -832
- package/src/utils/vector-search.ts +0 -117
- package/tsconfig.build.json +0 -9
- package/tsconfig.json +0 -5
- package/tsup.config.ts +0 -17
- package/vitest.config.ts +0 -8
|
@@ -1,267 +0,0 @@
|
|
|
1
|
-
import { Language } from '../types';
|
|
2
|
-
import type { BaseChunkOptions, CharacterChunkOptions, RecursiveChunkOptions } from '../types';
|
|
3
|
-
|
|
4
|
-
import { TextTransformer } from './text';
|
|
5
|
-
|
|
6
|
-
/**
 * Splits `text` on every match of `separator`, which is interpreted as the
 * source of a regular expression (callers escape it first when it is meant
 * literally).
 *
 * @param text - The text to split.
 * @param separator - Regular-expression source. An empty string splits the
 *   text into individual UTF-16 code units.
 * @param keepSeparator - `false` discards the matched separators; `'end'`
 *   appends each separator to the piece that precedes it; `true` or `'start'`
 *   prepends each separator to the piece that follows it.
 * @returns The non-empty pieces, in document order. In `true`/`'start'` mode a
 *   separator that is followed by an empty piece (for example a trailing
 *   separator) is dropped together with that empty piece.
 * @throws SyntaxError if `separator` is not a valid regular expression.
 */
function splitTextWithRegex(text: string, separator: string, keepSeparator: boolean | 'start' | 'end'): string[] {
  if (!separator) {
    return text.split('');
  }

  // `String.prototype.split` splices the value of every capture group into its
  // result. A separator pattern may bring capture groups of its own, so count
  // them: `(?:pattern)|` always matches the empty string and the length of the
  // match array is the number of groups plus one.
  const groupCount = (new RegExp(`(?:${separator})|`).exec('')?.length ?? 1) - 1;

  if (!keepSeparator) {
    // Layout: text, g1..gN, text, g1..gN, ..., text — keep only the text slots.
    const pieces = text.split(new RegExp(separator));
    const kept: string[] = [];
    for (let i = 0; i < pieces.length; i += groupCount + 1) {
      const piece = pieces[i];
      if (piece) kept.push(piece);
    }
    return kept;
  }

  if (!text) {
    return [];
  }

  // Wrapping the pattern in one extra group makes the full separator match
  // appear right after each text slot, followed by the pattern's own groups:
  // text, sep, g1..gN, text, sep, g1..gN, ..., text.
  const splits = text.split(new RegExp(`(${separator})`));
  const stride = groupCount + 2;
  const result: string[] = [];

  if (keepSeparator === 'end') {
    // Glue every separator onto the text that precedes it.
    for (let i = 0; i + 1 < splits.length; i += stride) {
      const chunk = (splits[i] ?? '') + (splits[i + 1] ?? '');
      if (chunk) result.push(chunk);
    }
    // The final slot is always text with no separator after it.
    const tail = splits[splits.length - 1];
    if (tail) result.push(tail);
  } else {
    // The first slot is text with no separator in front of it.
    const head = splits[0];
    if (head) result.push(head);

    // Glue every separator onto the text that follows it.
    for (let i = 1; i + stride - 1 < splits.length; i += stride) {
      const matched = splits[i];
      const following = splits[i + stride - 1];
      if (matched && following) {
        result.push(matched + following);
      }
    }
  }

  return result;
}
|
|
50
|
-
|
|
51
|
-
/**
 * Splits text on a single separator and cuts any piece that is still larger
 * than `maxSize` into overlapping windows. Pieces that already fit are emitted
 * unchanged; they are not merged with their neighbours.
 */
export class CharacterTransformer extends TextTransformer {
  /** Separator as supplied by the caller (literal text or regex source). */
  protected separator: string;
  /** When true, `separator` is used as a regex source without escaping. */
  protected isSeparatorRegex: boolean;

  /**
   * @param options - `separator` defaults to a blank line (`'\n\n'`) and
   *   `isSeparatorRegex` to `false`; every other option is forwarded to the
   *   base class constructor.
   */
  constructor({ separator = '\n\n', isSeparatorRegex = false, ...baseOptions }: CharacterChunkOptions = {}) {
    super(baseOptions);
    this.isSeparatorRegex = isSeparatorRegex;
    this.separator = separator;
  }

  /**
   * Splits `text` on the configured separator, then sub-splits every piece
   * whose measured length exceeds `maxSize`.
   *
   * @returns The resulting chunks in document order.
   */
  splitText({ text }: { text: string }): string[] {
    // A literal separator has its regex metacharacters escaped first.
    const pattern = this.isSeparatorRegex ? this.separator : this.separator.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pieces = splitTextWithRegex(text, pattern, this.keepSeparator);

    return pieces.flatMap(piece => (this.lengthFunction(piece) <= this.maxSize ? [piece] : this.__splitChunk(piece)));
  }

  /**
   * Cuts an oversized piece into consecutive windows. Each window is grown one
   * character at a time while `lengthFunction` still reports it as fitting in
   * `maxSize`; the next window starts `overlap` before the measured length of
   * the previous one, and always at least one position further on.
   */
  private __splitChunk(text: string): string[] {
    const windows: string[] = [];

    for (let start = 0; start < text.length; ) {
      let end = start;
      while (end < text.length && this.lengthFunction(text.slice(start, end + 1)) <= this.maxSize) {
        end += 1;
      }

      const window = text.slice(start, end);
      const measured = this.lengthFunction(window);
      windows.push(window);

      // The window reached the end of the text: stop instead of emitting
      // further, ever-shorter overlapping tails.
      if (end >= text.length) {
        break;
      }

      start += Math.max(1, measured - this.overlap);
    }

    return windows;
  }
}
|
|
108
|
-
|
|
109
|
-
/**
 * Splits text by trying an ordered list of separators: the first separator
 * that occurs in the text is used, and any resulting piece that is still too
 * large is split again with the separators that come after it in the list.
 */
export class RecursiveCharacterTransformer extends TextTransformer {
  /** Separators in priority order; an empty string means "split per character". */
  protected separators: string[];
  /** When true, separators are used as regex sources without escaping. */
  protected isSeparatorRegex: boolean;

  /**
   * @param options - `separators` defaults to blank line, newline, space and
   *   the empty string; `isSeparatorRegex` defaults to `false`. `language` is
   *   removed from the options and not used here; everything else is forwarded
   *   to the base class constructor.
   */
  constructor({ separators, isSeparatorRegex = false, language, ...baseOptions }: RecursiveChunkOptions = {}) {
    super(baseOptions);
    this.isSeparatorRegex = isSeparatorRegex;
    this.separators = separators || ['\n\n', '\n', ' ', ''];
  }

  /**
   * Recursive worker behind `splitText`.
   *
   * @param text - Text to split.
   * @param separators - Separators still available at this recursion level.
   * @returns Chunks in document order.
   */
  private _splitText(text: string, separators: string[]): string[] {
    // Literal separators get their regex metacharacters escaped.
    const toPattern = (raw: string): string =>
      this.isSeparatorRegex ? raw : raw?.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    // Default to the last separator; prefer the first one that is either the
    // empty string or actually occurs in the text.
    let separator = separators[separators.length - 1]!;
    let remaining: string[] = [];

    const hit = separators.findIndex(candidate => candidate === '' || new RegExp(toPattern(candidate)).test(text));
    if (hit !== -1) {
      separator = separators[hit]!;
      // Only a matching, non-empty separator leaves finer separators to
      // recurse with.
      if (separator !== '') {
        remaining = separators.slice(hit + 1);
      }
    }

    const pieces = splitTextWithRegex(text, toPattern(separator), this.keepSeparator);

    // NOTE(review): when separators are regex sources and keepSeparator is
    // falsy, the pattern text itself (not the matched text) is handed to
    // mergeSplits as the joiner — confirm this is intended.
    const joiner = this.keepSeparator ? '' : separator;

    const output: string[] = [];
    const pending: string[] = [];

    // Merge the small pieces gathered so far and move them to the output.
    const flushPending = (): void => {
      if (pending.length === 0) {
        return;
      }
      output.push(...this.mergeSplits(pending, joiner));
      pending.length = 0;
    };

    for (const piece of pieces) {
      if (this.lengthFunction(piece) < this.maxSize) {
        pending.push(piece);
        continue;
      }

      // An oversized piece ends the current run of small pieces.
      flushPending();

      if (remaining.length === 0) {
        // Nothing finer to split with: emit the oversized piece as it is.
        output.push(piece);
      } else {
        output.push(...this._splitText(piece, remaining));
      }
    }

    flushPending();

    return output;
  }

  /** Splits `text` using the separators configured on this instance. */
  splitText({ text }: { text: string }): string[] {
    return this._splitText(text, this.separators);
  }

  /**
   * Builds a transformer that uses the separator list of `language`. The
   * separators are treated as regex sources.
   *
   * @throws Error if `language` has no separator list.
   */
  static fromLanguage(language: Language, options: BaseChunkOptions = {}): RecursiveCharacterTransformer {
    return new RecursiveCharacterTransformer({
      ...options,
      separators: RecursiveCharacterTransformer.getSeparatorsForLanguage(language),
      isSeparatorRegex: true,
      language,
    });
  }

  /**
   * Returns the ordered separator list (regex sources) for `language`.
   *
   * @throws Error if `language` is not one of the handled cases.
   */
  static getSeparatorsForLanguage(language: Language): string[] {
    // Every list ends with the same fallbacks: blank line, newline, space and
    // finally per-character splitting.
    const genericFallbacks = ['\n\n', '\n', ' ', ''];

    switch (language) {
      case Language.MARKDOWN:
        return [
          // A heading of any level (one to six `#` followed by a space)
          '\n#{1,6} ',
          // End of a fenced code block
          '```\n',
          // Horizontal rules: a line of three or more `*`, `-` or `_`
          '\n\\*\\*\\*+\n',
          '\n---+\n',
          '\n___+\n',
          ...genericFallbacks,
        ];
      case Language.CPP:
      case Language.C:
        return [
          // Class declarations
          '\nclass ',
          // Declarations that start with a return/value type
          '\nvoid ',
          '\nint ',
          '\nfloat ',
          '\ndouble ',
          // Control-flow statements
          '\nif ',
          '\nfor ',
          '\nwhile ',
          '\nswitch ',
          '\ncase ',
          ...genericFallbacks,
        ];
      case Language.TS:
        return [
          // Type-level and top-level declarations
          '\nenum ',
          '\ninterface ',
          '\nnamespace ',
          '\ntype ',
          '\nclass ',
          '\nfunction ',
          '\nconst ',
          '\nlet ',
          '\nvar ',
          // Control-flow statements
          '\nif ',
          '\nfor ',
          '\nwhile ',
          '\nswitch ',
          '\ncase ',
          '\ndefault ',
          ...genericFallbacks,
        ];
      case Language.LATEX:
        return [
          // Sectioning commands, optionally starred
          '\\\\part\\*?\\{',
          '\\\\chapter\\*?\\{',
          '\\\\section\\*?\\{',
          '\\\\subsection\\*?\\{',
          '\\\\subsubsection\\*?\\{',
          // Environment boundaries
          '\\\\begin\\{.*?\\}',
          '\\\\end\\{.*?\\}',
          // Any other command that takes a braced argument
          '\\\\[a-zA-Z]+\\{.*?\\}',
          ...genericFallbacks,
        ];
      // Further languages can be added following the same pattern.
      default:
        throw new Error(`Language ${language} is not supported! Please choose from ${Object.values(Language)}`);
    }
  }
}
|
|
@@ -1,346 +0,0 @@
|
|
|
1
|
-
import { parse } from 'node-html-better-parser';
|
|
2
|
-
import { Document } from '../schema';
|
|
3
|
-
import type { HTMLChunkOptions } from '../types';
|
|
4
|
-
|
|
5
|
-
import { RecursiveCharacterTransformer } from './character';
|
|
6
|
-
|
|
7
|
-
/** Intermediate record for one header element found while splitting HTML. */
interface ElementType {
  /** Filled with the raw input text by `HTMLHeaderTransformer.splitText`. */
  url: string;
  /** Position of the header element, e.g. `/html[1]/body[1]/h1[1]`. */
  xpath: string;
  /** Text collected after the header, whitespace-trimmed. */
  content: string;
  /** Maps the configured header label to the header's own text. */
  metadata: Record<string, string>;
}
|
|
13
|
-
|
|
14
|
-
/**
 * Splits an HTML document at configured header tags. Each header yields one
 * element holding the text of the sibling nodes that follow it, up to the next
 * configured header under the same parent.
 */
export class HTMLHeaderTransformer {
  /** `[tag, label]` pairs, sorted; `label` becomes the metadata key. */
  private headersToSplitOn: [string, string][];
  /** When true, one document is produced per header without aggregation. */
  private returnEachElement: boolean;

  /**
   * @param options - `headers` lists the `[tag, label]` pairs to split on;
   *   `returnEachLine` (default `false`) disables aggregation of consecutive
   *   elements that share the same metadata.
   */
  constructor(options: HTMLChunkOptions & { headers: [string, string][] }) {
    this.returnEachElement = options.returnEachLine ?? false;
    // Sort a copy so the caller's array is left untouched.
    this.headersToSplitOn = [...options.headers].sort();
  }

  /**
   * Parses `text` as HTML and returns one document per header element, or per
   * run of consecutive elements with identical metadata when aggregation is
   * enabled.
   */
  splitText({ text }: { text: string }): Document[] {
    const root = parse(text);

    const headerTags = this.headersToSplitOn.map(([tag]) => tag);
    const labelByTag = Object.fromEntries(this.headersToSplitOn);

    const elements: ElementType[] = [];

    root.querySelectorAll(headerTags.join(',')).forEach(header => {
      let body = '';
      const parent = header.parentNode;

      if (parent && parent.childNodes) {
        let pastHeader = false;
        for (const node of parent.childNodes) {
          if (node === header) {
            pastHeader = true;
            continue;
          }

          // Siblings that come before the header are not part of its content.
          if (!pastHeader) {
            continue;
          }

          // The next configured header among the siblings ends this section.
          // @ts-expect-error - node.tagName is not defined on type Node
          if (node.tagName && headerTags.includes(node.tagName.toLowerCase())) {
            break;
          }

          body += this.getTextContent(node) + ' ';
        }
      }

      elements.push({
        url: text,
        xpath: this.getXPath(header),
        content: body.trim(),
        metadata: {
          [labelByTag[header.tagName.toLowerCase()]!]: header.text || '',
        },
      });
    });

    if (!this.returnEachElement) {
      return this.aggregateElementsToChunks(elements);
    }

    return elements.map(
      ({ content, metadata, xpath }) =>
        new Document({
          text: content,
          metadata: { ...metadata, xpath },
        }),
    );
  }

  /**
   * Builds an XPath-like locator (`/tag[n]/tag[n]/...`) by walking from
   * `element` up through its ancestors for as long as they have a `tagName`.
   * `n` is 1 plus the number of preceding siblings with the same tag name.
   */
  private getXPath(element: any): string {
    if (!element) return '';

    const segments: string[] = [];

    for (let node = element; node && node.tagName; node = node.parentNode) {
      let position = 1;
      const parent = node.parentNode;

      if (parent && parent.childNodes) {
        for (const sibling of parent.childNodes) {
          if (sibling === node) break;
          if (sibling.tagName === node.tagName) {
            position += 1;
          }
        }
      }

      segments.unshift(`${node.tagName.toLowerCase()}[${position}]`);
    }

    return '/' + segments.join('/');
  }

  /**
   * Returns the text of `element`: its own `text` for nodes without a
   * `tagName`, otherwise its `text` followed by the text of every child,
   * separated by single spaces and trimmed.
   *
   * NOTE(review): if the parser's `text` on an element already includes the
   * text of its descendants, the recursion below repeats that text — confirm
   * against the parser's documentation.
   */
  private getTextContent(element: any): string {
    if (!element) return '';

    // Nodes without a tag name are treated as plain text nodes.
    if (!element.tagName) {
      return element.text || '';
    }

    let collected = element.text || '';

    if (element.childNodes) {
      for (const child of element.childNodes) {
        const nested = this.getTextContent(child);
        if (nested) {
          collected += ' ' + nested;
        }
      }
    }

    return collected.trim();
  }

  /**
   * Merges consecutive elements whose metadata serialises to the same JSON,
   * joining their content with `' \n'`, and wraps each result in a document.
   * The merged document keeps the xpath of the first element of its run.
   */
  private aggregateElementsToChunks(elements: ElementType[]): Document[] {
    const merged: ElementType[] = [];

    for (const element of elements) {
      const previous = merged[merged.length - 1];

      if (previous && JSON.stringify(previous.metadata) === JSON.stringify(element.metadata)) {
        previous.content += ' \n' + element.content;
      } else {
        // Copy so that later appends do not modify the caller's element.
        merged.push({ ...element });
      }
    }

    return merged.map(
      ({ content, metadata, xpath }) =>
        new Document({
          text: content,
          metadata: { ...metadata, xpath },
        }),
    );
  }

  /**
   * Splits every text and returns the resulting documents. Each document's
   * metadata is the source metadata at the same index overlaid with the
   * chunk's own metadata; chunk metadata values equal to `'#TITLE#'` are
   * replaced by the source metadata's `Title`.
   *
   * @param texts - HTML strings to split.
   * @param metadatas - Optional metadata per text, matched by index.
   */
  createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[] {
    const documents: Document[] = [];

    texts.forEach((html, index) => {
      for (const chunk of this.splitText({ text: html })) {
        // Fresh copy per chunk so chunks never share a metadata object.
        const inherited = { ...((metadatas && metadatas[index]) || {}) };
        const own = chunk.metadata;

        if (own) {
          for (const [key, value] of Object.entries(own || {})) {
            if (value === '#TITLE#') {
              own[key] = inherited['Title'];
            }
          }
        }

        documents.push(
          new Document({
            text: chunk.text!,
            metadata: { ...inherited, ...own },
          }),
        );
      }
    });

    return documents;
  }

  /** Splits every document's text, carrying its metadata over to the chunks. */
  transformDocuments(documents: Document[]): Document[] {
    const texts: string[] = [];
    const metadatas: Record<string, any>[] = [];

    documents.forEach(doc => {
      texts.push(doc.text);
      metadatas.push(doc.metadata);
    });

    return this.createDocuments(texts, metadatas);
  }
}
|
|
196
|
-
|
|
197
|
-
/**
 * Splits an HTML document into sections that start at configured header tags,
 * and can further split those sections with a `RecursiveCharacterTransformer`.
 */
export class HTMLSectionTransformer {
  /** Lower-cased tag name mapped to the metadata key used for that header. */
  private headersToSplitOn: Record<string, string>;
  /** Splitter applied to the section documents by `splitDocuments`. */
  private textSplitter: RecursiveCharacterTransformer;

  /**
   * @param options - `sections` lists the `[tag, label]` pairs to split on;
   *   the whole options object is also passed to the internal
   *   `RecursiveCharacterTransformer`.
   */
  constructor(options: HTMLChunkOptions & { sections: [string, string][] }) {
    this.headersToSplitOn = Object.fromEntries(options.sections.map(([tag, label]) => [tag.toLowerCase(), label]));
    this.textSplitter = new RecursiveCharacterTransformer(options);
  }

  /**
   * Returns one document per configured header found in `text`. The metadata
   * holds the header text under the configured label, plus the header's xpath.
   */
  splitText(text: string): Document[] {
    return this.splitHtmlByHeaders(text).map(({ content, header, tagName, xpath }) => {
      const label = this.headersToSplitOn[tagName.toLowerCase()]!;
      return new Document({
        text: content,
        metadata: {
          [label]: header,
          xpath,
        },
      });
    });
  }

  /**
   * Builds an XPath-like locator (`/tag[n]/tag[n]/...`) by walking from
   * `element` up through its ancestors while they report `nodeType === 1`.
   * `n` is 1 plus the number of preceding `nodeType === 1` siblings with the
   * same tag name.
   */
  private getXPath(element: any): string {
    const segments: string[] = [];

    for (let node = element; node && node.nodeType === 1; node = node.parentNode) {
      let position = 1;

      for (let sibling = node.previousSibling; sibling; sibling = sibling.previousSibling) {
        if (sibling.nodeType === 1 && sibling.tagName === node.tagName) {
          position += 1;
        }
      }

      if (node.tagName) {
        segments.unshift(`${node.tagName.toLowerCase()}[${position}]`);
      }
    }

    return '/' + segments.join('/');
  }

  /**
   * Parses `htmlDoc` and, for every configured header element, collects the
   * trimmed text of the element siblings that follow it, stopping at the next
   * configured header element (if that one is among those siblings).
   */
  private splitHtmlByHeaders(htmlDoc: string): Array<{
    header: string;
    content: string;
    tagName: string;
    xpath: string;
  }> {
    type Section = { header: string; content: string; tagName: string; xpath: string };
    const sections: Section[] = [];

    const root = parse(htmlDoc);
    const selector = Object.keys(this.headersToSplitOn).join(',');
    const headerElements = root.querySelectorAll(selector);

    headerElements.forEach((headerElement, index) => {
      const header = headerElement.text?.trim() || '';
      const tagName = headerElement.tagName;
      const xpath = this.getXPath(headerElement);
      let collected = '';

      // @ts-expect-error - nextElementSibling is not defined on type Element
      let cursor = headerElement.nextElementSibling;
      const nextHeader = headerElements[index + 1];

      // Walk the following element siblings until they run out or the next
      // configured header is reached.
      while (cursor && cursor !== nextHeader) {
        if (cursor.text) {
          collected += cursor.text.trim() + ' ';
        }
        cursor = cursor.nextElementSibling;
      }

      sections.push({ header, content: collected.trim(), tagName, xpath });
    });

    return sections;
  }

  /**
   * Splits the documents into header sections, then passes the sections
   * through the internal text splitter.
   */
  async splitDocuments(documents: Document[]): Promise<Document[]> {
    const texts: string[] = [];
    const metadatas: Record<string, any>[] = [];

    documents.forEach(doc => {
      texts.push(doc.text);
      metadatas.push(doc.metadata);
    });

    const sectionDocuments = await this.createDocuments(texts, metadatas);

    return this.textSplitter.splitDocuments(sectionDocuments);
  }

  /**
   * Splits every text and returns the resulting documents. Each document's
   * metadata is the source metadata at the same index overlaid with the
   * section's own metadata; section metadata values equal to `'#TITLE#'` are
   * replaced by the source metadata's `Title`.
   *
   * @param texts - HTML strings to split.
   * @param metadatas - Optional metadata per text, matched by index.
   */
  createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[] {
    const documents: Document[] = [];

    texts.forEach((html, index) => {
      for (const chunk of this.splitText(html)) {
        // Fresh copy per chunk so chunks never share a metadata object.
        const inherited = { ...((metadatas && metadatas[index]) || {}) };
        const own = chunk.metadata;

        if (own) {
          for (const [key, value] of Object.entries(own || {})) {
            if (value === '#TITLE#') {
              own[key] = inherited['Title'];
            }
          }
        }

        documents.push(
          new Document({
            text: chunk.text!,
            metadata: { ...inherited, ...own },
          }),
        );
      }
    });

    return documents;
  }

  /**
   * Splits every document's text into header sections, carrying its metadata
   * over; unlike `splitDocuments`, the internal text splitter is not applied.
   */
  transformDocuments(documents: Document[]): Document[] {
    const texts: string[] = [];
    const metadatas: Record<string, any>[] = [];

    documents.forEach(doc => {
      texts.push(doc.text);
      metadatas.push(doc.metadata);
    });

    return this.createDocuments(texts, metadatas);
  }
}
|