plugin-knowledge-base 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/client/index.js +1 -1
- package/dist/externalVersion.js +1 -0
- package/dist/node_modules/@langchain/textsplitters/LICENSE +21 -0
- package/dist/node_modules/@langchain/textsplitters/dist/_virtual/rolldown_runtime.cjs +25 -0
- package/dist/node_modules/@langchain/textsplitters/dist/index.cjs +7 -0
- package/dist/node_modules/@langchain/textsplitters/dist/index.d.cts +2 -0
- package/dist/node_modules/@langchain/textsplitters/dist/index.d.ts +2 -0
- package/dist/node_modules/@langchain/textsplitters/dist/index.js +3 -0
- package/dist/node_modules/@langchain/textsplitters/dist/text_splitter.cjs +539 -0
- package/dist/node_modules/@langchain/textsplitters/dist/text_splitter.d.cts +84 -0
- package/dist/node_modules/@langchain/textsplitters/dist/text_splitter.d.ts +84 -0
- package/dist/node_modules/@langchain/textsplitters/dist/text_splitter.js +532 -0
- package/dist/node_modules/@langchain/textsplitters/package.json +1 -0
- package/dist/server/actions/add-document.js +85 -0
- package/dist/server/features/vector-store-provider-impl.js +50 -15
- package/dist/server/pipeline/simple-embeddings.js +82 -0
- package/dist/server/pipeline/vectorization.js +28 -15
- package/dist/server/plugin.js +16 -0
- package/dist/server/request-context.js +54 -0
- package/package.json +1 -1
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { BaseDocumentTransformer, Document } from "@langchain/core/documents";
|
|
2
|
+
import * as tiktoken from "js-tiktoken";
|
|
3
|
+
|
|
4
|
+
//#region src/text_splitter.d.ts
/** Configuration shared by every text splitter implementation. */
interface TextSplitterParams {
	/** Maximum size of a produced chunk, as measured by `lengthFunction`. */
	chunkSize: number;
	/** Amount of trailing content carried over into the start of the next chunk. */
	chunkOverlap: number;
	/** When true, the separator is retained at the boundary of each split. */
	keepSeparator: boolean;
	/** Measures text length (defaults to character count); may be async. */
	lengthFunction?: ((text: string) => number) | ((text: string) => Promise<number>);
}
/** Options controlling header text prepended to generated chunks. */
type TextSplitterChunkHeaderOptions = {
	/** Text prepended to every chunk. */
	chunkHeader?: string;
	/** Marker appended after `chunkHeader` on continuation chunks. */
	chunkOverlapHeader?: string;
	/** Whether continuation chunks receive `chunkOverlapHeader`. */
	appendChunkOverlapHeader?: boolean;
};
/** Base class: splits raw text into chunks and wraps them as `Document`s. */
declare abstract class TextSplitter extends BaseDocumentTransformer implements TextSplitterParams {
	lc_namespace: string[];
	chunkSize: number;
	chunkOverlap: number;
	keepSeparator: boolean;
	lengthFunction: ((text: string) => number) | ((text: string) => Promise<number>);
	constructor(fields?: Partial<TextSplitterParams>);
	/** `BaseDocumentTransformer` entry point; delegates to `splitDocuments`. */
	transformDocuments(documents: Document[], chunkHeaderOptions?: TextSplitterChunkHeaderOptions): Promise<Document[]>;
	/** Splits a single text into chunk strings; implemented by subclasses. */
	abstract splitText(text: string): Promise<string[]>;
	protected splitOnSeparator(text: string, separator: string): string[];
	/** Builds `Document`s (with `loc.lines` metadata) from raw texts. */
	createDocuments(texts: string[],
	// eslint-disable-next-line @typescript-eslint/no-explicit-any
	metadatas?: Record<string, any>[], chunkHeaderOptions?: TextSplitterChunkHeaderOptions): Promise<Document[]>;
	private numberOfNewLines;
	/** Splits existing `Document`s, preserving each document's metadata. */
	splitDocuments(documents: Document[], chunkHeaderOptions?: TextSplitterChunkHeaderOptions): Promise<Document[]>;
	private joinDocs;
	/** Greedily packs splits into chunks bounded by `chunkSize`. */
	mergeSplits(splits: string[], separator: string): Promise<string[]>;
}
/** Params for `CharacterTextSplitter`. */
interface CharacterTextSplitterParams extends TextSplitterParams {
	/** Fixed separator to split on (implementation default: "\n\n"). */
	separator: string;
}
/** Splits text on a single fixed separator string. */
declare class CharacterTextSplitter extends TextSplitter implements CharacterTextSplitterParams {
	static lc_name(): string;
	separator: string;
	constructor(fields?: Partial<CharacterTextSplitterParams>);
	splitText(text: string): Promise<string[]>;
}
/** Params for `RecursiveCharacterTextSplitter`. */
interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {
	/** Ordered separator candidates, tried coarsest-first. */
	separators: string[];
}
/** Languages with predefined separator lists (see `getSeparatorsForLanguage`). */
declare const SupportedTextSplitterLanguages: readonly ["cpp", "go", "java", "js", "php", "proto", "python", "rst", "ruby", "rust", "scala", "swift", "markdown", "latex", "html", "sol"];
type SupportedTextSplitterLanguage = (typeof SupportedTextSplitterLanguages)[number];
/** Tries separators in order, recursing with finer separators for oversized pieces. */
declare class RecursiveCharacterTextSplitter extends TextSplitter implements RecursiveCharacterTextSplitterParams {
	static lc_name(): string;
	separators: string[];
	constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>);
	private _splitText;
	splitText(text: string): Promise<string[]>;
	/** Builds a splitter preconfigured with the given language's separators. */
	static fromLanguage(language: SupportedTextSplitterLanguage, options?: Partial<RecursiveCharacterTextSplitterParams>): RecursiveCharacterTextSplitter;
	static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage): string[];
}
/** Params for `TokenTextSplitter`. */
interface TokenTextSplitterParams extends TextSplitterParams {
	encodingName: tiktoken.TiktokenEncoding;
	allowedSpecial: "all" | Array<string>;
	disallowedSpecial: "all" | Array<string>;
}
/**
 * Implementation of splitter which looks at tokens.
 */
declare class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams {
	static lc_name(): string;
	encodingName: tiktoken.TiktokenEncoding;
	allowedSpecial: "all" | Array<string>;
	disallowedSpecial: "all" | Array<string>;
	private tokenizer;
	constructor(fields?: Partial<TokenTextSplitterParams>);
	splitText(text: string): Promise<string[]>;
}
type MarkdownTextSplitterParams = TextSplitterParams;
/** Recursive splitter preconfigured with Markdown separators. */
declare class MarkdownTextSplitter extends RecursiveCharacterTextSplitter implements MarkdownTextSplitterParams {
	constructor(fields?: Partial<MarkdownTextSplitterParams>);
}
type LatexTextSplitterParams = TextSplitterParams;
/** Recursive splitter preconfigured with LaTeX separators. */
declare class LatexTextSplitter extends RecursiveCharacterTextSplitter implements LatexTextSplitterParams {
	constructor(fields?: Partial<LatexTextSplitterParams>);
}
//#endregion
|
|
83
|
+
export { CharacterTextSplitter, CharacterTextSplitterParams, LatexTextSplitter, LatexTextSplitterParams, MarkdownTextSplitter, MarkdownTextSplitterParams, RecursiveCharacterTextSplitter, RecursiveCharacterTextSplitterParams, SupportedTextSplitterLanguage, SupportedTextSplitterLanguages, TextSplitter, TextSplitterChunkHeaderOptions, TextSplitterParams, TokenTextSplitter, TokenTextSplitterParams };
|
|
84
|
+
//# sourceMappingURL=text_splitter.d.ts.map
|
|
@@ -0,0 +1,532 @@
|
|
|
1
|
+
import { BaseDocumentTransformer, Document } from "@langchain/core/documents";
|
|
2
|
+
import { getEncoding } from "@langchain/core/utils/tiktoken";
|
|
3
|
+
|
|
4
|
+
//#region src/text_splitter.ts
|
|
5
|
+
// Abstract base for all splitters. Subclasses implement splitText(); this
// class handles chunk packing (mergeSplits) and Document construction with
// line-range metadata (createDocuments).
var TextSplitter = class extends BaseDocumentTransformer {
	lc_namespace = ["langchain", "document_transformers", "text_splitters"];
	// Defaults: 1000-unit chunks with 200 units of overlap, measured by
	// lengthFunction (character count unless overridden).
	chunkSize = 1e3;
	chunkOverlap = 200;
	keepSeparator = false;
	lengthFunction;
	constructor(fields) {
		super(fields);
		this.chunkSize = fields?.chunkSize ?? this.chunkSize;
		this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap;
		this.keepSeparator = fields?.keepSeparator ?? this.keepSeparator;
		this.lengthFunction = fields?.lengthFunction ?? ((text) => text.length);
		// Overlap must be strictly smaller than the chunk size, otherwise the
		// packing loop in mergeSplits could never make progress.
		if (this.chunkOverlap >= this.chunkSize) throw new Error("Cannot have chunkOverlap >= chunkSize");
	}
	// BaseDocumentTransformer entry point — same as splitDocuments.
	async transformDocuments(documents, chunkHeaderOptions = {}) {
		return this.splitDocuments(documents, chunkHeaderOptions);
	}
	// Split `text` on `separator`. With keepSeparator the separator stays
	// attached to the start of the following piece (lookahead regex split);
	// an empty separator splits into individual characters. Empty pieces are
	// dropped.
	splitOnSeparator(text, separator) {
		let splits;
		if (separator) if (this.keepSeparator) {
			// Escape regex metacharacters so the separator matches literally.
			const regexEscapedSeparator = separator.replace(/[/\-\\^$*+?.()|[\]{}]/g, "\\$&");
			splits = text.split(/* @__PURE__ */ new RegExp(`(?=${regexEscapedSeparator})`));
		} else splits = text.split(separator);
		else splits = text.split("");
		return splits.filter((s) => s !== "");
	}
	// Split each text and wrap every chunk in a Document whose metadata
	// carries loc.lines = { from, to } (1-based line numbers in the source
	// text). Optional headers can be prepended to each chunk.
	async createDocuments(texts, metadatas = [], chunkHeaderOptions = {}) {
		// One metadata object per text; default to empty objects.
		const _metadatas = metadatas.length > 0 ? metadatas : [...Array(texts.length)].map(() => ({}));
		const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false } = chunkHeaderOptions;
		const documents = new Array();
		for (let i = 0; i < texts.length; i += 1) {
			const text = texts[i];
			let lineCounterIndex = 1;
			let prevChunk = null;
			let indexPrevChunk = -1;
			for (const chunk of await this.splitText(text)) {
				let pageContent = chunkHeader;
				// Locate this chunk in the source text, searching past the
				// previous chunk's start so repeated chunks resolve in order.
				const indexChunk = text.indexOf(chunk, indexPrevChunk + 1);
				if (prevChunk === null) {
					const newLinesBeforeFirstChunk = this.numberOfNewLines(text, 0, indexChunk);
					lineCounterIndex += newLinesBeforeFirstChunk;
				} else {
					// Adjust the running line counter for the gap (or overlap)
					// between the previous chunk's end and this chunk's start.
					const indexEndPrevChunk = indexPrevChunk + await this.lengthFunction(prevChunk);
					if (indexEndPrevChunk < indexChunk) {
						const numberOfIntermediateNewLines = this.numberOfNewLines(text, indexEndPrevChunk, indexChunk);
						lineCounterIndex += numberOfIntermediateNewLines;
					} else if (indexEndPrevChunk > indexChunk) {
						const numberOfIntermediateNewLines = this.numberOfNewLines(text, indexChunk, indexEndPrevChunk);
						lineCounterIndex -= numberOfIntermediateNewLines;
					}
					if (appendChunkOverlapHeader) pageContent += chunkOverlapHeader;
				}
				const newLinesCount = this.numberOfNewLines(chunk);
				// Copy any pre-existing loc metadata rather than mutating it.
				const loc = _metadatas[i].loc && typeof _metadatas[i].loc === "object" ? { ..._metadatas[i].loc } : {};
				loc.lines = {
					from: lineCounterIndex,
					to: lineCounterIndex + newLinesCount
				};
				const metadataWithLinesNumber = {
					..._metadatas[i],
					loc
				};
				pageContent += chunk;
				documents.push(new Document({
					pageContent,
					metadata: metadataWithLinesNumber
				}));
				lineCounterIndex += newLinesCount;
				prevChunk = chunk;
				indexPrevChunk = indexChunk;
			}
		}
		return documents;
	}
	// Count "\n" occurrences within text.slice(start, end).
	numberOfNewLines(text, start, end) {
		const textSection = text.slice(start, end);
		return (textSection.match(/\n/g) || []).length;
	}
	// Split existing Documents, carrying each document's metadata through.
	async splitDocuments(documents, chunkHeaderOptions = {}) {
		const selectedDocuments = documents.filter((doc) => doc.pageContent !== void 0);
		const texts = selectedDocuments.map((doc) => doc.pageContent);
		const metadatas = selectedDocuments.map((doc) => doc.metadata);
		return this.createDocuments(texts, metadatas, chunkHeaderOptions);
	}
	// Join pending pieces; returns null when the result is blank.
	joinDocs(docs, separator) {
		const text = docs.join(separator).trim();
		return text === "" ? null : text;
	}
	// Greedily pack `splits` into chunks of at most chunkSize (separator
	// lengths included). When a chunk fills up it is emitted, then leading
	// pieces are evicted until the window is back under chunkOverlap (and the
	// next piece fits), producing the configured overlap between chunks.
	async mergeSplits(splits, separator) {
		const docs = [];
		const currentDoc = [];
		let total = 0;
		for (const d of splits) {
			const _len = await this.lengthFunction(d);
			if (total + _len + currentDoc.length * separator.length > this.chunkSize) {
				// A single oversized piece cannot be shrunk here — warn only.
				if (total > this.chunkSize) console.warn(`Created a chunk of size ${total}, 
which is longer than the specified ${this.chunkSize}`);
				if (currentDoc.length > 0) {
					const doc$1 = this.joinDocs(currentDoc, separator);
					if (doc$1 !== null) docs.push(doc$1);
					// Evict from the front until within the overlap budget and
					// the incoming piece fits.
					while (total > this.chunkOverlap || total + _len + currentDoc.length * separator.length > this.chunkSize && total > 0) {
						total -= await this.lengthFunction(currentDoc[0]);
						currentDoc.shift();
					}
				}
			}
			currentDoc.push(d);
			total += _len;
		}
		// Flush whatever remains as the final chunk.
		const doc = this.joinDocs(currentDoc, separator);
		if (doc !== null) docs.push(doc);
		return docs;
	}
};
|
|
123
|
+
// Splitter that cuts text on one fixed separator (default: blank line),
// then packs the pieces with the base-class mergeSplits().
var CharacterTextSplitter = class extends TextSplitter {
	static lc_name() {
		return "CharacterTextSplitter";
	}
	separator = "\n\n";
	constructor(fields) {
		super(fields);
		this.separator = fields?.separator ?? this.separator;
	}
	async splitText(text) {
		// When the separator is kept on each piece, join with "" so it is not
		// duplicated during merging.
		const joiner = this.keepSeparator ? "" : this.separator;
		const pieces = this.splitOnSeparator(text, this.separator);
		return this.mergeSplits(pieces, joiner);
	}
};
|
|
137
|
+
// Languages for which RecursiveCharacterTextSplitter ships tuned separator
// lists (see getSeparatorsForLanguage). Order is part of the public value.
const SupportedTextSplitterLanguages = [
	"cpp", "go", "java", "js",
	"php", "proto", "python", "rst",
	"ruby", "rust", "scala", "swift",
	"markdown", "latex", "html", "sol"
];
|
|
155
|
+
// Splitter that tries a list of separators from coarsest to finest: pieces
// still larger than chunkSize are re-split recursively with the remaining
// (finer) separators.
var RecursiveCharacterTextSplitter = class RecursiveCharacterTextSplitter extends TextSplitter {
	static lc_name() {
		return "RecursiveCharacterTextSplitter";
	}
	separators = ["\n\n", "\n", " ", ""];
	constructor(fields) {
		super(fields);
		this.separators = fields?.separators ?? this.separators;
		// Unlike the base class, recursive splitting keeps separators by default.
		this.keepSeparator = fields?.keepSeparator ?? true;
	}
	async _splitText(text, separators) {
		// Pick the first separator that occurs in the text ("" always matches
		// and acts as the last resort: split into characters).
		let activeSeparator = separators[separators.length - 1];
		let remainingSeparators;
		for (let idx = 0; idx < separators.length; idx += 1) {
			const candidate = separators[idx];
			if (candidate === "") {
				activeSeparator = candidate;
				break;
			}
			if (text.includes(candidate)) {
				activeSeparator = candidate;
				remainingSeparators = separators.slice(idx + 1);
				break;
			}
		}
		const pieces = this.splitOnSeparator(text, activeSeparator);
		const joiner = this.keepSeparator ? "" : activeSeparator;
		const chunks = [];
		let pending = [];
		// Merge the accumulated small pieces into sized chunks.
		const flushPending = async () => {
			if (pending.length) {
				chunks.push(...await this.mergeSplits(pending, joiner));
				pending = [];
			}
		};
		for (const piece of pieces) {
			if (await this.lengthFunction(piece) < this.chunkSize) {
				pending.push(piece);
				continue;
			}
			// Oversized piece: flush what we have, then recurse with finer
			// separators (or emit as-is when none remain).
			await flushPending();
			if (remainingSeparators) chunks.push(...await this._splitText(piece, remainingSeparators));
			else chunks.push(piece);
		}
		await flushPending();
		return chunks;
	}
	async splitText(text) {
		return this._splitText(text, this.separators);
	}
	// Build a splitter preconfigured with the given language's separators.
	static fromLanguage(language, options) {
		const languageSeparators = RecursiveCharacterTextSplitter.getSeparatorsForLanguage(language);
		return new RecursiveCharacterTextSplitter({
			...options,
			separators: languageSeparators
		});
	}
	// Per-language separator lists, coarsest-first. Throws for unknown
	// languages. Returns a fresh array on every call.
	static getSeparatorsForLanguage(language) {
		const common = ["\n\n", "\n", " ", ""];
		const table = {
			cpp: ["\nclass ", "\nvoid ", "\nint ", "\nfloat ", "\ndouble ", "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", ...common],
			go: ["\nfunc ", "\nvar ", "\nconst ", "\ntype ", "\nif ", "\nfor ", "\nswitch ", "\ncase ", ...common],
			java: ["\nclass ", "\npublic ", "\nprotected ", "\nprivate ", "\nstatic ", "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", ...common],
			js: ["\nfunction ", "\nconst ", "\nlet ", "\nvar ", "\nclass ", "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", "\ndefault ", ...common],
			php: ["\nfunction ", "\nclass ", "\nif ", "\nforeach ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ", ...common],
			proto: ["\nmessage ", "\nservice ", "\nenum ", "\noption ", "\nimport ", "\nsyntax ", ...common],
			// NOTE(review): upstream uses a tab before the nested "def" — confirm
			// against the original dist output.
			python: ["\nclass ", "\ndef ", "\n\tdef ", ...common],
			rst: ["\n===\n", "\n---\n", "\n***\n", "\n.. ", ...common],
			ruby: ["\ndef ", "\nclass ", "\nif ", "\nunless ", "\nwhile ", "\nfor ", "\ndo ", "\nbegin ", "\nrescue ", ...common],
			// "\nconst " appears twice upstream; kept verbatim so recursive
			// split boundaries are unchanged.
			rust: ["\nfn ", "\nconst ", "\nlet ", "\nif ", "\nwhile ", "\nfor ", "\nloop ", "\nmatch ", "\nconst ", ...common],
			scala: ["\nclass ", "\nobject ", "\ndef ", "\nval ", "\nvar ", "\nif ", "\nfor ", "\nwhile ", "\nmatch ", "\ncase ", ...common],
			swift: ["\nfunc ", "\nclass ", "\nstruct ", "\nenum ", "\nif ", "\nfor ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ", ...common],
			markdown: ["\n## ", "\n### ", "\n#### ", "\n##### ", "\n###### ", "```\n\n", "\n\n***\n\n", "\n\n---\n\n", "\n\n___\n\n", ...common],
			latex: ["\n\\chapter{", "\n\\section{", "\n\\subsection{", "\n\\subsubsection{", "\n\\begin{enumerate}", "\n\\begin{itemize}", "\n\\begin{description}", "\n\\begin{list}", "\n\\begin{quote}", "\n\\begin{quotation}", "\n\\begin{verse}", "\n\\begin{verbatim}", "\n\\begin{align}", "$$", "$", ...common],
			// HTML intentionally omits "\n\n"/"\n" — tags are the structure.
			html: ["<body>", "<div>", "<p>", "<br>", "<li>", "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>", "<span>", "<table>", "<tr>", "<td>", "<th>", "<ul>", "<ol>", "<header>", "<footer>", "<nav>", "<head>", "<style>", "<script>", "<meta>", "<title>", " ", ""],
			sol: ["\npragma ", "\nusing ", "\ncontract ", "\ninterface ", "\nlibrary ", "\nconstructor ", "\ntype ", "\nfunction ", "\nevent ", "\nmodifier ", "\nerror ", "\nstruct ", "\nenum ", "\nif ", "\nfor ", "\nwhile ", "\ndo while ", "\nassembly ", ...common]
		};
		const separators = table[language];
		if (!separators) throw new Error(`Language ${language} is not supported.`);
		return separators;
	}
};
|
|
481
|
+
/**
 * Implementation of splitter which looks at tokens.
 * Encodes the text with a js-tiktoken encoding, slices the token-id array
 * into chunkSize windows (stepping back chunkOverlap tokens between
 * windows), and decodes each window back to text.
 */
var TokenTextSplitter = class extends TextSplitter {
	static lc_name() {
		return "TokenTextSplitter";
	}
	encodingName;
	allowedSpecial;
	disallowedSpecial;
	// Lazily created on first splitText() call.
	tokenizer;
	constructor(fields) {
		super(fields);
		this.encodingName = fields?.encodingName ?? "gpt2";
		this.allowedSpecial = fields?.allowedSpecial ?? [];
		this.disallowedSpecial = fields?.disallowedSpecial ?? "all";
	}
	async splitText(text) {
		if (!this.tokenizer) this.tokenizer = await getEncoding(this.encodingName);
		const tokenIds = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
		const chunks = [];
		let cursor = 0;
		while (cursor < tokenIds.length) {
			// Every chunk after the first starts chunkOverlap tokens early.
			if (cursor > 0) cursor -= this.chunkOverlap;
			const end = Math.min(cursor + this.chunkSize, tokenIds.length);
			chunks.push(this.tokenizer.decode(tokenIds.slice(cursor, end)));
			cursor = end;
		}
		return chunks;
	}
};
|
|
513
|
+
// Convenience subclass: a recursive splitter locked to the Markdown
// separator list (headings, fenced code, horizontal rules, ...).
var MarkdownTextSplitter = class extends RecursiveCharacterTextSplitter {
	constructor(fields) {
		const markdownSeparators = RecursiveCharacterTextSplitter.getSeparatorsForLanguage("markdown");
		super({
			...fields,
			separators: markdownSeparators
		});
	}
};
|
|
521
|
+
// Convenience subclass: a recursive splitter locked to the LaTeX separator
// list (chapters, sections, common environments, math delimiters, ...).
var LatexTextSplitter = class extends RecursiveCharacterTextSplitter {
	constructor(fields) {
		const latexSeparators = RecursiveCharacterTextSplitter.getSeparatorsForLanguage("latex");
		super({
			...fields,
			separators: latexSeparators
		});
	}
};
|
|
529
|
+
|
|
530
|
+
//#endregion
|
|
531
|
+
export { CharacterTextSplitter, LatexTextSplitter, MarkdownTextSplitter, RecursiveCharacterTextSplitter, SupportedTextSplitterLanguages, TextSplitter, TokenTextSplitter };
|
|
532
|
+
//# sourceMappingURL=text_splitter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"name":"@langchain/textsplitters","version":"1.0.1","description":"Various implementations of LangChain.js text splitters","author":"LangChain","license":"MIT","type":"module","engines":{"node":">=20"},"repository":{"type":"git","url":"git@github.com:langchain-ai/langchainjs.git"},"homepage":"https://github.com/langchain-ai/langchainjs/tree/main/libs/langchain-textsplitters/","dependencies":{"js-tiktoken":"^1.0.12"},"peerDependencies":{"@langchain/core":"^1.0.0"},"devDependencies":{"@tsconfig/recommended":"^1.0.3","@vitest/coverage-v8":"^3.2.4","dotenv":"^16.3.1","dpdm":"^3.14.0","eslint":"^9.34.0","prettier":"^2.8.3","rollup":"^4.5.2","typescript":"~5.8.3","vitest":"^3.2.4","@langchain/core":"1.1.0","@langchain/eslint":"0.1.1"},"publishConfig":{"access":"public"},"main":"./dist/index.cjs","types":"./dist/index.d.cts","exports":{".":{"input":"./src/index.ts","require":{"types":"./dist/index.d.cts","default":"./dist/index.cjs"},"import":{"types":"./dist/index.d.ts","default":"./dist/index.js"}},"./package.json":"./package.json"},"files":["dist/","CHANGELOG.md","README.md","LICENSE"],"module":"./dist/index.js","scripts":{"build":"turbo build:compile --filter @langchain/textsplitters --output-logs new-only","build:compile":"tsdown","lint:eslint":"eslint --cache src/","lint:dpdm":"dpdm --skip-dynamic-imports circular --exit-code circular:1 --no-warning --no-tree src/*.ts src/**/*.ts","lint":"pnpm lint:eslint && pnpm lint:dpdm","lint:fix":"pnpm lint:eslint --fix && pnpm lint:dpdm","clean":"rm -rf .turbo dist/","test":"vitest --run","test:watch":"vitest --watch","format":"prettier --config .prettierrc --write \"src\"","format:check":"prettier --config .prettierrc --check \"src\""},"_lastModified":"2026-03-18T18:59:29.112Z"}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* This file is part of the NocoBase (R) project.
|
|
3
|
+
* Copyright (c) 2020-2024 NocoBase Co., Ltd.
|
|
4
|
+
* Authors: NocoBase Team.
|
|
5
|
+
*
|
|
6
|
+
* This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
|
|
7
|
+
* For more information, please refer to: https://www.nocobase.com/agreement.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
// Bundler-generated CommonJS interop helpers (do not edit by hand).
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Install lazy, enumerable getters on `target` for every entry of `all`.
var __export = (target, all) => {
	for (var name in all)
		__defProp(target, name, { get: all[name], enumerable: true });
};
// Copy own properties of `from` onto `to` as getters, skipping `except`
// and keys already present; preserves each property's enumerability.
var __copyProps = (to, from, except, desc) => {
	if (from && typeof from === "object" || typeof from === "function") {
		for (let key of __getOwnPropNames(from))
			if (!__hasOwnProp.call(to, key) && key !== except)
				__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
	}
	return to;
};
// Wrap a module namespace object as a CommonJS export (flags __esModule).
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// Public exports of this action module.
var add_document_exports = {};
__export(add_document_exports, {
	addDocumentAction: () => addDocumentAction
});
module.exports = __toCommonJS(add_document_exports);
|
|
32
|
+
async function addDocumentAction(ctx, next) {
|
|
33
|
+
const { knowledgeBaseId, fileUrl, textContent, userId, filename } = ctx.action.params.values ?? {};
|
|
34
|
+
if (!knowledgeBaseId) {
|
|
35
|
+
ctx.throw(400, "knowledgeBaseId is required");
|
|
36
|
+
}
|
|
37
|
+
if (!fileUrl && !textContent) {
|
|
38
|
+
ctx.throw(400, "fileUrl or textContent is required");
|
|
39
|
+
}
|
|
40
|
+
if (!userId) {
|
|
41
|
+
ctx.throw(400, "userId is required");
|
|
42
|
+
}
|
|
43
|
+
const knowledgeBase = await ctx.db.getRepository("aiKnowledgeBases").findOne({
|
|
44
|
+
filter: { id: knowledgeBaseId },
|
|
45
|
+
appends: ["vectorStore", "vectorStore.vectorDatabase"]
|
|
46
|
+
});
|
|
47
|
+
if (!knowledgeBase) {
|
|
48
|
+
ctx.throw(404, `Knowledge base "${knowledgeBaseId}" not found`);
|
|
49
|
+
}
|
|
50
|
+
const docRepo = ctx.db.getRepository("aiKnowledgeBaseDocuments");
|
|
51
|
+
const docValues = {
|
|
52
|
+
knowledgeBaseId,
|
|
53
|
+
uploadedById: userId,
|
|
54
|
+
status: "pending",
|
|
55
|
+
filename: filename || (fileUrl ? fileUrl.split("/").pop() : "pasted-text")
|
|
56
|
+
};
|
|
57
|
+
if (textContent) {
|
|
58
|
+
docValues.textContent = textContent;
|
|
59
|
+
docValues.filename = filename || "pasted-text";
|
|
60
|
+
}
|
|
61
|
+
if (fileUrl) {
|
|
62
|
+
docValues.fileUrl = fileUrl;
|
|
63
|
+
}
|
|
64
|
+
const doc = await docRepo.create({ values: docValues });
|
|
65
|
+
try {
|
|
66
|
+
const plugin = ctx.app.pm.get("plugin-knowledge-base") || ctx.app.pm.get("@nocobase/plugin-knowledge-base");
|
|
67
|
+
if (plugin == null ? void 0 : plugin.vectorizationPipeline) {
|
|
68
|
+
plugin.vectorizationPipeline.processDocument(doc.id).catch((err) => {
|
|
69
|
+
ctx.app.logger.error(`[addDocument] Vectorization failed for doc ${doc.id}:`, err);
|
|
70
|
+
});
|
|
71
|
+
}
|
|
72
|
+
} catch (err) {
|
|
73
|
+
ctx.app.logger.error("[addDocument] Failed to trigger vectorization:", err);
|
|
74
|
+
}
|
|
75
|
+
ctx.body = {
|
|
76
|
+
success: true,
|
|
77
|
+
documentId: doc.id,
|
|
78
|
+
message: "Document added and vectorization triggered"
|
|
79
|
+
};
|
|
80
|
+
await next();
|
|
81
|
+
}
|
|
82
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
83
|
+
0 && (module.exports = {
|
|
84
|
+
addDocumentAction
|
|
85
|
+
});
|