vectra 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/bin/vectra.js +3 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/GPT3Tokenizer.d.ts.map +1 -0
- package/lib/GPT3Tokenizer.js +17 -0
- package/lib/GPT3Tokenizer.js.map +1 -0
- package/lib/ItemSelector.d.ts +41 -0
- package/lib/ItemSelector.d.ts.map +1 -0
- package/lib/ItemSelector.js +156 -0
- package/lib/ItemSelector.js.map +1 -0
- package/lib/LocalDocument.d.ts +16 -0
- package/lib/LocalDocument.d.ts.map +1 -0
- package/lib/LocalDocument.js +99 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +48 -0
- package/lib/LocalDocumentIndex.d.ts.map +1 -0
- package/lib/LocalDocumentIndex.js +367 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +12 -0
- package/lib/LocalDocumentResult.d.ts.map +1 -0
- package/lib/LocalDocumentResult.js +186 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalIndex.d.ts +130 -0
- package/lib/LocalIndex.d.ts.map +1 -0
- package/lib/LocalIndex.js +405 -0
- package/lib/LocalIndex.js.map +1 -0
- package/lib/OpenAIEmbeddings.d.ts +98 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +139 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/TextSplitter.d.ts +17 -0
- package/lib/TextSplitter.d.ts.map +1 -0
- package/lib/TextSplitter.js +460 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/WebFetcher.d.ts +16 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +144 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/index.d.ts +11 -0
- package/lib/index.d.ts.map +1 -0
- package/lib/index.js +27 -0
- package/lib/index.js.map +1 -0
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +64 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +42 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/types.d.ts +133 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.d.ts.map +1 -0
- package/lib/vectra-cli.js +276 -0
- package/lib/vectra-cli.js.map +1 -0
- package/package.json +21 -3
- package/src/GPT3Tokenizer.ts +15 -0
- package/src/ItemSelector.ts +9 -9
- package/src/LocalDocument.ts +70 -0
- package/src/LocalDocumentIndex.ts +355 -0
- package/src/LocalDocumentResult.ts +206 -0
- package/src/LocalIndex.ts +12 -78
- package/src/OpenAIEmbeddings.ts +205 -0
- package/src/TextSplitter.ts +480 -0
- package/src/WebFetcher.ts +128 -0
- package/src/index.ts +8 -0
- package/src/internals/Colorize.ts +64 -0
- package/src/internals/index.ts +2 -0
- package/src/internals/types.ts +46 -0
- package/src/types.ts +160 -0
- package/src/vectra-cli.ts +238 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Helper that runs a generator function as an asynchronous task: every value the
// generator yields is treated as something to wait for, and the returned promise
// settles with the generator's return value (or rejects with whatever it throws).
// An existing `this.__awaiter` is reused when one is already defined.
//   thisArg     - `this` binding applied to `generator`.
//   _arguments  - argument list applied to `generator` (an empty list when falsy).
//   P           - promise constructor to use; falls back to the global `Promise`.
//   generator   - generator function whose `yield`s mark the suspension points.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
// Wrap a yielded value in a `P` unless it already is an instance of `P`.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
// Resume the generator with the fulfilled value; an exception thrown by the generator rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
// Re-throw a rejection inside the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
// Resolve when the generator is done, otherwise wait on the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
// Instantiate the generator (replacing the function with its iterator) and start it.
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.OpenAIEmbeddings = void 0;
|
|
16
|
+
const axios_1 = __importDefault(require("axios"));
|
|
17
|
+
/**
|
|
18
|
+
* A `PromptCompletionModel` for calling OpenAI and Azure OpenAI hosted models.
|
|
19
|
+
* @remarks
|
|
20
|
+
*/
|
|
21
|
+
class OpenAIEmbeddings {
|
|
22
|
+
/**
|
|
23
|
+
* Creates a new `OpenAIClient` instance.
|
|
24
|
+
* @param options Options for configuring an `OpenAIClient`.
|
|
25
|
+
*/
|
|
26
|
+
constructor(options) {
|
|
27
|
+
this.UserAgent = 'AlphaWave';
|
|
28
|
+
// Check for azure config
|
|
29
|
+
if (options.azureApiKey) {
|
|
30
|
+
this._useAzure = true;
|
|
31
|
+
this.options = Object.assign({
|
|
32
|
+
retryPolicy: [2000, 5000],
|
|
33
|
+
azureApiVersion: '2023-05-15',
|
|
34
|
+
}, options);
|
|
35
|
+
// Cleanup and validate endpoint
|
|
36
|
+
let endpoint = this.options.azureEndpoint.trim();
|
|
37
|
+
if (endpoint.endsWith('/')) {
|
|
38
|
+
endpoint = endpoint.substring(0, endpoint.length - 1);
|
|
39
|
+
}
|
|
40
|
+
if (!endpoint.toLowerCase().startsWith('https://')) {
|
|
41
|
+
throw new Error(`Client created with an invalid endpoint of '${endpoint}'. The endpoint must be a valid HTTPS url.`);
|
|
42
|
+
}
|
|
43
|
+
this.options.azureEndpoint = endpoint;
|
|
44
|
+
}
|
|
45
|
+
else {
|
|
46
|
+
this._useAzure = false;
|
|
47
|
+
this.options = Object.assign({
|
|
48
|
+
retryPolicy: [2000, 5000]
|
|
49
|
+
}, options);
|
|
50
|
+
}
|
|
51
|
+
// Create client
|
|
52
|
+
this._httpClient = axios_1.default.create({
|
|
53
|
+
validateStatus: (status) => status < 400 || status == 429
|
|
54
|
+
});
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Creates embeddings for the given inputs using the OpenAI API.
|
|
58
|
+
* @param model Name of the model to use (or deployment for Azure).
|
|
59
|
+
* @param inputs Text inputs to create embeddings for.
|
|
60
|
+
* @returns A `EmbeddingsResponse` with a status and the generated embeddings or a message when an error occurs.
|
|
61
|
+
*/
|
|
62
|
+
createEmbeddings(inputs) {
|
|
63
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
64
|
+
const response = yield this.createEmbeddingRequest({
|
|
65
|
+
input: inputs,
|
|
66
|
+
});
|
|
67
|
+
// Process response
|
|
68
|
+
if (response.status < 300) {
|
|
69
|
+
return { status: 'success', output: response.data.data.sort((a, b) => a.index - b.index).map((item) => item.embedding) };
|
|
70
|
+
}
|
|
71
|
+
else if (response.status == 429) {
|
|
72
|
+
return { status: 'rate_limited', message: `The embeddings API returned a rate limit error.` };
|
|
73
|
+
}
|
|
74
|
+
else {
|
|
75
|
+
return { status: 'error', message: `The embeddings API returned an error status of ${response.status}: ${response.statusText}` };
|
|
76
|
+
}
|
|
77
|
+
});
|
|
78
|
+
}
|
|
79
|
+
/**
|
|
80
|
+
* @private
|
|
81
|
+
*/
|
|
82
|
+
createEmbeddingRequest(request) {
|
|
83
|
+
var _a;
|
|
84
|
+
if (this._useAzure) {
|
|
85
|
+
const options = this.options;
|
|
86
|
+
const url = `${options.azureEndpoint}/openai/deployments/${options.azureDeployment}/embeddings?api-version=${options.azureApiVersion}`;
|
|
87
|
+
return this.post(url, request);
|
|
88
|
+
}
|
|
89
|
+
else {
|
|
90
|
+
const options = this.options;
|
|
91
|
+
const url = `${(_a = options.endpoint) !== null && _a !== void 0 ? _a : 'https://api.openai.com'}/v1/embeddings`;
|
|
92
|
+
request.model = options.model;
|
|
93
|
+
return this.post(url, request);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* @private
|
|
98
|
+
*/
|
|
99
|
+
post(url, body, retryCount = 0) {
|
|
100
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
101
|
+
// Initialize request config
|
|
102
|
+
const requestConfig = Object.assign({}, this.options.requestConfig);
|
|
103
|
+
// Initialize request headers
|
|
104
|
+
if (!requestConfig.headers) {
|
|
105
|
+
requestConfig.headers = {};
|
|
106
|
+
}
|
|
107
|
+
if (!requestConfig.headers['Content-Type']) {
|
|
108
|
+
requestConfig.headers['Content-Type'] = 'application/json';
|
|
109
|
+
}
|
|
110
|
+
if (!requestConfig.headers['User-Agent']) {
|
|
111
|
+
requestConfig.headers['User-Agent'] = this.UserAgent;
|
|
112
|
+
}
|
|
113
|
+
if (this._useAzure) {
|
|
114
|
+
const options = this.options;
|
|
115
|
+
requestConfig.headers['api-key'] = options.azureApiKey;
|
|
116
|
+
}
|
|
117
|
+
else {
|
|
118
|
+
const options = this.options;
|
|
119
|
+
requestConfig.headers['Authorization'] = `Bearer ${options.apiKey}`;
|
|
120
|
+
if (options.organization) {
|
|
121
|
+
requestConfig.headers['OpenAI-Organization'] = options.organization;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
// Send request
|
|
125
|
+
const response = yield this._httpClient.post(url, body, requestConfig);
|
|
126
|
+
// Check for rate limit error
|
|
127
|
+
if (response.status == 429 && Array.isArray(this.options.retryPolicy) && retryCount < this.options.retryPolicy.length) {
|
|
128
|
+
const delay = this.options.retryPolicy[retryCount];
|
|
129
|
+
yield new Promise((resolve) => setTimeout(resolve, delay));
|
|
130
|
+
return this.post(url, body, retryCount + 1);
|
|
131
|
+
}
|
|
132
|
+
else {
|
|
133
|
+
return response;
|
|
134
|
+
}
|
|
135
|
+
});
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
exports.OpenAIEmbeddings = OpenAIEmbeddings;
|
|
139
|
+
//# sourceMappingURL=OpenAIEmbeddings.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"OpenAIEmbeddings.js","sourceRoot":"","sources":["../src/OpenAIEmbeddings.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;AAAA,kDAAgF;AA2EhF;;;GAGG;AACH,MAAa,gBAAgB;IAWzB;;;OAGG;IACH,YAAmB,OAA6D;QAX/D,cAAS,GAAG,WAAW,CAAC;QAYrC,yBAAyB;QACzB,IAAK,OAAwC,CAAC,WAAW,EAAE;YACvD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;YACtB,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;gBACzB,WAAW,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC;gBACzB,eAAe,EAAE,YAAY;aAChC,EAAE,OAAO,CAAiC,CAAC;YAE5C,gCAAgC;YAChC,IAAI,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC;YACjD,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE;gBACxB,QAAQ,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;aACzD;YAED,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE;gBAChD,MAAM,IAAI,KAAK,CAAC,+CAA+C,QAAQ,4CAA4C,CAAC,CAAC;aACxH;YAED,IAAI,CAAC,OAAO,CAAC,aAAa,GAAG,QAAQ,CAAC;SACzC;aAAM;YACH,IAAI,CAAC,SAAS,GAAG,KAAK,CAAC;YACvB,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;gBACzB,WAAW,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC;aAC5B,EAAE,OAAO,CAA4B,CAAC;SAC1C;QAED,gBAAgB;QAChB,IAAI,CAAC,WAAW,GAAG,eAAK,CAAC,MAAM,CAAC;YAC5B,cAAc,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,GAAG,GAAG,IAAI,MAAM,IAAI,GAAG;SAC5D,CAAC,CAAC;IACP,CAAC;IAED;;;;;OAKG;IACU,gBAAgB,CAAC,MAAyB;;YACnD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,sBAAsB,CAAC;gBAC/C,KAAK,EAAE,MAAM;aAChB,CAAC,CAAC;YAEH,mBAAmB;YACnB,IAAI,QAAQ,CAAC,MAAM,GAAG,GAAG,EAAE;gBACvB,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;aAC5H;iBAAM,IAAI,QAAQ,CAAC,MAAM,IAAI,GAAG,EAAE;gBAC/B,OAAO,EAAE,MAAM,EAAE,cAAc,EAAE,OAAO,EAAE,iDAAiD,EAAE,CAAA;aAChG;iBAAM;gBACH,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,kDAAkD,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,EAAE,CAAC;aACpI;QACL,CAAC;KAAA;IAED;;OAEG;IACO,sBAAsB,CAAC,OAA+B;;QAC5D,IAAI,IAAI,CAAC,SAAS,EAAE;YAChB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAuC,CAAC;YAC7D,MAAM,GAAG,GAAG,GAAG,OAAO,CAAC,aAAa,uBAAuB,OAAO,CAAC,eAAe,2BAA2B,
OAAO,CAAC,eAAgB,EAAE,CAAC;YACxI,OAAO,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;SAClC;aAAM;YACH,MAAM,OAAO,GAAG,IAAI,CAAC,OAAkC,CAAC;YACxD,MAAM,GAAG,GAAG,GAAG,MAAA,OAAO,CAAC,QAAQ,mCAAI,wBAAwB,gBAAgB,CAAC;YAC3E,OAAwC,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;YAChE,OAAO,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;SAClC;IACL,CAAC;IAED;;OAEG;IACa,IAAI,CAAQ,GAAW,EAAE,IAAY,EAAE,UAAU,GAAG,CAAC;;YACjE,4BAA4B;YAC5B,MAAM,aAAa,GAAuB,MAAM,CAAC,MAAM,CAAC,EAAE,EAAE,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;YAExF,6BAA6B;YAC7B,IAAI,CAAC,aAAa,CAAC,OAAO,EAAE;gBACxB,aAAa,CAAC,OAAO,GAAG,EAAE,CAAC;aAC9B;YACD,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE;gBACxC,aAAa,CAAC,OAAO,CAAC,cAAc,CAAC,GAAG,kBAAkB,CAAC;aAC9D;YACD,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,YAAY,CAAC,EAAE;gBACtC,aAAa,CAAC,OAAO,CAAC,YAAY,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC;aACxD;YACD,IAAI,IAAI,CAAC,SAAS,EAAE;gBAChB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAuC,CAAC;gBAC7D,aAAa,CAAC,OAAO,CAAC,SAAS,CAAC,GAAG,OAAO,CAAC,WAAW,CAAC;aAC1D;iBAAM;gBACH,MAAM,OAAO,GAAG,IAAI,CAAC,OAAkC,CAAC;gBACxD,aAAa,CAAC,OAAO,CAAC,eAAe,CAAC,GAAG,UAAU,OAAO,CAAC,MAAM,EAAE,CAAC;gBACpE,IAAI,OAAO,CAAC,YAAY,EAAE;oBACtB,aAAa,CAAC,OAAO,CAAC,qBAAqB,CAAC,GAAG,OAAO,CAAC,YAAY,CAAC;iBACvE;aACJ;YAED,eAAe;YACf,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE,aAAa,CAAC,CAAC;YAEvE,6BAA6B;YAC7B,IAAI,QAAQ,CAAC,MAAM,IAAI,GAAG,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,IAAI,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,MAAM,EAAE;gBACnH,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;gBACnD,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;gBAC3D,OAAO,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE,UAAU,GAAG,CAAC,CAAC,CAAC;aAC/C;iBAAM;gBACH,OAAO,QAAQ,CAAC;aACnB;QACL,CAAC;KAAA;CACJ;AA7HD,4CA6HC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { TextChunk, Tokenizer } from "./types";
|
|
2
|
+
/**
 * Settings that control how `TextSplitter` breaks text into chunks.
 */
export interface TextSplitterConfig {
|
|
3
|
+
/**
 * Separator strings, tried in order. Text is split on the first one; any piece whose
 * token count exceeds `chunkSize` is split again using the remaining separators.
 */
separators: string[];
|
|
4
|
+
/**
 * When true, the separator is appended to every piece except the last one of a split.
 * The constructor defaults this to `false`.
 */
keepSeparators: boolean;
|
|
5
|
+
/**
 * Largest number of tokens a piece may have before it is split further.
 * The constructor defaults this to 400 and throws when it is less than 1.
 */
chunkSize: number;
|
|
6
|
+
/**
 * Number of overlap tokens. The constructor defaults this to 40 and throws when it is
 * negative or greater than `chunkSize`.
 */
chunkOverlap: number;
|
|
7
|
+
/**
 * Tokenizer whose `encode` is used to measure pieces. The constructor creates a
 * `GPT3Tokenizer` when none is supplied.
 */
tokenizer: Tokenizer;
|
|
8
|
+
/**
 * Document type (for example "markdown", "python" or "html") used to look up a
 * separator preset. It is consulted only when `separators` ends up unset or empty.
 */
docType?: string;
|
|
9
|
+
}
|
|
10
|
+
/**
 * Splits text into token-bounded chunks by recursively applying a list of separators.
 */
export declare class TextSplitter {
|
|
11
|
+
/** Effective configuration: the constructor's defaults merged with the supplied config. */
private readonly _config;
|
|
12
|
+
/**
 * @param config Optional overrides for the default settings.
 * @throws Error when `chunkSize` is less than 1, or `chunkOverlap` is negative or
 * greater than `chunkSize`.
 */
constructor(config?: Partial<TextSplitterConfig>);
|
|
13
|
+
/**
 * Splits `text` using the configured separators.
 * @param text Text to split.
 * @returns The resulting chunks; each carries its text, tokens and start/end positions.
 */
split(text: string): TextChunk[];
|
|
14
|
+
/** Splits on the first separator and recurses with the remaining ones for oversized pieces. */
private recursiveSplit;
|
|
15
|
+
/** Returns the separator preset for a document type, or the generic preset when unknown. */
private getSeparators;
|
|
16
|
+
}
|
|
17
|
+
//# sourceMappingURL=TextSplitter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"TextSplitter.d.ts","sourceRoot":"","sources":["../src/TextSplitter.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAE/C,MAAM,WAAW,kBAAkB;IAC/B,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,cAAc,EAAE,OAAO,CAAC;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,SAAS,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,qBAAa,YAAY;IACrB,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAqB;gBAE1B,MAAM,CAAC,EAAE,OAAO,CAAC,kBAAkB,CAAC;IA4BhD,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,EAAE;IA4BvC,OAAO,CAAC,cAAc;IA0CtB,OAAO,CAAC,aAAa;CA8WxB"}
|
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.TextSplitter = void 0;
|
|
4
|
+
const GPT3Tokenizer_1 = require("./GPT3Tokenizer");
|
|
5
|
+
class TextSplitter {
|
|
6
|
+
constructor(config) {
|
|
7
|
+
this._config = Object.assign({
|
|
8
|
+
separators: ["\n\n", "\n", " ", ""],
|
|
9
|
+
keepSeparators: false,
|
|
10
|
+
chunkSize: 400,
|
|
11
|
+
chunkOverlap: 40,
|
|
12
|
+
}, config);
|
|
13
|
+
// Create a default tokenizer if none is provided
|
|
14
|
+
if (!this._config.tokenizer) {
|
|
15
|
+
this._config.tokenizer = new GPT3Tokenizer_1.GPT3Tokenizer();
|
|
16
|
+
}
|
|
17
|
+
// Use default separators if none are provided
|
|
18
|
+
if (!this._config.separators || this._config.separators.length === 0) {
|
|
19
|
+
this._config.separators = this.getSeparators(this._config.docType);
|
|
20
|
+
}
|
|
21
|
+
// Validate the config settings
|
|
22
|
+
if (this._config.chunkSize < 1) {
|
|
23
|
+
throw new Error("chunkSize must be >= 1");
|
|
24
|
+
}
|
|
25
|
+
else if (this._config.chunkOverlap < 0) {
|
|
26
|
+
throw new Error("chunkOverlap must be >= 0");
|
|
27
|
+
}
|
|
28
|
+
else if (this._config.chunkOverlap > this._config.chunkSize) {
|
|
29
|
+
throw new Error("chunkOverlap must be <= chunkSize");
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
split(text) {
|
|
33
|
+
// Get basic chunks
|
|
34
|
+
const chunks = this.recursiveSplit(text, this._config.separators, 0);
|
|
35
|
+
const that = this;
|
|
36
|
+
function getOverlapTokens(tokens) {
|
|
37
|
+
if (tokens != undefined) {
|
|
38
|
+
const len = tokens.length > that._config.chunkOverlap ? that._config.chunkOverlap : tokens.length;
|
|
39
|
+
return tokens.slice(tokens.length);
|
|
40
|
+
}
|
|
41
|
+
else {
|
|
42
|
+
return [];
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
// Add overlap tokens and text to the start and end of each chunk
|
|
46
|
+
if (this._config.chunkOverlap > 0) {
|
|
47
|
+
for (let i = 1; i < chunks.length; i++) {
|
|
48
|
+
const previousChunk = chunks[i - 1];
|
|
49
|
+
const chunk = chunks[i];
|
|
50
|
+
const nextChunk = i < chunks.length - 1 ? chunks[i + 1] : undefined;
|
|
51
|
+
chunk.startOverlap = getOverlapTokens(previousChunk.tokens.reverse()).reverse();
|
|
52
|
+
chunk.endOverlap = getOverlapTokens(nextChunk === null || nextChunk === void 0 ? void 0 : nextChunk.tokens);
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
return chunks;
|
|
56
|
+
}
|
|
57
|
+
recursiveSplit(text, separators, startPos) {
|
|
58
|
+
const chunks = [];
|
|
59
|
+
if (text.length > 0 && separators.length > 0) {
|
|
60
|
+
const separator = separators[0];
|
|
61
|
+
const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
|
|
62
|
+
const parts = text.split(separator);
|
|
63
|
+
for (let i = 0; i < parts.length; i++) {
|
|
64
|
+
const lastChunk = (i === parts.length - 1);
|
|
65
|
+
// Get chunk text and endPos
|
|
66
|
+
let chunk = parts[i];
|
|
67
|
+
const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
|
|
68
|
+
if (this._config.keepSeparators && !lastChunk) {
|
|
69
|
+
chunk += separator;
|
|
70
|
+
}
|
|
71
|
+
// Encode chunk text
|
|
72
|
+
const tokens = this._config.tokenizer.encode(chunk);
|
|
73
|
+
if (tokens.length > this._config.chunkSize) {
|
|
74
|
+
// Break the text into smaller chunks
|
|
75
|
+
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
76
|
+
chunks.push(...subChunks);
|
|
77
|
+
}
|
|
78
|
+
else {
|
|
79
|
+
// Append chunk to output
|
|
80
|
+
chunks.push({
|
|
81
|
+
text: chunk,
|
|
82
|
+
tokens: tokens,
|
|
83
|
+
startPos: startPos,
|
|
84
|
+
endPos: endPos,
|
|
85
|
+
startOverlap: [],
|
|
86
|
+
endOverlap: [],
|
|
87
|
+
});
|
|
88
|
+
}
|
|
89
|
+
// Update startPos
|
|
90
|
+
startPos = endPos + 1;
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
return chunks;
|
|
94
|
+
}
|
|
95
|
+
getSeparators(docType) {
|
|
96
|
+
switch (docType !== null && docType !== void 0 ? docType : '') {
|
|
97
|
+
case "cpp":
|
|
98
|
+
return [
|
|
99
|
+
// Split along class definitions
|
|
100
|
+
"\nclass ",
|
|
101
|
+
// Split along function definitions
|
|
102
|
+
"\nvoid ",
|
|
103
|
+
"\nint ",
|
|
104
|
+
"\nfloat ",
|
|
105
|
+
"\ndouble ",
|
|
106
|
+
// Split along control flow statements
|
|
107
|
+
"\nif ",
|
|
108
|
+
"\nfor ",
|
|
109
|
+
"\nwhile ",
|
|
110
|
+
"\nswitch ",
|
|
111
|
+
"\ncase ",
|
|
112
|
+
// Split by the normal type of lines
|
|
113
|
+
"\n\n",
|
|
114
|
+
"\n",
|
|
115
|
+
" ",
|
|
116
|
+
"",
|
|
117
|
+
];
|
|
118
|
+
case "go":
|
|
119
|
+
return [
|
|
120
|
+
// Split along function definitions
|
|
121
|
+
"\nfunc ",
|
|
122
|
+
"\nvar ",
|
|
123
|
+
"\nconst ",
|
|
124
|
+
"\ntype ",
|
|
125
|
+
// Split along control flow statements
|
|
126
|
+
"\nif ",
|
|
127
|
+
"\nfor ",
|
|
128
|
+
"\nswitch ",
|
|
129
|
+
"\ncase ",
|
|
130
|
+
// Split by the normal type of lines
|
|
131
|
+
"\n\n",
|
|
132
|
+
"\n",
|
|
133
|
+
" ",
|
|
134
|
+
"",
|
|
135
|
+
];
|
|
136
|
+
case "java":
|
|
137
|
+
case "c#":
|
|
138
|
+
case "csharp":
|
|
139
|
+
case "cs":
|
|
140
|
+
case "ts":
|
|
141
|
+
case "tsx":
|
|
142
|
+
case "typescript":
|
|
143
|
+
return [
|
|
144
|
+
// Split along class definitions
|
|
145
|
+
"\nclass ",
|
|
146
|
+
// Split along method definitions
|
|
147
|
+
"\npublic ",
|
|
148
|
+
"\nprotected ",
|
|
149
|
+
"\nprivate ",
|
|
150
|
+
"\nstatic ",
|
|
151
|
+
// Split along control flow statements
|
|
152
|
+
"\nif ",
|
|
153
|
+
"\nfor ",
|
|
154
|
+
"\nwhile ",
|
|
155
|
+
"\nswitch ",
|
|
156
|
+
"\ncase ",
|
|
157
|
+
// Split by the normal type of lines
|
|
158
|
+
"\n\n",
|
|
159
|
+
"\n",
|
|
160
|
+
" ",
|
|
161
|
+
"",
|
|
162
|
+
];
|
|
163
|
+
case "js":
|
|
164
|
+
case "jsx":
|
|
165
|
+
case "javascript":
|
|
166
|
+
return [
|
|
167
|
+
// Split along class definitions
|
|
168
|
+
"\nclass ",
|
|
169
|
+
// Split along function definitions
|
|
170
|
+
"\nfunction ",
|
|
171
|
+
"\nconst ",
|
|
172
|
+
"\nlet ",
|
|
173
|
+
"\nvar ",
|
|
174
|
+
"\nclass ",
|
|
175
|
+
// Split along control flow statements
|
|
176
|
+
"\nif ",
|
|
177
|
+
"\nfor ",
|
|
178
|
+
"\nwhile ",
|
|
179
|
+
"\nswitch ",
|
|
180
|
+
"\ncase ",
|
|
181
|
+
"\ndefault ",
|
|
182
|
+
// Split by the normal type of lines
|
|
183
|
+
"\n\n",
|
|
184
|
+
"\n",
|
|
185
|
+
" ",
|
|
186
|
+
"",
|
|
187
|
+
];
|
|
188
|
+
case "php":
|
|
189
|
+
return [
|
|
190
|
+
// Split along function definitions
|
|
191
|
+
"\nfunction ",
|
|
192
|
+
// Split along class definitions
|
|
193
|
+
"\nclass ",
|
|
194
|
+
// Split along control flow statements
|
|
195
|
+
"\nif ",
|
|
196
|
+
"\nforeach ",
|
|
197
|
+
"\nwhile ",
|
|
198
|
+
"\ndo ",
|
|
199
|
+
"\nswitch ",
|
|
200
|
+
"\ncase ",
|
|
201
|
+
// Split by the normal type of lines
|
|
202
|
+
"\n\n",
|
|
203
|
+
"\n",
|
|
204
|
+
" ",
|
|
205
|
+
"",
|
|
206
|
+
];
|
|
207
|
+
case "proto":
|
|
208
|
+
return [
|
|
209
|
+
// Split along message definitions
|
|
210
|
+
"\nmessage ",
|
|
211
|
+
// Split along service definitions
|
|
212
|
+
"\nservice ",
|
|
213
|
+
// Split along enum definitions
|
|
214
|
+
"\nenum ",
|
|
215
|
+
// Split along option definitions
|
|
216
|
+
"\noption ",
|
|
217
|
+
// Split along import statements
|
|
218
|
+
"\nimport ",
|
|
219
|
+
// Split along syntax declarations
|
|
220
|
+
"\nsyntax ",
|
|
221
|
+
// Split by the normal type of lines
|
|
222
|
+
"\n\n",
|
|
223
|
+
"\n",
|
|
224
|
+
" ",
|
|
225
|
+
"",
|
|
226
|
+
];
|
|
227
|
+
case "python":
|
|
228
|
+
case "py":
|
|
229
|
+
return [
|
|
230
|
+
// First, try to split along class definitions
|
|
231
|
+
"\nclass ",
|
|
232
|
+
"\ndef ",
|
|
233
|
+
"\n\tdef ",
|
|
234
|
+
// Now split by the normal type of lines
|
|
235
|
+
"\n\n",
|
|
236
|
+
"\n",
|
|
237
|
+
" ",
|
|
238
|
+
"",
|
|
239
|
+
];
|
|
240
|
+
case "rst":
|
|
241
|
+
return [
|
|
242
|
+
// Split along section titles
|
|
243
|
+
"\n===\n",
|
|
244
|
+
"\n---\n",
|
|
245
|
+
"\n***\n",
|
|
246
|
+
// Split along directive markers
|
|
247
|
+
"\n.. ",
|
|
248
|
+
// Split by the normal type of lines
|
|
249
|
+
"\n\n",
|
|
250
|
+
"\n",
|
|
251
|
+
" ",
|
|
252
|
+
"",
|
|
253
|
+
];
|
|
254
|
+
case "ruby":
|
|
255
|
+
return [
|
|
256
|
+
// Split along method definitions
|
|
257
|
+
"\ndef ",
|
|
258
|
+
"\nclass ",
|
|
259
|
+
// Split along control flow statements
|
|
260
|
+
"\nif ",
|
|
261
|
+
"\nunless ",
|
|
262
|
+
"\nwhile ",
|
|
263
|
+
"\nfor ",
|
|
264
|
+
"\ndo ",
|
|
265
|
+
"\nbegin ",
|
|
266
|
+
"\nrescue ",
|
|
267
|
+
// Split by the normal type of lines
|
|
268
|
+
"\n\n",
|
|
269
|
+
"\n",
|
|
270
|
+
" ",
|
|
271
|
+
"",
|
|
272
|
+
];
|
|
273
|
+
case "rust":
|
|
274
|
+
return [
|
|
275
|
+
// Split along function definitions
|
|
276
|
+
"\nfn ",
|
|
277
|
+
"\nconst ",
|
|
278
|
+
"\nlet ",
|
|
279
|
+
// Split along control flow statements
|
|
280
|
+
"\nif ",
|
|
281
|
+
"\nwhile ",
|
|
282
|
+
"\nfor ",
|
|
283
|
+
"\nloop ",
|
|
284
|
+
"\nmatch ",
|
|
285
|
+
"\nconst ",
|
|
286
|
+
// Split by the normal type of lines
|
|
287
|
+
"\n\n",
|
|
288
|
+
"\n",
|
|
289
|
+
" ",
|
|
290
|
+
"",
|
|
291
|
+
];
|
|
292
|
+
case "scala":
|
|
293
|
+
return [
|
|
294
|
+
// Split along class definitions
|
|
295
|
+
"\nclass ",
|
|
296
|
+
"\nobject ",
|
|
297
|
+
// Split along method definitions
|
|
298
|
+
"\ndef ",
|
|
299
|
+
"\nval ",
|
|
300
|
+
"\nvar ",
|
|
301
|
+
// Split along control flow statements
|
|
302
|
+
"\nif ",
|
|
303
|
+
"\nfor ",
|
|
304
|
+
"\nwhile ",
|
|
305
|
+
"\nmatch ",
|
|
306
|
+
"\ncase ",
|
|
307
|
+
// Split by the normal type of lines
|
|
308
|
+
"\n\n",
|
|
309
|
+
"\n",
|
|
310
|
+
" ",
|
|
311
|
+
"",
|
|
312
|
+
];
|
|
313
|
+
case "swift":
|
|
314
|
+
return [
|
|
315
|
+
// Split along function definitions
|
|
316
|
+
"\nfunc ",
|
|
317
|
+
// Split along class definitions
|
|
318
|
+
"\nclass ",
|
|
319
|
+
"\nstruct ",
|
|
320
|
+
"\nenum ",
|
|
321
|
+
// Split along control flow statements
|
|
322
|
+
"\nif ",
|
|
323
|
+
"\nfor ",
|
|
324
|
+
"\nwhile ",
|
|
325
|
+
"\ndo ",
|
|
326
|
+
"\nswitch ",
|
|
327
|
+
"\ncase ",
|
|
328
|
+
// Split by the normal type of lines
|
|
329
|
+
"\n\n",
|
|
330
|
+
"\n",
|
|
331
|
+
" ",
|
|
332
|
+
"",
|
|
333
|
+
];
|
|
334
|
+
case "markdown":
|
|
335
|
+
return [
|
|
336
|
+
// First, try to split along Markdown headings (starting with level 2)
|
|
337
|
+
"\n## ",
|
|
338
|
+
"\n### ",
|
|
339
|
+
"\n#### ",
|
|
340
|
+
"\n##### ",
|
|
341
|
+
"\n###### ",
|
|
342
|
+
// Note the alternative syntax for headings (below) is not handled here
|
|
343
|
+
// Heading level 2
|
|
344
|
+
// ---------------
|
|
345
|
+
// End of code block
|
|
346
|
+
"```\n\n",
|
|
347
|
+
// Horizontal lines
|
|
348
|
+
"\n\n***\n\n",
|
|
349
|
+
"\n\n---\n\n",
|
|
350
|
+
"\n\n___\n\n",
|
|
351
|
+
// Note that this splitter doesn't handle horizontal lines defined
|
|
352
|
+
// by *three or more* of ***, ---, or ___, but this is not handled
|
|
353
|
+
"\n\n",
|
|
354
|
+
"\n",
|
|
355
|
+
" ",
|
|
356
|
+
"",
|
|
357
|
+
];
|
|
358
|
+
case "latex":
|
|
359
|
+
return [
|
|
360
|
+
// First, try to split along Latex sections
|
|
361
|
+
"\n\\chapter{",
|
|
362
|
+
"\n\\section{",
|
|
363
|
+
"\n\\subsection{",
|
|
364
|
+
"\n\\subsubsection{",
|
|
365
|
+
// Now split by environments
|
|
366
|
+
"\n\\begin{enumerate}",
|
|
367
|
+
"\n\\begin{itemize}",
|
|
368
|
+
"\n\\begin{description}",
|
|
369
|
+
"\n\\begin{list}",
|
|
370
|
+
"\n\\begin{quote}",
|
|
371
|
+
"\n\\begin{quotation}",
|
|
372
|
+
"\n\\begin{verse}",
|
|
373
|
+
"\n\\begin{verbatim}",
|
|
374
|
+
// Now split by math environments
|
|
375
|
+
"\n\\begin{align}",
|
|
376
|
+
"$$",
|
|
377
|
+
"$",
|
|
378
|
+
// Now split by the normal type of lines
|
|
379
|
+
"\n\n",
|
|
380
|
+
"\n",
|
|
381
|
+
" ",
|
|
382
|
+
"",
|
|
383
|
+
];
|
|
384
|
+
case "html":
|
|
385
|
+
return [
|
|
386
|
+
// First, try to split along HTML tags
|
|
387
|
+
"<body>",
|
|
388
|
+
"<div>",
|
|
389
|
+
"<p>",
|
|
390
|
+
"<br>",
|
|
391
|
+
"<li>",
|
|
392
|
+
"<h1>",
|
|
393
|
+
"<h2>",
|
|
394
|
+
"<h3>",
|
|
395
|
+
"<h4>",
|
|
396
|
+
"<h5>",
|
|
397
|
+
"<h6>",
|
|
398
|
+
"<span>",
|
|
399
|
+
"<table>",
|
|
400
|
+
"<tr>",
|
|
401
|
+
"<td>",
|
|
402
|
+
"<th>",
|
|
403
|
+
"<ul>",
|
|
404
|
+
"<ol>",
|
|
405
|
+
"<header>",
|
|
406
|
+
"<footer>",
|
|
407
|
+
"<nav>",
|
|
408
|
+
// Head
|
|
409
|
+
"<head>",
|
|
410
|
+
"<style>",
|
|
411
|
+
"<script>",
|
|
412
|
+
"<meta>",
|
|
413
|
+
"<title>",
|
|
414
|
+
// Normal type of lines
|
|
415
|
+
" ",
|
|
416
|
+
"",
|
|
417
|
+
];
|
|
418
|
+
case "sol":
|
|
419
|
+
return [
|
|
420
|
+
// Split along compiler informations definitions
|
|
421
|
+
"\npragma ",
|
|
422
|
+
"\nusing ",
|
|
423
|
+
// Split along contract definitions
|
|
424
|
+
"\ncontract ",
|
|
425
|
+
"\ninterface ",
|
|
426
|
+
"\nlibrary ",
|
|
427
|
+
// Split along method definitions
|
|
428
|
+
"\nconstructor ",
|
|
429
|
+
"\ntype ",
|
|
430
|
+
"\nfunction ",
|
|
431
|
+
"\nevent ",
|
|
432
|
+
"\nmodifier ",
|
|
433
|
+
"\nerror ",
|
|
434
|
+
"\nstruct ",
|
|
435
|
+
"\nenum ",
|
|
436
|
+
// Split along control flow statements
|
|
437
|
+
"\nif ",
|
|
438
|
+
"\nfor ",
|
|
439
|
+
"\nwhile ",
|
|
440
|
+
"\ndo while ",
|
|
441
|
+
"\nassembly ",
|
|
442
|
+
// Split by the normal type of lines
|
|
443
|
+
"\n\n",
|
|
444
|
+
"\n",
|
|
445
|
+
" ",
|
|
446
|
+
"",
|
|
447
|
+
];
|
|
448
|
+
default:
|
|
449
|
+
return [
|
|
450
|
+
// Split by the normal type of lines
|
|
451
|
+
"\n\n",
|
|
452
|
+
"\n",
|
|
453
|
+
" ",
|
|
454
|
+
"",
|
|
455
|
+
];
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
exports.TextSplitter = TextSplitter;
|
|
460
|
+
//# sourceMappingURL=TextSplitter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"TextSplitter.js","sourceRoot":"","sources":["../src/TextSplitter.ts"],"names":[],"mappings":";;;AAAA,mDAAgD;AAYhD,MAAa,YAAY;IAGrB,YAAmB,MAAoC;QACnD,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;YACzB,UAAU,EAAE,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC;YACnC,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,GAAG;YACd,YAAY,EAAE,EAAE;SACG,EAAE,MAAM,CAAC,CAAC;QAEjC,iDAAiD;QACjD,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE;YACzB,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,IAAI,6BAAa,EAAE,CAAC;SAChD;QAED,8CAA8C;QAC9C,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE;YAClE,IAAI,CAAC,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;SACtE;QAED,+BAA+B;QAC/B,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,CAAC,EAAE;YAC5B,MAAM,IAAI,KAAK,CAAC,wBAAwB,CAAC,CAAC;SAC7C;aAAM,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,CAAC,EAAE;YACtC,MAAM,IAAI,KAAK,CAAC,2BAA2B,CAAC,CAAC;SAChD;aAAM,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE;YAC3D,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;SACxD;IACL,CAAC;IAEM,KAAK,CAAC,IAAY;QACrB,mBAAmB;QACnB,MAAM,MAAM,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;QAErE,MAAM,IAAI,GAAG,IAAI,CAAC;QAClB,SAAS,gBAAgB,CAAC,MAAiB;YACvC,IAAI,MAAM,IAAI,SAAS,EAAE;gBACrB,MAAM,GAAG,GAAG,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC;gBAClG,OAAO,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;aACtC;iBAAM;gBACH,OAAO,EAAE,CAAC;aACb;QACL,CAAC;QAED,iEAAiE;QACjE,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,CAAC,EAAE;YAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;gBACpC,MAAM,aAAa,GAAG,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;gBACpC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;gBACxB,MAAM,SAAS,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;gBACpE,KAAK,CAAC,YAAY,GAAG,gBAAgB,CAAC,aAAa,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC;gBAChF,KAAK,CAAC,UAAU,GAAG,gBAAgB,CAAC,S
AAS,aAAT,SAAS,uBAAT,SAAS,CAAE,MAAM,CAAC,CAAC;aAC1D;SACJ;QAED,OAAO,MAAM,CAAC;IAClB,CAAC;IAEO,cAAc,CAAC,IAAY,EAAE,UAAoB,EAAE,QAAgB;QACvE,MAAM,MAAM,GAAgB,EAAE,CAAC;QAC/B,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE;YAC1C,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,cAAc,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YACxE,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;YACpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;gBACnC,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;gBAE3C,4BAA4B;gBAC5B,IAAI,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACrB,MAAM,MAAM,GAAG,CAAC,QAAQ,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;gBACpF,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,IAAI,CAAC,SAAS,EAAE;oBAC3C,KAAK,IAAI,SAAS,CAAC;iBACtB;gBAED,oBAAoB;gBACpB,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBACpD,IAAI,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE;oBACxC,qCAAqC;oBACrC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,cAAc,EAAE,QAAQ,CAAC,CAAC;oBACvE,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;iBAC7B;qBAAM;oBACH,yBAAyB;oBACzB,MAAM,CAAC,IAAI,CAAC;wBACR,IAAI,EAAE,KAAK;wBACX,MAAM,EAAE,MAAM;wBACd,QAAQ,EAAE,QAAQ;wBAClB,MAAM,EAAE,MAAM;wBACd,YAAY,EAAE,EAAE;wBAChB,UAAU,EAAE,EAAE;qBACjB,CAAC,CAAC;iBACN;gBAED,kBAAkB;gBAClB,QAAQ,GAAG,MAAM,GAAG,CAAC,CAAC;aACzB;SACJ;QAED,OAAO,MAAM,CAAC;IAClB,CAAC;IAEO,aAAa,CAAC,OAAgB;QAClC,QAAQ,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,EAAE;YACnB,KAAK,KAAK;gBACN,OAAO;oBACH,gCAAgC;oBAChC,UAAU;oBACV,mCAAmC;oBACnC,SAAS;oBACT,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,sCAAsC;oBACtC,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,IAAI;gBACL,OAAO;oBACH,mCAAmC;oBACnC,SAAS;oBACT,QAAQ;oBACR,UAAU;oBACV,SAAS;oBACT,sCAAsC;oBACtC,OAAO;oBACP,QAAQ;oBACR,WAAW;oBACX,SAAS;oBACT,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAA
G;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,MAAM,CAAC;YACZ,KAAK,IAAI,CAAC;YACV,KAAK,QAAQ,CAAC;YACd,KAAK,IAAI,CAAC;YACV,KAAK,IAAI,CAAC;YACV,KAAK,KAAK,CAAC;YACX,KAAK,YAAY;gBACb,OAAO;oBACH,gCAAgC;oBAChC,UAAU;oBACV,iCAAiC;oBACjC,WAAW;oBACX,cAAc;oBACd,YAAY;oBACZ,WAAW;oBACX,sCAAsC;oBACtC,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,IAAI,CAAC;YACV,KAAK,KAAK,CAAC;YACX,KAAK,YAAY;gBACb,OAAO;oBACH,gCAAgC;oBAChC,UAAU;oBACV,mCAAmC;oBACnC,aAAa;oBACb,UAAU;oBACV,QAAQ;oBACR,QAAQ;oBACR,UAAU;oBACV,sCAAsC;oBACtC,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,YAAY;oBACZ,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,KAAK;gBACN,OAAO;oBACH,mCAAmC;oBACnC,aAAa;oBACb,gCAAgC;oBAChC,UAAU;oBACV,sCAAsC;oBACtC,OAAO;oBACP,YAAY;oBACZ,UAAU;oBACV,OAAO;oBACP,WAAW;oBACX,SAAS;oBACT,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,OAAO;gBACR,OAAO;oBACH,kCAAkC;oBAClC,YAAY;oBACZ,kCAAkC;oBAClC,YAAY;oBACZ,+BAA+B;oBAC/B,SAAS;oBACT,iCAAiC;oBACjC,WAAW;oBACX,gCAAgC;oBAChC,WAAW;oBACX,kCAAkC;oBAClC,WAAW;oBACX,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,QAAQ,CAAC;YACd,KAAK,IAAI;gBACL,OAAO;oBACH,8CAA8C;oBAC9C,UAAU;oBACV,QAAQ;oBACR,UAAU;oBACV,wCAAwC;oBACxC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,KAAK;gBACN,OAAO;oBACH,6BAA6B;oBAC7B,SAAS;oBACT,SAAS;oBACT,SAAS;oBACT,gCAAgC;oBAChC,OAAO;oBACP,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,MAAM;gBACP,OAAO;oBACH,iCAAiC;oBACjC,QAAQ;oBACR,UAAU;oBACV,sCAAsC;oBACtC,OAAO;oBACP,WAAW;oBACX,UAAU;oBACV,QAAQ;oBACR,OAAO;oBACP,UAAU;oBACV,WAAW;oBACX,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,MAAM;gBACP,OAAO;oBACH,mCAAmC;oBACnC,OAAO;oBACP,UAAU;oBACV,QAAQ;oBACR,sCAAsC;oBACtC,OAAO;oBACP,UAAU;oBACV,QAAQ;oBACR,SAAS;oBACT,UAAU;oBACV,UAAU;oBACV,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,OAAO;gBACR,OAAO;oBACH,gCAAgC;oBAChC,UAAU;oBACV,
WAAW;oBACX,iCAAiC;oBACjC,QAAQ;oBACR,QAAQ;oBACR,QAAQ;oBACR,sCAAsC;oBACtC,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,UAAU;oBACV,SAAS;oBACT,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,OAAO;gBACR,OAAO;oBACH,mCAAmC;oBACnC,SAAS;oBACT,gCAAgC;oBAChC,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,sCAAsC;oBACtC,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,OAAO;oBACP,WAAW;oBACX,SAAS;oBACT,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,UAAU;gBACX,OAAO;oBACH,sEAAsE;oBACtE,OAAO;oBACP,QAAQ;oBACR,SAAS;oBACT,UAAU;oBACV,WAAW;oBACX,uEAAuE;oBACvE,kBAAkB;oBAClB,kBAAkB;oBAClB,oBAAoB;oBACpB,SAAS;oBACT,mBAAmB;oBACnB,aAAa;oBACb,aAAa;oBACb,aAAa;oBACb,kEAAkE;oBAClE,kEAAkE;oBAClE,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,OAAO;gBACR,OAAO;oBACH,2CAA2C;oBAC3C,cAAc;oBACd,cAAc;oBACd,iBAAiB;oBACjB,oBAAoB;oBAEpB,4BAA4B;oBAC5B,sBAAsB;oBACtB,oBAAoB;oBACpB,wBAAwB;oBACxB,iBAAiB;oBACjB,kBAAkB;oBAClB,sBAAsB;oBACtB,kBAAkB;oBAClB,qBAAqB;oBAErB,iCAAiC;oBACjC,kBAAkB;oBAClB,IAAI;oBACJ,GAAG;oBAEH,wCAAwC;oBACxC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,MAAM;gBACP,OAAO;oBACH,sCAAsC;oBACtC,QAAQ;oBACR,OAAO;oBACP,KAAK;oBACL,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,QAAQ;oBACR,SAAS;oBACT,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,UAAU;oBACV,UAAU;oBACV,OAAO;oBACP,OAAO;oBACP,QAAQ;oBACR,SAAS;oBACT,UAAU;oBACV,QAAQ;oBACR,SAAS;oBACT,uBAAuB;oBACvB,GAAG;oBACH,EAAE;iBACL,CAAC;YACN,KAAK,KAAK;gBACN,OAAO;oBACH,gDAAgD;oBAChD,WAAW;oBACX,UAAU;oBACV,mCAAmC;oBACnC,aAAa;oBACb,cAAc;oBACd,YAAY;oBACZ,iCAAiC;oBACjC,gBAAgB;oBAChB,SAAS;oBACT,aAAa;oBACb,UAAU;oBACV,aAAa;oBACb,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,sCAAsC;oBACtC,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,aAAa;oBACb,aAAa;oBACb,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;YACN;gBACI,OAAO;oBACH,oCAAoC;oBACpC,MAAM;oBACN,IAAI;oBACJ,GAAG;oBACH,EAAE;iBACL,CAAC;SACT;IACL,CAAC;CACJ;AAndD,oCAmdC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { AxiosRequestConfig } from "axios";
|
|
2
|
+
import { TextFetcher } from './types';
|
|
3
|
+
export interface WebFetcherConfig {
|
|
4
|
+
headers?: Record<string, string>;
|
|
5
|
+
requestConfig?: AxiosRequestConfig;
|
|
6
|
+
htmlToText: boolean;
|
|
7
|
+
summarizeHtml: boolean;
|
|
8
|
+
}
|
|
9
|
+
export declare class WebFetcher implements TextFetcher {
|
|
10
|
+
private readonly _config;
|
|
11
|
+
constructor(config?: Partial<WebFetcherConfig>);
|
|
12
|
+
fetch(uri: string): Promise<string>;
|
|
13
|
+
private extractText;
|
|
14
|
+
private fetchPage;
|
|
15
|
+
}
|
|
16
|
+
//# sourceMappingURL=WebFetcher.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"WebFetcher.d.ts","sourceRoot":"","sources":["../src/WebFetcher.ts"],"names":[],"mappings":"AAAA,OAAc,EAAE,kBAAkB,EAAE,MAAM,OAAO,CAAC;AAElD,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AA2BtC,MAAM,WAAW,gBAAgB;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAC,MAAM,CAAC,CAAC;IAChC,aAAa,CAAC,EAAE,kBAAkB,CAAC;IACnC,UAAU,EAAE,OAAO,CAAC;IACpB,aAAa,EAAE,OAAO,CAAC;CAC1B;AAED,qBAAa,UAAW,YAAW,WAAW;IAC1C,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAmB;gBAExB,MAAM,CAAC,EAAE,OAAO,CAAC,gBAAgB,CAAC;IAOxC,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAShD,OAAO,CAAC,WAAW;YAyCL,SAAS;CA+B1B"}
|