langchain 0.0.132 → 0.0.134
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/chat/outputParser.cjs +2 -1
- package/dist/agents/chat/outputParser.js +2 -1
- package/dist/agents/executor.cjs +106 -7
- package/dist/agents/executor.d.ts +23 -0
- package/dist/agents/executor.js +104 -6
- package/dist/agents/mrkl/outputParser.cjs +2 -1
- package/dist/agents/mrkl/outputParser.js +2 -1
- package/dist/callbacks/index.cjs +2 -1
- package/dist/callbacks/index.d.ts +1 -1
- package/dist/callbacks/index.js +1 -1
- package/dist/chains/sql_db/sql_db_chain.d.ts +1 -1
- package/dist/chains/sql_db/sql_db_prompt.d.ts +6 -6
- package/dist/chat_models/googlevertexai.cjs +1 -1
- package/dist/chat_models/googlevertexai.d.ts +2 -2
- package/dist/chat_models/googlevertexai.js +2 -2
- package/dist/chat_models/ollama.cjs +8 -8
- package/dist/chat_models/ollama.js +8 -8
- package/dist/document_loaders/web/notionapi.cjs +153 -74
- package/dist/document_loaders/web/notionapi.d.ts +19 -10
- package/dist/document_loaders/web/notionapi.js +154 -75
- package/dist/document_loaders/web/recursive_url.cjs +177 -0
- package/dist/document_loaders/web/recursive_url.d.ts +27 -0
- package/dist/document_loaders/web/recursive_url.js +173 -0
- package/dist/embeddings/googlevertexai.cjs +1 -1
- package/dist/embeddings/googlevertexai.d.ts +2 -2
- package/dist/embeddings/googlevertexai.js +2 -2
- package/dist/experimental/multimodal_embeddings/googlevertexai.cjs +1 -1
- package/dist/experimental/multimodal_embeddings/googlevertexai.d.ts +2 -2
- package/dist/experimental/multimodal_embeddings/googlevertexai.js +2 -2
- package/dist/hub.cjs +16 -0
- package/dist/hub.d.ts +4 -0
- package/dist/hub.js +11 -0
- package/dist/llms/bedrock.cjs +63 -19
- package/dist/llms/bedrock.d.ts +9 -1
- package/dist/llms/bedrock.js +63 -19
- package/dist/llms/googlevertexai.cjs +1 -1
- package/dist/llms/googlevertexai.js +2 -2
- package/dist/load/import_constants.cjs +3 -0
- package/dist/load/import_constants.js +3 -0
- package/dist/schema/output_parser.cjs +2 -2
- package/dist/schema/output_parser.js +2 -2
- package/dist/tools/base.cjs +26 -2
- package/dist/tools/base.d.ts +9 -0
- package/dist/tools/base.js +24 -1
- package/dist/tools/sql.cjs +9 -3
- package/dist/tools/sql.d.ts +0 -1
- package/dist/tools/sql.js +9 -3
- package/dist/types/googlevertexai-types.d.ts +8 -3
- package/dist/util/googlevertexai-connection.cjs +49 -15
- package/dist/util/googlevertexai-connection.d.ts +12 -4
- package/dist/util/googlevertexai-connection.js +46 -13
- package/dist/vectorstores/googlevertexai.cjs +551 -0
- package/dist/vectorstores/googlevertexai.d.ts +180 -0
- package/dist/vectorstores/googlevertexai.js +520 -0
- package/dist/vectorstores/myscale.cjs +2 -2
- package/dist/vectorstores/myscale.d.ts +1 -1
- package/dist/vectorstores/myscale.js +2 -2
- package/dist/vectorstores/vectara.cjs +11 -2
- package/dist/vectorstores/vectara.d.ts +10 -1
- package/dist/vectorstores/vectara.js +11 -2
- package/document_loaders/web/recursive_url.cjs +1 -0
- package/document_loaders/web/recursive_url.d.ts +1 -0
- package/document_loaders/web/recursive_url.js +1 -0
- package/hub.cjs +1 -0
- package/hub.d.ts +1 -0
- package/hub.js +1 -0
- package/package.json +41 -2
- package/vectorstores/googlevertexai.cjs +1 -0
- package/vectorstores/googlevertexai.d.ts +1 -0
- package/vectorstores/googlevertexai.js +1 -0
|
@@ -1,8 +1,22 @@
|
|
|
1
|
-
import { Client, isFullBlock, isFullPage, iteratePaginatedAPI, } from "@notionhq/client";
|
|
1
|
+
import { Client, isFullBlock, isFullPage, iteratePaginatedAPI, APIErrorCode, isNotionClientError, isFullDatabase, } from "@notionhq/client";
|
|
2
2
|
import { NotionToMarkdown } from "notion-to-md";
|
|
3
3
|
import { getBlockChildren } from "notion-to-md/build/utils/notion.js";
|
|
4
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
5
4
|
import { Document } from "../../document.js";
|
|
5
|
+
import { BaseDocumentLoader } from "../base.js";
|
|
6
|
+
import { AsyncCaller } from "../../util/async_caller.js";
|
|
7
|
+
// True when `res` is not a Notion client error and its `object` field is "page".
const isPageResponse = (res) => !isNotionClientError(res) && res.object === "page";
|
|
8
|
+
// True when `res` is not a Notion client error and its `object` field is "database".
const isDatabaseResponse = (res) => !isNotionClientError(res) && res.object === "database";
|
|
9
|
+
// Thin wrapper over the SDK's isNotionClientError, named to match the helpers above.
const isErrorResponse = (res) => isNotionClientError(res);
|
|
10
|
+
// True only for page responses that also pass the SDK's isFullPage check.
const isPage = (res) => isPageResponse(res) && isFullPage(res);
|
|
11
|
+
// True only for database responses that also pass the SDK's isFullDatabase check.
const isDatabase = (res) => isDatabaseResponse(res) && isFullDatabase(res);
|
|
12
|
+
const getTitle = (obj) => {
|
|
13
|
+
if (isPage(obj) && obj.properties.title.type === "title") {
|
|
14
|
+
return obj.properties.title.title[0]?.plain_text;
|
|
15
|
+
}
|
|
16
|
+
if (isDatabase(obj))
|
|
17
|
+
return obj.title[0]?.plain_text;
|
|
18
|
+
return null;
|
|
19
|
+
};
|
|
6
20
|
/**
|
|
7
21
|
* A class that extends the BaseDocumentLoader class. It represents a
|
|
8
22
|
* document loader for loading documents from Notion using the Notion API.
|
|
@@ -10,6 +24,12 @@ import { Document } from "../../document.js";
|
|
|
10
24
|
export class NotionAPILoader extends BaseDocumentLoader {
|
|
11
25
|
constructor(options) {
|
|
12
26
|
super();
|
|
27
|
+
Object.defineProperty(this, "caller", {
|
|
28
|
+
enumerable: true,
|
|
29
|
+
configurable: true,
|
|
30
|
+
writable: true,
|
|
31
|
+
value: void 0
|
|
32
|
+
});
|
|
13
33
|
Object.defineProperty(this, "notionClient", {
|
|
14
34
|
enumerable: true,
|
|
15
35
|
configurable: true,
|
|
@@ -28,19 +48,66 @@ export class NotionAPILoader extends BaseDocumentLoader {
|
|
|
28
48
|
writable: true,
|
|
29
49
|
value: void 0
|
|
30
50
|
});
|
|
31
|
-
Object.defineProperty(this, "
|
|
51
|
+
Object.defineProperty(this, "pageQueue", {
|
|
52
|
+
enumerable: true,
|
|
53
|
+
configurable: true,
|
|
54
|
+
writable: true,
|
|
55
|
+
value: void 0
|
|
56
|
+
});
|
|
57
|
+
Object.defineProperty(this, "pageCompleted", {
|
|
58
|
+
enumerable: true,
|
|
59
|
+
configurable: true,
|
|
60
|
+
writable: true,
|
|
61
|
+
value: void 0
|
|
62
|
+
});
|
|
63
|
+
Object.defineProperty(this, "pageQueueTotal", {
|
|
64
|
+
enumerable: true,
|
|
65
|
+
configurable: true,
|
|
66
|
+
writable: true,
|
|
67
|
+
value: void 0
|
|
68
|
+
});
|
|
69
|
+
Object.defineProperty(this, "documents", {
|
|
32
70
|
enumerable: true,
|
|
33
71
|
configurable: true,
|
|
34
72
|
writable: true,
|
|
35
73
|
value: void 0
|
|
36
74
|
});
|
|
37
|
-
this
|
|
75
|
+
Object.defineProperty(this, "rootTitle", {
|
|
76
|
+
enumerable: true,
|
|
77
|
+
configurable: true,
|
|
78
|
+
writable: true,
|
|
79
|
+
value: void 0
|
|
80
|
+
});
|
|
81
|
+
Object.defineProperty(this, "onDocumentLoaded", {
|
|
82
|
+
enumerable: true,
|
|
83
|
+
configurable: true,
|
|
84
|
+
writable: true,
|
|
85
|
+
value: void 0
|
|
86
|
+
});
|
|
87
|
+
this.caller = new AsyncCaller({
|
|
88
|
+
maxConcurrency: 64,
|
|
89
|
+
...options.callerOptions,
|
|
90
|
+
});
|
|
91
|
+
this.notionClient = new Client({
|
|
92
|
+
logger: () => { },
|
|
93
|
+
...options.clientOptions,
|
|
94
|
+
});
|
|
38
95
|
this.n2mClient = new NotionToMarkdown({
|
|
39
96
|
notionClient: this.notionClient,
|
|
40
97
|
config: { parseChildPages: false, convertImagesToBase64: false },
|
|
41
98
|
});
|
|
42
99
|
this.id = options.id;
|
|
43
|
-
this.
|
|
100
|
+
this.pageQueue = [];
|
|
101
|
+
this.pageCompleted = [];
|
|
102
|
+
this.pageQueueTotal = 0;
|
|
103
|
+
this.documents = [];
|
|
104
|
+
this.rootTitle = "";
|
|
105
|
+
this.onDocumentLoaded = options.onDocumentLoaded ?? ((_ti, _cu) => { });
|
|
106
|
+
}
|
|
107
|
+
addToQueue(...items) {
|
|
108
|
+
const deDuped = items.filter((item) => !this.pageCompleted.concat(this.pageQueue).includes(item));
|
|
109
|
+
this.pageQueue.push(...deDuped);
|
|
110
|
+
this.pageQueueTotal += deDuped.length;
|
|
44
111
|
}
|
|
45
112
|
/**
|
|
46
113
|
* Parses the properties of a Notion page and returns them as key-value
|
|
@@ -123,123 +190,135 @@ export class NotionAPILoader extends BaseDocumentLoader {
|
|
|
123
190
|
* @returns A Promise that resolves to an MdBlock object.
|
|
124
191
|
*/
|
|
125
192
|
async loadBlock(block) {
|
|
126
|
-
|
|
193
|
+
const mdBlock = {
|
|
127
194
|
type: block.type,
|
|
128
195
|
blockId: block.id,
|
|
129
|
-
parent: await this.n2mClient.blockToMarkdown(block),
|
|
196
|
+
parent: await this.caller.call(() => this.n2mClient.blockToMarkdown(block)),
|
|
130
197
|
children: [],
|
|
131
198
|
};
|
|
199
|
+
if (block.has_children) {
|
|
200
|
+
const block_id = block.type === "synced_block" &&
|
|
201
|
+
block.synced_block?.synced_from?.block_id
|
|
202
|
+
? block.synced_block.synced_from.block_id
|
|
203
|
+
: block.id;
|
|
204
|
+
const childBlocks = await this.loadBlocks(await this.caller.call(() => getBlockChildren(this.notionClient, block_id, null)));
|
|
205
|
+
mdBlock.children = childBlocks;
|
|
206
|
+
}
|
|
207
|
+
return mdBlock;
|
|
132
208
|
}
|
|
133
209
|
/**
|
|
134
|
-
* Loads Notion blocks and their
|
|
210
|
+
* Loads Notion blocks and their children recursively.
|
|
135
211
|
* @param blocksResponse The response from the Notion API containing the blocks to load.
|
|
136
|
-
* @returns A Promise that resolves to an
|
|
212
|
+
* @returns A Promise that resolves to an array containing the loaded MdBlocks.
|
|
137
213
|
*/
|
|
138
|
-
async
|
|
214
|
+
async loadBlocks(blocksResponse) {
|
|
139
215
|
const blocks = blocksResponse.filter(isFullBlock);
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
const childBlocksDocs = await this.loadBlocksAndDocs(await getBlockChildren(this.notionClient, block_id, null));
|
|
158
|
-
mdBlock.children = childBlocksDocs.mdBlocks;
|
|
159
|
-
childDocuments = childBlocksDocs.childDocuments;
|
|
160
|
-
}
|
|
161
|
-
return {
|
|
162
|
-
mdBlocks: [mdBlock],
|
|
163
|
-
childDocuments,
|
|
164
|
-
};
|
|
165
|
-
})),
|
|
216
|
+
// Add child pages to queue
|
|
217
|
+
const childPages = blocks
|
|
218
|
+
.filter((block) => block.type.includes("child_page"))
|
|
219
|
+
.map((block) => block.id);
|
|
220
|
+
if (childPages.length > 0)
|
|
221
|
+
this.addToQueue(...childPages);
|
|
222
|
+
// Add child database pages to queue
|
|
223
|
+
const childDatabases = blocks
|
|
224
|
+
.filter((block) => block.type.includes("child_database"))
|
|
225
|
+
.map((block) => this.caller.call(() => this.loadDatabase(block.id)));
|
|
226
|
+
// Load this block and child blocks
|
|
227
|
+
const loadingMdBlocks = blocks
|
|
228
|
+
.filter((block) => !["child_page", "child_database"].includes(block.type))
|
|
229
|
+
.map((block) => this.loadBlock(block));
|
|
230
|
+
const [mdBlocks] = await Promise.all([
|
|
231
|
+
Promise.all(loadingMdBlocks),
|
|
232
|
+
Promise.all(childDatabases),
|
|
166
233
|
]);
|
|
167
|
-
|
|
168
|
-
.flat()
|
|
169
|
-
.map((blockDoc) => blockDoc.mdBlocks);
|
|
170
|
-
const childDocuments = blocksDocsArray
|
|
171
|
-
.flat()
|
|
172
|
-
.map((blockDoc) => blockDoc.childDocuments);
|
|
173
|
-
return {
|
|
174
|
-
mdBlocks: [...allMdBlocks.flat()],
|
|
175
|
-
childDocuments: [
|
|
176
|
-
...childPageDocuments.flat(),
|
|
177
|
-
...childDatabaseDocuments.flat(),
|
|
178
|
-
...childDocuments.flat(),
|
|
179
|
-
],
|
|
180
|
-
};
|
|
234
|
+
return mdBlocks;
|
|
181
235
|
}
|
|
182
236
|
/**
|
|
183
|
-
* Loads a Notion page and its child documents.
|
|
237
|
+
* Loads a Notion page and its child documents, then adds it to the completed documents array.
|
|
184
238
|
* @param page The Notion page or page ID to load.
|
|
185
|
-
* @returns A Promise that resolves to an array of Documents.
|
|
186
239
|
*/
|
|
187
240
|
async loadPage(page) {
|
|
188
|
-
// Check page is a page ID or a
|
|
241
|
+
// Check page is a page ID or a PageObjectResponse
|
|
189
242
|
const [pageData, pageId] = typeof page === "string"
|
|
190
|
-
? [
|
|
243
|
+
? [
|
|
244
|
+
this.caller.call(() => this.notionClient.pages.retrieve({ page_id: page })),
|
|
245
|
+
page,
|
|
246
|
+
]
|
|
191
247
|
: [page, page.id];
|
|
192
248
|
const [pageDetails, pageBlocks] = await Promise.all([
|
|
193
249
|
pageData,
|
|
194
|
-
getBlockChildren(this.notionClient, pageId, null),
|
|
250
|
+
this.caller.call(() => getBlockChildren(this.notionClient, pageId, null)),
|
|
195
251
|
]);
|
|
196
252
|
if (!isFullPage(pageDetails))
|
|
197
|
-
return
|
|
198
|
-
const
|
|
253
|
+
return;
|
|
254
|
+
const mdBlocks = await this.loadBlocks(pageBlocks);
|
|
199
255
|
const mdStringObject = this.n2mClient.toMarkdownString(mdBlocks);
|
|
200
256
|
const pageDocument = new Document({
|
|
201
257
|
pageContent: mdStringObject.parent,
|
|
202
258
|
metadata: this.parsePageDetails(pageDetails),
|
|
203
259
|
});
|
|
204
|
-
|
|
260
|
+
this.documents.push(pageDocument);
|
|
261
|
+
this.pageCompleted.push(pageId);
|
|
262
|
+
this.onDocumentLoaded(this.documents.length, this.pageQueueTotal, pageDocument.metadata.properties.title, this.rootTitle);
|
|
205
263
|
}
|
|
206
264
|
/**
|
|
207
|
-
* Loads a Notion database and
|
|
265
|
+
* Loads a Notion database and adds its pages to the queue.
|
|
208
266
|
* @param id The ID of the Notion database to load.
|
|
209
|
-
* @returns A Promise that resolves to an array of Documents.
|
|
210
267
|
*/
|
|
211
268
|
async loadDatabase(id) {
|
|
212
|
-
const documents = [];
|
|
213
269
|
try {
|
|
214
270
|
for await (const page of iteratePaginatedAPI(this.notionClient.databases.query, {
|
|
215
271
|
database_id: id,
|
|
272
|
+
page_size: 50,
|
|
216
273
|
})) {
|
|
217
|
-
|
|
218
|
-
continue;
|
|
219
|
-
documents.push(...(await this.loadPage(page)));
|
|
274
|
+
this.addToQueue(page.id);
|
|
220
275
|
}
|
|
221
276
|
}
|
|
222
277
|
catch (e) {
|
|
223
278
|
console.log(e);
|
|
224
279
|
// TODO: Catch and report api request errors
|
|
225
280
|
}
|
|
226
|
-
return documents;
|
|
227
281
|
}
|
|
228
282
|
/**
|
|
229
283
|
* Loads the documents from Notion based on the specified options.
|
|
230
284
|
* @returns A Promise that resolves to an array of Documents.
|
|
231
285
|
*/
|
|
232
286
|
async load() {
|
|
233
|
-
const
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
287
|
+
const resPagePromise = this.notionClient.pages
|
|
288
|
+
.retrieve({ page_id: this.id })
|
|
289
|
+
.then((res) => {
|
|
290
|
+
this.addToQueue(this.id);
|
|
291
|
+
return res;
|
|
292
|
+
})
|
|
293
|
+
.catch((error) => error);
|
|
294
|
+
const resDatabasePromise = this.notionClient.databases
|
|
295
|
+
.retrieve({ database_id: this.id })
|
|
296
|
+
.then(async (res) => {
|
|
297
|
+
await this.loadDatabase(this.id);
|
|
298
|
+
return res;
|
|
299
|
+
})
|
|
300
|
+
.catch((error) => error);
|
|
301
|
+
const [resPage, resDatabase] = await Promise.all([
|
|
302
|
+
resPagePromise,
|
|
303
|
+
resDatabasePromise,
|
|
304
|
+
]);
|
|
305
|
+
// Check if both resPage and resDatabase resulted in error responses
|
|
306
|
+
const errors = [resPage, resDatabase].filter(isErrorResponse);
|
|
307
|
+
if (errors.length === 2) {
|
|
308
|
+
if (errors.every((e) => e.code === APIErrorCode.ObjectNotFound)) {
|
|
309
|
+
throw new AggregateError([
|
|
310
|
+
Error(`Could not find object with ID: ${this.id}. Make sure the relevant pages and databases are shared with your integration.`),
|
|
311
|
+
...errors,
|
|
312
|
+
]);
|
|
313
|
+
}
|
|
314
|
+
throw new AggregateError(errors);
|
|
315
|
+
}
|
|
316
|
+
this.rootTitle = getTitle(resPage) || getTitle(resDatabase) || this.id;
|
|
317
|
+
let pageId = this.pageQueue.shift();
|
|
318
|
+
while (pageId) {
|
|
319
|
+
await this.loadPage(pageId);
|
|
320
|
+
pageId = this.pageQueue.shift();
|
|
242
321
|
}
|
|
243
|
-
return documents;
|
|
322
|
+
return this.documents;
|
|
244
323
|
}
|
|
245
324
|
}
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.RecursiveUrlLoader = void 0;
|
|
4
|
+
const jsdom_1 = require("jsdom");
|
|
5
|
+
const async_caller_js_1 = require("../../util/async_caller.cjs");
|
|
6
|
+
const base_js_1 = require("../base.cjs");
|
|
7
|
+
/**
 * Document loader that fetches a root URL and then follows the anchor links
 * found in each fetched page, recursing into links that end with "/" until
 * `maxDepth` is exceeded.
 */
class RecursiveUrlLoader extends base_js_1.BaseDocumentLoader {
    /**
     * @param url Root URL the crawl starts from.
     * @param options Optional crawl settings. Defaults: `excludeDirs` [],
     *   `extractor` identity, `maxDepth` 2, `timeout` 10000 (ms),
     *   `preventOutside` true. `callerOptions` are spread over the
     *   AsyncCaller defaults (`maxConcurrency: 64`, `maxRetries: 0`).
     */
    constructor(url, options = {}) {
        super();
        // Declare every instance field up front, in the original order, with
        // the same descriptor the compiled class-field output used.
        for (const field of [
            "caller",
            "url",
            "excludeDirs",
            "extractor",
            "maxDepth",
            "timeout",
            "preventOutside",
        ]) {
            Object.defineProperty(this, field, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: void 0
            });
        }
        this.caller = new async_caller_js_1.AsyncCaller({
            maxConcurrency: 64,
            maxRetries: 0,
            ...options.callerOptions,
        });
        this.url = url;
        this.excludeDirs = options.excludeDirs ?? [];
        this.extractor = options.extractor ?? ((s) => s);
        this.maxDepth = options.maxDepth ?? 2;
        this.timeout = options.timeout ?? 10000;
        this.preventOutside = options.preventOutside ?? true;
    }
    /**
     * Runs `fetch` through the shared AsyncCaller with an abort timeout.
     * @param resource URL to fetch.
     * @param options Fetch options plus `timeout` in milliseconds.
     * @returns The fetch Response.
     */
    async fetchWithTimeout(resource, options) {
        const { timeout, ...rest } = options;
        return this.caller.call(() => fetch(resource, { ...rest, signal: AbortSignal.timeout(timeout) }));
    }
    /**
     * Collects the distinct absolute URLs linked from `<a>` elements in `html`.
     * Script/mail/fragment links and common static-asset suffixes are skipped.
     * When `preventOutside` is set, every link (absolute, protocol-relative
     * or relative) must start with `baseUrl` after resolution.
     * @param html Raw HTML of the page.
     * @param baseUrl URL the page was fetched from; used to resolve links.
     * @returns De-duplicated absolute URLs.
     */
    getChildLinks(html, baseUrl) {
        const allLinks = Array.from(new jsdom_1.JSDOM(html).window.document.querySelectorAll("a")).map((a) => a.href);
        const absolutePaths = [];
        // eslint-disable-next-line no-script-url
        const invalidPrefixes = ["javascript:", "mailto:", "#"];
        const invalidSuffixes = [
            ".css",
            ".js",
            ".ico",
            ".png",
            ".jpg",
            ".jpeg",
            ".gif",
            ".svg",
        ];
        for (const link of allLinks) {
            // An anchor without an href would resolve back to the page itself.
            if (!link)
                continue;
            if (invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||
                invalidSuffixes.some((suffix) => link.endsWith(suffix)))
                continue;
            let absoluteLink;
            if (link.startsWith("http")) {
                absoluteLink = link;
            }
            else {
                try {
                    absoluteLink = link.startsWith("//")
                        ? new URL(baseUrl).protocol + link
                        : new URL(link, baseUrl).href;
                }
                catch (e) {
                    // A malformed href must not abort the whole crawl.
                    continue;
                }
            }
            // Apply the outside-of-base restriction to the resolved URL so
            // that "//other.host/x" or "../x" cannot escape the base either.
            if (this.preventOutside && !absoluteLink.startsWith(baseUrl))
                continue;
            absolutePaths.push(absoluteLink);
        }
        return Array.from(new Set(absolutePaths));
    }
    /**
     * Builds the metadata object for a page: always `source`, plus `title`,
     * `description` and `language` when the corresponding elements exist.
     * @param rawHtml Raw HTML of the page.
     * @param url URL the page was fetched from.
     */
    extractMetadata(rawHtml, url) {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const metadata = { source: url };
        const { document } = new jsdom_1.JSDOM(rawHtml).window;
        const title = document.getElementsByTagName("title")[0];
        if (title) {
            metadata.title = title.textContent;
        }
        const description = document.querySelector("meta[name=description]");
        if (description) {
            metadata.description = description.getAttribute("content");
        }
        const html = document.getElementsByTagName("html")[0];
        if (html) {
            metadata.language = html.getAttribute("lang");
        }
        return metadata;
    }
    /**
     * Fetches `url` and converts the body into a `{ pageContent, metadata }`
     * object using the configured extractor.
     * @returns The document object, or `null` when fetching or reading the
     *   body throws.
     */
    async getUrlAsDoc(url) {
        let res;
        try {
            res = await this.fetchWithTimeout(url, { timeout: this.timeout });
            res = await res.text();
        }
        catch (e) {
            return null;
        }
        return {
            pageContent: this.extractor(res),
            metadata: this.extractMetadata(res, url),
        };
    }
    /**
     * Fetches `inputUrl` (normalised to end with "/"), loads every not yet
     * visited link on it as a document, and recurses into links that end
     * with "/".
     * @param inputUrl Page whose links are followed.
     * @param visited URLs already handled; shared and mutated across calls.
     * @param depth Current recursion depth; returns [] once above `maxDepth`.
     * @returns Flat list of the documents loaded below `inputUrl`. Returns []
     *   when the URL starts with an excluded prefix or cannot be fetched.
     */
    async getChildUrlsRecursive(inputUrl, visited = new Set(), depth = 0) {
        if (depth > this.maxDepth)
            return [];
        let url = inputUrl;
        if (!inputUrl.endsWith("/"))
            url += "/";
        const isExcluded = this.excludeDirs.some((exDir) => url.startsWith(exDir));
        if (isExcluded)
            return [];
        let res;
        try {
            res = await this.fetchWithTimeout(url, { timeout: this.timeout });
            res = await res.text();
        }
        catch (e) {
            return [];
        }
        const childUrls = this.getChildLinks(res, url);
        const results = await Promise.all(childUrls.map((childUrl) => (async () => {
            // The check and the add run synchronously, so concurrent
            // branches cannot both claim the same URL.
            if (visited.has(childUrl))
                return null;
            visited.add(childUrl);
            const childDoc = await this.getUrlAsDoc(childUrl);
            if (!childDoc)
                return null;
            if (childUrl.endsWith("/")) {
                const childUrlResponses = await this.getChildUrlsRecursive(childUrl, visited, depth + 1);
                return [childDoc, ...childUrlResponses];
            }
            return [childDoc];
        })()));
        return results.flat().filter((docs) => docs !== null);
    }
    /**
     * Loads the root URL and everything reachable from it.
     * @returns The root document followed by the crawled documents, or []
     *   when the root URL itself cannot be loaded.
     */
    async load() {
        const rootDoc = await this.getUrlAsDoc(this.url);
        if (!rootDoc)
            return [];
        const docs = [rootDoc];
        docs.push(...(await this.getChildUrlsRecursive(this.url, new Set([this.url]))));
        return docs;
    }
}
|
|
177
|
+
exports.RecursiveUrlLoader = RecursiveUrlLoader;
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { Document } from "../../document.js";
|
|
2
|
+
import { AsyncCaller } from "../../util/async_caller.js";
|
|
3
|
+
import { BaseDocumentLoader, DocumentLoader } from "../base.js";
|
|
4
|
+
/** Settings accepted by the RecursiveUrlLoader constructor; all are optional. */
export interface RecursiveUrlLoaderOptions {
|
|
5
|
+
/** URL prefixes that are not crawled (matched with `startsWith`). Defaults to []. */
excludeDirs?: string[];
|
|
6
|
+
/** Maps the fetched response text to the document's page content. Defaults to the identity function. */
extractor?: (text: string) => string;
|
|
7
|
+
/** Recursion stops once the depth exceeds this value. Defaults to 2. */
maxDepth?: number;
|
|
8
|
+
/** Per-request timeout passed to `AbortSignal.timeout`. Defaults to 10000. */
timeout?: number;
|
|
9
|
+
/** When true, links that do not start with the current base URL are skipped. Defaults to true. */
preventOutside?: boolean;
|
|
10
|
+
/** Options spread over the loader's AsyncCaller defaults (`maxConcurrency: 64`, `maxRetries: 0`). */
callerOptions?: ConstructorParameters<typeof AsyncCaller>[0];
|
|
11
|
+
}
|
|
12
|
+
/**
 * Document loader that fetches a root URL and follows the anchor links found
 * on the pages it loads, up to a configurable depth.
 */
export declare class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLoader {
|
|
13
|
+
private caller;
|
|
14
|
+
private url;
|
|
15
|
+
private excludeDirs;
|
|
16
|
+
private extractor;
|
|
17
|
+
private maxDepth;
|
|
18
|
+
private timeout;
|
|
19
|
+
private preventOutside;
|
|
20
|
+
/**
 * @param url Root URL the crawl starts from.
 * @param options Crawl settings; see RecursiveUrlLoaderOptions.
 */
constructor(url: string, options: RecursiveUrlLoaderOptions);
|
|
21
|
+
private fetchWithTimeout;
|
|
22
|
+
private getChildLinks;
|
|
23
|
+
private extractMetadata;
|
|
24
|
+
private getUrlAsDoc;
|
|
25
|
+
private getChildUrlsRecursive;
|
|
26
|
+
/**
 * Loads the root URL and the pages reachable from it.
 * @returns The loaded documents; an empty array when the root URL cannot be loaded.
 */
load(): Promise<Document[]>;
|
|
27
|
+
}
|