langchain 0.2.18 → 0.3.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/openai_functions/index.cjs +2 -2
- package/dist/agents/openai_functions/index.js +2 -2
- package/dist/chains/combine_documents/stuff.cjs +2 -2
- package/dist/chains/combine_documents/stuff.js +2 -2
- package/dist/chains/openai_functions/openapi.cjs +3 -1
- package/dist/chains/openai_functions/openapi.js +3 -1
- package/dist/load/import_constants.cjs +2 -39
- package/dist/load/import_constants.js +2 -39
- package/dist/load/import_map.cjs +2 -3
- package/dist/load/import_map.d.ts +0 -1
- package/dist/load/import_map.js +0 -1
- package/dist/smith/config.d.ts +1 -5
- package/package.json +31 -854
- package/dist/document_loaders/fs/chatgpt.cjs +0 -90
- package/dist/document_loaders/fs/chatgpt.d.ts +0 -8
- package/dist/document_loaders/fs/chatgpt.js +0 -86
- package/dist/document_loaders/fs/csv.cjs +0 -73
- package/dist/document_loaders/fs/csv.d.ts +0 -65
- package/dist/document_loaders/fs/csv.js +0 -69
- package/dist/document_loaders/fs/docx.cjs +0 -58
- package/dist/document_loaders/fs/docx.d.ts +0 -25
- package/dist/document_loaders/fs/docx.js +0 -54
- package/dist/document_loaders/fs/epub.cjs +0 -103
- package/dist/document_loaders/fs/epub.d.ts +0 -33
- package/dist/document_loaders/fs/epub.js +0 -99
- package/dist/document_loaders/fs/notion.cjs +0 -26
- package/dist/document_loaders/fs/notion.d.ts +0 -12
- package/dist/document_loaders/fs/notion.js +0 -22
- package/dist/document_loaders/fs/obsidian.cjs +0 -247
- package/dist/document_loaders/fs/obsidian.d.ts +0 -28
- package/dist/document_loaders/fs/obsidian.js +0 -240
- package/dist/document_loaders/fs/openai_whisper_audio.cjs +0 -49
- package/dist/document_loaders/fs/openai_whisper_audio.d.ts +0 -23
- package/dist/document_loaders/fs/openai_whisper_audio.js +0 -45
- package/dist/document_loaders/fs/pdf.cjs +0 -148
- package/dist/document_loaders/fs/pdf.d.ts +0 -49
- package/dist/document_loaders/fs/pdf.js +0 -144
- package/dist/document_loaders/fs/pptx.cjs +0 -46
- package/dist/document_loaders/fs/pptx.d.ts +0 -25
- package/dist/document_loaders/fs/pptx.js +0 -42
- package/dist/document_loaders/fs/srt.cjs +0 -57
- package/dist/document_loaders/fs/srt.d.ts +0 -32
- package/dist/document_loaders/fs/srt.js +0 -50
- package/dist/document_loaders/fs/unstructured.cjs +0 -338
- package/dist/document_loaders/fs/unstructured.d.ts +0 -125
- package/dist/document_loaders/fs/unstructured.js +0 -333
- package/dist/document_loaders/web/apify_dataset.cjs +0 -130
- package/dist/document_loaders/web/apify_dataset.d.ts +0 -85
- package/dist/document_loaders/web/apify_dataset.js +0 -126
- package/dist/document_loaders/web/assemblyai.cjs +0 -200
- package/dist/document_loaders/web/assemblyai.d.ts +0 -95
- package/dist/document_loaders/web/assemblyai.js +0 -193
- package/dist/document_loaders/web/azure_blob_storage_container.cjs +0 -73
- package/dist/document_loaders/web/azure_blob_storage_container.d.ts +0 -46
- package/dist/document_loaders/web/azure_blob_storage_container.js +0 -69
- package/dist/document_loaders/web/azure_blob_storage_file.cjs +0 -124
- package/dist/document_loaders/web/azure_blob_storage_file.d.ts +0 -53
- package/dist/document_loaders/web/azure_blob_storage_file.js +0 -97
- package/dist/document_loaders/web/browserbase.cjs +0 -93
- package/dist/document_loaders/web/browserbase.d.ts +0 -48
- package/dist/document_loaders/web/browserbase.js +0 -86
- package/dist/document_loaders/web/cheerio.cjs +0 -118
- package/dist/document_loaders/web/cheerio.d.ts +0 -77
- package/dist/document_loaders/web/cheerio.js +0 -114
- package/dist/document_loaders/web/college_confidential.cjs +0 -41
- package/dist/document_loaders/web/college_confidential.d.ts +0 -25
- package/dist/document_loaders/web/college_confidential.js +0 -37
- package/dist/document_loaders/web/confluence.cjs +0 -190
- package/dist/document_loaders/web/confluence.d.ts +0 -114
- package/dist/document_loaders/web/confluence.js +0 -186
- package/dist/document_loaders/web/couchbase.cjs +0 -95
- package/dist/document_loaders/web/couchbase.d.ts +0 -32
- package/dist/document_loaders/web/couchbase.js +0 -91
- package/dist/document_loaders/web/figma.cjs +0 -102
- package/dist/document_loaders/web/figma.d.ts +0 -82
- package/dist/document_loaders/web/figma.js +0 -98
- package/dist/document_loaders/web/firecrawl.cjs +0 -95
- package/dist/document_loaders/web/firecrawl.d.ts +0 -50
- package/dist/document_loaders/web/firecrawl.js +0 -88
- package/dist/document_loaders/web/gitbook.cjs +0 -110
- package/dist/document_loaders/web/gitbook.d.ts +0 -55
- package/dist/document_loaders/web/gitbook.js +0 -106
- package/dist/document_loaders/web/github.cjs +0 -615
- package/dist/document_loaders/web/github.d.ts +0 -203
- package/dist/document_loaders/web/github.js +0 -608
- package/dist/document_loaders/web/hn.cjs +0 -90
- package/dist/document_loaders/web/hn.d.ts +0 -42
- package/dist/document_loaders/web/hn.js +0 -86
- package/dist/document_loaders/web/imsdb.cjs +0 -44
- package/dist/document_loaders/web/imsdb.d.ts +0 -23
- package/dist/document_loaders/web/imsdb.js +0 -40
- package/dist/document_loaders/web/notionapi.cjs +0 -404
- package/dist/document_loaders/web/notionapi.d.ts +0 -133
- package/dist/document_loaders/web/notionapi.js +0 -392
- package/dist/document_loaders/web/notiondb.cjs +0 -199
- package/dist/document_loaders/web/notiondb.d.ts +0 -56
- package/dist/document_loaders/web/notiondb.js +0 -195
- package/dist/document_loaders/web/pdf.cjs +0 -140
- package/dist/document_loaders/web/pdf.d.ts +0 -35
- package/dist/document_loaders/web/pdf.js +0 -136
- package/dist/document_loaders/web/playwright.cjs +0 -89
- package/dist/document_loaders/web/playwright.d.ts +0 -58
- package/dist/document_loaders/web/playwright.js +0 -85
- package/dist/document_loaders/web/puppeteer.cjs +0 -139
- package/dist/document_loaders/web/puppeteer.d.ts +0 -82
- package/dist/document_loaders/web/puppeteer.js +0 -135
- package/dist/document_loaders/web/recursive_url.cjs +0 -198
- package/dist/document_loaders/web/recursive_url.d.ts +0 -33
- package/dist/document_loaders/web/recursive_url.js +0 -194
- package/dist/document_loaders/web/s3.cjs +0 -164
- package/dist/document_loaders/web/s3.d.ts +0 -78
- package/dist/document_loaders/web/s3.js +0 -137
- package/dist/document_loaders/web/searchapi.cjs +0 -150
- package/dist/document_loaders/web/searchapi.d.ts +0 -76
- package/dist/document_loaders/web/searchapi.js +0 -146
- package/dist/document_loaders/web/serpapi.cjs +0 -127
- package/dist/document_loaders/web/serpapi.d.ts +0 -62
- package/dist/document_loaders/web/serpapi.js +0 -123
- package/dist/document_loaders/web/sitemap.cjs +0 -118
- package/dist/document_loaders/web/sitemap.d.ts +0 -41
- package/dist/document_loaders/web/sitemap.js +0 -114
- package/dist/document_loaders/web/sonix_audio.cjs +0 -68
- package/dist/document_loaders/web/sonix_audio.d.ts +0 -36
- package/dist/document_loaders/web/sonix_audio.js +0 -64
- package/dist/document_loaders/web/sort_xyz_blockchain.cjs +0 -157
- package/dist/document_loaders/web/sort_xyz_blockchain.d.ts +0 -78
- package/dist/document_loaders/web/sort_xyz_blockchain.js +0 -153
- package/dist/document_loaders/web/youtube.cjs +0 -116
- package/dist/document_loaders/web/youtube.d.ts +0 -55
- package/dist/document_loaders/web/youtube.js +0 -112
- package/dist/experimental/tools/pyinterpreter.cjs +0 -248
- package/dist/experimental/tools/pyinterpreter.d.ts +0 -18
- package/dist/experimental/tools/pyinterpreter.js +0 -244
- package/dist/retrievers/self_query/chroma.cjs +0 -48
- package/dist/retrievers/self_query/chroma.d.ts +0 -26
- package/dist/retrievers/self_query/chroma.js +0 -44
- package/dist/retrievers/self_query/pinecone.cjs +0 -47
- package/dist/retrievers/self_query/pinecone.d.ts +0 -26
- package/dist/retrievers/self_query/pinecone.js +0 -43
- package/dist/retrievers/self_query/supabase.cjs +0 -278
- package/dist/retrievers/self_query/supabase.d.ts +0 -109
- package/dist/retrievers/self_query/supabase.js +0 -274
- package/dist/retrievers/self_query/supabase_utils.cjs +0 -264
- package/dist/retrievers/self_query/supabase_utils.d.ts +0 -101
- package/dist/retrievers/self_query/supabase_utils.js +0 -259
- package/dist/retrievers/self_query/vectara.cjs +0 -143
- package/dist/retrievers/self_query/vectara.d.ts +0 -42
- package/dist/retrievers/self_query/vectara.js +0 -139
- package/dist/retrievers/self_query/weaviate.cjs +0 -201
- package/dist/retrievers/self_query/weaviate.d.ts +0 -99
- package/dist/retrievers/self_query/weaviate.js +0 -197
- package/dist/types/assemblyai-types.cjs +0 -2
- package/dist/types/assemblyai-types.d.ts +0 -4
- package/dist/types/assemblyai-types.js +0 -1
- package/document_loaders/fs/chatgpt.cjs +0 -1
- package/document_loaders/fs/chatgpt.d.cts +0 -1
- package/document_loaders/fs/chatgpt.d.ts +0 -1
- package/document_loaders/fs/chatgpt.js +0 -1
- package/document_loaders/fs/csv.cjs +0 -1
- package/document_loaders/fs/csv.d.cts +0 -1
- package/document_loaders/fs/csv.d.ts +0 -1
- package/document_loaders/fs/csv.js +0 -1
- package/document_loaders/fs/docx.cjs +0 -1
- package/document_loaders/fs/docx.d.cts +0 -1
- package/document_loaders/fs/docx.d.ts +0 -1
- package/document_loaders/fs/docx.js +0 -1
- package/document_loaders/fs/epub.cjs +0 -1
- package/document_loaders/fs/epub.d.cts +0 -1
- package/document_loaders/fs/epub.d.ts +0 -1
- package/document_loaders/fs/epub.js +0 -1
- package/document_loaders/fs/notion.cjs +0 -1
- package/document_loaders/fs/notion.d.cts +0 -1
- package/document_loaders/fs/notion.d.ts +0 -1
- package/document_loaders/fs/notion.js +0 -1
- package/document_loaders/fs/obsidian.cjs +0 -1
- package/document_loaders/fs/obsidian.d.cts +0 -1
- package/document_loaders/fs/obsidian.d.ts +0 -1
- package/document_loaders/fs/obsidian.js +0 -1
- package/document_loaders/fs/openai_whisper_audio.cjs +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.cts +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.ts +0 -1
- package/document_loaders/fs/openai_whisper_audio.js +0 -1
- package/document_loaders/fs/pdf.cjs +0 -1
- package/document_loaders/fs/pdf.d.cts +0 -1
- package/document_loaders/fs/pdf.d.ts +0 -1
- package/document_loaders/fs/pdf.js +0 -1
- package/document_loaders/fs/pptx.cjs +0 -1
- package/document_loaders/fs/pptx.d.cts +0 -1
- package/document_loaders/fs/pptx.d.ts +0 -1
- package/document_loaders/fs/pptx.js +0 -1
- package/document_loaders/fs/srt.cjs +0 -1
- package/document_loaders/fs/srt.d.cts +0 -1
- package/document_loaders/fs/srt.d.ts +0 -1
- package/document_loaders/fs/srt.js +0 -1
- package/document_loaders/fs/unstructured.cjs +0 -1
- package/document_loaders/fs/unstructured.d.cts +0 -1
- package/document_loaders/fs/unstructured.d.ts +0 -1
- package/document_loaders/fs/unstructured.js +0 -1
- package/document_loaders/web/apify_dataset.cjs +0 -1
- package/document_loaders/web/apify_dataset.d.cts +0 -1
- package/document_loaders/web/apify_dataset.d.ts +0 -1
- package/document_loaders/web/apify_dataset.js +0 -1
- package/document_loaders/web/assemblyai.cjs +0 -1
- package/document_loaders/web/assemblyai.d.cts +0 -1
- package/document_loaders/web/assemblyai.d.ts +0 -1
- package/document_loaders/web/assemblyai.js +0 -1
- package/document_loaders/web/azure_blob_storage_container.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_container.js +0 -1
- package/document_loaders/web/azure_blob_storage_file.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_file.js +0 -1
- package/document_loaders/web/browserbase.cjs +0 -1
- package/document_loaders/web/browserbase.d.cts +0 -1
- package/document_loaders/web/browserbase.d.ts +0 -1
- package/document_loaders/web/browserbase.js +0 -1
- package/document_loaders/web/cheerio.cjs +0 -1
- package/document_loaders/web/cheerio.d.cts +0 -1
- package/document_loaders/web/cheerio.d.ts +0 -1
- package/document_loaders/web/cheerio.js +0 -1
- package/document_loaders/web/college_confidential.cjs +0 -1
- package/document_loaders/web/college_confidential.d.cts +0 -1
- package/document_loaders/web/college_confidential.d.ts +0 -1
- package/document_loaders/web/college_confidential.js +0 -1
- package/document_loaders/web/confluence.cjs +0 -1
- package/document_loaders/web/confluence.d.cts +0 -1
- package/document_loaders/web/confluence.d.ts +0 -1
- package/document_loaders/web/confluence.js +0 -1
- package/document_loaders/web/couchbase.cjs +0 -1
- package/document_loaders/web/couchbase.d.cts +0 -1
- package/document_loaders/web/couchbase.d.ts +0 -1
- package/document_loaders/web/couchbase.js +0 -1
- package/document_loaders/web/figma.cjs +0 -1
- package/document_loaders/web/figma.d.cts +0 -1
- package/document_loaders/web/figma.d.ts +0 -1
- package/document_loaders/web/figma.js +0 -1
- package/document_loaders/web/firecrawl.cjs +0 -1
- package/document_loaders/web/firecrawl.d.cts +0 -1
- package/document_loaders/web/firecrawl.d.ts +0 -1
- package/document_loaders/web/firecrawl.js +0 -1
- package/document_loaders/web/gitbook.cjs +0 -1
- package/document_loaders/web/gitbook.d.cts +0 -1
- package/document_loaders/web/gitbook.d.ts +0 -1
- package/document_loaders/web/gitbook.js +0 -1
- package/document_loaders/web/github.cjs +0 -1
- package/document_loaders/web/github.d.cts +0 -1
- package/document_loaders/web/github.d.ts +0 -1
- package/document_loaders/web/github.js +0 -1
- package/document_loaders/web/hn.cjs +0 -1
- package/document_loaders/web/hn.d.cts +0 -1
- package/document_loaders/web/hn.d.ts +0 -1
- package/document_loaders/web/hn.js +0 -1
- package/document_loaders/web/imsdb.cjs +0 -1
- package/document_loaders/web/imsdb.d.cts +0 -1
- package/document_loaders/web/imsdb.d.ts +0 -1
- package/document_loaders/web/imsdb.js +0 -1
- package/document_loaders/web/notionapi.cjs +0 -1
- package/document_loaders/web/notionapi.d.cts +0 -1
- package/document_loaders/web/notionapi.d.ts +0 -1
- package/document_loaders/web/notionapi.js +0 -1
- package/document_loaders/web/notiondb.cjs +0 -1
- package/document_loaders/web/notiondb.d.cts +0 -1
- package/document_loaders/web/notiondb.d.ts +0 -1
- package/document_loaders/web/notiondb.js +0 -1
- package/document_loaders/web/pdf.cjs +0 -1
- package/document_loaders/web/pdf.d.cts +0 -1
- package/document_loaders/web/pdf.d.ts +0 -1
- package/document_loaders/web/pdf.js +0 -1
- package/document_loaders/web/playwright.cjs +0 -1
- package/document_loaders/web/playwright.d.cts +0 -1
- package/document_loaders/web/playwright.d.ts +0 -1
- package/document_loaders/web/playwright.js +0 -1
- package/document_loaders/web/puppeteer.cjs +0 -1
- package/document_loaders/web/puppeteer.d.cts +0 -1
- package/document_loaders/web/puppeteer.d.ts +0 -1
- package/document_loaders/web/puppeteer.js +0 -1
- package/document_loaders/web/recursive_url.cjs +0 -1
- package/document_loaders/web/recursive_url.d.cts +0 -1
- package/document_loaders/web/recursive_url.d.ts +0 -1
- package/document_loaders/web/recursive_url.js +0 -1
- package/document_loaders/web/s3.cjs +0 -1
- package/document_loaders/web/s3.d.cts +0 -1
- package/document_loaders/web/s3.d.ts +0 -1
- package/document_loaders/web/s3.js +0 -1
- package/document_loaders/web/searchapi.cjs +0 -1
- package/document_loaders/web/searchapi.d.cts +0 -1
- package/document_loaders/web/searchapi.d.ts +0 -1
- package/document_loaders/web/searchapi.js +0 -1
- package/document_loaders/web/serpapi.cjs +0 -1
- package/document_loaders/web/serpapi.d.cts +0 -1
- package/document_loaders/web/serpapi.d.ts +0 -1
- package/document_loaders/web/serpapi.js +0 -1
- package/document_loaders/web/sitemap.cjs +0 -1
- package/document_loaders/web/sitemap.d.cts +0 -1
- package/document_loaders/web/sitemap.d.ts +0 -1
- package/document_loaders/web/sitemap.js +0 -1
- package/document_loaders/web/sonix_audio.cjs +0 -1
- package/document_loaders/web/sonix_audio.d.cts +0 -1
- package/document_loaders/web/sonix_audio.d.ts +0 -1
- package/document_loaders/web/sonix_audio.js +0 -1
- package/document_loaders/web/sort_xyz_blockchain.cjs +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.cts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.ts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.js +0 -1
- package/document_loaders/web/youtube.cjs +0 -1
- package/document_loaders/web/youtube.d.cts +0 -1
- package/document_loaders/web/youtube.d.ts +0 -1
- package/document_loaders/web/youtube.js +0 -1
- package/experimental/tools/pyinterpreter.cjs +0 -1
- package/experimental/tools/pyinterpreter.d.cts +0 -1
- package/experimental/tools/pyinterpreter.d.ts +0 -1
- package/experimental/tools/pyinterpreter.js +0 -1
- package/memory/index.cjs +0 -1
- package/memory/index.d.cts +0 -1
- package/memory/index.d.ts +0 -1
- package/memory/index.js +0 -1
- package/retrievers/self_query/chroma.cjs +0 -1
- package/retrievers/self_query/chroma.d.cts +0 -1
- package/retrievers/self_query/chroma.d.ts +0 -1
- package/retrievers/self_query/chroma.js +0 -1
- package/retrievers/self_query/pinecone.cjs +0 -1
- package/retrievers/self_query/pinecone.d.cts +0 -1
- package/retrievers/self_query/pinecone.d.ts +0 -1
- package/retrievers/self_query/pinecone.js +0 -1
- package/retrievers/self_query/supabase.cjs +0 -1
- package/retrievers/self_query/supabase.d.cts +0 -1
- package/retrievers/self_query/supabase.d.ts +0 -1
- package/retrievers/self_query/supabase.js +0 -1
- package/retrievers/self_query/vectara.cjs +0 -1
- package/retrievers/self_query/vectara.d.cts +0 -1
- package/retrievers/self_query/vectara.d.ts +0 -1
- package/retrievers/self_query/vectara.js +0 -1
- package/retrievers/self_query/weaviate.cjs +0 -1
- package/retrievers/self_query/weaviate.d.cts +0 -1
- package/retrievers/self_query/weaviate.d.ts +0 -1
- package/retrievers/self_query/weaviate.js +0 -1
|
@@ -1,135 +0,0 @@
|
|
|
1
|
-
import { Document } from "@langchain/core/documents";
|
|
2
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
3
|
-
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
|
|
4
|
-
/* #__PURE__ */ logVersion020MigrationWarning({
|
|
5
|
-
oldEntrypointName: "document_loaders/web/puppeteer",
|
|
6
|
-
newPackageName: "@langchain/community",
|
|
7
|
-
});
|
|
8
|
-
/**
|
|
9
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/puppeteer" instead. This entrypoint will be removed in 0.3.0.
|
|
10
|
-
*
|
|
11
|
-
* Class that extends the BaseDocumentLoader class and implements the
|
|
12
|
-
* DocumentLoader interface. It represents a document loader for scraping
|
|
13
|
-
* web pages using Puppeteer.
|
|
14
|
-
* @example
|
|
15
|
-
* ```typescript
|
|
16
|
-
* const loader = new PuppeteerWebBaseLoader("https:exampleurl.com", {
|
|
17
|
-
* launchOptions: {
|
|
18
|
-
* headless: true,
|
|
19
|
-
* },
|
|
20
|
-
* gotoOptions: {
|
|
21
|
-
* waitUntil: "domcontentloaded",
|
|
22
|
-
* },
|
|
23
|
-
* });
|
|
24
|
-
* const screenshot = await loader.screenshot();
|
|
25
|
-
* ```
|
|
26
|
-
*/
|
|
27
|
-
export class PuppeteerWebBaseLoader extends BaseDocumentLoader {
|
|
28
|
-
constructor(webPath, options) {
|
|
29
|
-
super();
|
|
30
|
-
Object.defineProperty(this, "webPath", {
|
|
31
|
-
enumerable: true,
|
|
32
|
-
configurable: true,
|
|
33
|
-
writable: true,
|
|
34
|
-
value: webPath
|
|
35
|
-
});
|
|
36
|
-
Object.defineProperty(this, "options", {
|
|
37
|
-
enumerable: true,
|
|
38
|
-
configurable: true,
|
|
39
|
-
writable: true,
|
|
40
|
-
value: void 0
|
|
41
|
-
});
|
|
42
|
-
this.options = options ?? undefined;
|
|
43
|
-
}
|
|
44
|
-
static async _scrape(url, options) {
|
|
45
|
-
const { launch } = await PuppeteerWebBaseLoader.imports();
|
|
46
|
-
const browser = await launch({
|
|
47
|
-
headless: true,
|
|
48
|
-
defaultViewport: null,
|
|
49
|
-
ignoreDefaultArgs: ["--disable-extensions"],
|
|
50
|
-
...options?.launchOptions,
|
|
51
|
-
});
|
|
52
|
-
const page = await browser.newPage();
|
|
53
|
-
await page.goto(url, {
|
|
54
|
-
timeout: 180000,
|
|
55
|
-
waitUntil: "domcontentloaded",
|
|
56
|
-
...options?.gotoOptions,
|
|
57
|
-
});
|
|
58
|
-
const bodyHTML = options?.evaluate
|
|
59
|
-
? await options?.evaluate(page, browser)
|
|
60
|
-
: await page.evaluate(() => document.body.innerHTML);
|
|
61
|
-
await browser.close();
|
|
62
|
-
return bodyHTML;
|
|
63
|
-
}
|
|
64
|
-
/**
|
|
65
|
-
* Method that calls the _scrape method to perform the scraping of the web
|
|
66
|
-
* page specified by the webPath property.
|
|
67
|
-
* @returns Promise that resolves to the scraped HTML content of the web page.
|
|
68
|
-
*/
|
|
69
|
-
async scrape() {
|
|
70
|
-
return PuppeteerWebBaseLoader._scrape(this.webPath, this.options);
|
|
71
|
-
}
|
|
72
|
-
/**
|
|
73
|
-
* Method that calls the scrape method and returns the scraped HTML
|
|
74
|
-
* content as a Document object.
|
|
75
|
-
* @returns Promise that resolves to an array of Document objects.
|
|
76
|
-
*/
|
|
77
|
-
async load() {
|
|
78
|
-
const text = await this.scrape();
|
|
79
|
-
const metadata = { source: this.webPath };
|
|
80
|
-
return [new Document({ pageContent: text, metadata })];
|
|
81
|
-
}
|
|
82
|
-
/**
|
|
83
|
-
* Static class method used to screenshot a web page and return
|
|
84
|
-
* it as a {@link Document} object where the pageContent property
|
|
85
|
-
* is the screenshot encoded in base64.
|
|
86
|
-
*
|
|
87
|
-
* @param {string} url
|
|
88
|
-
* @param {PuppeteerWebBaseLoaderOptions} options
|
|
89
|
-
* @returns {Document} A document object containing the screenshot of the page encoded in base64.
|
|
90
|
-
*/
|
|
91
|
-
static async _screenshot(url, options) {
|
|
92
|
-
const { launch } = await PuppeteerWebBaseLoader.imports();
|
|
93
|
-
const browser = await launch({
|
|
94
|
-
headless: true,
|
|
95
|
-
defaultViewport: null,
|
|
96
|
-
ignoreDefaultArgs: ["--disable-extensions"],
|
|
97
|
-
...options?.launchOptions,
|
|
98
|
-
});
|
|
99
|
-
const page = await browser.newPage();
|
|
100
|
-
await page.goto(url, {
|
|
101
|
-
timeout: 180000,
|
|
102
|
-
waitUntil: "domcontentloaded",
|
|
103
|
-
...options?.gotoOptions,
|
|
104
|
-
});
|
|
105
|
-
const screenshot = await page.screenshot();
|
|
106
|
-
const base64 = screenshot.toString("base64");
|
|
107
|
-
const metadata = { source: url };
|
|
108
|
-
return new Document({ pageContent: base64, metadata });
|
|
109
|
-
}
|
|
110
|
-
/**
|
|
111
|
-
* Screenshot a web page and return it as a {@link Document} object where
|
|
112
|
-
* the pageContent property is the screenshot encoded in base64.
|
|
113
|
-
*
|
|
114
|
-
* @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64.
|
|
115
|
-
*/
|
|
116
|
-
async screenshot() {
|
|
117
|
-
return PuppeteerWebBaseLoader._screenshot(this.webPath, this.options);
|
|
118
|
-
}
|
|
119
|
-
/**
|
|
120
|
-
* Static method that imports the necessary Puppeteer modules. It returns
|
|
121
|
-
* a Promise that resolves to an object containing the imported modules.
|
|
122
|
-
* @returns Promise that resolves to an object containing the imported Puppeteer modules.
|
|
123
|
-
*/
|
|
124
|
-
static async imports() {
|
|
125
|
-
try {
|
|
126
|
-
// eslint-disable-next-line import/no-extraneous-dependencies
|
|
127
|
-
const { launch } = await import("puppeteer");
|
|
128
|
-
return { launch };
|
|
129
|
-
}
|
|
130
|
-
catch (e) {
|
|
131
|
-
console.error(e);
|
|
132
|
-
throw new Error("Please install puppeteer as a dependency with, e.g. `yarn add puppeteer`");
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
}
|
|
@@ -1,198 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.RecursiveUrlLoader = void 0;
|
|
4
|
-
const jsdom_1 = require("jsdom");
|
|
5
|
-
const async_caller_1 = require("@langchain/core/utils/async_caller");
|
|
6
|
-
const base_js_1 = require("../base.cjs");
|
|
7
|
-
const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
|
|
8
|
-
/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
|
|
9
|
-
oldEntrypointName: "document_loaders/web/recursive_url",
|
|
10
|
-
newPackageName: "@langchain/community",
|
|
11
|
-
});
|
|
12
|
-
/**
|
|
13
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/recursive_url" instead. This entrypoint will be removed in 0.3.0.
|
|
14
|
-
*/
|
|
15
|
-
class RecursiveUrlLoader extends base_js_1.BaseDocumentLoader {
|
|
16
|
-
constructor(url, options) {
|
|
17
|
-
super();
|
|
18
|
-
Object.defineProperty(this, "caller", {
|
|
19
|
-
enumerable: true,
|
|
20
|
-
configurable: true,
|
|
21
|
-
writable: true,
|
|
22
|
-
value: void 0
|
|
23
|
-
});
|
|
24
|
-
Object.defineProperty(this, "url", {
|
|
25
|
-
enumerable: true,
|
|
26
|
-
configurable: true,
|
|
27
|
-
writable: true,
|
|
28
|
-
value: void 0
|
|
29
|
-
});
|
|
30
|
-
Object.defineProperty(this, "excludeDirs", {
|
|
31
|
-
enumerable: true,
|
|
32
|
-
configurable: true,
|
|
33
|
-
writable: true,
|
|
34
|
-
value: void 0
|
|
35
|
-
});
|
|
36
|
-
Object.defineProperty(this, "extractor", {
|
|
37
|
-
enumerable: true,
|
|
38
|
-
configurable: true,
|
|
39
|
-
writable: true,
|
|
40
|
-
value: void 0
|
|
41
|
-
});
|
|
42
|
-
Object.defineProperty(this, "maxDepth", {
|
|
43
|
-
enumerable: true,
|
|
44
|
-
configurable: true,
|
|
45
|
-
writable: true,
|
|
46
|
-
value: void 0
|
|
47
|
-
});
|
|
48
|
-
Object.defineProperty(this, "timeout", {
|
|
49
|
-
enumerable: true,
|
|
50
|
-
configurable: true,
|
|
51
|
-
writable: true,
|
|
52
|
-
value: void 0
|
|
53
|
-
});
|
|
54
|
-
Object.defineProperty(this, "preventOutside", {
|
|
55
|
-
enumerable: true,
|
|
56
|
-
configurable: true,
|
|
57
|
-
writable: true,
|
|
58
|
-
value: void 0
|
|
59
|
-
});
|
|
60
|
-
this.caller = new async_caller_1.AsyncCaller({
|
|
61
|
-
maxConcurrency: 64,
|
|
62
|
-
maxRetries: 0,
|
|
63
|
-
...options.callerOptions,
|
|
64
|
-
});
|
|
65
|
-
this.url = url;
|
|
66
|
-
this.excludeDirs = options.excludeDirs ?? [];
|
|
67
|
-
this.extractor = options.extractor ?? ((s) => s);
|
|
68
|
-
this.maxDepth = options.maxDepth ?? 2;
|
|
69
|
-
this.timeout = options.timeout ?? 10000;
|
|
70
|
-
this.preventOutside = options.preventOutside ?? true;
|
|
71
|
-
}
|
|
72
|
-
async fetchWithTimeout(resource, options) {
|
|
73
|
-
const { timeout, ...rest } = options;
|
|
74
|
-
return this.caller.call(() => fetch(resource, { ...rest, signal: AbortSignal.timeout(timeout) }));
|
|
75
|
-
}
|
|
76
|
-
getChildLinks(html, baseUrl) {
|
|
77
|
-
const allLinks = Array.from(new jsdom_1.JSDOM(html).window.document.querySelectorAll("a")).map((a) => a.href);
|
|
78
|
-
const absolutePaths = [];
|
|
79
|
-
// eslint-disable-next-line no-script-url
|
|
80
|
-
const invalidPrefixes = ["javascript:", "mailto:", "#"];
|
|
81
|
-
const invalidSuffixes = [
|
|
82
|
-
".css",
|
|
83
|
-
".js",
|
|
84
|
-
".ico",
|
|
85
|
-
".png",
|
|
86
|
-
".jpg",
|
|
87
|
-
".jpeg",
|
|
88
|
-
".gif",
|
|
89
|
-
".svg",
|
|
90
|
-
];
|
|
91
|
-
for (const link of allLinks) {
|
|
92
|
-
if (invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||
|
|
93
|
-
invalidSuffixes.some((suffix) => link.endsWith(suffix)))
|
|
94
|
-
continue;
|
|
95
|
-
let standardizedLink;
|
|
96
|
-
if (link.startsWith("http")) {
|
|
97
|
-
standardizedLink = link;
|
|
98
|
-
}
|
|
99
|
-
else if (link.startsWith("//")) {
|
|
100
|
-
const base = new URL(baseUrl);
|
|
101
|
-
standardizedLink = base.protocol + link;
|
|
102
|
-
}
|
|
103
|
-
else {
|
|
104
|
-
standardizedLink = new URL(link, baseUrl).href;
|
|
105
|
-
}
|
|
106
|
-
if (this.excludeDirs.some((exDir) => standardizedLink.startsWith(exDir)))
|
|
107
|
-
continue;
|
|
108
|
-
if (link.startsWith("http")) {
|
|
109
|
-
const isAllowed = !this.preventOutside || link.startsWith(baseUrl);
|
|
110
|
-
if (isAllowed)
|
|
111
|
-
absolutePaths.push(link);
|
|
112
|
-
}
|
|
113
|
-
else if (link.startsWith("//")) {
|
|
114
|
-
const base = new URL(baseUrl);
|
|
115
|
-
absolutePaths.push(base.protocol + link);
|
|
116
|
-
}
|
|
117
|
-
else {
|
|
118
|
-
const newLink = new URL(link, baseUrl).href;
|
|
119
|
-
absolutePaths.push(newLink);
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
return Array.from(new Set(absolutePaths));
|
|
123
|
-
}
|
|
124
|
-
extractMetadata(rawHtml, url) {
|
|
125
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
126
|
-
const metadata = { source: url };
|
|
127
|
-
const { document } = new jsdom_1.JSDOM(rawHtml).window;
|
|
128
|
-
const title = document.getElementsByTagName("title")[0];
|
|
129
|
-
if (title) {
|
|
130
|
-
metadata.title = title.textContent;
|
|
131
|
-
}
|
|
132
|
-
const description = document.querySelector("meta[name=description]");
|
|
133
|
-
if (description) {
|
|
134
|
-
metadata.description = description.getAttribute("content");
|
|
135
|
-
}
|
|
136
|
-
const html = document.getElementsByTagName("html")[0];
|
|
137
|
-
if (html) {
|
|
138
|
-
metadata.language = html.getAttribute("lang");
|
|
139
|
-
}
|
|
140
|
-
return metadata;
|
|
141
|
-
}
|
|
142
|
-
async getUrlAsDoc(url) {
|
|
143
|
-
let res;
|
|
144
|
-
try {
|
|
145
|
-
res = await this.fetchWithTimeout(url, { timeout: this.timeout });
|
|
146
|
-
res = await res.text();
|
|
147
|
-
}
|
|
148
|
-
catch (e) {
|
|
149
|
-
return null;
|
|
150
|
-
}
|
|
151
|
-
return {
|
|
152
|
-
pageContent: this.extractor(res),
|
|
153
|
-
metadata: this.extractMetadata(res, url),
|
|
154
|
-
};
|
|
155
|
-
}
|
|
156
|
-
async getChildUrlsRecursive(inputUrl, visited = new Set(), depth = 0) {
|
|
157
|
-
if (depth >= this.maxDepth)
|
|
158
|
-
return [];
|
|
159
|
-
let url = inputUrl;
|
|
160
|
-
if (!inputUrl.endsWith("/"))
|
|
161
|
-
url += "/";
|
|
162
|
-
const isExcluded = this.excludeDirs.some((exDir) => url.startsWith(exDir));
|
|
163
|
-
if (isExcluded)
|
|
164
|
-
return [];
|
|
165
|
-
let res;
|
|
166
|
-
try {
|
|
167
|
-
res = await this.fetchWithTimeout(url, { timeout: this.timeout });
|
|
168
|
-
res = await res.text();
|
|
169
|
-
}
|
|
170
|
-
catch (e) {
|
|
171
|
-
return [];
|
|
172
|
-
}
|
|
173
|
-
const childUrls = this.getChildLinks(res, url);
|
|
174
|
-
const results = await Promise.all(childUrls.map((childUrl) => (async () => {
|
|
175
|
-
if (visited.has(childUrl))
|
|
176
|
-
return null;
|
|
177
|
-
visited.add(childUrl);
|
|
178
|
-
const childDoc = await this.getUrlAsDoc(childUrl);
|
|
179
|
-
if (!childDoc)
|
|
180
|
-
return null;
|
|
181
|
-
if (childUrl.endsWith("/")) {
|
|
182
|
-
const childUrlResponses = await this.getChildUrlsRecursive(childUrl, visited, depth + 1);
|
|
183
|
-
return [childDoc, ...childUrlResponses];
|
|
184
|
-
}
|
|
185
|
-
return [childDoc];
|
|
186
|
-
})()));
|
|
187
|
-
return results.flat().filter((docs) => docs !== null);
|
|
188
|
-
}
|
|
189
|
-
async load() {
|
|
190
|
-
const rootDoc = await this.getUrlAsDoc(this.url);
|
|
191
|
-
if (!rootDoc)
|
|
192
|
-
return [];
|
|
193
|
-
const docs = [rootDoc];
|
|
194
|
-
docs.push(...(await this.getChildUrlsRecursive(this.url, new Set([this.url]))));
|
|
195
|
-
return docs;
|
|
196
|
-
}
|
|
197
|
-
}
|
|
198
|
-
exports.RecursiveUrlLoader = RecursiveUrlLoader;
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import { Document } from "@langchain/core/documents";
|
|
2
|
-
import { AsyncCaller } from "@langchain/core/utils/async_caller";
|
|
3
|
-
import { BaseDocumentLoader, DocumentLoader } from "../base.js";
|
|
4
|
-
/**
|
|
5
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/recursive_url" instead. This entrypoint will be removed in 0.3.0.
|
|
6
|
-
*/
|
|
7
|
-
export interface RecursiveUrlLoaderOptions {
|
|
8
|
-
excludeDirs?: string[];
|
|
9
|
-
extractor?: (text: string) => string;
|
|
10
|
-
maxDepth?: number;
|
|
11
|
-
timeout?: number;
|
|
12
|
-
preventOutside?: boolean;
|
|
13
|
-
callerOptions?: ConstructorParameters<typeof AsyncCaller>[0];
|
|
14
|
-
}
|
|
15
|
-
/**
|
|
16
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/recursive_url" instead. This entrypoint will be removed in 0.3.0.
|
|
17
|
-
*/
|
|
18
|
-
export declare class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLoader {
|
|
19
|
-
private caller;
|
|
20
|
-
private url;
|
|
21
|
-
private excludeDirs;
|
|
22
|
-
private extractor;
|
|
23
|
-
private maxDepth;
|
|
24
|
-
private timeout;
|
|
25
|
-
private preventOutside;
|
|
26
|
-
constructor(url: string, options: RecursiveUrlLoaderOptions);
|
|
27
|
-
private fetchWithTimeout;
|
|
28
|
-
private getChildLinks;
|
|
29
|
-
private extractMetadata;
|
|
30
|
-
private getUrlAsDoc;
|
|
31
|
-
private getChildUrlsRecursive;
|
|
32
|
-
load(): Promise<Document[]>;
|
|
33
|
-
}
|
|
@@ -1,194 +0,0 @@
|
|
|
1
|
-
import { JSDOM } from "jsdom";
|
|
2
|
-
import { AsyncCaller } from "@langchain/core/utils/async_caller";
|
|
3
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
4
|
-
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
|
|
5
|
-
/* #__PURE__ */ logVersion020MigrationWarning({
|
|
6
|
-
oldEntrypointName: "document_loaders/web/recursive_url",
|
|
7
|
-
newPackageName: "@langchain/community",
|
|
8
|
-
});
|
|
9
|
-
/**
|
|
10
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/recursive_url" instead. This entrypoint will be removed in 0.3.0.
|
|
11
|
-
*/
|
|
12
|
-
export class RecursiveUrlLoader extends BaseDocumentLoader {
|
|
13
|
-
constructor(url, options) {
|
|
14
|
-
super();
|
|
15
|
-
Object.defineProperty(this, "caller", {
|
|
16
|
-
enumerable: true,
|
|
17
|
-
configurable: true,
|
|
18
|
-
writable: true,
|
|
19
|
-
value: void 0
|
|
20
|
-
});
|
|
21
|
-
Object.defineProperty(this, "url", {
|
|
22
|
-
enumerable: true,
|
|
23
|
-
configurable: true,
|
|
24
|
-
writable: true,
|
|
25
|
-
value: void 0
|
|
26
|
-
});
|
|
27
|
-
Object.defineProperty(this, "excludeDirs", {
|
|
28
|
-
enumerable: true,
|
|
29
|
-
configurable: true,
|
|
30
|
-
writable: true,
|
|
31
|
-
value: void 0
|
|
32
|
-
});
|
|
33
|
-
Object.defineProperty(this, "extractor", {
|
|
34
|
-
enumerable: true,
|
|
35
|
-
configurable: true,
|
|
36
|
-
writable: true,
|
|
37
|
-
value: void 0
|
|
38
|
-
});
|
|
39
|
-
Object.defineProperty(this, "maxDepth", {
|
|
40
|
-
enumerable: true,
|
|
41
|
-
configurable: true,
|
|
42
|
-
writable: true,
|
|
43
|
-
value: void 0
|
|
44
|
-
});
|
|
45
|
-
Object.defineProperty(this, "timeout", {
|
|
46
|
-
enumerable: true,
|
|
47
|
-
configurable: true,
|
|
48
|
-
writable: true,
|
|
49
|
-
value: void 0
|
|
50
|
-
});
|
|
51
|
-
Object.defineProperty(this, "preventOutside", {
|
|
52
|
-
enumerable: true,
|
|
53
|
-
configurable: true,
|
|
54
|
-
writable: true,
|
|
55
|
-
value: void 0
|
|
56
|
-
});
|
|
57
|
-
this.caller = new AsyncCaller({
|
|
58
|
-
maxConcurrency: 64,
|
|
59
|
-
maxRetries: 0,
|
|
60
|
-
...options.callerOptions,
|
|
61
|
-
});
|
|
62
|
-
this.url = url;
|
|
63
|
-
this.excludeDirs = options.excludeDirs ?? [];
|
|
64
|
-
this.extractor = options.extractor ?? ((s) => s);
|
|
65
|
-
this.maxDepth = options.maxDepth ?? 2;
|
|
66
|
-
this.timeout = options.timeout ?? 10000;
|
|
67
|
-
this.preventOutside = options.preventOutside ?? true;
|
|
68
|
-
}
|
|
69
|
-
async fetchWithTimeout(resource, options) {
|
|
70
|
-
const { timeout, ...rest } = options;
|
|
71
|
-
return this.caller.call(() => fetch(resource, { ...rest, signal: AbortSignal.timeout(timeout) }));
|
|
72
|
-
}
|
|
73
|
-
getChildLinks(html, baseUrl) {
|
|
74
|
-
const allLinks = Array.from(new JSDOM(html).window.document.querySelectorAll("a")).map((a) => a.href);
|
|
75
|
-
const absolutePaths = [];
|
|
76
|
-
// eslint-disable-next-line no-script-url
|
|
77
|
-
const invalidPrefixes = ["javascript:", "mailto:", "#"];
|
|
78
|
-
const invalidSuffixes = [
|
|
79
|
-
".css",
|
|
80
|
-
".js",
|
|
81
|
-
".ico",
|
|
82
|
-
".png",
|
|
83
|
-
".jpg",
|
|
84
|
-
".jpeg",
|
|
85
|
-
".gif",
|
|
86
|
-
".svg",
|
|
87
|
-
];
|
|
88
|
-
for (const link of allLinks) {
|
|
89
|
-
if (invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||
|
|
90
|
-
invalidSuffixes.some((suffix) => link.endsWith(suffix)))
|
|
91
|
-
continue;
|
|
92
|
-
let standardizedLink;
|
|
93
|
-
if (link.startsWith("http")) {
|
|
94
|
-
standardizedLink = link;
|
|
95
|
-
}
|
|
96
|
-
else if (link.startsWith("//")) {
|
|
97
|
-
const base = new URL(baseUrl);
|
|
98
|
-
standardizedLink = base.protocol + link;
|
|
99
|
-
}
|
|
100
|
-
else {
|
|
101
|
-
standardizedLink = new URL(link, baseUrl).href;
|
|
102
|
-
}
|
|
103
|
-
if (this.excludeDirs.some((exDir) => standardizedLink.startsWith(exDir)))
|
|
104
|
-
continue;
|
|
105
|
-
if (link.startsWith("http")) {
|
|
106
|
-
const isAllowed = !this.preventOutside || link.startsWith(baseUrl);
|
|
107
|
-
if (isAllowed)
|
|
108
|
-
absolutePaths.push(link);
|
|
109
|
-
}
|
|
110
|
-
else if (link.startsWith("//")) {
|
|
111
|
-
const base = new URL(baseUrl);
|
|
112
|
-
absolutePaths.push(base.protocol + link);
|
|
113
|
-
}
|
|
114
|
-
else {
|
|
115
|
-
const newLink = new URL(link, baseUrl).href;
|
|
116
|
-
absolutePaths.push(newLink);
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
return Array.from(new Set(absolutePaths));
|
|
120
|
-
}
|
|
121
|
-
extractMetadata(rawHtml, url) {
|
|
122
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
123
|
-
const metadata = { source: url };
|
|
124
|
-
const { document } = new JSDOM(rawHtml).window;
|
|
125
|
-
const title = document.getElementsByTagName("title")[0];
|
|
126
|
-
if (title) {
|
|
127
|
-
metadata.title = title.textContent;
|
|
128
|
-
}
|
|
129
|
-
const description = document.querySelector("meta[name=description]");
|
|
130
|
-
if (description) {
|
|
131
|
-
metadata.description = description.getAttribute("content");
|
|
132
|
-
}
|
|
133
|
-
const html = document.getElementsByTagName("html")[0];
|
|
134
|
-
if (html) {
|
|
135
|
-
metadata.language = html.getAttribute("lang");
|
|
136
|
-
}
|
|
137
|
-
return metadata;
|
|
138
|
-
}
|
|
139
|
-
async getUrlAsDoc(url) {
|
|
140
|
-
let res;
|
|
141
|
-
try {
|
|
142
|
-
res = await this.fetchWithTimeout(url, { timeout: this.timeout });
|
|
143
|
-
res = await res.text();
|
|
144
|
-
}
|
|
145
|
-
catch (e) {
|
|
146
|
-
return null;
|
|
147
|
-
}
|
|
148
|
-
return {
|
|
149
|
-
pageContent: this.extractor(res),
|
|
150
|
-
metadata: this.extractMetadata(res, url),
|
|
151
|
-
};
|
|
152
|
-
}
|
|
153
|
-
async getChildUrlsRecursive(inputUrl, visited = new Set(), depth = 0) {
|
|
154
|
-
if (depth >= this.maxDepth)
|
|
155
|
-
return [];
|
|
156
|
-
let url = inputUrl;
|
|
157
|
-
if (!inputUrl.endsWith("/"))
|
|
158
|
-
url += "/";
|
|
159
|
-
const isExcluded = this.excludeDirs.some((exDir) => url.startsWith(exDir));
|
|
160
|
-
if (isExcluded)
|
|
161
|
-
return [];
|
|
162
|
-
let res;
|
|
163
|
-
try {
|
|
164
|
-
res = await this.fetchWithTimeout(url, { timeout: this.timeout });
|
|
165
|
-
res = await res.text();
|
|
166
|
-
}
|
|
167
|
-
catch (e) {
|
|
168
|
-
return [];
|
|
169
|
-
}
|
|
170
|
-
const childUrls = this.getChildLinks(res, url);
|
|
171
|
-
const results = await Promise.all(childUrls.map((childUrl) => (async () => {
|
|
172
|
-
if (visited.has(childUrl))
|
|
173
|
-
return null;
|
|
174
|
-
visited.add(childUrl);
|
|
175
|
-
const childDoc = await this.getUrlAsDoc(childUrl);
|
|
176
|
-
if (!childDoc)
|
|
177
|
-
return null;
|
|
178
|
-
if (childUrl.endsWith("/")) {
|
|
179
|
-
const childUrlResponses = await this.getChildUrlsRecursive(childUrl, visited, depth + 1);
|
|
180
|
-
return [childDoc, ...childUrlResponses];
|
|
181
|
-
}
|
|
182
|
-
return [childDoc];
|
|
183
|
-
})()));
|
|
184
|
-
return results.flat().filter((docs) => docs !== null);
|
|
185
|
-
}
|
|
186
|
-
async load() {
|
|
187
|
-
const rootDoc = await this.getUrlAsDoc(this.url);
|
|
188
|
-
if (!rootDoc)
|
|
189
|
-
return [];
|
|
190
|
-
const docs = [rootDoc];
|
|
191
|
-
docs.push(...(await this.getChildUrlsRecursive(this.url, new Set([this.url]))));
|
|
192
|
-
return docs;
|
|
193
|
-
}
|
|
194
|
-
}
|