langchain 0.2.18 → 0.3.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/openai_functions/index.cjs +2 -2
- package/dist/agents/openai_functions/index.js +2 -2
- package/dist/chains/combine_documents/stuff.cjs +2 -2
- package/dist/chains/combine_documents/stuff.js +2 -2
- package/dist/chains/openai_functions/openapi.cjs +3 -1
- package/dist/chains/openai_functions/openapi.js +3 -1
- package/dist/load/import_constants.cjs +2 -39
- package/dist/load/import_constants.js +2 -39
- package/dist/load/import_map.cjs +2 -3
- package/dist/load/import_map.d.ts +0 -1
- package/dist/load/import_map.js +0 -1
- package/dist/smith/config.d.ts +1 -5
- package/package.json +31 -854
- package/dist/document_loaders/fs/chatgpt.cjs +0 -90
- package/dist/document_loaders/fs/chatgpt.d.ts +0 -8
- package/dist/document_loaders/fs/chatgpt.js +0 -86
- package/dist/document_loaders/fs/csv.cjs +0 -73
- package/dist/document_loaders/fs/csv.d.ts +0 -65
- package/dist/document_loaders/fs/csv.js +0 -69
- package/dist/document_loaders/fs/docx.cjs +0 -58
- package/dist/document_loaders/fs/docx.d.ts +0 -25
- package/dist/document_loaders/fs/docx.js +0 -54
- package/dist/document_loaders/fs/epub.cjs +0 -103
- package/dist/document_loaders/fs/epub.d.ts +0 -33
- package/dist/document_loaders/fs/epub.js +0 -99
- package/dist/document_loaders/fs/notion.cjs +0 -26
- package/dist/document_loaders/fs/notion.d.ts +0 -12
- package/dist/document_loaders/fs/notion.js +0 -22
- package/dist/document_loaders/fs/obsidian.cjs +0 -247
- package/dist/document_loaders/fs/obsidian.d.ts +0 -28
- package/dist/document_loaders/fs/obsidian.js +0 -240
- package/dist/document_loaders/fs/openai_whisper_audio.cjs +0 -49
- package/dist/document_loaders/fs/openai_whisper_audio.d.ts +0 -23
- package/dist/document_loaders/fs/openai_whisper_audio.js +0 -45
- package/dist/document_loaders/fs/pdf.cjs +0 -148
- package/dist/document_loaders/fs/pdf.d.ts +0 -49
- package/dist/document_loaders/fs/pdf.js +0 -144
- package/dist/document_loaders/fs/pptx.cjs +0 -46
- package/dist/document_loaders/fs/pptx.d.ts +0 -25
- package/dist/document_loaders/fs/pptx.js +0 -42
- package/dist/document_loaders/fs/srt.cjs +0 -57
- package/dist/document_loaders/fs/srt.d.ts +0 -32
- package/dist/document_loaders/fs/srt.js +0 -50
- package/dist/document_loaders/fs/unstructured.cjs +0 -338
- package/dist/document_loaders/fs/unstructured.d.ts +0 -125
- package/dist/document_loaders/fs/unstructured.js +0 -333
- package/dist/document_loaders/web/apify_dataset.cjs +0 -130
- package/dist/document_loaders/web/apify_dataset.d.ts +0 -85
- package/dist/document_loaders/web/apify_dataset.js +0 -126
- package/dist/document_loaders/web/assemblyai.cjs +0 -200
- package/dist/document_loaders/web/assemblyai.d.ts +0 -95
- package/dist/document_loaders/web/assemblyai.js +0 -193
- package/dist/document_loaders/web/azure_blob_storage_container.cjs +0 -73
- package/dist/document_loaders/web/azure_blob_storage_container.d.ts +0 -46
- package/dist/document_loaders/web/azure_blob_storage_container.js +0 -69
- package/dist/document_loaders/web/azure_blob_storage_file.cjs +0 -124
- package/dist/document_loaders/web/azure_blob_storage_file.d.ts +0 -53
- package/dist/document_loaders/web/azure_blob_storage_file.js +0 -97
- package/dist/document_loaders/web/browserbase.cjs +0 -93
- package/dist/document_loaders/web/browserbase.d.ts +0 -48
- package/dist/document_loaders/web/browserbase.js +0 -86
- package/dist/document_loaders/web/cheerio.cjs +0 -118
- package/dist/document_loaders/web/cheerio.d.ts +0 -77
- package/dist/document_loaders/web/cheerio.js +0 -114
- package/dist/document_loaders/web/college_confidential.cjs +0 -41
- package/dist/document_loaders/web/college_confidential.d.ts +0 -25
- package/dist/document_loaders/web/college_confidential.js +0 -37
- package/dist/document_loaders/web/confluence.cjs +0 -190
- package/dist/document_loaders/web/confluence.d.ts +0 -114
- package/dist/document_loaders/web/confluence.js +0 -186
- package/dist/document_loaders/web/couchbase.cjs +0 -95
- package/dist/document_loaders/web/couchbase.d.ts +0 -32
- package/dist/document_loaders/web/couchbase.js +0 -91
- package/dist/document_loaders/web/figma.cjs +0 -102
- package/dist/document_loaders/web/figma.d.ts +0 -82
- package/dist/document_loaders/web/figma.js +0 -98
- package/dist/document_loaders/web/firecrawl.cjs +0 -95
- package/dist/document_loaders/web/firecrawl.d.ts +0 -50
- package/dist/document_loaders/web/firecrawl.js +0 -88
- package/dist/document_loaders/web/gitbook.cjs +0 -110
- package/dist/document_loaders/web/gitbook.d.ts +0 -55
- package/dist/document_loaders/web/gitbook.js +0 -106
- package/dist/document_loaders/web/github.cjs +0 -615
- package/dist/document_loaders/web/github.d.ts +0 -203
- package/dist/document_loaders/web/github.js +0 -608
- package/dist/document_loaders/web/hn.cjs +0 -90
- package/dist/document_loaders/web/hn.d.ts +0 -42
- package/dist/document_loaders/web/hn.js +0 -86
- package/dist/document_loaders/web/imsdb.cjs +0 -44
- package/dist/document_loaders/web/imsdb.d.ts +0 -23
- package/dist/document_loaders/web/imsdb.js +0 -40
- package/dist/document_loaders/web/notionapi.cjs +0 -404
- package/dist/document_loaders/web/notionapi.d.ts +0 -133
- package/dist/document_loaders/web/notionapi.js +0 -392
- package/dist/document_loaders/web/notiondb.cjs +0 -199
- package/dist/document_loaders/web/notiondb.d.ts +0 -56
- package/dist/document_loaders/web/notiondb.js +0 -195
- package/dist/document_loaders/web/pdf.cjs +0 -140
- package/dist/document_loaders/web/pdf.d.ts +0 -35
- package/dist/document_loaders/web/pdf.js +0 -136
- package/dist/document_loaders/web/playwright.cjs +0 -89
- package/dist/document_loaders/web/playwright.d.ts +0 -58
- package/dist/document_loaders/web/playwright.js +0 -85
- package/dist/document_loaders/web/puppeteer.cjs +0 -139
- package/dist/document_loaders/web/puppeteer.d.ts +0 -82
- package/dist/document_loaders/web/puppeteer.js +0 -135
- package/dist/document_loaders/web/recursive_url.cjs +0 -198
- package/dist/document_loaders/web/recursive_url.d.ts +0 -33
- package/dist/document_loaders/web/recursive_url.js +0 -194
- package/dist/document_loaders/web/s3.cjs +0 -164
- package/dist/document_loaders/web/s3.d.ts +0 -78
- package/dist/document_loaders/web/s3.js +0 -137
- package/dist/document_loaders/web/searchapi.cjs +0 -150
- package/dist/document_loaders/web/searchapi.d.ts +0 -76
- package/dist/document_loaders/web/searchapi.js +0 -146
- package/dist/document_loaders/web/serpapi.cjs +0 -127
- package/dist/document_loaders/web/serpapi.d.ts +0 -62
- package/dist/document_loaders/web/serpapi.js +0 -123
- package/dist/document_loaders/web/sitemap.cjs +0 -118
- package/dist/document_loaders/web/sitemap.d.ts +0 -41
- package/dist/document_loaders/web/sitemap.js +0 -114
- package/dist/document_loaders/web/sonix_audio.cjs +0 -68
- package/dist/document_loaders/web/sonix_audio.d.ts +0 -36
- package/dist/document_loaders/web/sonix_audio.js +0 -64
- package/dist/document_loaders/web/sort_xyz_blockchain.cjs +0 -157
- package/dist/document_loaders/web/sort_xyz_blockchain.d.ts +0 -78
- package/dist/document_loaders/web/sort_xyz_blockchain.js +0 -153
- package/dist/document_loaders/web/youtube.cjs +0 -116
- package/dist/document_loaders/web/youtube.d.ts +0 -55
- package/dist/document_loaders/web/youtube.js +0 -112
- package/dist/experimental/tools/pyinterpreter.cjs +0 -248
- package/dist/experimental/tools/pyinterpreter.d.ts +0 -18
- package/dist/experimental/tools/pyinterpreter.js +0 -244
- package/dist/retrievers/self_query/chroma.cjs +0 -48
- package/dist/retrievers/self_query/chroma.d.ts +0 -26
- package/dist/retrievers/self_query/chroma.js +0 -44
- package/dist/retrievers/self_query/pinecone.cjs +0 -47
- package/dist/retrievers/self_query/pinecone.d.ts +0 -26
- package/dist/retrievers/self_query/pinecone.js +0 -43
- package/dist/retrievers/self_query/supabase.cjs +0 -278
- package/dist/retrievers/self_query/supabase.d.ts +0 -109
- package/dist/retrievers/self_query/supabase.js +0 -274
- package/dist/retrievers/self_query/supabase_utils.cjs +0 -264
- package/dist/retrievers/self_query/supabase_utils.d.ts +0 -101
- package/dist/retrievers/self_query/supabase_utils.js +0 -259
- package/dist/retrievers/self_query/vectara.cjs +0 -143
- package/dist/retrievers/self_query/vectara.d.ts +0 -42
- package/dist/retrievers/self_query/vectara.js +0 -139
- package/dist/retrievers/self_query/weaviate.cjs +0 -201
- package/dist/retrievers/self_query/weaviate.d.ts +0 -99
- package/dist/retrievers/self_query/weaviate.js +0 -197
- package/dist/types/assemblyai-types.cjs +0 -2
- package/dist/types/assemblyai-types.d.ts +0 -4
- package/dist/types/assemblyai-types.js +0 -1
- package/document_loaders/fs/chatgpt.cjs +0 -1
- package/document_loaders/fs/chatgpt.d.cts +0 -1
- package/document_loaders/fs/chatgpt.d.ts +0 -1
- package/document_loaders/fs/chatgpt.js +0 -1
- package/document_loaders/fs/csv.cjs +0 -1
- package/document_loaders/fs/csv.d.cts +0 -1
- package/document_loaders/fs/csv.d.ts +0 -1
- package/document_loaders/fs/csv.js +0 -1
- package/document_loaders/fs/docx.cjs +0 -1
- package/document_loaders/fs/docx.d.cts +0 -1
- package/document_loaders/fs/docx.d.ts +0 -1
- package/document_loaders/fs/docx.js +0 -1
- package/document_loaders/fs/epub.cjs +0 -1
- package/document_loaders/fs/epub.d.cts +0 -1
- package/document_loaders/fs/epub.d.ts +0 -1
- package/document_loaders/fs/epub.js +0 -1
- package/document_loaders/fs/notion.cjs +0 -1
- package/document_loaders/fs/notion.d.cts +0 -1
- package/document_loaders/fs/notion.d.ts +0 -1
- package/document_loaders/fs/notion.js +0 -1
- package/document_loaders/fs/obsidian.cjs +0 -1
- package/document_loaders/fs/obsidian.d.cts +0 -1
- package/document_loaders/fs/obsidian.d.ts +0 -1
- package/document_loaders/fs/obsidian.js +0 -1
- package/document_loaders/fs/openai_whisper_audio.cjs +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.cts +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.ts +0 -1
- package/document_loaders/fs/openai_whisper_audio.js +0 -1
- package/document_loaders/fs/pdf.cjs +0 -1
- package/document_loaders/fs/pdf.d.cts +0 -1
- package/document_loaders/fs/pdf.d.ts +0 -1
- package/document_loaders/fs/pdf.js +0 -1
- package/document_loaders/fs/pptx.cjs +0 -1
- package/document_loaders/fs/pptx.d.cts +0 -1
- package/document_loaders/fs/pptx.d.ts +0 -1
- package/document_loaders/fs/pptx.js +0 -1
- package/document_loaders/fs/srt.cjs +0 -1
- package/document_loaders/fs/srt.d.cts +0 -1
- package/document_loaders/fs/srt.d.ts +0 -1
- package/document_loaders/fs/srt.js +0 -1
- package/document_loaders/fs/unstructured.cjs +0 -1
- package/document_loaders/fs/unstructured.d.cts +0 -1
- package/document_loaders/fs/unstructured.d.ts +0 -1
- package/document_loaders/fs/unstructured.js +0 -1
- package/document_loaders/web/apify_dataset.cjs +0 -1
- package/document_loaders/web/apify_dataset.d.cts +0 -1
- package/document_loaders/web/apify_dataset.d.ts +0 -1
- package/document_loaders/web/apify_dataset.js +0 -1
- package/document_loaders/web/assemblyai.cjs +0 -1
- package/document_loaders/web/assemblyai.d.cts +0 -1
- package/document_loaders/web/assemblyai.d.ts +0 -1
- package/document_loaders/web/assemblyai.js +0 -1
- package/document_loaders/web/azure_blob_storage_container.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_container.js +0 -1
- package/document_loaders/web/azure_blob_storage_file.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_file.js +0 -1
- package/document_loaders/web/browserbase.cjs +0 -1
- package/document_loaders/web/browserbase.d.cts +0 -1
- package/document_loaders/web/browserbase.d.ts +0 -1
- package/document_loaders/web/browserbase.js +0 -1
- package/document_loaders/web/cheerio.cjs +0 -1
- package/document_loaders/web/cheerio.d.cts +0 -1
- package/document_loaders/web/cheerio.d.ts +0 -1
- package/document_loaders/web/cheerio.js +0 -1
- package/document_loaders/web/college_confidential.cjs +0 -1
- package/document_loaders/web/college_confidential.d.cts +0 -1
- package/document_loaders/web/college_confidential.d.ts +0 -1
- package/document_loaders/web/college_confidential.js +0 -1
- package/document_loaders/web/confluence.cjs +0 -1
- package/document_loaders/web/confluence.d.cts +0 -1
- package/document_loaders/web/confluence.d.ts +0 -1
- package/document_loaders/web/confluence.js +0 -1
- package/document_loaders/web/couchbase.cjs +0 -1
- package/document_loaders/web/couchbase.d.cts +0 -1
- package/document_loaders/web/couchbase.d.ts +0 -1
- package/document_loaders/web/couchbase.js +0 -1
- package/document_loaders/web/figma.cjs +0 -1
- package/document_loaders/web/figma.d.cts +0 -1
- package/document_loaders/web/figma.d.ts +0 -1
- package/document_loaders/web/figma.js +0 -1
- package/document_loaders/web/firecrawl.cjs +0 -1
- package/document_loaders/web/firecrawl.d.cts +0 -1
- package/document_loaders/web/firecrawl.d.ts +0 -1
- package/document_loaders/web/firecrawl.js +0 -1
- package/document_loaders/web/gitbook.cjs +0 -1
- package/document_loaders/web/gitbook.d.cts +0 -1
- package/document_loaders/web/gitbook.d.ts +0 -1
- package/document_loaders/web/gitbook.js +0 -1
- package/document_loaders/web/github.cjs +0 -1
- package/document_loaders/web/github.d.cts +0 -1
- package/document_loaders/web/github.d.ts +0 -1
- package/document_loaders/web/github.js +0 -1
- package/document_loaders/web/hn.cjs +0 -1
- package/document_loaders/web/hn.d.cts +0 -1
- package/document_loaders/web/hn.d.ts +0 -1
- package/document_loaders/web/hn.js +0 -1
- package/document_loaders/web/imsdb.cjs +0 -1
- package/document_loaders/web/imsdb.d.cts +0 -1
- package/document_loaders/web/imsdb.d.ts +0 -1
- package/document_loaders/web/imsdb.js +0 -1
- package/document_loaders/web/notionapi.cjs +0 -1
- package/document_loaders/web/notionapi.d.cts +0 -1
- package/document_loaders/web/notionapi.d.ts +0 -1
- package/document_loaders/web/notionapi.js +0 -1
- package/document_loaders/web/notiondb.cjs +0 -1
- package/document_loaders/web/notiondb.d.cts +0 -1
- package/document_loaders/web/notiondb.d.ts +0 -1
- package/document_loaders/web/notiondb.js +0 -1
- package/document_loaders/web/pdf.cjs +0 -1
- package/document_loaders/web/pdf.d.cts +0 -1
- package/document_loaders/web/pdf.d.ts +0 -1
- package/document_loaders/web/pdf.js +0 -1
- package/document_loaders/web/playwright.cjs +0 -1
- package/document_loaders/web/playwright.d.cts +0 -1
- package/document_loaders/web/playwright.d.ts +0 -1
- package/document_loaders/web/playwright.js +0 -1
- package/document_loaders/web/puppeteer.cjs +0 -1
- package/document_loaders/web/puppeteer.d.cts +0 -1
- package/document_loaders/web/puppeteer.d.ts +0 -1
- package/document_loaders/web/puppeteer.js +0 -1
- package/document_loaders/web/recursive_url.cjs +0 -1
- package/document_loaders/web/recursive_url.d.cts +0 -1
- package/document_loaders/web/recursive_url.d.ts +0 -1
- package/document_loaders/web/recursive_url.js +0 -1
- package/document_loaders/web/s3.cjs +0 -1
- package/document_loaders/web/s3.d.cts +0 -1
- package/document_loaders/web/s3.d.ts +0 -1
- package/document_loaders/web/s3.js +0 -1
- package/document_loaders/web/searchapi.cjs +0 -1
- package/document_loaders/web/searchapi.d.cts +0 -1
- package/document_loaders/web/searchapi.d.ts +0 -1
- package/document_loaders/web/searchapi.js +0 -1
- package/document_loaders/web/serpapi.cjs +0 -1
- package/document_loaders/web/serpapi.d.cts +0 -1
- package/document_loaders/web/serpapi.d.ts +0 -1
- package/document_loaders/web/serpapi.js +0 -1
- package/document_loaders/web/sitemap.cjs +0 -1
- package/document_loaders/web/sitemap.d.cts +0 -1
- package/document_loaders/web/sitemap.d.ts +0 -1
- package/document_loaders/web/sitemap.js +0 -1
- package/document_loaders/web/sonix_audio.cjs +0 -1
- package/document_loaders/web/sonix_audio.d.cts +0 -1
- package/document_loaders/web/sonix_audio.d.ts +0 -1
- package/document_loaders/web/sonix_audio.js +0 -1
- package/document_loaders/web/sort_xyz_blockchain.cjs +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.cts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.ts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.js +0 -1
- package/document_loaders/web/youtube.cjs +0 -1
- package/document_loaders/web/youtube.d.cts +0 -1
- package/document_loaders/web/youtube.d.ts +0 -1
- package/document_loaders/web/youtube.js +0 -1
- package/experimental/tools/pyinterpreter.cjs +0 -1
- package/experimental/tools/pyinterpreter.d.cts +0 -1
- package/experimental/tools/pyinterpreter.d.ts +0 -1
- package/experimental/tools/pyinterpreter.js +0 -1
- package/memory/index.cjs +0 -1
- package/memory/index.d.cts +0 -1
- package/memory/index.d.ts +0 -1
- package/memory/index.js +0 -1
- package/retrievers/self_query/chroma.cjs +0 -1
- package/retrievers/self_query/chroma.d.cts +0 -1
- package/retrievers/self_query/chroma.d.ts +0 -1
- package/retrievers/self_query/chroma.js +0 -1
- package/retrievers/self_query/pinecone.cjs +0 -1
- package/retrievers/self_query/pinecone.d.cts +0 -1
- package/retrievers/self_query/pinecone.d.ts +0 -1
- package/retrievers/self_query/pinecone.js +0 -1
- package/retrievers/self_query/supabase.cjs +0 -1
- package/retrievers/self_query/supabase.d.cts +0 -1
- package/retrievers/self_query/supabase.d.ts +0 -1
- package/retrievers/self_query/supabase.js +0 -1
- package/retrievers/self_query/vectara.cjs +0 -1
- package/retrievers/self_query/vectara.d.cts +0 -1
- package/retrievers/self_query/vectara.d.ts +0 -1
- package/retrievers/self_query/vectara.js +0 -1
- package/retrievers/self_query/weaviate.cjs +0 -1
- package/retrievers/self_query/weaviate.d.cts +0 -1
- package/retrievers/self_query/weaviate.d.ts +0 -1
- package/retrievers/self_query/weaviate.js +0 -1
|
@@ -1,136 +0,0 @@
|
|
|
1
|
-
import { Document } from "@langchain/core/documents";
|
|
2
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
3
|
-
import { formatDocumentsAsString } from "../../util/document.js";
|
|
4
|
-
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
|
|
5
|
-
/* #__PURE__ */ logVersion020MigrationWarning({
|
|
6
|
-
oldEntrypointName: "document_loaders/web/pdf",
|
|
7
|
-
newPackageName: "@langchain/community",
|
|
8
|
-
});
|
|
9
|
-
/**
|
|
10
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/pdf" instead. This entrypoint will be removed in 0.3.0.
|
|
11
|
-
*
|
|
12
|
-
* A document loader for loading data from PDFs.
|
|
13
|
-
* @example
|
|
14
|
-
* ```typescript
|
|
15
|
-
* const loader = new WebPDFLoader(new Blob());
|
|
16
|
-
* const docs = await loader.load();
|
|
17
|
-
* console.log({ docs });
|
|
18
|
-
* ```
|
|
19
|
-
*/
|
|
20
|
-
export class WebPDFLoader extends BaseDocumentLoader {
|
|
21
|
-
constructor(blob, { splitPages = true, pdfjs = PDFLoaderImports, parsedItemSeparator = "", } = {}) {
|
|
22
|
-
super();
|
|
23
|
-
Object.defineProperty(this, "blob", {
|
|
24
|
-
enumerable: true,
|
|
25
|
-
configurable: true,
|
|
26
|
-
writable: true,
|
|
27
|
-
value: void 0
|
|
28
|
-
});
|
|
29
|
-
Object.defineProperty(this, "splitPages", {
|
|
30
|
-
enumerable: true,
|
|
31
|
-
configurable: true,
|
|
32
|
-
writable: true,
|
|
33
|
-
value: true
|
|
34
|
-
});
|
|
35
|
-
Object.defineProperty(this, "pdfjs", {
|
|
36
|
-
enumerable: true,
|
|
37
|
-
configurable: true,
|
|
38
|
-
writable: true,
|
|
39
|
-
value: void 0
|
|
40
|
-
});
|
|
41
|
-
Object.defineProperty(this, "parsedItemSeparator", {
|
|
42
|
-
enumerable: true,
|
|
43
|
-
configurable: true,
|
|
44
|
-
writable: true,
|
|
45
|
-
value: void 0
|
|
46
|
-
});
|
|
47
|
-
this.blob = blob;
|
|
48
|
-
this.splitPages = splitPages ?? this.splitPages;
|
|
49
|
-
this.pdfjs = pdfjs;
|
|
50
|
-
this.parsedItemSeparator = parsedItemSeparator;
|
|
51
|
-
}
|
|
52
|
-
/**
|
|
53
|
-
* Loads the contents of the PDF as documents.
|
|
54
|
-
* @returns An array of Documents representing the retrieved data.
|
|
55
|
-
*/
|
|
56
|
-
async load() {
|
|
57
|
-
const { getDocument, version } = await this.pdfjs();
|
|
58
|
-
const parsedPdf = await getDocument({
|
|
59
|
-
data: new Uint8Array(await this.blob.arrayBuffer()),
|
|
60
|
-
useWorkerFetch: false,
|
|
61
|
-
isEvalSupported: false,
|
|
62
|
-
useSystemFonts: true,
|
|
63
|
-
}).promise;
|
|
64
|
-
const meta = await parsedPdf.getMetadata().catch(() => null);
|
|
65
|
-
const documents = [];
|
|
66
|
-
for (let i = 1; i <= parsedPdf.numPages; i += 1) {
|
|
67
|
-
const page = await parsedPdf.getPage(i);
|
|
68
|
-
const content = await page.getTextContent();
|
|
69
|
-
if (content.items.length === 0) {
|
|
70
|
-
continue;
|
|
71
|
-
}
|
|
72
|
-
// Eliminate excessive newlines
|
|
73
|
-
// Source: https://github.com/albertcui/pdf-parse/blob/7086fc1cc9058545cdf41dd0646d6ae5832c7107/lib/pdf-parse.js#L16
|
|
74
|
-
let lastY;
|
|
75
|
-
const textItems = [];
|
|
76
|
-
for (const item of content.items) {
|
|
77
|
-
if ("str" in item) {
|
|
78
|
-
if (lastY === item.transform[5] || !lastY) {
|
|
79
|
-
textItems.push(item.str);
|
|
80
|
-
}
|
|
81
|
-
else {
|
|
82
|
-
textItems.push(`\n${item.str}`);
|
|
83
|
-
}
|
|
84
|
-
// eslint-disable-next-line prefer-destructuring
|
|
85
|
-
lastY = item.transform[5];
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
const text = textItems.join(this.parsedItemSeparator);
|
|
89
|
-
documents.push(new Document({
|
|
90
|
-
pageContent: text,
|
|
91
|
-
metadata: {
|
|
92
|
-
pdf: {
|
|
93
|
-
version,
|
|
94
|
-
info: meta?.info,
|
|
95
|
-
metadata: meta?.metadata,
|
|
96
|
-
totalPages: parsedPdf.numPages,
|
|
97
|
-
},
|
|
98
|
-
loc: {
|
|
99
|
-
pageNumber: i,
|
|
100
|
-
},
|
|
101
|
-
},
|
|
102
|
-
}));
|
|
103
|
-
}
|
|
104
|
-
if (this.splitPages) {
|
|
105
|
-
return documents;
|
|
106
|
-
}
|
|
107
|
-
if (documents.length === 0) {
|
|
108
|
-
return [];
|
|
109
|
-
}
|
|
110
|
-
return [
|
|
111
|
-
new Document({
|
|
112
|
-
pageContent: formatDocumentsAsString(documents),
|
|
113
|
-
metadata: {
|
|
114
|
-
pdf: {
|
|
115
|
-
version,
|
|
116
|
-
info: meta?.info,
|
|
117
|
-
metadata: meta?.metadata,
|
|
118
|
-
totalPages: parsedPdf.numPages,
|
|
119
|
-
},
|
|
120
|
-
},
|
|
121
|
-
}),
|
|
122
|
-
];
|
|
123
|
-
return documents;
|
|
124
|
-
}
|
|
125
|
-
}
|
|
126
|
-
async function PDFLoaderImports() {
|
|
127
|
-
try {
|
|
128
|
-
const { default: mod } = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js");
|
|
129
|
-
const { getDocument, version } = mod;
|
|
130
|
-
return { getDocument, version };
|
|
131
|
-
}
|
|
132
|
-
catch (e) {
|
|
133
|
-
console.error(e);
|
|
134
|
-
throw new Error("Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.");
|
|
135
|
-
}
|
|
136
|
-
}
|
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.PlaywrightWebBaseLoader = void 0;
|
|
4
|
-
const documents_1 = require("@langchain/core/documents");
|
|
5
|
-
const base_js_1 = require("../base.cjs");
|
|
6
|
-
const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
|
|
7
|
-
/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
|
|
8
|
-
oldEntrypointName: "document_loaders/web/playwright",
|
|
9
|
-
newPackageName: "@langchain/community",
|
|
10
|
-
});
|
|
11
|
-
/**
|
|
12
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/playwright" instead. This entrypoint will be removed in 0.3.0.
|
|
13
|
-
*
|
|
14
|
-
* Class representing a document loader for scraping web pages using
|
|
15
|
-
* Playwright. Extends the BaseDocumentLoader class and implements the
|
|
16
|
-
* DocumentLoader interface.
|
|
17
|
-
*/
|
|
18
|
-
class PlaywrightWebBaseLoader extends base_js_1.BaseDocumentLoader {
|
|
19
|
-
constructor(webPath, options) {
|
|
20
|
-
super();
|
|
21
|
-
Object.defineProperty(this, "webPath", {
|
|
22
|
-
enumerable: true,
|
|
23
|
-
configurable: true,
|
|
24
|
-
writable: true,
|
|
25
|
-
value: webPath
|
|
26
|
-
});
|
|
27
|
-
Object.defineProperty(this, "options", {
|
|
28
|
-
enumerable: true,
|
|
29
|
-
configurable: true,
|
|
30
|
-
writable: true,
|
|
31
|
-
value: void 0
|
|
32
|
-
});
|
|
33
|
-
this.options = options ?? undefined;
|
|
34
|
-
}
|
|
35
|
-
static async _scrape(url, options) {
|
|
36
|
-
const { chromium } = await PlaywrightWebBaseLoader.imports();
|
|
37
|
-
const browser = await chromium.launch({
|
|
38
|
-
headless: true,
|
|
39
|
-
...options?.launchOptions,
|
|
40
|
-
});
|
|
41
|
-
const page = await browser.newPage();
|
|
42
|
-
const response = await page.goto(url, {
|
|
43
|
-
timeout: 180000,
|
|
44
|
-
waitUntil: "domcontentloaded",
|
|
45
|
-
...options?.gotoOptions,
|
|
46
|
-
});
|
|
47
|
-
const bodyHTML = options?.evaluate
|
|
48
|
-
? await options?.evaluate(page, browser, response)
|
|
49
|
-
: await page.content();
|
|
50
|
-
await browser.close();
|
|
51
|
-
return bodyHTML;
|
|
52
|
-
}
|
|
53
|
-
/**
|
|
54
|
-
* Method that calls the _scrape method to perform the scraping of the web
|
|
55
|
-
* page specified by the webPath property. Returns a Promise that resolves
|
|
56
|
-
* to the scraped HTML content of the web page.
|
|
57
|
-
* @returns Promise that resolves to the scraped HTML content of the web page.
|
|
58
|
-
*/
|
|
59
|
-
async scrape() {
|
|
60
|
-
return PlaywrightWebBaseLoader._scrape(this.webPath, this.options);
|
|
61
|
-
}
|
|
62
|
-
/**
|
|
63
|
-
* Method that calls the scrape method and returns the scraped HTML
|
|
64
|
-
* content as a Document object. Returns a Promise that resolves to an
|
|
65
|
-
* array of Document objects.
|
|
66
|
-
* @returns Promise that resolves to an array of Document objects.
|
|
67
|
-
*/
|
|
68
|
-
async load() {
|
|
69
|
-
const text = await this.scrape();
|
|
70
|
-
const metadata = { source: this.webPath };
|
|
71
|
-
return [new documents_1.Document({ pageContent: text, metadata })];
|
|
72
|
-
}
|
|
73
|
-
/**
|
|
74
|
-
* Static method that imports the necessary Playwright modules. Returns a
|
|
75
|
-
* Promise that resolves to an object containing the imported modules.
|
|
76
|
-
* @returns Promise that resolves to an object containing the imported modules.
|
|
77
|
-
*/
|
|
78
|
-
static async imports() {
|
|
79
|
-
try {
|
|
80
|
-
const { chromium } = await import("playwright");
|
|
81
|
-
return { chromium };
|
|
82
|
-
}
|
|
83
|
-
catch (e) {
|
|
84
|
-
console.error(e);
|
|
85
|
-
throw new Error("Please install playwright as a dependency with, e.g. `yarn add playwright`");
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
exports.PlaywrightWebBaseLoader = PlaywrightWebBaseLoader;
|
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
import type { LaunchOptions, Page, Browser, Response } from "playwright";
|
|
2
|
-
import { Document } from "@langchain/core/documents";
|
|
3
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
4
|
-
import type { DocumentLoader } from "../base.js";
|
|
5
|
-
export { Page, Browser, Response };
|
|
6
|
-
export type PlaywrightGotoOptions = {
|
|
7
|
-
referer?: string;
|
|
8
|
-
timeout?: number;
|
|
9
|
-
waitUntil?: "load" | "domcontentloaded" | "networkidle" | "commit";
|
|
10
|
-
};
|
|
11
|
-
/**
|
|
12
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/playwright" instead. This entrypoint will be removed in 0.3.0.
|
|
13
|
-
*
|
|
14
|
-
* Type representing a function for evaluating JavaScript code on a web
|
|
15
|
-
* page using Playwright. Takes a Page, Browser, and Response object as
|
|
16
|
-
* parameters and returns a Promise that resolves to a string.
|
|
17
|
-
*/
|
|
18
|
-
export type PlaywrightEvaluate = (page: Page, browser: Browser, response: Response | null) => Promise<string>;
|
|
19
|
-
export type PlaywrightWebBaseLoaderOptions = {
|
|
20
|
-
launchOptions?: LaunchOptions;
|
|
21
|
-
gotoOptions?: PlaywrightGotoOptions;
|
|
22
|
-
evaluate?: PlaywrightEvaluate;
|
|
23
|
-
};
|
|
24
|
-
/**
|
|
25
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/playwright" instead. This entrypoint will be removed in 0.3.0.
|
|
26
|
-
*
|
|
27
|
-
* Class representing a document loader for scraping web pages using
|
|
28
|
-
* Playwright. Extends the BaseDocumentLoader class and implements the
|
|
29
|
-
* DocumentLoader interface.
|
|
30
|
-
*/
|
|
31
|
-
export declare class PlaywrightWebBaseLoader extends BaseDocumentLoader implements DocumentLoader {
|
|
32
|
-
webPath: string;
|
|
33
|
-
options: PlaywrightWebBaseLoaderOptions | undefined;
|
|
34
|
-
constructor(webPath: string, options?: PlaywrightWebBaseLoaderOptions);
|
|
35
|
-
static _scrape(url: string, options?: PlaywrightWebBaseLoaderOptions): Promise<string>;
|
|
36
|
-
/**
|
|
37
|
-
* Method that calls the _scrape method to perform the scraping of the web
|
|
38
|
-
* page specified by the webPath property. Returns a Promise that resolves
|
|
39
|
-
* to the scraped HTML content of the web page.
|
|
40
|
-
* @returns Promise that resolves to the scraped HTML content of the web page.
|
|
41
|
-
*/
|
|
42
|
-
scrape(): Promise<string>;
|
|
43
|
-
/**
|
|
44
|
-
* Method that calls the scrape method and returns the scraped HTML
|
|
45
|
-
* content as a Document object. Returns a Promise that resolves to an
|
|
46
|
-
* array of Document objects.
|
|
47
|
-
* @returns Promise that resolves to an array of Document objects.
|
|
48
|
-
*/
|
|
49
|
-
load(): Promise<Document[]>;
|
|
50
|
-
/**
|
|
51
|
-
* Static method that imports the necessary Playwright modules. Returns a
|
|
52
|
-
* Promise that resolves to an object containing the imported modules.
|
|
53
|
-
* @returns Promise that resolves to an object containing the imported modules.
|
|
54
|
-
*/
|
|
55
|
-
static imports(): Promise<{
|
|
56
|
-
chromium: typeof import("playwright").chromium;
|
|
57
|
-
}>;
|
|
58
|
-
}
|
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
import { Document } from "@langchain/core/documents";
|
|
2
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
3
|
-
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
|
|
4
|
-
/* #__PURE__ */ logVersion020MigrationWarning({
|
|
5
|
-
oldEntrypointName: "document_loaders/web/playwright",
|
|
6
|
-
newPackageName: "@langchain/community",
|
|
7
|
-
});
|
|
8
|
-
/**
|
|
9
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/playwright" instead. This entrypoint will be removed in 0.3.0.
|
|
10
|
-
*
|
|
11
|
-
* Class representing a document loader for scraping web pages using
|
|
12
|
-
* Playwright. Extends the BaseDocumentLoader class and implements the
|
|
13
|
-
* DocumentLoader interface.
|
|
14
|
-
*/
|
|
15
|
-
export class PlaywrightWebBaseLoader extends BaseDocumentLoader {
|
|
16
|
-
constructor(webPath, options) {
|
|
17
|
-
super();
|
|
18
|
-
Object.defineProperty(this, "webPath", {
|
|
19
|
-
enumerable: true,
|
|
20
|
-
configurable: true,
|
|
21
|
-
writable: true,
|
|
22
|
-
value: webPath
|
|
23
|
-
});
|
|
24
|
-
Object.defineProperty(this, "options", {
|
|
25
|
-
enumerable: true,
|
|
26
|
-
configurable: true,
|
|
27
|
-
writable: true,
|
|
28
|
-
value: void 0
|
|
29
|
-
});
|
|
30
|
-
this.options = options ?? undefined;
|
|
31
|
-
}
|
|
32
|
-
static async _scrape(url, options) {
|
|
33
|
-
const { chromium } = await PlaywrightWebBaseLoader.imports();
|
|
34
|
-
const browser = await chromium.launch({
|
|
35
|
-
headless: true,
|
|
36
|
-
...options?.launchOptions,
|
|
37
|
-
});
|
|
38
|
-
const page = await browser.newPage();
|
|
39
|
-
const response = await page.goto(url, {
|
|
40
|
-
timeout: 180000,
|
|
41
|
-
waitUntil: "domcontentloaded",
|
|
42
|
-
...options?.gotoOptions,
|
|
43
|
-
});
|
|
44
|
-
const bodyHTML = options?.evaluate
|
|
45
|
-
? await options?.evaluate(page, browser, response)
|
|
46
|
-
: await page.content();
|
|
47
|
-
await browser.close();
|
|
48
|
-
return bodyHTML;
|
|
49
|
-
}
|
|
50
|
-
/**
|
|
51
|
-
* Method that calls the _scrape method to perform the scraping of the web
|
|
52
|
-
* page specified by the webPath property. Returns a Promise that resolves
|
|
53
|
-
* to the scraped HTML content of the web page.
|
|
54
|
-
* @returns Promise that resolves to the scraped HTML content of the web page.
|
|
55
|
-
*/
|
|
56
|
-
async scrape() {
|
|
57
|
-
return PlaywrightWebBaseLoader._scrape(this.webPath, this.options);
|
|
58
|
-
}
|
|
59
|
-
/**
|
|
60
|
-
* Method that calls the scrape method and returns the scraped HTML
|
|
61
|
-
* content as a Document object. Returns a Promise that resolves to an
|
|
62
|
-
* array of Document objects.
|
|
63
|
-
* @returns Promise that resolves to an array of Document objects.
|
|
64
|
-
*/
|
|
65
|
-
async load() {
|
|
66
|
-
const text = await this.scrape();
|
|
67
|
-
const metadata = { source: this.webPath };
|
|
68
|
-
return [new Document({ pageContent: text, metadata })];
|
|
69
|
-
}
|
|
70
|
-
/**
|
|
71
|
-
* Static method that imports the necessary Playwright modules. Returns a
|
|
72
|
-
* Promise that resolves to an object containing the imported modules.
|
|
73
|
-
* @returns Promise that resolves to an object containing the imported modules.
|
|
74
|
-
*/
|
|
75
|
-
static async imports() {
|
|
76
|
-
try {
|
|
77
|
-
const { chromium } = await import("playwright");
|
|
78
|
-
return { chromium };
|
|
79
|
-
}
|
|
80
|
-
catch (e) {
|
|
81
|
-
console.error(e);
|
|
82
|
-
throw new Error("Please install playwright as a dependency with, e.g. `yarn add playwright`");
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
}
|
|
@@ -1,139 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.PuppeteerWebBaseLoader = void 0;
|
|
4
|
-
const documents_1 = require("@langchain/core/documents");
|
|
5
|
-
const base_js_1 = require("../base.cjs");
|
|
6
|
-
const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
|
|
7
|
-
/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
|
|
8
|
-
oldEntrypointName: "document_loaders/web/puppeteer",
|
|
9
|
-
newPackageName: "@langchain/community",
|
|
10
|
-
});
|
|
11
|
-
/**
|
|
12
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/puppeteer" instead. This entrypoint will be removed in 0.3.0.
|
|
13
|
-
*
|
|
14
|
-
* Class that extends the BaseDocumentLoader class and implements the
|
|
15
|
-
* DocumentLoader interface. It represents a document loader for scraping
|
|
16
|
-
* web pages using Puppeteer.
|
|
17
|
-
* @example
|
|
18
|
-
* ```typescript
|
|
19
|
-
* const loader = new PuppeteerWebBaseLoader("https://exampleurl.com", {
|
|
20
|
-
* launchOptions: {
|
|
21
|
-
* headless: true,
|
|
22
|
-
* },
|
|
23
|
-
* gotoOptions: {
|
|
24
|
-
* waitUntil: "domcontentloaded",
|
|
25
|
-
* },
|
|
26
|
-
* });
|
|
27
|
-
* const screenshot = await loader.screenshot();
|
|
28
|
-
* ```
|
|
29
|
-
*/
|
|
30
|
-
class PuppeteerWebBaseLoader extends base_js_1.BaseDocumentLoader {
|
|
31
|
-
constructor(webPath, options) {
|
|
32
|
-
super();
|
|
33
|
-
Object.defineProperty(this, "webPath", {
|
|
34
|
-
enumerable: true,
|
|
35
|
-
configurable: true,
|
|
36
|
-
writable: true,
|
|
37
|
-
value: webPath
|
|
38
|
-
});
|
|
39
|
-
Object.defineProperty(this, "options", {
|
|
40
|
-
enumerable: true,
|
|
41
|
-
configurable: true,
|
|
42
|
-
writable: true,
|
|
43
|
-
value: void 0
|
|
44
|
-
});
|
|
45
|
-
this.options = options ?? undefined;
|
|
46
|
-
}
|
|
47
|
-
static async _scrape(url, options) {
|
|
48
|
-
const { launch } = await PuppeteerWebBaseLoader.imports();
|
|
49
|
-
const browser = await launch({
|
|
50
|
-
headless: true,
|
|
51
|
-
defaultViewport: null,
|
|
52
|
-
ignoreDefaultArgs: ["--disable-extensions"],
|
|
53
|
-
...options?.launchOptions,
|
|
54
|
-
});
|
|
55
|
-
const page = await browser.newPage();
|
|
56
|
-
await page.goto(url, {
|
|
57
|
-
timeout: 180000,
|
|
58
|
-
waitUntil: "domcontentloaded",
|
|
59
|
-
...options?.gotoOptions,
|
|
60
|
-
});
|
|
61
|
-
const bodyHTML = options?.evaluate
|
|
62
|
-
? await options?.evaluate(page, browser)
|
|
63
|
-
: await page.evaluate(() => document.body.innerHTML);
|
|
64
|
-
await browser.close();
|
|
65
|
-
return bodyHTML;
|
|
66
|
-
}
|
|
67
|
-
/**
|
|
68
|
-
* Method that calls the _scrape method to perform the scraping of the web
|
|
69
|
-
* page specified by the webPath property.
|
|
70
|
-
* @returns Promise that resolves to the scraped HTML content of the web page.
|
|
71
|
-
*/
|
|
72
|
-
async scrape() {
|
|
73
|
-
return PuppeteerWebBaseLoader._scrape(this.webPath, this.options);
|
|
74
|
-
}
|
|
75
|
-
/**
|
|
76
|
-
* Method that calls the scrape method and returns the scraped HTML
|
|
77
|
-
* content as a Document object.
|
|
78
|
-
* @returns Promise that resolves to an array of Document objects.
|
|
79
|
-
*/
|
|
80
|
-
async load() {
|
|
81
|
-
const text = await this.scrape();
|
|
82
|
-
const metadata = { source: this.webPath };
|
|
83
|
-
return [new documents_1.Document({ pageContent: text, metadata })];
|
|
84
|
-
}
|
|
85
|
-
/**
|
|
86
|
-
* Static class method used to screenshot a web page and return
|
|
87
|
-
* it as a {@link Document} object where the pageContent property
|
|
88
|
-
* is the screenshot encoded in base64.
|
|
89
|
-
*
|
|
90
|
-
* @param {string} url
|
|
91
|
-
* @param {PuppeteerWebBaseLoaderOptions} options
|
|
92
|
-
* @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64.
|
|
93
|
-
*/
|
|
94
|
-
static async _screenshot(url, options) {
|
|
95
|
-
const { launch } = await PuppeteerWebBaseLoader.imports();
|
|
96
|
-
const browser = await launch({
|
|
97
|
-
headless: true,
|
|
98
|
-
defaultViewport: null,
|
|
99
|
-
ignoreDefaultArgs: ["--disable-extensions"],
|
|
100
|
-
...options?.launchOptions,
|
|
101
|
-
});
|
|
102
|
-
const page = await browser.newPage();
|
|
103
|
-
await page.goto(url, {
|
|
104
|
-
timeout: 180000,
|
|
105
|
-
waitUntil: "domcontentloaded",
|
|
106
|
-
...options?.gotoOptions,
|
|
107
|
-
});
|
|
108
|
-
const screenshot = await page.screenshot();
|
|
109
|
-
const base64 = screenshot.toString("base64");
|
|
110
|
-
const metadata = { source: url };
|
|
111
|
-
return new documents_1.Document({ pageContent: base64, metadata });
|
|
112
|
-
}
|
|
113
|
-
/**
|
|
114
|
-
* Screenshot a web page and return it as a {@link Document} object where
|
|
115
|
-
* the pageContent property is the screenshot encoded in base64.
|
|
116
|
-
*
|
|
117
|
-
* @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64.
|
|
118
|
-
*/
|
|
119
|
-
async screenshot() {
|
|
120
|
-
return PuppeteerWebBaseLoader._screenshot(this.webPath, this.options);
|
|
121
|
-
}
|
|
122
|
-
/**
|
|
123
|
-
* Static method that imports the necessary Puppeteer modules. It returns
|
|
124
|
-
* a Promise that resolves to an object containing the imported modules.
|
|
125
|
-
* @returns Promise that resolves to an object containing the imported Puppeteer modules.
|
|
126
|
-
*/
|
|
127
|
-
static async imports() {
|
|
128
|
-
try {
|
|
129
|
-
// eslint-disable-next-line import/no-extraneous-dependencies
|
|
130
|
-
const { launch } = await import("puppeteer");
|
|
131
|
-
return { launch };
|
|
132
|
-
}
|
|
133
|
-
catch (e) {
|
|
134
|
-
console.error(e);
|
|
135
|
-
throw new Error("Please install puppeteer as a dependency with, e.g. `yarn add puppeteer`");
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
exports.PuppeteerWebBaseLoader = PuppeteerWebBaseLoader;
|
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
import type { launch, WaitForOptions, Page, Browser, PuppeteerLaunchOptions } from "puppeteer";
|
|
2
|
-
import { Document } from "@langchain/core/documents";
|
|
3
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
4
|
-
import type { DocumentLoader } from "../base.js";
|
|
5
|
-
export { Page, Browser };
|
|
6
|
-
export type PuppeteerGotoOptions = WaitForOptions & {
|
|
7
|
-
referer?: string;
|
|
8
|
-
referrerPolicy?: string;
|
|
9
|
-
};
|
|
10
|
-
/**
|
|
11
|
-
* Type representing a function for evaluating JavaScript code on a web
|
|
12
|
-
* page using Puppeteer. It takes a Page and Browser object as parameters
|
|
13
|
-
* and returns a Promise that resolves to a string.
|
|
14
|
-
*/
|
|
15
|
-
export type PuppeteerEvaluate = (page: Page, browser: Browser) => Promise<string>;
|
|
16
|
-
export type PuppeteerWebBaseLoaderOptions = {
|
|
17
|
-
launchOptions?: PuppeteerLaunchOptions;
|
|
18
|
-
gotoOptions?: PuppeteerGotoOptions;
|
|
19
|
-
evaluate?: PuppeteerEvaluate;
|
|
20
|
-
};
|
|
21
|
-
/**
|
|
22
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/puppeteer" instead. This entrypoint will be removed in 0.3.0.
|
|
23
|
-
*
|
|
24
|
-
* Class that extends the BaseDocumentLoader class and implements the
|
|
25
|
-
* DocumentLoader interface. It represents a document loader for scraping
|
|
26
|
-
* web pages using Puppeteer.
|
|
27
|
-
* @example
|
|
28
|
-
* ```typescript
|
|
29
|
-
* const loader = new PuppeteerWebBaseLoader("https://exampleurl.com", {
|
|
30
|
-
* launchOptions: {
|
|
31
|
-
* headless: true,
|
|
32
|
-
* },
|
|
33
|
-
* gotoOptions: {
|
|
34
|
-
* waitUntil: "domcontentloaded",
|
|
35
|
-
* },
|
|
36
|
-
* });
|
|
37
|
-
* const screenshot = await loader.screenshot();
|
|
38
|
-
* ```
|
|
39
|
-
*/
|
|
40
|
-
export declare class PuppeteerWebBaseLoader extends BaseDocumentLoader implements DocumentLoader {
|
|
41
|
-
webPath: string;
|
|
42
|
-
options: PuppeteerWebBaseLoaderOptions | undefined;
|
|
43
|
-
constructor(webPath: string, options?: PuppeteerWebBaseLoaderOptions);
|
|
44
|
-
static _scrape(url: string, options?: PuppeteerWebBaseLoaderOptions): Promise<string>;
|
|
45
|
-
/**
|
|
46
|
-
* Method that calls the _scrape method to perform the scraping of the web
|
|
47
|
-
* page specified by the webPath property.
|
|
48
|
-
* @returns Promise that resolves to the scraped HTML content of the web page.
|
|
49
|
-
*/
|
|
50
|
-
scrape(): Promise<string>;
|
|
51
|
-
/**
|
|
52
|
-
* Method that calls the scrape method and returns the scraped HTML
|
|
53
|
-
* content as a Document object.
|
|
54
|
-
* @returns Promise that resolves to an array of Document objects.
|
|
55
|
-
*/
|
|
56
|
-
load(): Promise<Document[]>;
|
|
57
|
-
/**
|
|
58
|
-
* Static class method used to screenshot a web page and return
|
|
59
|
-
* it as a {@link Document} object where the pageContent property
|
|
60
|
-
* is the screenshot encoded in base64.
|
|
61
|
-
*
|
|
62
|
-
* @param {string} url
|
|
63
|
-
* @param {PuppeteerWebBaseLoaderOptions} options
|
|
64
|
-
* @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64.
|
|
65
|
-
*/
|
|
66
|
-
static _screenshot(url: string, options?: PuppeteerWebBaseLoaderOptions): Promise<Document>;
|
|
67
|
-
/**
|
|
68
|
-
* Screenshot a web page and return it as a {@link Document} object where
|
|
69
|
-
* the pageContent property is the screenshot encoded in base64.
|
|
70
|
-
*
|
|
71
|
-
* @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64.
|
|
72
|
-
*/
|
|
73
|
-
screenshot(): Promise<Document>;
|
|
74
|
-
/**
|
|
75
|
-
* Static method that imports the necessary Puppeteer modules. It returns
|
|
76
|
-
* a Promise that resolves to an object containing the imported modules.
|
|
77
|
-
* @returns Promise that resolves to an object containing the imported Puppeteer modules.
|
|
78
|
-
*/
|
|
79
|
-
static imports(): Promise<{
|
|
80
|
-
launch: typeof launch;
|
|
81
|
-
}>;
|
|
82
|
-
}
|