langchain 0.2.18 → 0.3.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/openai_functions/index.cjs +2 -2
- package/dist/agents/openai_functions/index.js +2 -2
- package/dist/chains/combine_documents/stuff.cjs +2 -2
- package/dist/chains/combine_documents/stuff.js +2 -2
- package/dist/chains/openai_functions/openapi.cjs +3 -1
- package/dist/chains/openai_functions/openapi.js +3 -1
- package/dist/load/import_constants.cjs +2 -39
- package/dist/load/import_constants.js +2 -39
- package/dist/load/import_map.cjs +2 -3
- package/dist/load/import_map.d.ts +0 -1
- package/dist/load/import_map.js +0 -1
- package/dist/smith/config.d.ts +1 -5
- package/package.json +31 -854
- package/dist/document_loaders/fs/chatgpt.cjs +0 -90
- package/dist/document_loaders/fs/chatgpt.d.ts +0 -8
- package/dist/document_loaders/fs/chatgpt.js +0 -86
- package/dist/document_loaders/fs/csv.cjs +0 -73
- package/dist/document_loaders/fs/csv.d.ts +0 -65
- package/dist/document_loaders/fs/csv.js +0 -69
- package/dist/document_loaders/fs/docx.cjs +0 -58
- package/dist/document_loaders/fs/docx.d.ts +0 -25
- package/dist/document_loaders/fs/docx.js +0 -54
- package/dist/document_loaders/fs/epub.cjs +0 -103
- package/dist/document_loaders/fs/epub.d.ts +0 -33
- package/dist/document_loaders/fs/epub.js +0 -99
- package/dist/document_loaders/fs/notion.cjs +0 -26
- package/dist/document_loaders/fs/notion.d.ts +0 -12
- package/dist/document_loaders/fs/notion.js +0 -22
- package/dist/document_loaders/fs/obsidian.cjs +0 -247
- package/dist/document_loaders/fs/obsidian.d.ts +0 -28
- package/dist/document_loaders/fs/obsidian.js +0 -240
- package/dist/document_loaders/fs/openai_whisper_audio.cjs +0 -49
- package/dist/document_loaders/fs/openai_whisper_audio.d.ts +0 -23
- package/dist/document_loaders/fs/openai_whisper_audio.js +0 -45
- package/dist/document_loaders/fs/pdf.cjs +0 -148
- package/dist/document_loaders/fs/pdf.d.ts +0 -49
- package/dist/document_loaders/fs/pdf.js +0 -144
- package/dist/document_loaders/fs/pptx.cjs +0 -46
- package/dist/document_loaders/fs/pptx.d.ts +0 -25
- package/dist/document_loaders/fs/pptx.js +0 -42
- package/dist/document_loaders/fs/srt.cjs +0 -57
- package/dist/document_loaders/fs/srt.d.ts +0 -32
- package/dist/document_loaders/fs/srt.js +0 -50
- package/dist/document_loaders/fs/unstructured.cjs +0 -338
- package/dist/document_loaders/fs/unstructured.d.ts +0 -125
- package/dist/document_loaders/fs/unstructured.js +0 -333
- package/dist/document_loaders/web/apify_dataset.cjs +0 -130
- package/dist/document_loaders/web/apify_dataset.d.ts +0 -85
- package/dist/document_loaders/web/apify_dataset.js +0 -126
- package/dist/document_loaders/web/assemblyai.cjs +0 -200
- package/dist/document_loaders/web/assemblyai.d.ts +0 -95
- package/dist/document_loaders/web/assemblyai.js +0 -193
- package/dist/document_loaders/web/azure_blob_storage_container.cjs +0 -73
- package/dist/document_loaders/web/azure_blob_storage_container.d.ts +0 -46
- package/dist/document_loaders/web/azure_blob_storage_container.js +0 -69
- package/dist/document_loaders/web/azure_blob_storage_file.cjs +0 -124
- package/dist/document_loaders/web/azure_blob_storage_file.d.ts +0 -53
- package/dist/document_loaders/web/azure_blob_storage_file.js +0 -97
- package/dist/document_loaders/web/browserbase.cjs +0 -93
- package/dist/document_loaders/web/browserbase.d.ts +0 -48
- package/dist/document_loaders/web/browserbase.js +0 -86
- package/dist/document_loaders/web/cheerio.cjs +0 -118
- package/dist/document_loaders/web/cheerio.d.ts +0 -77
- package/dist/document_loaders/web/cheerio.js +0 -114
- package/dist/document_loaders/web/college_confidential.cjs +0 -41
- package/dist/document_loaders/web/college_confidential.d.ts +0 -25
- package/dist/document_loaders/web/college_confidential.js +0 -37
- package/dist/document_loaders/web/confluence.cjs +0 -190
- package/dist/document_loaders/web/confluence.d.ts +0 -114
- package/dist/document_loaders/web/confluence.js +0 -186
- package/dist/document_loaders/web/couchbase.cjs +0 -95
- package/dist/document_loaders/web/couchbase.d.ts +0 -32
- package/dist/document_loaders/web/couchbase.js +0 -91
- package/dist/document_loaders/web/figma.cjs +0 -102
- package/dist/document_loaders/web/figma.d.ts +0 -82
- package/dist/document_loaders/web/figma.js +0 -98
- package/dist/document_loaders/web/firecrawl.cjs +0 -95
- package/dist/document_loaders/web/firecrawl.d.ts +0 -50
- package/dist/document_loaders/web/firecrawl.js +0 -88
- package/dist/document_loaders/web/gitbook.cjs +0 -110
- package/dist/document_loaders/web/gitbook.d.ts +0 -55
- package/dist/document_loaders/web/gitbook.js +0 -106
- package/dist/document_loaders/web/github.cjs +0 -615
- package/dist/document_loaders/web/github.d.ts +0 -203
- package/dist/document_loaders/web/github.js +0 -608
- package/dist/document_loaders/web/hn.cjs +0 -90
- package/dist/document_loaders/web/hn.d.ts +0 -42
- package/dist/document_loaders/web/hn.js +0 -86
- package/dist/document_loaders/web/imsdb.cjs +0 -44
- package/dist/document_loaders/web/imsdb.d.ts +0 -23
- package/dist/document_loaders/web/imsdb.js +0 -40
- package/dist/document_loaders/web/notionapi.cjs +0 -404
- package/dist/document_loaders/web/notionapi.d.ts +0 -133
- package/dist/document_loaders/web/notionapi.js +0 -392
- package/dist/document_loaders/web/notiondb.cjs +0 -199
- package/dist/document_loaders/web/notiondb.d.ts +0 -56
- package/dist/document_loaders/web/notiondb.js +0 -195
- package/dist/document_loaders/web/pdf.cjs +0 -140
- package/dist/document_loaders/web/pdf.d.ts +0 -35
- package/dist/document_loaders/web/pdf.js +0 -136
- package/dist/document_loaders/web/playwright.cjs +0 -89
- package/dist/document_loaders/web/playwright.d.ts +0 -58
- package/dist/document_loaders/web/playwright.js +0 -85
- package/dist/document_loaders/web/puppeteer.cjs +0 -139
- package/dist/document_loaders/web/puppeteer.d.ts +0 -82
- package/dist/document_loaders/web/puppeteer.js +0 -135
- package/dist/document_loaders/web/recursive_url.cjs +0 -198
- package/dist/document_loaders/web/recursive_url.d.ts +0 -33
- package/dist/document_loaders/web/recursive_url.js +0 -194
- package/dist/document_loaders/web/s3.cjs +0 -164
- package/dist/document_loaders/web/s3.d.ts +0 -78
- package/dist/document_loaders/web/s3.js +0 -137
- package/dist/document_loaders/web/searchapi.cjs +0 -150
- package/dist/document_loaders/web/searchapi.d.ts +0 -76
- package/dist/document_loaders/web/searchapi.js +0 -146
- package/dist/document_loaders/web/serpapi.cjs +0 -127
- package/dist/document_loaders/web/serpapi.d.ts +0 -62
- package/dist/document_loaders/web/serpapi.js +0 -123
- package/dist/document_loaders/web/sitemap.cjs +0 -118
- package/dist/document_loaders/web/sitemap.d.ts +0 -41
- package/dist/document_loaders/web/sitemap.js +0 -114
- package/dist/document_loaders/web/sonix_audio.cjs +0 -68
- package/dist/document_loaders/web/sonix_audio.d.ts +0 -36
- package/dist/document_loaders/web/sonix_audio.js +0 -64
- package/dist/document_loaders/web/sort_xyz_blockchain.cjs +0 -157
- package/dist/document_loaders/web/sort_xyz_blockchain.d.ts +0 -78
- package/dist/document_loaders/web/sort_xyz_blockchain.js +0 -153
- package/dist/document_loaders/web/youtube.cjs +0 -116
- package/dist/document_loaders/web/youtube.d.ts +0 -55
- package/dist/document_loaders/web/youtube.js +0 -112
- package/dist/experimental/tools/pyinterpreter.cjs +0 -248
- package/dist/experimental/tools/pyinterpreter.d.ts +0 -18
- package/dist/experimental/tools/pyinterpreter.js +0 -244
- package/dist/retrievers/self_query/chroma.cjs +0 -48
- package/dist/retrievers/self_query/chroma.d.ts +0 -26
- package/dist/retrievers/self_query/chroma.js +0 -44
- package/dist/retrievers/self_query/pinecone.cjs +0 -47
- package/dist/retrievers/self_query/pinecone.d.ts +0 -26
- package/dist/retrievers/self_query/pinecone.js +0 -43
- package/dist/retrievers/self_query/supabase.cjs +0 -278
- package/dist/retrievers/self_query/supabase.d.ts +0 -109
- package/dist/retrievers/self_query/supabase.js +0 -274
- package/dist/retrievers/self_query/supabase_utils.cjs +0 -264
- package/dist/retrievers/self_query/supabase_utils.d.ts +0 -101
- package/dist/retrievers/self_query/supabase_utils.js +0 -259
- package/dist/retrievers/self_query/vectara.cjs +0 -143
- package/dist/retrievers/self_query/vectara.d.ts +0 -42
- package/dist/retrievers/self_query/vectara.js +0 -139
- package/dist/retrievers/self_query/weaviate.cjs +0 -201
- package/dist/retrievers/self_query/weaviate.d.ts +0 -99
- package/dist/retrievers/self_query/weaviate.js +0 -197
- package/dist/types/assemblyai-types.cjs +0 -2
- package/dist/types/assemblyai-types.d.ts +0 -4
- package/dist/types/assemblyai-types.js +0 -1
- package/document_loaders/fs/chatgpt.cjs +0 -1
- package/document_loaders/fs/chatgpt.d.cts +0 -1
- package/document_loaders/fs/chatgpt.d.ts +0 -1
- package/document_loaders/fs/chatgpt.js +0 -1
- package/document_loaders/fs/csv.cjs +0 -1
- package/document_loaders/fs/csv.d.cts +0 -1
- package/document_loaders/fs/csv.d.ts +0 -1
- package/document_loaders/fs/csv.js +0 -1
- package/document_loaders/fs/docx.cjs +0 -1
- package/document_loaders/fs/docx.d.cts +0 -1
- package/document_loaders/fs/docx.d.ts +0 -1
- package/document_loaders/fs/docx.js +0 -1
- package/document_loaders/fs/epub.cjs +0 -1
- package/document_loaders/fs/epub.d.cts +0 -1
- package/document_loaders/fs/epub.d.ts +0 -1
- package/document_loaders/fs/epub.js +0 -1
- package/document_loaders/fs/notion.cjs +0 -1
- package/document_loaders/fs/notion.d.cts +0 -1
- package/document_loaders/fs/notion.d.ts +0 -1
- package/document_loaders/fs/notion.js +0 -1
- package/document_loaders/fs/obsidian.cjs +0 -1
- package/document_loaders/fs/obsidian.d.cts +0 -1
- package/document_loaders/fs/obsidian.d.ts +0 -1
- package/document_loaders/fs/obsidian.js +0 -1
- package/document_loaders/fs/openai_whisper_audio.cjs +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.cts +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.ts +0 -1
- package/document_loaders/fs/openai_whisper_audio.js +0 -1
- package/document_loaders/fs/pdf.cjs +0 -1
- package/document_loaders/fs/pdf.d.cts +0 -1
- package/document_loaders/fs/pdf.d.ts +0 -1
- package/document_loaders/fs/pdf.js +0 -1
- package/document_loaders/fs/pptx.cjs +0 -1
- package/document_loaders/fs/pptx.d.cts +0 -1
- package/document_loaders/fs/pptx.d.ts +0 -1
- package/document_loaders/fs/pptx.js +0 -1
- package/document_loaders/fs/srt.cjs +0 -1
- package/document_loaders/fs/srt.d.cts +0 -1
- package/document_loaders/fs/srt.d.ts +0 -1
- package/document_loaders/fs/srt.js +0 -1
- package/document_loaders/fs/unstructured.cjs +0 -1
- package/document_loaders/fs/unstructured.d.cts +0 -1
- package/document_loaders/fs/unstructured.d.ts +0 -1
- package/document_loaders/fs/unstructured.js +0 -1
- package/document_loaders/web/apify_dataset.cjs +0 -1
- package/document_loaders/web/apify_dataset.d.cts +0 -1
- package/document_loaders/web/apify_dataset.d.ts +0 -1
- package/document_loaders/web/apify_dataset.js +0 -1
- package/document_loaders/web/assemblyai.cjs +0 -1
- package/document_loaders/web/assemblyai.d.cts +0 -1
- package/document_loaders/web/assemblyai.d.ts +0 -1
- package/document_loaders/web/assemblyai.js +0 -1
- package/document_loaders/web/azure_blob_storage_container.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_container.js +0 -1
- package/document_loaders/web/azure_blob_storage_file.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_file.js +0 -1
- package/document_loaders/web/browserbase.cjs +0 -1
- package/document_loaders/web/browserbase.d.cts +0 -1
- package/document_loaders/web/browserbase.d.ts +0 -1
- package/document_loaders/web/browserbase.js +0 -1
- package/document_loaders/web/cheerio.cjs +0 -1
- package/document_loaders/web/cheerio.d.cts +0 -1
- package/document_loaders/web/cheerio.d.ts +0 -1
- package/document_loaders/web/cheerio.js +0 -1
- package/document_loaders/web/college_confidential.cjs +0 -1
- package/document_loaders/web/college_confidential.d.cts +0 -1
- package/document_loaders/web/college_confidential.d.ts +0 -1
- package/document_loaders/web/college_confidential.js +0 -1
- package/document_loaders/web/confluence.cjs +0 -1
- package/document_loaders/web/confluence.d.cts +0 -1
- package/document_loaders/web/confluence.d.ts +0 -1
- package/document_loaders/web/confluence.js +0 -1
- package/document_loaders/web/couchbase.cjs +0 -1
- package/document_loaders/web/couchbase.d.cts +0 -1
- package/document_loaders/web/couchbase.d.ts +0 -1
- package/document_loaders/web/couchbase.js +0 -1
- package/document_loaders/web/figma.cjs +0 -1
- package/document_loaders/web/figma.d.cts +0 -1
- package/document_loaders/web/figma.d.ts +0 -1
- package/document_loaders/web/figma.js +0 -1
- package/document_loaders/web/firecrawl.cjs +0 -1
- package/document_loaders/web/firecrawl.d.cts +0 -1
- package/document_loaders/web/firecrawl.d.ts +0 -1
- package/document_loaders/web/firecrawl.js +0 -1
- package/document_loaders/web/gitbook.cjs +0 -1
- package/document_loaders/web/gitbook.d.cts +0 -1
- package/document_loaders/web/gitbook.d.ts +0 -1
- package/document_loaders/web/gitbook.js +0 -1
- package/document_loaders/web/github.cjs +0 -1
- package/document_loaders/web/github.d.cts +0 -1
- package/document_loaders/web/github.d.ts +0 -1
- package/document_loaders/web/github.js +0 -1
- package/document_loaders/web/hn.cjs +0 -1
- package/document_loaders/web/hn.d.cts +0 -1
- package/document_loaders/web/hn.d.ts +0 -1
- package/document_loaders/web/hn.js +0 -1
- package/document_loaders/web/imsdb.cjs +0 -1
- package/document_loaders/web/imsdb.d.cts +0 -1
- package/document_loaders/web/imsdb.d.ts +0 -1
- package/document_loaders/web/imsdb.js +0 -1
- package/document_loaders/web/notionapi.cjs +0 -1
- package/document_loaders/web/notionapi.d.cts +0 -1
- package/document_loaders/web/notionapi.d.ts +0 -1
- package/document_loaders/web/notionapi.js +0 -1
- package/document_loaders/web/notiondb.cjs +0 -1
- package/document_loaders/web/notiondb.d.cts +0 -1
- package/document_loaders/web/notiondb.d.ts +0 -1
- package/document_loaders/web/notiondb.js +0 -1
- package/document_loaders/web/pdf.cjs +0 -1
- package/document_loaders/web/pdf.d.cts +0 -1
- package/document_loaders/web/pdf.d.ts +0 -1
- package/document_loaders/web/pdf.js +0 -1
- package/document_loaders/web/playwright.cjs +0 -1
- package/document_loaders/web/playwright.d.cts +0 -1
- package/document_loaders/web/playwright.d.ts +0 -1
- package/document_loaders/web/playwright.js +0 -1
- package/document_loaders/web/puppeteer.cjs +0 -1
- package/document_loaders/web/puppeteer.d.cts +0 -1
- package/document_loaders/web/puppeteer.d.ts +0 -1
- package/document_loaders/web/puppeteer.js +0 -1
- package/document_loaders/web/recursive_url.cjs +0 -1
- package/document_loaders/web/recursive_url.d.cts +0 -1
- package/document_loaders/web/recursive_url.d.ts +0 -1
- package/document_loaders/web/recursive_url.js +0 -1
- package/document_loaders/web/s3.cjs +0 -1
- package/document_loaders/web/s3.d.cts +0 -1
- package/document_loaders/web/s3.d.ts +0 -1
- package/document_loaders/web/s3.js +0 -1
- package/document_loaders/web/searchapi.cjs +0 -1
- package/document_loaders/web/searchapi.d.cts +0 -1
- package/document_loaders/web/searchapi.d.ts +0 -1
- package/document_loaders/web/searchapi.js +0 -1
- package/document_loaders/web/serpapi.cjs +0 -1
- package/document_loaders/web/serpapi.d.cts +0 -1
- package/document_loaders/web/serpapi.d.ts +0 -1
- package/document_loaders/web/serpapi.js +0 -1
- package/document_loaders/web/sitemap.cjs +0 -1
- package/document_loaders/web/sitemap.d.cts +0 -1
- package/document_loaders/web/sitemap.d.ts +0 -1
- package/document_loaders/web/sitemap.js +0 -1
- package/document_loaders/web/sonix_audio.cjs +0 -1
- package/document_loaders/web/sonix_audio.d.cts +0 -1
- package/document_loaders/web/sonix_audio.d.ts +0 -1
- package/document_loaders/web/sonix_audio.js +0 -1
- package/document_loaders/web/sort_xyz_blockchain.cjs +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.cts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.ts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.js +0 -1
- package/document_loaders/web/youtube.cjs +0 -1
- package/document_loaders/web/youtube.d.cts +0 -1
- package/document_loaders/web/youtube.d.ts +0 -1
- package/document_loaders/web/youtube.js +0 -1
- package/experimental/tools/pyinterpreter.cjs +0 -1
- package/experimental/tools/pyinterpreter.d.cts +0 -1
- package/experimental/tools/pyinterpreter.d.ts +0 -1
- package/experimental/tools/pyinterpreter.js +0 -1
- package/memory/index.cjs +0 -1
- package/memory/index.d.cts +0 -1
- package/memory/index.d.ts +0 -1
- package/memory/index.js +0 -1
- package/retrievers/self_query/chroma.cjs +0 -1
- package/retrievers/self_query/chroma.d.cts +0 -1
- package/retrievers/self_query/chroma.d.ts +0 -1
- package/retrievers/self_query/chroma.js +0 -1
- package/retrievers/self_query/pinecone.cjs +0 -1
- package/retrievers/self_query/pinecone.d.cts +0 -1
- package/retrievers/self_query/pinecone.d.ts +0 -1
- package/retrievers/self_query/pinecone.js +0 -1
- package/retrievers/self_query/supabase.cjs +0 -1
- package/retrievers/self_query/supabase.d.cts +0 -1
- package/retrievers/self_query/supabase.d.ts +0 -1
- package/retrievers/self_query/supabase.js +0 -1
- package/retrievers/self_query/vectara.cjs +0 -1
- package/retrievers/self_query/vectara.d.cts +0 -1
- package/retrievers/self_query/vectara.d.ts +0 -1
- package/retrievers/self_query/vectara.js +0 -1
- package/retrievers/self_query/weaviate.cjs +0 -1
- package/retrievers/self_query/weaviate.d.cts +0 -1
- package/retrievers/self_query/weaviate.d.ts +0 -1
- package/retrievers/self_query/weaviate.js +0 -1
|
@@ -1,146 +0,0 @@
|
|
|
1
|
-
import { Document } from "@langchain/core/documents";
|
|
2
|
-
import { getEnvironmentVariable } from "@langchain/core/utils/env";
|
|
3
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
4
|
-
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
|
|
5
|
-
// Runs once when this module is evaluated, passing the legacy entrypoint name
// and the package that now hosts it. The PURE annotation must stay directly in
// front of the call so bundlers may drop it when the module is unused.
/* #__PURE__ */ logVersion020MigrationWarning({
    oldEntrypointName: "document_loaders/web/searchapi",
    newPackageName: "@langchain/community",
});
|
|
9
|
-
/**
 * @deprecated - Import from "@langchain/community/document_loaders/web/searchapi" instead. This entrypoint will be removed in 0.3.0.
 *
 * Document loader that sends a query to the SearchApi HTTP endpoint and turns
 * selected sections of the JSON response into Documents. It extends the
 * BaseDocumentLoader class.
 * @example
 * ```typescript
 * const loader = new SearchApiLoader({
 *   q: "{query}",
 *   apiKey: "{apiKey}",
 *   engine: "google",
 * });
 * const docs = await loader.load();
 * ```
 */
export class SearchApiLoader extends BaseDocumentLoader {
    /**
     * @param params - Search parameters. `apiKey` falls back to the
     * SEARCHAPI_API_KEY environment variable when omitted; every other entry
     * is forwarded as a query-string parameter by `buildUrl`.
     * @throws An error if no API key could be resolved, or if the resolved
     * key is not a string.
     */
    constructor(params) {
        super();
        Object.defineProperty(this, "apiKey", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "parameters", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        const { apiKey = getEnvironmentVariable("SEARCHAPI_API_KEY") } = params;
        // Test for a missing key before testing its type: otherwise an absent
        // key is reported as "Invalid type" and the setup hint is never shown.
        if (apiKey === undefined || apiKey === null || apiKey === "") {
            throw new Error("SearchApi API key not set. You can set it as SEARCHAPI_API_KEY in your .env file, or pass it to SearchApi.");
        }
        if (typeof apiKey !== "string") {
            throw new Error("Invalid type for apiKey. Expected string.");
        }
        this.apiKey = apiKey;
        this.parameters = { ...params };
    }
    /**
     * Builds the URL for the SearchApi search request.
     *
     * The query string is assembled from a local copy of the parameters, so
     * calling this method does not modify the loader's stored state.
     * @returns The URL for the search request.
     */
    buildUrl() {
        // Defaults come first so that caller-supplied values override them.
        const merged = {
            engine: "google",
            api_key: this.apiKey,
            ...this.parameters,
        };
        const query = new URLSearchParams();
        for (const [key, value] of Object.entries(merged)) {
            // The camelCase `apiKey` option is sent as `api_key` instead, and
            // unset values are left out of the query string entirely.
            if (key === "apiKey" || value === undefined || value === null) {
                continue;
            }
            query.append(key, `${value}`);
        }
        return `https://www.searchapi.io/api/v1/search?${query}`;
    }
    /**
     * Extracts documents from the provided output.
     *
     * For the "transcripts" response type the `text` of every entry is joined
     * with newlines into a single Document; for every other type each entry
     * becomes its own Document holding the entry serialized as JSON.
     * @param output - The output to extract documents from (a single value or an array).
     * @param responseType - The type of the response to extract documents from.
     * @returns An array of Documents.
     */
    extractDocuments(output, responseType) {
        const results = Array.isArray(output) ? output : [output];
        const metadata = () => ({ source: "SearchApi", responseType });
        if (responseType === "transcripts") {
            const pageContent = results.map((result) => result.text).join("\n");
            return [new Document({ pageContent, metadata: metadata() })];
        }
        return results.map((result) => new Document({
            pageContent: JSON.stringify(result),
            metadata: metadata(),
        }));
    }
    /**
     * Processes the response data from the SearchApi search request and converts it into an array of Documents.
     * @param data - The response data from the SearchApi search request.
     * @returns An array of Documents; empty when `data` is not an object or
     * contains none of the recognised sections.
     */
    processResponseData(data) {
        const documents = [];
        // The `in` operator throws on primitives and null, so bail out early.
        if (data === null || typeof data !== "object") {
            return documents;
        }
        const responseTypes = [
            "answer_box",
            "shopping_results",
            "knowledge_graph",
            "organic_results",
            "transcripts",
        ];
        for (const responseType of responseTypes) {
            if (responseType in data) {
                documents.push(...this.extractDocuments(data[responseType], responseType));
            }
        }
        return documents;
    }
    /**
     * Fetches the data from the provided URL and returns it as a JSON object.
     * @param url - The URL to fetch data from.
     * @returns A promise that resolves to the fetched data as a JSON object.
     * @throws An error if the body is not valid JSON, is not a JSON object,
     * or carries a truthy `error` field.
     */
    async fetchData(url) {
        const response = await fetch(url);
        let data;
        try {
            data = await response.json();
        }
        catch (error) {
            // Surface the HTTP status: a bare JSON SyntaxError hides the cause.
            throw new Error(`Failed to load search results from SearchApi due to: response was not valid JSON (HTTP ${response.status}): ${error}`);
        }
        if (data === null || typeof data !== "object") {
            throw new Error(`Failed to load search results from SearchApi due to: unexpected response body (HTTP ${response.status})`);
        }
        if (data.error) {
            throw new Error(`Failed to load search results from SearchApi due to: ${data.error}`);
        }
        return data;
    }
    /**
     * Loads the search results from the SearchApi.
     * @returns An array of Documents representing the search results.
     * @throws An error if the search results could not be loaded or processed.
     */
    async load() {
        const url = this.buildUrl();
        const data = await this.fetchData(url);
        try {
            return this.processResponseData(data);
        }
        catch (error) {
            console.error(error);
            throw new Error(`Failed to process search results from SearchApi: ${error}`);
        }
    }
}
|
|
@@ -1,127 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.SerpAPILoader = void 0;
|
|
4
|
-
const documents_1 = require("@langchain/core/documents");
|
|
5
|
-
const env_1 = require("@langchain/core/utils/env");
|
|
6
|
-
const base_js_1 = require("../base.cjs");
|
|
7
|
-
const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
|
|
8
|
-
// Runs once when this module is evaluated, passing the legacy entrypoint name
// and the package that now hosts it. The PURE annotation must stay directly in
// front of the call so bundlers may drop it when the module is unused.
/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
    oldEntrypointName: "document_loaders/web/serpapi",
    newPackageName: "@langchain/community",
});
|
|
12
|
-
/**
 * @deprecated - Import from "@langchain/community/document_loaders/web/serpapi" instead. This entrypoint will be removed in 0.3.0.
 *
 * Document loader that sends a query to the SerpAPI HTTP endpoint and turns
 * selected sections of the JSON response into Documents. It extends the
 * BaseDocumentLoader class.
 * @example
 * ```typescript
 * const loader = new SerpAPILoader({ q: "{query}", apiKey: "{apiKey}" });
 * const docs = await loader.load();
 * ```
 */
class SerpAPILoader extends base_js_1.BaseDocumentLoader {
    /**
     * @param params - Object carrying the search query `q` and, optionally,
     * `apiKey`. When `apiKey` is omitted the SERPAPI_API_KEY environment
     * variable is used instead.
     * @throws An error if no API key could be resolved.
     */
    constructor(params) {
        super();
        Object.defineProperty(this, "apiKey", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "searchQuery", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        const { apiKey = (0, env_1.getEnvironmentVariable)("SERPAPI_API_KEY"), q } = params;
        if (!apiKey) {
            throw new Error("SerpAPI API key not set. You can set it as SERPAPI_API_KEY in your .env file, or pass it to SerpAPI.");
        }
        this.apiKey = apiKey;
        this.searchQuery = q;
    }
    /**
     * Builds the URL for the SerpAPI search request.
     * @returns The URL for the search request.
     */
    buildUrl() {
        const params = new URLSearchParams();
        params.append("api_key", this.apiKey);
        params.append("q", this.searchQuery);
        return `https://serpapi.com/search?${params.toString()}`;
    }
    /**
     * Extracts documents from the provided output. Each entry becomes one
     * Document whose content is the entry serialized as JSON.
     * @param output - The output to extract documents from (a single value or an array).
     * @param responseType - The type of the response to extract documents from.
     * @returns An array of Documents.
     */
    extractDocuments(output, responseType) {
        const results = Array.isArray(output) ? output : [output];
        return results.map((result) => new documents_1.Document({
            pageContent: JSON.stringify(result),
            metadata: {
                source: "SerpAPI",
                responseType,
            },
        }));
    }
    /**
     * Processes the response data from the SerpAPI search request and converts it into an array of Documents.
     * @param data - The response data from the SerpAPI search request.
     * @returns An array of Documents; empty when `data` is not an object or
     * contains none of the recognised sections.
     */
    processResponseData(data) {
        const documents = [];
        // The `in` operator throws on primitives and null, so bail out early.
        if (data === null || typeof data !== "object") {
            return documents;
        }
        const responseTypes = [
            "answer_box",
            "sports_results",
            "shopping_results",
            "knowledge_graph",
            "organic_results",
        ];
        for (const responseType of responseTypes) {
            if (responseType in data) {
                documents.push(...this.extractDocuments(data[responseType], responseType));
            }
        }
        return documents;
    }
    /**
     * Fetches the data from the provided URL and returns it as a JSON object.
     * @param url - The URL to fetch data from.
     * @returns A promise that resolves to the fetched data as a JSON object.
     * @throws An error if the body is not valid JSON, is not a JSON object,
     * or carries a truthy `error` field.
     */
    async fetchData(url) {
        const response = await fetch(url);
        let data;
        try {
            data = await response.json();
        }
        catch (error) {
            // Surface the HTTP status: a bare JSON SyntaxError hides the cause.
            throw new Error(`Failed to load search results from SerpAPI due to: response was not valid JSON (HTTP ${response.status}): ${error}`);
        }
        if (data === null || typeof data !== "object") {
            throw new Error(`Failed to load search results from SerpAPI due to: unexpected response body (HTTP ${response.status})`);
        }
        if (data.error) {
            throw new Error(`Failed to load search results from SerpAPI due to: ${data.error}`);
        }
        return data;
    }
    /**
     * Loads the search results from the SerpAPI.
     * @returns An array of Documents representing the search results.
     * @throws An error if the search results could not be loaded or processed.
     */
    async load() {
        const url = this.buildUrl();
        const data = await this.fetchData(url);
        try {
            return this.processResponseData(data);
        }
        catch (error) {
            console.error(error);
            throw new Error(`Failed to process search results from SerpAPI: ${error}`);
        }
    }
}
|
|
127
|
-
exports.SerpAPILoader = SerpAPILoader;
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
import { Document } from "@langchain/core/documents";
|
|
2
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
3
|
-
/**
 * Options accepted by the SerpAPI loader's constructor: the search query
 * and, optionally, the API key to authenticate with.
 */
interface SerpAPIParameters {
    /**
     * Search Query
     */
    q: string;
    /**
     * SerpAPI key. May be omitted by the caller.
     */
    apiKey?: string;
}
|
|
14
|
-
/**
 * @deprecated - Import from "@langchain/community/document_loaders/web/serpapi" instead. This entrypoint will be removed in 0.3.0.
 *
 * Document loader that queries SerpAPI and exposes the search results as
 * Documents. It extends the BaseDocumentLoader class.
 * @example
 * ```typescript
 * const loader = new SerpAPILoader({ q: "{query}", apiKey: "{apiKey}" });
 * const docs = await loader.load();
 * ```
 */
export declare class SerpAPILoader extends BaseDocumentLoader {
    private apiKey;
    private searchQuery;
    constructor(params: SerpAPIParameters);
    /**
     * Assembles the request URL for the SerpAPI search.
     * @returns The URL for the search request.
     */
    buildUrl(): string;
    /**
     * Turns one section of the response into Documents.
     * @param output - The output to extract documents from.
     * @param responseType - The type of the response to extract documents from.
     * @returns An array of Documents.
     */
    private extractDocuments;
    /**
     * Converts the response data of a SerpAPI search request into an array of Documents.
     * @param data - The response data from the SerpAPI search request.
     * @returns An array of Documents.
     */
    processResponseData(data: Record<string, unknown>): Document[];
    /**
     * Retrieves the given URL and returns the body parsed as a JSON object.
     * @param url - The URL to fetch data from.
     * @returns A promise that resolves to the fetched data as a JSON object.
     * @throws An error if the fetch operation fails.
     */
    private fetchData;
    /**
     * Runs the search against SerpAPI and returns the results.
     * @returns An array of Documents representing the search results.
     * @throws An error if the search results could not be loaded.
     */
    load(): Promise<Document[]>;
}
|
|
62
|
-
export {};
|
|
@@ -1,123 +0,0 @@
|
|
|
1
|
-
import { Document } from "@langchain/core/documents";
|
|
2
|
-
import { getEnvironmentVariable } from "@langchain/core/utils/env";
|
|
3
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
4
|
-
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
|
|
5
|
-
/* #__PURE__ */ logVersion020MigrationWarning({
|
|
6
|
-
oldEntrypointName: "document_loaders/web/serpapi",
|
|
7
|
-
newPackageName: "@langchain/community",
|
|
8
|
-
});
|
|
9
|
-
/**
|
|
10
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/serpapi" instead. This entrypoint will be removed in 0.3.0.
|
|
11
|
-
*
|
|
12
|
-
* Class representing a document loader for loading search results from
|
|
13
|
-
* the SerpAPI. It extends the BaseDocumentLoader class.
|
|
14
|
-
* @example
|
|
15
|
-
* ```typescript
|
|
16
|
-
* const loader = new SerpAPILoader({ q: "{query}", apiKey: "{apiKey}" });
|
|
17
|
-
* const docs = await loader.load();
|
|
18
|
-
* ```
|
|
19
|
-
*/
|
|
20
|
-
export class SerpAPILoader extends BaseDocumentLoader {
|
|
21
|
-
constructor(params) {
|
|
22
|
-
super();
|
|
23
|
-
Object.defineProperty(this, "apiKey", {
|
|
24
|
-
enumerable: true,
|
|
25
|
-
configurable: true,
|
|
26
|
-
writable: true,
|
|
27
|
-
value: void 0
|
|
28
|
-
});
|
|
29
|
-
Object.defineProperty(this, "searchQuery", {
|
|
30
|
-
enumerable: true,
|
|
31
|
-
configurable: true,
|
|
32
|
-
writable: true,
|
|
33
|
-
value: void 0
|
|
34
|
-
});
|
|
35
|
-
const { apiKey = getEnvironmentVariable("SERPAPI_API_KEY"), q } = params;
|
|
36
|
-
if (!apiKey) {
|
|
37
|
-
throw new Error("SerpAPI API key not set. You can set it as SERPAPI_API_KEY in your .env file, or pass it to SerpAPI.");
|
|
38
|
-
}
|
|
39
|
-
this.apiKey = apiKey;
|
|
40
|
-
this.searchQuery = q;
|
|
41
|
-
}
|
|
42
|
-
/**
|
|
43
|
-
* Builds the URL for the SerpAPI search request.
|
|
44
|
-
* @returns The URL for the search request.
|
|
45
|
-
*/
|
|
46
|
-
buildUrl() {
|
|
47
|
-
const params = new URLSearchParams();
|
|
48
|
-
params.append("api_key", this.apiKey);
|
|
49
|
-
params.append("q", this.searchQuery);
|
|
50
|
-
return `https://serpapi.com/search?${params.toString()}`;
|
|
51
|
-
}
|
|
52
|
-
/**
|
|
53
|
-
* Extracts documents from the provided output.
|
|
54
|
-
* @param output - The output to extract documents from.
|
|
55
|
-
* @param responseType - The type of the response to extract documents from.
|
|
56
|
-
* @returns An array of Documents.
|
|
57
|
-
*/
|
|
58
|
-
extractDocuments(output, responseType) {
|
|
59
|
-
const documents = [];
|
|
60
|
-
const results = Array.isArray(output) ? output : [output];
|
|
61
|
-
for (const result of results) {
|
|
62
|
-
const pageContent = JSON.stringify(result);
|
|
63
|
-
const metadata = {
|
|
64
|
-
source: "SerpAPI",
|
|
65
|
-
responseType,
|
|
66
|
-
};
|
|
67
|
-
documents.push(new Document({ pageContent, metadata }));
|
|
68
|
-
}
|
|
69
|
-
return documents;
|
|
70
|
-
}
|
|
71
|
-
/**
|
|
72
|
-
* Processes the response data from the SerpAPI search request and converts it into an array of Documents.
|
|
73
|
-
* @param data - The response data from the SerpAPI search request.
|
|
74
|
-
* @returns An array of Documents.
|
|
75
|
-
*/
|
|
76
|
-
processResponseData(data) {
|
|
77
|
-
const documents = [];
|
|
78
|
-
const responseTypes = [
|
|
79
|
-
"answer_box",
|
|
80
|
-
"sports_results",
|
|
81
|
-
"shopping_results",
|
|
82
|
-
"knowledge_graph",
|
|
83
|
-
"organic_results",
|
|
84
|
-
];
|
|
85
|
-
for (const responseType of responseTypes) {
|
|
86
|
-
if (responseType in data) {
|
|
87
|
-
documents.push(...this.extractDocuments(data[responseType], responseType));
|
|
88
|
-
}
|
|
89
|
-
}
|
|
90
|
-
return documents;
|
|
91
|
-
}
|
|
92
|
-
/**
|
|
93
|
-
* Fetches the data from the provided URL and returns it as a JSON object.
|
|
94
|
-
* If an error occurs during the fetch operation, an exception is thrown with the error message.
|
|
95
|
-
* @param url - The URL to fetch data from.
|
|
96
|
-
* @returns A promise that resolves to the fetched data as a JSON object.
|
|
97
|
-
* @throws An error if the fetch operation fails.
|
|
98
|
-
*/
|
|
99
|
-
async fetchData(url) {
|
|
100
|
-
const response = await fetch(url);
|
|
101
|
-
const data = await response.json();
|
|
102
|
-
if (data.error) {
|
|
103
|
-
throw new Error(`Failed to load search results from SerpAPI due to: ${data.error}`);
|
|
104
|
-
}
|
|
105
|
-
return data;
|
|
106
|
-
}
|
|
107
|
-
/**
|
|
108
|
-
* Loads the search results from the SerpAPI.
|
|
109
|
-
* @returns An array of Documents representing the search results.
|
|
110
|
-
* @throws An error if the search results could not be loaded.
|
|
111
|
-
*/
|
|
112
|
-
async load() {
|
|
113
|
-
const url = this.buildUrl();
|
|
114
|
-
const data = await this.fetchData(url);
|
|
115
|
-
try {
|
|
116
|
-
return this.processResponseData(data);
|
|
117
|
-
}
|
|
118
|
-
catch (error) {
|
|
119
|
-
console.error(error);
|
|
120
|
-
throw new Error(`Failed to process search results from SerpAPI: ${error}`);
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
}
|
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.SitemapLoader = void 0;
|
|
4
|
-
const documents_1 = require("@langchain/core/documents");
|
|
5
|
-
const chunk_array_1 = require("@langchain/core/utils/chunk_array");
|
|
6
|
-
const cheerio_js_1 = require("./cheerio.cjs");
|
|
7
|
-
const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
|
|
8
|
-
/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
|
|
9
|
-
oldEntrypointName: "document_loaders/web/sitemap",
|
|
10
|
-
newPackageName: "@langchain/community",
|
|
11
|
-
});
|
|
12
|
-
const DEFAULT_CHUNK_SIZE = 300;
|
|
13
|
-
/**
|
|
14
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/sitemap" instead. This entrypoint will be removed in 0.3.0.
|
|
15
|
-
*/
|
|
16
|
-
class SitemapLoader extends cheerio_js_1.CheerioWebBaseLoader {
|
|
17
|
-
constructor(webPath, params = {}) {
|
|
18
|
-
const paramsWithDefaults = { chunkSize: DEFAULT_CHUNK_SIZE, ...params };
|
|
19
|
-
let path = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
|
|
20
|
-
// Allow for custom sitemap paths to be passed in with the url.
|
|
21
|
-
path = path.endsWith(".xml") ? path : `${path}/sitemap.xml`;
|
|
22
|
-
super(path, paramsWithDefaults);
|
|
23
|
-
Object.defineProperty(this, "webPath", {
|
|
24
|
-
enumerable: true,
|
|
25
|
-
configurable: true,
|
|
26
|
-
writable: true,
|
|
27
|
-
value: webPath
|
|
28
|
-
});
|
|
29
|
-
Object.defineProperty(this, "allowUrlPatterns", {
|
|
30
|
-
enumerable: true,
|
|
31
|
-
configurable: true,
|
|
32
|
-
writable: true,
|
|
33
|
-
value: void 0
|
|
34
|
-
});
|
|
35
|
-
Object.defineProperty(this, "chunkSize", {
|
|
36
|
-
enumerable: true,
|
|
37
|
-
configurable: true,
|
|
38
|
-
writable: true,
|
|
39
|
-
value: void 0
|
|
40
|
-
});
|
|
41
|
-
this.webPath = path;
|
|
42
|
-
this.allowUrlPatterns = paramsWithDefaults.filterUrls;
|
|
43
|
-
this.chunkSize = paramsWithDefaults.chunkSize;
|
|
44
|
-
}
|
|
45
|
-
_checkUrlPatterns(url) {
|
|
46
|
-
if (!this.allowUrlPatterns) {
|
|
47
|
-
return false;
|
|
48
|
-
}
|
|
49
|
-
return !this.allowUrlPatterns.some((pattern) => !new RegExp(pattern).test(url));
|
|
50
|
-
}
|
|
51
|
-
async parseSitemap() {
|
|
52
|
-
const $ = await cheerio_js_1.CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, {
|
|
53
|
-
xmlMode: true,
|
|
54
|
-
xml: true,
|
|
55
|
-
});
|
|
56
|
-
const elements = [];
|
|
57
|
-
$("url").each((_, element) => {
|
|
58
|
-
const loc = $(element).find("loc").text();
|
|
59
|
-
if (!loc) {
|
|
60
|
-
return;
|
|
61
|
-
}
|
|
62
|
-
if (this._checkUrlPatterns(loc)) {
|
|
63
|
-
return;
|
|
64
|
-
}
|
|
65
|
-
const changefreq = $(element).find("changefreq").text();
|
|
66
|
-
const lastmod = $(element).find("lastmod").text();
|
|
67
|
-
const priority = $(element).find("priority").text();
|
|
68
|
-
elements.push({ loc, changefreq, lastmod, priority });
|
|
69
|
-
});
|
|
70
|
-
$("sitemap").each((_, element) => {
|
|
71
|
-
const loc = $(element).find("loc").text();
|
|
72
|
-
if (!loc) {
|
|
73
|
-
return;
|
|
74
|
-
}
|
|
75
|
-
const changefreq = $(element).find("changefreq").text();
|
|
76
|
-
const lastmod = $(element).find("lastmod").text();
|
|
77
|
-
const priority = $(element).find("priority").text();
|
|
78
|
-
elements.push({ loc, changefreq, lastmod, priority });
|
|
79
|
-
});
|
|
80
|
-
return elements;
|
|
81
|
-
}
|
|
82
|
-
async _loadSitemapUrls(elements) {
|
|
83
|
-
const all = await cheerio_js_1.CheerioWebBaseLoader.scrapeAll(elements.map((ele) => ele.loc), this.caller, this.timeout, this.textDecoder);
|
|
84
|
-
const documents = all.map(($, i) => {
|
|
85
|
-
if (!elements[i]) {
|
|
86
|
-
throw new Error("Scraped docs and elements not in sync");
|
|
87
|
-
}
|
|
88
|
-
const text = $(this.selector).text();
|
|
89
|
-
const { loc: source, ...metadata } = elements[i];
|
|
90
|
-
// extract page metadata
|
|
91
|
-
const description = $("meta[name='description']").attr("content");
|
|
92
|
-
const title = $("meta[property='og:title']").attr("content");
|
|
93
|
-
const lang = $("meta[property='og:locale']").attr("content");
|
|
94
|
-
return new documents_1.Document({
|
|
95
|
-
pageContent: text,
|
|
96
|
-
metadata: {
|
|
97
|
-
...metadata,
|
|
98
|
-
description,
|
|
99
|
-
title,
|
|
100
|
-
lang,
|
|
101
|
-
source: source.trim(),
|
|
102
|
-
},
|
|
103
|
-
});
|
|
104
|
-
});
|
|
105
|
-
return documents;
|
|
106
|
-
}
|
|
107
|
-
async load() {
|
|
108
|
-
const elements = await this.parseSitemap();
|
|
109
|
-
const chunks = (0, chunk_array_1.chunkArray)(elements, this.chunkSize);
|
|
110
|
-
const documents = [];
|
|
111
|
-
for await (const chunk of chunks) {
|
|
112
|
-
const chunkedDocuments = await this._loadSitemapUrls(chunk);
|
|
113
|
-
documents.push(...chunkedDocuments);
|
|
114
|
-
}
|
|
115
|
-
return documents;
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
exports.SitemapLoader = SitemapLoader;
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import { Document, DocumentInterface } from "@langchain/core/documents";
|
|
2
|
-
import { CheerioWebBaseLoader, WebBaseLoaderParams } from "./cheerio.js";
|
|
3
|
-
/**
|
|
4
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/sitemap" instead. This entrypoint will be removed in 0.3.0.
|
|
5
|
-
*
|
|
6
|
-
* Interface representing the parameters for initializing a SitemapLoader.
|
|
7
|
-
* @interface SitemapLoaderParams
|
|
8
|
-
* @extends WebBaseLoaderParams
|
|
9
|
-
*/
|
|
10
|
-
export interface SitemapLoaderParams extends WebBaseLoaderParams {
|
|
11
|
-
/**
|
|
12
|
-
* @property {(string | RegExp)[] | undefined} filterUrls - A list of regexes. Only URLs that match one of the filter URLs will be loaded.
|
|
13
|
-
* WARNING: The filter URLs are interpreted as regular expressions. Escape special characters if needed.
|
|
14
|
-
*/
|
|
15
|
-
filterUrls?: (string | RegExp)[];
|
|
16
|
-
/**
|
|
17
|
-
* The size to chunk the sitemap URLs into for scraping.
|
|
18
|
-
* @default {300}
|
|
19
|
-
*/
|
|
20
|
-
chunkSize?: number;
|
|
21
|
-
}
|
|
22
|
-
type SiteMapElement = {
|
|
23
|
-
loc: string;
|
|
24
|
-
changefreq?: string;
|
|
25
|
-
lastmod?: string;
|
|
26
|
-
priority?: string;
|
|
27
|
-
};
|
|
28
|
-
/**
|
|
29
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/sitemap" instead. This entrypoint will be removed in 0.3.0.
|
|
30
|
-
*/
|
|
31
|
-
export declare class SitemapLoader extends CheerioWebBaseLoader implements SitemapLoaderParams {
|
|
32
|
-
webPath: string;
|
|
33
|
-
allowUrlPatterns: (string | RegExp)[] | undefined;
|
|
34
|
-
chunkSize: number;
|
|
35
|
-
constructor(webPath: string, params?: SitemapLoaderParams);
|
|
36
|
-
_checkUrlPatterns(url: string): boolean;
|
|
37
|
-
parseSitemap(): Promise<SiteMapElement[]>;
|
|
38
|
-
_loadSitemapUrls(elements: Array<SiteMapElement>): Promise<DocumentInterface[]>;
|
|
39
|
-
load(): Promise<Document[]>;
|
|
40
|
-
}
|
|
41
|
-
export {};
|