langchain 0.2.17 → 0.3.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/executor.cjs +11 -0
- package/dist/agents/executor.d.ts +2 -0
- package/dist/agents/executor.js +11 -0
- package/dist/agents/openai_functions/index.cjs +2 -2
- package/dist/agents/openai_functions/index.js +2 -2
- package/dist/agents/react/output_parser.cjs +3 -2
- package/dist/agents/react/output_parser.js +3 -2
- package/dist/chains/combine_documents/stuff.cjs +2 -2
- package/dist/chains/combine_documents/stuff.js +2 -2
- package/dist/chains/conversational_retrieval_chain.cjs +1 -1
- package/dist/chains/conversational_retrieval_chain.d.ts +1 -1
- package/dist/chains/conversational_retrieval_chain.js +1 -1
- package/dist/chains/llm_chain.cjs +1 -1
- package/dist/chains/llm_chain.d.ts +1 -1
- package/dist/chains/llm_chain.js +1 -1
- package/dist/chains/openai_functions/openapi.cjs +3 -1
- package/dist/chains/openai_functions/openapi.js +3 -1
- package/dist/chains/retrieval_qa.cjs +1 -1
- package/dist/chains/retrieval_qa.d.ts +1 -1
- package/dist/chains/retrieval_qa.js +1 -1
- package/dist/load/import_constants.cjs +2 -39
- package/dist/load/import_constants.js +2 -39
- package/dist/load/import_map.cjs +2 -3
- package/dist/load/import_map.d.ts +0 -1
- package/dist/load/import_map.js +0 -1
- package/dist/smith/config.d.ts +1 -5
- package/package.json +35 -858
- package/dist/document_loaders/fs/chatgpt.cjs +0 -90
- package/dist/document_loaders/fs/chatgpt.d.ts +0 -8
- package/dist/document_loaders/fs/chatgpt.js +0 -86
- package/dist/document_loaders/fs/csv.cjs +0 -73
- package/dist/document_loaders/fs/csv.d.ts +0 -65
- package/dist/document_loaders/fs/csv.js +0 -69
- package/dist/document_loaders/fs/docx.cjs +0 -58
- package/dist/document_loaders/fs/docx.d.ts +0 -25
- package/dist/document_loaders/fs/docx.js +0 -54
- package/dist/document_loaders/fs/epub.cjs +0 -103
- package/dist/document_loaders/fs/epub.d.ts +0 -33
- package/dist/document_loaders/fs/epub.js +0 -99
- package/dist/document_loaders/fs/notion.cjs +0 -26
- package/dist/document_loaders/fs/notion.d.ts +0 -12
- package/dist/document_loaders/fs/notion.js +0 -22
- package/dist/document_loaders/fs/obsidian.cjs +0 -247
- package/dist/document_loaders/fs/obsidian.d.ts +0 -28
- package/dist/document_loaders/fs/obsidian.js +0 -240
- package/dist/document_loaders/fs/openai_whisper_audio.cjs +0 -49
- package/dist/document_loaders/fs/openai_whisper_audio.d.ts +0 -23
- package/dist/document_loaders/fs/openai_whisper_audio.js +0 -45
- package/dist/document_loaders/fs/pdf.cjs +0 -148
- package/dist/document_loaders/fs/pdf.d.ts +0 -49
- package/dist/document_loaders/fs/pdf.js +0 -144
- package/dist/document_loaders/fs/pptx.cjs +0 -46
- package/dist/document_loaders/fs/pptx.d.ts +0 -25
- package/dist/document_loaders/fs/pptx.js +0 -42
- package/dist/document_loaders/fs/srt.cjs +0 -57
- package/dist/document_loaders/fs/srt.d.ts +0 -32
- package/dist/document_loaders/fs/srt.js +0 -50
- package/dist/document_loaders/fs/unstructured.cjs +0 -338
- package/dist/document_loaders/fs/unstructured.d.ts +0 -125
- package/dist/document_loaders/fs/unstructured.js +0 -333
- package/dist/document_loaders/web/apify_dataset.cjs +0 -130
- package/dist/document_loaders/web/apify_dataset.d.ts +0 -85
- package/dist/document_loaders/web/apify_dataset.js +0 -126
- package/dist/document_loaders/web/assemblyai.cjs +0 -200
- package/dist/document_loaders/web/assemblyai.d.ts +0 -95
- package/dist/document_loaders/web/assemblyai.js +0 -193
- package/dist/document_loaders/web/azure_blob_storage_container.cjs +0 -73
- package/dist/document_loaders/web/azure_blob_storage_container.d.ts +0 -46
- package/dist/document_loaders/web/azure_blob_storage_container.js +0 -69
- package/dist/document_loaders/web/azure_blob_storage_file.cjs +0 -124
- package/dist/document_loaders/web/azure_blob_storage_file.d.ts +0 -53
- package/dist/document_loaders/web/azure_blob_storage_file.js +0 -97
- package/dist/document_loaders/web/browserbase.cjs +0 -93
- package/dist/document_loaders/web/browserbase.d.ts +0 -48
- package/dist/document_loaders/web/browserbase.js +0 -86
- package/dist/document_loaders/web/cheerio.cjs +0 -118
- package/dist/document_loaders/web/cheerio.d.ts +0 -77
- package/dist/document_loaders/web/cheerio.js +0 -114
- package/dist/document_loaders/web/college_confidential.cjs +0 -41
- package/dist/document_loaders/web/college_confidential.d.ts +0 -25
- package/dist/document_loaders/web/college_confidential.js +0 -37
- package/dist/document_loaders/web/confluence.cjs +0 -190
- package/dist/document_loaders/web/confluence.d.ts +0 -114
- package/dist/document_loaders/web/confluence.js +0 -186
- package/dist/document_loaders/web/couchbase.cjs +0 -95
- package/dist/document_loaders/web/couchbase.d.ts +0 -32
- package/dist/document_loaders/web/couchbase.js +0 -91
- package/dist/document_loaders/web/figma.cjs +0 -102
- package/dist/document_loaders/web/figma.d.ts +0 -82
- package/dist/document_loaders/web/figma.js +0 -98
- package/dist/document_loaders/web/firecrawl.cjs +0 -95
- package/dist/document_loaders/web/firecrawl.d.ts +0 -50
- package/dist/document_loaders/web/firecrawl.js +0 -88
- package/dist/document_loaders/web/gitbook.cjs +0 -110
- package/dist/document_loaders/web/gitbook.d.ts +0 -55
- package/dist/document_loaders/web/gitbook.js +0 -106
- package/dist/document_loaders/web/github.cjs +0 -615
- package/dist/document_loaders/web/github.d.ts +0 -203
- package/dist/document_loaders/web/github.js +0 -608
- package/dist/document_loaders/web/hn.cjs +0 -90
- package/dist/document_loaders/web/hn.d.ts +0 -42
- package/dist/document_loaders/web/hn.js +0 -86
- package/dist/document_loaders/web/imsdb.cjs +0 -44
- package/dist/document_loaders/web/imsdb.d.ts +0 -23
- package/dist/document_loaders/web/imsdb.js +0 -40
- package/dist/document_loaders/web/notionapi.cjs +0 -404
- package/dist/document_loaders/web/notionapi.d.ts +0 -133
- package/dist/document_loaders/web/notionapi.js +0 -392
- package/dist/document_loaders/web/notiondb.cjs +0 -199
- package/dist/document_loaders/web/notiondb.d.ts +0 -56
- package/dist/document_loaders/web/notiondb.js +0 -195
- package/dist/document_loaders/web/pdf.cjs +0 -140
- package/dist/document_loaders/web/pdf.d.ts +0 -35
- package/dist/document_loaders/web/pdf.js +0 -136
- package/dist/document_loaders/web/playwright.cjs +0 -89
- package/dist/document_loaders/web/playwright.d.ts +0 -58
- package/dist/document_loaders/web/playwright.js +0 -85
- package/dist/document_loaders/web/puppeteer.cjs +0 -139
- package/dist/document_loaders/web/puppeteer.d.ts +0 -82
- package/dist/document_loaders/web/puppeteer.js +0 -135
- package/dist/document_loaders/web/recursive_url.cjs +0 -198
- package/dist/document_loaders/web/recursive_url.d.ts +0 -33
- package/dist/document_loaders/web/recursive_url.js +0 -194
- package/dist/document_loaders/web/s3.cjs +0 -164
- package/dist/document_loaders/web/s3.d.ts +0 -78
- package/dist/document_loaders/web/s3.js +0 -137
- package/dist/document_loaders/web/searchapi.cjs +0 -150
- package/dist/document_loaders/web/searchapi.d.ts +0 -76
- package/dist/document_loaders/web/searchapi.js +0 -146
- package/dist/document_loaders/web/serpapi.cjs +0 -127
- package/dist/document_loaders/web/serpapi.d.ts +0 -62
- package/dist/document_loaders/web/serpapi.js +0 -123
- package/dist/document_loaders/web/sitemap.cjs +0 -118
- package/dist/document_loaders/web/sitemap.d.ts +0 -41
- package/dist/document_loaders/web/sitemap.js +0 -114
- package/dist/document_loaders/web/sonix_audio.cjs +0 -68
- package/dist/document_loaders/web/sonix_audio.d.ts +0 -36
- package/dist/document_loaders/web/sonix_audio.js +0 -64
- package/dist/document_loaders/web/sort_xyz_blockchain.cjs +0 -157
- package/dist/document_loaders/web/sort_xyz_blockchain.d.ts +0 -78
- package/dist/document_loaders/web/sort_xyz_blockchain.js +0 -153
- package/dist/document_loaders/web/youtube.cjs +0 -116
- package/dist/document_loaders/web/youtube.d.ts +0 -55
- package/dist/document_loaders/web/youtube.js +0 -112
- package/dist/experimental/chrome_ai/app/dist/bundle.cjs +0 -1250
- package/dist/experimental/chrome_ai/app/dist/bundle.d.ts +0 -1
- package/dist/experimental/chrome_ai/app/dist/bundle.js +0 -1249
- package/dist/experimental/tools/pyinterpreter.cjs +0 -248
- package/dist/experimental/tools/pyinterpreter.d.ts +0 -18
- package/dist/experimental/tools/pyinterpreter.js +0 -244
- package/dist/retrievers/self_query/chroma.cjs +0 -48
- package/dist/retrievers/self_query/chroma.d.ts +0 -26
- package/dist/retrievers/self_query/chroma.js +0 -44
- package/dist/retrievers/self_query/pinecone.cjs +0 -47
- package/dist/retrievers/self_query/pinecone.d.ts +0 -26
- package/dist/retrievers/self_query/pinecone.js +0 -43
- package/dist/retrievers/self_query/supabase.cjs +0 -278
- package/dist/retrievers/self_query/supabase.d.ts +0 -109
- package/dist/retrievers/self_query/supabase.js +0 -274
- package/dist/retrievers/self_query/supabase_utils.cjs +0 -264
- package/dist/retrievers/self_query/supabase_utils.d.ts +0 -101
- package/dist/retrievers/self_query/supabase_utils.js +0 -259
- package/dist/retrievers/self_query/vectara.cjs +0 -143
- package/dist/retrievers/self_query/vectara.d.ts +0 -42
- package/dist/retrievers/self_query/vectara.js +0 -139
- package/dist/retrievers/self_query/weaviate.cjs +0 -201
- package/dist/retrievers/self_query/weaviate.d.ts +0 -99
- package/dist/retrievers/self_query/weaviate.js +0 -197
- package/dist/types/assemblyai-types.cjs +0 -2
- package/dist/types/assemblyai-types.d.ts +0 -4
- package/dist/types/assemblyai-types.js +0 -1
- package/document_loaders/fs/chatgpt.cjs +0 -1
- package/document_loaders/fs/chatgpt.d.cts +0 -1
- package/document_loaders/fs/chatgpt.d.ts +0 -1
- package/document_loaders/fs/chatgpt.js +0 -1
- package/document_loaders/fs/csv.cjs +0 -1
- package/document_loaders/fs/csv.d.cts +0 -1
- package/document_loaders/fs/csv.d.ts +0 -1
- package/document_loaders/fs/csv.js +0 -1
- package/document_loaders/fs/docx.cjs +0 -1
- package/document_loaders/fs/docx.d.cts +0 -1
- package/document_loaders/fs/docx.d.ts +0 -1
- package/document_loaders/fs/docx.js +0 -1
- package/document_loaders/fs/epub.cjs +0 -1
- package/document_loaders/fs/epub.d.cts +0 -1
- package/document_loaders/fs/epub.d.ts +0 -1
- package/document_loaders/fs/epub.js +0 -1
- package/document_loaders/fs/notion.cjs +0 -1
- package/document_loaders/fs/notion.d.cts +0 -1
- package/document_loaders/fs/notion.d.ts +0 -1
- package/document_loaders/fs/notion.js +0 -1
- package/document_loaders/fs/obsidian.cjs +0 -1
- package/document_loaders/fs/obsidian.d.cts +0 -1
- package/document_loaders/fs/obsidian.d.ts +0 -1
- package/document_loaders/fs/obsidian.js +0 -1
- package/document_loaders/fs/openai_whisper_audio.cjs +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.cts +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.ts +0 -1
- package/document_loaders/fs/openai_whisper_audio.js +0 -1
- package/document_loaders/fs/pdf.cjs +0 -1
- package/document_loaders/fs/pdf.d.cts +0 -1
- package/document_loaders/fs/pdf.d.ts +0 -1
- package/document_loaders/fs/pdf.js +0 -1
- package/document_loaders/fs/pptx.cjs +0 -1
- package/document_loaders/fs/pptx.d.cts +0 -1
- package/document_loaders/fs/pptx.d.ts +0 -1
- package/document_loaders/fs/pptx.js +0 -1
- package/document_loaders/fs/srt.cjs +0 -1
- package/document_loaders/fs/srt.d.cts +0 -1
- package/document_loaders/fs/srt.d.ts +0 -1
- package/document_loaders/fs/srt.js +0 -1
- package/document_loaders/fs/unstructured.cjs +0 -1
- package/document_loaders/fs/unstructured.d.cts +0 -1
- package/document_loaders/fs/unstructured.d.ts +0 -1
- package/document_loaders/fs/unstructured.js +0 -1
- package/document_loaders/web/apify_dataset.cjs +0 -1
- package/document_loaders/web/apify_dataset.d.cts +0 -1
- package/document_loaders/web/apify_dataset.d.ts +0 -1
- package/document_loaders/web/apify_dataset.js +0 -1
- package/document_loaders/web/assemblyai.cjs +0 -1
- package/document_loaders/web/assemblyai.d.cts +0 -1
- package/document_loaders/web/assemblyai.d.ts +0 -1
- package/document_loaders/web/assemblyai.js +0 -1
- package/document_loaders/web/azure_blob_storage_container.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_container.js +0 -1
- package/document_loaders/web/azure_blob_storage_file.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_file.js +0 -1
- package/document_loaders/web/browserbase.cjs +0 -1
- package/document_loaders/web/browserbase.d.cts +0 -1
- package/document_loaders/web/browserbase.d.ts +0 -1
- package/document_loaders/web/browserbase.js +0 -1
- package/document_loaders/web/cheerio.cjs +0 -1
- package/document_loaders/web/cheerio.d.cts +0 -1
- package/document_loaders/web/cheerio.d.ts +0 -1
- package/document_loaders/web/cheerio.js +0 -1
- package/document_loaders/web/college_confidential.cjs +0 -1
- package/document_loaders/web/college_confidential.d.cts +0 -1
- package/document_loaders/web/college_confidential.d.ts +0 -1
- package/document_loaders/web/college_confidential.js +0 -1
- package/document_loaders/web/confluence.cjs +0 -1
- package/document_loaders/web/confluence.d.cts +0 -1
- package/document_loaders/web/confluence.d.ts +0 -1
- package/document_loaders/web/confluence.js +0 -1
- package/document_loaders/web/couchbase.cjs +0 -1
- package/document_loaders/web/couchbase.d.cts +0 -1
- package/document_loaders/web/couchbase.d.ts +0 -1
- package/document_loaders/web/couchbase.js +0 -1
- package/document_loaders/web/figma.cjs +0 -1
- package/document_loaders/web/figma.d.cts +0 -1
- package/document_loaders/web/figma.d.ts +0 -1
- package/document_loaders/web/figma.js +0 -1
- package/document_loaders/web/firecrawl.cjs +0 -1
- package/document_loaders/web/firecrawl.d.cts +0 -1
- package/document_loaders/web/firecrawl.d.ts +0 -1
- package/document_loaders/web/firecrawl.js +0 -1
- package/document_loaders/web/gitbook.cjs +0 -1
- package/document_loaders/web/gitbook.d.cts +0 -1
- package/document_loaders/web/gitbook.d.ts +0 -1
- package/document_loaders/web/gitbook.js +0 -1
- package/document_loaders/web/github.cjs +0 -1
- package/document_loaders/web/github.d.cts +0 -1
- package/document_loaders/web/github.d.ts +0 -1
- package/document_loaders/web/github.js +0 -1
- package/document_loaders/web/hn.cjs +0 -1
- package/document_loaders/web/hn.d.cts +0 -1
- package/document_loaders/web/hn.d.ts +0 -1
- package/document_loaders/web/hn.js +0 -1
- package/document_loaders/web/imsdb.cjs +0 -1
- package/document_loaders/web/imsdb.d.cts +0 -1
- package/document_loaders/web/imsdb.d.ts +0 -1
- package/document_loaders/web/imsdb.js +0 -1
- package/document_loaders/web/notionapi.cjs +0 -1
- package/document_loaders/web/notionapi.d.cts +0 -1
- package/document_loaders/web/notionapi.d.ts +0 -1
- package/document_loaders/web/notionapi.js +0 -1
- package/document_loaders/web/notiondb.cjs +0 -1
- package/document_loaders/web/notiondb.d.cts +0 -1
- package/document_loaders/web/notiondb.d.ts +0 -1
- package/document_loaders/web/notiondb.js +0 -1
- package/document_loaders/web/pdf.cjs +0 -1
- package/document_loaders/web/pdf.d.cts +0 -1
- package/document_loaders/web/pdf.d.ts +0 -1
- package/document_loaders/web/pdf.js +0 -1
- package/document_loaders/web/playwright.cjs +0 -1
- package/document_loaders/web/playwright.d.cts +0 -1
- package/document_loaders/web/playwright.d.ts +0 -1
- package/document_loaders/web/playwright.js +0 -1
- package/document_loaders/web/puppeteer.cjs +0 -1
- package/document_loaders/web/puppeteer.d.cts +0 -1
- package/document_loaders/web/puppeteer.d.ts +0 -1
- package/document_loaders/web/puppeteer.js +0 -1
- package/document_loaders/web/recursive_url.cjs +0 -1
- package/document_loaders/web/recursive_url.d.cts +0 -1
- package/document_loaders/web/recursive_url.d.ts +0 -1
- package/document_loaders/web/recursive_url.js +0 -1
- package/document_loaders/web/s3.cjs +0 -1
- package/document_loaders/web/s3.d.cts +0 -1
- package/document_loaders/web/s3.d.ts +0 -1
- package/document_loaders/web/s3.js +0 -1
- package/document_loaders/web/searchapi.cjs +0 -1
- package/document_loaders/web/searchapi.d.cts +0 -1
- package/document_loaders/web/searchapi.d.ts +0 -1
- package/document_loaders/web/searchapi.js +0 -1
- package/document_loaders/web/serpapi.cjs +0 -1
- package/document_loaders/web/serpapi.d.cts +0 -1
- package/document_loaders/web/serpapi.d.ts +0 -1
- package/document_loaders/web/serpapi.js +0 -1
- package/document_loaders/web/sitemap.cjs +0 -1
- package/document_loaders/web/sitemap.d.cts +0 -1
- package/document_loaders/web/sitemap.d.ts +0 -1
- package/document_loaders/web/sitemap.js +0 -1
- package/document_loaders/web/sonix_audio.cjs +0 -1
- package/document_loaders/web/sonix_audio.d.cts +0 -1
- package/document_loaders/web/sonix_audio.d.ts +0 -1
- package/document_loaders/web/sonix_audio.js +0 -1
- package/document_loaders/web/sort_xyz_blockchain.cjs +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.cts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.ts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.js +0 -1
- package/document_loaders/web/youtube.cjs +0 -1
- package/document_loaders/web/youtube.d.cts +0 -1
- package/document_loaders/web/youtube.d.ts +0 -1
- package/document_loaders/web/youtube.js +0 -1
- package/experimental/tools/pyinterpreter.cjs +0 -1
- package/experimental/tools/pyinterpreter.d.cts +0 -1
- package/experimental/tools/pyinterpreter.d.ts +0 -1
- package/experimental/tools/pyinterpreter.js +0 -1
- package/memory/index.cjs +0 -1
- package/memory/index.d.cts +0 -1
- package/memory/index.d.ts +0 -1
- package/memory/index.js +0 -1
- package/retrievers/self_query/chroma.cjs +0 -1
- package/retrievers/self_query/chroma.d.cts +0 -1
- package/retrievers/self_query/chroma.d.ts +0 -1
- package/retrievers/self_query/chroma.js +0 -1
- package/retrievers/self_query/pinecone.cjs +0 -1
- package/retrievers/self_query/pinecone.d.cts +0 -1
- package/retrievers/self_query/pinecone.d.ts +0 -1
- package/retrievers/self_query/pinecone.js +0 -1
- package/retrievers/self_query/supabase.cjs +0 -1
- package/retrievers/self_query/supabase.d.cts +0 -1
- package/retrievers/self_query/supabase.d.ts +0 -1
- package/retrievers/self_query/supabase.js +0 -1
- package/retrievers/self_query/vectara.cjs +0 -1
- package/retrievers/self_query/vectara.d.cts +0 -1
- package/retrievers/self_query/vectara.d.ts +0 -1
- package/retrievers/self_query/vectara.js +0 -1
- package/retrievers/self_query/weaviate.cjs +0 -1
- package/retrievers/self_query/weaviate.d.cts +0 -1
- package/retrievers/self_query/weaviate.d.ts +0 -1
- package/retrievers/self_query/weaviate.js +0 -1
|
@@ -1,333 +0,0 @@
|
|
|
1
|
-
import { Document } from "@langchain/core/documents";
|
|
2
|
-
import { getEnv } from "@langchain/core/utils/env";
|
|
3
|
-
import { DirectoryLoader, UnknownHandling, } from "./directory.js";
|
|
4
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
5
|
-
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
|
|
6
|
-
/* #__PURE__ */ logVersion020MigrationWarning({
|
|
7
|
-
oldEntrypointName: "document_loaders/fs/unstructured",
|
|
8
|
-
newPackageName: "@langchain/community",
|
|
9
|
-
});
|
|
10
|
-
const UNSTRUCTURED_API_FILETYPES = [
|
|
11
|
-
".txt",
|
|
12
|
-
".text",
|
|
13
|
-
".pdf",
|
|
14
|
-
".docx",
|
|
15
|
-
".doc",
|
|
16
|
-
".jpg",
|
|
17
|
-
".jpeg",
|
|
18
|
-
".eml",
|
|
19
|
-
".html",
|
|
20
|
-
".htm",
|
|
21
|
-
".md",
|
|
22
|
-
".pptx",
|
|
23
|
-
".ppt",
|
|
24
|
-
".msg",
|
|
25
|
-
".rtf",
|
|
26
|
-
".xlsx",
|
|
27
|
-
".xls",
|
|
28
|
-
".odt",
|
|
29
|
-
".epub",
|
|
30
|
-
];
|
|
31
|
-
/**
|
|
32
|
-
* @deprecated - Import from "@langchain/community/document_loaders/fs/unstructured" instead. This entrypoint will be removed in 0.3.0.
|
|
33
|
-
*
|
|
34
|
-
* A document loader that uses the Unstructured API to load unstructured
|
|
35
|
-
* documents. It supports both the new syntax with options object and the
|
|
36
|
-
* legacy syntax for backward compatibility. The load() method sends a
|
|
37
|
-
* partitioning request to the Unstructured API and retrieves the
|
|
38
|
-
* partitioned elements. It creates a Document instance for each element
|
|
39
|
-
* and returns an array of Document instances.
|
|
40
|
-
*/
|
|
41
|
-
export class UnstructuredLoader extends BaseDocumentLoader {
|
|
42
|
-
constructor(filePathOrLegacyApiUrlOrMemoryBuffer, optionsOrLegacyFilePath = {}) {
|
|
43
|
-
super();
|
|
44
|
-
Object.defineProperty(this, "filePath", {
|
|
45
|
-
enumerable: true,
|
|
46
|
-
configurable: true,
|
|
47
|
-
writable: true,
|
|
48
|
-
value: void 0
|
|
49
|
-
});
|
|
50
|
-
Object.defineProperty(this, "buffer", {
|
|
51
|
-
enumerable: true,
|
|
52
|
-
configurable: true,
|
|
53
|
-
writable: true,
|
|
54
|
-
value: void 0
|
|
55
|
-
});
|
|
56
|
-
Object.defineProperty(this, "fileName", {
|
|
57
|
-
enumerable: true,
|
|
58
|
-
configurable: true,
|
|
59
|
-
writable: true,
|
|
60
|
-
value: void 0
|
|
61
|
-
});
|
|
62
|
-
Object.defineProperty(this, "apiUrl", {
|
|
63
|
-
enumerable: true,
|
|
64
|
-
configurable: true,
|
|
65
|
-
writable: true,
|
|
66
|
-
value: "https://api.unstructured.io/general/v0/general"
|
|
67
|
-
});
|
|
68
|
-
Object.defineProperty(this, "apiKey", {
|
|
69
|
-
enumerable: true,
|
|
70
|
-
configurable: true,
|
|
71
|
-
writable: true,
|
|
72
|
-
value: void 0
|
|
73
|
-
});
|
|
74
|
-
Object.defineProperty(this, "strategy", {
|
|
75
|
-
enumerable: true,
|
|
76
|
-
configurable: true,
|
|
77
|
-
writable: true,
|
|
78
|
-
value: "hi_res"
|
|
79
|
-
});
|
|
80
|
-
Object.defineProperty(this, "encoding", {
|
|
81
|
-
enumerable: true,
|
|
82
|
-
configurable: true,
|
|
83
|
-
writable: true,
|
|
84
|
-
value: void 0
|
|
85
|
-
});
|
|
86
|
-
Object.defineProperty(this, "ocrLanguages", {
|
|
87
|
-
enumerable: true,
|
|
88
|
-
configurable: true,
|
|
89
|
-
writable: true,
|
|
90
|
-
value: []
|
|
91
|
-
});
|
|
92
|
-
Object.defineProperty(this, "coordinates", {
|
|
93
|
-
enumerable: true,
|
|
94
|
-
configurable: true,
|
|
95
|
-
writable: true,
|
|
96
|
-
value: void 0
|
|
97
|
-
});
|
|
98
|
-
Object.defineProperty(this, "pdfInferTableStructure", {
|
|
99
|
-
enumerable: true,
|
|
100
|
-
configurable: true,
|
|
101
|
-
writable: true,
|
|
102
|
-
value: void 0
|
|
103
|
-
});
|
|
104
|
-
Object.defineProperty(this, "xmlKeepTags", {
|
|
105
|
-
enumerable: true,
|
|
106
|
-
configurable: true,
|
|
107
|
-
writable: true,
|
|
108
|
-
value: void 0
|
|
109
|
-
});
|
|
110
|
-
Object.defineProperty(this, "skipInferTableTypes", {
|
|
111
|
-
enumerable: true,
|
|
112
|
-
configurable: true,
|
|
113
|
-
writable: true,
|
|
114
|
-
value: void 0
|
|
115
|
-
});
|
|
116
|
-
Object.defineProperty(this, "hiResModelName", {
|
|
117
|
-
enumerable: true,
|
|
118
|
-
configurable: true,
|
|
119
|
-
writable: true,
|
|
120
|
-
value: void 0
|
|
121
|
-
});
|
|
122
|
-
Object.defineProperty(this, "includePageBreaks", {
|
|
123
|
-
enumerable: true,
|
|
124
|
-
configurable: true,
|
|
125
|
-
writable: true,
|
|
126
|
-
value: void 0
|
|
127
|
-
});
|
|
128
|
-
Object.defineProperty(this, "chunkingStrategy", {
|
|
129
|
-
enumerable: true,
|
|
130
|
-
configurable: true,
|
|
131
|
-
writable: true,
|
|
132
|
-
value: void 0
|
|
133
|
-
});
|
|
134
|
-
Object.defineProperty(this, "multiPageSections", {
|
|
135
|
-
enumerable: true,
|
|
136
|
-
configurable: true,
|
|
137
|
-
writable: true,
|
|
138
|
-
value: void 0
|
|
139
|
-
});
|
|
140
|
-
Object.defineProperty(this, "combineUnderNChars", {
|
|
141
|
-
enumerable: true,
|
|
142
|
-
configurable: true,
|
|
143
|
-
writable: true,
|
|
144
|
-
value: void 0
|
|
145
|
-
});
|
|
146
|
-
Object.defineProperty(this, "newAfterNChars", {
|
|
147
|
-
enumerable: true,
|
|
148
|
-
configurable: true,
|
|
149
|
-
writable: true,
|
|
150
|
-
value: void 0
|
|
151
|
-
});
|
|
152
|
-
Object.defineProperty(this, "maxCharacters", {
|
|
153
|
-
enumerable: true,
|
|
154
|
-
configurable: true,
|
|
155
|
-
writable: true,
|
|
156
|
-
value: void 0
|
|
157
|
-
});
|
|
158
|
-
// Temporary shim to avoid breaking existing users
|
|
159
|
-
// Remove when API keys are enforced by Unstructured and existing code will break anyway
|
|
160
|
-
const isLegacySyntax = typeof optionsOrLegacyFilePath === "string";
|
|
161
|
-
const isMemorySyntax = typeof filePathOrLegacyApiUrlOrMemoryBuffer === "object";
|
|
162
|
-
if (isMemorySyntax) {
|
|
163
|
-
this.buffer = filePathOrLegacyApiUrlOrMemoryBuffer.buffer;
|
|
164
|
-
this.fileName = filePathOrLegacyApiUrlOrMemoryBuffer.fileName;
|
|
165
|
-
}
|
|
166
|
-
else if (isLegacySyntax) {
|
|
167
|
-
this.filePath = optionsOrLegacyFilePath;
|
|
168
|
-
this.apiUrl = filePathOrLegacyApiUrlOrMemoryBuffer;
|
|
169
|
-
}
|
|
170
|
-
else {
|
|
171
|
-
this.filePath = filePathOrLegacyApiUrlOrMemoryBuffer;
|
|
172
|
-
}
|
|
173
|
-
if (!isLegacySyntax) {
|
|
174
|
-
const options = optionsOrLegacyFilePath;
|
|
175
|
-
this.apiKey = options.apiKey;
|
|
176
|
-
this.apiUrl = options.apiUrl ?? this.apiUrl;
|
|
177
|
-
this.strategy = options.strategy ?? this.strategy;
|
|
178
|
-
this.encoding = options.encoding;
|
|
179
|
-
this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages;
|
|
180
|
-
this.coordinates = options.coordinates;
|
|
181
|
-
this.pdfInferTableStructure = options.pdfInferTableStructure;
|
|
182
|
-
this.xmlKeepTags = options.xmlKeepTags;
|
|
183
|
-
this.skipInferTableTypes = options.skipInferTableTypes;
|
|
184
|
-
this.hiResModelName = options.hiResModelName;
|
|
185
|
-
this.includePageBreaks = options.includePageBreaks;
|
|
186
|
-
this.chunkingStrategy = options.chunkingStrategy;
|
|
187
|
-
this.multiPageSections = options.multiPageSections;
|
|
188
|
-
this.combineUnderNChars = options.combineUnderNChars;
|
|
189
|
-
this.newAfterNChars = options.newAfterNChars;
|
|
190
|
-
this.maxCharacters = options.maxCharacters;
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
async _partition() {
|
|
194
|
-
let { buffer } = this;
|
|
195
|
-
let { fileName } = this;
|
|
196
|
-
if (!buffer) {
|
|
197
|
-
const { readFile, basename } = await this.imports();
|
|
198
|
-
buffer = await readFile(this.filePath);
|
|
199
|
-
fileName = basename(this.filePath);
|
|
200
|
-
// I'm aware this reads the file into memory first, but we have lots of work
|
|
201
|
-
// to do on then consuming Documents in a streaming fashion anyway, so not
|
|
202
|
-
// worried about this for now.
|
|
203
|
-
}
|
|
204
|
-
const formData = new FormData();
|
|
205
|
-
formData.append("files", new Blob([buffer]), fileName);
|
|
206
|
-
formData.append("strategy", this.strategy);
|
|
207
|
-
this.ocrLanguages.forEach((language) => {
|
|
208
|
-
formData.append("ocr_languages", language);
|
|
209
|
-
});
|
|
210
|
-
if (this.encoding) {
|
|
211
|
-
formData.append("encoding", this.encoding);
|
|
212
|
-
}
|
|
213
|
-
if (this.coordinates === true) {
|
|
214
|
-
formData.append("coordinates", "true");
|
|
215
|
-
}
|
|
216
|
-
if (this.pdfInferTableStructure === true) {
|
|
217
|
-
formData.append("pdf_infer_table_structure", "true");
|
|
218
|
-
}
|
|
219
|
-
if (this.xmlKeepTags === true) {
|
|
220
|
-
formData.append("xml_keep_tags", "true");
|
|
221
|
-
}
|
|
222
|
-
if (this.skipInferTableTypes) {
|
|
223
|
-
formData.append("skip_infer_table_types", JSON.stringify(this.skipInferTableTypes));
|
|
224
|
-
}
|
|
225
|
-
if (this.hiResModelName) {
|
|
226
|
-
formData.append("hi_res_model_name", this.hiResModelName);
|
|
227
|
-
}
|
|
228
|
-
if (this.includePageBreaks) {
|
|
229
|
-
formData.append("include_page_breaks", "true");
|
|
230
|
-
}
|
|
231
|
-
if (this.chunkingStrategy) {
|
|
232
|
-
formData.append("chunking_strategy", this.chunkingStrategy);
|
|
233
|
-
}
|
|
234
|
-
if (this.multiPageSections !== undefined) {
|
|
235
|
-
formData.append("multipage_sections", this.multiPageSections ? "true" : "false");
|
|
236
|
-
}
|
|
237
|
-
if (this.combineUnderNChars !== undefined) {
|
|
238
|
-
formData.append("combine_under_n_chars", String(this.combineUnderNChars));
|
|
239
|
-
}
|
|
240
|
-
if (this.newAfterNChars !== undefined) {
|
|
241
|
-
formData.append("new_after_n_chars", String(this.newAfterNChars));
|
|
242
|
-
}
|
|
243
|
-
if (this.maxCharacters !== undefined) {
|
|
244
|
-
formData.append("max_characters", String(this.maxCharacters));
|
|
245
|
-
}
|
|
246
|
-
const headers = {
|
|
247
|
-
"UNSTRUCTURED-API-KEY": this.apiKey ?? "",
|
|
248
|
-
};
|
|
249
|
-
const response = await fetch(this.apiUrl, {
|
|
250
|
-
method: "POST",
|
|
251
|
-
body: formData,
|
|
252
|
-
headers,
|
|
253
|
-
});
|
|
254
|
-
if (!response.ok) {
|
|
255
|
-
throw new Error(`Failed to partition file ${this.filePath} with error ${response.status} and message ${await response.text()}`);
|
|
256
|
-
}
|
|
257
|
-
const elements = await response.json();
|
|
258
|
-
if (!Array.isArray(elements)) {
|
|
259
|
-
throw new Error(`Expected partitioning request to return an array, but got ${elements}`);
|
|
260
|
-
}
|
|
261
|
-
return elements.filter((el) => typeof el.text === "string");
|
|
262
|
-
}
|
|
263
|
-
async load() {
|
|
264
|
-
const elements = await this._partition();
|
|
265
|
-
const documents = [];
|
|
266
|
-
for (const element of elements) {
|
|
267
|
-
const { metadata, text } = element;
|
|
268
|
-
if (typeof text === "string" && text !== "") {
|
|
269
|
-
documents.push(new Document({
|
|
270
|
-
pageContent: text,
|
|
271
|
-
metadata: {
|
|
272
|
-
...metadata,
|
|
273
|
-
category: element.type,
|
|
274
|
-
},
|
|
275
|
-
}));
|
|
276
|
-
}
|
|
277
|
-
}
|
|
278
|
-
return documents;
|
|
279
|
-
}
|
|
280
|
-
async imports() {
|
|
281
|
-
try {
|
|
282
|
-
const { readFile } = await import("node:fs/promises");
|
|
283
|
-
const { basename } = await import("node:path");
|
|
284
|
-
return { readFile, basename };
|
|
285
|
-
}
|
|
286
|
-
catch (e) {
|
|
287
|
-
console.error(e);
|
|
288
|
-
throw new Error(`Failed to load fs/promises. TextLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https://<link to docs> for alternatives.`);
|
|
289
|
-
}
|
|
290
|
-
}
|
|
291
|
-
}
|
|
292
|
-
/**
 * A document loader that loads unstructured documents from a directory
 * using the UnstructuredLoader. It creates a UnstructuredLoader factory
 * for each supported file type (every entry of
 * `UNSTRUCTURED_API_FILETYPES`) and passes the resulting map to the
 * DirectoryLoader constructor.
 * @example
 * ```typescript
 * const loader = new UnstructuredDirectoryLoader("path/to/directory", {
 *   apiKey: "MY_API_KEY",
 * });
 * const docs = await loader.load();
 * ```
 */
export class UnstructuredDirectoryLoader extends DirectoryLoader {
    /**
     * Accepts two call shapes:
     * - current: `(directoryPath, options)`
     * - legacy: `(apiUrl, directoryPath, recursive, unknown)`, selected
     *   when the second argument is a string.
     *
     * @param directoryPathOrLegacyApiUrl Directory to load, or the API URL
     * when the legacy call shape is used.
     * @param optionsOrLegacyDirectoryPath Options object forwarded to every
     * UnstructuredLoader (its `recursive` and `unknown` fields are also
     * forwarded to DirectoryLoader), or the directory path when the legacy
     * call shape is used. May be omitted.
     * @param legacyOptionRecursive Legacy call shape only: value used for
     * `recursive`.
     * @param legacyOptionUnknown Legacy call shape only: value used for
     * `unknown`.
     */
    constructor(directoryPathOrLegacyApiUrl, optionsOrLegacyDirectoryPath, legacyOptionRecursive = true, legacyOptionUnknown = UnknownHandling.Warn) {
        let directoryPath;
        let options;
        // Temporary shim to avoid breaking existing users
        // Remove when API keys are enforced by Unstructured and existing code will break anyway
        const isLegacySyntax = typeof optionsOrLegacyDirectoryPath === "string";
        if (isLegacySyntax) {
            directoryPath = optionsOrLegacyDirectoryPath;
            options = {
                apiUrl: directoryPathOrLegacyApiUrl,
                recursive: legacyOptionRecursive,
                unknown: legacyOptionUnknown,
            };
        }
        else {
            directoryPath = directoryPathOrLegacyApiUrl;
            // Fall back to an empty object so that omitting the options
            // argument does not throw when `options.recursive` and
            // `options.unknown` are read below.
            options = optionsOrLegacyDirectoryPath ?? {};
        }
        // A single factory is shared by every supported file type.
        const loader = (p) => new UnstructuredLoader(p, options);
        const loaders = UNSTRUCTURED_API_FILETYPES.reduce((loadersObject, filetype) => {
            // eslint-disable-next-line no-param-reassign
            loadersObject[filetype] = loader;
            return loadersObject;
        }, {});
        super(directoryPath, loaders, options.recursive, options.unknown);
    }
}
|
|
333
|
-
export { UnknownHandling };
|
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
/* eslint-disable @typescript-eslint/no-explicit-any */
|
|
3
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
-
exports.ApifyDatasetLoader = void 0;
|
|
5
|
-
const apify_client_1 = require("apify-client");
|
|
6
|
-
const async_caller_1 = require("@langchain/core/utils/async_caller");
|
|
7
|
-
const env_1 = require("@langchain/core/utils/env");
|
|
8
|
-
const base_js_1 = require("../base.cjs");
|
|
9
|
-
const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
|
|
10
|
-
/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
|
|
11
|
-
oldEntrypointName: "document_loaders/web/apify_dataset",
|
|
12
|
-
newPackageName: "@langchain/community",
|
|
13
|
-
});
|
|
14
|
-
/**
 * A class that extends the BaseDocumentLoader and implements the
 * DocumentLoader interface. It represents a document loader that loads
 * documents from an Apify dataset.
 *
 * @deprecated Import from "@langchain/community/document_loaders/web/apify_dataset" instead. This entrypoint will be removed in 0.3.0.
 * @example
 * ```typescript
 * const loader = new ApifyDatasetLoader("your-dataset-id", {
 *   datasetMappingFunction: (item) =>
 *     new Document({
 *       pageContent: item.text || "",
 *       metadata: { source: item.url },
 *     }),
 *   clientOptions: {
 *     token: "your-apify-token",
 *   },
 * });
 *
 * const docs = await loader.load();
 *
 * const chain = new RetrievalQAChain();
 * const res = await chain.invoke({ query: "What is LangChain?" });
 *
 * console.log(res.text);
 * console.log(res.sourceDocuments.map((d) => d.metadata.source));
 * ```
 */
class ApifyDatasetLoader extends base_js_1.BaseDocumentLoader {
    /**
     * @param datasetId The ID of the Apify dataset to read items from.
     * @param config Loader configuration.
     * @param config.datasetMappingFunction Function applied to every dataset item by `load()`.
     * @param config.clientOptions Options passed to the ApifyClient; when no `token` is given, the `APIFY_API_TOKEN` environment variable is used.
     * Any remaining fields of `config` are passed to the AsyncCaller constructor.
     */
    constructor(datasetId, config) {
        super();
        // Instance fields are defined first with an undefined value and
        // receive their real values at the end of the constructor.
        Object.defineProperty(this, "apifyClient", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "datasetId", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "datasetMappingFunction", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "caller", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        const { clientOptions, datasetMappingFunction, ...asyncCallerParams } = config;
        const token = ApifyDatasetLoader._getApifyApiToken(clientOptions);
        this.apifyClient = new apify_client_1.ApifyClient({ ...clientOptions, token });
        this.datasetId = datasetId;
        this.datasetMappingFunction = datasetMappingFunction;
        this.caller = new async_caller_1.AsyncCaller(asyncCallerParams);
    }
    /**
     * Resolves the Apify API token.
     * @param config Optional client options object.
     * @returns `config.token` when it is not null/undefined, otherwise the
     * value of the `APIFY_API_TOKEN` environment variable.
     */
    static _getApifyApiToken(config) {
        return config?.token ?? (0, env_1.getEnvironmentVariable)("APIFY_API_TOKEN");
    }
    /**
     * Retrieves the dataset items from the Apify platform and applies the
     * datasetMappingFunction to each item to create an array of Document
     * instances. Each mapping call is issued through the loader's
     * AsyncCaller, and results are flattened one level so the mapping
     * function may return either a single value or an array.
     * @returns An array of Document instances.
     */
    async load() {
        const dataset = await this.apifyClient
            .dataset(this.datasetId)
            .listItems({ clean: true });
        const documentList = await Promise.all(dataset.items.map((item) => this.caller.call(async () => this.datasetMappingFunction(item))));
        return documentList.flat();
    }
    /**
     * Create an ApifyDatasetLoader by calling an Actor on the Apify platform and waiting for its results to be ready.
     * @param actorId The ID or name of the Actor on the Apify platform.
     * @param input The input object of the Actor that you're trying to run.
     * @param config Settings for the Actor run and for the returned loader.
     * @param config.callOptions Options passed to the Actor call; defaults to `{}`.
     * @param config.clientOptions Options for the ApifyClient used both for the Actor call and by the returned loader.
     * @param config.datasetMappingFunction A function that takes a single object (an Apify dataset item) and converts it to an instance of the Document class.
     * @returns An instance of `ApifyDatasetLoader` with the results from the Actor run.
     */
    static async fromActorCall(actorId, input, config) {
        const apifyApiToken = ApifyDatasetLoader._getApifyApiToken(config.clientOptions);
        // Forward every client option (not only the token) so the Actor is
        // called with the same client configuration as the loader returned below.
        const apifyClient = new apify_client_1.ApifyClient({ ...config.clientOptions, token: apifyApiToken });
        const actorCall = await apifyClient
            .actor(actorId)
            .call(input, config.callOptions ?? {});
        return new ApifyDatasetLoader(actorCall.defaultDatasetId, {
            datasetMappingFunction: config.datasetMappingFunction,
            clientOptions: { ...config.clientOptions, token: apifyApiToken },
        });
    }
    /**
     * Create an ApifyDatasetLoader by calling a saved Actor task on the Apify platform and waiting for its results to be ready.
     * @param taskId The ID or name of the task on the Apify platform.
     * @param input The input object of the task that you're trying to run. Overrides the task's saved input.
     * @param config Settings for the task run and for the returned loader.
     * @param config.callOptions Options passed to the task call; defaults to `{}`.
     * @param config.clientOptions Options for the ApifyClient used both for the task call and by the returned loader.
     * @param config.datasetMappingFunction A function that takes a single object (an Apify dataset item) and converts it to an instance of the Document class.
     * @returns An instance of `ApifyDatasetLoader` with the results from the task's run.
     */
    static async fromActorTaskCall(taskId, input, config) {
        const apifyApiToken = ApifyDatasetLoader._getApifyApiToken(config.clientOptions);
        // Forward every client option (not only the token) so the task is
        // called with the same client configuration as the loader returned below.
        const apifyClient = new apify_client_1.ApifyClient({ ...config.clientOptions, token: apifyApiToken });
        const taskCall = await apifyClient
            .task(taskId)
            .call(input, config.callOptions ?? {});
        return new ApifyDatasetLoader(taskCall.defaultDatasetId, {
            datasetMappingFunction: config.datasetMappingFunction,
            clientOptions: { ...config.clientOptions, token: apifyApiToken },
        });
    }
}
|
|
130
|
-
exports.ApifyDatasetLoader = ApifyDatasetLoader;
|
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
import { ActorCallOptions, ApifyClient, ApifyClientOptions, TaskCallOptions } from "apify-client";
|
|
2
|
-
import { Document } from "@langchain/core/documents";
|
|
3
|
-
import { AsyncCaller, AsyncCallerParams } from "@langchain/core/utils/async_caller";
|
|
4
|
-
import { BaseDocumentLoader, DocumentLoader } from "../base.js";
|
|
5
|
-
/**
 * A type that represents a function that takes a single object (an Apify
 * dataset item) and converts it to an instance of the Document class.
 * The function may return one Document, an array of Documents, or a
 * Promise resolving to either.
 *
 * TODO: Change function signature to only be asynchronous for simplicity in v0.1.0
 * https://github.com/langchain-ai/langchainjs/pull/3262
 *
 * @deprecated Import from "@langchain/community/document_loaders/web/apify_dataset" instead. This entrypoint will be removed in 0.3.0.
 */
export type ApifyDatasetMappingFunction<Metadata extends Record<string, any>> = (item: Record<string | number, unknown>) => Document<Metadata> | Array<Document<Metadata>> | Promise<Document<Metadata> | Array<Document<Metadata>>>;
|
|
14
|
-
/**
 * Configuration accepted by the `ApifyDatasetLoader` constructor. In
 * addition to the members declared here it carries every field of
 * `AsyncCallerParams`.
 */
export interface ApifyDatasetLoaderConfig<Metadata extends Record<string, any>> extends AsyncCallerParams {
    /** Optional settings for the underlying `ApifyClient`. */
    clientOptions?: ApifyClientOptions;
    /** Converts one Apify dataset item into one or more Documents. */
    datasetMappingFunction: ApifyDatasetMappingFunction<Metadata>;
}
|
|
18
|
-
/**
 * A class that extends the BaseDocumentLoader and implements the
 * DocumentLoader interface. It represents a document loader that loads
 * documents from an Apify dataset.
 *
 * @deprecated Import from "@langchain/community/document_loaders/web/apify_dataset" instead. This entrypoint will be removed in 0.3.0.
 * @example
 * ```typescript
 * const loader = new ApifyDatasetLoader("your-dataset-id", {
 *   datasetMappingFunction: (item) =>
 *     new Document({
 *       pageContent: item.text || "",
 *       metadata: { source: item.url },
 *     }),
 *   clientOptions: {
 *     token: "your-apify-token",
 *   },
 * });
 *
 * const docs = await loader.load();
 *
 * const chain = new RetrievalQAChain();
 * const res = await chain.invoke({ query: "What is LangChain?" });
 *
 * console.log(res.text);
 * console.log(res.sourceDocuments.map((d) => d.metadata.source));
 * ```
 */
export declare class ApifyDatasetLoader<Metadata extends Record<string, any>> extends BaseDocumentLoader implements DocumentLoader {
    protected apifyClient: ApifyClient;
    protected datasetId: string;
    protected datasetMappingFunction: ApifyDatasetMappingFunction<Metadata>;
    protected caller: AsyncCaller;
    /**
     * @param datasetId The ID of the Apify dataset to load.
     * @param config The mapping function, optional client options and async caller parameters.
     */
    constructor(datasetId: string, config: ApifyDatasetLoaderConfig<Metadata>);
    private static _getApifyApiToken;
    /**
     * Retrieves the dataset items from the Apify platform and applies the
     * datasetMappingFunction to each item to create an array of Document
     * instances.
     * @returns An array of Document instances.
     */
    load(): Promise<Document<Metadata>[]>;
    /**
     * Create an ApifyDatasetLoader by calling an Actor on the Apify platform and waiting for its results to be ready.
     * @param actorId The ID or name of the Actor on the Apify platform.
     * @param input The input object of the Actor that you're trying to run.
     * @param config Settings for the Actor run and for the returned loader.
     * @param config.callOptions Optional options specifying settings for the Actor run.
     * @param config.clientOptions Optional options for the Apify client.
     * @param config.datasetMappingFunction A function that takes a single object (an Apify dataset item) and converts it to an instance of the Document class.
     * @returns An instance of `ApifyDatasetLoader` with the results from the Actor run.
     */
    static fromActorCall<Metadata extends Record<string, any>>(actorId: string, input: Record<string | number, unknown>, config: {
        callOptions?: ActorCallOptions;
        clientOptions?: ApifyClientOptions;
        datasetMappingFunction: ApifyDatasetMappingFunction<Metadata>;
    }): Promise<ApifyDatasetLoader<Metadata>>;
    /**
     * Create an ApifyDatasetLoader by calling a saved Actor task on the Apify platform and waiting for its results to be ready.
     * @param taskId The ID or name of the task on the Apify platform.
     * @param input The input object of the task that you're trying to run. Overrides the task's saved input.
     * @param config Settings for the task run and for the returned loader.
     * @param config.callOptions Optional options specifying settings for the task run.
     * @param config.clientOptions Optional options for the Apify client.
     * @param config.datasetMappingFunction A function that takes a single object (an Apify dataset item) and converts it to an instance of the Document class.
     * @returns An instance of `ApifyDatasetLoader` with the results from the task's run.
     */
    static fromActorTaskCall<Metadata extends Record<string, any>>(taskId: string, input: Record<string | number, unknown>, config: {
        callOptions?: TaskCallOptions;
        clientOptions?: ApifyClientOptions;
        datasetMappingFunction: ApifyDatasetMappingFunction<Metadata>;
    }): Promise<ApifyDatasetLoader<Metadata>>;
}
|
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
/* eslint-disable @typescript-eslint/no-explicit-any */
|
|
2
|
-
import { ApifyClient, } from "apify-client";
|
|
3
|
-
import { AsyncCaller, } from "@langchain/core/utils/async_caller";
|
|
4
|
-
import { getEnvironmentVariable } from "@langchain/core/utils/env";
|
|
5
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
6
|
-
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
|
|
7
|
-
/* #__PURE__ */ logVersion020MigrationWarning({
|
|
8
|
-
oldEntrypointName: "document_loaders/web/apify_dataset",
|
|
9
|
-
newPackageName: "@langchain/community",
|
|
10
|
-
});
|
|
11
|
-
/**
 * A class that extends the BaseDocumentLoader and implements the
 * DocumentLoader interface. It represents a document loader that loads
 * documents from an Apify dataset.
 *
 * @deprecated Import from "@langchain/community/document_loaders/web/apify_dataset" instead. This entrypoint will be removed in 0.3.0.
 * @example
 * ```typescript
 * const loader = new ApifyDatasetLoader("your-dataset-id", {
 *   datasetMappingFunction: (item) =>
 *     new Document({
 *       pageContent: item.text || "",
 *       metadata: { source: item.url },
 *     }),
 *   clientOptions: {
 *     token: "your-apify-token",
 *   },
 * });
 *
 * const docs = await loader.load();
 *
 * const chain = new RetrievalQAChain();
 * const res = await chain.invoke({ query: "What is LangChain?" });
 *
 * console.log(res.text);
 * console.log(res.sourceDocuments.map((d) => d.metadata.source));
 * ```
 */
export class ApifyDatasetLoader extends BaseDocumentLoader {
    /**
     * @param datasetId The ID of the Apify dataset to read items from.
     * @param config Loader configuration.
     * @param config.datasetMappingFunction Function applied to every dataset item by `load()`.
     * @param config.clientOptions Options passed to the ApifyClient; when no `token` is given, the `APIFY_API_TOKEN` environment variable is used.
     * Any remaining fields of `config` are passed to the AsyncCaller constructor.
     */
    constructor(datasetId, config) {
        super();
        // Instance fields are defined first with an undefined value and
        // receive their real values at the end of the constructor.
        Object.defineProperty(this, "apifyClient", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "datasetId", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "datasetMappingFunction", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "caller", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        const { clientOptions, datasetMappingFunction, ...asyncCallerParams } = config;
        const token = ApifyDatasetLoader._getApifyApiToken(clientOptions);
        this.apifyClient = new ApifyClient({ ...clientOptions, token });
        this.datasetId = datasetId;
        this.datasetMappingFunction = datasetMappingFunction;
        this.caller = new AsyncCaller(asyncCallerParams);
    }
    /**
     * Resolves the Apify API token.
     * @param config Optional client options object.
     * @returns `config.token` when it is not null/undefined, otherwise the
     * value of the `APIFY_API_TOKEN` environment variable.
     */
    static _getApifyApiToken(config) {
        return config?.token ?? getEnvironmentVariable("APIFY_API_TOKEN");
    }
    /**
     * Retrieves the dataset items from the Apify platform and applies the
     * datasetMappingFunction to each item to create an array of Document
     * instances. Each mapping call is issued through the loader's
     * AsyncCaller, and results are flattened one level so the mapping
     * function may return either a single value or an array.
     * @returns An array of Document instances.
     */
    async load() {
        const dataset = await this.apifyClient
            .dataset(this.datasetId)
            .listItems({ clean: true });
        const documentList = await Promise.all(dataset.items.map((item) => this.caller.call(async () => this.datasetMappingFunction(item))));
        return documentList.flat();
    }
    /**
     * Create an ApifyDatasetLoader by calling an Actor on the Apify platform and waiting for its results to be ready.
     * @param actorId The ID or name of the Actor on the Apify platform.
     * @param input The input object of the Actor that you're trying to run.
     * @param config Settings for the Actor run and for the returned loader.
     * @param config.callOptions Options passed to the Actor call; defaults to `{}`.
     * @param config.clientOptions Options for the ApifyClient used both for the Actor call and by the returned loader.
     * @param config.datasetMappingFunction A function that takes a single object (an Apify dataset item) and converts it to an instance of the Document class.
     * @returns An instance of `ApifyDatasetLoader` with the results from the Actor run.
     */
    static async fromActorCall(actorId, input, config) {
        const apifyApiToken = ApifyDatasetLoader._getApifyApiToken(config.clientOptions);
        // Forward every client option (not only the token) so the Actor is
        // called with the same client configuration as the loader returned below.
        const apifyClient = new ApifyClient({ ...config.clientOptions, token: apifyApiToken });
        const actorCall = await apifyClient
            .actor(actorId)
            .call(input, config.callOptions ?? {});
        return new ApifyDatasetLoader(actorCall.defaultDatasetId, {
            datasetMappingFunction: config.datasetMappingFunction,
            clientOptions: { ...config.clientOptions, token: apifyApiToken },
        });
    }
    /**
     * Create an ApifyDatasetLoader by calling a saved Actor task on the Apify platform and waiting for its results to be ready.
     * @param taskId The ID or name of the task on the Apify platform.
     * @param input The input object of the task that you're trying to run. Overrides the task's saved input.
     * @param config Settings for the task run and for the returned loader.
     * @param config.callOptions Options passed to the task call; defaults to `{}`.
     * @param config.clientOptions Options for the ApifyClient used both for the task call and by the returned loader.
     * @param config.datasetMappingFunction A function that takes a single object (an Apify dataset item) and converts it to an instance of the Document class.
     * @returns An instance of `ApifyDatasetLoader` with the results from the task's run.
     */
    static async fromActorTaskCall(taskId, input, config) {
        const apifyApiToken = ApifyDatasetLoader._getApifyApiToken(config.clientOptions);
        // Forward every client option (not only the token) so the task is
        // called with the same client configuration as the loader returned below.
        const apifyClient = new ApifyClient({ ...config.clientOptions, token: apifyApiToken });
        const taskCall = await apifyClient
            .task(taskId)
            .call(input, config.callOptions ?? {});
        return new ApifyDatasetLoader(taskCall.defaultDatasetId, {
            datasetMappingFunction: config.datasetMappingFunction,
            clientOptions: { ...config.clientOptions, token: apifyApiToken },
        });
    }
}
|