langchain 0.2.19 → 0.3.0
This diff compares the contents of publicly available package versions as published to their public registries. It is provided for informational purposes only.
- package/README.md +9 -9
- package/dist/agents/chat/index.cjs +1 -1
- package/dist/agents/chat/index.d.ts +1 -1
- package/dist/agents/chat/index.js +1 -1
- package/dist/agents/chat_convo/index.cjs +1 -1
- package/dist/agents/chat_convo/index.d.ts +1 -1
- package/dist/agents/chat_convo/index.js +1 -1
- package/dist/agents/mrkl/index.cjs +1 -1
- package/dist/agents/mrkl/index.d.ts +1 -1
- package/dist/agents/mrkl/index.js +1 -1
- package/dist/agents/openai_functions/index.cjs +3 -3
- package/dist/agents/openai_functions/index.d.ts +1 -1
- package/dist/agents/openai_functions/index.js +3 -3
- package/dist/agents/structured_chat/index.cjs +1 -1
- package/dist/agents/structured_chat/index.d.ts +1 -1
- package/dist/agents/structured_chat/index.js +1 -1
- package/dist/agents/xml/index.cjs +1 -1
- package/dist/agents/xml/index.d.ts +1 -1
- package/dist/agents/xml/index.js +1 -1
- package/dist/chains/combine_documents/stuff.cjs +2 -2
- package/dist/chains/combine_documents/stuff.js +2 -2
- package/dist/chains/load.cjs +1 -1
- package/dist/chains/load.d.ts +1 -1
- package/dist/chains/load.js +1 -1
- package/dist/chains/openai_functions/structured_output.cjs +2 -2
- package/dist/chains/openai_functions/structured_output.d.ts +2 -2
- package/dist/chains/openai_functions/structured_output.js +2 -2
- package/dist/chains/serde.d.ts +12 -12
- package/dist/experimental/plan_and_execute/agent_executor.cjs +1 -1
- package/dist/experimental/plan_and_execute/agent_executor.js +1 -1
- package/dist/load/import_constants.cjs +2 -39
- package/dist/load/import_constants.js +2 -39
- package/dist/load/import_map.cjs +2 -3
- package/dist/load/import_map.d.ts +0 -1
- package/dist/load/import_map.js +0 -1
- package/dist/output_parsers/openai_tools.cjs +2 -4
- package/dist/output_parsers/openai_tools.d.ts +8 -3
- package/dist/output_parsers/openai_tools.js +2 -4
- package/dist/retrievers/multi_query.d.ts +1 -0
- package/dist/smith/config.d.ts +1 -5
- package/package.json +31 -854
- package/dist/document_loaders/fs/chatgpt.cjs +0 -90
- package/dist/document_loaders/fs/chatgpt.d.ts +0 -8
- package/dist/document_loaders/fs/chatgpt.js +0 -86
- package/dist/document_loaders/fs/csv.cjs +0 -73
- package/dist/document_loaders/fs/csv.d.ts +0 -65
- package/dist/document_loaders/fs/csv.js +0 -69
- package/dist/document_loaders/fs/docx.cjs +0 -58
- package/dist/document_loaders/fs/docx.d.ts +0 -25
- package/dist/document_loaders/fs/docx.js +0 -54
- package/dist/document_loaders/fs/epub.cjs +0 -103
- package/dist/document_loaders/fs/epub.d.ts +0 -33
- package/dist/document_loaders/fs/epub.js +0 -99
- package/dist/document_loaders/fs/notion.cjs +0 -26
- package/dist/document_loaders/fs/notion.d.ts +0 -12
- package/dist/document_loaders/fs/notion.js +0 -22
- package/dist/document_loaders/fs/obsidian.cjs +0 -247
- package/dist/document_loaders/fs/obsidian.d.ts +0 -28
- package/dist/document_loaders/fs/obsidian.js +0 -240
- package/dist/document_loaders/fs/openai_whisper_audio.cjs +0 -49
- package/dist/document_loaders/fs/openai_whisper_audio.d.ts +0 -23
- package/dist/document_loaders/fs/openai_whisper_audio.js +0 -45
- package/dist/document_loaders/fs/pdf.cjs +0 -148
- package/dist/document_loaders/fs/pdf.d.ts +0 -49
- package/dist/document_loaders/fs/pdf.js +0 -144
- package/dist/document_loaders/fs/pptx.cjs +0 -46
- package/dist/document_loaders/fs/pptx.d.ts +0 -25
- package/dist/document_loaders/fs/pptx.js +0 -42
- package/dist/document_loaders/fs/srt.cjs +0 -57
- package/dist/document_loaders/fs/srt.d.ts +0 -32
- package/dist/document_loaders/fs/srt.js +0 -50
- package/dist/document_loaders/fs/unstructured.cjs +0 -338
- package/dist/document_loaders/fs/unstructured.d.ts +0 -125
- package/dist/document_loaders/fs/unstructured.js +0 -333
- package/dist/document_loaders/web/apify_dataset.cjs +0 -130
- package/dist/document_loaders/web/apify_dataset.d.ts +0 -85
- package/dist/document_loaders/web/apify_dataset.js +0 -126
- package/dist/document_loaders/web/assemblyai.cjs +0 -200
- package/dist/document_loaders/web/assemblyai.d.ts +0 -95
- package/dist/document_loaders/web/assemblyai.js +0 -193
- package/dist/document_loaders/web/azure_blob_storage_container.cjs +0 -73
- package/dist/document_loaders/web/azure_blob_storage_container.d.ts +0 -46
- package/dist/document_loaders/web/azure_blob_storage_container.js +0 -69
- package/dist/document_loaders/web/azure_blob_storage_file.cjs +0 -124
- package/dist/document_loaders/web/azure_blob_storage_file.d.ts +0 -53
- package/dist/document_loaders/web/azure_blob_storage_file.js +0 -97
- package/dist/document_loaders/web/browserbase.cjs +0 -93
- package/dist/document_loaders/web/browserbase.d.ts +0 -48
- package/dist/document_loaders/web/browserbase.js +0 -86
- package/dist/document_loaders/web/cheerio.cjs +0 -118
- package/dist/document_loaders/web/cheerio.d.ts +0 -77
- package/dist/document_loaders/web/cheerio.js +0 -114
- package/dist/document_loaders/web/college_confidential.cjs +0 -41
- package/dist/document_loaders/web/college_confidential.d.ts +0 -25
- package/dist/document_loaders/web/college_confidential.js +0 -37
- package/dist/document_loaders/web/confluence.cjs +0 -190
- package/dist/document_loaders/web/confluence.d.ts +0 -114
- package/dist/document_loaders/web/confluence.js +0 -186
- package/dist/document_loaders/web/couchbase.cjs +0 -95
- package/dist/document_loaders/web/couchbase.d.ts +0 -32
- package/dist/document_loaders/web/couchbase.js +0 -91
- package/dist/document_loaders/web/figma.cjs +0 -102
- package/dist/document_loaders/web/figma.d.ts +0 -82
- package/dist/document_loaders/web/figma.js +0 -98
- package/dist/document_loaders/web/firecrawl.cjs +0 -95
- package/dist/document_loaders/web/firecrawl.d.ts +0 -50
- package/dist/document_loaders/web/firecrawl.js +0 -88
- package/dist/document_loaders/web/gitbook.cjs +0 -110
- package/dist/document_loaders/web/gitbook.d.ts +0 -55
- package/dist/document_loaders/web/gitbook.js +0 -106
- package/dist/document_loaders/web/github.cjs +0 -615
- package/dist/document_loaders/web/github.d.ts +0 -203
- package/dist/document_loaders/web/github.js +0 -608
- package/dist/document_loaders/web/hn.cjs +0 -90
- package/dist/document_loaders/web/hn.d.ts +0 -42
- package/dist/document_loaders/web/hn.js +0 -86
- package/dist/document_loaders/web/imsdb.cjs +0 -44
- package/dist/document_loaders/web/imsdb.d.ts +0 -23
- package/dist/document_loaders/web/imsdb.js +0 -40
- package/dist/document_loaders/web/notionapi.cjs +0 -404
- package/dist/document_loaders/web/notionapi.d.ts +0 -133
- package/dist/document_loaders/web/notionapi.js +0 -392
- package/dist/document_loaders/web/notiondb.cjs +0 -199
- package/dist/document_loaders/web/notiondb.d.ts +0 -56
- package/dist/document_loaders/web/notiondb.js +0 -195
- package/dist/document_loaders/web/pdf.cjs +0 -140
- package/dist/document_loaders/web/pdf.d.ts +0 -35
- package/dist/document_loaders/web/pdf.js +0 -136
- package/dist/document_loaders/web/playwright.cjs +0 -89
- package/dist/document_loaders/web/playwright.d.ts +0 -58
- package/dist/document_loaders/web/playwright.js +0 -85
- package/dist/document_loaders/web/puppeteer.cjs +0 -139
- package/dist/document_loaders/web/puppeteer.d.ts +0 -82
- package/dist/document_loaders/web/puppeteer.js +0 -135
- package/dist/document_loaders/web/recursive_url.cjs +0 -198
- package/dist/document_loaders/web/recursive_url.d.ts +0 -33
- package/dist/document_loaders/web/recursive_url.js +0 -194
- package/dist/document_loaders/web/s3.cjs +0 -164
- package/dist/document_loaders/web/s3.d.ts +0 -78
- package/dist/document_loaders/web/s3.js +0 -137
- package/dist/document_loaders/web/searchapi.cjs +0 -150
- package/dist/document_loaders/web/searchapi.d.ts +0 -76
- package/dist/document_loaders/web/searchapi.js +0 -146
- package/dist/document_loaders/web/serpapi.cjs +0 -127
- package/dist/document_loaders/web/serpapi.d.ts +0 -62
- package/dist/document_loaders/web/serpapi.js +0 -123
- package/dist/document_loaders/web/sitemap.cjs +0 -118
- package/dist/document_loaders/web/sitemap.d.ts +0 -41
- package/dist/document_loaders/web/sitemap.js +0 -114
- package/dist/document_loaders/web/sonix_audio.cjs +0 -68
- package/dist/document_loaders/web/sonix_audio.d.ts +0 -36
- package/dist/document_loaders/web/sonix_audio.js +0 -64
- package/dist/document_loaders/web/sort_xyz_blockchain.cjs +0 -157
- package/dist/document_loaders/web/sort_xyz_blockchain.d.ts +0 -78
- package/dist/document_loaders/web/sort_xyz_blockchain.js +0 -153
- package/dist/document_loaders/web/youtube.cjs +0 -116
- package/dist/document_loaders/web/youtube.d.ts +0 -55
- package/dist/document_loaders/web/youtube.js +0 -112
- package/dist/experimental/tools/pyinterpreter.cjs +0 -248
- package/dist/experimental/tools/pyinterpreter.d.ts +0 -18
- package/dist/experimental/tools/pyinterpreter.js +0 -244
- package/dist/retrievers/self_query/chroma.cjs +0 -48
- package/dist/retrievers/self_query/chroma.d.ts +0 -26
- package/dist/retrievers/self_query/chroma.js +0 -44
- package/dist/retrievers/self_query/pinecone.cjs +0 -47
- package/dist/retrievers/self_query/pinecone.d.ts +0 -26
- package/dist/retrievers/self_query/pinecone.js +0 -43
- package/dist/retrievers/self_query/supabase.cjs +0 -278
- package/dist/retrievers/self_query/supabase.d.ts +0 -109
- package/dist/retrievers/self_query/supabase.js +0 -274
- package/dist/retrievers/self_query/supabase_utils.cjs +0 -264
- package/dist/retrievers/self_query/supabase_utils.d.ts +0 -101
- package/dist/retrievers/self_query/supabase_utils.js +0 -259
- package/dist/retrievers/self_query/vectara.cjs +0 -143
- package/dist/retrievers/self_query/vectara.d.ts +0 -42
- package/dist/retrievers/self_query/vectara.js +0 -139
- package/dist/retrievers/self_query/weaviate.cjs +0 -201
- package/dist/retrievers/self_query/weaviate.d.ts +0 -99
- package/dist/retrievers/self_query/weaviate.js +0 -197
- package/dist/types/assemblyai-types.cjs +0 -2
- package/dist/types/assemblyai-types.d.ts +0 -4
- package/dist/types/assemblyai-types.js +0 -1
- package/document_loaders/fs/chatgpt.cjs +0 -1
- package/document_loaders/fs/chatgpt.d.cts +0 -1
- package/document_loaders/fs/chatgpt.d.ts +0 -1
- package/document_loaders/fs/chatgpt.js +0 -1
- package/document_loaders/fs/csv.cjs +0 -1
- package/document_loaders/fs/csv.d.cts +0 -1
- package/document_loaders/fs/csv.d.ts +0 -1
- package/document_loaders/fs/csv.js +0 -1
- package/document_loaders/fs/docx.cjs +0 -1
- package/document_loaders/fs/docx.d.cts +0 -1
- package/document_loaders/fs/docx.d.ts +0 -1
- package/document_loaders/fs/docx.js +0 -1
- package/document_loaders/fs/epub.cjs +0 -1
- package/document_loaders/fs/epub.d.cts +0 -1
- package/document_loaders/fs/epub.d.ts +0 -1
- package/document_loaders/fs/epub.js +0 -1
- package/document_loaders/fs/notion.cjs +0 -1
- package/document_loaders/fs/notion.d.cts +0 -1
- package/document_loaders/fs/notion.d.ts +0 -1
- package/document_loaders/fs/notion.js +0 -1
- package/document_loaders/fs/obsidian.cjs +0 -1
- package/document_loaders/fs/obsidian.d.cts +0 -1
- package/document_loaders/fs/obsidian.d.ts +0 -1
- package/document_loaders/fs/obsidian.js +0 -1
- package/document_loaders/fs/openai_whisper_audio.cjs +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.cts +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.ts +0 -1
- package/document_loaders/fs/openai_whisper_audio.js +0 -1
- package/document_loaders/fs/pdf.cjs +0 -1
- package/document_loaders/fs/pdf.d.cts +0 -1
- package/document_loaders/fs/pdf.d.ts +0 -1
- package/document_loaders/fs/pdf.js +0 -1
- package/document_loaders/fs/pptx.cjs +0 -1
- package/document_loaders/fs/pptx.d.cts +0 -1
- package/document_loaders/fs/pptx.d.ts +0 -1
- package/document_loaders/fs/pptx.js +0 -1
- package/document_loaders/fs/srt.cjs +0 -1
- package/document_loaders/fs/srt.d.cts +0 -1
- package/document_loaders/fs/srt.d.ts +0 -1
- package/document_loaders/fs/srt.js +0 -1
- package/document_loaders/fs/unstructured.cjs +0 -1
- package/document_loaders/fs/unstructured.d.cts +0 -1
- package/document_loaders/fs/unstructured.d.ts +0 -1
- package/document_loaders/fs/unstructured.js +0 -1
- package/document_loaders/web/apify_dataset.cjs +0 -1
- package/document_loaders/web/apify_dataset.d.cts +0 -1
- package/document_loaders/web/apify_dataset.d.ts +0 -1
- package/document_loaders/web/apify_dataset.js +0 -1
- package/document_loaders/web/assemblyai.cjs +0 -1
- package/document_loaders/web/assemblyai.d.cts +0 -1
- package/document_loaders/web/assemblyai.d.ts +0 -1
- package/document_loaders/web/assemblyai.js +0 -1
- package/document_loaders/web/azure_blob_storage_container.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_container.js +0 -1
- package/document_loaders/web/azure_blob_storage_file.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_file.js +0 -1
- package/document_loaders/web/browserbase.cjs +0 -1
- package/document_loaders/web/browserbase.d.cts +0 -1
- package/document_loaders/web/browserbase.d.ts +0 -1
- package/document_loaders/web/browserbase.js +0 -1
- package/document_loaders/web/cheerio.cjs +0 -1
- package/document_loaders/web/cheerio.d.cts +0 -1
- package/document_loaders/web/cheerio.d.ts +0 -1
- package/document_loaders/web/cheerio.js +0 -1
- package/document_loaders/web/college_confidential.cjs +0 -1
- package/document_loaders/web/college_confidential.d.cts +0 -1
- package/document_loaders/web/college_confidential.d.ts +0 -1
- package/document_loaders/web/college_confidential.js +0 -1
- package/document_loaders/web/confluence.cjs +0 -1
- package/document_loaders/web/confluence.d.cts +0 -1
- package/document_loaders/web/confluence.d.ts +0 -1
- package/document_loaders/web/confluence.js +0 -1
- package/document_loaders/web/couchbase.cjs +0 -1
- package/document_loaders/web/couchbase.d.cts +0 -1
- package/document_loaders/web/couchbase.d.ts +0 -1
- package/document_loaders/web/couchbase.js +0 -1
- package/document_loaders/web/figma.cjs +0 -1
- package/document_loaders/web/figma.d.cts +0 -1
- package/document_loaders/web/figma.d.ts +0 -1
- package/document_loaders/web/figma.js +0 -1
- package/document_loaders/web/firecrawl.cjs +0 -1
- package/document_loaders/web/firecrawl.d.cts +0 -1
- package/document_loaders/web/firecrawl.d.ts +0 -1
- package/document_loaders/web/firecrawl.js +0 -1
- package/document_loaders/web/gitbook.cjs +0 -1
- package/document_loaders/web/gitbook.d.cts +0 -1
- package/document_loaders/web/gitbook.d.ts +0 -1
- package/document_loaders/web/gitbook.js +0 -1
- package/document_loaders/web/github.cjs +0 -1
- package/document_loaders/web/github.d.cts +0 -1
- package/document_loaders/web/github.d.ts +0 -1
- package/document_loaders/web/github.js +0 -1
- package/document_loaders/web/hn.cjs +0 -1
- package/document_loaders/web/hn.d.cts +0 -1
- package/document_loaders/web/hn.d.ts +0 -1
- package/document_loaders/web/hn.js +0 -1
- package/document_loaders/web/imsdb.cjs +0 -1
- package/document_loaders/web/imsdb.d.cts +0 -1
- package/document_loaders/web/imsdb.d.ts +0 -1
- package/document_loaders/web/imsdb.js +0 -1
- package/document_loaders/web/notionapi.cjs +0 -1
- package/document_loaders/web/notionapi.d.cts +0 -1
- package/document_loaders/web/notionapi.d.ts +0 -1
- package/document_loaders/web/notionapi.js +0 -1
- package/document_loaders/web/notiondb.cjs +0 -1
- package/document_loaders/web/notiondb.d.cts +0 -1
- package/document_loaders/web/notiondb.d.ts +0 -1
- package/document_loaders/web/notiondb.js +0 -1
- package/document_loaders/web/pdf.cjs +0 -1
- package/document_loaders/web/pdf.d.cts +0 -1
- package/document_loaders/web/pdf.d.ts +0 -1
- package/document_loaders/web/pdf.js +0 -1
- package/document_loaders/web/playwright.cjs +0 -1
- package/document_loaders/web/playwright.d.cts +0 -1
- package/document_loaders/web/playwright.d.ts +0 -1
- package/document_loaders/web/playwright.js +0 -1
- package/document_loaders/web/puppeteer.cjs +0 -1
- package/document_loaders/web/puppeteer.d.cts +0 -1
- package/document_loaders/web/puppeteer.d.ts +0 -1
- package/document_loaders/web/puppeteer.js +0 -1
- package/document_loaders/web/recursive_url.cjs +0 -1
- package/document_loaders/web/recursive_url.d.cts +0 -1
- package/document_loaders/web/recursive_url.d.ts +0 -1
- package/document_loaders/web/recursive_url.js +0 -1
- package/document_loaders/web/s3.cjs +0 -1
- package/document_loaders/web/s3.d.cts +0 -1
- package/document_loaders/web/s3.d.ts +0 -1
- package/document_loaders/web/s3.js +0 -1
- package/document_loaders/web/searchapi.cjs +0 -1
- package/document_loaders/web/searchapi.d.cts +0 -1
- package/document_loaders/web/searchapi.d.ts +0 -1
- package/document_loaders/web/searchapi.js +0 -1
- package/document_loaders/web/serpapi.cjs +0 -1
- package/document_loaders/web/serpapi.d.cts +0 -1
- package/document_loaders/web/serpapi.d.ts +0 -1
- package/document_loaders/web/serpapi.js +0 -1
- package/document_loaders/web/sitemap.cjs +0 -1
- package/document_loaders/web/sitemap.d.cts +0 -1
- package/document_loaders/web/sitemap.d.ts +0 -1
- package/document_loaders/web/sitemap.js +0 -1
- package/document_loaders/web/sonix_audio.cjs +0 -1
- package/document_loaders/web/sonix_audio.d.cts +0 -1
- package/document_loaders/web/sonix_audio.d.ts +0 -1
- package/document_loaders/web/sonix_audio.js +0 -1
- package/document_loaders/web/sort_xyz_blockchain.cjs +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.cts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.ts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.js +0 -1
- package/document_loaders/web/youtube.cjs +0 -1
- package/document_loaders/web/youtube.d.cts +0 -1
- package/document_loaders/web/youtube.d.ts +0 -1
- package/document_loaders/web/youtube.js +0 -1
- package/experimental/tools/pyinterpreter.cjs +0 -1
- package/experimental/tools/pyinterpreter.d.cts +0 -1
- package/experimental/tools/pyinterpreter.d.ts +0 -1
- package/experimental/tools/pyinterpreter.js +0 -1
- package/memory/index.cjs +0 -1
- package/memory/index.d.cts +0 -1
- package/memory/index.d.ts +0 -1
- package/memory/index.js +0 -1
- package/retrievers/self_query/chroma.cjs +0 -1
- package/retrievers/self_query/chroma.d.cts +0 -1
- package/retrievers/self_query/chroma.d.ts +0 -1
- package/retrievers/self_query/chroma.js +0 -1
- package/retrievers/self_query/pinecone.cjs +0 -1
- package/retrievers/self_query/pinecone.d.cts +0 -1
- package/retrievers/self_query/pinecone.d.ts +0 -1
- package/retrievers/self_query/pinecone.js +0 -1
- package/retrievers/self_query/supabase.cjs +0 -1
- package/retrievers/self_query/supabase.d.cts +0 -1
- package/retrievers/self_query/supabase.d.ts +0 -1
- package/retrievers/self_query/supabase.js +0 -1
- package/retrievers/self_query/vectara.cjs +0 -1
- package/retrievers/self_query/vectara.d.cts +0 -1
- package/retrievers/self_query/vectara.d.ts +0 -1
- package/retrievers/self_query/vectara.js +0 -1
- package/retrievers/self_query/weaviate.cjs +0 -1
- package/retrievers/self_query/weaviate.d.cts +0 -1
- package/retrievers/self_query/weaviate.d.ts +0 -1
- package/retrievers/self_query/weaviate.js +0 -1
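
Most of the deleted files are deprecated entrypoints that already logged a 0.2 migration warning pointing at `@langchain/community`; 0.3.0 removes them outright. A minimal migration sketch, assuming `@langchain/community` 0.3.x is installed (the paths below come from the `@deprecated` notices in the removed sources):

```typescript
// Before (langchain <= 0.2.x, deprecated entrypoints):
// import { FireCrawlLoader } from "langchain/document_loaders/web/firecrawl";
// import { GitbookLoader } from "langchain/document_loaders/web/gitbook";

// After (langchain >= 0.3.0): same entrypoint paths, new package.
import { FireCrawlLoader } from "@langchain/community/document_loaders/web/firecrawl";
import { GitbookLoader } from "@langchain/community/document_loaders/web/gitbook";
```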
--- a/package/dist/document_loaders/web/firecrawl.cjs
+++ /dev/null
@@ -1,95 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.FireCrawlLoader = void 0;
-const firecrawl_js_1 = __importDefault(require("@mendable/firecrawl-js"));
-const documents_1 = require("@langchain/core/documents");
-const env_1 = require("@langchain/core/utils/env");
-const base_js_1 = require("../base.cjs");
-const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
-/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
-    oldEntrypointName: "document_loaders/web/firecrawl",
-    newPackageName: "@langchain/community",
-});
-/**
- * @deprecated - Import from "@langchain/community/document_loaders/web/firecrawl" instead. This entrypoint will be removed in 0.3.0.
- *
- * Class representing a document loader for loading data from
- * Firecrawl (firecrawl.dev). It extends the BaseDocumentLoader class.
- * @example
- * ```typescript
- * const loader = new FireCrawlLoader({
- *   url: "{url}",
- *   apiKey: "{apiKey}",
- *   mode: "crawl"
- * });
- * const docs = await loader.load();
- * ```
- */
-class FireCrawlLoader extends base_js_1.BaseDocumentLoader {
-    constructor(loaderParams) {
-        super();
-        Object.defineProperty(this, "apiKey", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "url", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "mode", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "params", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        const { apiKey = (0, env_1.getEnvironmentVariable)("FIRECRAWL_API_KEY"), url, mode = "crawl", params, } = loaderParams;
-        if (!apiKey) {
-            throw new Error("Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl.");
-        }
-        this.apiKey = apiKey;
-        this.url = url;
-        this.mode = mode;
-        this.params = params;
-    }
-    /**
-     * Loads the data from the Firecrawl.
-     * @returns An array of Documents representing the retrieved data.
-     * @throws An error if the data could not be loaded.
-     */
-    async load() {
-        const app = new firecrawl_js_1.default({ apiKey: this.apiKey });
-        let firecrawlDocs;
-        if (this.mode === "scrape") {
-            const response = await app.scrapeUrl(this.url, this.params);
-            if (!response.success) {
-                throw new Error(`Firecrawl: Failed to scrape URL. Error: ${response.error}`);
-            }
-            firecrawlDocs = [response.data];
-        }
-        else if (this.mode === "crawl") {
-            const response = await app.crawlUrl(this.url, this.params, true);
-            firecrawlDocs = response;
-        }
-        else {
-            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`);
-        }
-        return firecrawlDocs.map((doc) => new documents_1.Document({
-            pageContent: doc.markdown || "",
-            metadata: doc.metadata || {},
-        }));
-    }
-}
-exports.FireCrawlLoader = FireCrawlLoader;
--- a/package/dist/document_loaders/web/firecrawl.d.ts
+++ /dev/null
@@ -1,50 +0,0 @@
-import { type DocumentInterface } from "@langchain/core/documents";
-import { BaseDocumentLoader } from "../base.js";
-/**
- * Interface representing the parameters for the Firecrawl loader. It
- * includes properties such as the URL to scrape or crawl and the API key.
- */
-interface FirecrawlLoaderParameters {
-    /**
-     * URL to scrape or crawl
-     */
-    url: string;
-    /**
-     * API key for Firecrawl. If not provided, the default value is the value of the FIRECRAWL_API_KEY environment variable.
-     */
-    apiKey?: string;
-    /**
-     * Mode of operation. Can be either "crawl" or "scrape". If not provided, the default value is "crawl".
-     */
-    mode?: "crawl" | "scrape";
-    params?: Record<string, unknown>;
-}
-/**
- * @deprecated - Import from "@langchain/community/document_loaders/web/firecrawl" instead. This entrypoint will be removed in 0.3.0.
- *
- * Class representing a document loader for loading data from
- * Firecrawl (firecrawl.dev). It extends the BaseDocumentLoader class.
- * @example
- * ```typescript
- * const loader = new FireCrawlLoader({
- *   url: "{url}",
- *   apiKey: "{apiKey}",
- *   mode: "crawl"
- * });
- * const docs = await loader.load();
- * ```
- */
-export declare class FireCrawlLoader extends BaseDocumentLoader {
-    private apiKey;
-    private url;
-    private mode;
-    private params?;
-    constructor(loaderParams: FirecrawlLoaderParameters);
-    /**
-     * Loads the data from the Firecrawl.
-     * @returns An array of Documents representing the retrieved data.
-     * @throws An error if the data could not be loaded.
-     */
-    load(): Promise<DocumentInterface[]>;
-}
-export {};
--- a/package/dist/document_loaders/web/firecrawl.js
+++ /dev/null
@@ -1,88 +0,0 @@
-import FirecrawlApp from "@mendable/firecrawl-js";
-import { Document } from "@langchain/core/documents";
-import { getEnvironmentVariable } from "@langchain/core/utils/env";
-import { BaseDocumentLoader } from "../base.js";
-import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
-/* #__PURE__ */ logVersion020MigrationWarning({
-    oldEntrypointName: "document_loaders/web/firecrawl",
-    newPackageName: "@langchain/community",
-});
-/**
- * @deprecated - Import from "@langchain/community/document_loaders/web/firecrawl" instead. This entrypoint will be removed in 0.3.0.
- *
- * Class representing a document loader for loading data from
- * Firecrawl (firecrawl.dev). It extends the BaseDocumentLoader class.
- * @example
- * ```typescript
- * const loader = new FireCrawlLoader({
- *   url: "{url}",
- *   apiKey: "{apiKey}",
- *   mode: "crawl"
- * });
- * const docs = await loader.load();
- * ```
- */
-export class FireCrawlLoader extends BaseDocumentLoader {
-    constructor(loaderParams) {
-        super();
-        Object.defineProperty(this, "apiKey", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "url", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "mode", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "params", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        const { apiKey = getEnvironmentVariable("FIRECRAWL_API_KEY"), url, mode = "crawl", params, } = loaderParams;
-        if (!apiKey) {
-            throw new Error("Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl.");
-        }
-        this.apiKey = apiKey;
-        this.url = url;
-        this.mode = mode;
-        this.params = params;
-    }
-    /**
-     * Loads the data from the Firecrawl.
-     * @returns An array of Documents representing the retrieved data.
-     * @throws An error if the data could not be loaded.
-     */
-    async load() {
-        const app = new FirecrawlApp({ apiKey: this.apiKey });
-        let firecrawlDocs;
-        if (this.mode === "scrape") {
-            const response = await app.scrapeUrl(this.url, this.params);
-            if (!response.success) {
-                throw new Error(`Firecrawl: Failed to scrape URL. Error: ${response.error}`);
-            }
-            firecrawlDocs = [response.data];
-        }
-        else if (this.mode === "crawl") {
-            const response = await app.crawlUrl(this.url, this.params, true);
-            firecrawlDocs = response;
-        }
-        else {
-            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`);
-        }
-        return firecrawlDocs.map((doc) => new Document({
-            pageContent: doc.markdown || "",
-            metadata: doc.metadata || {},
-        }));
-    }
-}
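
The three firecrawl deletions above remove the deprecated loader in its entirety. A hedged usage sketch of the replacement, based on the `FirecrawlLoaderParameters` interface and `@example` in the removed sources (the new import path is the one named in the `@deprecated` notice; the URL is illustrative):

```typescript
import { FireCrawlLoader } from "@langchain/community/document_loaders/web/firecrawl";

// Per the removed .d.ts: url is required; apiKey falls back to the
// FIRECRAWL_API_KEY environment variable; mode defaults to "crawl".
const loader = new FireCrawlLoader({
  url: "https://firecrawl.dev", // example URL, not from the diff
  mode: "scrape",
});
// Each returned Document's pageContent is the scraped markdown.
const docs = await loader.load();
```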
--- a/package/dist/document_loaders/web/gitbook.cjs
+++ /dev/null
@@ -1,110 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.GitbookLoader = void 0;
-const documents_1 = require("@langchain/core/documents");
-const cheerio_js_1 = require("./cheerio.cjs");
-const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
-/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
-    oldEntrypointName: "document_loaders/web/gitbook",
-    newPackageName: "@langchain/community",
-});
-/**
- * @deprecated - Import from "@langchain/community/document_loaders/web/gitbook" instead. This entrypoint will be removed in 0.3.0.
- *
- * Class representing a document loader specifically designed for loading
- * documents from Gitbook. It extends the CheerioWebBaseLoader.
- */
-class GitbookLoader extends cheerio_js_1.CheerioWebBaseLoader {
-    constructor(webPath, params = {}) {
-        const path = params.shouldLoadAllPaths === true ? `${webPath}/sitemap.xml` : webPath;
-        super(path);
-        Object.defineProperty(this, "webPath", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: webPath
-        });
-        Object.defineProperty(this, "shouldLoadAllPaths", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: false
-        });
-        Object.defineProperty(this, "baseUrl", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        this.baseUrl = webPath;
-        this.webPath = path;
-        this.shouldLoadAllPaths =
-            params.shouldLoadAllPaths ?? this.shouldLoadAllPaths;
-    }
-    /**
-     * Method that scrapes the web document using Cheerio and loads the
-     * content based on the value of shouldLoadAllPaths. If shouldLoadAllPaths
-     * is true, it calls the loadAllPaths() method to load all paths.
-     * Otherwise, it calls the loadPath() method to load a single path.
-     * @returns Promise resolving to an array of Document instances.
-     */
-    async load() {
-        const $ = await this.scrape();
-        if (this.shouldLoadAllPaths === true) {
-            return this.loadAllPaths($);
-        }
-        return this.loadPath($);
-    }
-    /**
-     * Private method that loads the content of a single path from the Gitbook
-     * web document. It extracts the page content by selecting all elements
-     * inside the "main" element, filters out empty text nodes, and joins the
-     * remaining text nodes with line breaks. It extracts the title by
-     * selecting the first "h1" element inside the "main" element. It creates
-     * a Document instance with the extracted page content and metadata
-     * containing the source URL and title.
-     * @param $ CheerioAPI instance representing the loaded web document.
-     * @param url Optional string representing the URL of the web document.
-     * @returns Array of Document instances.
-     */
-    loadPath($, url) {
-        const pageContent = $("main *")
-            .contents()
-            .toArray()
-            .map((element) => element.type === "text" ? $(element).text().trim() : null)
-            .filter((text) => text)
-            .join("\n");
-        const title = $("main h1").first().text().trim();
-        return [
-            new documents_1.Document({
-                pageContent,
-                metadata: { source: url ?? this.webPath, title },
-            }),
-        ];
-    }
-    /**
-     * Private method that loads the content of all paths from the Gitbook web
-     * document. It extracts the URLs of all paths from the "loc" elements in
-     * the sitemap.xml. It iterates over each URL, scrapes the web document
-     * using the _scrape() method, and calls the loadPath() method to load the
-     * content of each path. It collects all the loaded documents and returns
-     * them as an array.
-     * @param $ CheerioAPI instance representing the loaded web document.
-     * @returns Promise resolving to an array of Document instances.
-     */
-    async loadAllPaths($) {
-        const urls = $("loc")
-            .toArray()
-            .map((element) => $(element).text());
-        const documents = [];
-        for (const url of urls) {
-            const buildUrl = url.includes(this.baseUrl) ? url : this.baseUrl + url;
-            console.log(`Fetching text from ${buildUrl}`);
-            const html = await GitbookLoader._scrape(buildUrl, this.caller, this.timeout);
-            documents.push(...this.loadPath(html, buildUrl));
-        }
-        console.log(`Fetched ${documents.length} documents.`);
-        return documents;
-    }
-}
-exports.GitbookLoader = GitbookLoader;
--- a/package/dist/document_loaders/web/gitbook.d.ts
+++ /dev/null
@@ -1,55 +0,0 @@
-import { Document } from "@langchain/core/documents";
-import { CheerioWebBaseLoader } from "./cheerio.js";
-/**
- * Interface representing the parameters for configuring the
- * GitbookLoader. It has an optional property shouldLoadAllPaths, which
- * indicates whether all paths should be loaded.
- */
-interface GitbookLoaderParams {
-    shouldLoadAllPaths?: boolean;
-}
-/**
- * @deprecated - Import from "@langchain/community/document_loaders/web/gitbook" instead. This entrypoint will be removed in 0.3.0.
- *
- * Class representing a document loader specifically designed for loading
- * documents from Gitbook. It extends the CheerioWebBaseLoader.
- */
-export declare class GitbookLoader extends CheerioWebBaseLoader {
-    webPath: string;
-    shouldLoadAllPaths: boolean;
-    private readonly baseUrl;
-    constructor(webPath: string, params?: GitbookLoaderParams);
-    /**
-     * Method that scrapes the web document using Cheerio and loads the
-     * content based on the value of shouldLoadAllPaths. If shouldLoadAllPaths
-     * is true, it calls the loadAllPaths() method to load all paths.
-     * Otherwise, it calls the loadPath() method to load a single path.
-     * @returns Promise resolving to an array of Document instances.
-     */
-    load(): Promise<Document[]>;
-    /**
-     * Private method that loads the content of a single path from the Gitbook
-     * web document. It extracts the page content by selecting all elements
-     * inside the "main" element, filters out empty text nodes, and joins the
-     * remaining text nodes with line breaks. It extracts the title by
-     * selecting the first "h1" element inside the "main" element. It creates
-     * a Document instance with the extracted page content and metadata
-     * containing the source URL and title.
-     * @param $ CheerioAPI instance representing the loaded web document.
-     * @param url Optional string representing the URL of the web document.
-     * @returns Array of Document instances.
-     */
-    private loadPath;
-    /**
-     * Private method that loads the content of all paths from the Gitbook web
-     * document. It extracts the URLs of all paths from the "loc" elements in
-     * the sitemap.xml. It iterates over each URL, scrapes the web document
-     * using the _scrape() method, and calls the loadPath() method to load the
-     * content of each path. It collects all the loaded documents and returns
-     * them as an array.
-     * @param $ CheerioAPI instance representing the loaded web document.
-     * @returns Promise resolving to an array of Document instances.
-     */
-    private loadAllPaths;
-}
-export {};
--- a/package/dist/document_loaders/web/gitbook.js
+++ /dev/null
@@ -1,106 +0,0 @@
-import { Document } from "@langchain/core/documents";
-import { CheerioWebBaseLoader } from "./cheerio.js";
-import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
-/* #__PURE__ */ logVersion020MigrationWarning({
-    oldEntrypointName: "document_loaders/web/gitbook",
-    newPackageName: "@langchain/community",
-});
-/**
- * @deprecated - Import from "@langchain/community/document_loaders/web/gitbook" instead. This entrypoint will be removed in 0.3.0.
- *
- * Class representing a document loader specifically designed for loading
- * documents from Gitbook. It extends the CheerioWebBaseLoader.
- */
-export class GitbookLoader extends CheerioWebBaseLoader {
-    constructor(webPath, params = {}) {
-        const path = params.shouldLoadAllPaths === true ? `${webPath}/sitemap.xml` : webPath;
-        super(path);
-        Object.defineProperty(this, "webPath", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: webPath
-        });
-        Object.defineProperty(this, "shouldLoadAllPaths", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: false
-        });
-        Object.defineProperty(this, "baseUrl", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        this.baseUrl = webPath;
-        this.webPath = path;
-        this.shouldLoadAllPaths =
-            params.shouldLoadAllPaths ?? this.shouldLoadAllPaths;
-    }
-    /**
-     * Method that scrapes the web document using Cheerio and loads the
-     * content based on the value of shouldLoadAllPaths. If shouldLoadAllPaths
-     * is true, it calls the loadAllPaths() method to load all paths.
-     * Otherwise, it calls the loadPath() method to load a single path.
-     * @returns Promise resolving to an array of Document instances.
-     */
-    async load() {
-        const $ = await this.scrape();
-        if (this.shouldLoadAllPaths === true) {
-            return this.loadAllPaths($);
-        }
-        return this.loadPath($);
-    }
-    /**
-     * Private method that loads the content of a single path from the Gitbook
-     * web document. It extracts the page content by selecting all elements
-     * inside the "main" element, filters out empty text nodes, and joins the
-     * remaining text nodes with line breaks. It extracts the title by
-     * selecting the first "h1" element inside the "main" element. It creates
-     * a Document instance with the extracted page content and metadata
-     * containing the source URL and title.
-     * @param $ CheerioAPI instance representing the loaded web document.
-     * @param url Optional string representing the URL of the web document.
-     * @returns Array of Document instances.
-     */
-    loadPath($, url) {
-        const pageContent = $("main *")
-            .contents()
-            .toArray()
-            .map((element) => element.type === "text" ? $(element).text().trim() : null)
-            .filter((text) => text)
-            .join("\n");
-        const title = $("main h1").first().text().trim();
-        return [
-            new Document({
-                pageContent,
-                metadata: { source: url ?? this.webPath, title },
-            }),
-        ];
-    }
-    /**
-     * Private method that loads the content of all paths from the Gitbook web
-     * document. It extracts the URLs of all paths from the "loc" elements in
-     * the sitemap.xml. It iterates over each URL, scrapes the web document
-     * using the _scrape() method, and calls the loadPath() method to load the
-     * content of each path. It collects all the loaded documents and returns
-     * them as an array.
-     * @param $ CheerioAPI instance representing the loaded web document.
-     * @returns Promise resolving to an array of Document instances.
-     */
-    async loadAllPaths($) {
-        const urls = $("loc")
-            .toArray()
-            .map((element) => $(element).text());
-        const documents = [];
-        for (const url of urls) {
-            const buildUrl = url.includes(this.baseUrl) ? url : this.baseUrl + url;
-            console.log(`Fetching text from ${buildUrl}`);
-            const html = await GitbookLoader._scrape(buildUrl, this.caller, this.timeout);
-            documents.push(...this.loadPath(html, buildUrl));
-        }
-        console.log(`Fetched ${documents.length} documents.`);
-        return documents;
-    }
-}
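
The gitbook deletions mirror the firecrawl ones. A hedged sketch of the replacement, based on the `GitbookLoader` signature and behavior in the removed sources (import path from the `@deprecated` notice; the URL is illustrative):

```typescript
import { GitbookLoader } from "@langchain/community/document_loaders/web/gitbook";

// Single page: extracts <main> text nodes and the first <h1> as the title.
const pageLoader = new GitbookLoader("https://docs.gitbook.com");
const pageDocs = await pageLoader.load();

// Whole site: reads <webPath>/sitemap.xml and loads every <loc> entry.
const siteLoader = new GitbookLoader("https://docs.gitbook.com", {
  shouldLoadAllPaths: true,
});
const siteDocs = await siteLoader.load();
```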