langchain 0.2.18 → 0.3.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/openai_functions/index.cjs +2 -2
- package/dist/agents/openai_functions/index.js +2 -2
- package/dist/chains/combine_documents/stuff.cjs +2 -2
- package/dist/chains/combine_documents/stuff.js +2 -2
- package/dist/chains/openai_functions/openapi.cjs +3 -1
- package/dist/chains/openai_functions/openapi.js +3 -1
- package/dist/load/import_constants.cjs +2 -39
- package/dist/load/import_constants.js +2 -39
- package/dist/load/import_map.cjs +2 -3
- package/dist/load/import_map.d.ts +0 -1
- package/dist/load/import_map.js +0 -1
- package/dist/smith/config.d.ts +1 -5
- package/package.json +31 -854
- package/dist/document_loaders/fs/chatgpt.cjs +0 -90
- package/dist/document_loaders/fs/chatgpt.d.ts +0 -8
- package/dist/document_loaders/fs/chatgpt.js +0 -86
- package/dist/document_loaders/fs/csv.cjs +0 -73
- package/dist/document_loaders/fs/csv.d.ts +0 -65
- package/dist/document_loaders/fs/csv.js +0 -69
- package/dist/document_loaders/fs/docx.cjs +0 -58
- package/dist/document_loaders/fs/docx.d.ts +0 -25
- package/dist/document_loaders/fs/docx.js +0 -54
- package/dist/document_loaders/fs/epub.cjs +0 -103
- package/dist/document_loaders/fs/epub.d.ts +0 -33
- package/dist/document_loaders/fs/epub.js +0 -99
- package/dist/document_loaders/fs/notion.cjs +0 -26
- package/dist/document_loaders/fs/notion.d.ts +0 -12
- package/dist/document_loaders/fs/notion.js +0 -22
- package/dist/document_loaders/fs/obsidian.cjs +0 -247
- package/dist/document_loaders/fs/obsidian.d.ts +0 -28
- package/dist/document_loaders/fs/obsidian.js +0 -240
- package/dist/document_loaders/fs/openai_whisper_audio.cjs +0 -49
- package/dist/document_loaders/fs/openai_whisper_audio.d.ts +0 -23
- package/dist/document_loaders/fs/openai_whisper_audio.js +0 -45
- package/dist/document_loaders/fs/pdf.cjs +0 -148
- package/dist/document_loaders/fs/pdf.d.ts +0 -49
- package/dist/document_loaders/fs/pdf.js +0 -144
- package/dist/document_loaders/fs/pptx.cjs +0 -46
- package/dist/document_loaders/fs/pptx.d.ts +0 -25
- package/dist/document_loaders/fs/pptx.js +0 -42
- package/dist/document_loaders/fs/srt.cjs +0 -57
- package/dist/document_loaders/fs/srt.d.ts +0 -32
- package/dist/document_loaders/fs/srt.js +0 -50
- package/dist/document_loaders/fs/unstructured.cjs +0 -338
- package/dist/document_loaders/fs/unstructured.d.ts +0 -125
- package/dist/document_loaders/fs/unstructured.js +0 -333
- package/dist/document_loaders/web/apify_dataset.cjs +0 -130
- package/dist/document_loaders/web/apify_dataset.d.ts +0 -85
- package/dist/document_loaders/web/apify_dataset.js +0 -126
- package/dist/document_loaders/web/assemblyai.cjs +0 -200
- package/dist/document_loaders/web/assemblyai.d.ts +0 -95
- package/dist/document_loaders/web/assemblyai.js +0 -193
- package/dist/document_loaders/web/azure_blob_storage_container.cjs +0 -73
- package/dist/document_loaders/web/azure_blob_storage_container.d.ts +0 -46
- package/dist/document_loaders/web/azure_blob_storage_container.js +0 -69
- package/dist/document_loaders/web/azure_blob_storage_file.cjs +0 -124
- package/dist/document_loaders/web/azure_blob_storage_file.d.ts +0 -53
- package/dist/document_loaders/web/azure_blob_storage_file.js +0 -97
- package/dist/document_loaders/web/browserbase.cjs +0 -93
- package/dist/document_loaders/web/browserbase.d.ts +0 -48
- package/dist/document_loaders/web/browserbase.js +0 -86
- package/dist/document_loaders/web/cheerio.cjs +0 -118
- package/dist/document_loaders/web/cheerio.d.ts +0 -77
- package/dist/document_loaders/web/cheerio.js +0 -114
- package/dist/document_loaders/web/college_confidential.cjs +0 -41
- package/dist/document_loaders/web/college_confidential.d.ts +0 -25
- package/dist/document_loaders/web/college_confidential.js +0 -37
- package/dist/document_loaders/web/confluence.cjs +0 -190
- package/dist/document_loaders/web/confluence.d.ts +0 -114
- package/dist/document_loaders/web/confluence.js +0 -186
- package/dist/document_loaders/web/couchbase.cjs +0 -95
- package/dist/document_loaders/web/couchbase.d.ts +0 -32
- package/dist/document_loaders/web/couchbase.js +0 -91
- package/dist/document_loaders/web/figma.cjs +0 -102
- package/dist/document_loaders/web/figma.d.ts +0 -82
- package/dist/document_loaders/web/figma.js +0 -98
- package/dist/document_loaders/web/firecrawl.cjs +0 -95
- package/dist/document_loaders/web/firecrawl.d.ts +0 -50
- package/dist/document_loaders/web/firecrawl.js +0 -88
- package/dist/document_loaders/web/gitbook.cjs +0 -110
- package/dist/document_loaders/web/gitbook.d.ts +0 -55
- package/dist/document_loaders/web/gitbook.js +0 -106
- package/dist/document_loaders/web/github.cjs +0 -615
- package/dist/document_loaders/web/github.d.ts +0 -203
- package/dist/document_loaders/web/github.js +0 -608
- package/dist/document_loaders/web/hn.cjs +0 -90
- package/dist/document_loaders/web/hn.d.ts +0 -42
- package/dist/document_loaders/web/hn.js +0 -86
- package/dist/document_loaders/web/imsdb.cjs +0 -44
- package/dist/document_loaders/web/imsdb.d.ts +0 -23
- package/dist/document_loaders/web/imsdb.js +0 -40
- package/dist/document_loaders/web/notionapi.cjs +0 -404
- package/dist/document_loaders/web/notionapi.d.ts +0 -133
- package/dist/document_loaders/web/notionapi.js +0 -392
- package/dist/document_loaders/web/notiondb.cjs +0 -199
- package/dist/document_loaders/web/notiondb.d.ts +0 -56
- package/dist/document_loaders/web/notiondb.js +0 -195
- package/dist/document_loaders/web/pdf.cjs +0 -140
- package/dist/document_loaders/web/pdf.d.ts +0 -35
- package/dist/document_loaders/web/pdf.js +0 -136
- package/dist/document_loaders/web/playwright.cjs +0 -89
- package/dist/document_loaders/web/playwright.d.ts +0 -58
- package/dist/document_loaders/web/playwright.js +0 -85
- package/dist/document_loaders/web/puppeteer.cjs +0 -139
- package/dist/document_loaders/web/puppeteer.d.ts +0 -82
- package/dist/document_loaders/web/puppeteer.js +0 -135
- package/dist/document_loaders/web/recursive_url.cjs +0 -198
- package/dist/document_loaders/web/recursive_url.d.ts +0 -33
- package/dist/document_loaders/web/recursive_url.js +0 -194
- package/dist/document_loaders/web/s3.cjs +0 -164
- package/dist/document_loaders/web/s3.d.ts +0 -78
- package/dist/document_loaders/web/s3.js +0 -137
- package/dist/document_loaders/web/searchapi.cjs +0 -150
- package/dist/document_loaders/web/searchapi.d.ts +0 -76
- package/dist/document_loaders/web/searchapi.js +0 -146
- package/dist/document_loaders/web/serpapi.cjs +0 -127
- package/dist/document_loaders/web/serpapi.d.ts +0 -62
- package/dist/document_loaders/web/serpapi.js +0 -123
- package/dist/document_loaders/web/sitemap.cjs +0 -118
- package/dist/document_loaders/web/sitemap.d.ts +0 -41
- package/dist/document_loaders/web/sitemap.js +0 -114
- package/dist/document_loaders/web/sonix_audio.cjs +0 -68
- package/dist/document_loaders/web/sonix_audio.d.ts +0 -36
- package/dist/document_loaders/web/sonix_audio.js +0 -64
- package/dist/document_loaders/web/sort_xyz_blockchain.cjs +0 -157
- package/dist/document_loaders/web/sort_xyz_blockchain.d.ts +0 -78
- package/dist/document_loaders/web/sort_xyz_blockchain.js +0 -153
- package/dist/document_loaders/web/youtube.cjs +0 -116
- package/dist/document_loaders/web/youtube.d.ts +0 -55
- package/dist/document_loaders/web/youtube.js +0 -112
- package/dist/experimental/tools/pyinterpreter.cjs +0 -248
- package/dist/experimental/tools/pyinterpreter.d.ts +0 -18
- package/dist/experimental/tools/pyinterpreter.js +0 -244
- package/dist/retrievers/self_query/chroma.cjs +0 -48
- package/dist/retrievers/self_query/chroma.d.ts +0 -26
- package/dist/retrievers/self_query/chroma.js +0 -44
- package/dist/retrievers/self_query/pinecone.cjs +0 -47
- package/dist/retrievers/self_query/pinecone.d.ts +0 -26
- package/dist/retrievers/self_query/pinecone.js +0 -43
- package/dist/retrievers/self_query/supabase.cjs +0 -278
- package/dist/retrievers/self_query/supabase.d.ts +0 -109
- package/dist/retrievers/self_query/supabase.js +0 -274
- package/dist/retrievers/self_query/supabase_utils.cjs +0 -264
- package/dist/retrievers/self_query/supabase_utils.d.ts +0 -101
- package/dist/retrievers/self_query/supabase_utils.js +0 -259
- package/dist/retrievers/self_query/vectara.cjs +0 -143
- package/dist/retrievers/self_query/vectara.d.ts +0 -42
- package/dist/retrievers/self_query/vectara.js +0 -139
- package/dist/retrievers/self_query/weaviate.cjs +0 -201
- package/dist/retrievers/self_query/weaviate.d.ts +0 -99
- package/dist/retrievers/self_query/weaviate.js +0 -197
- package/dist/types/assemblyai-types.cjs +0 -2
- package/dist/types/assemblyai-types.d.ts +0 -4
- package/dist/types/assemblyai-types.js +0 -1
- package/document_loaders/fs/chatgpt.cjs +0 -1
- package/document_loaders/fs/chatgpt.d.cts +0 -1
- package/document_loaders/fs/chatgpt.d.ts +0 -1
- package/document_loaders/fs/chatgpt.js +0 -1
- package/document_loaders/fs/csv.cjs +0 -1
- package/document_loaders/fs/csv.d.cts +0 -1
- package/document_loaders/fs/csv.d.ts +0 -1
- package/document_loaders/fs/csv.js +0 -1
- package/document_loaders/fs/docx.cjs +0 -1
- package/document_loaders/fs/docx.d.cts +0 -1
- package/document_loaders/fs/docx.d.ts +0 -1
- package/document_loaders/fs/docx.js +0 -1
- package/document_loaders/fs/epub.cjs +0 -1
- package/document_loaders/fs/epub.d.cts +0 -1
- package/document_loaders/fs/epub.d.ts +0 -1
- package/document_loaders/fs/epub.js +0 -1
- package/document_loaders/fs/notion.cjs +0 -1
- package/document_loaders/fs/notion.d.cts +0 -1
- package/document_loaders/fs/notion.d.ts +0 -1
- package/document_loaders/fs/notion.js +0 -1
- package/document_loaders/fs/obsidian.cjs +0 -1
- package/document_loaders/fs/obsidian.d.cts +0 -1
- package/document_loaders/fs/obsidian.d.ts +0 -1
- package/document_loaders/fs/obsidian.js +0 -1
- package/document_loaders/fs/openai_whisper_audio.cjs +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.cts +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.ts +0 -1
- package/document_loaders/fs/openai_whisper_audio.js +0 -1
- package/document_loaders/fs/pdf.cjs +0 -1
- package/document_loaders/fs/pdf.d.cts +0 -1
- package/document_loaders/fs/pdf.d.ts +0 -1
- package/document_loaders/fs/pdf.js +0 -1
- package/document_loaders/fs/pptx.cjs +0 -1
- package/document_loaders/fs/pptx.d.cts +0 -1
- package/document_loaders/fs/pptx.d.ts +0 -1
- package/document_loaders/fs/pptx.js +0 -1
- package/document_loaders/fs/srt.cjs +0 -1
- package/document_loaders/fs/srt.d.cts +0 -1
- package/document_loaders/fs/srt.d.ts +0 -1
- package/document_loaders/fs/srt.js +0 -1
- package/document_loaders/fs/unstructured.cjs +0 -1
- package/document_loaders/fs/unstructured.d.cts +0 -1
- package/document_loaders/fs/unstructured.d.ts +0 -1
- package/document_loaders/fs/unstructured.js +0 -1
- package/document_loaders/web/apify_dataset.cjs +0 -1
- package/document_loaders/web/apify_dataset.d.cts +0 -1
- package/document_loaders/web/apify_dataset.d.ts +0 -1
- package/document_loaders/web/apify_dataset.js +0 -1
- package/document_loaders/web/assemblyai.cjs +0 -1
- package/document_loaders/web/assemblyai.d.cts +0 -1
- package/document_loaders/web/assemblyai.d.ts +0 -1
- package/document_loaders/web/assemblyai.js +0 -1
- package/document_loaders/web/azure_blob_storage_container.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_container.js +0 -1
- package/document_loaders/web/azure_blob_storage_file.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_file.js +0 -1
- package/document_loaders/web/browserbase.cjs +0 -1
- package/document_loaders/web/browserbase.d.cts +0 -1
- package/document_loaders/web/browserbase.d.ts +0 -1
- package/document_loaders/web/browserbase.js +0 -1
- package/document_loaders/web/cheerio.cjs +0 -1
- package/document_loaders/web/cheerio.d.cts +0 -1
- package/document_loaders/web/cheerio.d.ts +0 -1
- package/document_loaders/web/cheerio.js +0 -1
- package/document_loaders/web/college_confidential.cjs +0 -1
- package/document_loaders/web/college_confidential.d.cts +0 -1
- package/document_loaders/web/college_confidential.d.ts +0 -1
- package/document_loaders/web/college_confidential.js +0 -1
- package/document_loaders/web/confluence.cjs +0 -1
- package/document_loaders/web/confluence.d.cts +0 -1
- package/document_loaders/web/confluence.d.ts +0 -1
- package/document_loaders/web/confluence.js +0 -1
- package/document_loaders/web/couchbase.cjs +0 -1
- package/document_loaders/web/couchbase.d.cts +0 -1
- package/document_loaders/web/couchbase.d.ts +0 -1
- package/document_loaders/web/couchbase.js +0 -1
- package/document_loaders/web/figma.cjs +0 -1
- package/document_loaders/web/figma.d.cts +0 -1
- package/document_loaders/web/figma.d.ts +0 -1
- package/document_loaders/web/figma.js +0 -1
- package/document_loaders/web/firecrawl.cjs +0 -1
- package/document_loaders/web/firecrawl.d.cts +0 -1
- package/document_loaders/web/firecrawl.d.ts +0 -1
- package/document_loaders/web/firecrawl.js +0 -1
- package/document_loaders/web/gitbook.cjs +0 -1
- package/document_loaders/web/gitbook.d.cts +0 -1
- package/document_loaders/web/gitbook.d.ts +0 -1
- package/document_loaders/web/gitbook.js +0 -1
- package/document_loaders/web/github.cjs +0 -1
- package/document_loaders/web/github.d.cts +0 -1
- package/document_loaders/web/github.d.ts +0 -1
- package/document_loaders/web/github.js +0 -1
- package/document_loaders/web/hn.cjs +0 -1
- package/document_loaders/web/hn.d.cts +0 -1
- package/document_loaders/web/hn.d.ts +0 -1
- package/document_loaders/web/hn.js +0 -1
- package/document_loaders/web/imsdb.cjs +0 -1
- package/document_loaders/web/imsdb.d.cts +0 -1
- package/document_loaders/web/imsdb.d.ts +0 -1
- package/document_loaders/web/imsdb.js +0 -1
- package/document_loaders/web/notionapi.cjs +0 -1
- package/document_loaders/web/notionapi.d.cts +0 -1
- package/document_loaders/web/notionapi.d.ts +0 -1
- package/document_loaders/web/notionapi.js +0 -1
- package/document_loaders/web/notiondb.cjs +0 -1
- package/document_loaders/web/notiondb.d.cts +0 -1
- package/document_loaders/web/notiondb.d.ts +0 -1
- package/document_loaders/web/notiondb.js +0 -1
- package/document_loaders/web/pdf.cjs +0 -1
- package/document_loaders/web/pdf.d.cts +0 -1
- package/document_loaders/web/pdf.d.ts +0 -1
- package/document_loaders/web/pdf.js +0 -1
- package/document_loaders/web/playwright.cjs +0 -1
- package/document_loaders/web/playwright.d.cts +0 -1
- package/document_loaders/web/playwright.d.ts +0 -1
- package/document_loaders/web/playwright.js +0 -1
- package/document_loaders/web/puppeteer.cjs +0 -1
- package/document_loaders/web/puppeteer.d.cts +0 -1
- package/document_loaders/web/puppeteer.d.ts +0 -1
- package/document_loaders/web/puppeteer.js +0 -1
- package/document_loaders/web/recursive_url.cjs +0 -1
- package/document_loaders/web/recursive_url.d.cts +0 -1
- package/document_loaders/web/recursive_url.d.ts +0 -1
- package/document_loaders/web/recursive_url.js +0 -1
- package/document_loaders/web/s3.cjs +0 -1
- package/document_loaders/web/s3.d.cts +0 -1
- package/document_loaders/web/s3.d.ts +0 -1
- package/document_loaders/web/s3.js +0 -1
- package/document_loaders/web/searchapi.cjs +0 -1
- package/document_loaders/web/searchapi.d.cts +0 -1
- package/document_loaders/web/searchapi.d.ts +0 -1
- package/document_loaders/web/searchapi.js +0 -1
- package/document_loaders/web/serpapi.cjs +0 -1
- package/document_loaders/web/serpapi.d.cts +0 -1
- package/document_loaders/web/serpapi.d.ts +0 -1
- package/document_loaders/web/serpapi.js +0 -1
- package/document_loaders/web/sitemap.cjs +0 -1
- package/document_loaders/web/sitemap.d.cts +0 -1
- package/document_loaders/web/sitemap.d.ts +0 -1
- package/document_loaders/web/sitemap.js +0 -1
- package/document_loaders/web/sonix_audio.cjs +0 -1
- package/document_loaders/web/sonix_audio.d.cts +0 -1
- package/document_loaders/web/sonix_audio.d.ts +0 -1
- package/document_loaders/web/sonix_audio.js +0 -1
- package/document_loaders/web/sort_xyz_blockchain.cjs +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.cts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.ts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.js +0 -1
- package/document_loaders/web/youtube.cjs +0 -1
- package/document_loaders/web/youtube.d.cts +0 -1
- package/document_loaders/web/youtube.d.ts +0 -1
- package/document_loaders/web/youtube.js +0 -1
- package/experimental/tools/pyinterpreter.cjs +0 -1
- package/experimental/tools/pyinterpreter.d.cts +0 -1
- package/experimental/tools/pyinterpreter.d.ts +0 -1
- package/experimental/tools/pyinterpreter.js +0 -1
- package/memory/index.cjs +0 -1
- package/memory/index.d.cts +0 -1
- package/memory/index.d.ts +0 -1
- package/memory/index.js +0 -1
- package/retrievers/self_query/chroma.cjs +0 -1
- package/retrievers/self_query/chroma.d.cts +0 -1
- package/retrievers/self_query/chroma.d.ts +0 -1
- package/retrievers/self_query/chroma.js +0 -1
- package/retrievers/self_query/pinecone.cjs +0 -1
- package/retrievers/self_query/pinecone.d.cts +0 -1
- package/retrievers/self_query/pinecone.d.ts +0 -1
- package/retrievers/self_query/pinecone.js +0 -1
- package/retrievers/self_query/supabase.cjs +0 -1
- package/retrievers/self_query/supabase.d.cts +0 -1
- package/retrievers/self_query/supabase.d.ts +0 -1
- package/retrievers/self_query/supabase.js +0 -1
- package/retrievers/self_query/vectara.cjs +0 -1
- package/retrievers/self_query/vectara.d.cts +0 -1
- package/retrievers/self_query/vectara.d.ts +0 -1
- package/retrievers/self_query/vectara.js +0 -1
- package/retrievers/self_query/weaviate.cjs +0 -1
- package/retrievers/self_query/weaviate.d.cts +0 -1
- package/retrievers/self_query/weaviate.d.ts +0 -1
- package/retrievers/self_query/weaviate.js +0 -1
|
@@ -1,199 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.NotionDBLoader = void 0;
|
|
4
|
-
const env_1 = require("@langchain/core/utils/env");
|
|
5
|
-
const base_js_1 = require("../base.cjs");
|
|
6
|
-
const NOTION_BASE_URL = "https://api.notion.com/v1";
|
|
7
|
-
/**
|
|
8
|
-
* @deprecated use the `NotionAPILoader` class instead.
|
|
9
|
-
* @example
|
|
10
|
-
* ```typescript
|
|
11
|
-
* const loader = new NotionDBLoader({
|
|
12
|
-
* pageSizeLimit: 10,
|
|
13
|
-
* databaseId: "{databaseId}",
|
|
14
|
-
* notionIntegrationToken: "{notionIntegrationToken}",
|
|
15
|
-
* });
|
|
16
|
-
* const docs = await loader.load();
|
|
17
|
-
* ```
|
|
18
|
-
*/
|
|
19
|
-
class NotionDBLoader extends base_js_1.BaseDocumentLoader {
|
|
20
|
-
constructor({ databaseId, notionApiVersion = "2022-06-28", notionIntegrationToken = (0, env_1.getEnvironmentVariable)("NOTION_INTEGRATION_TOKEN"), pageSizeLimit = 50, }) {
|
|
21
|
-
super();
|
|
22
|
-
Object.defineProperty(this, "integrationToken", {
|
|
23
|
-
enumerable: true,
|
|
24
|
-
configurable: true,
|
|
25
|
-
writable: true,
|
|
26
|
-
value: void 0
|
|
27
|
-
});
|
|
28
|
-
Object.defineProperty(this, "databaseId", {
|
|
29
|
-
enumerable: true,
|
|
30
|
-
configurable: true,
|
|
31
|
-
writable: true,
|
|
32
|
-
value: void 0
|
|
33
|
-
});
|
|
34
|
-
Object.defineProperty(this, "notionApiVersion", {
|
|
35
|
-
enumerable: true,
|
|
36
|
-
configurable: true,
|
|
37
|
-
writable: true,
|
|
38
|
-
value: void 0
|
|
39
|
-
});
|
|
40
|
-
Object.defineProperty(this, "pageSizeLimit", {
|
|
41
|
-
enumerable: true,
|
|
42
|
-
configurable: true,
|
|
43
|
-
writable: true,
|
|
44
|
-
value: void 0
|
|
45
|
-
});
|
|
46
|
-
Object.defineProperty(this, "headers", {
|
|
47
|
-
enumerable: true,
|
|
48
|
-
configurable: true,
|
|
49
|
-
writable: true,
|
|
50
|
-
value: {}
|
|
51
|
-
});
|
|
52
|
-
if (!notionIntegrationToken) {
|
|
53
|
-
throw new Error("You must provide a Notion integration token.");
|
|
54
|
-
}
|
|
55
|
-
this.integrationToken = notionIntegrationToken;
|
|
56
|
-
this.pageSizeLimit = pageSizeLimit;
|
|
57
|
-
this.notionApiVersion = notionApiVersion;
|
|
58
|
-
this.databaseId = databaseId;
|
|
59
|
-
this.headers = {
|
|
60
|
-
Authorization: `Bearer ${this.integrationToken}`,
|
|
61
|
-
"Content-Type": "application/json",
|
|
62
|
-
"Notion-Version": notionApiVersion,
|
|
63
|
-
};
|
|
64
|
-
}
|
|
65
|
-
/**
|
|
66
|
-
* Loads the documents from Notion based on the specified options.
|
|
67
|
-
* @returns An array of Document objects.
|
|
68
|
-
*/
|
|
69
|
-
async load() {
|
|
70
|
-
const pageIds = await this.retrievePageIds();
|
|
71
|
-
const documents = [];
|
|
72
|
-
for (const pageId of pageIds) {
|
|
73
|
-
documents.push(await this.loadPage(pageId));
|
|
74
|
-
}
|
|
75
|
-
return documents;
|
|
76
|
-
}
|
|
77
|
-
/**
|
|
78
|
-
* Retrieves the IDs of the pages in the Notion database.
|
|
79
|
-
* @returns An array of page IDs.
|
|
80
|
-
*/
|
|
81
|
-
async retrievePageIds() {
|
|
82
|
-
const url = `${NOTION_BASE_URL}/databases/${this.databaseId}/query`;
|
|
83
|
-
const pageIds = [];
|
|
84
|
-
const query = {
|
|
85
|
-
page_size: this.pageSizeLimit,
|
|
86
|
-
};
|
|
87
|
-
let hasMore = true;
|
|
88
|
-
while (hasMore) {
|
|
89
|
-
const response = await fetch(url, {
|
|
90
|
-
method: "POST",
|
|
91
|
-
body: JSON.stringify(query),
|
|
92
|
-
headers: this.headers,
|
|
93
|
-
});
|
|
94
|
-
const data = await response.json();
|
|
95
|
-
if (!response.ok) {
|
|
96
|
-
throw new Error(`Failed to load data from Notion. Please check your integration token and database id.`);
|
|
97
|
-
}
|
|
98
|
-
const { results, has_more, next_cursor } = data;
|
|
99
|
-
pageIds.push(...(results?.map((page) => page.id) ?? []));
|
|
100
|
-
hasMore = has_more;
|
|
101
|
-
query.start_cursor = next_cursor;
|
|
102
|
-
}
|
|
103
|
-
return pageIds;
|
|
104
|
-
}
|
|
105
|
-
/**
|
|
106
|
-
* Loads a Notion page and returns it as a Document object.
|
|
107
|
-
* @param pageId The ID of the Notion page to load.
|
|
108
|
-
* @returns A Document object representing the loaded Notion page.
|
|
109
|
-
*/
|
|
110
|
-
async loadPage(pageId) {
|
|
111
|
-
const url = `${NOTION_BASE_URL}/pages/${pageId}`;
|
|
112
|
-
const response = await fetch(url, { method: "GET", headers: this.headers });
|
|
113
|
-
const data = await response.json();
|
|
114
|
-
if (!response.ok) {
|
|
115
|
-
throw new Error(`Unable to fetch page: ${response.status} ${JSON.stringify(data)}`);
|
|
116
|
-
}
|
|
117
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
118
|
-
const metadata = {};
|
|
119
|
-
const { properties } = data;
|
|
120
|
-
for (const key of Object.keys(properties)) {
|
|
121
|
-
const item = properties[key];
|
|
122
|
-
const itemType = item.type;
|
|
123
|
-
let value;
|
|
124
|
-
switch (itemType) {
|
|
125
|
-
case "rich_text":
|
|
126
|
-
value =
|
|
127
|
-
item?.rich_text && item?.rich_text.length > 0
|
|
128
|
-
? item?.rich_text[0].plain_text
|
|
129
|
-
: null;
|
|
130
|
-
break;
|
|
131
|
-
case "title":
|
|
132
|
-
value =
|
|
133
|
-
item?.title && item?.title.length > 0
|
|
134
|
-
? item?.title[0].plain_text
|
|
135
|
-
: null;
|
|
136
|
-
break;
|
|
137
|
-
case "multi_select":
|
|
138
|
-
if (item?.multi_select && item?.multi_select.length > 0) {
|
|
139
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
140
|
-
value = item?.multi_select.map((el) => el.name);
|
|
141
|
-
}
|
|
142
|
-
break;
|
|
143
|
-
case "url":
|
|
144
|
-
value = item?.url ? item.url : null;
|
|
145
|
-
break;
|
|
146
|
-
default:
|
|
147
|
-
break;
|
|
148
|
-
}
|
|
149
|
-
if (value) {
|
|
150
|
-
metadata[key.toLowerCase()] = value;
|
|
151
|
-
}
|
|
152
|
-
}
|
|
153
|
-
metadata.id = pageId;
|
|
154
|
-
return {
|
|
155
|
-
pageContent: await this.loadBlocks(pageId),
|
|
156
|
-
metadata,
|
|
157
|
-
};
|
|
158
|
-
}
|
|
159
|
-
/**
|
|
160
|
-
* Loads the blocks of a Notion page and returns them as a string.
|
|
161
|
-
* @param blockId The ID of the block to load.
|
|
162
|
-
* @param numberOfTabs The number of tabs to use for indentation.
|
|
163
|
-
* @returns A string representing the loaded blocks.
|
|
164
|
-
*/
|
|
165
|
-
async loadBlocks(blockId, numberOfTabs = 0) {
|
|
166
|
-
const resultLinesArr = [];
|
|
167
|
-
let currentBlockId = blockId;
|
|
168
|
-
while (currentBlockId) {
|
|
169
|
-
const response = await fetch(`${NOTION_BASE_URL}/blocks/${currentBlockId}/children`, {
|
|
170
|
-
method: "GET",
|
|
171
|
-
headers: this.headers,
|
|
172
|
-
});
|
|
173
|
-
const data = await response.json();
|
|
174
|
-
if (!response.ok) {
|
|
175
|
-
throw new Error(`Unable to fetch block: ${response.status} ${JSON.stringify(data)}`);
|
|
176
|
-
}
|
|
177
|
-
for (const result of data.results) {
|
|
178
|
-
const resultObj = result[result.type];
|
|
179
|
-
if (!resultObj.rich_text) {
|
|
180
|
-
continue;
|
|
181
|
-
}
|
|
182
|
-
const curResultTextArr = [];
|
|
183
|
-
for (const richText of resultObj.rich_text) {
|
|
184
|
-
if (richText.text) {
|
|
185
|
-
curResultTextArr.push("\t".repeat(numberOfTabs) + richText.text.content);
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
if (result.has_children) {
|
|
189
|
-
const childrenText = await this.loadBlocks(result.id, numberOfTabs + 1);
|
|
190
|
-
curResultTextArr.push(childrenText);
|
|
191
|
-
}
|
|
192
|
-
resultLinesArr.push(curResultTextArr.join("\n"));
|
|
193
|
-
}
|
|
194
|
-
currentBlockId = data.next_cursor;
|
|
195
|
-
}
|
|
196
|
-
return resultLinesArr.join("\n");
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
exports.NotionDBLoader = NotionDBLoader;
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import { Document } from "@langchain/core/documents";
|
|
2
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
3
|
-
/**
|
|
4
|
-
* Interface representing the parameters for the NotionDBLoader class. It
|
|
5
|
-
* includes the database ID, Notion integration token, Notion API version,
|
|
6
|
-
* and page size limit.
|
|
7
|
-
*/
|
|
8
|
-
export interface NotionDBLoaderParams {
|
|
9
|
-
databaseId: string;
|
|
10
|
-
notionIntegrationToken?: string;
|
|
11
|
-
notionApiVersion?: string;
|
|
12
|
-
pageSizeLimit?: number;
|
|
13
|
-
}
|
|
14
|
-
/**
|
|
15
|
-
* @deprecated use the `NotionAPILoader` class instead.
|
|
16
|
-
* @example
|
|
17
|
-
* ```typescript
|
|
18
|
-
* const loader = new NotionDBLoader({
|
|
19
|
-
* pageSizeLimit: 10,
|
|
20
|
-
* databaseId: "{databaseId}",
|
|
21
|
-
* notionIntegrationToken: "{notionIntegrationToken}",
|
|
22
|
-
* });
|
|
23
|
-
* const docs = await loader.load();
|
|
24
|
-
* ```
|
|
25
|
-
*/
|
|
26
|
-
export declare class NotionDBLoader extends BaseDocumentLoader implements NotionDBLoaderParams {
|
|
27
|
-
integrationToken: string;
|
|
28
|
-
databaseId: string;
|
|
29
|
-
notionApiVersion: string;
|
|
30
|
-
pageSizeLimit: number;
|
|
31
|
-
private headers;
|
|
32
|
-
constructor({ databaseId, notionApiVersion, notionIntegrationToken, pageSizeLimit, }: NotionDBLoaderParams);
|
|
33
|
-
/**
|
|
34
|
-
* Loads the documents from Notion based on the specified options.
|
|
35
|
-
* @returns An array of Document objects.
|
|
36
|
-
*/
|
|
37
|
-
load(): Promise<Document[]>;
|
|
38
|
-
/**
|
|
39
|
-
* Retrieves the IDs of the pages in the Notion database.
|
|
40
|
-
* @returns An array of page IDs.
|
|
41
|
-
*/
|
|
42
|
-
private retrievePageIds;
|
|
43
|
-
/**
|
|
44
|
-
* Loads a Notion page and returns it as a Document object.
|
|
45
|
-
* @param pageId The ID of the Notion page to load.
|
|
46
|
-
* @returns A Document object representing the loaded Notion page.
|
|
47
|
-
*/
|
|
48
|
-
private loadPage;
|
|
49
|
-
/**
|
|
50
|
-
* Loads the blocks of a Notion page and returns them as a string.
|
|
51
|
-
* @param blockId The ID of the block to load.
|
|
52
|
-
* @param numberOfTabs The number of tabs to use for indentation.
|
|
53
|
-
* @returns A string representing the loaded blocks.
|
|
54
|
-
*/
|
|
55
|
-
private loadBlocks;
|
|
56
|
-
}
|
|
@@ -1,195 +0,0 @@
|
|
|
1
|
-
import { getEnvironmentVariable } from "@langchain/core/utils/env";
|
|
2
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
3
|
-
const NOTION_BASE_URL = "https://api.notion.com/v1";
|
|
4
|
-
/**
|
|
5
|
-
* @deprecated use the `NotionAPILoader` class instead.
|
|
6
|
-
* @example
|
|
7
|
-
* ```typescript
|
|
8
|
-
* const loader = new NotionDBLoader({
|
|
9
|
-
* pageSizeLimit: 10,
|
|
10
|
-
* databaseId: "{databaseId}",
|
|
11
|
-
* notionIntegrationToken: "{notionIntegrationToken}",
|
|
12
|
-
* });
|
|
13
|
-
* const docs = await loader.load();
|
|
14
|
-
* ```
|
|
15
|
-
*/
|
|
16
|
-
export class NotionDBLoader extends BaseDocumentLoader {
|
|
17
|
-
constructor({ databaseId, notionApiVersion = "2022-06-28", notionIntegrationToken = getEnvironmentVariable("NOTION_INTEGRATION_TOKEN"), pageSizeLimit = 50, }) {
|
|
18
|
-
super();
|
|
19
|
-
Object.defineProperty(this, "integrationToken", {
|
|
20
|
-
enumerable: true,
|
|
21
|
-
configurable: true,
|
|
22
|
-
writable: true,
|
|
23
|
-
value: void 0
|
|
24
|
-
});
|
|
25
|
-
Object.defineProperty(this, "databaseId", {
|
|
26
|
-
enumerable: true,
|
|
27
|
-
configurable: true,
|
|
28
|
-
writable: true,
|
|
29
|
-
value: void 0
|
|
30
|
-
});
|
|
31
|
-
Object.defineProperty(this, "notionApiVersion", {
|
|
32
|
-
enumerable: true,
|
|
33
|
-
configurable: true,
|
|
34
|
-
writable: true,
|
|
35
|
-
value: void 0
|
|
36
|
-
});
|
|
37
|
-
Object.defineProperty(this, "pageSizeLimit", {
|
|
38
|
-
enumerable: true,
|
|
39
|
-
configurable: true,
|
|
40
|
-
writable: true,
|
|
41
|
-
value: void 0
|
|
42
|
-
});
|
|
43
|
-
Object.defineProperty(this, "headers", {
|
|
44
|
-
enumerable: true,
|
|
45
|
-
configurable: true,
|
|
46
|
-
writable: true,
|
|
47
|
-
value: {}
|
|
48
|
-
});
|
|
49
|
-
if (!notionIntegrationToken) {
|
|
50
|
-
throw new Error("You must provide a Notion integration token.");
|
|
51
|
-
}
|
|
52
|
-
this.integrationToken = notionIntegrationToken;
|
|
53
|
-
this.pageSizeLimit = pageSizeLimit;
|
|
54
|
-
this.notionApiVersion = notionApiVersion;
|
|
55
|
-
this.databaseId = databaseId;
|
|
56
|
-
this.headers = {
|
|
57
|
-
Authorization: `Bearer ${this.integrationToken}`,
|
|
58
|
-
"Content-Type": "application/json",
|
|
59
|
-
"Notion-Version": notionApiVersion,
|
|
60
|
-
};
|
|
61
|
-
}
|
|
62
|
-
/**
|
|
63
|
-
* Loads the documents from Notion based on the specified options.
|
|
64
|
-
* @returns An array of Document objects.
|
|
65
|
-
*/
|
|
66
|
-
async load() {
|
|
67
|
-
const pageIds = await this.retrievePageIds();
|
|
68
|
-
const documents = [];
|
|
69
|
-
for (const pageId of pageIds) {
|
|
70
|
-
documents.push(await this.loadPage(pageId));
|
|
71
|
-
}
|
|
72
|
-
return documents;
|
|
73
|
-
}
|
|
74
|
-
/**
|
|
75
|
-
* Retrieves the IDs of the pages in the Notion database.
|
|
76
|
-
* @returns An array of page IDs.
|
|
77
|
-
*/
|
|
78
|
-
async retrievePageIds() {
|
|
79
|
-
const url = `${NOTION_BASE_URL}/databases/${this.databaseId}/query`;
|
|
80
|
-
const pageIds = [];
|
|
81
|
-
const query = {
|
|
82
|
-
page_size: this.pageSizeLimit,
|
|
83
|
-
};
|
|
84
|
-
let hasMore = true;
|
|
85
|
-
while (hasMore) {
|
|
86
|
-
const response = await fetch(url, {
|
|
87
|
-
method: "POST",
|
|
88
|
-
body: JSON.stringify(query),
|
|
89
|
-
headers: this.headers,
|
|
90
|
-
});
|
|
91
|
-
const data = await response.json();
|
|
92
|
-
if (!response.ok) {
|
|
93
|
-
throw new Error(`Failed to load data from Notion. Please check your integration token and database id.`);
|
|
94
|
-
}
|
|
95
|
-
const { results, has_more, next_cursor } = data;
|
|
96
|
-
pageIds.push(...(results?.map((page) => page.id) ?? []));
|
|
97
|
-
hasMore = has_more;
|
|
98
|
-
query.start_cursor = next_cursor;
|
|
99
|
-
}
|
|
100
|
-
return pageIds;
|
|
101
|
-
}
|
|
102
|
-
/**
|
|
103
|
-
* Loads a Notion page and returns it as a Document object.
|
|
104
|
-
* @param pageId The ID of the Notion page to load.
|
|
105
|
-
* @returns A Document object representing the loaded Notion page.
|
|
106
|
-
*/
|
|
107
|
-
async loadPage(pageId) {
|
|
108
|
-
const url = `${NOTION_BASE_URL}/pages/${pageId}`;
|
|
109
|
-
const response = await fetch(url, { method: "GET", headers: this.headers });
|
|
110
|
-
const data = await response.json();
|
|
111
|
-
if (!response.ok) {
|
|
112
|
-
throw new Error(`Unable to fetch page: ${response.status} ${JSON.stringify(data)}`);
|
|
113
|
-
}
|
|
114
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
115
|
-
const metadata = {};
|
|
116
|
-
const { properties } = data;
|
|
117
|
-
for (const key of Object.keys(properties)) {
|
|
118
|
-
const item = properties[key];
|
|
119
|
-
const itemType = item.type;
|
|
120
|
-
let value;
|
|
121
|
-
switch (itemType) {
|
|
122
|
-
case "rich_text":
|
|
123
|
-
value =
|
|
124
|
-
item?.rich_text && item?.rich_text.length > 0
|
|
125
|
-
? item?.rich_text[0].plain_text
|
|
126
|
-
: null;
|
|
127
|
-
break;
|
|
128
|
-
case "title":
|
|
129
|
-
value =
|
|
130
|
-
item?.title && item?.title.length > 0
|
|
131
|
-
? item?.title[0].plain_text
|
|
132
|
-
: null;
|
|
133
|
-
break;
|
|
134
|
-
case "multi_select":
|
|
135
|
-
if (item?.multi_select && item?.multi_select.length > 0) {
|
|
136
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
137
|
-
value = item?.multi_select.map((el) => el.name);
|
|
138
|
-
}
|
|
139
|
-
break;
|
|
140
|
-
case "url":
|
|
141
|
-
value = item?.url ? item.url : null;
|
|
142
|
-
break;
|
|
143
|
-
default:
|
|
144
|
-
break;
|
|
145
|
-
}
|
|
146
|
-
if (value) {
|
|
147
|
-
metadata[key.toLowerCase()] = value;
|
|
148
|
-
}
|
|
149
|
-
}
|
|
150
|
-
metadata.id = pageId;
|
|
151
|
-
return {
|
|
152
|
-
pageContent: await this.loadBlocks(pageId),
|
|
153
|
-
metadata,
|
|
154
|
-
};
|
|
155
|
-
}
|
|
156
|
-
/**
|
|
157
|
-
* Loads the blocks of a Notion page and returns them as a string.
|
|
158
|
-
* @param blockId The ID of the block to load.
|
|
159
|
-
* @param numberOfTabs The number of tabs to use for indentation.
|
|
160
|
-
* @returns A string representing the loaded blocks.
|
|
161
|
-
*/
|
|
162
|
-
async loadBlocks(blockId, numberOfTabs = 0) {
|
|
163
|
-
const resultLinesArr = [];
|
|
164
|
-
let currentBlockId = blockId;
|
|
165
|
-
while (currentBlockId) {
|
|
166
|
-
const response = await fetch(`${NOTION_BASE_URL}/blocks/${currentBlockId}/children`, {
|
|
167
|
-
method: "GET",
|
|
168
|
-
headers: this.headers,
|
|
169
|
-
});
|
|
170
|
-
const data = await response.json();
|
|
171
|
-
if (!response.ok) {
|
|
172
|
-
throw new Error(`Unable to fetch block: ${response.status} ${JSON.stringify(data)}`);
|
|
173
|
-
}
|
|
174
|
-
for (const result of data.results) {
|
|
175
|
-
const resultObj = result[result.type];
|
|
176
|
-
if (!resultObj.rich_text) {
|
|
177
|
-
continue;
|
|
178
|
-
}
|
|
179
|
-
const curResultTextArr = [];
|
|
180
|
-
for (const richText of resultObj.rich_text) {
|
|
181
|
-
if (richText.text) {
|
|
182
|
-
curResultTextArr.push("\t".repeat(numberOfTabs) + richText.text.content);
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
if (result.has_children) {
|
|
186
|
-
const childrenText = await this.loadBlocks(result.id, numberOfTabs + 1);
|
|
187
|
-
curResultTextArr.push(childrenText);
|
|
188
|
-
}
|
|
189
|
-
resultLinesArr.push(curResultTextArr.join("\n"));
|
|
190
|
-
}
|
|
191
|
-
currentBlockId = data.next_cursor;
|
|
192
|
-
}
|
|
193
|
-
return resultLinesArr.join("\n");
|
|
194
|
-
}
|
|
195
|
-
}
|
|
@@ -1,140 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.WebPDFLoader = void 0;
|
|
4
|
-
const documents_1 = require("@langchain/core/documents");
|
|
5
|
-
const base_js_1 = require("../base.cjs");
|
|
6
|
-
const document_js_1 = require("../../util/document.cjs");
|
|
7
|
-
const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
|
|
8
|
-
/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
|
|
9
|
-
oldEntrypointName: "document_loaders/web/pdf",
|
|
10
|
-
newPackageName: "@langchain/community",
|
|
11
|
-
});
|
|
12
|
-
/**
|
|
13
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/pdf" instead. This entrypoint will be removed in 0.3.0.
|
|
14
|
-
*
|
|
15
|
-
* A document loader for loading data from PDFs.
|
|
16
|
-
* @example
|
|
17
|
-
* ```typescript
|
|
18
|
-
* const loader = new WebPDFLoader(new Blob());
|
|
19
|
-
* const docs = await loader.load();
|
|
20
|
-
* console.log({ docs });
|
|
21
|
-
* ```
|
|
22
|
-
*/
|
|
23
|
-
class WebPDFLoader extends base_js_1.BaseDocumentLoader {
|
|
24
|
-
constructor(blob, { splitPages = true, pdfjs = PDFLoaderImports, parsedItemSeparator = "", } = {}) {
|
|
25
|
-
super();
|
|
26
|
-
Object.defineProperty(this, "blob", {
|
|
27
|
-
enumerable: true,
|
|
28
|
-
configurable: true,
|
|
29
|
-
writable: true,
|
|
30
|
-
value: void 0
|
|
31
|
-
});
|
|
32
|
-
Object.defineProperty(this, "splitPages", {
|
|
33
|
-
enumerable: true,
|
|
34
|
-
configurable: true,
|
|
35
|
-
writable: true,
|
|
36
|
-
value: true
|
|
37
|
-
});
|
|
38
|
-
Object.defineProperty(this, "pdfjs", {
|
|
39
|
-
enumerable: true,
|
|
40
|
-
configurable: true,
|
|
41
|
-
writable: true,
|
|
42
|
-
value: void 0
|
|
43
|
-
});
|
|
44
|
-
Object.defineProperty(this, "parsedItemSeparator", {
|
|
45
|
-
enumerable: true,
|
|
46
|
-
configurable: true,
|
|
47
|
-
writable: true,
|
|
48
|
-
value: void 0
|
|
49
|
-
});
|
|
50
|
-
this.blob = blob;
|
|
51
|
-
this.splitPages = splitPages ?? this.splitPages;
|
|
52
|
-
this.pdfjs = pdfjs;
|
|
53
|
-
this.parsedItemSeparator = parsedItemSeparator;
|
|
54
|
-
}
|
|
55
|
-
/**
|
|
56
|
-
* Loads the contents of the PDF as documents.
|
|
57
|
-
* @returns An array of Documents representing the retrieved data.
|
|
58
|
-
*/
|
|
59
|
-
async load() {
|
|
60
|
-
const { getDocument, version } = await this.pdfjs();
|
|
61
|
-
const parsedPdf = await getDocument({
|
|
62
|
-
data: new Uint8Array(await this.blob.arrayBuffer()),
|
|
63
|
-
useWorkerFetch: false,
|
|
64
|
-
isEvalSupported: false,
|
|
65
|
-
useSystemFonts: true,
|
|
66
|
-
}).promise;
|
|
67
|
-
const meta = await parsedPdf.getMetadata().catch(() => null);
|
|
68
|
-
const documents = [];
|
|
69
|
-
for (let i = 1; i <= parsedPdf.numPages; i += 1) {
|
|
70
|
-
const page = await parsedPdf.getPage(i);
|
|
71
|
-
const content = await page.getTextContent();
|
|
72
|
-
if (content.items.length === 0) {
|
|
73
|
-
continue;
|
|
74
|
-
}
|
|
75
|
-
// Eliminate excessive newlines
|
|
76
|
-
// Source: https://github.com/albertcui/pdf-parse/blob/7086fc1cc9058545cdf41dd0646d6ae5832c7107/lib/pdf-parse.js#L16
|
|
77
|
-
let lastY;
|
|
78
|
-
const textItems = [];
|
|
79
|
-
for (const item of content.items) {
|
|
80
|
-
if ("str" in item) {
|
|
81
|
-
if (lastY === item.transform[5] || !lastY) {
|
|
82
|
-
textItems.push(item.str);
|
|
83
|
-
}
|
|
84
|
-
else {
|
|
85
|
-
textItems.push(`\n${item.str}`);
|
|
86
|
-
}
|
|
87
|
-
// eslint-disable-next-line prefer-destructuring
|
|
88
|
-
lastY = item.transform[5];
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
const text = textItems.join(this.parsedItemSeparator);
|
|
92
|
-
documents.push(new documents_1.Document({
|
|
93
|
-
pageContent: text,
|
|
94
|
-
metadata: {
|
|
95
|
-
pdf: {
|
|
96
|
-
version,
|
|
97
|
-
info: meta?.info,
|
|
98
|
-
metadata: meta?.metadata,
|
|
99
|
-
totalPages: parsedPdf.numPages,
|
|
100
|
-
},
|
|
101
|
-
loc: {
|
|
102
|
-
pageNumber: i,
|
|
103
|
-
},
|
|
104
|
-
},
|
|
105
|
-
}));
|
|
106
|
-
}
|
|
107
|
-
if (this.splitPages) {
|
|
108
|
-
return documents;
|
|
109
|
-
}
|
|
110
|
-
if (documents.length === 0) {
|
|
111
|
-
return [];
|
|
112
|
-
}
|
|
113
|
-
return [
|
|
114
|
-
new documents_1.Document({
|
|
115
|
-
pageContent: (0, document_js_1.formatDocumentsAsString)(documents),
|
|
116
|
-
metadata: {
|
|
117
|
-
pdf: {
|
|
118
|
-
version,
|
|
119
|
-
info: meta?.info,
|
|
120
|
-
metadata: meta?.metadata,
|
|
121
|
-
totalPages: parsedPdf.numPages,
|
|
122
|
-
},
|
|
123
|
-
},
|
|
124
|
-
}),
|
|
125
|
-
];
|
|
126
|
-
return documents;
|
|
127
|
-
}
|
|
128
|
-
}
|
|
129
|
-
exports.WebPDFLoader = WebPDFLoader;
|
|
130
|
-
async function PDFLoaderImports() {
|
|
131
|
-
try {
|
|
132
|
-
const { default: mod } = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js");
|
|
133
|
-
const { getDocument, version } = mod;
|
|
134
|
-
return { getDocument, version };
|
|
135
|
-
}
|
|
136
|
-
catch (e) {
|
|
137
|
-
console.error(e);
|
|
138
|
-
throw new Error("Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.");
|
|
139
|
-
}
|
|
140
|
-
}
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
/// <reference path="../../../src/types/pdf-parse.d.ts" />
|
|
2
|
-
import { Document } from "@langchain/core/documents";
|
|
3
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
4
|
-
/**
|
|
5
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/pdf" instead. This entrypoint will be removed in 0.3.0.
|
|
6
|
-
*
|
|
7
|
-
* A document loader for loading data from PDFs.
|
|
8
|
-
* @example
|
|
9
|
-
* ```typescript
|
|
10
|
-
* const loader = new WebPDFLoader(new Blob());
|
|
11
|
-
* const docs = await loader.load();
|
|
12
|
-
* console.log({ docs });
|
|
13
|
-
* ```
|
|
14
|
-
*/
|
|
15
|
-
export declare class WebPDFLoader extends BaseDocumentLoader {
|
|
16
|
-
protected blob: Blob;
|
|
17
|
-
protected splitPages: boolean;
|
|
18
|
-
private pdfjs;
|
|
19
|
-
protected parsedItemSeparator: string;
|
|
20
|
-
constructor(blob: Blob, { splitPages, pdfjs, parsedItemSeparator, }?: {
|
|
21
|
-
splitPages?: boolean | undefined;
|
|
22
|
-
pdfjs?: typeof PDFLoaderImports | undefined;
|
|
23
|
-
parsedItemSeparator?: string | undefined;
|
|
24
|
-
});
|
|
25
|
-
/**
|
|
26
|
-
* Loads the contents of the PDF as documents.
|
|
27
|
-
* @returns An array of Documents representing the retrieved data.
|
|
28
|
-
*/
|
|
29
|
-
load(): Promise<Document[]>;
|
|
30
|
-
}
|
|
31
|
-
declare function PDFLoaderImports(): Promise<{
|
|
32
|
-
getDocument: typeof import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js").getDocument;
|
|
33
|
-
version: string;
|
|
34
|
-
}>;
|
|
35
|
-
export {};
|