langchain 0.2.18 → 0.3.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/openai_functions/index.cjs +2 -2
- package/dist/agents/openai_functions/index.js +2 -2
- package/dist/chains/combine_documents/stuff.cjs +2 -2
- package/dist/chains/combine_documents/stuff.js +2 -2
- package/dist/chains/openai_functions/openapi.cjs +3 -1
- package/dist/chains/openai_functions/openapi.js +3 -1
- package/dist/load/import_constants.cjs +2 -39
- package/dist/load/import_constants.js +2 -39
- package/dist/load/import_map.cjs +2 -3
- package/dist/load/import_map.d.ts +0 -1
- package/dist/load/import_map.js +0 -1
- package/dist/smith/config.d.ts +1 -5
- package/package.json +31 -854
- package/dist/document_loaders/fs/chatgpt.cjs +0 -90
- package/dist/document_loaders/fs/chatgpt.d.ts +0 -8
- package/dist/document_loaders/fs/chatgpt.js +0 -86
- package/dist/document_loaders/fs/csv.cjs +0 -73
- package/dist/document_loaders/fs/csv.d.ts +0 -65
- package/dist/document_loaders/fs/csv.js +0 -69
- package/dist/document_loaders/fs/docx.cjs +0 -58
- package/dist/document_loaders/fs/docx.d.ts +0 -25
- package/dist/document_loaders/fs/docx.js +0 -54
- package/dist/document_loaders/fs/epub.cjs +0 -103
- package/dist/document_loaders/fs/epub.d.ts +0 -33
- package/dist/document_loaders/fs/epub.js +0 -99
- package/dist/document_loaders/fs/notion.cjs +0 -26
- package/dist/document_loaders/fs/notion.d.ts +0 -12
- package/dist/document_loaders/fs/notion.js +0 -22
- package/dist/document_loaders/fs/obsidian.cjs +0 -247
- package/dist/document_loaders/fs/obsidian.d.ts +0 -28
- package/dist/document_loaders/fs/obsidian.js +0 -240
- package/dist/document_loaders/fs/openai_whisper_audio.cjs +0 -49
- package/dist/document_loaders/fs/openai_whisper_audio.d.ts +0 -23
- package/dist/document_loaders/fs/openai_whisper_audio.js +0 -45
- package/dist/document_loaders/fs/pdf.cjs +0 -148
- package/dist/document_loaders/fs/pdf.d.ts +0 -49
- package/dist/document_loaders/fs/pdf.js +0 -144
- package/dist/document_loaders/fs/pptx.cjs +0 -46
- package/dist/document_loaders/fs/pptx.d.ts +0 -25
- package/dist/document_loaders/fs/pptx.js +0 -42
- package/dist/document_loaders/fs/srt.cjs +0 -57
- package/dist/document_loaders/fs/srt.d.ts +0 -32
- package/dist/document_loaders/fs/srt.js +0 -50
- package/dist/document_loaders/fs/unstructured.cjs +0 -338
- package/dist/document_loaders/fs/unstructured.d.ts +0 -125
- package/dist/document_loaders/fs/unstructured.js +0 -333
- package/dist/document_loaders/web/apify_dataset.cjs +0 -130
- package/dist/document_loaders/web/apify_dataset.d.ts +0 -85
- package/dist/document_loaders/web/apify_dataset.js +0 -126
- package/dist/document_loaders/web/assemblyai.cjs +0 -200
- package/dist/document_loaders/web/assemblyai.d.ts +0 -95
- package/dist/document_loaders/web/assemblyai.js +0 -193
- package/dist/document_loaders/web/azure_blob_storage_container.cjs +0 -73
- package/dist/document_loaders/web/azure_blob_storage_container.d.ts +0 -46
- package/dist/document_loaders/web/azure_blob_storage_container.js +0 -69
- package/dist/document_loaders/web/azure_blob_storage_file.cjs +0 -124
- package/dist/document_loaders/web/azure_blob_storage_file.d.ts +0 -53
- package/dist/document_loaders/web/azure_blob_storage_file.js +0 -97
- package/dist/document_loaders/web/browserbase.cjs +0 -93
- package/dist/document_loaders/web/browserbase.d.ts +0 -48
- package/dist/document_loaders/web/browserbase.js +0 -86
- package/dist/document_loaders/web/cheerio.cjs +0 -118
- package/dist/document_loaders/web/cheerio.d.ts +0 -77
- package/dist/document_loaders/web/cheerio.js +0 -114
- package/dist/document_loaders/web/college_confidential.cjs +0 -41
- package/dist/document_loaders/web/college_confidential.d.ts +0 -25
- package/dist/document_loaders/web/college_confidential.js +0 -37
- package/dist/document_loaders/web/confluence.cjs +0 -190
- package/dist/document_loaders/web/confluence.d.ts +0 -114
- package/dist/document_loaders/web/confluence.js +0 -186
- package/dist/document_loaders/web/couchbase.cjs +0 -95
- package/dist/document_loaders/web/couchbase.d.ts +0 -32
- package/dist/document_loaders/web/couchbase.js +0 -91
- package/dist/document_loaders/web/figma.cjs +0 -102
- package/dist/document_loaders/web/figma.d.ts +0 -82
- package/dist/document_loaders/web/figma.js +0 -98
- package/dist/document_loaders/web/firecrawl.cjs +0 -95
- package/dist/document_loaders/web/firecrawl.d.ts +0 -50
- package/dist/document_loaders/web/firecrawl.js +0 -88
- package/dist/document_loaders/web/gitbook.cjs +0 -110
- package/dist/document_loaders/web/gitbook.d.ts +0 -55
- package/dist/document_loaders/web/gitbook.js +0 -106
- package/dist/document_loaders/web/github.cjs +0 -615
- package/dist/document_loaders/web/github.d.ts +0 -203
- package/dist/document_loaders/web/github.js +0 -608
- package/dist/document_loaders/web/hn.cjs +0 -90
- package/dist/document_loaders/web/hn.d.ts +0 -42
- package/dist/document_loaders/web/hn.js +0 -86
- package/dist/document_loaders/web/imsdb.cjs +0 -44
- package/dist/document_loaders/web/imsdb.d.ts +0 -23
- package/dist/document_loaders/web/imsdb.js +0 -40
- package/dist/document_loaders/web/notionapi.cjs +0 -404
- package/dist/document_loaders/web/notionapi.d.ts +0 -133
- package/dist/document_loaders/web/notionapi.js +0 -392
- package/dist/document_loaders/web/notiondb.cjs +0 -199
- package/dist/document_loaders/web/notiondb.d.ts +0 -56
- package/dist/document_loaders/web/notiondb.js +0 -195
- package/dist/document_loaders/web/pdf.cjs +0 -140
- package/dist/document_loaders/web/pdf.d.ts +0 -35
- package/dist/document_loaders/web/pdf.js +0 -136
- package/dist/document_loaders/web/playwright.cjs +0 -89
- package/dist/document_loaders/web/playwright.d.ts +0 -58
- package/dist/document_loaders/web/playwright.js +0 -85
- package/dist/document_loaders/web/puppeteer.cjs +0 -139
- package/dist/document_loaders/web/puppeteer.d.ts +0 -82
- package/dist/document_loaders/web/puppeteer.js +0 -135
- package/dist/document_loaders/web/recursive_url.cjs +0 -198
- package/dist/document_loaders/web/recursive_url.d.ts +0 -33
- package/dist/document_loaders/web/recursive_url.js +0 -194
- package/dist/document_loaders/web/s3.cjs +0 -164
- package/dist/document_loaders/web/s3.d.ts +0 -78
- package/dist/document_loaders/web/s3.js +0 -137
- package/dist/document_loaders/web/searchapi.cjs +0 -150
- package/dist/document_loaders/web/searchapi.d.ts +0 -76
- package/dist/document_loaders/web/searchapi.js +0 -146
- package/dist/document_loaders/web/serpapi.cjs +0 -127
- package/dist/document_loaders/web/serpapi.d.ts +0 -62
- package/dist/document_loaders/web/serpapi.js +0 -123
- package/dist/document_loaders/web/sitemap.cjs +0 -118
- package/dist/document_loaders/web/sitemap.d.ts +0 -41
- package/dist/document_loaders/web/sitemap.js +0 -114
- package/dist/document_loaders/web/sonix_audio.cjs +0 -68
- package/dist/document_loaders/web/sonix_audio.d.ts +0 -36
- package/dist/document_loaders/web/sonix_audio.js +0 -64
- package/dist/document_loaders/web/sort_xyz_blockchain.cjs +0 -157
- package/dist/document_loaders/web/sort_xyz_blockchain.d.ts +0 -78
- package/dist/document_loaders/web/sort_xyz_blockchain.js +0 -153
- package/dist/document_loaders/web/youtube.cjs +0 -116
- package/dist/document_loaders/web/youtube.d.ts +0 -55
- package/dist/document_loaders/web/youtube.js +0 -112
- package/dist/experimental/tools/pyinterpreter.cjs +0 -248
- package/dist/experimental/tools/pyinterpreter.d.ts +0 -18
- package/dist/experimental/tools/pyinterpreter.js +0 -244
- package/dist/retrievers/self_query/chroma.cjs +0 -48
- package/dist/retrievers/self_query/chroma.d.ts +0 -26
- package/dist/retrievers/self_query/chroma.js +0 -44
- package/dist/retrievers/self_query/pinecone.cjs +0 -47
- package/dist/retrievers/self_query/pinecone.d.ts +0 -26
- package/dist/retrievers/self_query/pinecone.js +0 -43
- package/dist/retrievers/self_query/supabase.cjs +0 -278
- package/dist/retrievers/self_query/supabase.d.ts +0 -109
- package/dist/retrievers/self_query/supabase.js +0 -274
- package/dist/retrievers/self_query/supabase_utils.cjs +0 -264
- package/dist/retrievers/self_query/supabase_utils.d.ts +0 -101
- package/dist/retrievers/self_query/supabase_utils.js +0 -259
- package/dist/retrievers/self_query/vectara.cjs +0 -143
- package/dist/retrievers/self_query/vectara.d.ts +0 -42
- package/dist/retrievers/self_query/vectara.js +0 -139
- package/dist/retrievers/self_query/weaviate.cjs +0 -201
- package/dist/retrievers/self_query/weaviate.d.ts +0 -99
- package/dist/retrievers/self_query/weaviate.js +0 -197
- package/dist/types/assemblyai-types.cjs +0 -2
- package/dist/types/assemblyai-types.d.ts +0 -4
- package/dist/types/assemblyai-types.js +0 -1
- package/document_loaders/fs/chatgpt.cjs +0 -1
- package/document_loaders/fs/chatgpt.d.cts +0 -1
- package/document_loaders/fs/chatgpt.d.ts +0 -1
- package/document_loaders/fs/chatgpt.js +0 -1
- package/document_loaders/fs/csv.cjs +0 -1
- package/document_loaders/fs/csv.d.cts +0 -1
- package/document_loaders/fs/csv.d.ts +0 -1
- package/document_loaders/fs/csv.js +0 -1
- package/document_loaders/fs/docx.cjs +0 -1
- package/document_loaders/fs/docx.d.cts +0 -1
- package/document_loaders/fs/docx.d.ts +0 -1
- package/document_loaders/fs/docx.js +0 -1
- package/document_loaders/fs/epub.cjs +0 -1
- package/document_loaders/fs/epub.d.cts +0 -1
- package/document_loaders/fs/epub.d.ts +0 -1
- package/document_loaders/fs/epub.js +0 -1
- package/document_loaders/fs/notion.cjs +0 -1
- package/document_loaders/fs/notion.d.cts +0 -1
- package/document_loaders/fs/notion.d.ts +0 -1
- package/document_loaders/fs/notion.js +0 -1
- package/document_loaders/fs/obsidian.cjs +0 -1
- package/document_loaders/fs/obsidian.d.cts +0 -1
- package/document_loaders/fs/obsidian.d.ts +0 -1
- package/document_loaders/fs/obsidian.js +0 -1
- package/document_loaders/fs/openai_whisper_audio.cjs +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.cts +0 -1
- package/document_loaders/fs/openai_whisper_audio.d.ts +0 -1
- package/document_loaders/fs/openai_whisper_audio.js +0 -1
- package/document_loaders/fs/pdf.cjs +0 -1
- package/document_loaders/fs/pdf.d.cts +0 -1
- package/document_loaders/fs/pdf.d.ts +0 -1
- package/document_loaders/fs/pdf.js +0 -1
- package/document_loaders/fs/pptx.cjs +0 -1
- package/document_loaders/fs/pptx.d.cts +0 -1
- package/document_loaders/fs/pptx.d.ts +0 -1
- package/document_loaders/fs/pptx.js +0 -1
- package/document_loaders/fs/srt.cjs +0 -1
- package/document_loaders/fs/srt.d.cts +0 -1
- package/document_loaders/fs/srt.d.ts +0 -1
- package/document_loaders/fs/srt.js +0 -1
- package/document_loaders/fs/unstructured.cjs +0 -1
- package/document_loaders/fs/unstructured.d.cts +0 -1
- package/document_loaders/fs/unstructured.d.ts +0 -1
- package/document_loaders/fs/unstructured.js +0 -1
- package/document_loaders/web/apify_dataset.cjs +0 -1
- package/document_loaders/web/apify_dataset.d.cts +0 -1
- package/document_loaders/web/apify_dataset.d.ts +0 -1
- package/document_loaders/web/apify_dataset.js +0 -1
- package/document_loaders/web/assemblyai.cjs +0 -1
- package/document_loaders/web/assemblyai.d.cts +0 -1
- package/document_loaders/web/assemblyai.d.ts +0 -1
- package/document_loaders/web/assemblyai.js +0 -1
- package/document_loaders/web/azure_blob_storage_container.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_container.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_container.js +0 -1
- package/document_loaders/web/azure_blob_storage_file.cjs +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.cts +0 -1
- package/document_loaders/web/azure_blob_storage_file.d.ts +0 -1
- package/document_loaders/web/azure_blob_storage_file.js +0 -1
- package/document_loaders/web/browserbase.cjs +0 -1
- package/document_loaders/web/browserbase.d.cts +0 -1
- package/document_loaders/web/browserbase.d.ts +0 -1
- package/document_loaders/web/browserbase.js +0 -1
- package/document_loaders/web/cheerio.cjs +0 -1
- package/document_loaders/web/cheerio.d.cts +0 -1
- package/document_loaders/web/cheerio.d.ts +0 -1
- package/document_loaders/web/cheerio.js +0 -1
- package/document_loaders/web/college_confidential.cjs +0 -1
- package/document_loaders/web/college_confidential.d.cts +0 -1
- package/document_loaders/web/college_confidential.d.ts +0 -1
- package/document_loaders/web/college_confidential.js +0 -1
- package/document_loaders/web/confluence.cjs +0 -1
- package/document_loaders/web/confluence.d.cts +0 -1
- package/document_loaders/web/confluence.d.ts +0 -1
- package/document_loaders/web/confluence.js +0 -1
- package/document_loaders/web/couchbase.cjs +0 -1
- package/document_loaders/web/couchbase.d.cts +0 -1
- package/document_loaders/web/couchbase.d.ts +0 -1
- package/document_loaders/web/couchbase.js +0 -1
- package/document_loaders/web/figma.cjs +0 -1
- package/document_loaders/web/figma.d.cts +0 -1
- package/document_loaders/web/figma.d.ts +0 -1
- package/document_loaders/web/figma.js +0 -1
- package/document_loaders/web/firecrawl.cjs +0 -1
- package/document_loaders/web/firecrawl.d.cts +0 -1
- package/document_loaders/web/firecrawl.d.ts +0 -1
- package/document_loaders/web/firecrawl.js +0 -1
- package/document_loaders/web/gitbook.cjs +0 -1
- package/document_loaders/web/gitbook.d.cts +0 -1
- package/document_loaders/web/gitbook.d.ts +0 -1
- package/document_loaders/web/gitbook.js +0 -1
- package/document_loaders/web/github.cjs +0 -1
- package/document_loaders/web/github.d.cts +0 -1
- package/document_loaders/web/github.d.ts +0 -1
- package/document_loaders/web/github.js +0 -1
- package/document_loaders/web/hn.cjs +0 -1
- package/document_loaders/web/hn.d.cts +0 -1
- package/document_loaders/web/hn.d.ts +0 -1
- package/document_loaders/web/hn.js +0 -1
- package/document_loaders/web/imsdb.cjs +0 -1
- package/document_loaders/web/imsdb.d.cts +0 -1
- package/document_loaders/web/imsdb.d.ts +0 -1
- package/document_loaders/web/imsdb.js +0 -1
- package/document_loaders/web/notionapi.cjs +0 -1
- package/document_loaders/web/notionapi.d.cts +0 -1
- package/document_loaders/web/notionapi.d.ts +0 -1
- package/document_loaders/web/notionapi.js +0 -1
- package/document_loaders/web/notiondb.cjs +0 -1
- package/document_loaders/web/notiondb.d.cts +0 -1
- package/document_loaders/web/notiondb.d.ts +0 -1
- package/document_loaders/web/notiondb.js +0 -1
- package/document_loaders/web/pdf.cjs +0 -1
- package/document_loaders/web/pdf.d.cts +0 -1
- package/document_loaders/web/pdf.d.ts +0 -1
- package/document_loaders/web/pdf.js +0 -1
- package/document_loaders/web/playwright.cjs +0 -1
- package/document_loaders/web/playwright.d.cts +0 -1
- package/document_loaders/web/playwright.d.ts +0 -1
- package/document_loaders/web/playwright.js +0 -1
- package/document_loaders/web/puppeteer.cjs +0 -1
- package/document_loaders/web/puppeteer.d.cts +0 -1
- package/document_loaders/web/puppeteer.d.ts +0 -1
- package/document_loaders/web/puppeteer.js +0 -1
- package/document_loaders/web/recursive_url.cjs +0 -1
- package/document_loaders/web/recursive_url.d.cts +0 -1
- package/document_loaders/web/recursive_url.d.ts +0 -1
- package/document_loaders/web/recursive_url.js +0 -1
- package/document_loaders/web/s3.cjs +0 -1
- package/document_loaders/web/s3.d.cts +0 -1
- package/document_loaders/web/s3.d.ts +0 -1
- package/document_loaders/web/s3.js +0 -1
- package/document_loaders/web/searchapi.cjs +0 -1
- package/document_loaders/web/searchapi.d.cts +0 -1
- package/document_loaders/web/searchapi.d.ts +0 -1
- package/document_loaders/web/searchapi.js +0 -1
- package/document_loaders/web/serpapi.cjs +0 -1
- package/document_loaders/web/serpapi.d.cts +0 -1
- package/document_loaders/web/serpapi.d.ts +0 -1
- package/document_loaders/web/serpapi.js +0 -1
- package/document_loaders/web/sitemap.cjs +0 -1
- package/document_loaders/web/sitemap.d.cts +0 -1
- package/document_loaders/web/sitemap.d.ts +0 -1
- package/document_loaders/web/sitemap.js +0 -1
- package/document_loaders/web/sonix_audio.cjs +0 -1
- package/document_loaders/web/sonix_audio.d.cts +0 -1
- package/document_loaders/web/sonix_audio.d.ts +0 -1
- package/document_loaders/web/sonix_audio.js +0 -1
- package/document_loaders/web/sort_xyz_blockchain.cjs +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.cts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.d.ts +0 -1
- package/document_loaders/web/sort_xyz_blockchain.js +0 -1
- package/document_loaders/web/youtube.cjs +0 -1
- package/document_loaders/web/youtube.d.cts +0 -1
- package/document_loaders/web/youtube.d.ts +0 -1
- package/document_loaders/web/youtube.js +0 -1
- package/experimental/tools/pyinterpreter.cjs +0 -1
- package/experimental/tools/pyinterpreter.d.cts +0 -1
- package/experimental/tools/pyinterpreter.d.ts +0 -1
- package/experimental/tools/pyinterpreter.js +0 -1
- package/memory/index.cjs +0 -1
- package/memory/index.d.cts +0 -1
- package/memory/index.d.ts +0 -1
- package/memory/index.js +0 -1
- package/retrievers/self_query/chroma.cjs +0 -1
- package/retrievers/self_query/chroma.d.cts +0 -1
- package/retrievers/self_query/chroma.d.ts +0 -1
- package/retrievers/self_query/chroma.js +0 -1
- package/retrievers/self_query/pinecone.cjs +0 -1
- package/retrievers/self_query/pinecone.d.cts +0 -1
- package/retrievers/self_query/pinecone.d.ts +0 -1
- package/retrievers/self_query/pinecone.js +0 -1
- package/retrievers/self_query/supabase.cjs +0 -1
- package/retrievers/self_query/supabase.d.cts +0 -1
- package/retrievers/self_query/supabase.d.ts +0 -1
- package/retrievers/self_query/supabase.js +0 -1
- package/retrievers/self_query/vectara.cjs +0 -1
- package/retrievers/self_query/vectara.d.cts +0 -1
- package/retrievers/self_query/vectara.d.ts +0 -1
- package/retrievers/self_query/vectara.js +0 -1
- package/retrievers/self_query/weaviate.cjs +0 -1
- package/retrievers/self_query/weaviate.d.cts +0 -1
- package/retrievers/self_query/weaviate.d.ts +0 -1
- package/retrievers/self_query/weaviate.js +0 -1
|
@@ -1,608 +0,0 @@
|
|
|
1
|
-
import ignore from "ignore";
|
|
2
|
-
import binaryExtensions from "binary-extensions";
|
|
3
|
-
import { Document } from "@langchain/core/documents";
|
|
4
|
-
import { getEnvironmentVariable } from "@langchain/core/utils/env";
|
|
5
|
-
import { AsyncCaller, } from "@langchain/core/utils/async_caller";
|
|
6
|
-
import { BaseDocumentLoader } from "../base.js";
|
|
7
|
-
import { UnknownHandling } from "../fs/directory.js";
|
|
8
|
-
import { extname } from "../../util/extname.js";
|
|
9
|
-
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
|
|
10
|
-
/* #__PURE__ */ logVersion020MigrationWarning({
|
|
11
|
-
oldEntrypointName: "document_loaders/web/github",
|
|
12
|
-
newPackageName: "@langchain/community",
|
|
13
|
-
});
|
|
14
|
-
const extensions = /* #__PURE__ */ new Set(binaryExtensions);
|
|
15
|
-
/**
|
|
16
|
-
* A function that checks if a file path is a binary file based on its
|
|
17
|
-
* extension.
|
|
18
|
-
* @param name The file path to check.
|
|
19
|
-
* @returns A boolean indicating whether the file path is a binary file.
|
|
20
|
-
*/
|
|
21
|
-
function isBinaryPath(name) {
|
|
22
|
-
return extensions.has(extname(name).slice(1).toLowerCase());
|
|
23
|
-
}
|
|
24
|
-
/**
|
|
25
|
-
* @deprecated - Import from "@langchain/community/document_loaders/web/github" instead. This entrypoint will be removed in 0.3.0.
|
|
26
|
-
*
|
|
27
|
-
* A class that extends the BaseDocumentLoader and implements the
|
|
28
|
-
* GithubRepoLoaderParams interface. It represents a document loader for
|
|
29
|
-
* loading files from a GitHub repository.
|
|
30
|
-
*/
|
|
31
|
-
export class GithubRepoLoader extends BaseDocumentLoader {
|
|
32
|
-
constructor(githubUrl, { accessToken = getEnvironmentVariable("GITHUB_ACCESS_TOKEN"), baseUrl = "https://github.com", apiUrl = "https://api.github.com", branch = "main", recursive = true, processSubmodules = false, unknown = UnknownHandling.Warn, ignoreFiles = [], ignorePaths, verbose = false, maxConcurrency = 2, maxRetries = 2, ...rest } = {}) {
|
|
33
|
-
super();
|
|
34
|
-
Object.defineProperty(this, "baseUrl", {
|
|
35
|
-
enumerable: true,
|
|
36
|
-
configurable: true,
|
|
37
|
-
writable: true,
|
|
38
|
-
value: void 0
|
|
39
|
-
});
|
|
40
|
-
Object.defineProperty(this, "apiUrl", {
|
|
41
|
-
enumerable: true,
|
|
42
|
-
configurable: true,
|
|
43
|
-
writable: true,
|
|
44
|
-
value: void 0
|
|
45
|
-
});
|
|
46
|
-
Object.defineProperty(this, "owner", {
|
|
47
|
-
enumerable: true,
|
|
48
|
-
configurable: true,
|
|
49
|
-
writable: true,
|
|
50
|
-
value: void 0
|
|
51
|
-
});
|
|
52
|
-
Object.defineProperty(this, "repo", {
|
|
53
|
-
enumerable: true,
|
|
54
|
-
configurable: true,
|
|
55
|
-
writable: true,
|
|
56
|
-
value: void 0
|
|
57
|
-
});
|
|
58
|
-
Object.defineProperty(this, "initialPath", {
|
|
59
|
-
enumerable: true,
|
|
60
|
-
configurable: true,
|
|
61
|
-
writable: true,
|
|
62
|
-
value: void 0
|
|
63
|
-
});
|
|
64
|
-
Object.defineProperty(this, "headers", {
|
|
65
|
-
enumerable: true,
|
|
66
|
-
configurable: true,
|
|
67
|
-
writable: true,
|
|
68
|
-
value: {}
|
|
69
|
-
});
|
|
70
|
-
Object.defineProperty(this, "branch", {
|
|
71
|
-
enumerable: true,
|
|
72
|
-
configurable: true,
|
|
73
|
-
writable: true,
|
|
74
|
-
value: void 0
|
|
75
|
-
});
|
|
76
|
-
Object.defineProperty(this, "recursive", {
|
|
77
|
-
enumerable: true,
|
|
78
|
-
configurable: true,
|
|
79
|
-
writable: true,
|
|
80
|
-
value: void 0
|
|
81
|
-
});
|
|
82
|
-
Object.defineProperty(this, "processSubmodules", {
|
|
83
|
-
enumerable: true,
|
|
84
|
-
configurable: true,
|
|
85
|
-
writable: true,
|
|
86
|
-
value: void 0
|
|
87
|
-
});
|
|
88
|
-
Object.defineProperty(this, "unknown", {
|
|
89
|
-
enumerable: true,
|
|
90
|
-
configurable: true,
|
|
91
|
-
writable: true,
|
|
92
|
-
value: void 0
|
|
93
|
-
});
|
|
94
|
-
Object.defineProperty(this, "accessToken", {
|
|
95
|
-
enumerable: true,
|
|
96
|
-
configurable: true,
|
|
97
|
-
writable: true,
|
|
98
|
-
value: void 0
|
|
99
|
-
});
|
|
100
|
-
Object.defineProperty(this, "ignoreFiles", {
|
|
101
|
-
enumerable: true,
|
|
102
|
-
configurable: true,
|
|
103
|
-
writable: true,
|
|
104
|
-
value: void 0
|
|
105
|
-
});
|
|
106
|
-
Object.defineProperty(this, "ignore", {
|
|
107
|
-
enumerable: true,
|
|
108
|
-
configurable: true,
|
|
109
|
-
writable: true,
|
|
110
|
-
value: void 0
|
|
111
|
-
});
|
|
112
|
-
Object.defineProperty(this, "verbose", {
|
|
113
|
-
enumerable: true,
|
|
114
|
-
configurable: true,
|
|
115
|
-
writable: true,
|
|
116
|
-
value: void 0
|
|
117
|
-
});
|
|
118
|
-
Object.defineProperty(this, "maxConcurrency", {
|
|
119
|
-
enumerable: true,
|
|
120
|
-
configurable: true,
|
|
121
|
-
writable: true,
|
|
122
|
-
value: void 0
|
|
123
|
-
});
|
|
124
|
-
Object.defineProperty(this, "maxRetries", {
|
|
125
|
-
enumerable: true,
|
|
126
|
-
configurable: true,
|
|
127
|
-
writable: true,
|
|
128
|
-
value: void 0
|
|
129
|
-
});
|
|
130
|
-
Object.defineProperty(this, "caller", {
|
|
131
|
-
enumerable: true,
|
|
132
|
-
configurable: true,
|
|
133
|
-
writable: true,
|
|
134
|
-
value: void 0
|
|
135
|
-
});
|
|
136
|
-
Object.defineProperty(this, "ignorePaths", {
|
|
137
|
-
enumerable: true,
|
|
138
|
-
configurable: true,
|
|
139
|
-
writable: true,
|
|
140
|
-
value: void 0
|
|
141
|
-
});
|
|
142
|
-
Object.defineProperty(this, "submoduleInfos", {
|
|
143
|
-
enumerable: true,
|
|
144
|
-
configurable: true,
|
|
145
|
-
writable: true,
|
|
146
|
-
value: void 0
|
|
147
|
-
});
|
|
148
|
-
this.baseUrl = baseUrl;
|
|
149
|
-
this.apiUrl = apiUrl;
|
|
150
|
-
const { owner, repo, path } = this.extractOwnerAndRepoAndPath(githubUrl);
|
|
151
|
-
this.owner = owner;
|
|
152
|
-
this.repo = repo;
|
|
153
|
-
this.initialPath = path;
|
|
154
|
-
this.branch = branch;
|
|
155
|
-
this.recursive = recursive;
|
|
156
|
-
// processing submodules without processing contents of other directories makes no sense
|
|
157
|
-
if (processSubmodules && !recursive) {
|
|
158
|
-
throw new Error(`Input property "recursive" must be true if "processSubmodules" is true.`);
|
|
159
|
-
}
|
|
160
|
-
this.processSubmodules = processSubmodules;
|
|
161
|
-
this.unknown = unknown;
|
|
162
|
-
this.accessToken = accessToken;
|
|
163
|
-
this.ignoreFiles = ignoreFiles;
|
|
164
|
-
this.verbose = verbose;
|
|
165
|
-
this.maxConcurrency = maxConcurrency;
|
|
166
|
-
this.maxRetries = maxRetries;
|
|
167
|
-
this.headers = {
|
|
168
|
-
"User-Agent": "langchain",
|
|
169
|
-
};
|
|
170
|
-
this.caller = new AsyncCaller({
|
|
171
|
-
maxConcurrency,
|
|
172
|
-
maxRetries,
|
|
173
|
-
...rest,
|
|
174
|
-
});
|
|
175
|
-
this.ignorePaths = ignorePaths;
|
|
176
|
-
if (ignorePaths) {
|
|
177
|
-
this.ignore = ignore.default().add(ignorePaths);
|
|
178
|
-
}
|
|
179
|
-
if (this.accessToken) {
|
|
180
|
-
this.headers = {
|
|
181
|
-
...this.headers,
|
|
182
|
-
Authorization: `Bearer ${this.accessToken}`,
|
|
183
|
-
};
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
/**
|
|
187
|
-
* Extracts the owner, repository, and path from a GitHub URL.
|
|
188
|
-
* @param url The GitHub URL to extract information from.
|
|
189
|
-
* @returns An object containing the owner, repository, and path extracted from the GitHub URL.
|
|
190
|
-
*/
|
|
191
|
-
extractOwnerAndRepoAndPath(url) {
|
|
192
|
-
const match = url.match(new RegExp(`${this.baseUrl}/([^/]+)/([^/]+)(/tree/[^/]+/(.+))?`, "i"));
|
|
193
|
-
if (!match) {
|
|
194
|
-
throw new Error("Invalid GitHub URL format.");
|
|
195
|
-
}
|
|
196
|
-
return { owner: match[1], repo: match[2], path: match[4] || "" };
|
|
197
|
-
}
|
|
198
|
-
/**
|
|
199
|
-
* Fetches the files from the GitHub repository and creates Document
|
|
200
|
-
* instances for each file. It also handles error handling based on the
|
|
201
|
-
* unknown handling option.
|
|
202
|
-
* @returns A promise that resolves to an array of Document instances.
|
|
203
|
-
*/
|
|
204
|
-
async load() {
|
|
205
|
-
this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
|
|
206
|
-
// process repository without submodules
|
|
207
|
-
const documents = (await this.processRepo()).map((fileResponse) => new Document({
|
|
208
|
-
pageContent: fileResponse.contents,
|
|
209
|
-
metadata: fileResponse.metadata,
|
|
210
|
-
}));
|
|
211
|
-
if (this.processSubmodules) {
|
|
212
|
-
// process submodules
|
|
213
|
-
await this.getSubmoduleInfo();
|
|
214
|
-
for (const submoduleInfo of this.submoduleInfos) {
|
|
215
|
-
documents.push(...(await this.loadSubmodule(submoduleInfo)));
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
return documents;
|
|
219
|
-
}
|
|
220
|
-
/**
|
|
221
|
-
* Asynchronously streams documents from the entire GitHub repository.
|
|
222
|
-
* It is suitable for situations where processing large repositories in a memory-efficient manner is required.
|
|
223
|
-
* @yields Yields a Promise that resolves to a Document object for each file or submodule content found in the repository.
|
|
224
|
-
*/
|
|
225
|
-
async *loadAsStream() {
|
|
226
|
-
this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
|
|
227
|
-
yield* await this.processRepoAsStream(this.initialPath);
|
|
228
|
-
if (!this.processSubmodules) {
|
|
229
|
-
return;
|
|
230
|
-
}
|
|
231
|
-
await this.getSubmoduleInfo();
|
|
232
|
-
for (const submoduleInfo of this.submoduleInfos) {
|
|
233
|
-
yield* await this.loadSubmoduleAsStream(submoduleInfo);
|
|
234
|
-
}
|
|
235
|
-
}
|
|
236
|
-
/**
|
|
237
|
-
* Loads the information about Git submodules from the repository, if available.
|
|
238
|
-
*/
|
|
239
|
-
async getSubmoduleInfo() {
|
|
240
|
-
this.log("Loading info about submodules...");
|
|
241
|
-
// we have to fetch the files of the root directory to get the download url of the .gitmodules file
|
|
242
|
-
// however, we cannot reuse the files retrieved in processRepo() as initialPath may be != ""
|
|
243
|
-
// so it may be that we end up fetching this file list twice
|
|
244
|
-
const repoFiles = await this.fetchRepoFiles("");
|
|
245
|
-
const gitmodulesFile = repoFiles.filter(({ name }) => name === ".gitmodules")?.[0];
|
|
246
|
-
if (gitmodulesFile) {
|
|
247
|
-
const gitmodulesContent = await this.fetchFileContent({
|
|
248
|
-
download_url: gitmodulesFile.download_url,
|
|
249
|
-
});
|
|
250
|
-
this.submoduleInfos = await this.parseGitmodules(gitmodulesContent);
|
|
251
|
-
}
|
|
252
|
-
else {
|
|
253
|
-
this.submoduleInfos = [];
|
|
254
|
-
}
|
|
255
|
-
this.log(`Found ${this.submoduleInfos.length} submodules:`);
|
|
256
|
-
for (const submoduleInfo of this.submoduleInfos) {
|
|
257
|
-
this.log(JSON.stringify(submoduleInfo));
|
|
258
|
-
}
|
|
259
|
-
}
|
|
260
|
-
/**
|
|
261
|
-
* Parses the given content of a .gitmodules file. Furthermore, queries the current SHA ref of all submodules.
|
|
262
|
-
* Returns the submodule information as array.
|
|
263
|
-
* @param gitmodulesContent the content of a .gitmodules file
|
|
264
|
-
*/
|
|
265
|
-
async parseGitmodules(gitmodulesContent) {
|
|
266
|
-
let validGitmodulesContent = gitmodulesContent;
|
|
267
|
-
// in case the .gitmodules file does not end with a newline, we add one to make the regex work
|
|
268
|
-
if (!validGitmodulesContent.endsWith("\n")) {
|
|
269
|
-
validGitmodulesContent += "\n";
|
|
270
|
-
}
|
|
271
|
-
// catches the initial line of submodule entries
|
|
272
|
-
const submodulePattern = /\[submodule "(.*?)"]\n((\s+.*?\s*=\s*.*?\n)*)/g;
|
|
273
|
-
// catches the properties of a submodule
|
|
274
|
-
const keyValuePattern = /\s+(.*?)\s*=\s*(.*?)\s/g;
|
|
275
|
-
const submoduleInfos = [];
|
|
276
|
-
for (const [, name, propertyLines] of validGitmodulesContent.matchAll(submodulePattern)) {
|
|
277
|
-
if (!name || !propertyLines) {
|
|
278
|
-
throw new Error("Could not parse submodule entry");
|
|
279
|
-
}
|
|
280
|
-
const submodulePropertyLines = propertyLines.matchAll(keyValuePattern);
|
|
281
|
-
let path;
|
|
282
|
-
let url;
|
|
283
|
-
for (const [, key, value] of submodulePropertyLines) {
|
|
284
|
-
if (!key || !value) {
|
|
285
|
-
throw new Error(`Could not parse key/value pairs for submodule ${name}`);
|
|
286
|
-
}
|
|
287
|
-
switch (key) {
|
|
288
|
-
case "path":
|
|
289
|
-
path = value;
|
|
290
|
-
break;
|
|
291
|
-
case "url":
|
|
292
|
-
url = value;
|
|
293
|
-
if (url.endsWith(".git")) {
|
|
294
|
-
url = url.substring(0, url.length - 4);
|
|
295
|
-
}
|
|
296
|
-
break;
|
|
297
|
-
default:
|
|
298
|
-
// ignoring unused keys
|
|
299
|
-
}
|
|
300
|
-
}
|
|
301
|
-
if (!path || !url) {
|
|
302
|
-
throw new Error(`Missing properties for submodule ${name}`);
|
|
303
|
-
}
|
|
304
|
-
// fetch the current ref of the submodule
|
|
305
|
-
const files = await this.fetchRepoFiles(path);
|
|
306
|
-
const submoduleInfo = {
|
|
307
|
-
name,
|
|
308
|
-
path,
|
|
309
|
-
url,
|
|
310
|
-
ref: files[0].sha,
|
|
311
|
-
};
|
|
312
|
-
submoduleInfos.push(submoduleInfo);
|
|
313
|
-
}
|
|
314
|
-
return submoduleInfos;
|
|
315
|
-
}
|
|
316
|
-
/**
|
|
317
|
-
* Loads the documents of the given submodule. Uses the same parameters as for the current repository.
|
|
318
|
-
* External submodules, i.e. submodules pointing to another GitHub instance, are ignored.
|
|
319
|
-
* @param submoduleInfo the info about the submodule to be loaded
|
|
320
|
-
*/
|
|
321
|
-
async loadSubmodule(submoduleInfo) {
|
|
322
|
-
if (!submoduleInfo.url.startsWith(this.baseUrl)) {
|
|
323
|
-
this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
|
|
324
|
-
return [];
|
|
325
|
-
}
|
|
326
|
-
else if (!submoduleInfo.path.startsWith(this.initialPath)) {
|
|
327
|
-
this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
|
|
328
|
-
return [];
|
|
329
|
-
}
|
|
330
|
-
else {
|
|
331
|
-
this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
|
|
332
|
-
return new GithubRepoLoader(submoduleInfo.url, {
|
|
333
|
-
accessToken: this.accessToken,
|
|
334
|
-
apiUrl: this.apiUrl,
|
|
335
|
-
baseUrl: this.baseUrl,
|
|
336
|
-
branch: submoduleInfo.ref,
|
|
337
|
-
recursive: this.recursive,
|
|
338
|
-
processSubmodules: this.processSubmodules,
|
|
339
|
-
unknown: this.unknown,
|
|
340
|
-
ignoreFiles: this.ignoreFiles,
|
|
341
|
-
ignorePaths: this.ignorePaths,
|
|
342
|
-
verbose: this.verbose,
|
|
343
|
-
maxConcurrency: this.maxConcurrency,
|
|
344
|
-
maxRetries: this.maxRetries,
|
|
345
|
-
}).load();
|
|
346
|
-
}
|
|
347
|
-
}
|
|
348
|
-
/**
|
|
349
|
-
* Asynchronously processes and streams the contents of a specified submodule in the GitHub repository.
|
|
350
|
-
* @param submoduleInfo the info about the submodule to be loaded
|
|
351
|
-
* @yields Yields a Promise that resolves to a Document object for each file found in the submodule.
|
|
352
|
-
*/
|
|
353
|
-
async *loadSubmoduleAsStream(submoduleInfo) {
|
|
354
|
-
if (!submoduleInfo.url.startsWith(this.baseUrl)) {
|
|
355
|
-
this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
|
|
356
|
-
yield* [];
|
|
357
|
-
}
|
|
358
|
-
if (!submoduleInfo.path.startsWith(this.initialPath)) {
|
|
359
|
-
this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
|
|
360
|
-
yield* [];
|
|
361
|
-
}
|
|
362
|
-
this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
|
|
363
|
-
const submoduleLoader = new GithubRepoLoader(submoduleInfo.url, {
|
|
364
|
-
accessToken: this.accessToken,
|
|
365
|
-
baseUrl: this.baseUrl,
|
|
366
|
-
apiUrl: this.apiUrl,
|
|
367
|
-
branch: submoduleInfo.ref,
|
|
368
|
-
recursive: this.recursive,
|
|
369
|
-
processSubmodules: this.processSubmodules,
|
|
370
|
-
unknown: this.unknown,
|
|
371
|
-
ignoreFiles: this.ignoreFiles,
|
|
372
|
-
ignorePaths: this.ignorePaths,
|
|
373
|
-
verbose: this.verbose,
|
|
374
|
-
maxConcurrency: this.maxConcurrency,
|
|
375
|
-
maxRetries: this.maxRetries,
|
|
376
|
-
});
|
|
377
|
-
yield* await submoduleLoader.processRepoAsStream(submoduleInfo.path);
|
|
378
|
-
}
|
|
379
|
-
/**
|
|
380
|
-
* Determines whether a file or directory should be ignored based on its
|
|
381
|
-
* path and type.
|
|
382
|
-
* @param path The path of the file or directory.
|
|
383
|
-
* @param fileType The type of the file or directory.
|
|
384
|
-
* @returns A boolean indicating whether the file or directory should be ignored.
|
|
385
|
-
*/
|
|
386
|
-
shouldIgnore(path, fileType) {
|
|
387
|
-
if (fileType !== "dir" && isBinaryPath(path)) {
|
|
388
|
-
return true;
|
|
389
|
-
}
|
|
390
|
-
if (this.ignore !== undefined) {
|
|
391
|
-
return this.ignore.ignores(path);
|
|
392
|
-
}
|
|
393
|
-
return (fileType !== "dir" &&
|
|
394
|
-
this.ignoreFiles.some((pattern) => {
|
|
395
|
-
if (typeof pattern === "string") {
|
|
396
|
-
return path === pattern;
|
|
397
|
-
}
|
|
398
|
-
try {
|
|
399
|
-
return pattern.test(path);
|
|
400
|
-
}
|
|
401
|
-
catch {
|
|
402
|
-
throw new Error(`Unknown ignore file pattern: ${pattern}`);
|
|
403
|
-
}
|
|
404
|
-
}));
|
|
405
|
-
}
|
|
406
|
-
/**
|
|
407
|
-
* Takes the file info and wrap it in a promise that will resolve to the file content and metadata
|
|
408
|
-
* @param file
|
|
409
|
-
* @returns
|
|
410
|
-
*/
|
|
411
|
-
async fetchFileContentWrapper(file) {
|
|
412
|
-
const fileContent = await this.fetchFileContent(file).catch((error) => {
|
|
413
|
-
this.handleError(`Failed wrap file content: ${file}, ${error}`);
|
|
414
|
-
});
|
|
415
|
-
return {
|
|
416
|
-
contents: fileContent || "",
|
|
417
|
-
metadata: {
|
|
418
|
-
source: file.path,
|
|
419
|
-
repository: `${this.baseUrl}/${this.owner}/${this.repo}`,
|
|
420
|
-
branch: this.branch,
|
|
421
|
-
},
|
|
422
|
-
};
|
|
423
|
-
}
|
|
424
|
-
/**
|
|
425
|
-
* Maps a list of files / directories to a list of promises that will fetch the file / directory contents
|
|
426
|
-
*/
|
|
427
|
-
async getCurrentDirectoryFilePromises(files) {
|
|
428
|
-
const currentDirectoryFilePromises = [];
|
|
429
|
-
// Directories have nested files / directories, which is why this is a list of promises of promises
|
|
430
|
-
const currentDirectoryDirectoryPromises = [];
|
|
431
|
-
for (const file of files) {
|
|
432
|
-
if (this.shouldIgnore(file.path, file.type)) {
|
|
433
|
-
continue;
|
|
434
|
-
}
|
|
435
|
-
if (file.type === "file" && file.size === 0) {
|
|
436
|
-
// this is a submodule. ignoring for the moment. submodule processing is done separately
|
|
437
|
-
continue;
|
|
438
|
-
}
|
|
439
|
-
if (file.type !== "dir") {
|
|
440
|
-
try {
|
|
441
|
-
currentDirectoryFilePromises.push(this.fetchFileContentWrapper(file));
|
|
442
|
-
}
|
|
443
|
-
catch (e) {
|
|
444
|
-
this.handleError(`Failed to fetch file content: ${file.path}, ${e}`);
|
|
445
|
-
}
|
|
446
|
-
}
|
|
447
|
-
else if (this.recursive) {
|
|
448
|
-
currentDirectoryDirectoryPromises.push(this.processDirectory(file.path));
|
|
449
|
-
}
|
|
450
|
-
}
|
|
451
|
-
const curDirDirectories = await Promise.all(currentDirectoryDirectoryPromises);
|
|
452
|
-
return [...currentDirectoryFilePromises, ...curDirDirectories.flat()];
|
|
453
|
-
}
|
|
454
|
-
/**
|
|
455
|
-
* Begins the process of fetching the contents of the repository
|
|
456
|
-
*/
|
|
457
|
-
async processRepo() {
|
|
458
|
-
try {
|
|
459
|
-
// Get the list of file / directory names in the root directory
|
|
460
|
-
const files = await this.fetchRepoFiles(this.initialPath);
|
|
461
|
-
// Map the file / directory paths to promises that will fetch the file / directory contents
|
|
462
|
-
const currentDirectoryFilePromises = await this.getCurrentDirectoryFilePromises(files);
|
|
463
|
-
return Promise.all(currentDirectoryFilePromises);
|
|
464
|
-
}
|
|
465
|
-
catch (error) {
|
|
466
|
-
this.handleError(`Failed to process directory: ${this.initialPath}, ${error}`);
|
|
467
|
-
return Promise.reject(error);
|
|
468
|
-
}
|
|
469
|
-
}
|
|
470
|
-
/**
|
|
471
|
-
* Asynchronously processes the contents of the entire GitHub repository,
|
|
472
|
-
* streaming each file as a Document object.
|
|
473
|
-
* @param path The path of the directory to process.
|
|
474
|
-
* @yields Yields a Promise that resolves to a Document object for each file found in the repository.
|
|
475
|
-
*/
|
|
476
|
-
async *processRepoAsStream(path) {
|
|
477
|
-
const files = await this.fetchRepoFiles(path);
|
|
478
|
-
for (const file of files) {
|
|
479
|
-
if (this.shouldIgnore(file.path, file.type)) {
|
|
480
|
-
continue;
|
|
481
|
-
}
|
|
482
|
-
if (file.type === "file") {
|
|
483
|
-
try {
|
|
484
|
-
const fileResponse = await this.fetchFileContentWrapper(file);
|
|
485
|
-
yield new Document({
|
|
486
|
-
pageContent: fileResponse.contents,
|
|
487
|
-
metadata: fileResponse.metadata,
|
|
488
|
-
});
|
|
489
|
-
}
|
|
490
|
-
catch (error) {
|
|
491
|
-
this.handleError(`Failed to fetch file content: ${file.path}, ${error}`);
|
|
492
|
-
}
|
|
493
|
-
}
|
|
494
|
-
else if (this.recursive) {
|
|
495
|
-
yield* await this.processDirectoryAsStream(file.path);
|
|
496
|
-
}
|
|
497
|
-
}
|
|
498
|
-
}
|
|
499
|
-
/**
|
|
500
|
-
* Fetches the contents of a directory and maps the file / directory paths
|
|
501
|
-
* to promises that will fetch the file / directory contents.
|
|
502
|
-
* @param path The path of the directory to process.
|
|
503
|
-
* @returns A promise that resolves to an array of promises that will fetch the file / directory contents.
|
|
504
|
-
*/
|
|
505
|
-
async processDirectory(path) {
|
|
506
|
-
try {
|
|
507
|
-
const files = await this.fetchRepoFiles(path);
|
|
508
|
-
return this.getCurrentDirectoryFilePromises(files);
|
|
509
|
-
}
|
|
510
|
-
catch (error) {
|
|
511
|
-
this.handleError(`Failed to process directory: ${path}, ${error}`);
|
|
512
|
-
return Promise.reject(error);
|
|
513
|
-
}
|
|
514
|
-
}
|
|
515
|
-
/**
|
|
516
|
-
* Asynchronously processes the contents of a given directory in the GitHub repository,
|
|
517
|
-
* streaming each file as a Document object.
|
|
518
|
-
* @param path The path of the directory to process.
|
|
519
|
-
* @yields Yields a Promise that resolves to a Document object for each file in the directory.
|
|
520
|
-
*/
|
|
521
|
-
async *processDirectoryAsStream(path) {
|
|
522
|
-
const files = await this.fetchRepoFiles(path);
|
|
523
|
-
for (const file of files) {
|
|
524
|
-
if (this.shouldIgnore(file.path, file.type)) {
|
|
525
|
-
continue;
|
|
526
|
-
}
|
|
527
|
-
if (file.type === "file") {
|
|
528
|
-
try {
|
|
529
|
-
const fileResponse = await this.fetchFileContentWrapper(file);
|
|
530
|
-
yield new Document({
|
|
531
|
-
pageContent: fileResponse.contents,
|
|
532
|
-
metadata: fileResponse.metadata,
|
|
533
|
-
});
|
|
534
|
-
}
|
|
535
|
-
catch {
|
|
536
|
-
this.handleError(`Failed to fetch file content: ${file.path}`);
|
|
537
|
-
}
|
|
538
|
-
}
|
|
539
|
-
else if (this.recursive) {
|
|
540
|
-
yield* await this.processDirectoryAsStream(file.path);
|
|
541
|
-
}
|
|
542
|
-
}
|
|
543
|
-
}
|
|
544
|
-
/**
|
|
545
|
-
* Fetches the files from a GitHub repository.
|
|
546
|
-
* If the path denotes a single file, the resulting array contains only one element.
|
|
547
|
-
* @param path The path of the repository to fetch the files from.
|
|
548
|
-
* @returns A promise that resolves to an array of GithubFile instances.
|
|
549
|
-
*/
|
|
550
|
-
async fetchRepoFiles(path) {
|
|
551
|
-
const url = `${this.apiUrl}/repos/${this.owner}/${this.repo}/contents/${path}?ref=${this.branch}`;
|
|
552
|
-
return this.caller.call(async () => {
|
|
553
|
-
this.log(`Fetching ${url}`);
|
|
554
|
-
const response = await fetch(url, { headers: this.headers });
|
|
555
|
-
const data = await response.json();
|
|
556
|
-
if (!response.ok) {
|
|
557
|
-
throw new Error(`Unable to fetch repository files: ${response.status} ${JSON.stringify(data)}`);
|
|
558
|
-
}
|
|
559
|
-
if (Array.isArray(data)) {
|
|
560
|
-
return data;
|
|
561
|
-
}
|
|
562
|
-
else {
|
|
563
|
-
return [data];
|
|
564
|
-
}
|
|
565
|
-
});
|
|
566
|
-
}
|
|
567
|
-
/**
|
|
568
|
-
* Fetches the content of a file from a GitHub repository.
|
|
569
|
-
* @param file The file to fetch the content from.
|
|
570
|
-
* @returns A promise that resolves to the content of the file.
|
|
571
|
-
*/
|
|
572
|
-
async fetchFileContent(file) {
|
|
573
|
-
return this.caller.call(async () => {
|
|
574
|
-
this.log(`Fetching ${file.download_url}`);
|
|
575
|
-
const response = await fetch(file.download_url, {
|
|
576
|
-
headers: this.headers,
|
|
577
|
-
});
|
|
578
|
-
return response.text();
|
|
579
|
-
});
|
|
580
|
-
}
|
|
581
|
-
/**
|
|
582
|
-
* Handles errors based on the unknown handling option.
|
|
583
|
-
* @param message The error message.
|
|
584
|
-
* @returns void
|
|
585
|
-
*/
|
|
586
|
-
handleError(message) {
|
|
587
|
-
switch (this.unknown) {
|
|
588
|
-
case UnknownHandling.Ignore:
|
|
589
|
-
break;
|
|
590
|
-
case UnknownHandling.Warn:
|
|
591
|
-
console.warn(message);
|
|
592
|
-
break;
|
|
593
|
-
case UnknownHandling.Error:
|
|
594
|
-
throw new Error(message);
|
|
595
|
-
default:
|
|
596
|
-
throw new Error(`Unknown unknown handling: ${this.unknown}`);
|
|
597
|
-
}
|
|
598
|
-
}
|
|
599
|
-
/**
|
|
600
|
-
* Logs the given message to the console, if parameter 'verbose' is set to true.
|
|
601
|
-
* @param message the message to be logged.
|
|
602
|
-
*/
|
|
603
|
-
log(message) {
|
|
604
|
-
if (this.verbose) {
|
|
605
|
-
console.log(message);
|
|
606
|
-
}
|
|
607
|
-
}
|
|
608
|
-
}
|
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.HNLoader = void 0;
|
|
4
|
-
const documents_1 = require("@langchain/core/documents");
|
|
5
|
-
const cheerio_js_1 = require("./cheerio.cjs");
|
|
6
|
-
const entrypoint_deprecation_js_1 = require("../../util/entrypoint_deprecation.cjs");
|
|
7
|
-
/* #__PURE__ */ (0, entrypoint_deprecation_js_1.logVersion020MigrationWarning)({
|
|
8
|
-
oldEntrypointName: "document_loaders/web/hn",
|
|
9
|
-
newPackageName: "@langchain/community",
|
|
10
|
-
});
|
|
11
|
-
/**
 * @deprecated - Import from "@langchain/community/document_loaders/web/hn" instead. This entrypoint will be removed in 0.3.0.
 *
 * A CheerioWebBaseLoader subclass that loads pages from the Hacker News
 * website: item pages are parsed as comment threads, every other page is
 * parsed as a list of results.
 */
class HNLoader extends cheerio_js_1.CheerioWebBaseLoader {
    constructor(webPath) {
        super(webPath);
        // Compiled class-field initializer for `webPath`.
        Object.defineProperty(this, "webPath", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: webPath
        });
    }
    /**
     * Scrapes the page behind `webPath`. URLs containing "item" are parsed
     * as comment threads via loadComments(); anything else is parsed as a
     * result listing via loadResults().
     * @returns A Promise that resolves to an array of Document instances.
     */
    async load() {
        const $ = await this.scrape();
        return this.webPath.includes("item")
            ? this.loadComments($)
            : this.loadResults($);
    }
    /**
     * Parses the comments of an item page. Each "athing comtr" table row
     * becomes one Document: the trimmed comment text is the page content,
     * and the metadata carries the source URL plus the page title taken
     * from the "pagespace" row.
     * @param $ A CheerioAPI instance.
     * @returns An array of Document instances.
     */
    loadComments($) {
        const title = $("tr[id='pagespace']").attr("title");
        const documents = [];
        $("tr[class='athing comtr']").each((_i, row) => {
            documents.push(new documents_1.Document({
                pageContent: $(row).text().trim(),
                metadata: { source: this.webPath, title },
            }));
        });
        return documents;
    }
    /**
     * Parses a listing page. Each "athing" row becomes one Document: the
     * story title is the page content, and the metadata carries the source
     * URL, title, outbound link and ranking.
     * @param $ A CheerioAPI instance.
     * @returns An array of Document instances.
     */
    loadResults($) {
        const documents = [];
        $("tr[class='athing']").each((_i, row) => {
            const item = $(row);
            const title = item.find("span[class='titleline']").text().trim();
            documents.push(new documents_1.Document({
                pageContent: title,
                metadata: {
                    source: this.webPath,
                    title,
                    link: item.find("span[class='titleline'] a").attr("href"),
                    ranking: item.find("span[class='rank']").text(),
                },
            }));
        });
        return documents;
    }
}
exports.HNLoader = HNLoader;
|