langchain 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/document_loaders/fs/multi_file.cjs +98 -0
- package/dist/document_loaders/fs/multi_file.d.ts +37 -0
- package/dist/document_loaders/fs/multi_file.js +94 -0
- package/dist/document_loaders/tests/multi_file.test.d.ts +1 -0
- package/dist/document_loaders/tests/multi_file.test.js +49 -0
- package/dist/load/import_constants.cjs +1 -0
- package/dist/load/import_constants.js +1 -0
- package/dist/retrievers/multi_query.cjs +1 -1
- package/dist/retrievers/multi_query.js +1 -1
- package/document_loaders/fs/multi_file.cjs +1 -0
- package/document_loaders/fs/multi_file.d.cts +1 -0
- package/document_loaders/fs/multi_file.d.ts +1 -0
- package/document_loaders/fs/multi_file.js +1 -0
- package/package.json +14 -1
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.MultiFileLoader = void 0;
|
|
4
|
+
const node_path_1 = require("node:path");
|
|
5
|
+
const promises_1 = require("node:fs/promises");
|
|
6
|
+
const base_js_1 = require("../base.cjs");
|
|
7
|
+
const directory_js_1 = require("./directory.cjs");
|
|
8
|
+
/**
|
|
9
|
+
* A document loader that loads documents from multiple files. It extends the
|
|
10
|
+
* `BaseDocumentLoader` class and implements the `load()` method.
|
|
11
|
+
* @example
|
|
12
|
+
* ```typescript
|
|
13
|
+
*
|
|
14
|
+
* const multiFileLoader = new MultiFileLoader(
|
|
15
|
+
* ["path/to/file1.pdf", "path/to/file2.txt"],
|
|
16
|
+
* {
|
|
17
|
+
* ".pdf": (path: string) => new PDFLoader(path),
|
|
18
|
+
* },
|
|
19
|
+
* );
|
|
20
|
+
*
|
|
21
|
+
* const docs = await multiFileLoader.load();
|
|
22
|
+
* console.log({ docs });
|
|
23
|
+
*
|
|
24
|
+
* ```
|
|
25
|
+
*/
|
|
26
|
+
class MultiFileLoader extends base_js_1.BaseDocumentLoader {
|
|
27
|
+
constructor(filePaths, loaders, unknown = directory_js_1.UnknownHandling.Warn) {
|
|
28
|
+
super();
|
|
29
|
+
Object.defineProperty(this, "filePaths", {
|
|
30
|
+
enumerable: true,
|
|
31
|
+
configurable: true,
|
|
32
|
+
writable: true,
|
|
33
|
+
value: filePaths
|
|
34
|
+
});
|
|
35
|
+
Object.defineProperty(this, "loaders", {
|
|
36
|
+
enumerable: true,
|
|
37
|
+
configurable: true,
|
|
38
|
+
writable: true,
|
|
39
|
+
value: loaders
|
|
40
|
+
});
|
|
41
|
+
Object.defineProperty(this, "unknown", {
|
|
42
|
+
enumerable: true,
|
|
43
|
+
configurable: true,
|
|
44
|
+
writable: true,
|
|
45
|
+
value: unknown
|
|
46
|
+
});
|
|
47
|
+
if (Object.keys(loaders).length === 0) {
|
|
48
|
+
throw new Error("Must provide at least one loader");
|
|
49
|
+
}
|
|
50
|
+
for (const extension in loaders) {
|
|
51
|
+
if (Object.hasOwn(loaders, extension)) {
|
|
52
|
+
if (extension[0] !== ".") {
|
|
53
|
+
throw new Error(`Extension must start with a dot: ${extension}`);
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
/**
|
|
59
|
+
* Loads the documents from the provided file paths. It checks if the file
|
|
60
|
+
* is a directory and ignores it. If a file is a file, it checks if there
|
|
61
|
+
* is a corresponding loader function for the file extension in the `loaders`
|
|
62
|
+
* mapping. If there is, it loads the documents. If there is no
|
|
63
|
+
* corresponding loader function and `unknown` is set to `Warn`, it logs a
|
|
64
|
+
* warning message. If `unknown` is set to `Error`, it throws an error.
|
|
65
|
+
* @returns A promise that resolves to an array of loaded documents.
|
|
66
|
+
*/
|
|
67
|
+
async load() {
|
|
68
|
+
const documents = [];
|
|
69
|
+
for (const filePath of this.filePaths) {
|
|
70
|
+
const fullPath = (0, node_path_1.resolve)(filePath);
|
|
71
|
+
const fileStat = await (0, promises_1.stat)(fullPath);
|
|
72
|
+
if (fileStat.isDirectory()) {
|
|
73
|
+
console.warn(`Ignoring directory: ${fullPath}`);
|
|
74
|
+
continue;
|
|
75
|
+
}
|
|
76
|
+
const loaderFactory = this.loaders[(0, node_path_1.extname)(fullPath)];
|
|
77
|
+
if (loaderFactory) {
|
|
78
|
+
const loader = loaderFactory(fullPath);
|
|
79
|
+
documents.push(...(await loader.load()));
|
|
80
|
+
}
|
|
81
|
+
else {
|
|
82
|
+
switch (this.unknown) {
|
|
83
|
+
case directory_js_1.UnknownHandling.Ignore:
|
|
84
|
+
break;
|
|
85
|
+
case directory_js_1.UnknownHandling.Warn:
|
|
86
|
+
console.warn(`Unknown file type: ${fullPath}`);
|
|
87
|
+
break;
|
|
88
|
+
case directory_js_1.UnknownHandling.Error:
|
|
89
|
+
throw new Error(`Unknown file type: ${fullPath}`);
|
|
90
|
+
default:
|
|
91
|
+
throw new Error(`Unknown unknown handling: ${this.unknown}`);
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
return documents;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
exports.MultiFileLoader = MultiFileLoader;
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { Document } from "@langchain/core/documents";
|
|
2
|
+
import { BaseDocumentLoader } from "../base.js";
|
|
3
|
+
import { type LoadersMapping, UnknownHandling } from "./directory.js";
|
|
4
|
+
/**
|
|
5
|
+
* A document loader that loads documents from multiple files. It extends the
|
|
6
|
+
* `BaseDocumentLoader` class and implements the `load()` method.
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
*
|
|
10
|
+
* const multiFileLoader = new MultiFileLoader(
|
|
11
|
+
* ["path/to/file1.pdf", "path/to/file2.txt"],
|
|
12
|
+
* {
|
|
13
|
+
* ".pdf": (path: string) => new PDFLoader(path),
|
|
14
|
+
* },
|
|
15
|
+
* );
|
|
16
|
+
*
|
|
17
|
+
* const docs = await multiFileLoader.load();
|
|
18
|
+
* console.log({ docs });
|
|
19
|
+
*
|
|
20
|
+
* ```
|
|
21
|
+
*/
|
|
22
|
+
export declare class MultiFileLoader extends BaseDocumentLoader {
|
|
23
|
+
filePaths: string[];
|
|
24
|
+
loaders: LoadersMapping;
|
|
25
|
+
unknown: UnknownHandling;
|
|
26
|
+
constructor(filePaths: string[], loaders: LoadersMapping, unknown?: UnknownHandling);
|
|
27
|
+
/**
|
|
28
|
+
* Loads the documents from the provided file paths. It checks if the file
|
|
29
|
+
* is a directory and ignores it. If a file is a file, it checks if there
|
|
30
|
+
* is a corresponding loader function for the file extension in the `loaders`
|
|
31
|
+
* mapping. If there is, it loads the documents. If there is no
|
|
32
|
+
* corresponding loader function and `unknown` is set to `Warn`, it logs a
|
|
33
|
+
* warning message. If `unknown` is set to `Error`, it throws an error.
|
|
34
|
+
* @returns A promise that resolves to an array of loaded documents.
|
|
35
|
+
*/
|
|
36
|
+
load(): Promise<Document[]>;
|
|
37
|
+
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { extname, resolve } from "node:path";
|
|
2
|
+
import { stat } from "node:fs/promises";
|
|
3
|
+
import { BaseDocumentLoader } from "../base.js";
|
|
4
|
+
import { UnknownHandling } from "./directory.js";
|
|
5
|
+
/**
|
|
6
|
+
* A document loader that loads documents from multiple files. It extends the
|
|
7
|
+
* `BaseDocumentLoader` class and implements the `load()` method.
|
|
8
|
+
* @example
|
|
9
|
+
* ```typescript
|
|
10
|
+
*
|
|
11
|
+
* const multiFileLoader = new MultiFileLoader(
|
|
12
|
+
* ["path/to/file1.pdf", "path/to/file2.txt"],
|
|
13
|
+
* {
|
|
14
|
+
* ".pdf": (path: string) => new PDFLoader(path),
|
|
15
|
+
* },
|
|
16
|
+
* );
|
|
17
|
+
*
|
|
18
|
+
* const docs = await multiFileLoader.load();
|
|
19
|
+
* console.log({ docs });
|
|
20
|
+
*
|
|
21
|
+
* ```
|
|
22
|
+
*/
|
|
23
|
+
export class MultiFileLoader extends BaseDocumentLoader {
|
|
24
|
+
constructor(filePaths, loaders, unknown = UnknownHandling.Warn) {
|
|
25
|
+
super();
|
|
26
|
+
Object.defineProperty(this, "filePaths", {
|
|
27
|
+
enumerable: true,
|
|
28
|
+
configurable: true,
|
|
29
|
+
writable: true,
|
|
30
|
+
value: filePaths
|
|
31
|
+
});
|
|
32
|
+
Object.defineProperty(this, "loaders", {
|
|
33
|
+
enumerable: true,
|
|
34
|
+
configurable: true,
|
|
35
|
+
writable: true,
|
|
36
|
+
value: loaders
|
|
37
|
+
});
|
|
38
|
+
Object.defineProperty(this, "unknown", {
|
|
39
|
+
enumerable: true,
|
|
40
|
+
configurable: true,
|
|
41
|
+
writable: true,
|
|
42
|
+
value: unknown
|
|
43
|
+
});
|
|
44
|
+
if (Object.keys(loaders).length === 0) {
|
|
45
|
+
throw new Error("Must provide at least one loader");
|
|
46
|
+
}
|
|
47
|
+
for (const extension in loaders) {
|
|
48
|
+
if (Object.hasOwn(loaders, extension)) {
|
|
49
|
+
if (extension[0] !== ".") {
|
|
50
|
+
throw new Error(`Extension must start with a dot: ${extension}`);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* Loads the documents from the provided file paths. It checks if the file
|
|
57
|
+
* is a directory and ignores it. If a file is a file, it checks if there
|
|
58
|
+
* is a corresponding loader function for the file extension in the `loaders`
|
|
59
|
+
* mapping. If there is, it loads the documents. If there is no
|
|
60
|
+
* corresponding loader function and `unknown` is set to `Warn`, it logs a
|
|
61
|
+
* warning message. If `unknown` is set to `Error`, it throws an error.
|
|
62
|
+
* @returns A promise that resolves to an array of loaded documents.
|
|
63
|
+
*/
|
|
64
|
+
async load() {
|
|
65
|
+
const documents = [];
|
|
66
|
+
for (const filePath of this.filePaths) {
|
|
67
|
+
const fullPath = resolve(filePath);
|
|
68
|
+
const fileStat = await stat(fullPath);
|
|
69
|
+
if (fileStat.isDirectory()) {
|
|
70
|
+
console.warn(`Ignoring directory: ${fullPath}`);
|
|
71
|
+
continue;
|
|
72
|
+
}
|
|
73
|
+
const loaderFactory = this.loaders[extname(fullPath)];
|
|
74
|
+
if (loaderFactory) {
|
|
75
|
+
const loader = loaderFactory(fullPath);
|
|
76
|
+
documents.push(...(await loader.load()));
|
|
77
|
+
}
|
|
78
|
+
else {
|
|
79
|
+
switch (this.unknown) {
|
|
80
|
+
case UnknownHandling.Ignore:
|
|
81
|
+
break;
|
|
82
|
+
case UnknownHandling.Warn:
|
|
83
|
+
console.warn(`Unknown file type: ${fullPath}`);
|
|
84
|
+
break;
|
|
85
|
+
case UnknownHandling.Error:
|
|
86
|
+
throw new Error(`Unknown file type: ${fullPath}`);
|
|
87
|
+
default:
|
|
88
|
+
throw new Error(`Unknown unknown handling: ${this.unknown}`);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
return documents;
|
|
93
|
+
}
|
|
94
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import * as url from "node:url";
|
|
2
|
+
import * as path from "node:path";
|
|
3
|
+
import { test, expect } from "@jest/globals";
|
|
4
|
+
import { MultiFileLoader } from "../fs/multi_file.js";
|
|
5
|
+
import { CSVLoader } from "../fs/csv.js";
|
|
6
|
+
import { PDFLoader } from "../fs/pdf.js";
|
|
7
|
+
import { TextLoader } from "../fs/text.js";
|
|
8
|
+
import { JSONLoader } from "../fs/json.js";
|
|
9
|
+
import { UnknownHandling } from "../fs/directory.js";
|
|
10
|
+
test("Test MultiFileLoader", async () => {
|
|
11
|
+
const baseDirectory = path.resolve(path.dirname(url.fileURLToPath(import.meta.url)), "./example_data");
|
|
12
|
+
const filePaths = [
|
|
13
|
+
path.resolve(baseDirectory, "1706.03762.pdf"),
|
|
14
|
+
path.resolve(baseDirectory, "Jacob_Lee_Resume_2023.pdf"),
|
|
15
|
+
path.resolve(baseDirectory, "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.csv"),
|
|
16
|
+
path.resolve(baseDirectory, "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.json"),
|
|
17
|
+
path.resolve(baseDirectory, "complex.json"),
|
|
18
|
+
path.resolve(baseDirectory, "example.txt"),
|
|
19
|
+
path.resolve(baseDirectory, "example_separator.csv"),
|
|
20
|
+
];
|
|
21
|
+
const loader = new MultiFileLoader(filePaths, {
|
|
22
|
+
".csv": (p) => {
|
|
23
|
+
if (p.includes("separator.csv")) {
|
|
24
|
+
return new CSVLoader(p, { column: "html", separator: "|" });
|
|
25
|
+
}
|
|
26
|
+
return new CSVLoader(p, "html");
|
|
27
|
+
},
|
|
28
|
+
".pdf": (p) => new PDFLoader(p),
|
|
29
|
+
".txt": (p) => new TextLoader(p),
|
|
30
|
+
".json": (p) => new JSONLoader(p),
|
|
31
|
+
}, UnknownHandling.Ignore);
|
|
32
|
+
const docs = await loader.load();
|
|
33
|
+
expect(docs.length).toBe(123);
|
|
34
|
+
const expectedSources = [
|
|
35
|
+
// PDF
|
|
36
|
+
...Array.from({ length: 15 }, (_) => path.resolve(baseDirectory, "1706.03762.pdf")),
|
|
37
|
+
path.resolve(baseDirectory, "Jacob_Lee_Resume_2023.pdf"),
|
|
38
|
+
// CSV
|
|
39
|
+
...Array.from({ length: 32 }, (_) => path.resolve(baseDirectory, "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.csv")),
|
|
40
|
+
// JSON
|
|
41
|
+
...Array.from({ length: 32 }, (_) => path.resolve(baseDirectory, "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.json")),
|
|
42
|
+
...Array.from({ length: 10 }, (_) => path.resolve(baseDirectory, "complex.json")),
|
|
43
|
+
// TXT
|
|
44
|
+
path.resolve(baseDirectory, "example.txt"),
|
|
45
|
+
// CSV
|
|
46
|
+
...Array.from({ length: 32 }, (_) => path.resolve(baseDirectory, "example_separator.csv")),
|
|
47
|
+
];
|
|
48
|
+
expect(docs.map((d) => d.metadata.source).sort()).toEqual(expectedSources);
|
|
49
|
+
});
|
|
@@ -38,6 +38,7 @@ exports.optionalImportEntrypoints = [
|
|
|
38
38
|
"langchain/document_loaders/web/couchbase",
|
|
39
39
|
"langchain/document_loaders/web/youtube",
|
|
40
40
|
"langchain/document_loaders/fs/directory",
|
|
41
|
+
"langchain/document_loaders/fs/multi_file",
|
|
41
42
|
"langchain/document_loaders/fs/buffer",
|
|
42
43
|
"langchain/document_loaders/fs/chatgpt",
|
|
43
44
|
"langchain/document_loaders/fs/text",
|
|
@@ -35,6 +35,7 @@ export const optionalImportEntrypoints = [
|
|
|
35
35
|
"langchain/document_loaders/web/couchbase",
|
|
36
36
|
"langchain/document_loaders/web/youtube",
|
|
37
37
|
"langchain/document_loaders/fs/directory",
|
|
38
|
+
"langchain/document_loaders/fs/multi_file",
|
|
38
39
|
"langchain/document_loaders/fs/buffer",
|
|
39
40
|
"langchain/document_loaders/fs/chatgpt",
|
|
40
41
|
"langchain/document_loaders/fs/text",
|
|
@@ -162,7 +162,7 @@ class MultiQueryRetriever extends retrievers_1.BaseRetriever {
|
|
|
162
162
|
const uniqueDocuments = this._uniqueUnion(documents);
|
|
163
163
|
let outputDocs = uniqueDocuments;
|
|
164
164
|
if (this.documentCompressor && uniqueDocuments.length) {
|
|
165
|
-
outputDocs = await this.documentCompressor.compressDocuments(uniqueDocuments, question);
|
|
165
|
+
outputDocs = await this.documentCompressor.compressDocuments(uniqueDocuments, question, runManager?.getChild());
|
|
166
166
|
if (this.documentCompressorFilteringFn) {
|
|
167
167
|
outputDocs = this.documentCompressorFilteringFn(outputDocs);
|
|
168
168
|
}
|
|
@@ -159,7 +159,7 @@ export class MultiQueryRetriever extends BaseRetriever {
|
|
|
159
159
|
const uniqueDocuments = this._uniqueUnion(documents);
|
|
160
160
|
let outputDocs = uniqueDocuments;
|
|
161
161
|
if (this.documentCompressor && uniqueDocuments.length) {
|
|
162
|
-
outputDocs = await this.documentCompressor.compressDocuments(uniqueDocuments, question);
|
|
162
|
+
outputDocs = await this.documentCompressor.compressDocuments(uniqueDocuments, question, runManager?.getChild());
|
|
163
163
|
if (this.documentCompressorFilteringFn) {
|
|
164
164
|
outputDocs = this.documentCompressorFilteringFn(outputDocs);
|
|
165
165
|
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
module.exports = require('../../dist/document_loaders/fs/multi_file.cjs');
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from '../../dist/document_loaders/fs/multi_file.js'
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from '../../dist/document_loaders/fs/multi_file.js'
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from '../../dist/document_loaders/fs/multi_file.js'
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "langchain",
|
|
3
|
-
"version": "0.2.6",
|
|
3
|
+
"version": "0.2.7",
|
|
4
4
|
"description": "Typescript bindings for langchain",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"engines": {
|
|
@@ -286,6 +286,10 @@
|
|
|
286
286
|
"document_loaders/fs/directory.js",
|
|
287
287
|
"document_loaders/fs/directory.d.ts",
|
|
288
288
|
"document_loaders/fs/directory.d.cts",
|
|
289
|
+
"document_loaders/fs/multi_file.cjs",
|
|
290
|
+
"document_loaders/fs/multi_file.js",
|
|
291
|
+
"document_loaders/fs/multi_file.d.ts",
|
|
292
|
+
"document_loaders/fs/multi_file.d.cts",
|
|
289
293
|
"document_loaders/fs/buffer.cjs",
|
|
290
294
|
"document_loaders/fs/buffer.js",
|
|
291
295
|
"document_loaders/fs/buffer.d.ts",
|
|
@@ -1540,6 +1544,15 @@
|
|
|
1540
1544
|
"import": "./document_loaders/fs/directory.js",
|
|
1541
1545
|
"require": "./document_loaders/fs/directory.cjs"
|
|
1542
1546
|
},
|
|
1547
|
+
"./document_loaders/fs/multi_file": {
|
|
1548
|
+
"types": {
|
|
1549
|
+
"import": "./document_loaders/fs/multi_file.d.ts",
|
|
1550
|
+
"require": "./document_loaders/fs/multi_file.d.cts",
|
|
1551
|
+
"default": "./document_loaders/fs/multi_file.d.ts"
|
|
1552
|
+
},
|
|
1553
|
+
"import": "./document_loaders/fs/multi_file.js",
|
|
1554
|
+
"require": "./document_loaders/fs/multi_file.cjs"
|
|
1555
|
+
},
|
|
1543
1556
|
"./document_loaders/fs/buffer": {
|
|
1544
1557
|
"types": {
|
|
1545
1558
|
"import": "./document_loaders/fs/buffer.d.ts",
|