langchain 0.1.26 → 0.1.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/document_loaders/web/gitbook.cjs +11 -3
- package/dist/document_loaders/web/gitbook.d.ts +1 -0
- package/dist/document_loaders/web/gitbook.js +11 -3
- package/dist/retrievers/parent_document.cjs +3 -2
- package/dist/retrievers/parent_document.d.ts +3 -1
- package/dist/retrievers/parent_document.js +3 -2
- package/package.json +1 -1
|
@@ -23,6 +23,13 @@ class GitbookLoader extends cheerio_js_1.CheerioWebBaseLoader {
|
|
|
23
23
|
writable: true,
|
|
24
24
|
value: false
|
|
25
25
|
});
|
|
26
|
+
Object.defineProperty(this, "baseUrl", {
|
|
27
|
+
enumerable: true,
|
|
28
|
+
configurable: true,
|
|
29
|
+
writable: true,
|
|
30
|
+
value: void 0
|
|
31
|
+
});
|
|
32
|
+
this.baseUrl = webPath;
|
|
26
33
|
this.webPath = path;
|
|
27
34
|
this.shouldLoadAllPaths =
|
|
28
35
|
params.shouldLoadAllPaths ?? this.shouldLoadAllPaths;
|
|
@@ -84,9 +91,10 @@ class GitbookLoader extends cheerio_js_1.CheerioWebBaseLoader {
|
|
|
84
91
|
.map((element) => $(element).text());
|
|
85
92
|
const documents = [];
|
|
86
93
|
for (const url of urls) {
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
94
|
+
const buildUrl = url.includes(this.baseUrl) ? url : this.baseUrl + url;
|
|
95
|
+
console.log(`Fetching text from ${buildUrl}`);
|
|
96
|
+
const html = await GitbookLoader._scrape(buildUrl, this.caller, this.timeout);
|
|
97
|
+
documents.push(...this.loadPath(html, buildUrl));
|
|
90
98
|
}
|
|
91
99
|
console.log(`Fetched ${documents.length} documents.`);
|
|
92
100
|
return documents;
|
|
@@ -15,6 +15,7 @@ interface GitbookLoaderParams {
|
|
|
15
15
|
export declare class GitbookLoader extends CheerioWebBaseLoader {
|
|
16
16
|
webPath: string;
|
|
17
17
|
shouldLoadAllPaths: boolean;
|
|
18
|
+
private readonly baseUrl;
|
|
18
19
|
constructor(webPath: string, params?: GitbookLoaderParams);
|
|
19
20
|
/**
|
|
20
21
|
* Method that scrapes the web document using Cheerio and loads the
|
|
@@ -20,6 +20,13 @@ export class GitbookLoader extends CheerioWebBaseLoader {
|
|
|
20
20
|
writable: true,
|
|
21
21
|
value: false
|
|
22
22
|
});
|
|
23
|
+
Object.defineProperty(this, "baseUrl", {
|
|
24
|
+
enumerable: true,
|
|
25
|
+
configurable: true,
|
|
26
|
+
writable: true,
|
|
27
|
+
value: void 0
|
|
28
|
+
});
|
|
29
|
+
this.baseUrl = webPath;
|
|
23
30
|
this.webPath = path;
|
|
24
31
|
this.shouldLoadAllPaths =
|
|
25
32
|
params.shouldLoadAllPaths ?? this.shouldLoadAllPaths;
|
|
@@ -81,9 +88,10 @@ export class GitbookLoader extends CheerioWebBaseLoader {
|
|
|
81
88
|
.map((element) => $(element).text());
|
|
82
89
|
const documents = [];
|
|
83
90
|
for (const url of urls) {
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
91
|
+
const buildUrl = url.includes(this.baseUrl) ? url : this.baseUrl + url;
|
|
92
|
+
console.log(`Fetching text from ${buildUrl}`);
|
|
93
|
+
const html = await GitbookLoader._scrape(buildUrl, this.caller, this.timeout);
|
|
94
|
+
documents.push(...this.loadPath(html, buildUrl));
|
|
87
95
|
}
|
|
88
96
|
console.log(`Fetched ${documents.length} documents.`);
|
|
89
97
|
return documents;
|
|
@@ -153,9 +153,10 @@ class ParentDocumentRetriever extends multi_vector_js_1.MultiVectorRetriever {
|
|
|
153
153
|
* This can be false if and only if `ids` are provided. You may want
|
|
154
154
|
* to set this to False if the documents are already in the docstore
|
|
155
155
|
* and you don't want to re-add them.
|
|
156
|
+
* @param config.childDocChunkHeaderOptions Object with options for adding contextual chunk headers to the child documents
|
|
156
157
|
*/
|
|
157
158
|
async addDocuments(docs, config) {
|
|
158
|
-
const { ids, addToDocstore = true } = config ?? {};
|
|
159
|
+
const { ids, addToDocstore = true, childDocChunkHeaderOptions = {}, } = config ?? {};
|
|
159
160
|
const parentDocs = this.parentSplitter
|
|
160
161
|
? await this.parentSplitter.splitDocuments(docs)
|
|
161
162
|
: docs;
|
|
@@ -177,7 +178,7 @@ class ParentDocumentRetriever extends multi_vector_js_1.MultiVectorRetriever {
|
|
|
177
178
|
for (let i = 0; i < parentDocs.length; i += 1) {
|
|
178
179
|
const parentDoc = parentDocs[i];
|
|
179
180
|
const parentDocId = parentDocIds[i];
|
|
180
|
-
const subDocs = await this.childSplitter.splitDocuments([parentDoc]);
|
|
181
|
+
const subDocs = await this.childSplitter.splitDocuments([parentDoc], childDocChunkHeaderOptions);
|
|
181
182
|
const taggedSubDocs = subDocs.map((subDoc) => new documents_1.Document({
|
|
182
183
|
pageContent: subDoc.pageContent,
|
|
183
184
|
metadata: { ...subDoc.metadata, [this.idKey]: parentDocId },
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { type VectorStoreInterface, type VectorStoreRetrieverInterface } from "@langchain/core/vectorstores";
|
|
2
2
|
import { Document } from "@langchain/core/documents";
|
|
3
|
-
import { TextSplitter } from "../text_splitter.js";
|
|
3
|
+
import { TextSplitter, TextSplitterChunkHeaderOptions } from "../text_splitter.js";
|
|
4
4
|
import { MultiVectorRetriever, type MultiVectorRetrieverInput } from "./multi_vector.js";
|
|
5
5
|
/**
|
|
6
6
|
* Interface for the fields required to initialize a
|
|
@@ -69,9 +69,11 @@ export declare class ParentDocumentRetriever extends MultiVectorRetriever {
|
|
|
69
69
|
* This can be false if and only if `ids` are provided. You may want
|
|
70
70
|
* to set this to False if the documents are already in the docstore
|
|
71
71
|
* and you don't want to re-add them.
|
|
72
|
+
* @param config.childDocChunkHeaderOptions Object with options for adding contextual chunk headers to the child documents
|
|
72
73
|
*/
|
|
73
74
|
addDocuments(docs: Document[], config?: {
|
|
74
75
|
ids?: string[];
|
|
75
76
|
addToDocstore?: boolean;
|
|
77
|
+
childDocChunkHeaderOptions?: TextSplitterChunkHeaderOptions;
|
|
76
78
|
}): Promise<void>;
|
|
77
79
|
}
|
|
@@ -127,9 +127,10 @@ export class ParentDocumentRetriever extends MultiVectorRetriever {
|
|
|
127
127
|
* This can be false if and only if `ids` are provided. You may want
|
|
128
128
|
* to set this to False if the documents are already in the docstore
|
|
129
129
|
* and you don't want to re-add them.
|
|
130
|
+
* @param config.childDocChunkHeaderOptions Object with options for adding contextual chunk headers to the child documents
|
|
130
131
|
*/
|
|
131
132
|
async addDocuments(docs, config) {
|
|
132
|
-
const { ids, addToDocstore = true } = config ?? {};
|
|
133
|
+
const { ids, addToDocstore = true, childDocChunkHeaderOptions = {}, } = config ?? {};
|
|
133
134
|
const parentDocs = this.parentSplitter
|
|
134
135
|
? await this.parentSplitter.splitDocuments(docs)
|
|
135
136
|
: docs;
|
|
@@ -151,7 +152,7 @@ export class ParentDocumentRetriever extends MultiVectorRetriever {
|
|
|
151
152
|
for (let i = 0; i < parentDocs.length; i += 1) {
|
|
152
153
|
const parentDoc = parentDocs[i];
|
|
153
154
|
const parentDocId = parentDocIds[i];
|
|
154
|
-
const subDocs = await this.childSplitter.splitDocuments([parentDoc]);
|
|
155
|
+
const subDocs = await this.childSplitter.splitDocuments([parentDoc], childDocChunkHeaderOptions);
|
|
155
156
|
const taggedSubDocs = subDocs.map((subDoc) => new Document({
|
|
156
157
|
pageContent: subDoc.pageContent,
|
|
157
158
|
metadata: { ...subDoc.metadata, [this.idKey]: parentDocId },
|