langchain 0.1.16 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chains/graph_qa/prompts.cjs +7 -0
- package/dist/chains/graph_qa/prompts.js +7 -0
- package/dist/chains/retrieval.d.ts +2 -2
- package/dist/document_loaders/web/cheerio.cjs +11 -2
- package/dist/document_loaders/web/cheerio.d.ts +9 -2
- package/dist/document_loaders/web/cheerio.js +11 -2
- package/dist/document_loaders/web/sitemap.cjs +110 -0
- package/dist/document_loaders/web/sitemap.d.ts +36 -0
- package/dist/document_loaders/web/sitemap.js +106 -0
- package/dist/experimental/openai_assistant/index.cjs +2 -0
- package/dist/experimental/openai_assistant/index.js +2 -0
- package/dist/indexes/index.cjs +6 -1
- package/dist/indexes/index.d.ts +1 -1
- package/dist/indexes/index.js +1 -1
- package/dist/indexes/indexing.cjs +14 -9
- package/dist/indexes/indexing.d.ts +39 -0
- package/dist/indexes/indexing.js +8 -8
- package/dist/load/import_constants.cjs +1 -0
- package/dist/load/import_constants.js +1 -0
- package/dist/text_splitter.cjs +4 -1
- package/dist/text_splitter.js +4 -1
- package/document_loaders/web/sitemap.cjs +1 -0
- package/document_loaders/web/sitemap.d.cts +1 -0
- package/document_loaders/web/sitemap.d.ts +1 -0
- package/document_loaders/web/sitemap.js +1 -0
- package/package.json +18 -5
package/dist/chains/graph_qa/prompts.cjs
CHANGED
@@ -22,6 +22,13 @@ const CYPHER_QA_TEMPLATE = `You are an assistant that helps to form nice and hum
 The information part contains the provided information that you must use to construct an answer.
 The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
 Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
+Here is an example:
+
+Question: Which managers own Neo4j stocks?
+Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
+Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.
+
+Follow this example when generating answers.
 If the provided information is empty, say that you don't know the answer.
 Information:
 {context}
package/dist/chains/graph_qa/prompts.js
CHANGED
@@ -19,6 +19,13 @@ const CYPHER_QA_TEMPLATE = `You are an assistant that helps to form nice and hum
 The information part contains the provided information that you must use to construct an answer.
 The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
 Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
+Here is an example:
+
+Question: Which managers own Neo4j stocks?
+Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
+Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.
+
+Follow this example when generating answers.
 If the provided information is empty, say that you don't know the answer.
 Information:
 {context}
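For context, the two hunks above add a one-shot example to the CYPHER_QA_TEMPLATE used by the graph QA chain's answer step. A minimal sketch of the same pattern using @langchain/core's PromptTemplate (the template text is abbreviated to the added lines):

import { PromptTemplate } from "@langchain/core/prompts";

// Abbreviated sketch: a single worked example anchors the answer format
// before the real retrieved context is injected into {context}.
const qaPrompt = PromptTemplate.fromTemplate(`Here is an example:

Question: Which managers own Neo4j stocks?
Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.

Follow this example when generating answers.
If the provided information is empty, say that you don't know the answer.
Information:
{context}`);

const rendered = await qaPrompt.format({ context: "[manager:ACME CAPITAL]" });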
package/dist/chains/retrieval.d.ts
CHANGED
@@ -1,7 +1,7 @@
 import type { BaseRetrieverInterface } from "@langchain/core/retrievers";
 import { type Runnable, type RunnableInterface } from "@langchain/core/runnables";
 import type { BaseMessage } from "@langchain/core/messages";
-import type { DocumentInterface } from "@langchain/core/documents";
+import type { DocumentInterface, Document } from "@langchain/core/documents";
 /**
  * Parameters for the createRetrievalChain method.
  */
@@ -58,7 +58,7 @@ export declare function createRetrievalChain<RunOutput>({ retriever, combineDocs
 } & {
     [key: string]: unknown;
 }, {
-    context:
+    context: Document[];
     answer: RunOutput;
 } & {
     [key: string]: unknown;
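The practical effect of the typing fix above is that the chain returned by createRetrievalChain now declares its context output as Document[]. A minimal sketch, using placeholder declarations so it type-checks (real code supplies a concrete retriever and combine-docs chain):

import { createRetrievalChain } from "langchain/chains/retrieval";
import type { BaseRetrieverInterface } from "@langchain/core/retrievers";

// Placeholders for the sketch; in practice these come from a vector store
// retriever and a documents-combining chain.
declare const retriever: BaseRetrieverInterface;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
declare const combineDocsChain: any;

const retrievalChain = await createRetrievalChain({ retriever, combineDocsChain });
const result = await retrievalChain.invoke({ input: "..." });
// result.context is now typed as Document[] (see the hunk above).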
package/dist/document_loaders/web/cheerio.cjs
CHANGED
@@ -54,14 +54,23 @@ class CheerioWebBaseLoader extends base_js_1.BaseDocumentLoader {
         this.selector = selector ?? "body";
         this.textDecoder = textDecoder;
     }
-
+    /**
+     * Fetches web documents from the given array of URLs and loads them using Cheerio.
+     * It returns an array of CheerioAPI instances.
+     * @param urls An array of URLs to fetch and load.
+     * @returns A Promise that resolves to an array of CheerioAPI instances.
+     */
+    static async scrapeAll(urls, caller, timeout, textDecoder, options) {
+        return Promise.all(urls.map((url) => CheerioWebBaseLoader._scrape(url, caller, timeout, textDecoder, options)));
+    }
+    static async _scrape(url, caller, timeout, textDecoder, options) {
         const { load } = await CheerioWebBaseLoader.imports();
         const response = await caller.call(fetch, url, {
             signal: timeout ? AbortSignal.timeout(timeout) : undefined,
         });
         const html = textDecoder?.decode(await response.arrayBuffer()) ??
             (await response.text());
-        return load(html);
+        return load(html, options);
     }
     /**
      * Fetches the web document from the webPath and loads it using Cheerio.
package/dist/document_loaders/web/cheerio.d.ts
CHANGED
@@ -1,4 +1,4 @@
-import type { CheerioAPI, load as LoadT, SelectorType } from "cheerio";
+import type { CheerioAPI, CheerioOptions, load as LoadT, SelectorType } from "cheerio";
 import { Document } from "@langchain/core/documents";
 import { AsyncCaller, AsyncCallerParams } from "@langchain/core/utils/async_caller";
 import { BaseDocumentLoader } from "../base.js";
@@ -41,7 +41,14 @@ export declare class CheerioWebBaseLoader extends BaseDocumentLoader implements
     selector?: SelectorType;
     textDecoder?: TextDecoder;
     constructor(webPath: string, fields?: WebBaseLoaderParams);
-
+    /**
+     * Fetches web documents from the given array of URLs and loads them using Cheerio.
+     * It returns an array of CheerioAPI instances.
+     * @param urls An array of URLs to fetch and load.
+     * @returns A Promise that resolves to an array of CheerioAPI instances.
+     */
+    static scrapeAll(urls: string[], caller: AsyncCaller, timeout: number | undefined, textDecoder?: TextDecoder, options?: CheerioOptions): Promise<CheerioAPI[]>;
+    static _scrape(url: string, caller: AsyncCaller, timeout: number | undefined, textDecoder?: TextDecoder, options?: CheerioOptions): Promise<CheerioAPI>;
     /**
      * Fetches the web document from the webPath and loads it using Cheerio.
      * It returns a CheerioAPI instance.
package/dist/document_loaders/web/cheerio.js
CHANGED
@@ -51,14 +51,23 @@ export class CheerioWebBaseLoader extends BaseDocumentLoader {
         this.selector = selector ?? "body";
         this.textDecoder = textDecoder;
     }
-
+    /**
+     * Fetches web documents from the given array of URLs and loads them using Cheerio.
+     * It returns an array of CheerioAPI instances.
+     * @param urls An array of URLs to fetch and load.
+     * @returns A Promise that resolves to an array of CheerioAPI instances.
+     */
+    static async scrapeAll(urls, caller, timeout, textDecoder, options) {
+        return Promise.all(urls.map((url) => CheerioWebBaseLoader._scrape(url, caller, timeout, textDecoder, options)));
+    }
+    static async _scrape(url, caller, timeout, textDecoder, options) {
         const { load } = await CheerioWebBaseLoader.imports();
         const response = await caller.call(fetch, url, {
             signal: timeout ? AbortSignal.timeout(timeout) : undefined,
         });
         const html = textDecoder?.decode(await response.arrayBuffer()) ??
             (await response.text());
-        return load(html);
+        return load(html, options);
     }
     /**
      * Fetches the web document from the webPath and loads it using Cheerio.
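The new static scrapeAll fetches several URLs concurrently and is what the SitemapLoader below builds on. A minimal usage sketch (the URLs are placeholders, and the AsyncCaller is left at its defaults):

import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";
import { AsyncCaller } from "@langchain/core/utils/async_caller";

const caller = new AsyncCaller({});
// Fetch both pages in parallel; each result is a CheerioAPI instance.
const pages = await CheerioWebBaseLoader.scrapeAll(
  ["https://example.com/a", "https://example.com/b"],
  caller,
  10000 // per-request timeout in ms; pass undefined for no timeout
);
const titles = pages.map(($) => $("title").text());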
package/dist/document_loaders/web/sitemap.cjs
ADDED
@@ -0,0 +1,110 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.SitemapLoader = void 0;
+const documents_1 = require("@langchain/core/documents");
+const chunk_array_1 = require("@langchain/core/utils/chunk_array");
+const cheerio_js_1 = require("./cheerio.cjs");
+const DEFAULT_CHUNK_SIZE = 300;
+class SitemapLoader extends cheerio_js_1.CheerioWebBaseLoader {
+    constructor(webPath, params = {}) {
+        const paramsWithDefaults = { chunkSize: DEFAULT_CHUNK_SIZE, ...params };
+        let path = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
+        // Allow for custom sitemap paths to be passed in with the url.
+        path = path.endsWith(".xml") ? path : `${path}/sitemap.xml`;
+        super(path, paramsWithDefaults);
+        Object.defineProperty(this, "webPath", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: webPath
+        });
+        Object.defineProperty(this, "allowUrlPatterns", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "chunkSize", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.webPath = path;
+        this.allowUrlPatterns = paramsWithDefaults.filterUrls;
+        this.chunkSize = paramsWithDefaults.chunkSize;
+    }
+    _checkUrlPatterns(url) {
+        if (!this.allowUrlPatterns) {
+            return false;
+        }
+        return !this.allowUrlPatterns.some((pattern) => !new RegExp(pattern).test(url));
+    }
+    async parseSitemap() {
+        const $ = await cheerio_js_1.CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, {
+            xmlMode: true,
+            xml: true,
+        });
+        const elements = [];
+        $("url").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            if (this._checkUrlPatterns(loc)) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        $("sitemap").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        return elements;
+    }
+    async _loadSitemapUrls(elements) {
+        const all = await cheerio_js_1.CheerioWebBaseLoader.scrapeAll(elements.map((ele) => ele.loc), this.caller, this.timeout, this.textDecoder);
+        const documents = all.map(($, i) => {
+            if (!elements[i]) {
+                throw new Error("Scraped docs and elements not in sync");
+            }
+            const text = $(this.selector).text();
+            const { loc: source, ...metadata } = elements[i];
+            // extract page metadata
+            const description = $("meta[name='description']").attr("content");
+            const title = $("meta[property='og:title']").attr("content");
+            const lang = $("meta[property='og:locale']").attr("content");
+            return new documents_1.Document({
+                pageContent: text,
+                metadata: {
+                    ...metadata,
+                    description,
+                    title,
+                    lang,
+                    source: source.trim(),
+                },
+            });
+        });
+        return documents;
+    }
+    async load() {
+        const elements = await this.parseSitemap();
+        const chunks = (0, chunk_array_1.chunkArray)(elements, this.chunkSize);
+        const documents = [];
+        for await (const chunk of chunks) {
+            const chunkedDocuments = await this._loadSitemapUrls(chunk);
+            documents.push(...chunkedDocuments);
+        }
+        return documents;
+    }
+}
+exports.SitemapLoader = SitemapLoader;
package/dist/document_loaders/web/sitemap.d.ts
ADDED
@@ -0,0 +1,36 @@
+import { Document, DocumentInterface } from "@langchain/core/documents";
+import { CheerioWebBaseLoader, WebBaseLoaderParams } from "./cheerio.js";
+/**
+ * Interface representing the parameters for initializing a SitemapLoader.
+ * @interface SitemapLoaderParams
+ * @extends WebBaseLoaderParams
+ */
+export interface SitemapLoaderParams extends WebBaseLoaderParams {
+    /**
+     * @property {(string | RegExp)[] | undefined} filterUrls - A list of regexes. Only URLs that match one of the filter URLs will be loaded.
+     * WARNING: The filter URLs are interpreted as regular expressions. Escape special characters if needed.
+     */
+    filterUrls?: (string | RegExp)[];
+    /**
+     * The size to chunk the sitemap URLs into for scraping.
+     * @default {300}
+     */
+    chunkSize?: number;
+}
+type SiteMapElement = {
+    loc: string;
+    changefreq?: string;
+    lastmod?: string;
+    priority?: string;
+};
+export declare class SitemapLoader extends CheerioWebBaseLoader implements SitemapLoaderParams {
+    webPath: string;
+    allowUrlPatterns: (string | RegExp)[] | undefined;
+    chunkSize: number;
+    constructor(webPath: string, params?: SitemapLoaderParams);
+    _checkUrlPatterns(url: string): boolean;
+    parseSitemap(): Promise<SiteMapElement[]>;
+    _loadSitemapUrls(elements: Array<SiteMapElement>): Promise<DocumentInterface[]>;
+    load(): Promise<Document[]>;
+}
+export {};
package/dist/document_loaders/web/sitemap.js
ADDED
@@ -0,0 +1,106 @@
+import { Document } from "@langchain/core/documents";
+import { chunkArray } from "@langchain/core/utils/chunk_array";
+import { CheerioWebBaseLoader } from "./cheerio.js";
+const DEFAULT_CHUNK_SIZE = 300;
+export class SitemapLoader extends CheerioWebBaseLoader {
+    constructor(webPath, params = {}) {
+        const paramsWithDefaults = { chunkSize: DEFAULT_CHUNK_SIZE, ...params };
+        let path = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
+        // Allow for custom sitemap paths to be passed in with the url.
+        path = path.endsWith(".xml") ? path : `${path}/sitemap.xml`;
+        super(path, paramsWithDefaults);
+        Object.defineProperty(this, "webPath", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: webPath
+        });
+        Object.defineProperty(this, "allowUrlPatterns", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "chunkSize", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.webPath = path;
+        this.allowUrlPatterns = paramsWithDefaults.filterUrls;
+        this.chunkSize = paramsWithDefaults.chunkSize;
+    }
+    _checkUrlPatterns(url) {
+        if (!this.allowUrlPatterns) {
+            return false;
+        }
+        return !this.allowUrlPatterns.some((pattern) => !new RegExp(pattern).test(url));
+    }
+    async parseSitemap() {
+        const $ = await CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, {
+            xmlMode: true,
+            xml: true,
+        });
+        const elements = [];
+        $("url").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            if (this._checkUrlPatterns(loc)) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        $("sitemap").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        return elements;
+    }
+    async _loadSitemapUrls(elements) {
+        const all = await CheerioWebBaseLoader.scrapeAll(elements.map((ele) => ele.loc), this.caller, this.timeout, this.textDecoder);
+        const documents = all.map(($, i) => {
+            if (!elements[i]) {
+                throw new Error("Scraped docs and elements not in sync");
+            }
+            const text = $(this.selector).text();
+            const { loc: source, ...metadata } = elements[i];
+            // extract page metadata
+            const description = $("meta[name='description']").attr("content");
+            const title = $("meta[property='og:title']").attr("content");
+            const lang = $("meta[property='og:locale']").attr("content");
+            return new Document({
+                pageContent: text,
+                metadata: {
+                    ...metadata,
+                    description,
+                    title,
+                    lang,
+                    source: source.trim(),
+                },
+            });
+        });
+        return documents;
+    }
+    async load() {
+        const elements = await this.parseSitemap();
+        const chunks = chunkArray(elements, this.chunkSize);
+        const documents = [];
+        for await (const chunk of chunks) {
+            const chunkedDocuments = await this._loadSitemapUrls(chunk);
+            documents.push(...chunkedDocuments);
+        }
+        return documents;
+    }
+}
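Taken together, the three new dist files implement the SitemapLoader added in this release. A minimal usage sketch (the site URL and filter pattern are illustrative):

import { SitemapLoader } from "langchain/document_loaders/web/sitemap";

const loader = new SitemapLoader("https://www.example.com/", {
  // Only sitemap entries matching one of these regexes are loaded.
  filterUrls: ["https://www.example.com/docs/.*"],
  // Entries are scraped in batches of this size (default 300).
  chunkSize: 100,
});

// Each sitemap entry becomes a Document whose metadata carries
// changefreq, lastmod, priority, and the page's own meta tags.
const docs = await loader.load();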
package/dist/indexes/index.cjs
CHANGED
@@ -1,5 +1,10 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.index = void 0;
+exports._HashedDocument = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports.index = void 0;
 var indexing_js_1 = require("./indexing.cjs");
 Object.defineProperty(exports, "index", { enumerable: true, get: function () { return indexing_js_1.index; } });
+Object.defineProperty(exports, "_batch", { enumerable: true, get: function () { return indexing_js_1._batch; } });
+Object.defineProperty(exports, "_deduplicateInOrder", { enumerable: true, get: function () { return indexing_js_1._deduplicateInOrder; } });
+Object.defineProperty(exports, "_getSourceIdAssigner", { enumerable: true, get: function () { return indexing_js_1._getSourceIdAssigner; } });
+Object.defineProperty(exports, "_isBaseDocumentLoader", { enumerable: true, get: function () { return indexing_js_1._isBaseDocumentLoader; } });
+Object.defineProperty(exports, "_HashedDocument", { enumerable: true, get: function () { return indexing_js_1._HashedDocument; } });
package/dist/indexes/index.d.ts
CHANGED
@@ -1 +1 @@
-export { type CleanupMode, type IndexOptions, index } from "./indexing.js";
+export { type CleanupMode, type IndexOptions, index, _batch, _deduplicateInOrder, _getSourceIdAssigner, _isBaseDocumentLoader, _HashedDocument, } from "./indexing.js";
package/dist/indexes/index.js
CHANGED
@@ -1 +1 @@
-export { index } from "./indexing.js";
+export { index, _batch, _deduplicateInOrder, _getSourceIdAssigner, _isBaseDocumentLoader, _HashedDocument, } from "./indexing.js";
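These previously internal indexing helpers are now re-exported, with underscore prefixes signalling they remain semi-private. A short sketch of the two most self-contained ones:

import { _batch, _deduplicateInOrder, _HashedDocument } from "langchain/indexes";
import { Document } from "@langchain/core/documents";

const docs = [
  new Document({ pageContent: "alpha" }),
  new Document({ pageContent: "alpha" }), // exact duplicate
  new Document({ pageContent: "beta" }),
];

// Hash each document, then drop later duplicates while preserving order.
const deduped = _deduplicateInOrder(
  docs.map((doc) => _HashedDocument.fromDocument(doc))
);

// Split the remaining documents into batches of at most two.
const batches = _batch(2, deduped);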
package/dist/indexes/indexing.cjs
CHANGED
@@ -1,6 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.index = void 0;
+exports.index = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports._HashedDocument = void 0;
 const uuid_1 = require("uuid");
 const base_1 = require("@langchain/community/indexes/base");
 const hash_1 = require("@langchain/core/utils/hash");
@@ -10,7 +10,7 @@ const document_js_1 = require("../document.cjs");
  * Hashes are calculated based on page content and metadata.
  * It is used for indexing.
  */
-class
+class _HashedDocument {
     constructor(fields) {
         Object.defineProperty(this, "uid", {
             enumerable: true,
@@ -98,7 +98,8 @@ class HashedDocument {
         return (0, uuid_1.v5)(hash_value, base_1.UUIDV5_NAMESPACE);
     }
 }
-
+exports._HashedDocument = _HashedDocument;
+function _batch(size, iterable) {
     const batches = [];
     let currentBatch = [];
     iterable.forEach((item) => {
@@ -113,7 +114,8 @@ function batch(size, iterable) {
     }
     return batches;
 }
-
+exports._batch = _batch;
+function _deduplicateInOrder(hashedDocuments) {
     const seen = new Set();
     const deduplicated = [];
     for (const hashedDoc of hashedDocuments) {
@@ -127,7 +129,8 @@ function deduplicateInOrder(hashedDocuments) {
     }
     return deduplicated;
 }
-
+exports._deduplicateInOrder = _deduplicateInOrder;
+function _getSourceIdAssigner(sourceIdKey) {
     if (sourceIdKey === null) {
         return (_doc) => null;
     }
@@ -141,6 +144,7 @@ function getSourceIdAssigner(sourceIdKey) {
         throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`);
     }
 }
+exports._getSourceIdAssigner = _getSourceIdAssigner;
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 const _isBaseDocumentLoader = (arg) => {
     if ("load" in arg &&
@@ -151,6 +155,7 @@ const _isBaseDocumentLoader = (arg) => {
     }
     return false;
 };
+exports._isBaseDocumentLoader = _isBaseDocumentLoader;
 /**
  * Index data from the doc source into the vector store.
  *
@@ -176,18 +181,18 @@ async function index(args) {
     if (cleanup === "incremental" && !sourceIdKey) {
         throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.");
     }
-    const docs = _isBaseDocumentLoader(docsSource)
+    const docs = (0, exports._isBaseDocumentLoader)(docsSource)
         ? await docsSource.load()
        : docsSource;
-    const sourceIdAssigner =
+    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
     const indexStartDt = await recordManager.getTime();
     let numAdded = 0;
     let numDeleted = 0;
     let numUpdated = 0;
     let numSkipped = 0;
-    const batches =
+    const batches = _batch(batchSize ?? 100, docs);
     for (const batch of batches) {
-        const hashedDocs =
+        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
         const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
         if (cleanup === "incremental") {
             hashedDocs.forEach((_hashedDoc, index) => {
package/dist/indexes/indexing.d.ts
CHANGED
@@ -2,6 +2,7 @@ import { VectorStore } from "@langchain/core/vectorstores";
 import { RecordManagerInterface } from "@langchain/community/indexes/base";
 import { DocumentInterface } from "@langchain/core/documents";
 import { BaseDocumentLoader } from "../document_loaders/base.js";
+type Metadata = Record<string, unknown>;
 type IndexingResult = {
     numAdded: number;
     numDeleted: number;
@@ -9,6 +10,40 @@ type IndexingResult = {
     numSkipped: number;
 };
 type StringOrDocFunc = string | ((doc: DocumentInterface) => string);
+export interface HashedDocumentInterface extends DocumentInterface {
+    uid: string;
+    hash_?: string;
+    contentHash?: string;
+    metadataHash?: string;
+    pageContent: string;
+    metadata: Metadata;
+    calculateHashes(): void;
+    toDocument(): DocumentInterface;
+}
+interface HashedDocumentArgs {
+    pageContent: string;
+    metadata: Metadata;
+    uid: string;
+}
+/**
+ * HashedDocument is a Document with hashes calculated.
+ * Hashes are calculated based on page content and metadata.
+ * It is used for indexing.
+ */
+export declare class _HashedDocument implements HashedDocumentInterface {
+    uid: string;
+    hash_?: string;
+    contentHash?: string;
+    metadataHash?: string;
+    pageContent: string;
+    metadata: Metadata;
+    constructor(fields: HashedDocumentArgs);
+    calculateHashes(): void;
+    toDocument(): DocumentInterface;
+    static fromDocument(document: DocumentInterface, uid?: string): _HashedDocument;
+    private _hashStringToUUID;
+    private _hashNestedDictToUUID;
+}
 export type CleanupMode = "full" | "incremental";
 export type IndexOptions = {
     /**
@@ -46,6 +81,10 @@ export type IndexOptions = {
      */
     forceUpdate?: boolean;
 };
+export declare function _batch<T>(size: number, iterable: T[]): T[][];
+export declare function _deduplicateInOrder(hashedDocuments: HashedDocumentInterface[]): HashedDocumentInterface[];
+export declare function _getSourceIdAssigner(sourceIdKey: StringOrDocFunc | null): (doc: DocumentInterface) => string | null;
+export declare const _isBaseDocumentLoader: (arg: any) => arg is BaseDocumentLoader;
 interface IndexArgs {
     docsSource: BaseDocumentLoader | DocumentInterface[];
     recordManager: RecordManagerInterface;
package/dist/indexes/indexing.js
CHANGED
@@ -7,7 +7,7 @@ import { Document } from "../document.js";
  * Hashes are calculated based on page content and metadata.
  * It is used for indexing.
  */
-class
+export class _HashedDocument {
     constructor(fields) {
         Object.defineProperty(this, "uid", {
             enumerable: true,
@@ -95,7 +95,7 @@ class HashedDocument {
         return uuidv5(hash_value, UUIDV5_NAMESPACE);
     }
 }
-function
+export function _batch(size, iterable) {
     const batches = [];
     let currentBatch = [];
     iterable.forEach((item) => {
@@ -110,7 +110,7 @@ function batch(size, iterable) {
     }
     return batches;
 }
-function
+export function _deduplicateInOrder(hashedDocuments) {
     const seen = new Set();
     const deduplicated = [];
     for (const hashedDoc of hashedDocuments) {
@@ -124,7 +124,7 @@ function deduplicateInOrder(hashedDocuments) {
     }
     return deduplicated;
 }
-function
+export function _getSourceIdAssigner(sourceIdKey) {
     if (sourceIdKey === null) {
         return (_doc) => null;
     }
@@ -139,7 +139,7 @@ function getSourceIdAssigner(sourceIdKey) {
     }
 }
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
-const _isBaseDocumentLoader = (arg) => {
+export const _isBaseDocumentLoader = (arg) => {
     if ("load" in arg &&
         typeof arg.load === "function" &&
         "loadAndSplit" in arg &&
@@ -176,15 +176,15 @@ export async function index(args) {
     const docs = _isBaseDocumentLoader(docsSource)
         ? await docsSource.load()
         : docsSource;
-    const sourceIdAssigner =
+    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
     const indexStartDt = await recordManager.getTime();
     let numAdded = 0;
     let numDeleted = 0;
     let numUpdated = 0;
     let numSkipped = 0;
-    const batches =
+    const batches = _batch(batchSize ?? 100, docs);
     for (const batch of batches) {
-        const hashedDocs =
+        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
         const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
         if (cleanup === "incremental") {
             hashedDocs.forEach((_hashedDoc, index) => {
package/dist/load/import_constants.cjs
CHANGED
@@ -99,6 +99,7 @@ exports.optionalImportEntrypoints = [
     "langchain/document_loaders/web/pdf",
     "langchain/document_loaders/web/recursive_url",
     "langchain/document_loaders/web/s3",
+    "langchain/document_loaders/web/sitemap",
     "langchain/document_loaders/web/sonix_audio",
     "langchain/document_loaders/web/confluence",
     "langchain/document_loaders/web/youtube",
package/dist/load/import_constants.js
CHANGED
@@ -96,6 +96,7 @@ export const optionalImportEntrypoints = [
     "langchain/document_loaders/web/pdf",
     "langchain/document_loaders/web/recursive_url",
     "langchain/document_loaders/web/s3",
+    "langchain/document_loaders/web/sitemap",
     "langchain/document_loaders/web/sonix_audio",
     "langchain/document_loaders/web/confluence",
     "langchain/document_loaders/web/youtube",
package/dist/text_splitter.cjs
CHANGED
@@ -68,7 +68,10 @@ class TextSplitter extends documents_1.BaseDocumentTransformer {
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     metadatas = [], chunkHeaderOptions = {}) {
         // if no metadata is provided, we create an empty one for each text
-
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const _metadatas = metadatas.length > 0
+            ? metadatas
+            : [...Array(texts.length)].map(() => ({}));
         const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false, } = chunkHeaderOptions;
         const documents = new Array();
         for (let i = 0; i < texts.length; i += 1) {
package/dist/text_splitter.js
CHANGED
@@ -65,7 +65,10 @@ export class TextSplitter extends BaseDocumentTransformer {
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     metadatas = [], chunkHeaderOptions = {}) {
         // if no metadata is provided, we create an empty one for each text
-
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const _metadatas = metadatas.length > 0
+            ? metadatas
+            : [...Array(texts.length)].map(() => ({}));
         const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false, } = chunkHeaderOptions;
         const documents = new Array();
         for (let i = 0; i < texts.length; i += 1) {
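The two hunks above make the default-metadata behavior explicit: when no metadatas array is passed, createDocuments builds one empty metadata object per input text. A quick sketch, assuming the stock RecursiveCharacterTextSplitter:

import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: 100,
  chunkOverlap: 0,
});

// No metadatas argument: each text gets an empty metadata object by default.
const docs = await splitter.createDocuments(["first text", "second text"]);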
package/document_loaders/web/sitemap.cjs
ADDED
@@ -0,0 +1 @@
+module.exports = require('../../dist/document_loaders/web/sitemap.cjs');
package/document_loaders/web/sitemap.d.cts
ADDED
@@ -0,0 +1 @@
+export * from '../../dist/document_loaders/web/sitemap.js'
package/document_loaders/web/sitemap.d.ts
ADDED
@@ -0,0 +1 @@
+export * from '../../dist/document_loaders/web/sitemap.js'
package/document_loaders/web/sitemap.js
ADDED
@@ -0,0 +1 @@
+export * from '../../dist/document_loaders/web/sitemap.js'
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "langchain",
-  "version": "0.1.
+  "version": "0.1.18",
   "description": "Typescript bindings for langchain",
   "type": "module",
   "engines": {
@@ -602,6 +602,10 @@
     "document_loaders/web/s3.js",
     "document_loaders/web/s3.d.ts",
     "document_loaders/web/s3.d.cts",
+    "document_loaders/web/sitemap.cjs",
+    "document_loaders/web/sitemap.js",
+    "document_loaders/web/sitemap.d.ts",
+    "document_loaders/web/sitemap.d.cts",
     "document_loaders/web/sonix_audio.cjs",
     "document_loaders/web/sonix_audio.js",
     "document_loaders/web/sonix_audio.d.ts",
@@ -1500,15 +1504,15 @@
   },
   "dependencies": {
     "@anthropic-ai/sdk": "^0.9.1",
-    "@langchain/community": "~0.0.
-    "@langchain/core": "~0.1.
-    "@langchain/openai": "~0.0.
+    "@langchain/community": "~0.0.28",
+    "@langchain/core": "~0.1.28",
+    "@langchain/openai": "~0.0.14",
     "binary-extensions": "^2.2.0",
     "expr-eval": "^2.0.2",
     "js-tiktoken": "^1.0.7",
     "js-yaml": "^4.1.0",
     "jsonpointer": "^5.0.1",
-    "langchainhub": "~0.0.
+    "langchainhub": "~0.0.8",
     "langsmith": "~0.0.59",
     "ml-distance": "^4.0.0",
     "openapi-types": "^12.1.3",
@@ -2868,6 +2872,15 @@
       "import": "./document_loaders/web/s3.js",
       "require": "./document_loaders/web/s3.cjs"
     },
+    "./document_loaders/web/sitemap": {
+      "types": {
+        "import": "./document_loaders/web/sitemap.d.ts",
+        "require": "./document_loaders/web/sitemap.d.cts",
+        "default": "./document_loaders/web/sitemap.d.ts"
+      },
+      "import": "./document_loaders/web/sitemap.js",
+      "require": "./document_loaders/web/sitemap.cjs"
+    },
    "./document_loaders/web/sonix_audio": {
      "types": {
        "import": "./document_loaders/web/sonix_audio.d.ts",
|