langchain 0.1.16 → 0.1.18

This diff compares the contents of two publicly released versions of the package as published to its registry, and is provided for informational purposes only.
@@ -22,6 +22,13 @@ const CYPHER_QA_TEMPLATE = `You are an assistant that helps to form nice and hum
 The information part contains the provided information that you must use to construct an answer.
 The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
 Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
+Here is an example:
+
+Question: Which managers own Neo4j stocks?
+Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
+Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.
+
+Follow this example when generating answers.
 If the provided information is empty, say that you don't know the answer.
 Information:
 {context}
@@ -19,6 +19,13 @@ const CYPHER_QA_TEMPLATE = `You are an assistant that helps to form nice and hum
 The information part contains the provided information that you must use to construct an answer.
 The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
 Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
+Here is an example:
+
+Question: Which managers own Neo4j stocks?
+Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
+Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.
+
+Follow this example when generating answers.
 If the provided information is empty, say that you don't know the answer.
 Information:
 {context}
@@ -1,7 +1,7 @@
 import type { BaseRetrieverInterface } from "@langchain/core/retrievers";
 import { type Runnable, type RunnableInterface } from "@langchain/core/runnables";
 import type { BaseMessage } from "@langchain/core/messages";
-import type { DocumentInterface } from "@langchain/core/documents";
+import type { DocumentInterface, Document } from "@langchain/core/documents";
 /**
  * Parameters for the createRetrievalChain method.
  */
@@ -58,7 +58,7 @@ export declare function createRetrievalChain<RunOutput>({ retriever, combineDocs
 } & {
     [key: string]: unknown;
 }, {
-    context: string;
+    context: Document[];
     answer: RunOutput;
 } & {
     [key: string]: unknown;
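
With this change, the `context` field of the chain output is typed as the retrieved `Document[]` rather than a pre-formatted string, so per-document metadata survives into the result. A minimal sketch of consuming the new shape; the `retriever` and `combineDocsChain` values are assumed placeholders, declared only so the snippet type-checks:

```ts
import type { BaseRetrieverInterface } from "@langchain/core/retrievers";
import type { Runnable } from "@langchain/core/runnables";
import { createRetrievalChain } from "langchain/chains/retrieval";

// Assumed to exist elsewhere; declared here so this sketch compiles.
declare const retriever: BaseRetrieverInterface;
declare const combineDocsChain: Runnable<Record<string, unknown>, string>;

const chain = await createRetrievalChain({ retriever, combineDocsChain });
const result = await chain.invoke({ input: "Which managers own Neo4j stocks?" });

// `result.context` is now Document[] instead of string:
for (const doc of result.context) {
  console.log(doc.metadata.source);
}
console.log(result.answer);
```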
@@ -54,14 +54,23 @@ class CheerioWebBaseLoader extends base_js_1.BaseDocumentLoader {
         this.selector = selector ?? "body";
         this.textDecoder = textDecoder;
     }
-    static async _scrape(url, caller, timeout, textDecoder) {
+    /**
+     * Fetches web documents from the given array of URLs and loads them using Cheerio.
+     * It returns an array of CheerioAPI instances.
+     * @param urls An array of URLs to fetch and load.
+     * @returns A Promise that resolves to an array of CheerioAPI instances.
+     */
+    static async scrapeAll(urls, caller, timeout, textDecoder, options) {
+        return Promise.all(urls.map((url) => CheerioWebBaseLoader._scrape(url, caller, timeout, textDecoder, options)));
+    }
+    static async _scrape(url, caller, timeout, textDecoder, options) {
         const { load } = await CheerioWebBaseLoader.imports();
         const response = await caller.call(fetch, url, {
             signal: timeout ? AbortSignal.timeout(timeout) : undefined,
         });
         const html = textDecoder?.decode(await response.arrayBuffer()) ??
             (await response.text());
-        return load(html);
+        return load(html, options);
     }
     /**
      * Fetches the web document from the webPath and loads it using Cheerio.
@@ -1,4 +1,4 @@
-import type { CheerioAPI, load as LoadT, SelectorType } from "cheerio";
+import type { CheerioAPI, CheerioOptions, load as LoadT, SelectorType } from "cheerio";
 import { Document } from "@langchain/core/documents";
 import { AsyncCaller, AsyncCallerParams } from "@langchain/core/utils/async_caller";
 import { BaseDocumentLoader } from "../base.js";
@@ -41,7 +41,14 @@ export declare class CheerioWebBaseLoader extends BaseDocumentLoader implements
     selector?: SelectorType;
     textDecoder?: TextDecoder;
     constructor(webPath: string, fields?: WebBaseLoaderParams);
-    static _scrape(url: string, caller: AsyncCaller, timeout: number | undefined, textDecoder?: TextDecoder): Promise<CheerioAPI>;
+    /**
+     * Fetches web documents from the given array of URLs and loads them using Cheerio.
+     * It returns an array of CheerioAPI instances.
+     * @param urls An array of URLs to fetch and load.
+     * @returns A Promise that resolves to an array of CheerioAPI instances.
+     */
+    static scrapeAll(urls: string[], caller: AsyncCaller, timeout: number | undefined, textDecoder?: TextDecoder, options?: CheerioOptions): Promise<CheerioAPI[]>;
+    static _scrape(url: string, caller: AsyncCaller, timeout: number | undefined, textDecoder?: TextDecoder, options?: CheerioOptions): Promise<CheerioAPI>;
     /**
      * Fetches the web document from the webPath and loads it using Cheerio.
      * It returns a CheerioAPI instance.
@@ -51,14 +51,23 @@ export class CheerioWebBaseLoader extends BaseDocumentLoader {
         this.selector = selector ?? "body";
         this.textDecoder = textDecoder;
     }
-    static async _scrape(url, caller, timeout, textDecoder) {
+    /**
+     * Fetches web documents from the given array of URLs and loads them using Cheerio.
+     * It returns an array of CheerioAPI instances.
+     * @param urls An array of URLs to fetch and load.
+     * @returns A Promise that resolves to an array of CheerioAPI instances.
+     */
+    static async scrapeAll(urls, caller, timeout, textDecoder, options) {
+        return Promise.all(urls.map((url) => CheerioWebBaseLoader._scrape(url, caller, timeout, textDecoder, options)));
+    }
+    static async _scrape(url, caller, timeout, textDecoder, options) {
         const { load } = await CheerioWebBaseLoader.imports();
         const response = await caller.call(fetch, url, {
             signal: timeout ? AbortSignal.timeout(timeout) : undefined,
         });
         const html = textDecoder?.decode(await response.arrayBuffer()) ??
             (await response.text());
-        return load(html);
+        return load(html, options);
     }
     /**
      * Fetches the web document from the webPath and loads it using Cheerio.
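
The `CheerioWebBaseLoader` hunks above add a static `scrapeAll` for fetching several URLs concurrently, and thread an optional `CheerioOptions` argument through to Cheerio's `load` (the sitemap loader below relies on this to parse XML). A sketch of calling it directly; the URLs are illustrative placeholders:

```ts
import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";
import { AsyncCaller } from "@langchain/core/utils/async_caller";

// Fetch two pages in parallel; the URLs here are placeholders.
const pages = await CheerioWebBaseLoader.scrapeAll(
  ["https://example.com/a", "https://example.com/b"],
  new AsyncCaller({}),
  10000,             // timeout in milliseconds
  undefined,         // optional TextDecoder
  { xmlMode: false } // CheerioOptions, now forwarded to load()
);
for (const $ of pages) {
  console.log($("title").text());
}
```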
@@ -0,0 +1,110 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.SitemapLoader = void 0;
+const documents_1 = require("@langchain/core/documents");
+const chunk_array_1 = require("@langchain/core/utils/chunk_array");
+const cheerio_js_1 = require("./cheerio.cjs");
+const DEFAULT_CHUNK_SIZE = 300;
+class SitemapLoader extends cheerio_js_1.CheerioWebBaseLoader {
+    constructor(webPath, params = {}) {
+        const paramsWithDefaults = { chunkSize: DEFAULT_CHUNK_SIZE, ...params };
+        let path = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
+        // Allow for custom sitemap paths to be passed in with the url.
+        path = path.endsWith(".xml") ? path : `${path}/sitemap.xml`;
+        super(path, paramsWithDefaults);
+        Object.defineProperty(this, "webPath", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: webPath
+        });
+        Object.defineProperty(this, "allowUrlPatterns", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "chunkSize", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.webPath = path;
+        this.allowUrlPatterns = paramsWithDefaults.filterUrls;
+        this.chunkSize = paramsWithDefaults.chunkSize;
+    }
+    _checkUrlPatterns(url) {
+        if (!this.allowUrlPatterns) {
+            return false;
+        }
+        return !this.allowUrlPatterns.some((pattern) => !new RegExp(pattern).test(url));
+    }
+    async parseSitemap() {
+        const $ = await cheerio_js_1.CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, {
+            xmlMode: true,
+            xml: true,
+        });
+        const elements = [];
+        $("url").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            if (this._checkUrlPatterns(loc)) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        $("sitemap").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        return elements;
+    }
+    async _loadSitemapUrls(elements) {
+        const all = await cheerio_js_1.CheerioWebBaseLoader.scrapeAll(elements.map((ele) => ele.loc), this.caller, this.timeout, this.textDecoder);
+        const documents = all.map(($, i) => {
+            if (!elements[i]) {
+                throw new Error("Scraped docs and elements not in sync");
+            }
+            const text = $(this.selector).text();
+            const { loc: source, ...metadata } = elements[i];
+            // extract page metadata
+            const description = $("meta[name='description']").attr("content");
+            const title = $("meta[property='og:title']").attr("content");
+            const lang = $("meta[property='og:locale']").attr("content");
+            return new documents_1.Document({
+                pageContent: text,
+                metadata: {
+                    ...metadata,
+                    description,
+                    title,
+                    lang,
+                    source: source.trim(),
+                },
+            });
+        });
+        return documents;
+    }
+    async load() {
+        const elements = await this.parseSitemap();
+        const chunks = (0, chunk_array_1.chunkArray)(elements, this.chunkSize);
+        const documents = [];
+        for await (const chunk of chunks) {
+            const chunkedDocuments = await this._loadSitemapUrls(chunk);
+            documents.push(...chunkedDocuments);
+        }
+        return documents;
+    }
+}
+exports.SitemapLoader = SitemapLoader;
@@ -0,0 +1,36 @@
+import { Document, DocumentInterface } from "@langchain/core/documents";
+import { CheerioWebBaseLoader, WebBaseLoaderParams } from "./cheerio.js";
+/**
+ * Interface representing the parameters for initializing a SitemapLoader.
+ * @interface SitemapLoaderParams
+ * @extends WebBaseLoaderParams
+ */
+export interface SitemapLoaderParams extends WebBaseLoaderParams {
+    /**
+     * @property {(string | RegExp)[] | undefined} filterUrls - A list of regexes. Only URLs that match one of the filter URLs will be loaded.
+     * WARNING: The filter URLs are interpreted as regular expressions. Escape special characters if needed.
+     */
+    filterUrls?: (string | RegExp)[];
+    /**
+     * The size to chunk the sitemap URLs into for scraping.
+     * @default {300}
+     */
+    chunkSize?: number;
+}
+type SiteMapElement = {
+    loc: string;
+    changefreq?: string;
+    lastmod?: string;
+    priority?: string;
+};
+export declare class SitemapLoader extends CheerioWebBaseLoader implements SitemapLoaderParams {
+    webPath: string;
+    allowUrlPatterns: (string | RegExp)[] | undefined;
+    chunkSize: number;
+    constructor(webPath: string, params?: SitemapLoaderParams);
+    _checkUrlPatterns(url: string): boolean;
+    parseSitemap(): Promise<SiteMapElement[]>;
+    _loadSitemapUrls(elements: Array<SiteMapElement>): Promise<DocumentInterface[]>;
+    load(): Promise<Document[]>;
+}
+export {};
@@ -0,0 +1,106 @@
+import { Document } from "@langchain/core/documents";
+import { chunkArray } from "@langchain/core/utils/chunk_array";
+import { CheerioWebBaseLoader } from "./cheerio.js";
+const DEFAULT_CHUNK_SIZE = 300;
+export class SitemapLoader extends CheerioWebBaseLoader {
+    constructor(webPath, params = {}) {
+        const paramsWithDefaults = { chunkSize: DEFAULT_CHUNK_SIZE, ...params };
+        let path = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
+        // Allow for custom sitemap paths to be passed in with the url.
+        path = path.endsWith(".xml") ? path : `${path}/sitemap.xml`;
+        super(path, paramsWithDefaults);
+        Object.defineProperty(this, "webPath", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: webPath
+        });
+        Object.defineProperty(this, "allowUrlPatterns", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "chunkSize", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.webPath = path;
+        this.allowUrlPatterns = paramsWithDefaults.filterUrls;
+        this.chunkSize = paramsWithDefaults.chunkSize;
+    }
+    _checkUrlPatterns(url) {
+        if (!this.allowUrlPatterns) {
+            return false;
+        }
+        return !this.allowUrlPatterns.some((pattern) => !new RegExp(pattern).test(url));
+    }
+    async parseSitemap() {
+        const $ = await CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, {
+            xmlMode: true,
+            xml: true,
+        });
+        const elements = [];
+        $("url").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            if (this._checkUrlPatterns(loc)) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        $("sitemap").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        return elements;
+    }
+    async _loadSitemapUrls(elements) {
+        const all = await CheerioWebBaseLoader.scrapeAll(elements.map((ele) => ele.loc), this.caller, this.timeout, this.textDecoder);
+        const documents = all.map(($, i) => {
+            if (!elements[i]) {
+                throw new Error("Scraped docs and elements not in sync");
+            }
+            const text = $(this.selector).text();
+            const { loc: source, ...metadata } = elements[i];
+            // extract page metadata
+            const description = $("meta[name='description']").attr("content");
+            const title = $("meta[property='og:title']").attr("content");
+            const lang = $("meta[property='og:locale']").attr("content");
+            return new Document({
+                pageContent: text,
+                metadata: {
+                    ...metadata,
+                    description,
+                    title,
+                    lang,
+                    source: source.trim(),
+                },
+            });
+        });
+        return documents;
+    }
+    async load() {
+        const elements = await this.parseSitemap();
+        const chunks = chunkArray(elements, this.chunkSize);
+        const documents = [];
+        for await (const chunk of chunks) {
+            const chunkedDocuments = await this._loadSitemapUrls(chunk);
+            documents.push(...chunkedDocuments);
+        }
+        return documents;
+    }
+}
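
Taken together, the three new files above (CJS build, type declarations, ESM build) introduce a `SitemapLoader` that scrapes every page listed in a sitemap, batching requests `chunkSize` URLs at a time. A hedged usage sketch against the new entrypoint; the URL is illustrative:

```ts
import { SitemapLoader } from "langchain/document_loaders/web/sitemap";

// "/sitemap.xml" is appended automatically unless the path already ends in .xml.
const loader = new SitemapLoader("https://www.example.com/", {
  filterUrls: ["https://www.example.com/docs/.*"], // treated as regexes per the docs
  chunkSize: 100, // scrape URLs in batches of 100 (default 300)
});

const docs = await loader.load();
// Each Document carries sitemap fields (changefreq, lastmod, priority) plus
// page metadata (description, title, lang) and the page URL as `source`.
console.log(docs.length);
```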
@@ -221,6 +221,8 @@ class OpenAIAssistantRunnable extends runnables_1.Runnable {
         return {
             returnValues: {
                 output: answerString,
+                runId,
+                threadId,
             },
             log: "",
             runId,
@@ -218,6 +218,8 @@ export class OpenAIAssistantRunnable extends Runnable {
         return {
             returnValues: {
                 output: answerString,
+                runId,
+                threadId,
             },
             log: "",
             runId,
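
Both builds of `OpenAIAssistantRunnable` now copy `runId` and `threadId` into the agent's `returnValues`, so callers can pick up the same thread later. A minimal sketch, assuming an assistant already exists (the id is a placeholder):

```ts
import { OpenAIAssistantRunnable } from "langchain/experimental/openai_assistant";

const assistant = new OpenAIAssistantRunnable({
  assistantId: "asst_...", // placeholder id for an existing assistant
  asAgent: true,
});

const response = await assistant.invoke({ content: "Hello" });
// When the run finishes, returnValues now includes the ids alongside output:
//   response.returnValues.output
//   response.returnValues.runId
//   response.returnValues.threadId
```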
@@ -1,5 +1,10 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.index = void 0;
+exports._HashedDocument = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports.index = void 0;
 var indexing_js_1 = require("./indexing.cjs");
 Object.defineProperty(exports, "index", { enumerable: true, get: function () { return indexing_js_1.index; } });
+Object.defineProperty(exports, "_batch", { enumerable: true, get: function () { return indexing_js_1._batch; } });
+Object.defineProperty(exports, "_deduplicateInOrder", { enumerable: true, get: function () { return indexing_js_1._deduplicateInOrder; } });
+Object.defineProperty(exports, "_getSourceIdAssigner", { enumerable: true, get: function () { return indexing_js_1._getSourceIdAssigner; } });
+Object.defineProperty(exports, "_isBaseDocumentLoader", { enumerable: true, get: function () { return indexing_js_1._isBaseDocumentLoader; } });
+Object.defineProperty(exports, "_HashedDocument", { enumerable: true, get: function () { return indexing_js_1._HashedDocument; } });
@@ -1 +1 @@
-export { type CleanupMode, type IndexOptions, index } from "./indexing.js";
+export { type CleanupMode, type IndexOptions, index, _batch, _deduplicateInOrder, _getSourceIdAssigner, _isBaseDocumentLoader, _HashedDocument, } from "./indexing.js";
@@ -1 +1 @@
-export { index } from "./indexing.js";
+export { index, _batch, _deduplicateInOrder, _getSourceIdAssigner, _isBaseDocumentLoader, _HashedDocument, } from "./indexing.js";
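
The previously private indexing helpers are now re-exported with an underscore prefix, signaling internal use (e.g. for unit tests). Assuming they resolve through the `langchain/indexes` entrypoint shown above, `_batch` simply chunks an array:

```ts
import { _batch } from "langchain/indexes";

// _batch splits an array into fixed-size batches:
const batches = _batch(2, ["a", "b", "c", "d", "e"]);
console.log(batches); // [["a", "b"], ["c", "d"], ["e"]]
```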
@@ -1,6 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.index = void 0;
+exports.index = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports._HashedDocument = void 0;
 const uuid_1 = require("uuid");
 const base_1 = require("@langchain/community/indexes/base");
 const hash_1 = require("@langchain/core/utils/hash");
@@ -10,7 +10,7 @@ const document_js_1 = require("../document.cjs");
  * Hashes are calculated based on page content and metadata.
  * It is used for indexing.
  */
-class HashedDocument {
+class _HashedDocument {
     constructor(fields) {
         Object.defineProperty(this, "uid", {
             enumerable: true,
@@ -98,7 +98,8 @@ class HashedDocument {
         return (0, uuid_1.v5)(hash_value, base_1.UUIDV5_NAMESPACE);
     }
 }
-function batch(size, iterable) {
+exports._HashedDocument = _HashedDocument;
+function _batch(size, iterable) {
     const batches = [];
     let currentBatch = [];
     iterable.forEach((item) => {
@@ -113,7 +114,8 @@ function batch(size, iterable) {
     }
     return batches;
 }
-function deduplicateInOrder(hashedDocuments) {
+exports._batch = _batch;
+function _deduplicateInOrder(hashedDocuments) {
     const seen = new Set();
     const deduplicated = [];
     for (const hashedDoc of hashedDocuments) {
@@ -127,7 +129,8 @@ function deduplicateInOrder(hashedDocuments) {
     }
     return deduplicated;
 }
-function getSourceIdAssigner(sourceIdKey) {
+exports._deduplicateInOrder = _deduplicateInOrder;
+function _getSourceIdAssigner(sourceIdKey) {
     if (sourceIdKey === null) {
         return (_doc) => null;
     }
@@ -141,6 +144,7 @@ function getSourceIdAssigner(sourceIdKey) {
     throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`);
     }
 }
+exports._getSourceIdAssigner = _getSourceIdAssigner;
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 const _isBaseDocumentLoader = (arg) => {
     if ("load" in arg &&
@@ -151,6 +155,7 @@ const _isBaseDocumentLoader = (arg) => {
     }
     return false;
 };
+exports._isBaseDocumentLoader = _isBaseDocumentLoader;
 /**
  * Index data from the doc source into the vector store.
  *
@@ -176,18 +181,18 @@ async function index(args) {
     if (cleanup === "incremental" && !sourceIdKey) {
         throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.");
     }
-    const docs = _isBaseDocumentLoader(docsSource)
+    const docs = (0, exports._isBaseDocumentLoader)(docsSource)
         ? await docsSource.load()
        : docsSource;
-    const sourceIdAssigner = getSourceIdAssigner(sourceIdKey ?? null);
+    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
     const indexStartDt = await recordManager.getTime();
     let numAdded = 0;
     let numDeleted = 0;
     let numUpdated = 0;
     let numSkipped = 0;
-    const batches = batch(batchSize ?? 100, docs);
+    const batches = _batch(batchSize ?? 100, docs);
     for (const batch of batches) {
-        const hashedDocs = deduplicateInOrder(batch.map((doc) => HashedDocument.fromDocument(doc)));
+        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
         const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
         if (cleanup === "incremental") {
             hashedDocs.forEach((_hashedDoc, index) => {
@@ -2,6 +2,7 @@ import { VectorStore } from "@langchain/core/vectorstores";
 import { RecordManagerInterface } from "@langchain/community/indexes/base";
 import { DocumentInterface } from "@langchain/core/documents";
 import { BaseDocumentLoader } from "../document_loaders/base.js";
+type Metadata = Record<string, unknown>;
 type IndexingResult = {
     numAdded: number;
     numDeleted: number;
@@ -9,6 +10,40 @@ type IndexingResult = {
     numSkipped: number;
 };
 type StringOrDocFunc = string | ((doc: DocumentInterface) => string);
+export interface HashedDocumentInterface extends DocumentInterface {
+    uid: string;
+    hash_?: string;
+    contentHash?: string;
+    metadataHash?: string;
+    pageContent: string;
+    metadata: Metadata;
+    calculateHashes(): void;
+    toDocument(): DocumentInterface;
+}
+interface HashedDocumentArgs {
+    pageContent: string;
+    metadata: Metadata;
+    uid: string;
+}
+/**
+ * HashedDocument is a Document with hashes calculated.
+ * Hashes are calculated based on page content and metadata.
+ * It is used for indexing.
+ */
+export declare class _HashedDocument implements HashedDocumentInterface {
+    uid: string;
+    hash_?: string;
+    contentHash?: string;
+    metadataHash?: string;
+    pageContent: string;
+    metadata: Metadata;
+    constructor(fields: HashedDocumentArgs);
+    calculateHashes(): void;
+    toDocument(): DocumentInterface;
+    static fromDocument(document: DocumentInterface, uid?: string): _HashedDocument;
+    private _hashStringToUUID;
+    private _hashNestedDictToUUID;
+}
 export type CleanupMode = "full" | "incremental";
 export type IndexOptions = {
     /**
@@ -46,6 +81,10 @@ export type IndexOptions = {
      */
     forceUpdate?: boolean;
 };
+export declare function _batch<T>(size: number, iterable: T[]): T[][];
+export declare function _deduplicateInOrder(hashedDocuments: HashedDocumentInterface[]): HashedDocumentInterface[];
+export declare function _getSourceIdAssigner(sourceIdKey: StringOrDocFunc | null): (doc: DocumentInterface) => string | null;
+export declare const _isBaseDocumentLoader: (arg: any) => arg is BaseDocumentLoader;
 interface IndexArgs {
     docsSource: BaseDocumentLoader | DocumentInterface[];
     recordManager: RecordManagerInterface;
@@ -7,7 +7,7 @@ import { Document } from "../document.js";
  * Hashes are calculated based on page content and metadata.
  * It is used for indexing.
  */
-class HashedDocument {
+export class _HashedDocument {
     constructor(fields) {
         Object.defineProperty(this, "uid", {
             enumerable: true,
@@ -95,7 +95,7 @@ class HashedDocument {
         return uuidv5(hash_value, UUIDV5_NAMESPACE);
     }
 }
-function batch(size, iterable) {
+export function _batch(size, iterable) {
     const batches = [];
     let currentBatch = [];
     iterable.forEach((item) => {
@@ -110,7 +110,7 @@ function batch(size, iterable) {
     }
     return batches;
 }
-function deduplicateInOrder(hashedDocuments) {
+export function _deduplicateInOrder(hashedDocuments) {
     const seen = new Set();
     const deduplicated = [];
     for (const hashedDoc of hashedDocuments) {
@@ -124,7 +124,7 @@ function deduplicateInOrder(hashedDocuments) {
     }
     return deduplicated;
 }
-function getSourceIdAssigner(sourceIdKey) {
+export function _getSourceIdAssigner(sourceIdKey) {
     if (sourceIdKey === null) {
         return (_doc) => null;
     }
@@ -139,7 +139,7 @@ function getSourceIdAssigner(sourceIdKey) {
     }
 }
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
-const _isBaseDocumentLoader = (arg) => {
+export const _isBaseDocumentLoader = (arg) => {
     if ("load" in arg &&
         typeof arg.load === "function" &&
         "loadAndSplit" in arg &&
@@ -176,15 +176,15 @@ export async function index(args) {
     const docs = _isBaseDocumentLoader(docsSource)
         ? await docsSource.load()
         : docsSource;
-    const sourceIdAssigner = getSourceIdAssigner(sourceIdKey ?? null);
+    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
     const indexStartDt = await recordManager.getTime();
     let numAdded = 0;
     let numDeleted = 0;
     let numUpdated = 0;
     let numSkipped = 0;
-    const batches = batch(batchSize ?? 100, docs);
+    const batches = _batch(batchSize ?? 100, docs);
     for (const batch of batches) {
-        const hashedDocs = deduplicateInOrder(batch.map((doc) => HashedDocument.fromDocument(doc)));
+        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
         const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
         if (cleanup === "incremental") {
             hashedDocs.forEach((_hashedDoc, index) => {
@@ -99,6 +99,7 @@ exports.optionalImportEntrypoints = [
     "langchain/document_loaders/web/pdf",
     "langchain/document_loaders/web/recursive_url",
     "langchain/document_loaders/web/s3",
+    "langchain/document_loaders/web/sitemap",
     "langchain/document_loaders/web/sonix_audio",
     "langchain/document_loaders/web/confluence",
     "langchain/document_loaders/web/youtube",
@@ -96,6 +96,7 @@ export const optionalImportEntrypoints = [
     "langchain/document_loaders/web/pdf",
     "langchain/document_loaders/web/recursive_url",
     "langchain/document_loaders/web/s3",
+    "langchain/document_loaders/web/sitemap",
     "langchain/document_loaders/web/sonix_audio",
     "langchain/document_loaders/web/confluence",
     "langchain/document_loaders/web/youtube",
@@ -68,7 +68,10 @@ class TextSplitter extends documents_1.BaseDocumentTransformer {
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     metadatas = [], chunkHeaderOptions = {}) {
         // if no metadata is provided, we create an empty one for each text
-        const _metadatas = metadatas.length > 0 ? metadatas : new Array(texts.length).fill({});
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const _metadatas = metadatas.length > 0
+            ? metadatas
+            : [...Array(texts.length)].map(() => ({}));
         const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false, } = chunkHeaderOptions;
         const documents = new Array();
         for (let i = 0; i < texts.length; i += 1) {
@@ -65,7 +65,10 @@ export class TextSplitter extends BaseDocumentTransformer {
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     metadatas = [], chunkHeaderOptions = {}) {
         // if no metadata is provided, we create an empty one for each text
-        const _metadatas = metadatas.length > 0 ? metadatas : new Array(texts.length).fill({});
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const _metadatas = metadatas.length > 0
+            ? metadatas
+            : [...Array(texts.length)].map(() => ({}));
         const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false, } = chunkHeaderOptions;
         const documents = new Array();
         for (let i = 0; i < texts.length; i += 1) {
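
The `TextSplitter` change fixes a shared-reference bug: `new Array(n).fill({})` puts the same object in every slot, so metadata written for one chunk leaked into all of them, while `[...Array(n)].map(() => ({}))` allocates a fresh object per text. The difference in isolation:

```ts
// Before: every slot aliases one object.
const shared: Record<string, string>[] = new Array(3).fill({});
shared[0].source = "a.txt";
console.log(shared[1].source); // "a.txt" — leaked into every slot

// After: each slot gets its own object.
const fresh: Record<string, string>[] = [...Array(3)].map(() => ({}));
fresh[0].source = "a.txt";
console.log(fresh[1].source); // undefined
```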
@@ -0,0 +1 @@
+module.exports = require('../../dist/document_loaders/web/sitemap.cjs');
@@ -0,0 +1 @@
+export * from '../../dist/document_loaders/web/sitemap.js'
@@ -0,0 +1 @@
+export * from '../../dist/document_loaders/web/sitemap.js'
@@ -0,0 +1 @@
+export * from '../../dist/document_loaders/web/sitemap.js'
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "langchain",
-  "version": "0.1.16",
+  "version": "0.1.18",
   "description": "Typescript bindings for langchain",
   "type": "module",
   "engines": {
@@ -602,6 +602,10 @@
     "document_loaders/web/s3.js",
     "document_loaders/web/s3.d.ts",
     "document_loaders/web/s3.d.cts",
+    "document_loaders/web/sitemap.cjs",
+    "document_loaders/web/sitemap.js",
+    "document_loaders/web/sitemap.d.ts",
+    "document_loaders/web/sitemap.d.cts",
     "document_loaders/web/sonix_audio.cjs",
     "document_loaders/web/sonix_audio.js",
     "document_loaders/web/sonix_audio.d.ts",
@@ -1500,15 +1504,15 @@
   },
   "dependencies": {
     "@anthropic-ai/sdk": "^0.9.1",
-    "@langchain/community": "~0.0.20",
-    "@langchain/core": "~0.1.25",
-    "@langchain/openai": "~0.0.12",
+    "@langchain/community": "~0.0.28",
+    "@langchain/core": "~0.1.28",
+    "@langchain/openai": "~0.0.14",
     "binary-extensions": "^2.2.0",
     "expr-eval": "^2.0.2",
     "js-tiktoken": "^1.0.7",
     "js-yaml": "^4.1.0",
     "jsonpointer": "^5.0.1",
-    "langchainhub": "~0.0.6",
+    "langchainhub": "~0.0.8",
     "langsmith": "~0.0.59",
     "ml-distance": "^4.0.0",
     "openapi-types": "^12.1.3",
@@ -2868,6 +2872,15 @@
     "import": "./document_loaders/web/s3.js",
     "require": "./document_loaders/web/s3.cjs"
   },
+  "./document_loaders/web/sitemap": {
+    "types": {
+      "import": "./document_loaders/web/sitemap.d.ts",
+      "require": "./document_loaders/web/sitemap.d.cts",
+      "default": "./document_loaders/web/sitemap.d.ts"
+    },
+    "import": "./document_loaders/web/sitemap.js",
+    "require": "./document_loaders/web/sitemap.cjs"
+  },
   "./document_loaders/web/sonix_audio": {
     "types": {
       "import": "./document_loaders/web/sonix_audio.d.ts",