langchain 0.1.16 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chains/graph_qa/prompts.cjs +7 -0
- package/dist/chains/graph_qa/prompts.js +7 -0
- package/dist/chains/retrieval.d.ts +2 -2
- package/dist/document_loaders/web/cheerio.cjs +11 -2
- package/dist/document_loaders/web/cheerio.d.ts +9 -2
- package/dist/document_loaders/web/cheerio.js +11 -2
- package/dist/document_loaders/web/sitemap.cjs +110 -0
- package/dist/document_loaders/web/sitemap.d.ts +36 -0
- package/dist/document_loaders/web/sitemap.js +106 -0
- package/dist/experimental/openai_assistant/index.cjs +2 -0
- package/dist/experimental/openai_assistant/index.js +2 -0
- package/dist/indexes/index.cjs +6 -1
- package/dist/indexes/index.d.ts +1 -1
- package/dist/indexes/index.js +1 -1
- package/dist/indexes/indexing.cjs +14 -9
- package/dist/indexes/indexing.d.ts +39 -0
- package/dist/indexes/indexing.js +8 -8
- package/dist/load/import_constants.cjs +1 -0
- package/dist/load/import_constants.js +1 -0
- package/dist/text_splitter.cjs +4 -1
- package/dist/text_splitter.js +4 -1
- package/document_loaders/web/sitemap.cjs +1 -0
- package/document_loaders/web/sitemap.d.cts +1 -0
- package/document_loaders/web/sitemap.d.ts +1 -0
- package/document_loaders/web/sitemap.js +1 -0
- package/package.json +18 -5
package/dist/chains/graph_qa/prompts.cjs
CHANGED
@@ -22,6 +22,13 @@ const CYPHER_QA_TEMPLATE = `You are an assistant that helps to form nice and hum
 The information part contains the provided information that you must use to construct an answer.
 The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
 Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
+Here is an example:
+
+Question: Which managers own Neo4j stocks?
+Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
+Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.
+
+Follow this example when generating answers.
 If the provided information is empty, say that you don't know the answer.
 Information:
 {context}
package/dist/chains/graph_qa/prompts.js
CHANGED
@@ -19,6 +19,13 @@ const CYPHER_QA_TEMPLATE = `You are an assistant that helps to form nice and hum
 The information part contains the provided information that you must use to construct an answer.
 The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
 Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
+Here is an example:
+
+Question: Which managers own Neo4j stocks?
+Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
+Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.
+
+Follow this example when generating answers.
 If the provided information is empty, say that you don't know the answer.
 Information:
 {context}
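For context, the two hunks above add a one-shot example to the CYPHER_QA_TEMPLATE used by the graph QA chain's answer step. A minimal sketch of the same pattern using @langchain/core's PromptTemplate (the template text is abbreviated to the added lines):

import { PromptTemplate } from "@langchain/core/prompts";

// Abbreviated sketch: a single worked example anchors the answer format
// before the real retrieved context is injected into {context}.
const qaPrompt = PromptTemplate.fromTemplate(`Here is an example:

Question: Which managers own Neo4j stocks?
Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.

Follow this example when generating answers.
If the provided information is empty, say that you don't know the answer.
Information:
{context}`);

const rendered = await qaPrompt.format({ context: "[manager:ACME CAPITAL]" });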
package/dist/chains/retrieval.d.ts
CHANGED
@@ -1,7 +1,7 @@
 import type { BaseRetrieverInterface } from "@langchain/core/retrievers";
 import { type Runnable, type RunnableInterface } from "@langchain/core/runnables";
 import type { BaseMessage } from "@langchain/core/messages";
-import type { DocumentInterface } from "@langchain/core/documents";
+import type { DocumentInterface, Document } from "@langchain/core/documents";
 /**
  * Parameters for the createRetrievalChain method.
  */
@@ -58,7 +58,7 @@ export declare function createRetrievalChain<RunOutput>({ retriever, combineDocs
 } & {
     [key: string]: unknown;
 }, {
-    context:
+    context: Document[];
     answer: RunOutput;
 } & {
     [key: string]: unknown;
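The practical effect of the typing fix above is that the chain returned by createRetrievalChain now declares its context output as Document[]. A minimal sketch, using placeholder declarations so it type-checks (real code supplies a concrete retriever and combine-docs chain):

import { createRetrievalChain } from "langchain/chains/retrieval";
import type { BaseRetrieverInterface } from "@langchain/core/retrievers";

// Placeholders for the sketch; in practice these come from a vector store
// retriever and a documents-combining chain.
declare const retriever: BaseRetrieverInterface;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
declare const combineDocsChain: any;

const retrievalChain = await createRetrievalChain({ retriever, combineDocsChain });
const result = await retrievalChain.invoke({ input: "..." });
// result.context is now typed as Document[] (see the hunk above).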
package/dist/document_loaders/web/cheerio.cjs
CHANGED
@@ -54,14 +54,23 @@ class CheerioWebBaseLoader extends base_js_1.BaseDocumentLoader {
         this.selector = selector ?? "body";
         this.textDecoder = textDecoder;
     }
-
+    /**
+     * Fetches web documents from the given array of URLs and loads them using Cheerio.
+     * It returns an array of CheerioAPI instances.
+     * @param urls An array of URLs to fetch and load.
+     * @returns A Promise that resolves to an array of CheerioAPI instances.
+     */
+    static async scrapeAll(urls, caller, timeout, textDecoder, options) {
+        return Promise.all(urls.map((url) => CheerioWebBaseLoader._scrape(url, caller, timeout, textDecoder, options)));
+    }
+    static async _scrape(url, caller, timeout, textDecoder, options) {
         const { load } = await CheerioWebBaseLoader.imports();
         const response = await caller.call(fetch, url, {
             signal: timeout ? AbortSignal.timeout(timeout) : undefined,
         });
         const html = textDecoder?.decode(await response.arrayBuffer()) ??
             (await response.text());
-        return load(html);
+        return load(html, options);
     }
     /**
      * Fetches the web document from the webPath and loads it using Cheerio.
package/dist/document_loaders/web/cheerio.d.ts
CHANGED
@@ -1,4 +1,4 @@
-import type { CheerioAPI, load as LoadT, SelectorType } from "cheerio";
+import type { CheerioAPI, CheerioOptions, load as LoadT, SelectorType } from "cheerio";
 import { Document } from "@langchain/core/documents";
 import { AsyncCaller, AsyncCallerParams } from "@langchain/core/utils/async_caller";
 import { BaseDocumentLoader } from "../base.js";
@@ -41,7 +41,14 @@ export declare class CheerioWebBaseLoader extends BaseDocumentLoader implements
     selector?: SelectorType;
     textDecoder?: TextDecoder;
     constructor(webPath: string, fields?: WebBaseLoaderParams);
-
+    /**
+     * Fetches web documents from the given array of URLs and loads them using Cheerio.
+     * It returns an array of CheerioAPI instances.
+     * @param urls An array of URLs to fetch and load.
+     * @returns A Promise that resolves to an array of CheerioAPI instances.
+     */
+    static scrapeAll(urls: string[], caller: AsyncCaller, timeout: number | undefined, textDecoder?: TextDecoder, options?: CheerioOptions): Promise<CheerioAPI[]>;
+    static _scrape(url: string, caller: AsyncCaller, timeout: number | undefined, textDecoder?: TextDecoder, options?: CheerioOptions): Promise<CheerioAPI>;
     /**
      * Fetches the web document from the webPath and loads it using Cheerio.
      * It returns a CheerioAPI instance.
package/dist/document_loaders/web/cheerio.js
CHANGED
@@ -51,14 +51,23 @@ export class CheerioWebBaseLoader extends BaseDocumentLoader {
         this.selector = selector ?? "body";
         this.textDecoder = textDecoder;
     }
-
+    /**
+     * Fetches web documents from the given array of URLs and loads them using Cheerio.
+     * It returns an array of CheerioAPI instances.
+     * @param urls An array of URLs to fetch and load.
+     * @returns A Promise that resolves to an array of CheerioAPI instances.
+     */
+    static async scrapeAll(urls, caller, timeout, textDecoder, options) {
+        return Promise.all(urls.map((url) => CheerioWebBaseLoader._scrape(url, caller, timeout, textDecoder, options)));
+    }
+    static async _scrape(url, caller, timeout, textDecoder, options) {
         const { load } = await CheerioWebBaseLoader.imports();
         const response = await caller.call(fetch, url, {
             signal: timeout ? AbortSignal.timeout(timeout) : undefined,
         });
         const html = textDecoder?.decode(await response.arrayBuffer()) ??
             (await response.text());
-        return load(html);
+        return load(html, options);
     }
     /**
      * Fetches the web document from the webPath and loads it using Cheerio.
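The new static scrapeAll fetches several URLs concurrently and is what the SitemapLoader below builds on. A minimal usage sketch (the URLs are placeholders, and the AsyncCaller is left at its defaults):

import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";
import { AsyncCaller } from "@langchain/core/utils/async_caller";

const caller = new AsyncCaller({});
// Fetch both pages in parallel; each result is a CheerioAPI instance.
const pages = await CheerioWebBaseLoader.scrapeAll(
  ["https://example.com/a", "https://example.com/b"],
  caller,
  10000 // per-request timeout in ms; pass undefined for no timeout
);
const titles = pages.map(($) => $("title").text());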
package/dist/document_loaders/web/sitemap.cjs
ADDED
@@ -0,0 +1,110 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.SitemapLoader = void 0;
+const documents_1 = require("@langchain/core/documents");
+const chunk_array_1 = require("@langchain/core/utils/chunk_array");
+const cheerio_js_1 = require("./cheerio.cjs");
+const DEFAULT_CHUNK_SIZE = 300;
+class SitemapLoader extends cheerio_js_1.CheerioWebBaseLoader {
+    constructor(webPath, params = {}) {
+        const paramsWithDefaults = { chunkSize: DEFAULT_CHUNK_SIZE, ...params };
+        let path = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
+        // Allow for custom sitemap paths to be passed in with the url.
+        path = path.endsWith(".xml") ? path : `${path}/sitemap.xml`;
+        super(path, paramsWithDefaults);
+        Object.defineProperty(this, "webPath", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: webPath
+        });
+        Object.defineProperty(this, "allowUrlPatterns", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "chunkSize", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.webPath = path;
+        this.allowUrlPatterns = paramsWithDefaults.filterUrls;
+        this.chunkSize = paramsWithDefaults.chunkSize;
+    }
+    _checkUrlPatterns(url) {
+        if (!this.allowUrlPatterns) {
+            return false;
+        }
+        return !this.allowUrlPatterns.some((pattern) => !new RegExp(pattern).test(url));
+    }
+    async parseSitemap() {
+        const $ = await cheerio_js_1.CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, {
+            xmlMode: true,
+            xml: true,
+        });
+        const elements = [];
+        $("url").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            if (this._checkUrlPatterns(loc)) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        $("sitemap").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        return elements;
+    }
+    async _loadSitemapUrls(elements) {
+        const all = await cheerio_js_1.CheerioWebBaseLoader.scrapeAll(elements.map((ele) => ele.loc), this.caller, this.timeout, this.textDecoder);
+        const documents = all.map(($, i) => {
+            if (!elements[i]) {
+                throw new Error("Scraped docs and elements not in sync");
+            }
+            const text = $(this.selector).text();
+            const { loc: source, ...metadata } = elements[i];
+            // extract page metadata
+            const description = $("meta[name='description']").attr("content");
+            const title = $("meta[property='og:title']").attr("content");
+            const lang = $("meta[property='og:locale']").attr("content");
+            return new documents_1.Document({
+                pageContent: text,
+                metadata: {
+                    ...metadata,
+                    description,
+                    title,
+                    lang,
+                    source: source.trim(),
+                },
+            });
+        });
+        return documents;
+    }
+    async load() {
+        const elements = await this.parseSitemap();
+        const chunks = (0, chunk_array_1.chunkArray)(elements, this.chunkSize);
+        const documents = [];
+        for await (const chunk of chunks) {
+            const chunkedDocuments = await this._loadSitemapUrls(chunk);
+            documents.push(...chunkedDocuments);
+        }
+        return documents;
+    }
+}
+exports.SitemapLoader = SitemapLoader;
package/dist/document_loaders/web/sitemap.d.ts
ADDED
@@ -0,0 +1,36 @@
+import { Document, DocumentInterface } from "@langchain/core/documents";
+import { CheerioWebBaseLoader, WebBaseLoaderParams } from "./cheerio.js";
+/**
+ * Interface representing the parameters for initializing a SitemapLoader.
+ * @interface SitemapLoaderParams
+ * @extends WebBaseLoaderParams
+ */
+export interface SitemapLoaderParams extends WebBaseLoaderParams {
+    /**
+     * @property {(string | RegExp)[] | undefined} filterUrls - A list of regexes. Only URLs that match one of the filter URLs will be loaded.
+     * WARNING: The filter URLs are interpreted as regular expressions. Escape special characters if needed.
+     */
+    filterUrls?: (string | RegExp)[];
+    /**
+     * The size to chunk the sitemap URLs into for scraping.
+     * @default {300}
+     */
+    chunkSize?: number;
+}
+type SiteMapElement = {
+    loc: string;
+    changefreq?: string;
+    lastmod?: string;
+    priority?: string;
+};
+export declare class SitemapLoader extends CheerioWebBaseLoader implements SitemapLoaderParams {
+    webPath: string;
+    allowUrlPatterns: (string | RegExp)[] | undefined;
+    chunkSize: number;
+    constructor(webPath: string, params?: SitemapLoaderParams);
+    _checkUrlPatterns(url: string): boolean;
+    parseSitemap(): Promise<SiteMapElement[]>;
+    _loadSitemapUrls(elements: Array<SiteMapElement>): Promise<DocumentInterface[]>;
+    load(): Promise<Document[]>;
+}
+export {};
package/dist/document_loaders/web/sitemap.js
ADDED
@@ -0,0 +1,106 @@
+import { Document } from "@langchain/core/documents";
+import { chunkArray } from "@langchain/core/utils/chunk_array";
+import { CheerioWebBaseLoader } from "./cheerio.js";
+const DEFAULT_CHUNK_SIZE = 300;
+export class SitemapLoader extends CheerioWebBaseLoader {
+    constructor(webPath, params = {}) {
+        const paramsWithDefaults = { chunkSize: DEFAULT_CHUNK_SIZE, ...params };
+        let path = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
+        // Allow for custom sitemap paths to be passed in with the url.
+        path = path.endsWith(".xml") ? path : `${path}/sitemap.xml`;
+        super(path, paramsWithDefaults);
+        Object.defineProperty(this, "webPath", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: webPath
+        });
+        Object.defineProperty(this, "allowUrlPatterns", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "chunkSize", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.webPath = path;
+        this.allowUrlPatterns = paramsWithDefaults.filterUrls;
+        this.chunkSize = paramsWithDefaults.chunkSize;
+    }
+    _checkUrlPatterns(url) {
+        if (!this.allowUrlPatterns) {
+            return false;
+        }
+        return !this.allowUrlPatterns.some((pattern) => !new RegExp(pattern).test(url));
+    }
+    async parseSitemap() {
+        const $ = await CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, {
+            xmlMode: true,
+            xml: true,
+        });
+        const elements = [];
+        $("url").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            if (this._checkUrlPatterns(loc)) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        $("sitemap").each((_, element) => {
+            const loc = $(element).find("loc").text();
+            if (!loc) {
+                return;
+            }
+            const changefreq = $(element).find("changefreq").text();
+            const lastmod = $(element).find("lastmod").text();
+            const priority = $(element).find("priority").text();
+            elements.push({ loc, changefreq, lastmod, priority });
+        });
+        return elements;
+    }
+    async _loadSitemapUrls(elements) {
+        const all = await CheerioWebBaseLoader.scrapeAll(elements.map((ele) => ele.loc), this.caller, this.timeout, this.textDecoder);
+        const documents = all.map(($, i) => {
+            if (!elements[i]) {
+                throw new Error("Scraped docs and elements not in sync");
+            }
+            const text = $(this.selector).text();
+            const { loc: source, ...metadata } = elements[i];
+            // extract page metadata
+            const description = $("meta[name='description']").attr("content");
+            const title = $("meta[property='og:title']").attr("content");
+            const lang = $("meta[property='og:locale']").attr("content");
+            return new Document({
+                pageContent: text,
+                metadata: {
+                    ...metadata,
+                    description,
+                    title,
+                    lang,
+                    source: source.trim(),
+                },
+            });
+        });
+        return documents;
+    }
+    async load() {
+        const elements = await this.parseSitemap();
+        const chunks = chunkArray(elements, this.chunkSize);
+        const documents = [];
+        for await (const chunk of chunks) {
+            const chunkedDocuments = await this._loadSitemapUrls(chunk);
+            documents.push(...chunkedDocuments);
+        }
+        return documents;
+    }
+}
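Taken together, the three new dist files implement the SitemapLoader added in this release. A minimal usage sketch (the site URL and filter pattern are illustrative):

import { SitemapLoader } from "langchain/document_loaders/web/sitemap";

const loader = new SitemapLoader("https://www.example.com/", {
  // Only sitemap entries matching one of these regexes are loaded.
  filterUrls: ["https://www.example.com/docs/.*"],
  // Entries are scraped in batches of this size (default 300).
  chunkSize: 100,
});

// Each sitemap entry becomes a Document whose metadata carries
// changefreq, lastmod, priority, and the page's own meta tags.
const docs = await loader.load();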
package/dist/indexes/index.cjs
CHANGED
@@ -1,5 +1,10 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.index = void 0;
+exports._HashedDocument = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports.index = void 0;
 var indexing_js_1 = require("./indexing.cjs");
 Object.defineProperty(exports, "index", { enumerable: true, get: function () { return indexing_js_1.index; } });
+Object.defineProperty(exports, "_batch", { enumerable: true, get: function () { return indexing_js_1._batch; } });
+Object.defineProperty(exports, "_deduplicateInOrder", { enumerable: true, get: function () { return indexing_js_1._deduplicateInOrder; } });
+Object.defineProperty(exports, "_getSourceIdAssigner", { enumerable: true, get: function () { return indexing_js_1._getSourceIdAssigner; } });
+Object.defineProperty(exports, "_isBaseDocumentLoader", { enumerable: true, get: function () { return indexing_js_1._isBaseDocumentLoader; } });
+Object.defineProperty(exports, "_HashedDocument", { enumerable: true, get: function () { return indexing_js_1._HashedDocument; } });
package/dist/indexes/index.d.ts
CHANGED
@@ -1 +1 @@
-export { type CleanupMode, type IndexOptions, index } from "./indexing.js";
+export { type CleanupMode, type IndexOptions, index, _batch, _deduplicateInOrder, _getSourceIdAssigner, _isBaseDocumentLoader, _HashedDocument, } from "./indexing.js";
package/dist/indexes/index.js
CHANGED
@@ -1 +1 @@
-export { index } from "./indexing.js";
+export { index, _batch, _deduplicateInOrder, _getSourceIdAssigner, _isBaseDocumentLoader, _HashedDocument, } from "./indexing.js";
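These previously internal indexing helpers are now re-exported, with underscore prefixes signalling they remain semi-private. A short sketch of the two most self-contained ones:

import { _batch, _deduplicateInOrder, _HashedDocument } from "langchain/indexes";
import { Document } from "@langchain/core/documents";

const docs = [
  new Document({ pageContent: "alpha" }),
  new Document({ pageContent: "alpha" }), // exact duplicate
  new Document({ pageContent: "beta" }),
];

// Hash each document, then drop later duplicates while preserving order.
const deduped = _deduplicateInOrder(
  docs.map((doc) => _HashedDocument.fromDocument(doc))
);

// Split the remaining documents into batches of at most two.
const batches = _batch(2, deduped);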
package/dist/indexes/indexing.cjs
CHANGED
@@ -1,6 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.index = void 0;
+exports.index = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports._HashedDocument = void 0;
 const uuid_1 = require("uuid");
 const base_1 = require("@langchain/community/indexes/base");
 const hash_1 = require("@langchain/core/utils/hash");
@@ -10,7 +10,7 @@ const document_js_1 = require("../document.cjs");
  * Hashes are calculated based on page content and metadata.
  * It is used for indexing.
  */
-class
+class _HashedDocument {
     constructor(fields) {
         Object.defineProperty(this, "uid", {
             enumerable: true,
@@ -98,7 +98,8 @@ class HashedDocument {
         return (0, uuid_1.v5)(hash_value, base_1.UUIDV5_NAMESPACE);
     }
 }
-
+exports._HashedDocument = _HashedDocument;
+function _batch(size, iterable) {
     const batches = [];
     let currentBatch = [];
     iterable.forEach((item) => {
@@ -113,7 +114,8 @@ function batch(size, iterable) {
     }
     return batches;
 }
-
+exports._batch = _batch;
+function _deduplicateInOrder(hashedDocuments) {
     const seen = new Set();
     const deduplicated = [];
     for (const hashedDoc of hashedDocuments) {
@@ -127,7 +129,8 @@ function deduplicateInOrder(hashedDocuments) {
     }
     return deduplicated;
 }
-
+exports._deduplicateInOrder = _deduplicateInOrder;
+function _getSourceIdAssigner(sourceIdKey) {
     if (sourceIdKey === null) {
         return (_doc) => null;
     }
@@ -141,6 +144,7 @@ function getSourceIdAssigner(sourceIdKey) {
         throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`);
     }
 }
+exports._getSourceIdAssigner = _getSourceIdAssigner;
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 const _isBaseDocumentLoader = (arg) => {
     if ("load" in arg &&
@@ -151,6 +155,7 @@ const _isBaseDocumentLoader = (arg) => {
     }
     return false;
 };
+exports._isBaseDocumentLoader = _isBaseDocumentLoader;
 /**
  * Index data from the doc source into the vector store.
  *
@@ -176,18 +181,18 @@ async function index(args) {
     if (cleanup === "incremental" && !sourceIdKey) {
         throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.");
     }
-    const docs = _isBaseDocumentLoader(docsSource)
+    const docs = (0, exports._isBaseDocumentLoader)(docsSource)
         ? await docsSource.load()
        : docsSource;
-    const sourceIdAssigner =
+    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
     const indexStartDt = await recordManager.getTime();
     let numAdded = 0;
     let numDeleted = 0;
     let numUpdated = 0;
     let numSkipped = 0;
-    const batches =
+    const batches = _batch(batchSize ?? 100, docs);
     for (const batch of batches) {
-        const hashedDocs =
+        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
         const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
         if (cleanup === "incremental") {
             hashedDocs.forEach((_hashedDoc, index) => {
package/dist/indexes/indexing.d.ts
CHANGED
@@ -2,6 +2,7 @@ import { VectorStore } from "@langchain/core/vectorstores";
 import { RecordManagerInterface } from "@langchain/community/indexes/base";
 import { DocumentInterface } from "@langchain/core/documents";
 import { BaseDocumentLoader } from "../document_loaders/base.js";
+type Metadata = Record<string, unknown>;
 type IndexingResult = {
     numAdded: number;
     numDeleted: number;
@@ -9,6 +10,40 @@ type IndexingResult = {
     numSkipped: number;
 };
 type StringOrDocFunc = string | ((doc: DocumentInterface) => string);
+export interface HashedDocumentInterface extends DocumentInterface {
+    uid: string;
+    hash_?: string;
+    contentHash?: string;
+    metadataHash?: string;
+    pageContent: string;
+    metadata: Metadata;
+    calculateHashes(): void;
+    toDocument(): DocumentInterface;
+}
+interface HashedDocumentArgs {
+    pageContent: string;
+    metadata: Metadata;
+    uid: string;
+}
+/**
+ * HashedDocument is a Document with hashes calculated.
+ * Hashes are calculated based on page content and metadata.
+ * It is used for indexing.
+ */
+export declare class _HashedDocument implements HashedDocumentInterface {
+    uid: string;
+    hash_?: string;
+    contentHash?: string;
+    metadataHash?: string;
+    pageContent: string;
+    metadata: Metadata;
+    constructor(fields: HashedDocumentArgs);
+    calculateHashes(): void;
+    toDocument(): DocumentInterface;
+    static fromDocument(document: DocumentInterface, uid?: string): _HashedDocument;
+    private _hashStringToUUID;
+    private _hashNestedDictToUUID;
+}
 export type CleanupMode = "full" | "incremental";
 export type IndexOptions = {
     /**
@@ -46,6 +81,10 @@ export type IndexOptions = {
      */
     forceUpdate?: boolean;
 };
+export declare function _batch<T>(size: number, iterable: T[]): T[][];
+export declare function _deduplicateInOrder(hashedDocuments: HashedDocumentInterface[]): HashedDocumentInterface[];
+export declare function _getSourceIdAssigner(sourceIdKey: StringOrDocFunc | null): (doc: DocumentInterface) => string | null;
+export declare const _isBaseDocumentLoader: (arg: any) => arg is BaseDocumentLoader;
 interface IndexArgs {
     docsSource: BaseDocumentLoader | DocumentInterface[];
     recordManager: RecordManagerInterface;
package/dist/indexes/indexing.js
CHANGED
@@ -7,7 +7,7 @@ import { Document } from "../document.js";
  * Hashes are calculated based on page content and metadata.
  * It is used for indexing.
  */
-class
+export class _HashedDocument {
     constructor(fields) {
         Object.defineProperty(this, "uid", {
             enumerable: true,
@@ -95,7 +95,7 @@ class HashedDocument {
         return uuidv5(hash_value, UUIDV5_NAMESPACE);
     }
 }
-function
+export function _batch(size, iterable) {
     const batches = [];
     let currentBatch = [];
     iterable.forEach((item) => {
@@ -110,7 +110,7 @@ function batch(size, iterable) {
     }
     return batches;
 }
-function
+export function _deduplicateInOrder(hashedDocuments) {
     const seen = new Set();
     const deduplicated = [];
     for (const hashedDoc of hashedDocuments) {
@@ -124,7 +124,7 @@ function deduplicateInOrder(hashedDocuments) {
     }
     return deduplicated;
 }
-function
+export function _getSourceIdAssigner(sourceIdKey) {
     if (sourceIdKey === null) {
         return (_doc) => null;
     }
@@ -139,7 +139,7 @@ function getSourceIdAssigner(sourceIdKey) {
     }
 }
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
-const _isBaseDocumentLoader = (arg) => {
+export const _isBaseDocumentLoader = (arg) => {
     if ("load" in arg &&
         typeof arg.load === "function" &&
         "loadAndSplit" in arg &&
@@ -176,15 +176,15 @@ export async function index(args) {
     const docs = _isBaseDocumentLoader(docsSource)
         ? await docsSource.load()
         : docsSource;
-    const sourceIdAssigner =
+    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
     const indexStartDt = await recordManager.getTime();
     let numAdded = 0;
     let numDeleted = 0;
     let numUpdated = 0;
     let numSkipped = 0;
-    const batches =
+    const batches = _batch(batchSize ?? 100, docs);
     for (const batch of batches) {
-        const hashedDocs =
+        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
         const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
         if (cleanup === "incremental") {
             hashedDocs.forEach((_hashedDoc, index) => {
package/dist/load/import_constants.cjs
CHANGED
@@ -99,6 +99,7 @@ exports.optionalImportEntrypoints = [
     "langchain/document_loaders/web/pdf",
     "langchain/document_loaders/web/recursive_url",
     "langchain/document_loaders/web/s3",
+    "langchain/document_loaders/web/sitemap",
     "langchain/document_loaders/web/sonix_audio",
     "langchain/document_loaders/web/confluence",
     "langchain/document_loaders/web/youtube",
package/dist/load/import_constants.js
CHANGED
@@ -96,6 +96,7 @@ export const optionalImportEntrypoints = [
     "langchain/document_loaders/web/pdf",
     "langchain/document_loaders/web/recursive_url",
     "langchain/document_loaders/web/s3",
+    "langchain/document_loaders/web/sitemap",
     "langchain/document_loaders/web/sonix_audio",
     "langchain/document_loaders/web/confluence",
     "langchain/document_loaders/web/youtube",
package/dist/text_splitter.cjs
CHANGED
@@ -68,7 +68,10 @@ class TextSplitter extends documents_1.BaseDocumentTransformer {
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     metadatas = [], chunkHeaderOptions = {}) {
         // if no metadata is provided, we create an empty one for each text
-
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const _metadatas = metadatas.length > 0
+            ? metadatas
+            : [...Array(texts.length)].map(() => ({}));
         const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false, } = chunkHeaderOptions;
         const documents = new Array();
         for (let i = 0; i < texts.length; i += 1) {
package/dist/text_splitter.js
CHANGED
@@ -65,7 +65,10 @@ export class TextSplitter extends BaseDocumentTransformer {
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     metadatas = [], chunkHeaderOptions = {}) {
         // if no metadata is provided, we create an empty one for each text
-
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const _metadatas = metadatas.length > 0
+            ? metadatas
+            : [...Array(texts.length)].map(() => ({}));
         const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false, } = chunkHeaderOptions;
         const documents = new Array();
         for (let i = 0; i < texts.length; i += 1) {
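The two hunks above make the default-metadata behavior explicit: when no metadatas array is passed, createDocuments builds one empty metadata object per input text. A quick sketch, assuming the stock RecursiveCharacterTextSplitter:

import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: 100,
  chunkOverlap: 0,
});

// No metadatas argument: each text gets an empty metadata object by default.
const docs = await splitter.createDocuments(["first text", "second text"]);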
package/document_loaders/web/sitemap.cjs
ADDED
@@ -0,0 +1 @@
+module.exports = require('../../dist/document_loaders/web/sitemap.cjs');
package/document_loaders/web/sitemap.d.cts
ADDED
@@ -0,0 +1 @@
+export * from '../../dist/document_loaders/web/sitemap.js'
package/document_loaders/web/sitemap.d.ts
ADDED
@@ -0,0 +1 @@
+export * from '../../dist/document_loaders/web/sitemap.js'
package/document_loaders/web/sitemap.js
ADDED
@@ -0,0 +1 @@
+export * from '../../dist/document_loaders/web/sitemap.js'
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "langchain",
-  "version": "0.1.
+  "version": "0.1.18",
   "description": "Typescript bindings for langchain",
   "type": "module",
   "engines": {
@@ -602,6 +602,10 @@
     "document_loaders/web/s3.js",
     "document_loaders/web/s3.d.ts",
     "document_loaders/web/s3.d.cts",
+    "document_loaders/web/sitemap.cjs",
+    "document_loaders/web/sitemap.js",
+    "document_loaders/web/sitemap.d.ts",
+    "document_loaders/web/sitemap.d.cts",
     "document_loaders/web/sonix_audio.cjs",
     "document_loaders/web/sonix_audio.js",
     "document_loaders/web/sonix_audio.d.ts",
@@ -1500,15 +1504,15 @@
   },
   "dependencies": {
     "@anthropic-ai/sdk": "^0.9.1",
-    "@langchain/community": "~0.0.
-    "@langchain/core": "~0.1.
-    "@langchain/openai": "~0.0.
+    "@langchain/community": "~0.0.28",
+    "@langchain/core": "~0.1.28",
+    "@langchain/openai": "~0.0.14",
     "binary-extensions": "^2.2.0",
     "expr-eval": "^2.0.2",
     "js-tiktoken": "^1.0.7",
     "js-yaml": "^4.1.0",
     "jsonpointer": "^5.0.1",
-    "langchainhub": "~0.0.
+    "langchainhub": "~0.0.8",
     "langsmith": "~0.0.59",
     "ml-distance": "^4.0.0",
     "openapi-types": "^12.1.3",
@@ -2868,6 +2872,15 @@
       "import": "./document_loaders/web/s3.js",
       "require": "./document_loaders/web/s3.cjs"
     },
+    "./document_loaders/web/sitemap": {
+      "types": {
+        "import": "./document_loaders/web/sitemap.d.ts",
+        "require": "./document_loaders/web/sitemap.d.cts",
+        "default": "./document_loaders/web/sitemap.d.ts"
+      },
+      "import": "./document_loaders/web/sitemap.js",
+      "require": "./document_loaders/web/sitemap.cjs"
+    },
    "./document_loaders/web/sonix_audio": {
      "types": {
        "import": "./document_loaders/web/sonix_audio.d.ts",
|