npm - langchain - Versions diffs - 0.0.176 → 0.0.178 - Mend

langchain 0.0.176 → 0.0.178

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84)

package/chat_models/iflytek_xinghuo/web.cjs +1 -0
package/chat_models/iflytek_xinghuo/web.d.ts +1 -0
package/chat_models/iflytek_xinghuo/web.js +1 -0
package/chat_models/iflytek_xinghuo.cjs +1 -0
package/chat_models/iflytek_xinghuo.d.ts +1 -0
package/chat_models/iflytek_xinghuo.js +1 -0
package/dist/chat_models/bedrock.cjs +25 -4
package/dist/chat_models/bedrock.d.ts +2 -1
package/dist/chat_models/bedrock.js +25 -4
package/dist/chat_models/cloudflare_workersai.cjs +70 -24
package/dist/chat_models/cloudflare_workersai.d.ts +6 -2
package/dist/chat_models/cloudflare_workersai.js +71 -25
package/dist/chat_models/iflytek_xinghuo/common.cjs +335 -0
package/dist/chat_models/iflytek_xinghuo/common.d.ts +165 -0
package/dist/chat_models/iflytek_xinghuo/common.js +331 -0
package/dist/chat_models/iflytek_xinghuo/index.cjs +35 -0
package/dist/chat_models/iflytek_xinghuo/index.d.ts +5 -0
package/dist/chat_models/iflytek_xinghuo/index.js +28 -0
package/dist/chat_models/iflytek_xinghuo/web.cjs +30 -0
package/dist/chat_models/iflytek_xinghuo/web.d.ts +5 -0
package/dist/chat_models/iflytek_xinghuo/web.js +26 -0
package/dist/chat_models/llama_cpp.cjs +31 -79
package/dist/chat_models/llama_cpp.d.ts +15 -58
package/dist/chat_models/llama_cpp.js +32 -80
package/dist/chat_models/openai.cjs +91 -6
package/dist/chat_models/openai.d.ts +10 -0
package/dist/chat_models/openai.js +91 -6
package/dist/embeddings/hf.cjs +10 -1
package/dist/embeddings/hf.d.ts +4 -2
package/dist/embeddings/hf.js +10 -1
package/dist/embeddings/llama_cpp.cjs +67 -0
package/dist/embeddings/llama_cpp.d.ts +26 -0
package/dist/embeddings/llama_cpp.js +63 -0
package/dist/embeddings/ollama.cjs +7 -1
package/dist/embeddings/ollama.js +7 -1
package/dist/graphs/neo4j_graph.cjs +36 -5
package/dist/graphs/neo4j_graph.js +14 -3
package/dist/llms/bedrock.cjs +25 -3
package/dist/llms/bedrock.d.ts +2 -1
package/dist/llms/bedrock.js +25 -3
package/dist/llms/cloudflare_workersai.cjs +59 -13
package/dist/llms/cloudflare_workersai.d.ts +9 -3
package/dist/llms/cloudflare_workersai.js +59 -13
package/dist/llms/hf.cjs +10 -1
package/dist/llms/hf.d.ts +3 -0
package/dist/llms/hf.js +10 -1
package/dist/llms/llama_cpp.cjs +25 -65
package/dist/llms/llama_cpp.d.ts +7 -43
package/dist/llms/llama_cpp.js +25 -65
package/dist/load/import_constants.cjs +3 -0
package/dist/load/import_constants.js +3 -0
package/dist/prompts/chat.cjs +8 -0
package/dist/prompts/chat.d.ts +5 -0
package/dist/prompts/chat.js +8 -0
package/dist/prompts/few_shot.cjs +162 -1
package/dist/prompts/few_shot.d.ts +90 -2
package/dist/prompts/few_shot.js +160 -0
package/dist/prompts/index.cjs +2 -1
package/dist/prompts/index.d.ts +1 -1
package/dist/prompts/index.js +1 -1
package/dist/retrievers/zep.cjs +26 -3
package/dist/retrievers/zep.d.ts +11 -2
package/dist/retrievers/zep.js +26 -3
package/dist/util/bedrock.d.ts +2 -0
package/dist/util/event-source-parse.cjs +20 -1
package/dist/util/event-source-parse.d.ts +2 -0
package/dist/util/event-source-parse.js +18 -0
package/dist/util/iflytek_websocket_stream.cjs +81 -0
package/dist/util/iflytek_websocket_stream.d.ts +27 -0
package/dist/util/iflytek_websocket_stream.js +77 -0
package/dist/util/llama_cpp.cjs +34 -0
package/dist/util/llama_cpp.d.ts +46 -0
package/dist/util/llama_cpp.js +28 -0
package/dist/util/openai-format-fndef.cjs +81 -0
package/dist/util/openai-format-fndef.d.ts +44 -0
package/dist/util/openai-format-fndef.js +77 -0
package/dist/util/openapi.d.ts +2 -2
package/dist/vectorstores/pinecone.cjs +5 -5
package/dist/vectorstores/pinecone.d.ts +2 -2
package/dist/vectorstores/pinecone.js +5 -5
package/embeddings/llama_cpp.cjs +1 -0
package/embeddings/llama_cpp.d.ts +1 -0
package/embeddings/llama_cpp.js +1 -0
package/package.json +34 -5

package/dist/chat_models/llama_cpp.d.ts CHANGED Viewed

@@ -1,42 +1,13 @@
 import { LlamaModel, LlamaContext, LlamaChatSession, type ConversationInteraction } from "node-llama-cpp";
 import { SimpleChatModel, BaseChatModelParams } from "./base.js";
+import { LlamaBaseCppInputs } from "../util/llama_cpp.js";
 import { BaseLanguageModelCallOptions } from "../base_language/index.js";
 import type { BaseMessage } from "../schema/index.js";
 /**
  * Note that the modelPath is the only required parameter. For testing you
  * can set this in the environment variable `LLAMA_PATH`.
  */
-export interface LlamaCppInputs extends BaseChatModelParams {
-    /** Prompt processing batch size. */
-    batchSize?: number;
-    /** Text context size. */
-    contextSize?: number;
-    /** Embedding mode only. */
-    embedding?: boolean;
-    /** Use fp16 for KV cache. */
-    f16Kv?: boolean;
-    /** Number of layers to store in VRAM. */
-    gpuLayers?: number;
-    /** The llama_eval() call computes all logits, not just the last one. */
-    logitsAll?: boolean;
-    /** If true, reduce VRAM usage at the cost of performance. */
-    lowVram?: boolean;
-    /** Path to the model on the filesystem. */
-    modelPath: string;
-    /** If null, a random seed will be used. */
-    seed?: null | number;
-    /** The randomness of the responses, e.g. 0.1 deterministic, 1.5 creative, 0.8 balanced, 0 disables. */
-    temperature?: number;
-    /** Consider the n most likely tokens, where n is 1 to vocabulary size, 0 disables (uses full vocabulary). Note: only applies when `temperature` > 0. */
-    topK?: number;
-    /** Selects the smallest token set whose probability exceeds P, where P is between 0 - 1, 1 disables. Note: only applies when `temperature` > 0. */
-    topP?: number;
-    /** Force system to keep model in RAM. */
-    useMlock?: boolean;
-    /** Use mmap if possible. */
-    useMmap?: boolean;
-    /** Only load the vocabulary, no weights. */
-    vocabOnly?: boolean;
+export interface LlamaCppInputs extends LlamaBaseCppInputs, BaseChatModelParams {
 }
 export interface LlamaCppCallOptions extends BaseLanguageModelCallOptions {
     /** The maximum number of tokens the response should contain. */
@@ -53,42 +24,28 @@ export interface LlamaCppCallOptions extends BaseLanguageModelCallOptions {
 export declare class ChatLlamaCpp extends SimpleChatModel<LlamaCppCallOptions> {
     CallOptions: LlamaCppCallOptions;
     static inputs: LlamaCppInputs;
-    batchSize?: number;
-    contextSize?: number;
-    embedding?: boolean;
-    f16Kv?: boolean;
-    gpuLayers?: number;
-    logitsAll?: boolean;
-    lowVram?: boolean;
-    seed?: null | number;
-    useMlock?: boolean;
-    useMmap?: boolean;
-    vocabOnly?: boolean;
-    modelPath: string;
+    maxTokens?: number;
+    temperature?: number;
+    topK?: number;
+    topP?: number;
+    trimWhitespaceSuffix?: boolean;
     _model: LlamaModel;
     _context: LlamaContext;
     _session: LlamaChatSession | null;
     static lc_name(): string;
     constructor(inputs: LlamaCppInputs);
     _llmType(): string;
-    invocationParams(): {
-        batchSize: number | undefined;
-        contextSize: number | undefined;
-        embedding: boolean | undefined;
-        f16Kv: boolean | undefined;
-        gpuLayers: number | undefined;
-        logitsAll: boolean | undefined;
-        lowVram: boolean | undefined;
-        modelPath: string;
-        seed: number | null | undefined;
-        useMlock: boolean | undefined;
-        useMmap: boolean | undefined;
-        vocabOnly: boolean | undefined;
-    };
     /** @ignore */
     _combineLLMOutput(): {};
+    invocationParams(): {
+        maxTokens: number | undefined;
+        temperature: number | undefined;
+        topK: number | undefined;
+        topP: number | undefined;
+        trimWhitespaceSuffix: boolean | undefined;
+    };
     /** @ignore */
-    _call(messages: BaseMessage[], options: this["ParsedCallOptions"]): Promise<string>;
+    _call(messages: BaseMessage[], _options: this["ParsedCallOptions"]): Promise<string>;
     protected _buildSession(messages: BaseMessage[]): string;
     protected _convertMessagesToInteractions(messages: BaseMessage[]): ConversationInteraction[];
 }

package/dist/chat_models/llama_cpp.js CHANGED Viewed

@@ -1,5 +1,6 @@
-import { LlamaModel, LlamaContext, LlamaChatSession, } from "node-llama-cpp";
+import { LlamaChatSession, } from "node-llama-cpp";
 import { SimpleChatModel } from "./base.js";
+import { createLlamaModel, createLlamaContext, } from "../util/llama_cpp.js";
 /**
  *  To use this model you need to have the `node-llama-cpp` module installed.
  *  This can be installed using `npm install -S node-llama-cpp` and the minimum
@@ -12,73 +13,31 @@ export class ChatLlamaCpp extends SimpleChatModel {
     }
     constructor(inputs) {
         super(inputs);
-        Object.defineProperty(this, "batchSize", {
+        Object.defineProperty(this, "maxTokens", {
             enumerable: true,
             configurable: true,
             writable: true,
             value: void 0
         });
-        Object.defineProperty(this, "contextSize", {
+        Object.defineProperty(this, "temperature", {
             enumerable: true,
             configurable: true,
             writable: true,
             value: void 0
         });
-        Object.defineProperty(this, "embedding", {
+        Object.defineProperty(this, "topK", {
             enumerable: true,
             configurable: true,
             writable: true,
             value: void 0
         });
-        Object.defineProperty(this, "f16Kv", {
+        Object.defineProperty(this, "topP", {
             enumerable: true,
             configurable: true,
             writable: true,
             value: void 0
         });
-        Object.defineProperty(this, "gpuLayers", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "logitsAll", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "lowVram", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "seed", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "useMlock", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "useMmap", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "vocabOnly", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "modelPath", {
+        Object.defineProperty(this, "trimWhitespaceSuffix", {
             enumerable: true,
             configurable: true,
             writable: true,
@@ -102,47 +61,33 @@ export class ChatLlamaCpp extends SimpleChatModel {
             writable: true,
             value: void 0
         });
-        this.batchSize = inputs?.batchSize;
-        this.contextSize = inputs?.contextSize;
-        this.embedding = inputs?.embedding;
-        this.f16Kv = inputs?.f16Kv;
-        this.gpuLayers = inputs?.gpuLayers;
-        this.logitsAll = inputs?.logitsAll;
-        this.lowVram = inputs?.lowVram;
-        this.modelPath = inputs.modelPath;
-        this.seed = inputs?.seed;
-        this.useMlock = inputs?.useMlock;
-        this.useMmap = inputs?.useMmap;
-        this.vocabOnly = inputs?.vocabOnly;
-        this._model = new LlamaModel(inputs);
-        this._context = new LlamaContext({ model: this._model });
+        this.maxTokens = inputs?.maxTokens;
+        this.temperature = inputs?.temperature;
+        this.topK = inputs?.topK;
+        this.topP = inputs?.topP;
+        this.trimWhitespaceSuffix = inputs?.trimWhitespaceSuffix;
+        this._model = createLlamaModel(inputs);
+        this._context = createLlamaContext(this._model, inputs);
         this._session = null;
     }
     _llmType() {
         return "llama2_cpp";
     }
-    invocationParams() {
-        return {
-            batchSize: this.batchSize,
-            contextSize: this.contextSize,
-            embedding: this.embedding,
-            f16Kv: this.f16Kv,
-            gpuLayers: this.gpuLayers,
-            logitsAll: this.logitsAll,
-            lowVram: this.lowVram,
-            modelPath: this.modelPath,
-            seed: this.seed,
-            useMlock: this.useMlock,
-            useMmap: this.useMmap,
-            vocabOnly: this.vocabOnly,
-        };
-    }
     /** @ignore */
     _combineLLMOutput() {
         return {};
     }
+    invocationParams() {
+        return {
+            maxTokens: this.maxTokens,
+            temperature: this.temperature,
+            topK: this.topK,
+            topP: this.topP,
+            trimWhitespaceSuffix: this.trimWhitespaceSuffix,
+        };
+    }
     /** @ignore */
-    async _call(messages, options) {
+    async _call(messages, _options) {
         let prompt = "";
         if (messages.length > 1) {
             // We need to build a new _session
@@ -156,8 +101,15 @@ export class ChatLlamaCpp extends SimpleChatModel {
             prompt = messages[0].content;
         }
         try {
+            const promptOptions = {
+                maxTokens: this?.maxTokens,
+                temperature: this?.temperature,
+                topK: this?.topK,
+                topP: this?.topP,
+                trimWhitespaceSuffix: this?.trimWhitespaceSuffix,
+            };
             // @ts-expect-error - TS2531: Object is possibly 'null'.
-            const completion = await this._session.prompt(prompt, options);
+            const completion = await this._session.prompt(prompt, promptOptions);
             return completion;
         }
         catch (e) {

package/dist/chat_models/openai.cjs CHANGED Viewed

@@ -2,7 +2,6 @@
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.PromptLayerChatOpenAI = exports.ChatOpenAI = void 0;
 const openai_1 = require("openai");
-const count_tokens_js_1 = require("../base_language/count_tokens.cjs");
 const index_js_1 = require("../schema/index.cjs");
 const convert_to_openai_js_1 = require("../tools/convert_to_openai.cjs");
 const azure_js_1 = require("../util/azure.cjs");
@@ -10,6 +9,7 @@ const env_js_1 = require("../util/env.cjs");
 const prompt_layer_js_1 = require("../util/prompt-layer.cjs");
 const base_js_1 = require("./base.cjs");
 const openai_js_1 = require("../util/openai.cjs");
+const openai_format_fndef_js_1 = require("../util/openai-format-fndef.cjs");
 function extractGenericMessageCustomRole(message) {
     if (message.role !== "system" &&
         message.role !== "assistant" &&
@@ -39,6 +39,19 @@ function messageToOpenAIRole(message) {
             throw new Error(`Unknown message type: ${type}`);
     }
 }
+function messageToOpenAIMessage(message) {
+    const msg = {
+        content: message.content || null,
+        name: message.name,
+        role: messageToOpenAIRole(message),
+        function_call: message.additional_kwargs.function_call,
+    };
+    if (msg.function_call?.arguments) {
+        // Remove spaces, new line characters etc.
+        msg.function_call.arguments = JSON.stringify(JSON.parse(msg.function_call.arguments));
+    }
+    return msg;
+}
 function openAIResponseToChatMessage(message) {
     switch (message.role) {
         case "user":
@@ -414,6 +427,7 @@ class ChatOpenAI extends base_js_1.BaseChatModel {
     }
     /**
      * Get the identifying parameters for the model
+     *
      */
     identifyingParams() {
         return this._identifyingParams();
@@ -430,7 +444,7 @@ class ChatOpenAI extends base_js_1.BaseChatModel {
                 .function_call,
         }));
         if (params.stream) {
-            const stream = await this._streamResponseChunks(messages, options, runManager);
+            const stream = this._streamResponseChunks(messages, options, runManager);
             const finalChunks = {};
             for await (const chunk of stream) {
                 const index = chunk.generationInfo?.completion ?? 0;
@@ -444,7 +458,15 @@ class ChatOpenAI extends base_js_1.BaseChatModel {
             const generations = Object.entries(finalChunks)
                 .sort(([aKey], [bKey]) => parseInt(aKey, 10) - parseInt(bKey, 10))
                 .map(([_, value]) => value);
-            return { generations };
+            const { functions, function_call } = this.invocationParams(options);
+            // OpenAI does not support token usage report under stream mode,
+            // fallback to estimation.
+            const promptTokenUsage = await this.getNumTokensFromPrompt(messages, functions, function_call);
+            const completionTokenUsage = await this.getNumTokensFromGenerations(generations);
+            tokenUsage.promptTokens = promptTokenUsage;
+            tokenUsage.completionTokens = completionTokenUsage;
+            tokenUsage.totalTokens = promptTokenUsage + completionTokenUsage;
+            return { generations, llmOutput: { estimatedTokenUsage: tokenUsage } };
         }
         else {
             const data = await this.completionWithRetry({
@@ -484,16 +506,65 @@ class ChatOpenAI extends base_js_1.BaseChatModel {
             };
         }
     }
+    /**
+     * Estimate the number of tokens a prompt will use.
+     * Modified from: https://github.com/hmarr/openai-chat-tokens/blob/main/src/index.ts
+     */
+    async getNumTokensFromPrompt(messages, functions, function_call) {
+        // It appears that if functions are present, the first system message is padded with a trailing newline. This
+        // was inferred by trying lots of combinations of messages and functions and seeing what the token counts were.
+        // let paddedSystem = false;
+        const openaiMessages = messages.map((m) => messageToOpenAIMessage(m));
+        let tokens = (await this.getNumTokensFromMessages(messages)).totalCount;
+        // If there are functions, add the function definitions as they count towards token usage
+        if (functions && function_call !== "auto") {
+            const promptDefinitions = (0, openai_format_fndef_js_1.formatFunctionDefinitions)(functions);
+            tokens += await this.getNumTokens(promptDefinitions);
+            tokens += 9; // Add nine per completion
+        }
+        // If there's a system message _and_ functions are present, subtract four tokens. I assume this is because
+        // functions typically add a system message, but reuse the first one if it's already there. This offsets
+        // the extra 9 tokens added by the function definitions.
+        if (functions && openaiMessages.find((m) => m.role === "system")) {
+            tokens -= 4;
+        }
+        // If function_call is 'none', add one token.
+        // If it's a FunctionCall object, add 4 + the number of tokens in the function name.
+        // If it's undefined or 'auto', don't add anything.
+        if (function_call === "none") {
+            tokens += 1;
+        }
+        else if (typeof function_call === "object") {
+            tokens += (await this.getNumTokens(function_call.name)) + 4;
+        }
+        return tokens;
+    }
+    /**
+     * Estimate the number of tokens an array of generations have used.
+     */
+    async getNumTokensFromGenerations(generations) {
+        const generationUsages = await Promise.all(generations.map(async (generation) => {
+            const openAIMessage = messageToOpenAIMessage(generation.message);
+            if (openAIMessage.function_call) {
+                return (await this.getNumTokensFromMessages([generation.message]))
+                    .countPerMessage[0];
+            }
+            else {
+                return await this.getNumTokens(generation.message.content);
+            }
+        }));
+        return generationUsages.reduce((a, b) => a + b, 0);
+    }
     async getNumTokensFromMessages(messages) {
         let totalCount = 0;
         let tokensPerMessage = 0;
         let tokensPerName = 0;
         // From: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
-        if ((0, count_tokens_js_1.getModelNameForTiktoken)(this.modelName) === "gpt-3.5-turbo") {
+        if (this.modelName === "gpt-3.5-turbo-0301") {
             tokensPerMessage = 4;
             tokensPerName = -1;
         }
-        else if ((0, count_tokens_js_1.getModelNameForTiktoken)(this.modelName).startsWith("gpt-4")) {
+        else {
             tokensPerMessage = 3;
             tokensPerName = 1;
         }
@@ -503,7 +574,21 @@ class ChatOpenAI extends base_js_1.BaseChatModel {
             const nameCount = message.name !== undefined
                 ? tokensPerName + (await this.getNumTokens(message.name))
                 : 0;
-            const count = textCount + tokensPerMessage + roleCount + nameCount;
+            let count = textCount + tokensPerMessage + roleCount + nameCount;
+            // From: https://github.com/hmarr/openai-chat-tokens/blob/main/src/index.ts messageTokenEstimate
+            const openAIMessage = messageToOpenAIMessage(message);
+            if (openAIMessage.role === "function") {
+                count -= 2;
+            }
+            if (openAIMessage.function_call) {
+                count += 3;
+            }
+            if (openAIMessage.function_call?.name) {
+                count += await this.getNumTokens(openAIMessage.function_call?.name);
+            }
+            if (openAIMessage.function_call?.arguments) {
+                count += await this.getNumTokens(openAIMessage.function_call?.arguments);
+            }
             totalCount += count;
             return count;
         }));

package/dist/chat_models/openai.d.ts CHANGED Viewed

@@ -83,12 +83,22 @@ export declare class ChatOpenAI<CallOptions extends ChatOpenAICallOptions = Chat
     _streamResponseChunks(messages: BaseMessage[], options: this["ParsedCallOptions"], runManager?: CallbackManagerForLLMRun): AsyncGenerator<ChatGenerationChunk>;
     /**
      * Get the identifying parameters for the model
+     *
      */
     identifyingParams(): Omit<OpenAIClient.Chat.Completions.ChatCompletionCreateParams, "messages"> & {
         model_name: string;
     } & ClientOptions;
     /** @ignore */
     _generate(messages: BaseMessage[], options: this["ParsedCallOptions"], runManager?: CallbackManagerForLLMRun): Promise<ChatResult>;
+    /**
+     * Estimate the number of tokens a prompt will use.
+     * Modified from: https://github.com/hmarr/openai-chat-tokens/blob/main/src/index.ts
+     */
+    private getNumTokensFromPrompt;
+    /**
+     * Estimate the number of tokens an array of generations have used.
+     */
+    private getNumTokensFromGenerations;
     getNumTokensFromMessages(messages: BaseMessage[]): Promise<{
         totalCount: number;
         countPerMessage: number[];

package/dist/chat_models/openai.js CHANGED Viewed

@@ -1,5 +1,4 @@
 import { OpenAI as OpenAIClient } from "openai";
-import { getModelNameForTiktoken } from "../base_language/count_tokens.js";
 import { AIMessage, AIMessageChunk, ChatGenerationChunk, ChatMessage, ChatMessageChunk, FunctionMessageChunk, HumanMessage, HumanMessageChunk, SystemMessage, SystemMessageChunk, } from "../schema/index.js";
 import { formatToOpenAIFunction } from "../tools/convert_to_openai.js";
 import { getEndpoint } from "../util/azure.js";
@@ -7,6 +6,7 @@ import { getEnvironmentVariable } from "../util/env.js";
 import { promptLayerTrackRequest } from "../util/prompt-layer.js";
 import { BaseChatModel } from "./base.js";
 import { wrapOpenAIClientError } from "../util/openai.js";
+import { formatFunctionDefinitions, } from "../util/openai-format-fndef.js";
 function extractGenericMessageCustomRole(message) {
     if (message.role !== "system" &&
         message.role !== "assistant" &&
@@ -36,6 +36,19 @@ function messageToOpenAIRole(message) {
             throw new Error(`Unknown message type: ${type}`);
     }
 }
+function messageToOpenAIMessage(message) {
+    const msg = {
+        content: message.content || null,
+        name: message.name,
+        role: messageToOpenAIRole(message),
+        function_call: message.additional_kwargs.function_call,
+    };
+    if (msg.function_call?.arguments) {
+        // Remove spaces, new line characters etc.
+        msg.function_call.arguments = JSON.stringify(JSON.parse(msg.function_call.arguments));
+    }
+    return msg;
+}
 function openAIResponseToChatMessage(message) {
     switch (message.role) {
         case "user":
@@ -411,6 +424,7 @@ export class ChatOpenAI extends BaseChatModel {
     }
     /**
      * Get the identifying parameters for the model
+     *
      */
     identifyingParams() {
         return this._identifyingParams();
@@ -427,7 +441,7 @@ export class ChatOpenAI extends BaseChatModel {
                 .function_call,
         }));
         if (params.stream) {
-            const stream = await this._streamResponseChunks(messages, options, runManager);
+            const stream = this._streamResponseChunks(messages, options, runManager);
             const finalChunks = {};
             for await (const chunk of stream) {
                 const index = chunk.generationInfo?.completion ?? 0;
@@ -441,7 +455,15 @@ export class ChatOpenAI extends BaseChatModel {
             const generations = Object.entries(finalChunks)
                 .sort(([aKey], [bKey]) => parseInt(aKey, 10) - parseInt(bKey, 10))
                 .map(([_, value]) => value);
-            return { generations };
+            const { functions, function_call } = this.invocationParams(options);
+            // OpenAI does not support token usage report under stream mode,
+            // fallback to estimation.
+            const promptTokenUsage = await this.getNumTokensFromPrompt(messages, functions, function_call);
+            const completionTokenUsage = await this.getNumTokensFromGenerations(generations);
+            tokenUsage.promptTokens = promptTokenUsage;
+            tokenUsage.completionTokens = completionTokenUsage;
+            tokenUsage.totalTokens = promptTokenUsage + completionTokenUsage;
+            return { generations, llmOutput: { estimatedTokenUsage: tokenUsage } };
         }
         else {
             const data = await this.completionWithRetry({
@@ -481,16 +503,65 @@ export class ChatOpenAI extends BaseChatModel {
             };
         }
     }
+    /**
+     * Estimate the number of tokens a prompt will use.
+     * Modified from: https://github.com/hmarr/openai-chat-tokens/blob/main/src/index.ts
+     */
+    async getNumTokensFromPrompt(messages, functions, function_call) {
+        // It appears that if functions are present, the first system message is padded with a trailing newline. This
+        // was inferred by trying lots of combinations of messages and functions and seeing what the token counts were.
+        // let paddedSystem = false;
+        const openaiMessages = messages.map((m) => messageToOpenAIMessage(m));
+        let tokens = (await this.getNumTokensFromMessages(messages)).totalCount;
+        // If there are functions, add the function definitions as they count towards token usage
+        if (functions && function_call !== "auto") {
+            const promptDefinitions = formatFunctionDefinitions(functions);
+            tokens += await this.getNumTokens(promptDefinitions);
+            tokens += 9; // Add nine per completion
+        }
+        // If there's a system message _and_ functions are present, subtract four tokens. I assume this is because
+        // functions typically add a system message, but reuse the first one if it's already there. This offsets
+        // the extra 9 tokens added by the function definitions.
+        if (functions && openaiMessages.find((m) => m.role === "system")) {
+            tokens -= 4;
+        }
+        // If function_call is 'none', add one token.
+        // If it's a FunctionCall object, add 4 + the number of tokens in the function name.
+        // If it's undefined or 'auto', don't add anything.
+        if (function_call === "none") {
+            tokens += 1;
+        }
+        else if (typeof function_call === "object") {
+            tokens += (await this.getNumTokens(function_call.name)) + 4;
+        }
+        return tokens;
+    }
+    /**
+     * Estimate the number of tokens an array of generations have used.
+     */
+    async getNumTokensFromGenerations(generations) {
+        const generationUsages = await Promise.all(generations.map(async (generation) => {
+            const openAIMessage = messageToOpenAIMessage(generation.message);
+            if (openAIMessage.function_call) {
+                return (await this.getNumTokensFromMessages([generation.message]))
+                    .countPerMessage[0];
+            }
+            else {
+                return await this.getNumTokens(generation.message.content);
+            }
+        }));
+        return generationUsages.reduce((a, b) => a + b, 0);
+    }
     async getNumTokensFromMessages(messages) {
         let totalCount = 0;
         let tokensPerMessage = 0;
         let tokensPerName = 0;
         // From: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
-        if (getModelNameForTiktoken(this.modelName) === "gpt-3.5-turbo") {
+        if (this.modelName === "gpt-3.5-turbo-0301") {
             tokensPerMessage = 4;
             tokensPerName = -1;
         }
-        else if (getModelNameForTiktoken(this.modelName).startsWith("gpt-4")) {
+        else {
             tokensPerMessage = 3;
             tokensPerName = 1;
         }
@@ -500,7 +571,21 @@ export class ChatOpenAI extends BaseChatModel {
             const nameCount = message.name !== undefined
                 ? tokensPerName + (await this.getNumTokens(message.name))
                 : 0;
-            const count = textCount + tokensPerMessage + roleCount + nameCount;
+            let count = textCount + tokensPerMessage + roleCount + nameCount;
+            // From: https://github.com/hmarr/openai-chat-tokens/blob/main/src/index.ts messageTokenEstimate
+            const openAIMessage = messageToOpenAIMessage(message);
+            if (openAIMessage.role === "function") {
+                count -= 2;
+            }
+            if (openAIMessage.function_call) {
+                count += 3;
+            }
+            if (openAIMessage.function_call?.name) {
+                count += await this.getNumTokens(openAIMessage.function_call?.name);
+            }
+            if (openAIMessage.function_call?.arguments) {
+                count += await this.getNumTokens(openAIMessage.function_call?.arguments);
+            }
             totalCount += count;
             return count;
         }));

package/dist/embeddings/hf.cjs CHANGED Viewed

@@ -24,6 +24,12 @@ class HuggingFaceInferenceEmbeddings extends base_js_1.Embeddings {
             writable: true,
             value: void 0
         });
+        Object.defineProperty(this, "endpointUrl", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
         Object.defineProperty(this, "client", {
             enumerable: true,
             configurable: true,
@@ -34,7 +40,10 @@ class HuggingFaceInferenceEmbeddings extends base_js_1.Embeddings {
             fields?.model ?? "sentence-transformers/distilbert-base-nli-mean-tokens";
         this.apiKey =
             fields?.apiKey ?? (0, env_js_1.getEnvironmentVariable)("HUGGINGFACEHUB_API_KEY");
-        this.client = new inference_1.HfInference(this.apiKey);
+        this.endpointUrl = fields?.endpointUrl;
+        this.client = this.endpointUrl
+            ? new inference_1.HfInference(this.apiKey).endpoint(this.endpointUrl)
+            : new inference_1.HfInference(this.apiKey);
     }
     async _embed(texts) {
         // replace newlines, which can negatively affect performance.

package/dist/embeddings/hf.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { HfInference } from "@huggingface/inference";
+import { HfInference, HfInferenceEndpoint } from "@huggingface/inference";
 import { Embeddings, EmbeddingsParams } from "./base.js";
 /**
  * Interface that extends EmbeddingsParams and defines additional
@@ -7,6 +7,7 @@ import { Embeddings, EmbeddingsParams } from "./base.js";
 export interface HuggingFaceInferenceEmbeddingsParams extends EmbeddingsParams {
     apiKey?: string;
     model?: string;
+    endpointUrl?: string;
 }
 /**
  * Class that extends the Embeddings class and provides methods for
@@ -16,7 +17,8 @@ export interface HuggingFaceInferenceEmbeddingsParams extends EmbeddingsParams {
 export declare class HuggingFaceInferenceEmbeddings extends Embeddings implements HuggingFaceInferenceEmbeddingsParams {
     apiKey?: string;
     model: string;
-    client: HfInference;
+    endpointUrl?: string;
+    client: HfInference | HfInferenceEndpoint;
     constructor(fields?: HuggingFaceInferenceEmbeddingsParams);
     _embed(texts: string[]): Promise<number[][]>;
     /**