@samanhappy/mcphub 0.12.7 → 0.12.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,243 @@
1
+ /**
2
+ * Token-aware text truncation utilities for embedding generation.
3
+ *
4
+ * Provides precise tokenization for known model families and a conservative
5
+ * heuristic fallback for unknown models.
6
+ *
7
+ * Model families and strategies:
8
+ * - OpenAI / Azure (text-embedding-*): BPE cl100k_base via gpt-tokenizer (exact)
9
+ * - BAAI/BGE and HuggingFace models: AutoTokenizer via @huggingface/transformers (exact)
10
+ * - Google Gemini (gemini-embedding-*): countTokens API via @google/genai (exact)
11
+ * - Unknown models: heuristic maxTokens * 3 chars (approximate)
12
+ */
13
/**
 * Per-model token limits.
 * Order matters: more specific entries must appear before generic ones.
 * bge-m3 is explicitly listed before the generic 'bge' catch-all because
 * its real limit (8192 tokens) differs substantially from the conservative
 * 512 used for other BGE variants.
 */
const MODEL_TOKEN_LIMITS = [
  ['text-embedding-3-small', 8191],
  ['text-embedding-3-large', 8191],
  ['text-embedding-ada-002', 8191],
  ['gemini-embedding-001', 2048],
  ['bge-m3', 8192],
];
/**
 * Returns the default maximum token limit for a given embedding model name.
 * Used when no explicit limit is configured via EMBEDDING_MAX_TOKENS or
 * smartRouting.embeddingMaxTokens.
 *
 * @param model Embedding model identifier (matched case-insensitively as a substring).
 * @returns The token limit for the first matching entry, or 512 as a conservative default.
 */
export function getModelDefaultTokenLimit(model) {
  const normalized = model.toLowerCase();
  // First matching entry wins; MODEL_TOKEN_LIMITS is ordered specific-first.
  const entry = MODEL_TOKEN_LIMITS.find(([pattern]) => normalized.includes(pattern));
  if (entry !== undefined) {
    return entry[1];
  }
  // Other BGE variants (bge-large-en, bge-small-zh, etc.) use a conservative limit.
  if (normalized.includes('bge')) {
    return 512;
  }
  // Default conservative limit: safe for entirely unknown models.
  // Users can raise it with EMBEDDING_MAX_TOKENS if they know their model supports more.
  return 512;
}
47
// ─────────────────────────────────────────────────────────────────────────────
// Model family detection helpers
// ─────────────────────────────────────────────────────────────────────────────
// Known OpenAI embedding model names (compared case-insensitively, exact match).
const OPENAI_MODELS = new Set([
  'text-embedding-3-small',
  'text-embedding-3-large',
  'text-embedding-ada-002',
]);
// True when the model is one of the known OpenAI embedding models.
function isOpenAIModel(model) {
  const normalized = model.toLowerCase();
  return OPENAI_MODELS.has(normalized);
}
// True only for the exact Gemini embedding model name (case-insensitive).
function isGeminiModel(model) {
  return model.toLowerCase() === 'gemini-embedding-001';
}
// True when the model name contains "bge-m3" (e.g. "bge-m3", "BAAI/bge-m3").
function isBgeM3Model(model) {
  return model.toLowerCase().includes('bge-m3');
}
60
// ─────────────────────────────────────────────────────────────────────────────
// Branch 1 — OpenAI / Azure: BPE cl100k_base via gpt-tokenizer
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Truncates text using OpenAI's BPE tokenizer (cl100k_base).
 *
 * Encodes the input text into tokens using the gpt-tokenizer library,
 * which implements the exact cl100k_base BPE vocabulary used by OpenAI's
 * embedding models (text-embedding-3-small, text-embedding-3-large,
 * text-embedding-ada-002). If the token count exceeds maxTokens, decodes
 * the truncated token sequence back to text.
 *
 * @param text The input text to truncate.
 * @param maxTokens The maximum number of tokens allowed.
 * @returns The original text if it fits, or a truncated prefix.
 */
async function truncateWithGptTokenizer(text, maxTokens) {
  // Lazy dynamic import keeps the tokenizer out of the startup path.
  const { encode, decode } = await import('gpt-tokenizer');
  const tokenIds = encode(text);
  return tokenIds.length > maxTokens ? decode(tokenIds.slice(0, maxTokens)) : text;
}
83
// ─────────────────────────────────────────────────────────────────────────────
// Branch 2 — HuggingFace / BGE: AutoTokenizer (no ONNX, pure JS tokenisation)
// ─────────────────────────────────────────────────────────────────────────────
// Tokenizer load promises are cached in memory to avoid repeated HF Hub downloads.
const tokenizerCache = new Map();
/**
 * Fetches or retrieves a cached HuggingFace tokenizer for a given model.
 *
 * The tokenizer is downloaded from HuggingFace Hub (public models like BAAI/bge-m3
 * do not require authentication). The in-flight load promise (not the resolved
 * tokenizer) is cached, so concurrent callers for the same model share a single
 * download instead of each triggering their own network request.
 *
 * @param modelId The fully-qualified HuggingFace Hub model ID (e.g., "BAAI/bge-m3").
 * @returns The cached or freshly-downloaded tokenizer instance.
 */
async function getHFTokenizer(modelId) {
  let pending = tokenizerCache.get(modelId);
  if (!pending) {
    pending = import('@huggingface/transformers').then(({ AutoTokenizer }) =>
      AutoTokenizer.from_pretrained(modelId),
    );
    // Evict failed loads so a transient network error can be retried later
    // instead of poisoning the cache with a rejected promise.
    pending.catch(() => tokenizerCache.delete(modelId));
    tokenizerCache.set(modelId, pending);
  }
  return pending;
}
106
/**
 * Resolves a shorthand model name to a fully-qualified HuggingFace Hub repo ID.
 * BAAI/bge-m3 is a public model — no HF_TOKEN is required to download its
 * tokenizer.json file.
 *
 * @param model Shorthand name (e.g. "bge-m3") or already-qualified repo ID.
 * @returns A HuggingFace Hub repo ID usable with AutoTokenizer.from_pretrained.
 */
function getHFModelId(model) {
  // Already fully qualified (e.g. "BAAI/bge-m3", "sentence-transformers/all-MiniLM-L6-v2")
  if (model.includes('/')) {
    return model;
  }
  const normalized = model.toLowerCase();
  if (normalized.includes('bge-m3')) {
    return 'BAAI/bge-m3';
  }
  // Other BGE variants live under the BAAI org; original casing is preserved.
  if (normalized.includes('bge')) {
    return `BAAI/${model}`;
  }
  return model;
}
125
/**
 * Truncates text using a HuggingFace AutoTokenizer (BAAI/BGE and other transformer models).
 *
 * Leverages the @huggingface/transformers library to tokenize input text using
 * the model's own SentencePiece or WordPiece tokenizer, then decodes a truncated
 * token sequence back to text. This provides exact tokenization matching the model's
 * vocabulary and behavior, with tokenizer instances cached to avoid repeated downloads.
 *
 * @param text The input text to truncate.
 * @param maxTokens The maximum number of tokens allowed.
 * @param model The model identifier (shorthand or fully-qualified HF Hub ID).
 * @returns The original text if it fits, or a truncated prefix.
 */
async function truncateWithHFTokenizer(text, maxTokens, model) {
  const tokenizer = await getHFTokenizer(getHFModelId(model));
  // Tokenize without automatic truncation so we can apply the exact limit.
  const encoded = await tokenizer(text, { padding: false, truncation: false });
  // input_ids.data is BigInt64Array or Int32Array depending on the model/environment;
  // normalize to plain numbers before slicing.
  const tokenIds = Array.from(encoded.input_ids.data, Number);
  if (tokenIds.length <= maxTokens) {
    return text;
  }
  return await tokenizer.decode(tokenIds.slice(0, maxTokens), { skip_special_tokens: true });
}
152
// ─────────────────────────────────────────────────────────────────────────────
// Branch 3 — Google Gemini: countTokens API with binary-search bisection
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Truncates text using Google Gemini's countTokens API.
 *
 * Uses the @google/genai library to query the exact token count from Gemini's
 * tokenizer (SentencePiece). To minimize API calls, a binary-search algorithm
 * finds the longest text prefix whose token count does not exceed maxTokens.
 * If no Google API key is configured, falls back to a conservative 3× char heuristic.
 *
 * @param text The input text to truncate.
 * @param maxTokens The maximum number of tokens allowed.
 * @param model The Gemini model identifier (e.g., "gemini-embedding-001").
 * @param apiKey Optional API key (from smartRouting config / env var).
 * @returns The original text if it fits, or a truncated prefix.
 */
async function truncateWithGeminiAPI(text, maxTokens, model, apiKey) {
  // Pre-filter: a SentencePiece token covers at least one character, so the
  // token count never exceeds the character count. Text no longer than
  // maxTokens characters is therefore guaranteed to fit — skip the network call.
  // (The previous `maxTokens * 2` bound was unsafe: a text of 2×maxTokens chars
  // can exceed maxTokens tokens and would have been returned untruncated.)
  // NOTE(review): rare byte-fallback tokenization of exotic characters could
  // still produce >1 token per char — the countTokens path below handles those.
  if (text.length <= maxTokens) {
    return text;
  }
  // Use the apiKey provided from smartRouting config (with priority: env var → settings → default)
  const finalApiKey = apiKey || '';
  if (!finalApiKey) {
    // No Google Gemini API key configured (OPENAI_API_KEY) — fall back to conservative heuristic
    const maxChars = maxTokens * 3;
    return text.length <= maxChars ? text : text.substring(0, maxChars);
  }
  const { GoogleGenAI } = await import('@google/genai');
  const ai = new GoogleGenAI({ apiKey: finalApiKey });
  // Helper: exact token count for an arbitrary chunk via the countTokens API.
  const countTokens = async (chunk) => {
    const result = await ai.models.countTokens({ model, contents: chunk });
    return result.totalTokens ?? 0;
  };
  const totalTokens = await countTokens(text);
  if (totalTokens <= maxTokens) {
    return text;
  }
  // Binary search: find the longest prefix whose token count ≤ maxTokens.
  // This minimizes the number of countTokens calls (O(log n) on text length).
  let lo = 0;
  let hi = text.length;
  while (lo < hi - 1) {
    const mid = Math.floor((lo + hi) / 2);
    const count = await countTokens(text.slice(0, mid));
    if (count <= maxTokens) {
      lo = mid;
    } else {
      hi = mid;
    }
  }
  return text.slice(0, lo);
}
207
// ─────────────────────────────────────────────────────────────────────────────
// Main entry point
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Truncates `text` so that its token count does not exceed `maxTokens`,
 * using the most accurate tokenization strategy available for the given model.
 *
 * The function is async because the Gemini branch may require a network call.
 * All callers in vectorSearchService.ts must use `await`.
 *
 * NOTE: When using Google Gemini embeddings (gemini-embedding-*), the Google AI
 * Studio API key must be provided via the `OPENAI_API_KEY` environment variable,
 * not `GOOGLE_API_KEY`. This allows centralized API key configuration across all
 * embedding model families.
 *
 * @param text Input text to potentially truncate.
 * @param maxTokens Maximum number of tokens allowed.
 * @param model Embedding model identifier (selects truncation strategy).
 * @param apiKey Optional API key for Gemini models (from smartRouting config).
 * @returns The original text if it fits, or a truncated prefix.
 */
export async function truncateToTokenLimit(text, maxTokens, model, apiKey) {
  // Dispatch to the exact tokenizer for each known model family.
  if (isOpenAIModel(model)) {
    return truncateWithGptTokenizer(text, maxTokens);
  }
  if (isGeminiModel(model)) {
    return truncateWithGeminiAPI(text, maxTokens, model, apiKey);
  }
  if (isBgeM3Model(model)) {
    return truncateWithHFTokenizer(text, maxTokens, model);
  }
  // Fallback heuristic: ~3 chars per token (conservative for CJK/multilingual).
  // Ratio is safe for English (~4 chars/token) and CJK (~2 chars/token).
  const maxChars = maxTokens * 3;
  return text.length > maxChars ? text.substring(0, maxChars) : text;
}
243
+ //# sourceMappingURL=tokenTruncation.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tokenTruncation.js","sourceRoot":"","sources":["../../src/utils/tokenTruncation.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH;;;;;;GAMG;AACH,MAAM,kBAAkB,GAA4B;IAClD,CAAC,wBAAwB,EAAE,IAAI,CAAC;IAChC,CAAC,wBAAwB,EAAE,IAAI,CAAC;IAChC,CAAC,wBAAwB,EAAE,IAAI,CAAC;IAChC,CAAC,sBAAsB,EAAE,IAAI,CAAC;IAC9B,CAAC,QAAQ,EAAE,IAAI,CAAC;CACjB,CAAC;AAEF;;;;GAIG;AACH,MAAM,UAAU,yBAAyB,CAAC,KAAa;IACrD,MAAM,KAAK,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC;IAClC,KAAK,MAAM,CAAC,OAAO,EAAE,KAAK,CAAC,IAAI,kBAAkB,EAAE,CAAC;QAClD,IAAI,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;YAC5B,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IACD,mFAAmF;IACnF,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC1B,OAAO,GAAG,CAAC;IACb,CAAC;IACD,gEAAgE;IAChE,uFAAuF;IACvF,OAAO,GAAG,CAAC;AACb,CAAC;AAED,gFAAgF;AAChF,iCAAiC;AACjC,gFAAgF;AAEhF,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC,CAAC,wBAAwB,EAAE,wBAAwB,EAAE,wBAAwB,CAAC,CAAC,CAAC;AAE9G,SAAS,aAAa,CAAC,KAAa;IAClC,OAAO,aAAa,CAAC,GAAG,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC;AAChD,CAAC;AAED,SAAS,aAAa,CAAC,KAAa;IAClC,OAAO,KAAK,CAAC,WAAW,EAAE,KAAK,sBAAsB,CAAC;AACxD,CAAC;AAED,SAAS,YAAY,CAAC,KAAa;IACjC,OAAO,KAAK,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AAChD,CAAC;AAED,gFAAgF;AAChF,+DAA+D;AAC/D,gFAAgF;AAEhF;;;;;;;;;;;GAWG;AACH,KAAK,UAAU,wBAAwB,CAAC,IAAY,EAAE,SAAiB;IACrE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;IACzD,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;IAC5B,IAAI,MAAM,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC/B,OAAO,IAAI,CAAC;IACd,CAAC;IACD,OAAO,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC,CAAC;AAC5C,CAAC;AAED,gFAAgF;AAChF,8EAA8E;AAC9E,gFAAgF;AAEhF,8EAA8E;AAC9E,MAAM,cAAc,GAAG,IAAI,GAAG,EAA2I,CAAC;AAE1K;;;;;;;;;GASG;AAEH,KAAK,UAAU,cAAc,CAAC,OAAe;IAC3C,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;QACjC,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,2BAA2B,CAAC,CAAC;QACpE,MAAM,SAAS,GAAG,MAAM,aAAa,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC;QAC/D,cAAc,CAAC,GAAG,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,cAAc,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;AACrC,CAAC;AAED;;;;GAIG;AACH,SA
AS,YAAY,CAAC,KAAa;IACjC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QACxB,yFAAyF;QACzF,OAAO,KAAK,CAAC;IACf,CAAC;IACD,MAAM,KAAK,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC;IAClC,IAAI,KAAK,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC7B,OAAO,aAAa,CAAC;IACvB,CAAC;IACD,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC1B,OAAO,QAAQ,KAAK,EAAE,CAAC;IACzB,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,KAAK,UAAU,uBAAuB,CACpC,IAAY,EACZ,SAAiB,EACjB,KAAa;IAEb,MAAM,OAAO,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC;IACpC,MAAM,SAAS,GAAG,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;IAChD,wEAAwE;IACxE,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,KAAK,EAAE,CAAC,CAAC;IAC7E,mFAAmF;IACnF,MAAM,MAAM,GAAgC,OAAO,CAAC,SAAkD,CAAC,IAAI,CAAC;IAC5G,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,MAA2B,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IAChE,IAAI,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC5B,OAAO,IAAI,CAAC;IACd,CAAC;IACD,MAAM,YAAY,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IAC7C,OAAO,CAAC,MAAM,SAAS,CAAC,MAAM,CAAC,YAAY,EAAE,EAAE,mBAAmB,EAAE,IAAI,EAAE,CAAC,CAAW,CAAC;AACzF,CAAC;AAED,gFAAgF;AAChF,yEAAyE;AACzE,gFAAgF;AAEhF;;;;;;;;;;;;GAYG;AACH,KAAK,UAAU,qBAAqB,CAClC,IAAY,EACZ,SAAiB,EACjB,KAAa,EACb,MAAe;IAEf,8EAA8E;IAC9E,gFAAgF;IAChF,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;QACjC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,iGAAiG;IACjG,MAAM,WAAW,GAAG,MAAM,IAAI,EAAE,CAAC;IACjC,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,6FAA6F;QAC7F,MAAM,QAAQ,GAAG,SAAS,GAAG,CAAC,CAAC;QAC/B,OAAO,IAAI,CAAC,MAAM,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IACtE,CAAC;IAED,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;IACtD,MAAM,EAAE,GAAG,IAAI,WAAW,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC;IAEpD,MAAM,WAAW,GAAG,KAAK,EAAE,KAAa,EAAmB,EAAE;QAC3D,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QACvE,OAAO,MAAM,CAAC,WAAW,IAAI,CAAC,CAAC;IACjC,CAAC,CAAC;IAEF,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,IAAI,CAAC,CAAC;IAC5C,IAAI,WAAW,IAAI,SAAS,EAAE,CAAC;QAC7B,OAAO,IAAI,CAAC;IACd,CAAC;IA
ED,wEAAwE;IACxE,4EAA4E;IAC5E,IAAI,EAAE,GAAG,CAAC,CAAC;IACX,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC;IACrB,OAAO,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;QACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QACtC,MAAM,KAAK,GAAG,MAAM,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;QACpD,IAAI,KAAK,IAAI,SAAS,EAAE,CAAC;YACvB,EAAE,GAAG,GAAG,CAAC;QACX,CAAC;aAAM,CAAC;YACN,EAAE,GAAG,GAAG,CAAC;QACX,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AAC3B,CAAC;AAED,gFAAgF;AAChF,mBAAmB;AACnB,gFAAgF;AAEhF;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,IAAY,EACZ,SAAiB,EACjB,KAAa,EACb,MAAe;IAEf,IAAI,aAAa,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,wBAAwB,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;IACnD,CAAC;IACD,IAAI,aAAa,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,qBAAqB,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,uBAAuB,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,CAAC,CAAC;IACzD,CAAC;IACD,8EAA8E;IAC9E,uEAAuE;IACvE,MAAM,QAAQ,GAAG,SAAS,GAAG,CAAC,CAAC;IAC/B,OAAO,IAAI,CAAC,MAAM,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;AACtE,CAAC"}