@llmindset/hf-mcp 0.1.20 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,16 @@
1
1
  import { z } from 'zod';
2
2
  import { HfApiCall } from '../hf-api-call.js';
3
- import { escapeMarkdown } from '../utilities.js';
3
+ import { escapeMarkdown, estimateTokens } from '../utilities.js';
4
4
  import { DOC_FETCH_CONFIG } from './doc-fetch.js';
5
5
 
6
+ /** Token estimation reference: the initial results for the query "how to load a image to image model in transformers"
7
+ * came to 121973 characters (36711 Anthropic tokens), i.e. about 3.3 characters per token. */
8
+
6
9
  export const DOCS_SEMANTIC_SEARCH_CONFIG = {
7
10
  name: 'hf_doc_search',
8
- description: 'Search the Hugging Face documentation library. Returns excerpts grouped by Product and Document.',
11
+ description:
12
+ 'Search the Hugging Face documentation library. Use this for the most up-to-date information ' +
13
+ 'Returns excerpts grouped by Product and Document.',
9
14
  schema: z.object({
10
15
  query: z
11
16
  .string()
@@ -43,16 +48,24 @@ interface DocSearchApiParams {
43
48
  product?: string;
44
49
  }
45
50
 
51
+ // Token budget defaults
52
+ const DEFAULT_TOKEN_BUDGET = 12500;
53
+ const TRUNCATE_EXCERPT_LENGTH = 400; // chars for truncated excerpts
54
+
46
55
  /**
47
56
  * Use the Hugging Face Semantic Document Search API
48
57
  */
49
58
  export class DocSearchTool extends HfApiCall<DocSearchApiParams, DocSearchResult[]> {
59
+ private tokenBudget: number;
60
+
50
61
  /**
51
- * @param apiUrl The URL of the Hugging Face document search API
52
62
  * @param hfToken Optional Hugging Face token for API access
63
+ * @param apiUrl The URL of the Hugging Face document search API
64
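+ * @param tokenBudget Approximate token budget for the formatted results; the estimate is checked before each product and page is added, so the final text (including the link list and footer) can exceed it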
53
65
  */
54
- constructor(hfToken?: string, apiUrl = 'https://hf.co/api/docs/search') {
66
+ constructor(hfToken?: string, apiUrl = 'https://hf.co/api/docs/search', tokenBudget = DEFAULT_TOKEN_BUDGET) {
55
67
  super(apiUrl, hfToken);
68
+ this.tokenBudget = tokenBudget;
56
69
  }
57
70
 
58
71
  /**
@@ -76,7 +89,7 @@ export class DocSearchTool extends HfApiCall<DocSearchApiParams, DocSearchResult
76
89
  : `No documentation found for query '${params.query}'`;
77
90
  }
78
91
 
79
- return formatSearchResults(params.query, results, params.product);
92
+ return formatSearchResults(params.query, results, params.product, this.tokenBudget);
80
93
  } catch (error) {
81
94
  if (error instanceof Error) {
82
95
  throw new Error(`Failed to search documentation: ${error.message}`);
@@ -139,70 +152,110 @@ function groupBySection(pageResults: DocSearchResult[]): Map<string | undefined,
139
152
  /**
140
153
  * Format excerpts from a section
141
154
  */
142
- function formatSectionExcerpts(section: string | undefined, results: DocSearchResult[]): string {
155
+ function formatSectionExcerpts(
156
+ section: string | undefined,
157
+ results: DocSearchResult[],
158
+ useTruncatedMode: boolean,
159
+ hasAlreadyShownTruncation: boolean
160
+ ): { text: string; tokensUsed: number; wasContentTruncated: boolean } {
143
161
  const lines: string[] = [];
162
+ let tokensUsed = 0;
163
+ let wasContentTruncated = false;
144
164
 
145
- // Add section heading if present
165
+ // Add section heading if we have one
146
166
  if (section) {
147
- if (results.length > 1) {
148
- lines.push(`#### Excerpts from the "${escapeMarkdown(section)}" section`);
149
- } else {
150
- lines.push(`#### Excerpt from the "${escapeMarkdown(section)}" section`);
151
- }
152
- lines.push('');
167
+ const heading =
168
+ results.length > 1
169
+ ? `#### Excerpts from the "${escapeMarkdown(section)}" section`
170
+ : `#### Excerpt from the "${escapeMarkdown(section)}" section`;
171
+
172
+ lines.push(heading, '');
173
+ tokensUsed += estimateTokens(heading + '\n\n');
153
174
  }
154
175
 
155
- // Add all excerpts from this section
156
176
  for (const result of results) {
157
- // Clean up the text - remove HTML tags if any
158
- const cleanText = result.text
177
+ let cleanText = result.text
159
178
  .replace(/<[^>]*>/g, '')
160
179
  .replace(/\n\s*\n/g, '\n')
161
180
  .trim();
162
181
 
163
- lines.push(cleanText);
164
- lines.push('');
182
+ // Truncate if in truncated mode and we haven't shown the message yet
183
+ if (useTruncatedMode && cleanText.length > TRUNCATE_EXCERPT_LENGTH && !hasAlreadyShownTruncation) {
184
+ cleanText =
185
+ cleanText.substring(0, TRUNCATE_EXCERPT_LENGTH) +
186
+ `...\n\n*[Content truncated - use ${DOC_FETCH_CONFIG.name} for full text or narrow search terms]*`;
187
+ wasContentTruncated = true;
188
+ }
189
+
190
+ lines.push(cleanText, '');
191
+ tokensUsed += estimateTokens(cleanText + '\n\n');
165
192
  }
166
193
 
167
- return lines.join('\n');
194
+ // Remove trailing empty line
195
+ if (lines.length > 0 && lines[lines.length - 1] === '') {
196
+ lines.pop();
197
+ }
198
+
199
+ return { text: lines.join('\n'), tokensUsed, wasContentTruncated };
168
200
  }
169
201
 
170
202
  /**
171
- * Format search results grouped by product and page
203
+ * Format search results with simple token budget management
172
204
  */
173
- function formatSearchResults(query: string, results: DocSearchResult[], productFilter?: string): string {
205
+ function formatSearchResults(
206
+ query: string,
207
+ results: DocSearchResult[],
208
+ productFilter?: string,
209
+ tokenBudget = DEFAULT_TOKEN_BUDGET
210
+ ): string {
174
211
  const lines: string[] = [];
212
+ let hasShownTruncationMessage = false;
175
213
 
176
214
  // Header
177
215
  const filterText = productFilter ? ` (filtered by product: ${productFilter})` : '';
178
- lines.push(`# Documentation Library Search Results for "${escapeMarkdown(query)}"${filterText}`);
179
- lines.push('');
180
- lines.push(`Found ${results.length} results`);
181
- lines.push('');
216
+ const header = `# Documentation Library Search Results for "${escapeMarkdown(query)}"${filterText}\n\nFound ${results.length} results\n`;
217
+ lines.push(header);
182
218
 
183
- // Group results
219
+ // Group and sort results
184
220
  const grouped = groupResults(results);
185
-
186
- // Sort products by count (most hits first)
187
221
  const sortedProducts = Array.from(grouped.keys()).sort((a, b) => {
188
222
  const productGroupA = grouped.get(a);
189
223
  const productGroupB = grouped.get(b);
190
224
  if (!productGroupA || !productGroupB) return 0;
191
-
192
225
  const countA = Array.from(productGroupA.values()).reduce((sum, arr) => sum + arr.length, 0);
193
226
  const countB = Array.from(productGroupB.values()).reduce((sum, arr) => sum + arr.length, 0);
194
- return countB - countA; // Descending order
227
+ return countB - countA;
195
228
  });
196
229
 
230
+ const linkOnlyResults: Array<{ product: string; url: string; title: string; count: number }> = [];
231
+
197
232
  for (const product of sortedProducts) {
198
233
  const productGroup = grouped.get(product);
199
234
  if (!productGroup) continue;
200
235
 
236
+ // Check current size before adding anything
237
+ const currentText = lines.join('\n');
238
+ if (estimateTokens(currentText) > tokenBudget) {
239
+ // Over budget - add remaining products to links
240
+ for (const url of productGroup.keys()) {
241
+ const pageResults = productGroup.get(url);
242
+ if (!pageResults?.[0]) continue;
243
+ linkOnlyResults.push({
244
+ product,
245
+ url,
246
+ title: pageResults[0].heading1 || pageResults[0].source_page_title,
247
+ count: pageResults.length,
248
+ });
249
+ }
250
+ continue;
251
+ }
252
+
253
+ // Add product header
201
254
  const totalProductHits = Array.from(productGroup.values()).reduce((sum, arr) => sum + arr.length, 0);
202
- lines.push(`## Results for Product: ${escapeMarkdown(product)} (${totalProductHits} results)`);
203
- lines.push('');
255
+ const productHeader = `## Results for Product: ${escapeMarkdown(product)} (${totalProductHits} results)\n`;
256
+ lines.push(productHeader);
204
257
 
205
- // Sort URLs within each product by count (most hits first)
258
+ // Sort pages by hit count
206
259
  const sortedUrls = Array.from(productGroup.keys()).sort((a, b) => {
207
260
  const pageResultsA = productGroup.get(a);
208
261
  const pageResultsB = productGroup.get(b);
@@ -212,32 +265,50 @@ function formatSearchResults(query: string, results: DocSearchResult[], productF
212
265
 
213
266
  for (const url of sortedUrls) {
214
267
  const pageResults = productGroup.get(url);
215
- if (!pageResults || pageResults.length === 0) continue;
216
- const firstResult = pageResults[0];
217
-
218
- // Skip if no results (shouldn't happen but TypeScript safety)
219
- if (!firstResult) continue;
268
+ if (!pageResults?.[0]) continue;
269
+
270
+ const pageTitle = pageResults[0].heading1 || pageResults[0].source_page_title;
271
+
272
+ // Check if we're over budget - if so, add remaining pages to links
273
+ const currentText = lines.join('\n');
274
+ if (estimateTokens(currentText) > tokenBudget) {
275
+ linkOnlyResults.push({ product, url, title: pageTitle, count: pageResults.length });
276
+ continue;
277
+ }
220
278
 
221
- // Page header with link and hit count
222
- const pageTitle = firstResult.heading1 || firstResult.source_page_title;
223
279
  const hitCount = pageResults.length > 1 ? ` (${pageResults.length} results)` : '';
224
- // Use the base URL (without anchor) for the page link
225
- lines.push(`### Results from [${escapeMarkdown(pageTitle)}](${url})${hitCount}`);
226
- lines.push('');
280
+ const pageHeader = `### Results from [${escapeMarkdown(pageTitle)}](${url})${hitCount}\n`;
281
+ lines.push(pageHeader);
227
282
 
228
- // Group results by section and format them
283
+ // Add all sections for this page
229
284
  const sectionGroups = groupBySection(pageResults);
230
-
231
- // Format each section's excerpts
232
285
  for (const [section, sectionResults] of sectionGroups) {
233
- lines.push(formatSectionExcerpts(section, sectionResults));
286
+ const currentTokens = estimateTokens(lines.join('\n'));
287
+ const useTruncatedMode = currentTokens > tokenBudget * 0.7;
288
+
289
+ const result = formatSectionExcerpts(section, sectionResults, useTruncatedMode, hasShownTruncationMessage);
290
+
291
+ if (result.text.trim()) {
292
+ lines.push(result.text);
293
+ if (result.wasContentTruncated) {
294
+ hasShownTruncationMessage = true;
295
+ }
296
+ }
234
297
  }
235
298
  }
236
299
  }
237
300
 
238
- // Add suggestion to use doc fetch tool
239
- lines.push('---');
240
- lines.push('');
301
+ // Add link-only results
302
+ if (linkOnlyResults.length > 0) {
303
+ lines.push(`\n## Further results were found in:\n`);
304
+ for (const linkResult of linkOnlyResults) {
305
+ const hitText = linkResult.count > 1 ? ` (${linkResult.count} results)` : '';
306
+ lines.push(`- [${escapeMarkdown(linkResult.title)}](${linkResult.url})${hitText} *(${linkResult.product})*`);
307
+ }
308
+ lines.push('');
309
+ }
310
+
311
+ lines.push('---\n');
241
312
  lines.push(`Use the "${DOC_FETCH_CONFIG.name}" tool to fetch a document from the library.`);
242
313
 
243
314
  return lines.join('\n');
package/src/utilities.ts CHANGED
@@ -62,3 +62,16 @@ export function escapeMarkdown(text: string): string {
62
62
  .replace(/>/g, '\\>')
63
63
  .replace(/#/g, '\\#');
64
64
  }
65
+
66
+ // Token estimation constants
67
+ const CHARS_PER_TOKEN = 3.3; // based on anthropic tokenizer for "how to load a image to image model in transformers"
68
+ // measured data: 121973 chars = 36711 tokens (about 3.32 chars per token, rounded to 3.3)
69
+
70
+ /**
71
+ * Simple token estimation based on character count
72
+ * @param text The text to estimate tokens for
73
+ * @returns Estimated number of tokens
74
+ */
75
+ export function estimateTokens(text: string): number {
76
+ return Math.ceil(text.length / CHARS_PER_TOKEN);
77
+ }
@@ -1,7 +0,0 @@
1
- interface DocMapping {
2
- repo_id: string;
3
- doc_folder: string;
4
- }
5
- export declare const DOC_MAPPINGS: Record<string, DocMapping>;
6
- export {};
7
- //# sourceMappingURL=doc-mappings.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"doc-mappings.d.ts","sourceRoot":"","sources":["../../src/docs-search/doc-mappings.ts"],"names":[],"mappings":"AAAA,UAAU,UAAU;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,eAAO,MAAM,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAyEnD,CAAC"}
@@ -1,75 +0,0 @@
1
- export const DOC_MAPPINGS = {
2
- 'tokenizers': {
3
- repo_id: 'huggingface/tokenizers',
4
- doc_folder: 'docs/source-doc-builder'
5
- },
6
- 'diffusers': {
7
- repo_id: 'huggingface/diffusers',
8
- doc_folder: 'docs/source/en'
9
- },
10
- 'accelerate': {
11
- repo_id: 'huggingface/accelerate',
12
- doc_folder: 'docs/source'
13
- },
14
- 'huggingface_hub': {
15
- repo_id: 'huggingface/huggingface_hub',
16
- doc_folder: 'docs/source/en'
17
- },
18
- 'transformers': {
19
- repo_id: 'huggingface/transformers',
20
- doc_folder: 'docs/source/en'
21
- },
22
- 'hub': {
23
- repo_id: 'huggingface/hub-docs',
24
- doc_folder: 'docs/hub'
25
- },
26
- 'huggingface.js': {
27
- repo_id: 'huggingface/huggingface.js',
28
- doc_folder: 'docs'
29
- },
30
- 'transformers.js': {
31
- repo_id: 'huggingface/transformers.js',
32
- doc_folder: 'docs/source'
33
- },
34
- 'smolagents': {
35
- repo_id: 'huggingface/smolagents',
36
- doc_folder: 'docs/source/en'
37
- },
38
- 'peft': {
39
- repo_id: 'huggingface/peft',
40
- doc_folder: 'docs/source'
41
- },
42
- 'trl': {
43
- repo_id: 'huggingface/trl',
44
- doc_folder: 'docs/source'
45
- },
46
- 'bitsandbytes': {
47
- repo_id: 'bitsandbytes-foundation/bitsandbytes',
48
- doc_folder: 'docs/source'
49
- },
50
- 'lerobot': {
51
- repo_id: 'huggingface/lerobot',
52
- doc_folder: 'docs/source'
53
- },
54
- 'timm': {
55
- repo_id: 'huggingface/pytorch-image-models',
56
- doc_folder: 'hfdocs/source'
57
- },
58
- 'inference-providers': {
59
- repo_id: 'huggingface/hub-docs',
60
- doc_folder: 'docs/inference-providers'
61
- },
62
- 'safetensors': {
63
- repo_id: 'huggingface/safetensors',
64
- doc_folder: 'docs/source'
65
- },
66
- 'inference-endpoints': {
67
- repo_id: 'huggingface/hf-endpoints-documentation',
68
- doc_folder: 'docs/source'
69
- },
70
- 'dataset-viewer': {
71
- repo_id: 'huggingface/dataset-viewer',
72
- doc_folder: 'docs/source'
73
- }
74
- };
75
- //# sourceMappingURL=doc-mappings.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"doc-mappings.js","sourceRoot":"","sources":["../../src/docs-search/doc-mappings.ts"],"names":[],"mappings":"AAKA,MAAM,CAAC,MAAM,YAAY,GAA+B;IACtD,YAAY,EAAE;QACZ,OAAO,EAAE,wBAAwB;QACjC,UAAU,EAAE,yBAAyB;KACtC;IACD,WAAW,EAAE;QACX,OAAO,EAAE,uBAAuB;QAChC,UAAU,EAAE,gBAAgB;KAC7B;IACD,YAAY,EAAE;QACZ,OAAO,EAAE,wBAAwB;QACjC,UAAU,EAAE,aAAa;KAC1B;IACD,iBAAiB,EAAE;QACjB,OAAO,EAAE,6BAA6B;QACtC,UAAU,EAAE,gBAAgB;KAC7B;IACD,cAAc,EAAE;QACd,OAAO,EAAE,0BAA0B;QACnC,UAAU,EAAE,gBAAgB;KAC7B;IACD,KAAK,EAAE;QACL,OAAO,EAAE,sBAAsB;QAC/B,UAAU,EAAE,UAAU;KACvB;IACD,gBAAgB,EAAE;QAChB,OAAO,EAAE,4BAA4B;QACrC,UAAU,EAAE,MAAM;KACnB;IACD,iBAAiB,EAAE;QACjB,OAAO,EAAE,6BAA6B;QACtC,UAAU,EAAE,aAAa;KAC1B;IACD,YAAY,EAAE;QACZ,OAAO,EAAE,wBAAwB;QACjC,UAAU,EAAE,gBAAgB;KAC7B;IACD,MAAM,EAAE;QACN,OAAO,EAAE,kBAAkB;QAC3B,UAAU,EAAE,aAAa;KAC1B;IACD,KAAK,EAAE;QACL,OAAO,EAAE,iBAAiB;QAC1B,UAAU,EAAE,aAAa;KAC1B;IACD,cAAc,EAAE;QACd,OAAO,EAAE,sCAAsC;QAC/C,UAAU,EAAE,aAAa;KAC1B;IACD,SAAS,EAAE;QACT,OAAO,EAAE,qBAAqB;QAC9B,UAAU,EAAE,aAAa;KAC1B;IACD,MAAM,EAAE;QACN,OAAO,EAAE,kCAAkC;QAC3C,UAAU,EAAE,eAAe;KAC5B;IACD,qBAAqB,EAAE;QACrB,OAAO,EAAE,sBAAsB;QAC/B,UAAU,EAAE,0BAA0B;KACvC;IACD,aAAa,EAAE;QACb,OAAO,EAAE,yBAAyB;QAClC,UAAU,EAAE,aAAa;KAC1B;IACD,qBAAqB,EAAE;QACrB,OAAO,EAAE,wCAAwC;QACjD,UAAU,EAAE,aAAa;KAC1B;IACD,gBAAgB,EAAE;QAChB,OAAO,EAAE,4BAA4B;QACrC,UAAU,EAAE,aAAa;KAC1B;CACF,CAAC"}
@@ -1,79 +0,0 @@
1
- interface DocMapping {
2
- repo_id: string;
3
- doc_folder: string;
4
- }
5
-
6
- export const DOC_MAPPINGS: Record<string, DocMapping> = {
7
- 'tokenizers': {
8
- repo_id: 'huggingface/tokenizers',
9
- doc_folder: 'docs/source-doc-builder'
10
- },
11
- 'diffusers': {
12
- repo_id: 'huggingface/diffusers',
13
- doc_folder: 'docs/source/en'
14
- },
15
- 'accelerate': {
16
- repo_id: 'huggingface/accelerate',
17
- doc_folder: 'docs/source'
18
- },
19
- 'huggingface_hub': {
20
- repo_id: 'huggingface/huggingface_hub',
21
- doc_folder: 'docs/source/en'
22
- },
23
- 'transformers': {
24
- repo_id: 'huggingface/transformers',
25
- doc_folder: 'docs/source/en'
26
- },
27
- 'hub': {
28
- repo_id: 'huggingface/hub-docs',
29
- doc_folder: 'docs/hub'
30
- },
31
- 'huggingface.js': {
32
- repo_id: 'huggingface/huggingface.js',
33
- doc_folder: 'docs'
34
- },
35
- 'transformers.js': {
36
- repo_id: 'huggingface/transformers.js',
37
- doc_folder: 'docs/source'
38
- },
39
- 'smolagents': {
40
- repo_id: 'huggingface/smolagents',
41
- doc_folder: 'docs/source/en'
42
- },
43
- 'peft': {
44
- repo_id: 'huggingface/peft',
45
- doc_folder: 'docs/source'
46
- },
47
- 'trl': {
48
- repo_id: 'huggingface/trl',
49
- doc_folder: 'docs/source'
50
- },
51
- 'bitsandbytes': {
52
- repo_id: 'bitsandbytes-foundation/bitsandbytes',
53
- doc_folder: 'docs/source'
54
- },
55
- 'lerobot': {
56
- repo_id: 'huggingface/lerobot',
57
- doc_folder: 'docs/source'
58
- },
59
- 'timm': {
60
- repo_id: 'huggingface/pytorch-image-models',
61
- doc_folder: 'hfdocs/source'
62
- },
63
- 'inference-providers': {
64
- repo_id: 'huggingface/hub-docs',
65
- doc_folder: 'docs/inference-providers'
66
- },
67
- 'safetensors': {
68
- repo_id: 'huggingface/safetensors',
69
- doc_folder: 'docs/source'
70
- },
71
- 'inference-endpoints': {
72
- repo_id: 'huggingface/hf-endpoints-documentation',
73
- doc_folder: 'docs/source'
74
- },
75
- 'dataset-viewer': {
76
- repo_id: 'huggingface/dataset-viewer',
77
- doc_folder: 'docs/source'
78
- }
79
- };