@llmindset/hf-mcp 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/LICENSE +21 -0
  2. package/dist/dataset-detail.d.ts +26 -0
  3. package/dist/dataset-detail.d.ts.map +1 -0
  4. package/dist/dataset-detail.js +157 -0
  5. package/dist/dataset-detail.js.map +1 -0
  6. package/dist/dataset-search.d.ts +62 -0
  7. package/dist/dataset-search.d.ts.map +1 -0
  8. package/dist/dataset-search.js +158 -0
  9. package/dist/dataset-search.js.map +1 -0
  10. package/dist/duplicate-space.d.ts +75 -0
  11. package/dist/duplicate-space.d.ts.map +1 -0
  12. package/dist/duplicate-space.js +189 -0
  13. package/dist/duplicate-space.js.map +1 -0
  14. package/dist/error-messages.d.ts +4 -0
  15. package/dist/error-messages.d.ts.map +1 -0
  16. package/dist/error-messages.js +30 -0
  17. package/dist/error-messages.js.map +1 -0
  18. package/dist/hf-api-call.d.ts +18 -0
  19. package/dist/hf-api-call.d.ts.map +1 -0
  20. package/dist/hf-api-call.js +105 -0
  21. package/dist/hf-api-call.js.map +1 -0
  22. package/dist/index.d.ts +16 -0
  23. package/dist/index.d.ts.map +1 -0
  24. package/dist/index.js +16 -0
  25. package/dist/index.js.map +1 -0
  26. package/dist/model-detail.d.ts +26 -0
  27. package/dist/model-detail.d.ts.map +1 -0
  28. package/dist/model-detail.js +224 -0
  29. package/dist/model-detail.js.map +1 -0
  30. package/dist/model-search.d.ts +64 -0
  31. package/dist/model-search.d.ts.map +1 -0
  32. package/dist/model-search.js +161 -0
  33. package/dist/model-search.js.map +1 -0
  34. package/dist/paper-search.d.ts +58 -0
  35. package/dist/paper-search.d.ts.map +1 -0
  36. package/dist/paper-search.js +114 -0
  37. package/dist/paper-search.js.map +1 -0
  38. package/dist/paper-summary.d.ts +35 -0
  39. package/dist/paper-summary.d.ts.map +1 -0
  40. package/dist/paper-summary.js +187 -0
  41. package/dist/paper-summary.js.map +1 -0
  42. package/dist/space-files.d.ts +44 -0
  43. package/dist/space-files.d.ts.map +1 -0
  44. package/dist/space-files.js +242 -0
  45. package/dist/space-files.js.map +1 -0
  46. package/dist/space-info.d.ts +56 -0
  47. package/dist/space-info.d.ts.map +1 -0
  48. package/dist/space-info.js +135 -0
  49. package/dist/space-info.js.map +1 -0
  50. package/dist/space-search.d.ts +71 -0
  51. package/dist/space-search.d.ts.map +1 -0
  52. package/dist/space-search.js +95 -0
  53. package/dist/space-search.js.map +1 -0
  54. package/dist/tool-ids.d.ts +23 -0
  55. package/dist/tool-ids.d.ts.map +1 -0
  56. package/dist/tool-ids.js +55 -0
  57. package/dist/tool-ids.js.map +1 -0
  58. package/dist/user-summary.d.ts +56 -0
  59. package/dist/user-summary.d.ts.map +1 -0
  60. package/dist/user-summary.js +271 -0
  61. package/dist/user-summary.js.map +1 -0
  62. package/dist/utilities.d.ts +8 -0
  63. package/dist/utilities.d.ts.map +1 -0
  64. package/dist/utilities.js +53 -0
  65. package/dist/utilities.js.map +1 -0
  66. package/eslint.config.js +43 -0
  67. package/package.json +47 -0
  68. package/src/dataset-detail.ts +257 -0
  69. package/src/dataset-search.ts +237 -0
  70. package/src/duplicate-space.ts +263 -0
  71. package/src/error-messages.ts +57 -0
  72. package/src/hf-api-call.ts +182 -0
  73. package/src/index.ts +18 -0
  74. package/src/model-detail.ts +359 -0
  75. package/src/model-search.ts +231 -0
  76. package/src/paper-search.ts +188 -0
  77. package/src/paper-summary.ts +303 -0
  78. package/src/space-files.ts +325 -0
  79. package/src/space-info.ts +190 -0
  80. package/src/space-search.ts +177 -0
  81. package/src/tool-ids.ts +84 -0
  82. package/src/user-summary.ts +421 -0
  83. package/src/utilities.ts +64 -0
  84. package/test/duplicate-space.spec.ts +41 -0
  85. package/test/fixtures/paper_result_kazakh.json +854 -0
  86. package/test/fixtures/space-result.json +263 -0
  87. package/test/paper-search.spec.ts +57 -0
  88. package/test/paper-summary.spec.ts +113 -0
  89. package/test/space-files.spec.ts +232 -0
  90. package/test/space-search.spec.ts +29 -0
  91. package/test/user-summary.spec.ts +131 -0
  92. package/tsconfig.json +31 -0
  93. package/vitest.config.ts +11 -0
@@ -0,0 +1,257 @@
1
+ import { z } from 'zod';
2
+ import { datasetInfo } from '@huggingface/hub';
3
+ import { formatDate, formatNumber } from './utilities.js';
4
+
5
// Dataset Detail Tool Configuration
/**
 * Tool definition for looking up one dataset on the Hugging Face Hub.
 *
 * - `schema` validates the tool input (a single `dataset_id` string).
 * - `annotations` carry the display title and the behaviour hints for the tool.
 */
export const DATASET_DETAIL_TOOL_CONFIG = {
	name: 'dataset_details',
	description: 'Get detailed information about a specific dataset on Hugging Face Hub.',
	schema: z.object({
		dataset_id: z
			.string()
			// Only reject the empty string: short IDs such as "glue" or "imdb" have
			// fewer than five characters, so a higher floor would refuse them even
			// though the error message only promises a "required" check.
			.min(1, 'Dataset ID is required')
			.describe('Dataset ID (e.g. institutional/institutional-books-1.0, Anthropic/hh-rlhf etc.)'),
	}),
	annotations: {
		title: 'Dataset Details',
		destructiveHint: false,
		readOnlyHint: true,
		openWorldHint: false,
	},
} as const;

/** Input accepted by the dataset detail tool, inferred from its zod schema. */
export type DatasetDetailParams = z.infer<typeof DATASET_DETAIL_TOOL_CONFIG.schema>;
24
+
25
+ // Clean interface design with explicit data availability
26
+
27
// Required core information that should always be available
/**
 * Core fields of a dataset record. Every field is mandatory, so the formatter
 * can read them without presence checks.
 */
interface DatasetBasicInfo {
	id: string; // Dataset ID
	name: string; // Dataset name
	downloads: number;
	likes: number;
	private: boolean;
	// Either `false` or one of the two string modes; any string value is truthy,
	// which is what the "Gated" status indicator relies on.
	gated: false | 'auto' | 'manual';
	updatedAt: Date;
}
37
+
38
// Optional but reliable information with simple types
/**
 * Extra fields requested on top of the core record. Each one may be absent,
 * so consumers must check for presence before using it.
 */
interface DatasetExtendedInfo {
	author?: string;
	downloadsAllTime?: number;
	tags?: string[];
	description?: string;
}
45
+
46
// Dataset card data with careful extraction
/**
 * Subset of the dataset card data that is copied into the result.
 * The list-like fields accept either a single string or an array of strings.
 */
interface DatasetMetadata {
	language?: string | string[];
	license?: string | string[];
	task_categories?: string | string[];
	size_categories?: string | string[];
	// Free-form structure; stored as-is and not interpreted further here.
	dataset_info?: Record<string, unknown>;
}
54
+
55
// Complete dataset information structure
/**
 * Full record handed to the formatter: the mandatory core fields plus the
 * optional `extended` and `metadata` sections.
 */
interface DatasetInformation extends DatasetBasicInfo {
	extended?: DatasetExtendedInfo;
	metadata?: DatasetMetadata;
}
60
+
61
/**
 * Looks up a single dataset through `datasetInfo` from `@huggingface/hub`
 * and renders the result as Markdown text.
 */
export class DatasetDetailTool {
	private readonly hubUrl?: string;
	private readonly accessToken?: string;

	/**
	 * @param hfToken Optional Hugging Face token; forwarded as credentials when set
	 * @param hubUrl Optional hub URL; forwarded to the lookup when set
	 */
	constructor(hfToken?: string, hubUrl?: string) {
		this.accessToken = hfToken;
		this.hubUrl = hubUrl;
	}

	/**
	 * Fetch one dataset and format its details.
	 *
	 * @param datasetId The dataset ID to look up (e.g., squad, glue, imdb)
	 * @returns Markdown-formatted dataset details
	 * @throws Error prefixed with "Failed to get dataset details:" when the lookup
	 *         or formatting throws an `Error`; any other thrown value is rethrown as-is
	 */
	async getDetails(datasetId: string): Promise<string> {
		try {
			// Extra fields to request beyond the defaults returned by the hub library.
			const additionalFields = ['author', 'downloadsAllTime', 'tags', 'description', 'cardData'] as const;

			// Credentials and hub URL are only sent when they were configured.
			const credentialOptions = this.accessToken ? { credentials: { accessToken: this.accessToken } } : {};
			const hubOptions = this.hubUrl ? { hubUrl: this.hubUrl } : {};

			const info = await datasetInfo<(typeof additionalFields)[number]>({
				name: datasetId,
				additionalFields: [...additionalFields],
				...credentialOptions,
				...hubOptions,
			});

			const details: DatasetInformation = {
				// Core section: `id` echoes the requested ID, the rest comes from the hub.
				id: datasetId,
				name: info.name,
				downloads: info.downloads,
				likes: info.likes,
				private: info.private,
				gated: info.gated,
				updatedAt: info.updatedAt,

				// Extended section: optional fields, copied through untouched.
				extended: {
					author: info.author,
					downloadsAllTime: info.downloadsAllTime,
					tags: info.tags,
					description: info.description,
				},
			};

			if (info.cardData) {
				const card = info.cardData as Record<string, unknown>;
				const cardKeys = ['language', 'license', 'task_categories', 'size_categories', 'dataset_info'] as const;
				const metadata: DatasetMetadata = {};

				// Copy each known key that is present on the card, whatever its value.
				for (const key of cardKeys) {
					if (key in card) {
						Object.assign(metadata, { [key]: card[key] });
					}
				}

				// Skip the metadata section entirely when nothing was copied.
				if (Object.keys(metadata).length !== 0) {
					details.metadata = metadata;
				}
			}

			// Note: siblings information is not available through the additional fields API
			// It would require a separate API call to list files

			return formatDatasetDetails(details);
		} catch (error) {
			if (!(error instanceof Error)) {
				throw error;
			}
			throw new Error(`Failed to get dataset details: ${error.message}`);
		}
	}
}
159
+
160
// Formatting Function
/**
 * Render a dataset record as Markdown.
 *
 * Sections, in order: title, optional description, overview (author, stats,
 * update date, status), optional tags, optional metadata, and a link.
 *
 * @param dataset The dataset record to render
 * @returns The Markdown text, lines joined with "\n"
 */
function formatDatasetDetails(dataset: DatasetInformation): string {
	const { name, extended, metadata } = dataset;

	// Card fields hold either one string or a list; lists are comma-joined.
	const commaList = (value: string | string[]): string => (Array.isArray(value) ? value.join(', ') : value);

	// Owner segment of "owner/dataset"; empty when the name has no slash.
	const ownerFromName = name.includes('/') ? name.split('/')[0] : '';

	const lines: string[] = [`# ${name}`, ''];

	if (extended?.description) {
		lines.push('## Description', extended.description, '');
	}

	lines.push('## Overview');

	// Prefer the explicit author, fall back to the owner parsed from the name.
	const author = extended?.author || ownerFromName;
	if (author) {
		lines.push(`- **Author:** ${author}`);
	}

	// Stats are only listed when they are non-zero.
	const stats: string[] = [];
	const allTimeDownloads = extended?.downloadsAllTime;
	if (allTimeDownloads) {
		stats.push(`**Downloads:** ${formatNumber(allTimeDownloads)}`);
	}
	if (dataset.likes) {
		stats.push(`**Likes:** ${dataset.likes.toString()}`);
	}
	if (stats.length > 0) {
		lines.push(`- ${stats.join(' | ')}`);
	}

	lines.push(`- **Updated:** ${formatDate(dataset.updatedAt)}`);

	const flags = [dataset.gated ? '🔒 Gated' : '', dataset.private ? '🔐 Private' : ''].filter(
		(flag) => flag !== ''
	);
	if (flags.length > 0) {
		lines.push(`- **Status:** ${flags.join(' | ')}`);
	}
	lines.push('');

	const tags = extended?.tags;
	if (tags && tags.length > 0) {
		lines.push('## Tags', tags.map((tag) => `\`${tag}\``).join(' '), '');
	}

	if (metadata) {
		const labelled: Array<[string, string | string[] | undefined]> = [
			['Language', metadata.language],
			['License', metadata.license],
			['Task Categories', metadata.task_categories],
			['Size Category', metadata.size_categories],
		];

		const bullets: string[] = [];
		for (const [label, value] of labelled) {
			if (value) {
				bullets.push(`- **${label}:** ${commaList(value)}`);
			}
		}

		// The heading is omitted when none of the displayed fields is set.
		if (bullets.length > 0) {
			lines.push('## Metadata', ...bullets, '');
		}
	}

	// The link is built from the dataset name, which is always present.
	lines.push(`**Link:** [https://hf.co/datasets/${name}](https://hf.co/datasets/${name})`);

	return lines.join('\n');
}
@@ -0,0 +1,237 @@
1
+ import { z } from 'zod';
2
+ import { HfApiCall } from './hf-api-call.js';
3
+ import { formatDate, formatNumber } from './utilities.js';
4
// Maximum number of tags listed per dataset in formatted search results;
// any tags beyond this are reported only as an "and N more" count.
const TAGS_TO_RETURN = 20;
5
// Dataset Search Tool Configuration
/**
 * Tool definition for searching datasets on the Hugging Face Hub.
 *
 * - `schema` validates the tool input; every field is optional and `limit`
 *   defaults to 20 (allowed range 1-100).
 * - `annotations` carry the display title and the behaviour hints for the tool.
 */
export const DATASET_SEARCH_TOOL_CONFIG = {
	name: 'dataset_search',
	description:
		'Find Datasets hosted on the Hugging Face hub. ' +
		'Returns comprehensive information about matching datasets including downloads, likes, tags, and direct links. ' +
		'Include links to the datasets in your response',
	schema: z.object({
		query: z
			.string()
			.optional()
			.describe(
				// The original text ended in a stray, unbalanced quote and a trailing space.
				'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending datasets", "Top 10 most recent datasets" etc.'
			),
		author: z
			.string()
			.optional()
			.describe("Organization or user who created the dataset (e.g., 'google', 'facebook', 'allenai')"),
		tags: z
			.array(z.string())
			.optional()
			.describe(
				"Tags to filter datasets (e.g., ['language:en', 'size_categories:1M<n<10M', 'task_categories:text-classification'])"
			),
		sort: z
			.enum(['trendingScore', 'downloads', 'likes', 'createdAt', 'lastModified'])
			.optional()
			.describe('Sort order: trendingScore, downloads, likes, createdAt, lastModified'),
		limit: z.number().min(1).max(100).optional().default(20).describe('Maximum number of results to return'),
	}),
	annotations: {
		title: 'Dataset Search',
		destructiveHint: false,
		readOnlyHint: true,
		openWorldHint: true,
	},
} as const;

// Define search parameter types
/** Input accepted by the dataset search tool, inferred from its zod schema. */
export type DatasetSearchParams = z.infer<typeof DATASET_SEARCH_TOOL_CONFIG.schema>;
45
+
46
// API parameter interface for direct HF API calls
/**
 * Query parameters passed to the datasets API call. All values are strings,
 * so numeric inputs are stringified before being stored here.
 */
interface DatasetApiParams {
	search?: string;
	author?: string;
	filter?: string; // comma-joined tags, or a single filter expression
	sort?: string;
	direction?: string; // set to '-1' (descending) whenever `sort` is set in this file
	limit?: string; // stringified result limit
}
55
+
56
// Dataset result interface matching HF API response
/**
 * One dataset entry in the list returned by the datasets search call.
 */
interface DatasetApiResult {
	_id: string;
	id: string;
	author: string;
	likes: number;
	downloads: number;
	trendingScore?: number;
	private: boolean;
	// Gating is reported either as a boolean or as one of the string modes also
	// used by the dataset detail record; consumers only rely on truthiness.
	gated: boolean | 'auto' | 'manual';
	tags: string[];
	createdAt: string;
	lastModified: string;
	description?: string;
	sha: string;
}
72
/**
 * Searches Hugging Face datasets through the `https://huggingface.co/api/datasets`
 * endpoint and renders the matches as Markdown text.
 */
export class DatasetSearchTool extends HfApiCall<DatasetApiParams, DatasetApiResult[]> {
	/**
	 * @param hfToken Optional Hugging Face token, passed on to the base class
	 */
	constructor(hfToken?: string) {
		super('https://huggingface.co/api/datasets', hfToken);
	}

	/**
	 * Search datasets using the tool's structured parameters.
	 *
	 * @param params Search criteria; unset or empty criteria are left out of the request
	 * @returns Markdown listing of the matches, or a "no datasets" message
	 * @throws Error prefixed with "Failed to search for datasets:" when an `Error`
	 *         is thrown; any other thrown value is rethrown as-is
	 */
	async searchWithParams(params: Partial<DatasetSearchParams>): Promise<string> {
		try {
			const { query, author, tags, sort, limit } = params;

			// Translate the tool parameters into API query parameters, skipping
			// anything that was not supplied. Sorting is always descending.
			const apiParams: DatasetApiParams = {
				...(query ? { search: query } : {}),
				...(author ? { author } : {}),
				...(tags && tags.length > 0 ? { filter: tags.join(',') } : {}),
				...(sort ? { sort, direction: '-1' } : {}),
				...(limit ? { limit: limit.toString() } : {}),
			};

			const matches = await this.callApi<DatasetApiResult[]>(apiParams);

			return matches.length === 0
				? `No datasets found for the given criteria.`
				: formatSearchResults(matches, params);
		} catch (error) {
			if (!(error instanceof Error)) {
				throw error;
			}
			throw new Error(`Failed to search for datasets: ${error.message}`);
		}
	}

	/**
	 * Search datasets with a raw filter expression (e.g., arxiv:XXXX.XXXXX),
	 * sorted by downloads in descending order.
	 *
	 * @param filter Filter expression sent to the API unchanged
	 * @param limit Maximum number of results (defaults to 10)
	 * @returns Markdown listing of the matches, or a "no datasets" message naming the filter
	 * @throws Error prefixed with "Failed to search for datasets:" when an `Error`
	 *         is thrown; any other thrown value is rethrown as-is
	 */
	async searchWithFilter(filter: string, limit: number = 10): Promise<string> {
		try {
			const apiParams: DatasetApiParams = {
				filter,
				limit: limit.toString(),
				sort: 'downloads',
				direction: '-1',
			};

			const matches = await this.callApi<DatasetApiResult[]>(apiParams);

			return matches.length === 0
				? `No datasets found referencing ${filter}.`
				: formatSearchResults(matches, { limit });
		} catch (error) {
			if (!(error instanceof Error)) {
				throw error;
			}
			throw new Error(`Failed to search for datasets: ${error.message}`);
		}
	}
}
161
+
162
// Formatting Function
/**
 * Render dataset search results as Markdown.
 *
 * The output starts with a summary line describing the criteria, followed by
 * one section per dataset, each closed by a "---" separator.
 *
 * @param datasets The datasets returned by the API
 * @param params The criteria that produced them; used only for the summary line
 * @returns The Markdown text, lines joined with "\n"
 */
function formatSearchResults(datasets: DatasetApiResult[], params: Partial<DatasetSearchParams>): string {
	const { query, author, tags, sort, limit } = params;

	// Describe whichever criteria were actually supplied.
	const criteria: string[] = [];
	if (query) criteria.push(`query "${query}"`);
	if (author) criteria.push(`author "${author}"`);
	if (tags && tags.length > 0) criteria.push(`tags [${tags.join(', ')}]`);
	if (sort) criteria.push(`sorted by ${sort} (descending)`);
	const criteriaText = criteria.length > 0 ? ` matching ${criteria.join(', ')}` : '';

	// A result count equal to the limit is reported as "Showing first" rather than "Found".
	const lead = datasets.length === limit ? 'Showing first' : 'Found';
	const out: string[] = [`${lead} ${datasets.length.toString()} datasets${criteriaText}:`, ''];

	// Builds the Markdown lines for a single dataset entry.
	const renderEntry = (dataset: DatasetApiResult): string[] => {
		const entry: string[] = [`## ${dataset.id}`, ''];

		// Descriptions are cut to 200 characters, with an ellipsis when truncated.
		const { description } = dataset;
		if (description) {
			const ellipsis = description.length > 200 ? '...' : '';
			entry.push(`${description.substring(0, 200)}${ellipsis}`, '');
		}

		// Stats are only listed when they are non-zero.
		const stats: string[] = [];
		if (dataset.downloads) stats.push(`**Downloads:** ${formatNumber(dataset.downloads)}`);
		if (dataset.likes) stats.push(`**Likes:** ${dataset.likes.toString()}`);
		if (dataset.trendingScore) stats.push(`**Trending Score:** ${dataset.trendingScore.toString()}`);
		if (stats.length > 0) {
			entry.push(stats.join(' | '), '');
		}

		if (dataset.tags && dataset.tags.length > 0) {
			entry.push(`**Tags:** ${dataset.tags.slice(0, TAGS_TO_RETURN).join(', ')}`);
			const hiddenTags = dataset.tags.length - TAGS_TO_RETURN;
			if (hiddenTags > 0) {
				entry.push(`*and ${hiddenTags.toString()} more...*`);
			}
			entry.push('');
		}

		const flags = [dataset.gated ? '🔒 Gated' : '', dataset.private ? '🔐 Private' : ''].filter(
			(flag) => flag !== ''
		);
		if (flags.length > 0) {
			entry.push(flags.join(' | '), '');
		}

		if (dataset.createdAt) {
			entry.push(`**Created:** ${formatDate(dataset.createdAt)}`);
		}
		// The modification date is omitted when it equals the creation date.
		if (dataset.lastModified && dataset.lastModified !== dataset.createdAt) {
			entry.push(`**Last Modified:** ${formatDate(dataset.lastModified)}`);
		}

		const url = `https://hf.co/datasets/${dataset.id}`;
		entry.push(`**Link:** [${url}](${url})`, '', '---', '');
		return entry;
	};

	for (const dataset of datasets) {
		out.push(...renderEntry(dataset));
	}

	return out.join('\n');
}