@llmindset/hf-mcp 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/dataset-detail.d.ts +26 -0
- package/dist/dataset-detail.d.ts.map +1 -0
- package/dist/dataset-detail.js +157 -0
- package/dist/dataset-detail.js.map +1 -0
- package/dist/dataset-search.d.ts +62 -0
- package/dist/dataset-search.d.ts.map +1 -0
- package/dist/dataset-search.js +158 -0
- package/dist/dataset-search.js.map +1 -0
- package/dist/duplicate-space.d.ts +75 -0
- package/dist/duplicate-space.d.ts.map +1 -0
- package/dist/duplicate-space.js +189 -0
- package/dist/duplicate-space.js.map +1 -0
- package/dist/error-messages.d.ts +4 -0
- package/dist/error-messages.d.ts.map +1 -0
- package/dist/error-messages.js +30 -0
- package/dist/error-messages.js.map +1 -0
- package/dist/hf-api-call.d.ts +18 -0
- package/dist/hf-api-call.d.ts.map +1 -0
- package/dist/hf-api-call.js +105 -0
- package/dist/hf-api-call.js.map +1 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +16 -0
- package/dist/index.js.map +1 -0
- package/dist/model-detail.d.ts +26 -0
- package/dist/model-detail.d.ts.map +1 -0
- package/dist/model-detail.js +224 -0
- package/dist/model-detail.js.map +1 -0
- package/dist/model-search.d.ts +64 -0
- package/dist/model-search.d.ts.map +1 -0
- package/dist/model-search.js +161 -0
- package/dist/model-search.js.map +1 -0
- package/dist/paper-search.d.ts +58 -0
- package/dist/paper-search.d.ts.map +1 -0
- package/dist/paper-search.js +114 -0
- package/dist/paper-search.js.map +1 -0
- package/dist/paper-summary.d.ts +35 -0
- package/dist/paper-summary.d.ts.map +1 -0
- package/dist/paper-summary.js +187 -0
- package/dist/paper-summary.js.map +1 -0
- package/dist/space-files.d.ts +44 -0
- package/dist/space-files.d.ts.map +1 -0
- package/dist/space-files.js +242 -0
- package/dist/space-files.js.map +1 -0
- package/dist/space-info.d.ts +56 -0
- package/dist/space-info.d.ts.map +1 -0
- package/dist/space-info.js +135 -0
- package/dist/space-info.js.map +1 -0
- package/dist/space-search.d.ts +71 -0
- package/dist/space-search.d.ts.map +1 -0
- package/dist/space-search.js +95 -0
- package/dist/space-search.js.map +1 -0
- package/dist/tool-ids.d.ts +23 -0
- package/dist/tool-ids.d.ts.map +1 -0
- package/dist/tool-ids.js +55 -0
- package/dist/tool-ids.js.map +1 -0
- package/dist/user-summary.d.ts +56 -0
- package/dist/user-summary.d.ts.map +1 -0
- package/dist/user-summary.js +271 -0
- package/dist/user-summary.js.map +1 -0
- package/dist/utilities.d.ts +8 -0
- package/dist/utilities.d.ts.map +1 -0
- package/dist/utilities.js +53 -0
- package/dist/utilities.js.map +1 -0
- package/eslint.config.js +43 -0
- package/package.json +47 -0
- package/src/dataset-detail.ts +257 -0
- package/src/dataset-search.ts +237 -0
- package/src/duplicate-space.ts +263 -0
- package/src/error-messages.ts +57 -0
- package/src/hf-api-call.ts +182 -0
- package/src/index.ts +18 -0
- package/src/model-detail.ts +359 -0
- package/src/model-search.ts +231 -0
- package/src/paper-search.ts +188 -0
- package/src/paper-summary.ts +303 -0
- package/src/space-files.ts +325 -0
- package/src/space-info.ts +190 -0
- package/src/space-search.ts +177 -0
- package/src/tool-ids.ts +84 -0
- package/src/user-summary.ts +421 -0
- package/src/utilities.ts +64 -0
- package/test/duplicate-space.spec.ts +41 -0
- package/test/fixtures/paper_result_kazakh.json +854 -0
- package/test/fixtures/space-result.json +263 -0
- package/test/paper-search.spec.ts +57 -0
- package/test/paper-summary.spec.ts +113 -0
- package/test/space-files.spec.ts +232 -0
- package/test/space-search.spec.ts +29 -0
- package/test/user-summary.spec.ts +131 -0
- package/tsconfig.json +31 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { datasetInfo } from '@huggingface/hub';
|
|
3
|
+
import { formatDate, formatNumber } from './utilities.js';
|
|
4
|
+
|
|
5
|
+
// Dataset Detail Tool Configuration
/**
 * Tool definition for the dataset detail lookup: tool name, description,
 * zod input schema and annotation hints.
 */
export const DATASET_DETAIL_TOOL_CONFIG = {
	name: 'dataset_details',
	description: 'Get detailed information about a specific dataset on Hugging Face Hub.',
	schema: z.object({
		dataset_id: z
			.string()
			// Only an empty ID is rejected. A larger minimum length would refuse
			// short but valid IDs (e.g. "glue", "imdb") while reporting them as missing.
			.min(1, 'Dataset ID is required')
			.describe('Dataset ID (e.g. institutional/institutional-books-1.0, Anthropic/hh-rlhf etc.)'),
	}),
	annotations: {
		title: 'Dataset Details',
		destructiveHint: false,
		readOnlyHint: true,
		openWorldHint: false,
	},
} as const;

/** Input accepted by the dataset detail tool, inferred from its zod schema. */
export type DatasetDetailParams = z.infer<typeof DATASET_DETAIL_TOOL_CONFIG.schema>;
|
|
24
|
+
|
|
25
|
+
// Clean interface design with explicit data availability
|
|
26
|
+
|
|
27
|
+
/**
 * Core dataset fields that are always present on a details record.
 */
interface DatasetBasicInfo {
	/** Dataset ID */
	id: string;
	/** Dataset name */
	name: string;
	downloads: number;
	likes: number;
	private: boolean;
	/** `false` when the dataset is not gated, otherwise the gating mode */
	gated: false | 'auto' | 'manual';
	updatedAt: Date;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Optional, simply-typed extras requested in addition to the core fields.
 * Every member may be absent.
 */
interface DatasetExtendedInfo {
	author?: string;
	downloadsAllTime?: number;
	tags?: string[];
	description?: string;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Values lifted from the dataset card data. Each entry may be a single
 * string or a list of strings; `dataset_info` is kept as an opaque record.
 */
interface DatasetMetadata {
	language?: string | string[];
	license?: string | string[];
	task_categories?: string | string[];
	size_categories?: string | string[];
	dataset_info?: Record<string, unknown>;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Full details record: the core fields plus the optional extended and
 * card-metadata sections.
 */
interface DatasetInformation extends DatasetBasicInfo {
	extended?: DatasetExtendedInfo;
	metadata?: DatasetMetadata;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Looks up a single dataset through `datasetInfo` from `@huggingface/hub`
 * and returns the result as formatted text.
 */
export class DatasetDetailTool {
	private readonly hubUrl?: string;
	private readonly accessToken?: string;

	/**
	 * @param hfToken Optional Hugging Face token, sent as the request credentials when set
	 * @param hubUrl Optional custom hub URL, forwarded to the hub call when set
	 */
	constructor(hfToken?: string, hubUrl?: string) {
		this.accessToken = hfToken;
		this.hubUrl = hubUrl;
	}

	/**
	 * Fetch one dataset and format its details.
	 *
	 * @param datasetId The dataset ID to look up
	 * @returns Text produced by `formatDatasetDetails`
	 * @throws Error with the message prefixed by "Failed to get dataset details: "
	 *         when the lookup throws an `Error`; any other thrown value is rethrown as-is
	 */
	async getDetails(datasetId: string): Promise<string> {
		try {
			// Fields requested on top of what the hub library returns by default
			const extraFields = ['author', 'downloadsAllTime', 'tags', 'description', 'cardData'] as const;

			const info = await datasetInfo<(typeof extraFields)[number]>({
				name: datasetId,
				additionalFields: Array.from(extraFields),
				...(this.accessToken && { credentials: { accessToken: this.accessToken } }),
				...(this.hubUrl && { hubUrl: this.hubUrl }),
			});

			const details: DatasetInformation = {
				// The ID is echoed from the request; everything else comes from the response
				id: datasetId,
				name: info.name,
				downloads: info.downloads,
				likes: info.likes,
				private: info.private,
				gated: info.gated,
				updatedAt: info.updatedAt,
				extended: {
					author: info.author,
					downloadsAllTime: info.downloadsAllTime,
					tags: info.tags,
					description: info.description,
				},
			};

			if (info.cardData) {
				const card = info.cardData as Record<string, unknown>;

				// Copy across only the keys that exist on the card (values are not validated)
				const metadata: DatasetMetadata = {
					...('language' in card && { language: card.language as string | string[] }),
					...('license' in card && { license: card.license as string | string[] }),
					...('task_categories' in card && { task_categories: card.task_categories as string | string[] }),
					...('size_categories' in card && { size_categories: card.size_categories as string | string[] }),
					...('dataset_info' in card && { dataset_info: card.dataset_info as Record<string, unknown> }),
				};

				// Leave the metadata section off entirely when nothing was found
				if (Object.keys(metadata).length > 0) {
					details.metadata = metadata;
				}
			}

			// File (siblings) listing is not part of the requested fields and
			// would need a separate call.
			return formatDatasetDetails(details);
		} catch (error) {
			if (!(error instanceof Error)) {
				throw error;
			}
			throw new Error(`Failed to get dataset details: ${error.message}`);
		}
	}
}
|
|
159
|
+
|
|
160
|
+
// Formatting Function
/**
 * Render a dataset details record as Markdown text.
 *
 * Sections, in order: title, optional description, overview (author,
 * statistics, update date, status), optional tags, optional metadata,
 * and a link to the dataset page. Lines are joined with "\n".
 *
 * @param dataset The details record to render
 * @returns The rendered text
 */
function formatDatasetDetails(dataset: DatasetInformation): string {
	const out: string[] = [];
	const extended = dataset.extended;

	// Namespace part of an "owner/name" style name; empty when there is no slash
	const ownerFromName = dataset.name.includes('/') ? dataset.name.split('/')[0] : '';

	out.push(`# ${dataset.name}`, '');

	if (extended?.description) {
		out.push('## Description', extended.description, '');
	}

	out.push('## Overview');

	// Prefer the explicit author, fall back to the owner parsed from the name
	const author = extended?.author || ownerFromName;
	if (author) {
		out.push(`- **Author:** ${author}`);
	}

	// Statistics: zero or missing values are left out
	const stats: string[] = [];
	if (extended?.downloadsAllTime) {
		stats.push(`**Downloads:** ${formatNumber(extended.downloadsAllTime)}`);
	}
	if (dataset.likes) {
		stats.push(`**Likes:** ${dataset.likes.toString()}`);
	}
	if (stats.length > 0) {
		out.push(`- ${stats.join(' | ')}`);
	}

	out.push(`- **Updated:** ${formatDate(dataset.updatedAt)}`);

	const flags: string[] = [];
	if (dataset.gated) {
		flags.push('🔒 Gated');
	}
	if (dataset.private) {
		flags.push('🔐 Private');
	}
	if (flags.length > 0) {
		out.push(`- **Status:** ${flags.join(' | ')}`);
	}
	out.push('');

	const tags = extended?.tags;
	if (tags && tags.length > 0) {
		out.push('## Tags', tags.map((tag) => `\`${tag}\``).join(' '), '');
	}

	const card = dataset.metadata;
	if (card) {
		// Bullet prefix paired with the card value it introduces, in output order
		const fields: Array<[string, string | string[] | undefined]> = [
			['- **Language:** ', card.language],
			['- **License:** ', card.license],
			['- **Task Categories:** ', card.task_categories],
			['- **Size Category:** ', card.size_categories],
		];

		const bullets: string[] = [];
		for (const [prefix, value] of fields) {
			if (value) {
				const text = Array.isArray(value) ? value.join(', ') : value;
				bullets.push(`${prefix}${text}`);
			}
		}

		if (bullets.length > 0) {
			out.push('## Metadata', ...bullets, '');
		}
	}

	// The link is built from the dataset name, which is always present
	out.push(`**Link:** [https://hf.co/datasets/${dataset.name}](https://hf.co/datasets/${dataset.name})`);

	return out.join('\n');
}
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { HfApiCall } from './hf-api-call.js';
|
|
3
|
+
import { formatDate, formatNumber } from './utilities.js';
|
|
4
|
+
/** Maximum number of tags listed per dataset in the search output. */
const TAGS_TO_RETURN = 20;

// Dataset Search Tool Configuration
/**
 * Tool definition for dataset search: tool name, description, zod input
 * schema and annotation hints.
 */
export const DATASET_SEARCH_TOOL_CONFIG = {
	name: 'dataset_search',
	description:
		'Find Datasets hosted on the Hugging Face hub. ' +
		'Returns comprehensive information about matching datasets including downloads, likes, tags, and direct links. ' +
		'Include links to the datasets in your response',
	schema: z.object({
		query: z
			.string()
			.optional()
			.describe(
				'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending datasets", "Top 10 most recent datasets" etc.'
			),
		author: z
			.string()
			.optional()
			.describe("Organization or user who created the dataset (e.g., 'google', 'facebook', 'allenai')"),
		tags: z
			.array(z.string())
			.optional()
			.describe(
				"Tags to filter datasets (e.g., ['language:en', 'size_categories:1M<n<10M', 'task_categories:text-classification'])"
			),
		sort: z
			.enum(['trendingScore', 'downloads', 'likes', 'createdAt', 'lastModified'])
			.optional()
			.describe('Sort order: trendingScore, downloads, likes, createdAt, lastModified'),
		// A result count must be a whole number; fractional values would otherwise
		// be sent to the API verbatim (e.g. "2.5").
		limit: z.number().int().min(1).max(100).optional().default(20).describe('Maximum number of results to return'),
	}),
	annotations: {
		title: 'Dataset Search',
		destructiveHint: false,
		readOnlyHint: true,
		openWorldHint: true,
	},
} as const;

// Define search parameter types
/** Input accepted by the dataset search tool, inferred from its zod schema. */
export type DatasetSearchParams = z.infer<typeof DATASET_SEARCH_TOOL_CONFIG.schema>;
|
|
45
|
+
|
|
46
|
+
/**
 * Parameters passed to the datasets API call. All values are strings and
 * every key is optional.
 */
interface DatasetApiParams {
	/** Free-text search term */
	search?: string;
	author?: string;
	/** Filter expression; multiple tags are joined with "," by the caller */
	filter?: string;
	sort?: string;
	/** Sort direction; the code in this file only ever sends "-1" */
	direction?: string;
	/** Result limit, already converted to a string */
	limit?: string;
}
|
|
55
|
+
|
|
56
|
+
/**
 * One entry of the array returned by the datasets API call, as consumed
 * by the search formatter in this file.
 */
interface DatasetApiResult {
	_id: string;
	/** Dataset ID; used for the result heading and the link */
	id: string;
	author: string;
	likes: number;
	downloads: number;
	trendingScore?: number;
	private: boolean;
	/**
	 * Falsy when the dataset is not gated. Gated datasets may be reported with
	 * the gating mode ('auto' | 'manual') rather than `true`, so both forms are
	 * accepted here; consumers should only test truthiness.
	 */
	gated: boolean | 'auto' | 'manual';
	tags: string[];
	/** Creation timestamp as a string */
	createdAt: string;
	/** Last-modified timestamp as a string */
	lastModified: string;
	description?: string;
	sha: string;
}
|
|
72
|
+
/**
 * Dataset search built on `HfApiCall`, targeting
 * https://huggingface.co/api/datasets.
 */
export class DatasetSearchTool extends HfApiCall<DatasetApiParams, DatasetApiResult[]> {
	/**
	 * @param hfToken Optional Hugging Face token, passed through to the base class
	 */
	constructor(hfToken?: string) {
		super('https://huggingface.co/api/datasets', hfToken);
	}

	/**
	 * Search for datasets using the tool's input parameters.
	 *
	 * Only parameters with a truthy value are sent. When a sort field is
	 * given, the direction is always "-1" (descending).
	 *
	 * @param params Search parameters; every field is optional
	 * @returns Formatted results, or a fixed message when nothing matched
	 * @throws Error with the message prefixed by "Failed to search for datasets: "
	 *         when an `Error` is thrown; any other thrown value is rethrown as-is
	 */
	async searchWithParams(params: Partial<DatasetSearchParams>): Promise<string> {
		try {
			// Translate tool parameters into API parameters, omitting unset ones
			const apiParams: DatasetApiParams = {
				...(params.query ? { search: params.query } : {}),
				...(params.author ? { author: params.author } : {}),
				...(params.tags && params.tags.length > 0 ? { filter: params.tags.join(',') } : {}),
				...(params.sort ? { sort: params.sort, direction: '-1' } : {}),
				...(params.limit ? { limit: params.limit.toString() } : {}),
			};

			const found = await this.callApi<DatasetApiResult[]>(apiParams);

			return found.length === 0 ? `No datasets found for the given criteria.` : formatSearchResults(found, params);
		} catch (error) {
			if (!(error instanceof Error)) {
				throw error;
			}
			throw new Error(`Failed to search for datasets: ${error.message}`);
		}
	}

	/**
	 * Search for datasets matching a single filter (e.g., arxiv:XXXX.XXXXX),
	 * sorted by downloads in descending order.
	 *
	 * @param filter Filter expression sent to the API unchanged
	 * @param limit Maximum number of results to request (default 10)
	 * @returns Formatted results, or a message naming the filter when nothing matched
	 * @throws Error with the message prefixed by "Failed to search for datasets: "
	 *         when an `Error` is thrown; any other thrown value is rethrown as-is
	 */
	async searchWithFilter(filter: string, limit: number = 10): Promise<string> {
		try {
			const found = await this.callApi<DatasetApiResult[]>({
				filter,
				limit: limit.toString(),
				sort: 'downloads',
				direction: '-1',
			});

			if (found.length > 0) {
				return formatSearchResults(found, { limit });
			}
			return `No datasets found referencing ${filter}.`;
		} catch (error) {
			if (!(error instanceof Error)) {
				throw error;
			}
			throw new Error(`Failed to search for datasets: ${error.message}`);
		}
	}
}
|
|
161
|
+
|
|
162
|
+
// Formatting Function
/**
 * Render dataset search results as Markdown text.
 *
 * The first line summarises the result count and the criteria used; it reads
 * "Showing first N" when the count equals the requested limit and "Found N"
 * otherwise. Each dataset then gets a section with its description (cut to
 * 200 characters), statistics, tags (at most TAGS_TO_RETURN), status, dates
 * and link, followed by a "---" separator.
 *
 * @param datasets Results to render
 * @param params Parameters the search was run with; used only for the summary line
 * @returns The rendered text, lines joined with "\n"
 */
function formatSearchResults(datasets: DatasetApiResult[], params: Partial<DatasetSearchParams>): string {
	const out: string[] = [];

	// Describe the criteria that were applied
	const criteria: string[] = [];
	if (params.query) {
		criteria.push(`query "${params.query}"`);
	}
	if (params.author) {
		criteria.push(`author "${params.author}"`);
	}
	if (params.tags && params.tags.length > 0) {
		criteria.push(`tags [${params.tags.join(', ')}]`);
	}
	if (params.sort) {
		criteria.push(`sorted by ${params.sort} (descending)`);
	}
	const suffix = criteria.length > 0 ? ` matching ${criteria.join(', ')}` : '';

	// Hitting the limit exactly suggests there may be more results than shown
	if (datasets.length === params.limit) {
		out.push(`Showing first ${datasets.length.toString()} datasets${suffix}:`);
	} else {
		out.push(`Found ${datasets.length.toString()} datasets${suffix}:`);
	}
	out.push('');

	for (const entry of datasets) {
		out.push(`## ${entry.id}`, '');

		if (entry.description) {
			const ellipsis = entry.description.length > 200 ? '...' : '';
			out.push(`${entry.description.substring(0, 200)}${ellipsis}`, '');
		}

		// Statistics: zero or missing values are left out
		const facts: string[] = [];
		if (entry.downloads) {
			facts.push(`**Downloads:** ${formatNumber(entry.downloads)}`);
		}
		if (entry.likes) {
			facts.push(`**Likes:** ${entry.likes.toString()}`);
		}
		if (entry.trendingScore) {
			facts.push(`**Trending Score:** ${entry.trendingScore.toString()}`);
		}
		if (facts.length > 0) {
			out.push(facts.join(' | '), '');
		}

		if (entry.tags && entry.tags.length > 0) {
			const hidden = entry.tags.length - TAGS_TO_RETURN;
			out.push(`**Tags:** ${entry.tags.slice(0, TAGS_TO_RETURN).join(', ')}`);
			if (hidden > 0) {
				out.push(`*and ${hidden.toString()} more...*`);
			}
			out.push('');
		}

		const flags: string[] = [];
		if (entry.gated) {
			flags.push('🔒 Gated');
		}
		if (entry.private) {
			flags.push('🔐 Private');
		}
		if (flags.length > 0) {
			out.push(flags.join(' | '), '');
		}

		if (entry.createdAt) {
			out.push(`**Created:** ${formatDate(entry.createdAt)}`);
		}
		// Skip the modified date when it merely repeats the creation date
		if (entry.lastModified && entry.lastModified !== entry.createdAt) {
			out.push(`**Last Modified:** ${formatDate(entry.lastModified)}`);
		}

		out.push(`**Link:** [https://hf.co/datasets/${entry.id}](https://hf.co/datasets/${entry.id})`, '', '---', '');
	}

	return out.join('\n');
}
|