@llmindset/hf-mcp 0.3.11 → 0.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dataset-detail.js +1 -1
- package/dist/dataset-detail.js.map +1 -1
- package/dist/dataset-viewer-inspect.d.ts +48 -0
- package/dist/dataset-viewer-inspect.d.ts.map +1 -0
- package/dist/dataset-viewer-inspect.js +660 -0
- package/dist/dataset-viewer-inspect.js.map +1 -0
- package/dist/dataset-viewer-inspect.test.d.ts +2 -0
- package/dist/dataset-viewer-inspect.test.d.ts.map +1 -0
- package/dist/dataset-viewer-inspect.test.js +218 -0
- package/dist/dataset-viewer-inspect.test.js.map +1 -0
- package/dist/gradio-files.d.ts +2 -2
- package/dist/hub-inspect.d.ts +17 -0
- package/dist/hub-inspect.d.ts.map +1 -1
- package/dist/hub-inspect.js +68 -4
- package/dist/hub-inspect.js.map +1 -1
- package/dist/hub-inspect.test.d.ts +2 -0
- package/dist/hub-inspect.test.d.ts.map +1 -0
- package/dist/hub-inspect.test.js +24 -0
- package/dist/hub-inspect.test.js.map +1 -0
- package/dist/index.browser.d.ts.map +1 -1
- package/dist/index.browser.js +2 -1
- package/dist/index.browser.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/jobs/commands/run.d.ts.map +1 -1
- package/dist/jobs/commands/run.js +7 -1
- package/dist/jobs/commands/run.js.map +1 -1
- package/dist/jobs/commands/scheduled.d.ts.map +1 -1
- package/dist/jobs/commands/scheduled.js +2 -0
- package/dist/jobs/commands/scheduled.js.map +1 -1
- package/dist/jobs/commands/utils.d.ts +3 -1
- package/dist/jobs/commands/utils.d.ts.map +1 -1
- package/dist/jobs/commands/utils.js +76 -1
- package/dist/jobs/commands/utils.js.map +1 -1
- package/dist/jobs/jobs-tool.d.ts.map +1 -1
- package/dist/jobs/jobs-tool.js +60 -3
- package/dist/jobs/jobs-tool.js.map +1 -1
- package/dist/jobs/types.d.ts +22 -0
- package/dist/jobs/types.d.ts.map +1 -1
- package/dist/jobs/types.js +16 -1
- package/dist/jobs/types.js.map +1 -1
- package/dist/model-detail.js +1 -1
- package/dist/model-detail.js.map +1 -1
- package/dist/readme-utils.d.ts +1 -1
- package/dist/readme-utils.d.ts.map +1 -1
- package/dist/readme-utils.js +2 -13
- package/dist/readme-utils.js.map +1 -1
- package/package.json +1 -1
- package/src/dataset-detail.ts +1 -1
- package/src/dataset-viewer-inspect.test.ts +234 -0
- package/src/dataset-viewer-inspect.ts +809 -0
- package/src/hub-inspect.test.ts +28 -0
- package/src/hub-inspect.ts +88 -4
- package/src/index.browser.ts +2 -1
- package/src/index.ts +1 -0
- package/src/jobs/commands/run.ts +7 -1
- package/src/jobs/commands/scheduled.ts +2 -0
- package/src/jobs/commands/utils.ts +95 -5
- package/src/jobs/jobs-tool.ts +60 -3
- package/src/jobs/types.ts +35 -1
- package/src/model-detail.ts +1 -1
- package/src/readme-utils.ts +2 -32
- package/test/jobs/command-translation.spec.ts +88 -2
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
|
2
|
+
import { HUB_REPO_DETAILS_TOOL_CONFIG } from './hub-inspect.js';
|
|
3
|
+
|
|
4
|
+
describe('HUB_REPO_DETAILS_TOOL_CONFIG', () => {
|
|
5
|
+
it('defaults to overview and accepts dataset viewer operations', () => {
|
|
6
|
+
const parsed = HUB_REPO_DETAILS_TOOL_CONFIG.schema.parse({
|
|
7
|
+
repo_ids: ['rajpurkar/squad'],
|
|
8
|
+
repo_type: 'dataset',
|
|
9
|
+
operations: ['dataset_structure', 'dataset_preview'],
|
|
10
|
+
config: 'plain_text',
|
|
11
|
+
split: 'train',
|
|
12
|
+
offset: 0,
|
|
13
|
+
limit: 5,
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
expect(parsed.include_readme).toBe(false);
|
|
17
|
+
expect(parsed.operations).toEqual(['dataset_structure', 'dataset_preview']);
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
it('does not expose a redundant readme operation', () => {
|
|
21
|
+
expect(() =>
|
|
22
|
+
HUB_REPO_DETAILS_TOOL_CONFIG.schema.parse({
|
|
23
|
+
repo_ids: ['rajpurkar/squad'],
|
|
24
|
+
operations: ['readme'],
|
|
25
|
+
})
|
|
26
|
+
).toThrow();
|
|
27
|
+
});
|
|
28
|
+
});
|
package/src/hub-inspect.ts
CHANGED
|
@@ -4,12 +4,17 @@ import { ModelDetailTool } from './model-detail.js';
|
|
|
4
4
|
import { DatasetDetailTool } from './dataset-detail.js';
|
|
5
5
|
import { spaceInfo } from '@huggingface/hub';
|
|
6
6
|
import { formatDate } from './utilities.js';
|
|
7
|
+
import { DatasetViewerInspector } from './dataset-viewer-inspect.js';
|
|
8
|
+
|
|
9
|
+
const HUB_INSPECT_OPERATIONS = ['overview', 'dataset_structure', 'dataset_preview'] as const;
|
|
7
10
|
|
|
8
11
|
export const HUB_REPO_DETAILS_TOOL_CONFIG = {
|
|
9
12
|
name: 'hub_repo_details',
|
|
10
13
|
description:
|
|
11
14
|
'Get details for one or more Hugging Face repos (model, dataset, or space). ' +
|
|
12
|
-
'Auto-detects type unless specified.'
|
|
15
|
+
'Auto-detects type unless specified. For datasets, use operations: overview, dataset_structure, dataset_preview. ' +
|
|
16
|
+
'Use dataset_structure first to discover configs, splits, sizes, and schema. Use dataset_preview only when ' +
|
|
17
|
+
'config and split are known, unless the dataset has a single config/split.',
|
|
13
18
|
schema: z.object({
|
|
14
19
|
repo_ids: z
|
|
15
20
|
.array(z.string().min(1))
|
|
@@ -18,6 +23,26 @@ export const HUB_REPO_DETAILS_TOOL_CONFIG = {
|
|
|
18
23
|
.describe('Repo IDs for (models|dataset/space) - usually in author/name format (e.g. openai/gpt-oss-120b)'),
|
|
19
24
|
repo_type: z.enum(['model', 'dataset', 'space']).optional().describe('Specify lookup type; otherwise auto-detects'),
|
|
20
25
|
include_readme: z.boolean().default(false).describe('Include README from the repo'),
|
|
26
|
+
operations: z
|
|
27
|
+
.array(z.enum(HUB_INSPECT_OPERATIONS))
|
|
28
|
+
.optional()
|
|
29
|
+
.describe(
|
|
30
|
+
'Details to return. Defaults to ["overview"]. For datasets, prefer ["overview", "dataset_structure"] first; then call ["dataset_preview"] with config and split.'
|
|
31
|
+
),
|
|
32
|
+
config: z
|
|
33
|
+
.string()
|
|
34
|
+
.optional()
|
|
35
|
+
.describe(
|
|
36
|
+
'Dataset Viewer config. Required for dataset_preview when the dataset has multiple config/split options. Discover via dataset_structure.'
|
|
37
|
+
),
|
|
38
|
+
split: z
|
|
39
|
+
.string()
|
|
40
|
+
.optional()
|
|
41
|
+
.describe(
|
|
42
|
+
'Dataset Viewer split. Required for dataset_preview when the dataset has multiple config/split options. Discover via dataset_structure.'
|
|
43
|
+
),
|
|
44
|
+
offset: z.number().int().nonnegative().optional().describe('Row offset for dataset_preview. Defaults to 0.'),
|
|
45
|
+
limit: z.number().int().optional().describe('Row count for dataset_preview. Defaults to 5 and is clamped to 1-100.'),
|
|
21
46
|
}),
|
|
22
47
|
annotations: {
|
|
23
48
|
title: 'Hub Repo Details',
|
|
@@ -32,11 +57,13 @@ export type HubInspectParams = z.infer<typeof HUB_REPO_DETAILS_TOOL_CONFIG.schem
|
|
|
32
57
|
export class HubInspectTool {
|
|
33
58
|
private readonly modelDetail: ModelDetailTool;
|
|
34
59
|
private readonly datasetDetail: DatasetDetailTool;
|
|
60
|
+
private readonly datasetViewer: DatasetViewerInspector;
|
|
35
61
|
private readonly hubUrl?: string;
|
|
36
62
|
|
|
37
63
|
constructor(hfToken?: string, hubUrl?: string) {
|
|
38
64
|
this.modelDetail = new ModelDetailTool(hfToken, hubUrl);
|
|
39
65
|
this.datasetDetail = new DatasetDetailTool(hfToken, hubUrl);
|
|
66
|
+
this.datasetViewer = new DatasetViewerInspector(hfToken, { hubUrl });
|
|
40
67
|
this.hubUrl = hubUrl;
|
|
41
68
|
}
|
|
42
69
|
|
|
@@ -46,7 +73,7 @@ export class HubInspectTool {
|
|
|
46
73
|
|
|
47
74
|
for (const id of params.repo_ids) {
|
|
48
75
|
try {
|
|
49
|
-
const section = await this.inspectSingle(id, params
|
|
76
|
+
const section = await this.inspectSingle(id, params, includeReadme);
|
|
50
77
|
parts.push(section);
|
|
51
78
|
successCount += 1;
|
|
52
79
|
} catch (err) {
|
|
@@ -66,20 +93,30 @@ export class HubInspectTool {
|
|
|
66
93
|
|
|
67
94
|
private async inspectSingle(
|
|
68
95
|
repoId: string,
|
|
69
|
-
|
|
96
|
+
params: HubInspectParams,
|
|
70
97
|
includeReadme: boolean
|
|
71
98
|
): Promise<string> {
|
|
99
|
+
const type = params.repo_type;
|
|
100
|
+
const operations = normalizeOperations(params.operations);
|
|
101
|
+
const hasDatasetOperation = operations.some((operation) => operation === 'dataset_structure' || operation === 'dataset_preview');
|
|
102
|
+
|
|
72
103
|
// If caller constrained the type, do only that
|
|
73
104
|
if (type === 'model') {
|
|
105
|
+
if (hasDatasetOperation) return operationMismatch(repoId, 'model', operations);
|
|
74
106
|
return (await this.modelDetail.getDetails(repoId, includeReadme)).formatted;
|
|
75
107
|
}
|
|
76
108
|
if (type === 'dataset') {
|
|
77
|
-
return
|
|
109
|
+
return await this.getDatasetDetails(repoId, params, includeReadme, operations);
|
|
78
110
|
}
|
|
79
111
|
if (type === 'space') {
|
|
112
|
+
if (hasDatasetOperation) return operationMismatch(repoId, 'space', operations);
|
|
80
113
|
return await this.getSpaceDetails(repoId);
|
|
81
114
|
}
|
|
82
115
|
|
|
116
|
+
if (hasDatasetOperation) {
|
|
117
|
+
return await this.getDatasetDetails(repoId, params, includeReadme, operations);
|
|
118
|
+
}
|
|
119
|
+
|
|
83
120
|
// Auto-detect: attempt all three and aggregate. The same id may exist for multiple types.
|
|
84
121
|
const matches: string[] = [];
|
|
85
122
|
|
|
@@ -111,6 +148,33 @@ export class HubInspectTool {
|
|
|
111
148
|
return matches.join('\n\n---\n\n');
|
|
112
149
|
}
|
|
113
150
|
|
|
151
|
+
private async getDatasetDetails(
|
|
152
|
+
repoId: string,
|
|
153
|
+
params: HubInspectParams,
|
|
154
|
+
includeReadme: boolean,
|
|
155
|
+
operations: HubInspectOperation[]
|
|
156
|
+
): Promise<string> {
|
|
157
|
+
const sections: string[] = [];
|
|
158
|
+
if (operations.includes('overview')) {
|
|
159
|
+
const overview = (await this.datasetDetail.getDetails(repoId, includeReadme)).formatted;
|
|
160
|
+
sections.push(`${overview}\n\n${datasetDrillDownHint()}`);
|
|
161
|
+
}
|
|
162
|
+
if (operations.includes('dataset_structure')) {
|
|
163
|
+
sections.push(await this.datasetViewer.getStructure(repoId, { config: params.config, split: params.split }));
|
|
164
|
+
}
|
|
165
|
+
if (operations.includes('dataset_preview')) {
|
|
166
|
+
sections.push(
|
|
167
|
+
await this.datasetViewer.getPreview(repoId, {
|
|
168
|
+
config: params.config,
|
|
169
|
+
split: params.split,
|
|
170
|
+
offset: params.offset,
|
|
171
|
+
limit: params.limit,
|
|
172
|
+
})
|
|
173
|
+
);
|
|
174
|
+
}
|
|
175
|
+
return sections.join('\n\n');
|
|
176
|
+
}
|
|
177
|
+
|
|
114
178
|
private async getSpaceDetails(spaceId: string): Promise<string> {
|
|
115
179
|
const additionalFields = ['author', 'tags', 'runtime', 'subdomain', 'sha'] as const;
|
|
116
180
|
const info = await spaceInfo<(typeof additionalFields)[number]>({
|
|
@@ -142,3 +206,23 @@ export class HubInspectTool {
|
|
|
142
206
|
return lines.join('\n');
|
|
143
207
|
}
|
|
144
208
|
}
|
|
209
|
+
|
|
210
|
+
type HubInspectOperation = (typeof HUB_INSPECT_OPERATIONS)[number];
|
|
211
|
+
|
|
212
|
+
function normalizeOperations(operations: readonly HubInspectOperation[] | undefined): HubInspectOperation[] {
|
|
213
|
+
return operations && operations.length > 0 ? [...new Set(operations)] : ['overview'];
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
function operationMismatch(repoId: string, type: 'model' | 'space', operations: HubInspectOperation[]): string {
|
|
217
|
+
const requested = operations.filter((operation) => operation.startsWith('dataset_')).join(', ');
|
|
218
|
+
return `# ${repoId}\n\nRequested dataset operation(s) \`${requested}\`, but this repo was requested as a ${type}. Dataset Viewer operations only apply to dataset repos.`;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
function datasetDrillDownHint(): string {
|
|
222
|
+
return [
|
|
223
|
+
'## Available deeper inspections',
|
|
224
|
+
'Call `hub_repo_details` with:',
|
|
225
|
+
'- `operations: ["dataset_structure"]` for configs, splits, sizes, parquet exports, and schema.',
|
|
226
|
+
'- `operations: ["dataset_preview"]` with `config` and `split` for sample rows.',
|
|
227
|
+
].join('\n');
|
|
228
|
+
}
|
package/src/index.browser.ts
CHANGED
|
@@ -75,7 +75,8 @@ export const HUB_REPO_DETAILS_TOOL_CONFIG: BrowserToolConfig = {
|
|
|
75
75
|
name: 'hub_repo_details',
|
|
76
76
|
description:
|
|
77
77
|
'Get details for one or more Hugging Face repos (model, dataset, or space). ' +
|
|
78
|
-
'Auto-detects type unless specified.'
|
|
78
|
+
'Auto-detects type unless specified. For datasets, use dataset_structure first to discover configs, splits, ' +
|
|
79
|
+
'sizes, and schema. Use dataset_preview only when config and split are known, unless the dataset has a single config/split.',
|
|
79
80
|
annotations: {
|
|
80
81
|
title: 'Hub Repo Details',
|
|
81
82
|
destructiveHint: false,
|
package/src/index.ts
CHANGED
|
@@ -10,6 +10,7 @@ export * from './dataset-search.js';
|
|
|
10
10
|
export * from './repo-search.js';
|
|
11
11
|
export * from './create-repo.js';
|
|
12
12
|
export * from './dataset-detail.js';
|
|
13
|
+
export * from './dataset-viewer-inspect.js';
|
|
13
14
|
export * from './hub-inspect.js';
|
|
14
15
|
export * from './duplicate-space.js';
|
|
15
16
|
export * from './space-info.js';
|
package/src/jobs/commands/run.ts
CHANGED
|
@@ -18,6 +18,7 @@ export async function runCommand(args: RunArgs, client: JobsApiClient, token?: s
|
|
|
18
18
|
secrets: args.secrets,
|
|
19
19
|
timeout: args.timeout,
|
|
20
20
|
hfToken: token,
|
|
21
|
+
volumes: args.volumes,
|
|
21
22
|
});
|
|
22
23
|
|
|
23
24
|
// Submit job
|
|
@@ -39,7 +40,11 @@ export async function runCommand(args: RunArgs, client: JobsApiClient, token?: s
|
|
|
39
40
|
|
|
40
41
|
// Not detached - fetch logs
|
|
41
42
|
const logsUrl = client.getLogsUrl(job.id, job.owner.name);
|
|
42
|
-
const logResult = await fetchJobLogs(logsUrl, {
|
|
43
|
+
const logResult = await fetchJobLogs(logsUrl, {
|
|
44
|
+
token,
|
|
45
|
+
maxDuration: DEFAULT_LOG_WAIT_MS,
|
|
46
|
+
maxLines: DEFAULT_MAX_LOG_LINES,
|
|
47
|
+
});
|
|
43
48
|
|
|
44
49
|
let response = `Job started: ${job.id}\n\n`;
|
|
45
50
|
|
|
@@ -80,6 +85,7 @@ export async function uvCommand(args: UvArgs, client: JobsApiClient, token?: str
|
|
|
80
85
|
timeout: args.timeout,
|
|
81
86
|
detach: args.detach,
|
|
82
87
|
namespace: args.namespace,
|
|
88
|
+
volumes: args.volumes,
|
|
83
89
|
};
|
|
84
90
|
|
|
85
91
|
return runCommand(runArgs, client, token);
|
|
@@ -28,6 +28,7 @@ export async function scheduledRunCommand(
|
|
|
28
28
|
secrets: args.secrets,
|
|
29
29
|
timeout: args.timeout,
|
|
30
30
|
hfToken: token,
|
|
31
|
+
volumes: args.volumes,
|
|
31
32
|
});
|
|
32
33
|
|
|
33
34
|
// Create scheduled job spec
|
|
@@ -78,6 +79,7 @@ export async function scheduledUvCommand(
|
|
|
78
79
|
timeout: args.timeout,
|
|
79
80
|
detach: args.detach,
|
|
80
81
|
namespace: args.namespace,
|
|
82
|
+
volumes: args.volumes,
|
|
81
83
|
};
|
|
82
84
|
|
|
83
85
|
return scheduledRunCommand(scheduledRunArgs, client, token);
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { JobSpec } from '../types.js';
|
|
1
|
+
import type { JobSpec, JobVolume, JobVolumeType } from '../types.js';
|
|
2
2
|
import { parse as parseShellArgs } from 'shell-quote';
|
|
3
3
|
|
|
4
4
|
interface EnvToken {
|
|
@@ -7,6 +7,17 @@ interface EnvToken {
|
|
|
7
7
|
}
|
|
8
8
|
|
|
9
9
|
const SPECIAL_PARAMS = new Set(['*', '@', '#', '?', '!', '-', '_']);
|
|
10
|
+
const HF_VOLUME_PREFIX = 'hf://';
|
|
11
|
+
const VOLUME_FORMAT_HELP =
|
|
12
|
+
'Expected format: hf://[TYPE/]OWNER/NAME[/PATH]:/MOUNT_PATH[:ro|:rw], ' +
|
|
13
|
+
'e.g. hf://datasets/org/dataset:/data:ro or hf://buckets/org/bucket:/output.';
|
|
14
|
+
const HF_VOLUME_TYPES: Record<string, JobVolumeType> = {
|
|
15
|
+
models: 'model',
|
|
16
|
+
datasets: 'dataset',
|
|
17
|
+
spaces: 'space',
|
|
18
|
+
buckets: 'bucket',
|
|
19
|
+
};
|
|
20
|
+
const SINGULAR_VOLUME_TYPES = new Set(['model', 'dataset', 'space', 'bucket']);
|
|
10
21
|
|
|
11
22
|
function isEnvToken(entry: unknown): entry is EnvToken {
|
|
12
23
|
return Boolean(entry && typeof entry === 'object' && (entry as EnvToken).type === 'env');
|
|
@@ -50,9 +61,7 @@ export function parseTimeout(timeout: string): number {
|
|
|
50
61
|
if (!isNaN(seconds)) {
|
|
51
62
|
return seconds;
|
|
52
63
|
}
|
|
53
|
-
throw new Error(
|
|
54
|
-
`Invalid timeout format: "${timeout}". Use format like "5m", "2h", "30s", or plain seconds.`
|
|
55
|
-
);
|
|
64
|
+
throw new Error(`Invalid timeout format: "${timeout}". Use format like "5m", "2h", "30s", or plain seconds.`);
|
|
56
65
|
}
|
|
57
66
|
|
|
58
67
|
const value = parseFloat(match[1]);
|
|
@@ -93,7 +102,7 @@ export function parseCommand(command: string | string[]): { command: string[]; a
|
|
|
93
102
|
}
|
|
94
103
|
|
|
95
104
|
// Parse the command string using shell-quote for POSIX-compliant parsing
|
|
96
|
-
const parsed = parseShellArgs<EnvToken>(command, key => ({ type: 'env', key }));
|
|
105
|
+
const parsed = parseShellArgs<EnvToken>(command, (key) => ({ type: 'env', key }));
|
|
97
106
|
|
|
98
107
|
// Convert parsed result to string array
|
|
99
108
|
// shell-quote can return various types (strings, objects for operators, etc.)
|
|
@@ -121,6 +130,82 @@ export function parseCommand(command: string | string[]): { command: string[]; a
|
|
|
121
130
|
return { command: stringArgs, arguments: [] };
|
|
122
131
|
}
|
|
123
132
|
|
|
133
|
+
function invalidVolume(rawSpec: string, message: string): Error {
|
|
134
|
+
return new Error(`Invalid volume "${rawSpec}". ${message} ${VOLUME_FORMAT_HELP}`);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
function parseVolume(rawSpec: string): JobVolume {
|
|
138
|
+
let spec = rawSpec;
|
|
139
|
+
let readOnly: boolean | undefined;
|
|
140
|
+
|
|
141
|
+
if (spec.endsWith(':ro')) {
|
|
142
|
+
readOnly = true;
|
|
143
|
+
spec = spec.slice(0, -3);
|
|
144
|
+
} else if (spec.endsWith(':rw')) {
|
|
145
|
+
readOnly = false;
|
|
146
|
+
spec = spec.slice(0, -3);
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
if (!spec.startsWith(HF_VOLUME_PREFIX)) {
|
|
150
|
+
throw invalidVolume(rawSpec, `Volume source must start with "${HF_VOLUME_PREFIX}".`);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
const body = spec.slice(HF_VOLUME_PREFIX.length);
|
|
154
|
+
const mountSeparator = body.lastIndexOf(':/');
|
|
155
|
+
if (mountSeparator === -1) {
|
|
156
|
+
throw invalidVolume(rawSpec, 'Missing mount path.');
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
const sourcePart = body.slice(0, mountSeparator);
|
|
160
|
+
const mountPath = body.slice(mountSeparator + 1);
|
|
161
|
+
if (!sourcePart) {
|
|
162
|
+
throw invalidVolume(rawSpec, 'Missing Hub source before mount path.');
|
|
163
|
+
}
|
|
164
|
+
if (!mountPath.startsWith('/') || mountPath === '/') {
|
|
165
|
+
throw invalidVolume(rawSpec, `Mount path must be a non-empty absolute path, got "${mountPath}".`);
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
const segments = sourcePart.split('/');
|
|
169
|
+
const firstSegment = segments[0];
|
|
170
|
+
if (!firstSegment) {
|
|
171
|
+
throw invalidVolume(rawSpec, 'Missing Hub source type or owner.');
|
|
172
|
+
}
|
|
173
|
+
if (SINGULAR_VOLUME_TYPES.has(firstSegment)) {
|
|
174
|
+
throw invalidVolume(rawSpec, `Type prefix must be plural, got "${firstSegment}/".`);
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
const explicitType = HF_VOLUME_TYPES[firstSegment];
|
|
178
|
+
const type = explicitType ?? 'model';
|
|
179
|
+
const locationSegments = explicitType ? segments.slice(1) : segments;
|
|
180
|
+
if (locationSegments.length < 2 || !locationSegments[0] || !locationSegments[1]) {
|
|
181
|
+
throw invalidVolume(rawSpec, 'Hub source must include OWNER/NAME.');
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
const source = `${locationSegments[0]}/${locationSegments[1]}`;
|
|
185
|
+
const path = locationSegments.slice(2).join('/') || undefined;
|
|
186
|
+
const volume: JobVolume = { type, source, mountPath };
|
|
187
|
+
|
|
188
|
+
if (readOnly !== undefined) {
|
|
189
|
+
volume.readOnly = readOnly;
|
|
190
|
+
}
|
|
191
|
+
if (path) {
|
|
192
|
+
volume.path = path;
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
return volume;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Parse hf:// volume mount strings into the Jobs API payload shape.
|
|
200
|
+
*/
|
|
201
|
+
export function parseVolumes(volumes?: string[]): JobVolume[] | undefined {
|
|
202
|
+
if (!volumes || volumes.length === 0) {
|
|
203
|
+
return undefined;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
return volumes.map(parseVolume);
|
|
207
|
+
}
|
|
208
|
+
|
|
124
209
|
/**
|
|
125
210
|
* Replace HF token placeholder with actual token if available
|
|
126
211
|
*/
|
|
@@ -162,6 +247,7 @@ export function createJobSpec(args: {
|
|
|
162
247
|
secrets?: Record<string, string>;
|
|
163
248
|
timeout?: string;
|
|
164
249
|
hfToken?: string;
|
|
250
|
+
volumes?: string[];
|
|
165
251
|
}): JobSpec {
|
|
166
252
|
// Validate required fields
|
|
167
253
|
if (!args.image) {
|
|
@@ -176,6 +262,7 @@ export function createJobSpec(args: {
|
|
|
176
262
|
const timeoutSeconds = args.timeout ? parseTimeout(args.timeout) : undefined;
|
|
177
263
|
const environment = transformEnvMap(args.env, args.hfToken) || {};
|
|
178
264
|
const secrets = transformEnvMap(args.secrets, args.hfToken) || {};
|
|
265
|
+
const volumes = parseVolumes(args.volumes);
|
|
179
266
|
|
|
180
267
|
const spec: JobSpec = {
|
|
181
268
|
...imageSource,
|
|
@@ -186,6 +273,9 @@ export function createJobSpec(args: {
|
|
|
186
273
|
secrets,
|
|
187
274
|
timeoutSeconds,
|
|
188
275
|
};
|
|
276
|
+
if (volumes) {
|
|
277
|
+
spec.volumes = volumes;
|
|
278
|
+
}
|
|
189
279
|
|
|
190
280
|
return spec;
|
|
191
281
|
}
|
package/src/jobs/jobs-tool.ts
CHANGED
|
@@ -337,6 +337,42 @@ Manage compute jobs on Hugging Face infrastructure.
|
|
|
337
337
|
|
|
338
338
|
${renderExampleSection('Run a simple job', 'run')}${renderExampleSection('Run a Python script with UV', 'uv')}
|
|
339
339
|
|
|
340
|
+
## Deep Hub Dataset/Repo Analysis
|
|
341
|
+
|
|
342
|
+
Use Jobs for deep analysis prompts involving Hugging Face datasets, models, Spaces, repos, traces, or large Hub files—especially when the user asks to "analyze", "find trends", "process all rows/files", "run a complete analysis", "take your time", or "install/use Python libraries".
|
|
343
|
+
|
|
344
|
+
Recommended workflow:
|
|
345
|
+
1. Inspect the repo with \`hub_repo_details\` for schema, splits, and parquet URLs.
|
|
346
|
+
2. Run \`operation: "uv"\` with a self-contained Python script; do not call \`{"operation": "uv"}\` by itself except to request help.
|
|
347
|
+
3. Always put third-party packages in \`with_deps\`; do not assume packages like \`pandas\`, \`polars\`, \`pyarrow\`, \`datasets\`, or \`huggingface_hub\` are installed. Prefer \`with_deps\` over relying on inline PEP 723 script metadata.
|
|
348
|
+
4. Prefer converted parquet URLs for Hub datasets when available; they are often more reliable for mixed JSONL/session repos than \`datasets.load_dataset(...)\`.
|
|
349
|
+
5. Print the final report at the end of the job. If the initial response only shows installation logs or partial output, call \`logs\` with the exact returned job ID and a larger \`tail\`, e.g. \`{"tail": 500}\`.
|
|
350
|
+
6. Jobs do not automatically inherit the MCP server's Hugging Face token inside the container. For private/gated data or uploads, pass \`secrets: { "HF_TOKEN": "$HF_TOKEN" }\`.
|
|
351
|
+
|
|
352
|
+
Example:
|
|
353
|
+
\`\`\`json
|
|
354
|
+
{
|
|
355
|
+
"operation": "uv",
|
|
356
|
+
"args": {
|
|
357
|
+
"with_deps": ["polars", "pyarrow", "huggingface_hub"],
|
|
358
|
+
"timeout": "60m",
|
|
359
|
+
"flavor": "cpu-upgrade",
|
|
360
|
+
"script": "import polars as pl\\nurl = 'PARQUET_URL_FROM_HUB_REPO_DETAILS'\\ndf = pl.read_parquet(url)\\nprint(df.shape)\\nprint(df.head())"
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
\`\`\`
|
|
364
|
+
|
|
365
|
+
If output is incomplete, fetch more logs:
|
|
366
|
+
\`\`\`json
|
|
367
|
+
{
|
|
368
|
+
"operation": "logs",
|
|
369
|
+
"args": {
|
|
370
|
+
"job_id": "JOB_ID_FROM_RUN_RESPONSE",
|
|
371
|
+
"tail": 500
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
\`\`\`
|
|
375
|
+
|
|
340
376
|
## Hardware Flavors
|
|
341
377
|
|
|
342
378
|
${HARDWARE_FLAVORS_SECTION}
|
|
@@ -357,6 +393,28 @@ ${HARDWARE_FLAVORS_SECTION}
|
|
|
357
393
|
- Include newline characters directly in the argument (e.g., \`"first line\\nsecond line"\`)
|
|
358
394
|
- UV inline scripts are automatically base64-decoded inside the container; just send the raw script text
|
|
359
395
|
|
|
396
|
+
## Volumes
|
|
397
|
+
|
|
398
|
+
Attach Hub repositories or buckets into the job container with \`hf://\` volume URLs.
|
|
399
|
+
|
|
400
|
+
Format: \`hf://[TYPE/]OWNER/NAME[/PATH]:/MOUNT_PATH[:ro|:rw]\`
|
|
401
|
+
|
|
402
|
+
- \`TYPE\` is one of \`models\`, \`datasets\`, \`spaces\`, or \`buckets\`; omitted type defaults to models.
|
|
403
|
+
- \`OWNER/NAME\` source IDs are required.
|
|
404
|
+
- \`:ro\` and \`:rw\` are optional; backend defaults are preserved when omitted.
|
|
405
|
+
|
|
406
|
+
Example:
|
|
407
|
+
\`\`\`json
|
|
408
|
+
{
|
|
409
|
+
"operation": "run",
|
|
410
|
+
"args": {
|
|
411
|
+
"image": "python:3.12",
|
|
412
|
+
"command": ["python", "-c", "import os; print(os.listdir('/data'))"],
|
|
413
|
+
"volumes": ["hf://datasets/org/dataset:/data:ro"]
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
\`\`\`
|
|
417
|
+
|
|
360
418
|
### Show command-specific help
|
|
361
419
|
Call this tool with:
|
|
362
420
|
\`\`\`json
|
|
@@ -378,9 +436,8 @@ Call this tool with:
|
|
|
378
436
|
export const HF_JOBS_TOOL_CONFIG = {
|
|
379
437
|
name: 'hf_jobs',
|
|
380
438
|
description:
|
|
381
|
-
'
|
|
382
|
-
'
|
|
383
|
-
'Call this tool with no operation for full usage instructions and examples. ',
|
|
439
|
+
'Remote compute for Hugging Face workflows. Run Python/UV or Docker jobs to deeply analyze Hub datasets, repos, traces, models, and large files; compute trends/statistics; run batch inference/evaluation; or perform long-running work with installed libraries. ' +
|
|
440
|
+
'Use for dataset/repo analysis prompts when local chat inspection is insufficient. Includes submit, logs, inspect, cancel, schedule, and volume mounting.',
|
|
384
441
|
schema: z.object({
|
|
385
442
|
operation: z
|
|
386
443
|
.enum(OPERATION_NAMES)
|
package/src/jobs/types.ts
CHANGED
|
@@ -60,6 +60,20 @@ export interface JobOwner {
|
|
|
60
60
|
type: 'user' | 'org';
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
+
/**
|
|
64
|
+
* Hugging Face Hub volume mounted into a Job container.
|
|
65
|
+
*/
|
|
66
|
+
export type JobVolumeType = 'bucket' | 'model' | 'dataset' | 'space';
|
|
67
|
+
|
|
68
|
+
export interface JobVolume {
|
|
69
|
+
type: JobVolumeType;
|
|
70
|
+
source: string;
|
|
71
|
+
mountPath: string;
|
|
72
|
+
revision?: string;
|
|
73
|
+
readOnly?: boolean;
|
|
74
|
+
path?: string;
|
|
75
|
+
}
|
|
76
|
+
|
|
63
77
|
/**
|
|
64
78
|
* Job information from API
|
|
65
79
|
* Based on OpenAPI schema
|
|
@@ -97,6 +111,7 @@ export interface JobSpec {
|
|
|
97
111
|
secrets?: Record<string, string>;
|
|
98
112
|
flavor: string;
|
|
99
113
|
timeoutSeconds?: number;
|
|
114
|
+
volumes?: JobVolume[];
|
|
100
115
|
}
|
|
101
116
|
|
|
102
117
|
/**
|
|
@@ -164,6 +179,14 @@ export const runArgsSchema = commonArgsSchema.extend({
|
|
|
164
179
|
.optional()
|
|
165
180
|
.describe('Secrets as key-value pairs. Use HF_TOKEN=$HF_TOKEN to include your token'),
|
|
166
181
|
timeout: z.string().optional().describe('Max duration (e.g., "5m", "2h", "30s"). Default: 30m').default('30m'),
|
|
182
|
+
volumes: z
|
|
183
|
+
.array(z.string())
|
|
184
|
+
.optional()
|
|
185
|
+
.describe(
|
|
186
|
+
'Volume mounts using hf:// URLs. Format: hf://TYPE/OWNER/NAME[/PATH]:/MOUNT_PATH[:ro|:rw]. ' +
|
|
187
|
+
'TYPE is models, datasets, spaces, or buckets. ' +
|
|
188
|
+
'Examples: ["hf://datasets/org/ds:/data:ro", "hf://buckets/org/b:/output"].'
|
|
189
|
+
),
|
|
167
190
|
detach: z
|
|
168
191
|
.boolean()
|
|
169
192
|
.optional()
|
|
@@ -187,7 +210,18 @@ export const uvArgsSchema = commonArgsSchema.extend({
|
|
|
187
210
|
.optional()
|
|
188
211
|
.describe('Secrets as key-value pairs. Use HF_TOKEN=$HF_TOKEN to include your token'),
|
|
189
212
|
timeout: z.string().optional().default('30m').describe('Max duration'),
|
|
190
|
-
|
|
213
|
+
volumes: z
|
|
214
|
+
.array(z.string())
|
|
215
|
+
.optional()
|
|
216
|
+
.describe(
|
|
217
|
+
'Volume mounts using hf:// URLs. Format: hf://TYPE/OWNER/NAME[/PATH]:/MOUNT_PATH[:ro|:rw]. ' +
|
|
218
|
+
'TYPE is models, datasets, spaces, or buckets.'
|
|
219
|
+
),
|
|
220
|
+
detach: z
|
|
221
|
+
.boolean()
|
|
222
|
+
.optional()
|
|
223
|
+
.default(false)
|
|
224
|
+
.describe('If true, return immediately with job ID. If false (default), tail logs for up to 10 seconds.'),
|
|
191
225
|
});
|
|
192
226
|
|
|
193
227
|
// PS command args
|
package/src/model-detail.ts
CHANGED
|
@@ -259,7 +259,7 @@ export class ModelDetailTool {
|
|
|
259
259
|
|
|
260
260
|
// Fetch and append README content if requested
|
|
261
261
|
if (includeReadme) {
|
|
262
|
-
const readmeContent = await fetchReadmeContent(modelDetails.name, 'models'
|
|
262
|
+
const readmeContent = await fetchReadmeContent(modelDetails.name, 'models');
|
|
263
263
|
if (readmeContent) {
|
|
264
264
|
const result = formatModelDetails(modelDetails);
|
|
265
265
|
result.formatted += '\n\n## README\n<modelcard-readme>\n\n' + readmeContent.trim() + '\n</modelcard-readme>';
|
package/src/readme-utils.ts
CHANGED
|
@@ -5,21 +5,16 @@
|
|
|
5
5
|
import { fetchWithProfile, NETWORK_FETCH_PROFILES } from './network/fetch-profile.js';
|
|
6
6
|
|
|
7
7
|
// Maximum number of characters to include from a README
|
|
8
|
-
const DEFAULT_MAX_README_CHARS =
|
|
8
|
+
const DEFAULT_MAX_README_CHARS = 40_000;
|
|
9
9
|
|
|
10
10
|
/**
|
|
11
11
|
* Fetches README content from a Hugging Face repository
|
|
12
12
|
*
|
|
13
13
|
* @param repoName The resolved repository name (e.g., 'rajpurkar/squad', 'openai-community/gpt2')
|
|
14
14
|
* @param type The repository type ('models' or 'datasets')
|
|
15
|
-
* @param includeYaml Whether to include YAML frontmatter (default: false)
|
|
16
15
|
* @returns Promise<string | null> The README content or null if not found/error
|
|
17
16
|
*/
|
|
18
|
-
export async function fetchReadmeContent(
|
|
19
|
-
repoName: string,
|
|
20
|
-
type: 'models' | 'datasets',
|
|
21
|
-
includeYaml: boolean = false
|
|
22
|
-
): Promise<string | null> {
|
|
17
|
+
export async function fetchReadmeContent(repoName: string, type: 'models' | 'datasets'): Promise<string | null> {
|
|
23
18
|
try {
|
|
24
19
|
// Construct the URL based on repository type
|
|
25
20
|
const baseUrl =
|
|
@@ -39,11 +34,6 @@ export async function fetchReadmeContent(
|
|
|
39
34
|
|
|
40
35
|
let content = await response.text();
|
|
41
36
|
|
|
42
|
-
// If includeYaml is false, strip YAML frontmatter
|
|
43
|
-
if (!includeYaml) {
|
|
44
|
-
content = stripYamlFrontmatter(content);
|
|
45
|
-
}
|
|
46
|
-
|
|
47
37
|
// Truncate overly long READMEs to a sensible default size
|
|
48
38
|
if (content.length > DEFAULT_MAX_README_CHARS) {
|
|
49
39
|
const truncated = content.slice(0, DEFAULT_MAX_README_CHARS);
|
|
@@ -62,23 +52,3 @@ export async function fetchReadmeContent(
|
|
|
62
52
|
return null;
|
|
63
53
|
}
|
|
64
54
|
}
|
|
65
|
-
|
|
66
|
-
/**
|
|
67
|
-
* Strips YAML frontmatter from markdown content
|
|
68
|
-
*
|
|
69
|
-
* @param content The full markdown content
|
|
70
|
-
* @returns The content with YAML frontmatter removed
|
|
71
|
-
*/
|
|
72
|
-
function stripYamlFrontmatter(content: string): string {
|
|
73
|
-
// Match YAML frontmatter: starts with ---, ends with ---
|
|
74
|
-
const yamlPattern = /^(\s*---[\r\n]+)([\S\s]*?)([\r\n]+---(\r\n|\n|$))/;
|
|
75
|
-
const match = content.match(yamlPattern);
|
|
76
|
-
|
|
77
|
-
if (match) {
|
|
78
|
-
// Return everything after the closing ---
|
|
79
|
-
return content.substring(match[0].length);
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
// No YAML frontmatter found, return original content
|
|
83
|
-
return content;
|
|
84
|
-
}
|