@vespermcp/mcp-server 1.2.21 → 1.2.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +410 -0
- package/build/index.js +1587 -845
- package/build/ingestion/ingestor.js +7 -4
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/scraper.js +85 -14
- package/build/python/asset_downloader_engine.py +2 -0
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +5 -1
- package/build/search/engine.js +43 -5
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/package.json +7 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +601 -0
- package/scripts/wizard.js +306 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +2 -0
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +5 -1
- package/wizard.cjs +3 -0
package/README.md
CHANGED
|
@@ -146,6 +146,33 @@ generate_quality_report(
|
|
|
146
146
|
|
|
147
147
|
### Dataset Discovery
|
|
148
148
|
|
|
149
|
+
#### `unified_dataset_api`
|
|
150
|
+
Single facade over multiple dataset backends. Use one tool for provider capability inspection, dataset discovery, dataset download, and dataset info lookup. The gateway prefers public/keyless providers and can also use server-managed credentials for connectors like Kaggle or data.world when configured by the operator.
|
|
151
|
+
|
|
152
|
+
**Parameters:**
|
|
153
|
+
- `operation` (string): `providers`, `discover`, `download`, or `info`
|
|
154
|
+
- `source` (string, optional): `auto`, `huggingface`, `openml`, `kaggle`, `dataworld`, `s3`, `bigquery`
|
|
155
|
+
- `query` (string, required for `discover`)
|
|
156
|
+
- `dataset_id` (string, required for `download`/`info`)
|
|
157
|
+
- `limit` (number, optional)
|
|
158
|
+
- `target_dir` (string, optional)
|
|
159
|
+
- `public_only` (boolean, optional)
|
|
160
|
+
|
|
161
|
+
**Examples:**
|
|
162
|
+
```
|
|
163
|
+
unified_dataset_api(operation="providers")
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
unified_dataset_api(operation="discover", query="credit risk", source="auto")
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
unified_dataset_api(operation="download", dataset_id="huggingface:imdb")
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
149
176
|
#### `search_datasets`
|
|
150
177
|
Search for datasets across multiple sources.
|
|
151
178
|
|
|
@@ -195,6 +222,28 @@ export_dataset(
|
|
|
195
222
|
|
|
196
223
|
---
|
|
197
224
|
|
|
225
|
+
#### `vesper_download_assets`
|
|
226
|
+
Download image/media assets to a user-controlled local directory.
|
|
227
|
+
|
|
228
|
+
**Parameters:**
|
|
229
|
+
- `dataset_id` (string): Dataset identifier
|
|
230
|
+
- `source` (string): `huggingface`, `kaggle`, or `url`
|
|
231
|
+
- `target_dir` (string, optional): Exact local directory where assets should be written
|
|
232
|
+
- `output_dir` (string, optional): Alias for `target_dir`
|
|
233
|
+
- `output_format` (string, optional): `webdataset`, `imagefolder`, or `parquet`
|
|
234
|
+
|
|
235
|
+
**Example:**
|
|
236
|
+
```
|
|
237
|
+
vesper_download_assets(
|
|
238
|
+
dataset_id="cats_vs_dogs",
|
|
239
|
+
source="kaggle",
|
|
240
|
+
target_dir="./datasets/cats_dogs_100",
|
|
241
|
+
output_format="imagefolder"
|
|
242
|
+
)
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
198
247
|
### Quality Analysis
|
|
199
248
|
|
|
200
249
|
#### `analyze_image_quality`
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import { readFile } from "fs/promises";
|
|
3
|
+
import { createClient } from "@supabase/supabase-js";
|
|
4
|
+
/**
 * Cloud-storage adapter backed by a Supabase Storage bucket.
 *
 * Connection settings come from the explicit constructor arguments or,
 * as a fallback, from the SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY
 * environment variables. Construction fails fast when neither is set.
 */
export class SupabaseAdapter {
    bucket;
    client;
    /**
     * @param {string} bucket - Name of the Supabase Storage bucket.
     * @param {string} [supabaseUrl] - Project URL; falls back to SUPABASE_URL.
     * @param {string} [supabaseServiceRoleKey] - Service-role key; falls back to SUPABASE_SERVICE_ROLE_KEY.
     * @throws {Error} When no URL or service-role key can be resolved.
     */
    constructor(bucket, supabaseUrl, supabaseServiceRoleKey) {
        this.bucket = bucket;
        const url = supabaseUrl || process.env.SUPABASE_URL;
        const serviceRoleKey = supabaseServiceRoleKey || process.env.SUPABASE_SERVICE_ROLE_KEY;
        if (!url || !serviceRoleKey) {
            throw new Error("Supabase requires SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY");
        }
        // Server-side usage: there is no browser session to persist.
        this.client = createClient(url, serviceRoleKey, { auth: { persistSession: false } });
    }
    /**
     * Uploads a local file to the bucket (overwriting any existing object)
     * and returns the object's public URL.
     * @param {string} localPath - Path of the file on disk.
     * @param {string} remotePath - Destination key inside the bucket.
     * @returns {Promise<string>} Public URL of the uploaded object.
     * @throws {Error} When the local file is missing or the upload fails.
     */
    async upload(localPath, remotePath) {
        if (!fs.existsSync(localPath)) {
            throw new Error(`Local file not found: ${localPath}`);
        }
        const contents = await readFile(localPath);
        const uploadResult = await this.client.storage
            .from(this.bucket)
            .upload(remotePath, contents, { upsert: true });
        if (uploadResult.error) {
            throw new Error(`Supabase upload failed: ${uploadResult.error.message}`);
        }
        const { data } = this.client.storage.from(this.bucket).getPublicUrl(remotePath);
        return data.publicUrl;
    }
    /**
     * Removes a single object from the bucket.
     * @param {string} remotePath - Key of the object to remove.
     * @throws {Error} When the removal is rejected by Supabase.
     */
    async delete(remotePath) {
        const { error } = await this.client.storage.from(this.bucket).remove([remotePath]);
        if (error) {
            throw new Error(`Supabase delete failed: ${error.message}`);
        }
    }
    /**
     * Creates a time-limited signed URL for an object.
     * @param {string} remotePath - Key of the object.
     * @param {number} [expiresValue=3600] - Lifetime of the URL in seconds.
     * @returns {Promise<string>} The signed URL.
     * @throws {Error} When Supabase reports an error or returns no URL.
     */
    async getSignedUrl(remotePath, expiresValue = 3600) {
        const { data, error } = await this.client.storage
            .from(this.bucket)
            .createSignedUrl(remotePath, expiresValue);
        if (error || !data?.signedUrl) {
            throw new Error(`Supabase signed URL failed: ${error?.message || "No signed URL returned"}`);
        }
        return data.signedUrl;
    }
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { LocalAdapter } from "./adapters/local.js";
|
|
2
2
|
import { S3Adapter } from "./adapters/s3.js";
|
|
3
|
+
import { SupabaseAdapter } from "./adapters/supabase.js";
|
|
3
4
|
export class StorageManager {
|
|
4
5
|
/**
|
|
5
6
|
* Creates an adapter based on configuration
|
|
@@ -13,6 +14,11 @@ export class StorageManager {
|
|
|
13
14
|
throw new Error("S3 requires bucket and region");
|
|
14
15
|
}
|
|
15
16
|
return new S3Adapter(config.options.bucket, config.options.region, config.options.credentials);
|
|
17
|
+
case "supabase":
|
|
18
|
+
if (!config.options.bucket) {
|
|
19
|
+
throw new Error("Supabase requires bucket");
|
|
20
|
+
}
|
|
21
|
+
return new SupabaseAdapter(config.options.bucket, config.options.supabaseUrl, config.options.supabaseServiceRoleKey);
|
|
16
22
|
default:
|
|
17
23
|
throw new Error(`Unsupported storage type: ${config.type}`);
|
|
18
24
|
}
|
package/build/export/exporter.js
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import { spawn } from "child_process";
|
|
2
2
|
import path from "path";
|
|
3
3
|
import fs from "fs";
|
|
4
|
+
import { ensurePythonPackages, resolvePythonCommand } from "../utils/python-runtime.js";
|
|
4
5
|
export class DataExporter {
|
|
5
|
-
|
|
6
|
+
buildDir;
|
|
6
7
|
scriptPath;
|
|
7
8
|
constructor(buildDir = process.cwd()) {
|
|
9
|
+
this.buildDir = buildDir;
|
|
8
10
|
const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
9
11
|
const dataRoot = path.join(homeDir, ".vesper");
|
|
10
12
|
const scriptPath0 = path.resolve(dataRoot, "python", "export_engine.py");
|
|
@@ -26,27 +28,38 @@ export class DataExporter {
|
|
|
26
28
|
else {
|
|
27
29
|
this.scriptPath = scriptPath0;
|
|
28
30
|
}
|
|
29
|
-
// Detect Python command
|
|
30
|
-
if (process.platform === "win32") {
|
|
31
|
-
this.pythonPath = "py";
|
|
32
|
-
}
|
|
33
31
|
}
|
|
34
32
|
/**
|
|
35
33
|
* Exports a dataset file to a specified format
|
|
36
34
|
*/
|
|
37
35
|
async export(inputFile, outputFile, format, options = {}) {
|
|
36
|
+
const pythonRequirements = [
|
|
37
|
+
{ module: "polars", packageName: "polars" },
|
|
38
|
+
];
|
|
39
|
+
if (format === "feather") {
|
|
40
|
+
pythonRequirements.push({ module: "pyarrow", packageName: "pyarrow" });
|
|
41
|
+
}
|
|
42
|
+
if (format === "tfrecord") {
|
|
43
|
+
pythonRequirements.push({ module: "tensorflow", packageName: "tensorflow" });
|
|
44
|
+
}
|
|
45
|
+
const pythonPath = await ensurePythonPackages(this.buildDir, pythonRequirements).catch(() => resolvePythonCommand(this.buildDir));
|
|
38
46
|
return new Promise((resolve, reject) => {
|
|
39
47
|
if (!fs.existsSync(inputFile)) {
|
|
40
48
|
reject(new Error(`Input file not found: ${inputFile}`));
|
|
41
49
|
return;
|
|
42
50
|
}
|
|
43
51
|
const args = [this.scriptPath, inputFile, outputFile, format, JSON.stringify(options)];
|
|
44
|
-
const
|
|
52
|
+
const childProcess = spawn(pythonPath, args, {
|
|
53
|
+
env: {
|
|
54
|
+
...process.env,
|
|
55
|
+
PYTHONIOENCODING: "utf-8",
|
|
56
|
+
},
|
|
57
|
+
});
|
|
45
58
|
let stdout = "";
|
|
46
59
|
let stderr = "";
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
60
|
+
childProcess.stdout.on("data", (data) => stdout += data.toString());
|
|
61
|
+
childProcess.stderr.on("data", (data) => stderr += data.toString());
|
|
62
|
+
childProcess.on("close", (code) => {
|
|
50
63
|
if (code !== 0) {
|
|
51
64
|
reject(new Error(`Export failed: ${stderr || stdout}`));
|
|
52
65
|
return;
|
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import http from "http";
|
|
4
|
+
import https from "https";
|
|
5
|
+
import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
6
|
+
import { analyzeDatasetQuery } from "../search/query-intent.js";
|
|
7
|
+
/**
 * Single facade over multiple dataset backends (Hugging Face, OpenML,
 * Kaggle, data.world, public S3 objects, and a BigQuery scaffold).
 *
 * Supports four operations: provider capability inspection
 * (`getProviderStatuses`), discovery (`discover`), download (`download`)
 * and metadata lookup (`info`). Public/keyless providers are preferred;
 * Kaggle and data.world are only used when server-managed credentials
 * are configured by the operator.
 *
 * `deps` is an injected bundle expected to provide: `dataIngestor`
 * (with `hasKaggleCredentials()` / `ensureData()`), `hasDataWorldToken()`,
 * `metadataStore` (`saveDataset` / `getDataset` / `registerDownload`),
 * per-provider discovery sources (`openmlSource`, `kaggleSource`,
 * `dataworldSource`) and `dataRoot`.
 */
export class UnifiedDatasetGateway {
    deps;
    /** @param {object} deps - Injected collaborators (see class doc). */
    constructor(deps) {
        this.deps = deps;
    }
    /**
     * Describes every provider the gateway knows about, including its
     * availability and auth mode as derived from the current environment.
     * @param {boolean} [includeUnavailable=true] - When false, filters out providers without working credentials.
     * @returns {Array<object>} Provider status descriptors.
     */
    getProviderStatuses(includeUnavailable = true) {
        const hasHfToken = !!(process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN);
        const hasKaggle = this.deps.dataIngestor.hasKaggleCredentials();
        const hasDataWorld = this.deps.hasDataWorldToken();
        const hasBigQuery = !!(process.env.GOOGLE_APPLICATION_CREDENTIALS || process.env.GOOGLE_CLOUD_PROJECT);
        const providers = [
            {
                source: "huggingface",
                display_name: "Hugging Face",
                available: true,
                auth_mode: hasHfToken ? "public-or-server-managed" : "public",
                supported_operations: ["discover", "download", "info"],
                requires_end_user_key: false,
                notes: hasHfToken
                    ? ["Public datasets are open by default. Gated/private repos can be accessed via the server-managed HF token when configured."]
                    : ["Public datasets work without any user key. Gated/private repos need an operator or user token."],
            },
            {
                source: "openml",
                display_name: "OpenML",
                available: true,
                auth_mode: "public",
                supported_operations: ["discover", "download", "info"],
                requires_end_user_key: false,
                notes: ["OpenML is exposed as a keyless public provider through the gateway."],
            },
            {
                source: "kaggle",
                display_name: "Kaggle",
                available: hasKaggle,
                auth_mode: hasKaggle ? "server-managed" : "not-configured",
                supported_operations: ["discover", "download", "info"],
                requires_end_user_key: false,
                notes: hasKaggle
                    ? ["Kaggle is available through server-managed credentials. End users do not need to pass their own key."]
                    : ["Kaggle support exists, but no server-managed credentials are configured yet."],
            },
            {
                source: "dataworld",
                display_name: "data.world",
                available: hasDataWorld,
                auth_mode: hasDataWorld ? "server-managed" : "not-configured",
                supported_operations: ["discover", "download", "info"],
                requires_end_user_key: false,
                notes: hasDataWorld
                    ? ["data.world is available through server-managed credentials."]
                    : ["data.world support exists, but no server-managed token is configured yet."],
            },
            {
                source: "s3",
                display_name: "Amazon S3",
                available: true,
                auth_mode: "public-or-server-managed",
                supported_operations: ["download", "info"],
                requires_end_user_key: false,
                notes: ["Supports keyless download of public S3 objects via s3://bucket/key or HTTPS S3 URLs.", "Bucket listing and search are intentionally not exposed."],
            },
            {
                source: "bigquery",
                display_name: "BigQuery",
                available: hasBigQuery,
                auth_mode: hasBigQuery ? "server-managed" : "not-configured",
                supported_operations: ["info"],
                requires_end_user_key: false,
                notes: hasBigQuery
                    ? ["BigQuery is reserved for operator-managed connectors. Query execution is not implemented in this patch."]
                    : ["BigQuery is scaffolded in the gateway contract, but no server-managed GCP configuration is present."],
            },
        ];
        return includeUnavailable ? providers : providers.filter(provider => provider.available);
    }
    /**
     * Searches one or more providers for datasets matching a query,
     * deduplicates by `source:id`, ranks, and persists metadata best-effort.
     * @param {{query: string, source?: string, limit?: number, publicOnly?: boolean}} options
     * @returns {Promise<object>} Query echo, providers tried, notes, ranked results.
     * @throws {Error} When `query` is empty.
     */
    async discover(options) {
        const query = String(options.query || "").trim();
        const requestedSource = options.source || "auto";
        const limit = Math.max(1, Number(options.limit || 10));
        const publicOnly = options.publicOnly !== false;
        if (!query) {
            throw new Error("query is required");
        }
        const notes = [];
        const providers = this.resolveDiscoverSources(requestedSource, publicOnly, notes);
        // Over-fetch per provider so deduplication still fills the requested limit.
        const perSourceLimit = Math.max(5, Math.ceil(limit / Math.max(providers.length, 1)) * 2);
        const allResults = [];
        for (const provider of providers) {
            try {
                const partial = await this.discoverFromSource(provider, query, perSourceLimit);
                for (const dataset of partial) {
                    try {
                        this.deps.metadataStore.saveDataset(dataset);
                    }
                    catch {
                        // best-effort metadata persistence
                    }
                    allResults.push(dataset);
                }
            }
            catch (error) {
                // A failing provider degrades to a note instead of aborting discovery.
                notes.push(`${provider}: ${(error?.message || error || "Unknown provider error").toString()}`);
            }
        }
        const deduped = new Map();
        for (const dataset of allResults) {
            deduped.set(`${dataset.source}:${dataset.id}`, dataset);
        }
        const results = Array.from(deduped.values())
            .sort((a, b) => this.rankDataset(b) - this.rankDataset(a))
            .slice(0, limit);
        return {
            query,
            requested_source: requestedSource,
            providers_tried: providers,
            notes,
            results,
        };
    }
    /**
     * Downloads a dataset, resolving the provider from an explicit source,
     * an id prefix, stored metadata, or a fallback discovery pass.
     * @param {{datasetId: string, source?: string, targetDir?: string}} options
     * @returns {Promise<object>} Resolved id/source, local path, optional copy destination, notes.
     * @throws {Error} When the id is missing, the provider cannot be resolved, or required credentials are absent.
     */
    async download(options) {
        const requested = String(options.datasetId || "").trim();
        if (!requested) {
            throw new Error("dataset_id is required");
        }
        const notes = [];
        const resolved = this.resolveDatasetReference(requested, options.source || "auto");
        if (resolved.source === "bigquery") {
            throw new Error("BigQuery gateway support is scaffolded for operator-managed connectors, but query/download execution is not implemented yet.");
        }
        if (resolved.source === "s3") {
            const localPath = await this.downloadPublicS3Object(resolved.datasetId, options.targetDir);
            return {
                dataset_id: requested,
                resolved_source: "s3",
                local_path: localPath,
                notes: ["Downloaded via the keyless S3 gateway path."],
            };
        }
        let source = this.toIngestSource(resolved.source);
        let datasetId = resolved.datasetId;
        if (!source) {
            // Second chance: the metadata store may already know this dataset.
            const metadataMatch = this.lookupKnownDataset(requested);
            const metadataSource = this.toIngestSource(metadataMatch?.source);
            if (metadataMatch && metadataSource) {
                source = metadataSource;
                datasetId = metadataMatch.id;
            }
        }
        if (!source) {
            // Last resort: run discovery and take the top hit.
            const discovery = await this.discover({ query: requested, source: "auto", limit: 1, publicOnly: false });
            if (discovery.results.length === 0) {
                throw new Error(`Unable to resolve provider for '${requested}'. Run unified_dataset_api with operation='discover' first or pass an explicit source.`);
            }
            const discoveredSource = this.toIngestSource(discovery.results[0].source);
            if (!discoveredSource) {
                throw new Error(`Resolved provider '${discovery.results[0].source}' cannot be downloaded through the dataset ingestor.`);
            }
            source = discoveredSource;
            datasetId = discovery.results[0].id;
            notes.push(`Auto-resolved provider to ${source}.`);
        }
        if (source === "kaggle" && !this.deps.dataIngestor.hasKaggleCredentials()) {
            throw new Error("Kaggle is configured as a gateway source, but no server-managed credentials are available.");
        }
        if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
            throw new Error("data.world is configured as a gateway source, but no server-managed token is available.");
        }
        const localPath = await this.deps.dataIngestor.ensureData(datasetId, source, () => undefined);
        let copiedTo;
        if (options.targetDir) {
            copiedTo = this.copyDownloadOutput(localPath, options.targetDir);
            notes.push(`Copied dataset output to ${copiedTo}.`);
        }
        return {
            dataset_id: datasetId,
            resolved_source: source,
            local_path: localPath,
            copied_to: copiedTo,
            notes,
        };
    }
    /**
     * Returns metadata for a dataset, preferring the local metadata store,
     * then provider-specific shortcuts (S3/BigQuery), then discovery.
     * @param {{datasetId: string, source?: string, publicOnly?: boolean}} options
     * @returns {Promise<object>} Dataset descriptor plus resolution notes.
     * @throws {Error} When `dataset_id` is missing.
     */
    async info(options) {
        const requested = String(options.datasetId || "").trim();
        if (!requested) {
            throw new Error("dataset_id is required");
        }
        const resolved = this.resolveDatasetReference(requested, options.source || "auto");
        const metadataMatch = this.lookupKnownDataset(requested) || (resolved.datasetId !== requested ? this.lookupKnownDataset(resolved.datasetId) : undefined);
        if (metadataMatch) {
            return {
                dataset_id: requested,
                resolved_source: metadataMatch.source,
                notes: [],
                dataset: metadataMatch,
            };
        }
        if (resolved.source === "s3") {
            return {
                dataset_id: requested,
                resolved_source: "s3",
                notes: ["S3 info is derived from the object URI. Discovery/listing is intentionally not supported."],
                dataset: {
                    id: requested,
                    source: "s3",
                    uri: this.toS3HttpsUrl(resolved.datasetId),
                },
            };
        }
        if (resolved.source === "bigquery") {
            return {
                dataset_id: requested,
                resolved_source: "bigquery",
                notes: ["BigQuery is reserved for operator-managed connectors. Detailed inspection is not implemented in this patch."],
            };
        }
        const discovery = await this.discover({
            query: resolved.datasetId,
            source: resolved.source || "auto",
            limit: 5,
            publicOnly: options.publicOnly !== false,
        });
        const exact = discovery.results.find(dataset => this.matchesDatasetReference(dataset, requested));
        return {
            dataset_id: requested,
            resolved_source: exact?.source,
            notes: discovery.notes,
            dataset: exact || discovery.results[0],
        };
    }
    /**
     * Dispatches a discovery request to a single provider implementation.
     * @throws {Error} For providers that do not support discovery (s3/bigquery) or are unknown.
     */
    async discoverFromSource(source, query, limit) {
        switch (source) {
            case "huggingface":
                return await new HuggingFaceScraper().scrape(limit, true, await analyzeDatasetQuery(query));
            case "openml":
                return await this.deps.openmlSource.discover(query, limit);
            case "kaggle":
                return await this.deps.kaggleSource.discover(query, limit);
            case "dataworld":
                return await this.deps.dataworldSource.discover(query, limit);
            case "s3":
                throw new Error("S3 does not support search/discovery in the unified gateway. Use a direct s3://bucket/key object reference.");
            case "bigquery":
                throw new Error("BigQuery discovery is not implemented in the unified gateway.");
            default:
                throw new Error(`Unsupported provider: ${source}`);
        }
    }
    /**
     * Picks which providers a discover call should fan out to.
     * "auto" uses public providers, adding Kaggle/data.world only when
     * publicOnly is false AND server-managed credentials exist.
     * @throws {Error} When an explicit source lacks credentials or cannot discover.
     */
    resolveDiscoverSources(source, publicOnly, notes) {
        if (source !== "auto") {
            if (source === "kaggle" && !this.deps.dataIngestor.hasKaggleCredentials()) {
                throw new Error("Kaggle requires server-managed credentials and none are configured.");
            }
            if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
                throw new Error("data.world requires a server-managed token and none is configured.");
            }
            if (source === "s3" || source === "bigquery") {
                throw new Error(`${source} does not currently support discover operation through the gateway.`);
            }
            return [source];
        }
        const providers = ["huggingface", "openml"];
        if (!publicOnly && this.deps.dataIngestor.hasKaggleCredentials()) {
            providers.push("kaggle");
        }
        else if (!publicOnly) {
            notes.push("Kaggle skipped because no server-managed credentials are configured.");
        }
        if (!publicOnly && this.deps.hasDataWorldToken()) {
            providers.push("dataworld");
        }
        else if (!publicOnly) {
            notes.push("data.world skipped because no server-managed token is configured.");
        }
        return providers;
    }
    /**
     * Infers the provider for a raw dataset reference: explicit source wins,
     * then S3 URL shapes, then `provider:` prefixes, then heuristics
     * (all digits => OpenML id; `owner/name` => Hugging Face repo).
     * @returns {{source?: string, datasetId: string}}
     */
    resolveDatasetReference(datasetId, source) {
        const trimmed = datasetId.trim();
        if (source !== "auto") {
            if (source === "s3") {
                return { source, datasetId: trimmed };
            }
            return { source, datasetId: this.stripSourcePrefix(trimmed, source) };
        }
        if (/^s3:\/\//i.test(trimmed) || /^https?:\/\/[^\s]+\.s3[.-][^\s]+/i.test(trimmed) || /^https?:\/\/s3\.[^\s]+amazonaws\.com\//i.test(trimmed)) {
            return { source: "s3", datasetId: trimmed };
        }
        if (/^kaggle:/i.test(trimmed))
            return { source: "kaggle", datasetId: trimmed.replace(/^kaggle:/i, "") };
        if (/^(huggingface|hf):/i.test(trimmed))
            return { source: "huggingface", datasetId: trimmed.replace(/^(huggingface|hf):/i, "") };
        if (/^openml:/i.test(trimmed))
            return { source: "openml", datasetId: trimmed.replace(/^openml:/i, "") };
        if (/^dataworld:/i.test(trimmed))
            return { source: "dataworld", datasetId: trimmed.replace(/^dataworld:/i, "") };
        if (/^bigquery:/i.test(trimmed))
            return { source: "bigquery", datasetId: trimmed.replace(/^bigquery:/i, "") };
        if (/^\d+$/.test(trimmed))
            return { source: "openml", datasetId: trimmed };
        if (trimmed.includes("/") && !trimmed.includes(":"))
            return { source: "huggingface", datasetId: trimmed };
        return { datasetId: trimmed };
    }
    /** Removes a leading `source:` prefix (hf: is an alias for huggingface:). */
    stripSourcePrefix(datasetId, source) {
        if (source === "huggingface") {
            return datasetId.replace(/^(huggingface|hf):/i, "");
        }
        return datasetId.replace(new RegExp(`^${source}:`, "i"), "");
    }
    /** Looks up a dataset in the metadata store by raw and de-prefixed id. */
    lookupKnownDataset(datasetId) {
        const candidates = new Set([
            datasetId,
            datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, ""),
        ]);
        for (const candidate of candidates) {
            const dataset = this.deps.metadataStore.getDataset(candidate);
            if (dataset)
                return dataset;
        }
        return undefined;
    }
    /** True when a discovered dataset matches the user's raw reference (case-insensitive, prefix-tolerant). */
    matchesDatasetReference(dataset, requested) {
        const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, "").toLowerCase();
        const fullId = `${dataset.source}:${dataset.id}`.toLowerCase();
        return dataset.id.toLowerCase() === normalizedRequested || fullId === requested.toLowerCase();
    }
    /** Maps a gateway source to one the data ingestor can handle, else undefined. */
    toIngestSource(source) {
        if (source === "huggingface" || source === "openml" || source === "kaggle" || source === "dataworld") {
            return source;
        }
        return undefined;
    }
    /** Ranking score: relevance dominates quality, which dominates raw downloads. */
    rankDataset(dataset) {
        const relevance = Number(dataset.relevance_score || 0) * 1000;
        const quality = Number(dataset.quality_score || 0) * 100;
        const downloads = Number(dataset.downloads || 0);
        return relevance + quality + downloads;
    }
    /**
     * Copies a downloaded file/directory into a user-chosen target directory.
     * @returns {string} Absolute destination path.
     */
    copyDownloadOutput(localPath, targetDir) {
        const resolvedTargetDir = path.resolve(targetDir);
        fs.mkdirSync(resolvedTargetDir, { recursive: true });
        const destination = path.join(resolvedTargetDir, path.basename(localPath));
        fs.cpSync(localPath, destination, { recursive: true, force: true });
        return destination;
    }
    /**
     * Keyless download of a public S3 object to disk, registering the
     * completed download in the metadata store.
     * @returns {Promise<string>} The local output path.
     */
    async downloadPublicS3Object(datasetId, targetDir) {
        const httpsUrl = this.toS3HttpsUrl(datasetId);
        const parsed = new URL(httpsUrl);
        const fileName = path.basename(parsed.pathname) || "s3-object.bin";
        const outputDir = path.resolve(targetDir || path.join(this.deps.dataRoot, "data", "raw"));
        fs.mkdirSync(outputDir, { recursive: true });
        const outputPath = path.join(outputDir, fileName);
        await this.downloadToFile(httpsUrl, outputPath);
        this.deps.metadataStore.registerDownload(datasetId, outputPath, "completed", fs.statSync(outputPath).size);
        return outputPath;
    }
    /**
     * Normalizes an s3://bucket/key reference to a virtual-hosted HTTPS URL,
     * percent-encoding each key segment. HTTPS inputs pass through unchanged.
     * @throws {Error} When the reference is neither HTTPS nor s3://bucket/key.
     */
    toS3HttpsUrl(datasetId) {
        if (/^https?:\/\//i.test(datasetId)) {
            return datasetId;
        }
        const match = datasetId.match(/^s3:\/\/([^/]+)\/(.+)$/i);
        if (!match) {
            throw new Error("S3 source expects an s3://bucket/key object reference or a direct HTTPS S3 URL.");
        }
        const bucket = match[1];
        const objectKey = match[2].split("/").map(encodeURIComponent).join("/");
        return `https://${bucket}.s3.amazonaws.com/${objectKey}`;
    }
    /**
     * Streams a URL to a local file, following 3xx redirects.
     * @throws {Error} On non-2xx status or stream failure.
     */
    async downloadToFile(url, destination) {
        await new Promise((resolve, reject) => {
            const transport = url.startsWith("https:") ? https : http;
            const request = transport.get(url, response => {
                const statusCode = response.statusCode || 0;
                const location = response.headers.location;
                if (statusCode >= 300 && statusCode < 400 && location) {
                    response.resume();
                    this.downloadToFile(location, destination).then(resolve).catch(reject);
                    return;
                }
                if (statusCode < 200 || statusCode >= 300) {
                    response.resume();
                    reject(new Error(`Download failed with status ${statusCode}`));
                    return;
                }
                const file = fs.createWriteStream(destination);
                // BUGFIX: a mid-stream network failure emits "error" on the
                // response stream, not on the request or the write stream.
                // Without this handler the promise never settled and the
                // download hung forever.
                response.on("error", error => {
                    file.destroy();
                    reject(error);
                });
                response.pipe(file);
                file.on("finish", () => {
                    file.close();
                    resolve();
                });
                file.on("error", error => {
                    try {
                        file.close();
                    }
                    catch {
                        // no-op
                    }
                    reject(error);
                });
            });
            request.on("error", reject);
        });
    }
}
|