vesper-wizard 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +1 -1
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,410 +0,0 @@
|
|
|
1
|
-
import fs from "fs";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import http from "http";
|
|
4
|
-
import https from "https";
|
|
5
|
-
import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
6
|
-
import { analyzeDatasetQuery } from "../search/query-intent.js";
|
|
7
|
-
/**
 * Facade that exposes discovery, download and inspection of datasets across
 * several providers (Hugging Face, OpenML, Kaggle, data.world, S3, BigQuery)
 * behind a single interface.
 *
 * The injected `deps` object is used as follows by the methods below:
 *  - `deps.dataIngestor.hasKaggleCredentials()` / `deps.dataIngestor.ensureData(id, source, cb)`
 *  - `deps.hasDataWorldToken()`
 *  - `deps.metadataStore.saveDataset()` / `.getDataset()` / `.registerDownload()`
 *  - `deps.openmlSource`, `deps.kaggleSource`, `deps.dataworldSource` (each with `discover(query, limit)`)
 *  - `deps.dataRoot` (base directory for the default S3 download location)
 */
export class UnifiedDatasetGateway {
    deps;

    /**
     * @param {object} deps - Collaborators listed in the class description.
     */
    constructor(deps) {
        this.deps = deps;
    }

    /**
     * Describe every provider known to the gateway together with its current availability.
     *
     * @param {boolean} [includeUnavailable=true] - When false, providers whose
     *   `available` flag is false are filtered out.
     * @returns {Array<object>} One descriptor per provider.
     */
    getProviderStatuses(includeUnavailable = true) {
        const hasHfToken = !!(process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN);
        const hasKaggle = this.deps.dataIngestor.hasKaggleCredentials();
        const hasDataWorld = this.deps.hasDataWorldToken();
        const hasBigQuery = !!(process.env.GOOGLE_APPLICATION_CREDENTIALS || process.env.GOOGLE_CLOUD_PROJECT);
        const providers = [
            {
                source: "huggingface",
                display_name: "Hugging Face",
                available: true,
                auth_mode: hasHfToken ? "public-or-server-managed" : "public",
                supported_operations: ["discover", "download", "info"],
                requires_end_user_key: false,
                notes: hasHfToken
                    ? ["Public datasets are open by default. Gated/private repos can be accessed via the server-managed HF token when configured."]
                    : ["Public datasets work without any user key. Gated/private repos need an operator or user token."],
            },
            {
                source: "openml",
                display_name: "OpenML",
                available: true,
                auth_mode: "public",
                supported_operations: ["discover", "download", "info"],
                requires_end_user_key: false,
                notes: ["OpenML is exposed as a keyless public provider through the gateway."],
            },
            {
                source: "kaggle",
                display_name: "Kaggle",
                available: hasKaggle,
                auth_mode: hasKaggle ? "server-managed" : "not-configured",
                supported_operations: ["discover", "download", "info"],
                requires_end_user_key: false,
                notes: hasKaggle
                    ? ["Kaggle is available through server-managed credentials. End users do not need to pass their own key."]
                    : ["Kaggle support exists, but no server-managed credentials are configured yet."],
            },
            {
                source: "dataworld",
                display_name: "data.world",
                available: hasDataWorld,
                auth_mode: hasDataWorld ? "server-managed" : "not-configured",
                supported_operations: ["discover", "download", "info"],
                requires_end_user_key: false,
                notes: hasDataWorld
                    ? ["data.world is available through server-managed credentials."]
                    : ["data.world support exists, but no server-managed token is configured yet."],
            },
            {
                source: "s3",
                display_name: "Amazon S3",
                available: true,
                auth_mode: "public-or-server-managed",
                supported_operations: ["download", "info"],
                requires_end_user_key: false,
                notes: ["Supports keyless download of public S3 objects via s3://bucket/key or HTTPS S3 URLs.", "Bucket listing and search are intentionally not exposed."],
            },
            {
                source: "bigquery",
                display_name: "BigQuery",
                available: hasBigQuery,
                auth_mode: hasBigQuery ? "server-managed" : "not-configured",
                supported_operations: ["info"],
                requires_end_user_key: false,
                notes: hasBigQuery
                    ? ["BigQuery is reserved for operator-managed connectors. Query execution is not implemented in this patch."]
                    : ["BigQuery is scaffolded in the gateway contract, but no server-managed GCP configuration is present."],
            },
        ];
        return includeUnavailable ? providers : providers.filter(provider => provider.available);
    }

    /**
     * Search one or more providers for datasets matching a free-text query.
     *
     * Providers are queried one after another; a failing provider is recorded
     * in `notes` instead of aborting the whole search. Every returned dataset
     * is handed to `metadataStore.saveDataset` on a best-effort basis.
     *
     * @param {object} options
     * @param {string} options.query - Required search text.
     * @param {string} [options.source="auto"] - Provider name or "auto".
     * @param {number} [options.limit=10] - Maximum number of results; values
     *   that are not finite numbers fall back to 10.
     * @param {boolean} [options.publicOnly=true] - Only `false` disables it.
     * @returns {Promise<{query: string, requested_source: string, providers_tried: string[], notes: string[], results: object[]}>}
     * @throws {Error} When the query is empty or the requested source cannot be used for discovery.
     */
    async discover(options) {
        const query = String(options.query || "").trim();
        const requestedSource = options.source || "auto";
        // A non-numeric limit used to become NaN, which made slice(0, NaN) drop every result.
        const parsedLimit = Number(options.limit || 10);
        const limit = Number.isFinite(parsedLimit) ? Math.max(1, parsedLimit) : 10;
        const publicOnly = options.publicOnly !== false;
        if (!query) {
            throw new Error("query is required");
        }
        const notes = [];
        const providers = this.resolveDiscoverSources(requestedSource, publicOnly, notes);
        // Over-fetch per provider so that deduplication and ranking still leave enough candidates.
        const perSourceLimit = Math.max(5, Math.ceil(limit / Math.max(providers.length, 1)) * 2);
        const allResults = [];
        for (const provider of providers) {
            try {
                const partial = await this.discoverFromSource(provider, query, perSourceLimit);
                for (const dataset of partial) {
                    try {
                        this.deps.metadataStore.saveDataset(dataset);
                    }
                    catch {
                        // best-effort metadata persistence
                    }
                    allResults.push(dataset);
                }
            }
            catch (error) {
                notes.push(`${provider}: ${(error?.message || error || "Unknown provider error").toString()}`);
            }
        }
        // Later entries overwrite earlier ones that share the same source:id key.
        const deduped = new Map();
        for (const dataset of allResults) {
            deduped.set(`${dataset.source}:${dataset.id}`, dataset);
        }
        const results = Array.from(deduped.values())
            .sort((a, b) => this.rankDataset(b) - this.rankDataset(a))
            .slice(0, limit);
        return {
            query,
            requested_source: requestedSource,
            providers_tried: providers,
            notes,
            results,
        };
    }

    /**
     * Download a dataset, resolving the provider from the reference, the
     * metadata store, or (as a last resort) a discovery query.
     *
     * @param {object} options
     * @param {string} options.datasetId - Dataset reference (optionally prefixed, or an S3 URI/URL).
     * @param {string} [options.source="auto"] - Explicit provider name.
     * @param {string} [options.targetDir] - When set, the output is also copied (or, for S3, written) there.
     * @returns {Promise<{dataset_id: string, resolved_source: string, local_path: string, copied_to?: string, notes: string[]}>}
     * @throws {Error} When the reference is empty, the provider cannot be resolved,
     *   required credentials are missing, or the provider is BigQuery.
     */
    async download(options) {
        const requested = String(options.datasetId || "").trim();
        if (!requested) {
            throw new Error("dataset_id is required");
        }
        const notes = [];
        const resolved = this.resolveDatasetReference(requested, options.source || "auto");
        if (resolved.source === "bigquery") {
            throw new Error("BigQuery gateway support is scaffolded for operator-managed connectors, but query/download execution is not implemented yet.");
        }
        if (resolved.source === "s3") {
            const localPath = await this.downloadPublicS3Object(resolved.datasetId, options.targetDir);
            return {
                dataset_id: requested,
                resolved_source: "s3",
                local_path: localPath,
                notes: ["Downloaded via the keyless S3 gateway path."],
            };
        }
        let source = this.toIngestSource(resolved.source);
        let datasetId = resolved.datasetId;
        if (!source) {
            // Second chance: the dataset may already be known to the metadata store.
            const metadataMatch = this.lookupKnownDataset(requested);
            const metadataSource = this.toIngestSource(metadataMatch?.source);
            if (metadataMatch && metadataSource) {
                source = metadataSource;
                datasetId = metadataMatch.id;
            }
        }
        if (!source) {
            // Last resort: treat the reference as a search query and take the top-ranked hit.
            const discovery = await this.discover({ query: requested, source: "auto", limit: 1, publicOnly: false });
            if (discovery.results.length === 0) {
                throw new Error(`Unable to resolve provider for '${requested}'. Run unified_dataset_api with operation='discover' first or pass an explicit source.`);
            }
            const discoveredSource = this.toIngestSource(discovery.results[0].source);
            if (!discoveredSource) {
                throw new Error(`Resolved provider '${discovery.results[0].source}' cannot be downloaded through the dataset ingestor.`);
            }
            source = discoveredSource;
            datasetId = discovery.results[0].id;
            notes.push(`Auto-resolved provider to ${source}.`);
        }
        if (source === "kaggle" && !this.deps.dataIngestor.hasKaggleCredentials()) {
            throw new Error("Kaggle is configured as a gateway source, but no server-managed credentials are available.");
        }
        if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
            throw new Error("data.world is configured as a gateway source, but no server-managed token is available.");
        }
        const localPath = await this.deps.dataIngestor.ensureData(datasetId, source, () => undefined);
        let copiedTo;
        if (options.targetDir) {
            copiedTo = this.copyDownloadOutput(localPath, options.targetDir);
            notes.push(`Copied dataset output to ${copiedTo}.`);
        }
        return {
            dataset_id: datasetId,
            resolved_source: source,
            local_path: localPath,
            copied_to: copiedTo,
            notes,
        };
    }

    /**
     * Return metadata for a dataset reference.
     *
     * Lookup order: metadata store, then the S3 / BigQuery special cases, then
     * a discovery query limited to five results.
     *
     * @param {object} options
     * @param {string} options.datasetId - Dataset reference.
     * @param {string} [options.source="auto"] - Explicit provider name.
     * @param {boolean} [options.publicOnly=true] - Forwarded to `discover`.
     * @returns {Promise<{dataset_id: string, resolved_source?: string, notes: string[], dataset?: object}>}
     * @throws {Error} When the reference is empty.
     */
    async info(options) {
        const requested = String(options.datasetId || "").trim();
        if (!requested) {
            throw new Error("dataset_id is required");
        }
        const resolved = this.resolveDatasetReference(requested, options.source || "auto");
        const metadataMatch = this.lookupKnownDataset(requested) || (resolved.datasetId !== requested ? this.lookupKnownDataset(resolved.datasetId) : undefined);
        if (metadataMatch) {
            return {
                dataset_id: requested,
                resolved_source: metadataMatch.source,
                notes: [],
                dataset: metadataMatch,
            };
        }
        if (resolved.source === "s3") {
            return {
                dataset_id: requested,
                resolved_source: "s3",
                notes: ["S3 info is derived from the object URI. Discovery/listing is intentionally not supported."],
                dataset: {
                    id: requested,
                    source: "s3",
                    uri: this.toS3HttpsUrl(resolved.datasetId),
                },
            };
        }
        if (resolved.source === "bigquery") {
            return {
                dataset_id: requested,
                resolved_source: "bigquery",
                notes: ["BigQuery is reserved for operator-managed connectors. Detailed inspection is not implemented in this patch."],
            };
        }
        const discovery = await this.discover({
            query: resolved.datasetId,
            source: resolved.source || "auto",
            limit: 5,
            publicOnly: options.publicOnly !== false,
        });
        const exact = discovery.results.find(dataset => this.matchesDatasetReference(dataset, requested));
        // NOTE(review): when there is no exact match the first search hit is still
        // returned as `dataset` while `resolved_source` stays undefined — confirm
        // that callers rely on this to detect a non-exact answer.
        return {
            dataset_id: requested,
            resolved_source: exact?.source,
            notes: discovery.notes,
            dataset: exact || discovery.results[0],
        };
    }

    /**
     * Run a discovery query against exactly one provider.
     *
     * @param {string} source - Provider name.
     * @param {string} query - Search text.
     * @param {number} limit - Maximum number of results requested from the provider.
     * @returns {Promise<object[]>} Datasets returned by the provider.
     * @throws {Error} For S3, BigQuery and unknown providers.
     */
    async discoverFromSource(source, query, limit) {
        switch (source) {
            case "huggingface":
                return await new HuggingFaceScraper().scrape(limit, true, await analyzeDatasetQuery(query));
            case "openml":
                return await this.deps.openmlSource.discover(query, limit);
            case "kaggle":
                return await this.deps.kaggleSource.discover(query, limit);
            case "dataworld":
                return await this.deps.dataworldSource.discover(query, limit);
            case "s3":
                throw new Error("S3 does not support search/discovery in the unified gateway. Use a direct s3://bucket/key object reference.");
            case "bigquery":
                throw new Error("BigQuery discovery is not implemented in the unified gateway.");
            default:
                throw new Error(`Unsupported provider: ${source}`);
        }
    }

    /**
     * Decide which providers a discovery request should query.
     *
     * @param {string} source - Explicit provider name or "auto".
     * @param {boolean} publicOnly - When true, "auto" is limited to Hugging Face and OpenML.
     * @param {string[]} notes - Mutated: receives a message for each credentialed provider that is skipped.
     * @returns {string[]} Provider names in query order.
     * @throws {Error} When an explicit source lacks credentials or does not support discovery.
     */
    resolveDiscoverSources(source, publicOnly, notes) {
        if (source !== "auto") {
            if (source === "kaggle" && !this.deps.dataIngestor.hasKaggleCredentials()) {
                throw new Error("Kaggle requires server-managed credentials and none are configured.");
            }
            if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
                throw new Error("data.world requires a server-managed token and none is configured.");
            }
            if (source === "s3" || source === "bigquery") {
                throw new Error(`${source} does not currently support discover operation through the gateway.`);
            }
            return [source];
        }
        const providers = ["huggingface", "openml"];
        if (!publicOnly && this.deps.dataIngestor.hasKaggleCredentials()) {
            providers.push("kaggle");
        }
        else if (!publicOnly) {
            notes.push("Kaggle skipped because no server-managed credentials are configured.");
        }
        if (!publicOnly && this.deps.hasDataWorldToken()) {
            providers.push("dataworld");
        }
        else if (!publicOnly) {
            notes.push("data.world skipped because no server-managed token is configured.");
        }
        return providers;
    }

    /**
     * Split a dataset reference into provider and provider-local id.
     *
     * With an explicit source the matching prefix is stripped (S3 references are
     * kept verbatim). With "auto" the provider is inferred from S3 URI/URL
     * patterns, a `provider:` prefix, an all-digit id (OpenML) or an
     * `owner/name` shape without a colon (Hugging Face).
     *
     * @param {string} datasetId - Raw reference.
     * @param {string} source - Explicit provider name or "auto".
     * @returns {{source?: string, datasetId: string}} `source` is omitted when it cannot be inferred.
     */
    resolveDatasetReference(datasetId, source) {
        const trimmed = datasetId.trim();
        if (source !== "auto") {
            if (source === "s3") {
                return { source, datasetId: trimmed };
            }
            return { source, datasetId: this.stripSourcePrefix(trimmed, source) };
        }
        if (/^s3:\/\//i.test(trimmed) || /^https?:\/\/[^\s]+\.s3[.-][^\s]+/i.test(trimmed) || /^https?:\/\/s3\.[^\s]+amazonaws\.com\//i.test(trimmed)) {
            return { source: "s3", datasetId: trimmed };
        }
        if (/^kaggle:/i.test(trimmed))
            return { source: "kaggle", datasetId: trimmed.replace(/^kaggle:/i, "") };
        if (/^(huggingface|hf):/i.test(trimmed))
            return { source: "huggingface", datasetId: trimmed.replace(/^(huggingface|hf):/i, "") };
        if (/^openml:/i.test(trimmed))
            return { source: "openml", datasetId: trimmed.replace(/^openml:/i, "") };
        if (/^dataworld:/i.test(trimmed))
            return { source: "dataworld", datasetId: trimmed.replace(/^dataworld:/i, "") };
        if (/^bigquery:/i.test(trimmed))
            return { source: "bigquery", datasetId: trimmed.replace(/^bigquery:/i, "") };
        if (/^\d+$/.test(trimmed))
            return { source: "openml", datasetId: trimmed };
        if (trimmed.includes("/") && !trimmed.includes(":"))
            return { source: "huggingface", datasetId: trimmed };
        return { datasetId: trimmed };
    }

    /**
     * Remove a leading `source:` prefix (case-insensitive) from a dataset id.
     * For Hugging Face both `huggingface:` and `hf:` are recognised.
     *
     * @param {string} datasetId - Raw reference.
     * @param {string} source - Provider name supplied by the caller.
     * @returns {string} The id without the prefix, or unchanged when no prefix is present.
     */
    stripSourcePrefix(datasetId, source) {
        if (source === "huggingface") {
            return datasetId.replace(/^(huggingface|hf):/i, "");
        }
        // `source` comes from the caller, so escape regex metacharacters: an
        // unescaped "(" would throw and "a.b" would strip unrelated prefixes.
        const escapedSource = String(source).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        return datasetId.replace(new RegExp(`^${escapedSource}:`, "i"), "");
    }

    /**
     * Look a dataset up in the metadata store, trying the reference as given
     * and with a known provider prefix removed.
     *
     * @param {string} datasetId - Raw reference.
     * @returns {object|undefined} The stored dataset, if any.
     */
    lookupKnownDataset(datasetId) {
        // A Set avoids a second identical lookup when the reference has no prefix.
        const candidates = new Set([
            datasetId,
            datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, ""),
        ]);
        for (const candidate of candidates) {
            const dataset = this.deps.metadataStore.getDataset(candidate);
            if (dataset)
                return dataset;
        }
        return undefined;
    }

    /**
     * Check whether a dataset is the one named by a (possibly prefixed) reference.
     * The comparison is case-insensitive.
     *
     * @param {{id: string|number, source: string}} dataset - Candidate dataset.
     * @param {string} requested - Reference supplied by the caller.
     * @returns {boolean}
     */
    matchesDatasetReference(dataset, requested) {
        const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, "").toLowerCase();
        // Coerce the id: a numeric id would otherwise throw on toLowerCase().
        const datasetId = String(dataset.id).toLowerCase();
        const fullId = `${dataset.source}:${dataset.id}`.toLowerCase();
        return datasetId === normalizedRequested || fullId === requested.toLowerCase();
    }

    /**
     * Map a provider name onto one the data ingestor is asked to handle.
     *
     * @param {string|undefined} source - Provider name.
     * @returns {string|undefined} The same name for huggingface/openml/kaggle/dataworld, otherwise undefined.
     */
    toIngestSource(source) {
        if (source === "huggingface" || source === "openml" || source === "kaggle" || source === "dataworld") {
            return source;
        }
        return undefined;
    }

    /**
     * Compute the sort key used by `discover`:
     * relevance * 1000 + quality * 100 + downloads.
     * Missing or non-numeric fields count as 0 so the result is never NaN.
     *
     * @param {object} dataset - Dataset with optional `relevance_score`, `quality_score`, `downloads`.
     * @returns {number}
     */
    rankDataset(dataset) {
        const toFiniteNumber = value => {
            const parsed = Number(value || 0);
            return Number.isFinite(parsed) ? parsed : 0;
        };
        const relevance = toFiniteNumber(dataset.relevance_score) * 1000;
        const quality = toFiniteNumber(dataset.quality_score) * 100;
        const downloads = toFiniteNumber(dataset.downloads);
        return relevance + quality + downloads;
    }

    /**
     * Copy a downloaded file or directory into `targetDir`, creating the directory if needed.
     *
     * @param {string} localPath - Path produced by the ingestor.
     * @param {string} targetDir - Destination directory.
     * @returns {string} Absolute path of the copy.
     */
    copyDownloadOutput(localPath, targetDir) {
        const resolvedTargetDir = path.resolve(targetDir);
        fs.mkdirSync(resolvedTargetDir, { recursive: true });
        const destination = path.join(resolvedTargetDir, path.basename(localPath));
        // fs.cpSync rejects identical source and destination; nothing to copy in that case.
        if (path.resolve(localPath) === destination) {
            return destination;
        }
        fs.cpSync(localPath, destination, { recursive: true, force: true });
        return destination;
    }

    /**
     * Download a single object over HTTP(S) and register it with the metadata store.
     *
     * @param {string} datasetId - `s3://bucket/key` reference or an http(s) URL.
     * @param {string} [targetDir] - Output directory; defaults to `<dataRoot>/data/raw`.
     * @returns {Promise<string>} Path of the written file.
     */
    async downloadPublicS3Object(datasetId, targetDir) {
        const httpsUrl = this.toS3HttpsUrl(datasetId);
        const parsed = new URL(httpsUrl);
        const fileName = path.basename(parsed.pathname) || "s3-object.bin";
        const outputDir = path.resolve(targetDir || path.join(this.deps.dataRoot, "data", "raw"));
        fs.mkdirSync(outputDir, { recursive: true });
        const outputPath = path.join(outputDir, fileName);
        await this.downloadToFile(httpsUrl, outputPath);
        this.deps.metadataStore.registerDownload(datasetId, outputPath, "completed", fs.statSync(outputPath).size);
        return outputPath;
    }

    /**
     * Convert an `s3://bucket/key` reference into a virtual-hosted HTTPS URL.
     * References that already are http(s) URLs are returned unchanged.
     *
     * @param {string} datasetId - S3 reference or URL.
     * @returns {string} URL to fetch.
     * @throws {Error} When the reference is neither an http(s) URL nor `s3://bucket/key`.
     */
    toS3HttpsUrl(datasetId) {
        // NOTE(review): any http(s) URL passes through untouched, so a request with
        // an explicit source of "s3" makes the gateway fetch arbitrary hosts —
        // confirm that callers restrict this before exposing it to untrusted input.
        if (/^https?:\/\//i.test(datasetId)) {
            return datasetId;
        }
        const match = datasetId.match(/^s3:\/\/([^/]+)\/(.+)$/i);
        if (!match) {
            throw new Error("S3 source expects an s3://bucket/key object reference or a direct HTTPS S3 URL.");
        }
        const bucket = match[1];
        // Encode each path segment separately so the "/" separators survive.
        const objectKey = match[2].split("/").map(encodeURIComponent).join("/");
        return `https://${bucket}.s3.amazonaws.com/${objectKey}`;
    }

    /**
     * Stream the body of an HTTP(S) GET request into a file, following redirects.
     *
     * On any failure after the output file has been opened, the partial file is
     * removed (best effort) before the promise rejects.
     *
     * @param {string} url - Absolute http(s) URL.
     * @param {string} destination - File path to write.
     * @param {number} [maxRedirects=5] - Remaining redirects that may be followed.
     * @returns {Promise<void>} Resolves once the file has been written and closed.
     * @throws {Error} On a non-2xx status, too many redirects, or a network/file error.
     */
    async downloadToFile(url, destination, maxRedirects = 5) {
        await new Promise((resolve, reject) => {
            // URL normalises the scheme to lower case, so "HTTPS://" selects the right transport.
            const transport = new URL(url).protocol === "https:" ? https : http;
            const request = transport.get(url, response => {
                const statusCode = response.statusCode || 0;
                const location = response.headers.location;
                if (statusCode >= 300 && statusCode < 400 && location) {
                    response.resume();
                    if (maxRedirects <= 0) {
                        reject(new Error("Download failed: too many redirects"));
                        return;
                    }
                    let nextUrl;
                    try {
                        // Location may be relative; resolve it against the current URL.
                        nextUrl = new URL(location, url).toString();
                    }
                    catch (error) {
                        reject(error);
                        return;
                    }
                    this.downloadToFile(nextUrl, destination, maxRedirects - 1).then(resolve).catch(reject);
                    return;
                }
                if (statusCode < 200 || statusCode >= 300) {
                    response.resume();
                    reject(new Error(`Download failed with status ${statusCode}`));
                    return;
                }
                const file = fs.createWriteStream(destination);
                let settled = false;
                const fail = error => {
                    if (settled) {
                        return;
                    }
                    settled = true;
                    response.unpipe(file);
                    file.destroy();
                    // Do not leave a truncated file behind that looks like a finished download.
                    fs.rm(destination, { force: true }, () => reject(error));
                };
                response.on("error", fail);
                response.on("close", () => {
                    // pipe() does not end the file when the response is cut short, so
                    // without this check the promise would never settle.
                    if (!response.complete) {
                        fail(new Error("Download aborted before the response completed"));
                    }
                });
                file.on("error", fail);
                file.on("finish", () => {
                    file.close(closeError => {
                        if (closeError) {
                            fail(closeError);
                            return;
                        }
                        if (!settled) {
                            settled = true;
                            resolve();
                        }
                    });
                });
                response.pipe(file);
            });
            request.on("error", reject);
        });
    }
}
|