vesper-wizard 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
package/build/metadata/store.js
DELETED
|
@@ -1,340 +0,0 @@
|
|
|
1
|
-
import Database from "better-sqlite3";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
export class MetadataStore {
|
|
5
|
-
db;
|
|
6
|
-
constructor(dbPath) {
|
|
7
|
-
// Ensure data directory exists
|
|
8
|
-
const dir = path.dirname(dbPath);
|
|
9
|
-
if (!fs.existsSync(dir)) {
|
|
10
|
-
fs.mkdirSync(dir, { recursive: true });
|
|
11
|
-
}
|
|
12
|
-
this.db = new Database(dbPath);
|
|
13
|
-
this.init();
|
|
14
|
-
this.migrate();
|
|
15
|
-
}
|
|
16
|
-
migrate() {
|
|
17
|
-
// Add install_path if missing
|
|
18
|
-
try {
|
|
19
|
-
this.db.exec("ALTER TABLE datasets ADD COLUMN install_path TEXT");
|
|
20
|
-
console.error("[MetadataStore] Migrated: Added install_path column");
|
|
21
|
-
}
|
|
22
|
-
catch (e) {
|
|
23
|
-
// Probably already exists
|
|
24
|
-
}
|
|
25
|
-
}
|
|
26
|
-
init() {
|
|
27
|
-
// Create table with all columns if it doesn't exist
|
|
28
|
-
this.db.exec(`
|
|
29
|
-
CREATE TABLE IF NOT EXISTS datasets (
|
|
30
|
-
id TEXT PRIMARY KEY,
|
|
31
|
-
source TEXT, -- 'huggingface', 'kaggle', 'uci', etc.
|
|
32
|
-
name TEXT,
|
|
33
|
-
description TEXT,
|
|
34
|
-
downloads INTEGER,
|
|
35
|
-
likes INTEGER,
|
|
36
|
-
stars INTEGER,
|
|
37
|
-
tags TEXT, -- JSON string
|
|
38
|
-
license_id TEXT,
|
|
39
|
-
license_category TEXT,
|
|
40
|
-
quality_score INTEGER,
|
|
41
|
-
has_train_split BOOLEAN,
|
|
42
|
-
total_examples INTEGER,
|
|
43
|
-
total_size_mb REAL,
|
|
44
|
-
is_safe_source BOOLEAN,
|
|
45
|
-
is_structured BOOLEAN,
|
|
46
|
-
last_updated TEXT,
|
|
47
|
-
quality_warnings TEXT, -- JSON array string
|
|
48
|
-
metadata_json TEXT, -- Full metadata as JSON
|
|
49
|
-
install_path TEXT -- Path to locally installed data
|
|
50
|
-
);
|
|
51
|
-
|
|
52
|
-
CREATE TABLE IF NOT EXISTS jobs (
|
|
53
|
-
id TEXT PRIMARY KEY,
|
|
54
|
-
type TEXT,
|
|
55
|
-
status TEXT,
|
|
56
|
-
priority INTEGER DEFAULT 0,
|
|
57
|
-
progress INTEGER DEFAULT 0,
|
|
58
|
-
status_text TEXT,
|
|
59
|
-
result_url TEXT,
|
|
60
|
-
error TEXT,
|
|
61
|
-
attempts INTEGER DEFAULT 0,
|
|
62
|
-
max_attempts INTEGER DEFAULT 3,
|
|
63
|
-
created_at TEXT,
|
|
64
|
-
updated_at TEXT,
|
|
65
|
-
metadata TEXT
|
|
66
|
-
);
|
|
67
|
-
|
|
68
|
-
CREATE TABLE IF NOT EXISTS local_files (
|
|
69
|
-
id TEXT PRIMARY KEY,
|
|
70
|
-
local_path TEXT,
|
|
71
|
-
status TEXT, -- 'downloading', 'completed', 'failed'
|
|
72
|
-
size_bytes INTEGER,
|
|
73
|
-
last_checked TEXT,
|
|
74
|
-
error TEXT
|
|
75
|
-
);
|
|
76
|
-
|
|
77
|
-
CREATE TABLE IF NOT EXISTS jobs_archive (
|
|
78
|
-
id TEXT PRIMARY KEY,
|
|
79
|
-
type TEXT,
|
|
80
|
-
status TEXT,
|
|
81
|
-
priority INTEGER DEFAULT 0,
|
|
82
|
-
progress INTEGER DEFAULT 0,
|
|
83
|
-
status_text TEXT,
|
|
84
|
-
result_url TEXT,
|
|
85
|
-
error TEXT,
|
|
86
|
-
attempts INTEGER DEFAULT 0,
|
|
87
|
-
max_attempts INTEGER DEFAULT 3,
|
|
88
|
-
created_at TEXT,
|
|
89
|
-
updated_at TEXT,
|
|
90
|
-
metadata TEXT
|
|
91
|
-
);
|
|
92
|
-
`);
|
|
93
|
-
// Migrate existing tables: add new columns if they don't exist
|
|
94
|
-
const tableInfo = this.db.prepare("PRAGMA table_info(datasets)").all();
|
|
95
|
-
const existingColumns = new Set(tableInfo.map(col => col.name));
|
|
96
|
-
const migrations = [
|
|
97
|
-
{ column: "stars", type: "INTEGER DEFAULT 0" },
|
|
98
|
-
{ column: "total_examples", type: "INTEGER" },
|
|
99
|
-
{ column: "total_size_mb", type: "REAL" },
|
|
100
|
-
{ column: "is_safe_source", type: "BOOLEAN DEFAULT 1" },
|
|
101
|
-
{ column: "is_structured", type: "BOOLEAN DEFAULT 0" },
|
|
102
|
-
{ column: "domain", type: "TEXT" },
|
|
103
|
-
{ column: "source", type: "TEXT DEFAULT 'huggingface'" },
|
|
104
|
-
{ column: "quality_warnings", type: "TEXT" }
|
|
105
|
-
];
|
|
106
|
-
for (const migration of migrations) {
|
|
107
|
-
if (!existingColumns.has(migration.column)) {
|
|
108
|
-
console.error(`[MetadataStore] Migrating: adding column ${migration.column}`);
|
|
109
|
-
this.db.exec(`ALTER TABLE datasets ADD COLUMN ${migration.column} ${migration.type}`);
|
|
110
|
-
}
|
|
111
|
-
}
|
|
112
|
-
// Job migrations
|
|
113
|
-
const jobTableInfo = this.db.prepare("PRAGMA table_info(jobs)").all();
|
|
114
|
-
const existingJobColumns = new Set(jobTableInfo.map(col => col.name));
|
|
115
|
-
const jobMigrations = [
|
|
116
|
-
{ column: "priority", type: "INTEGER DEFAULT 0" },
|
|
117
|
-
{ column: "attempts", type: "INTEGER DEFAULT 0" },
|
|
118
|
-
{ column: "max_attempts", type: "INTEGER DEFAULT 3" }
|
|
119
|
-
];
|
|
120
|
-
for (const migration of jobMigrations) {
|
|
121
|
-
if (!existingJobColumns.has(migration.column)) {
|
|
122
|
-
console.error(`[MetadataStore] Migrating Job: adding column ${migration.column}`);
|
|
123
|
-
this.db.exec(`ALTER TABLE jobs ADD COLUMN ${migration.column} ${migration.type}`);
|
|
124
|
-
}
|
|
125
|
-
}
|
|
126
|
-
// Get updated column list after migrations
|
|
127
|
-
const updatedTableInfo = this.db.prepare("PRAGMA table_info(datasets)").all();
|
|
128
|
-
const allColumns = new Set(updatedTableInfo.map(col => col.name));
|
|
129
|
-
// Create indexes (only if columns exist)
|
|
130
|
-
const indexes = [
|
|
131
|
-
{ name: "idx_license_category", table: "datasets", column: "license_category" },
|
|
132
|
-
{ name: "idx_quality_score", table: "datasets", column: "quality_score" },
|
|
133
|
-
{ name: "idx_downloads", table: "datasets", column: "downloads" },
|
|
134
|
-
{ name: "idx_has_train_split", table: "datasets", column: "has_train_split" },
|
|
135
|
-
{ name: "idx_is_safe_source", table: "datasets", column: "is_safe_source" },
|
|
136
|
-
{ name: "idx_is_structured", table: "datasets", column: "is_structured" },
|
|
137
|
-
{ name: "idx_domain", table: "datasets", column: "domain" },
|
|
138
|
-
{ name: "idx_jobs_status", table: "jobs", column: "status" },
|
|
139
|
-
{ name: "idx_jobs_created", table: "jobs", column: "created_at" },
|
|
140
|
-
{ name: "idx_jobs_archive_created", table: "jobs_archive", column: "created_at" }
|
|
141
|
-
];
|
|
142
|
-
for (const idx of indexes) {
|
|
143
|
-
// For datasets table, check if column exists first
|
|
144
|
-
if (idx.table === "datasets" && !allColumns.has(idx.column))
|
|
145
|
-
continue;
|
|
146
|
-
try {
|
|
147
|
-
this.db.exec(`CREATE INDEX IF NOT EXISTS ${idx.name} ON ${idx.table}(${idx.column})`);
|
|
148
|
-
}
|
|
149
|
-
catch (e) {
|
|
150
|
-
// Ignore index errors
|
|
151
|
-
}
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
saveDataset(dataset) {
|
|
155
|
-
// If the incoming dataset is incomplete, check if we already have a complete one
|
|
156
|
-
if (dataset.is_incomplete) {
|
|
157
|
-
const existing = this.getDataset(dataset.id);
|
|
158
|
-
if (existing && !existing.is_incomplete) {
|
|
159
|
-
// Already have better data, only update stats
|
|
160
|
-
const updateStats = this.db.prepare(`
|
|
161
|
-
UPDATE datasets SET
|
|
162
|
-
downloads = ?,
|
|
163
|
-
likes = ?,
|
|
164
|
-
stars = ?,
|
|
165
|
-
last_updated = ?
|
|
166
|
-
WHERE id = ?
|
|
167
|
-
`);
|
|
168
|
-
updateStats.run(dataset.downloads, dataset.likes, dataset.stars || 0, dataset.last_updated, dataset.id);
|
|
169
|
-
return;
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
const stmt = this.db.prepare(`
|
|
173
|
-
INSERT INTO datasets (
|
|
174
|
-
id, source, name, description, downloads, likes, stars, tags,
|
|
175
|
-
license_id, license_category, quality_score,
|
|
176
|
-
has_train_split, total_examples, total_size_mb,
|
|
177
|
-
is_safe_source, is_structured, last_updated,
|
|
178
|
-
quality_warnings, metadata_json, install_path
|
|
179
|
-
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
180
|
-
ON CONFLICT(id) DO UPDATE SET
|
|
181
|
-
source=excluded.source,
|
|
182
|
-
name=excluded.name,
|
|
183
|
-
description=excluded.description,
|
|
184
|
-
downloads=excluded.downloads,
|
|
185
|
-
likes=excluded.likes,
|
|
186
|
-
stars=excluded.stars,
|
|
187
|
-
tags=excluded.tags,
|
|
188
|
-
license_id=excluded.license_id,
|
|
189
|
-
license_category=excluded.license_category,
|
|
190
|
-
quality_score=excluded.quality_score,
|
|
191
|
-
has_train_split=excluded.has_train_split,
|
|
192
|
-
total_examples=excluded.total_examples,
|
|
193
|
-
total_size_mb=excluded.total_size_mb,
|
|
194
|
-
is_safe_source=excluded.is_safe_source,
|
|
195
|
-
is_structured=excluded.is_structured,
|
|
196
|
-
last_updated=excluded.last_updated,
|
|
197
|
-
quality_warnings=excluded.quality_warnings,
|
|
198
|
-
metadata_json=excluded.metadata_json,
|
|
199
|
-
install_path=excluded.install_path
|
|
200
|
-
`);
|
|
201
|
-
stmt.run(dataset.id, dataset.source, dataset.name, dataset.description, dataset.downloads, dataset.likes, dataset.stars || 0, JSON.stringify(dataset.tags), dataset.license.id, dataset.license.category, dataset.quality_score, dataset.has_train_split ? 1 : 0, dataset.total_examples, dataset.total_size_mb || null, dataset.is_safe_source ? 1 : 0, dataset.is_structured ? 1 : 0, dataset.last_updated, JSON.stringify(dataset.quality_warnings || []), JSON.stringify(dataset), dataset.install_path || null);
|
|
202
|
-
}
|
|
203
|
-
updateInstallPath(id, path) {
|
|
204
|
-
this.db.prepare("UPDATE datasets SET install_path = ? WHERE id = ?").run(path, id);
|
|
205
|
-
}
|
|
206
|
-
getDataset(id) {
|
|
207
|
-
const row = this.db.prepare("SELECT metadata_json, install_path FROM datasets WHERE id = ?").get(id);
|
|
208
|
-
if (!row)
|
|
209
|
-
return null;
|
|
210
|
-
const metadata = JSON.parse(row.metadata_json);
|
|
211
|
-
metadata.install_path = row.install_path || undefined;
|
|
212
|
-
return metadata;
|
|
213
|
-
}
|
|
214
|
-
getAllDatasets() {
|
|
215
|
-
const rows = this.db.prepare("SELECT metadata_json FROM datasets").all();
|
|
216
|
-
return rows.map(r => JSON.parse(r.metadata_json));
|
|
217
|
-
}
|
|
218
|
-
beginTransaction() {
|
|
219
|
-
this.db.exec("BEGIN");
|
|
220
|
-
}
|
|
221
|
-
commit() {
|
|
222
|
-
this.db.exec("COMMIT");
|
|
223
|
-
}
|
|
224
|
-
rollback() {
|
|
225
|
-
this.db.exec("ROLLBACK");
|
|
226
|
-
}
|
|
227
|
-
search(options) {
|
|
228
|
-
let query = "SELECT metadata_json FROM datasets WHERE 1=1";
|
|
229
|
-
const params = [];
|
|
230
|
-
if (options.licenseCategories && options.licenseCategories.length > 0) {
|
|
231
|
-
const placeholders = options.licenseCategories.map(() => "?").join(",");
|
|
232
|
-
query += ` AND license_category IN (${placeholders})`;
|
|
233
|
-
params.push(...options.licenseCategories);
|
|
234
|
-
}
|
|
235
|
-
if (options.minQualityScore !== undefined) {
|
|
236
|
-
query += " AND quality_score >= ?";
|
|
237
|
-
params.push(options.minQualityScore);
|
|
238
|
-
}
|
|
239
|
-
query += " ORDER BY quality_score DESC";
|
|
240
|
-
if (options.limit) {
|
|
241
|
-
query += " LIMIT ?";
|
|
242
|
-
params.push(options.limit);
|
|
243
|
-
}
|
|
244
|
-
const rows = this.db.prepare(query).all(...params);
|
|
245
|
-
return rows.map(r => JSON.parse(r.metadata_json));
|
|
246
|
-
}
|
|
247
|
-
saveJob(job) {
|
|
248
|
-
const upsert = this.db.prepare(`
|
|
249
|
-
INSERT INTO jobs (id, type, status, priority, progress, status_text, result_url, error, attempts, max_attempts, created_at, updated_at, metadata)
|
|
250
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
251
|
-
ON CONFLICT(id) DO UPDATE SET
|
|
252
|
-
status=excluded.status,
|
|
253
|
-
priority=excluded.priority,
|
|
254
|
-
progress=excluded.progress,
|
|
255
|
-
status_text=excluded.status_text,
|
|
256
|
-
result_url=excluded.result_url,
|
|
257
|
-
error=excluded.error,
|
|
258
|
-
attempts=excluded.attempts,
|
|
259
|
-
max_attempts=excluded.max_attempts,
|
|
260
|
-
updated_at=excluded.updated_at,
|
|
261
|
-
metadata=excluded.metadata
|
|
262
|
-
`);
|
|
263
|
-
upsert.run(job.id, job.type, job.status, job.priority, job.progress, job.status_text, job.result_url || null, job.error || null, job.attempts, job.max_attempts, job.created_at, job.updated_at, job.metadata || null);
|
|
264
|
-
}
|
|
265
|
-
getJob(id) {
|
|
266
|
-
const row = this.db.prepare("SELECT * FROM jobs WHERE id = ?").get(id);
|
|
267
|
-
return row || null;
|
|
268
|
-
}
|
|
269
|
-
/**
|
|
270
|
-
* Register or update a local file record
|
|
271
|
-
*/
|
|
272
|
-
registerDownload(id, localPath, status, sizeBytes, error) {
|
|
273
|
-
const upsert = this.db.prepare(`
|
|
274
|
-
INSERT INTO local_files (id, local_path, status, size_bytes, last_checked, error)
|
|
275
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
276
|
-
ON CONFLICT(id) DO UPDATE SET
|
|
277
|
-
local_path=excluded.local_path,
|
|
278
|
-
status=excluded.status,
|
|
279
|
-
size_bytes=excluded.size_bytes,
|
|
280
|
-
last_checked=excluded.last_checked,
|
|
281
|
-
error=excluded.error
|
|
282
|
-
`);
|
|
283
|
-
upsert.run(id, localPath, status, sizeBytes || 0, new Date().toISOString(), error || null);
|
|
284
|
-
}
|
|
285
|
-
/**
|
|
286
|
-
* Get download status and path for a dataset
|
|
287
|
-
*/
|
|
288
|
-
getDownloadStatus(id) {
|
|
289
|
-
return this.db.prepare("SELECT * FROM local_files WHERE id = ?").get(id);
|
|
290
|
-
}
|
|
291
|
-
/**
|
|
292
|
-
* Archive old jobs to the cold storage table.
|
|
293
|
-
* @param days Age in days
|
|
294
|
-
*/
|
|
295
|
-
archiveOldJobs(days) {
|
|
296
|
-
const cutoff = new Date();
|
|
297
|
-
cutoff.setDate(cutoff.getDate() - days);
|
|
298
|
-
const cutoffStr = cutoff.toISOString();
|
|
299
|
-
return this.db.transaction(() => {
|
|
300
|
-
// Copy to archive
|
|
301
|
-
this.db.prepare(`
|
|
302
|
-
INSERT OR IGNORE INTO jobs_archive
|
|
303
|
-
SELECT * FROM jobs
|
|
304
|
-
WHERE created_at < ?
|
|
305
|
-
`).run(cutoffStr);
|
|
306
|
-
// Delete from active jobs
|
|
307
|
-
const info = this.db.prepare(`
|
|
308
|
-
DELETE FROM jobs
|
|
309
|
-
WHERE created_at < ?
|
|
310
|
-
`).run(cutoffStr);
|
|
311
|
-
return info.changes;
|
|
312
|
-
})();
|
|
313
|
-
}
|
|
314
|
-
/**
|
|
315
|
-
* Perform database maintenance (VACUUM, ANALYZE).
|
|
316
|
-
*/
|
|
317
|
-
optimize() {
|
|
318
|
-
console.error("[MetadataStore] Running database optimization (VACUUM, ANALYZE)...");
|
|
319
|
-
this.db.exec("VACUUM");
|
|
320
|
-
this.db.exec("ANALYZE");
|
|
321
|
-
}
|
|
322
|
-
/**
|
|
323
|
-
* Delete all datasets from a specific source
|
|
324
|
-
*/
|
|
325
|
-
deleteBySource(source) {
|
|
326
|
-
const info = this.db.prepare("DELETE FROM datasets WHERE source = ?").run(source);
|
|
327
|
-
console.error(`[MetadataStore] Deleted ${info.changes} datasets from source: ${source}`);
|
|
328
|
-
return info.changes;
|
|
329
|
-
}
|
|
330
|
-
/**
|
|
331
|
-
* Get all dataset IDs from a specific source
|
|
332
|
-
*/
|
|
333
|
-
getDatasetIdsBySource(source) {
|
|
334
|
-
const rows = this.db.prepare("SELECT id FROM datasets WHERE source = ?").all(source);
|
|
335
|
-
return rows.map(r => r.id);
|
|
336
|
-
}
|
|
337
|
-
close() {
|
|
338
|
-
this.db.close();
|
|
339
|
-
}
|
|
340
|
-
}
|
package/build/metadata/types.js
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
const PYTHON_SCRIPT_PATH = path.resolve("src", "python", "uci_adapter.py");
|
|
4
|
-
export class UCIScraper {
|
|
5
|
-
/**
|
|
6
|
-
* Search UCI repository using the Python adapter
|
|
7
|
-
*/
|
|
8
|
-
async scrape(query, limit = 10) {
|
|
9
|
-
return new Promise((resolve, reject) => {
|
|
10
|
-
const pythonProcess = spawn("python", [
|
|
11
|
-
PYTHON_SCRIPT_PATH,
|
|
12
|
-
"--action", "search",
|
|
13
|
-
"--query", query,
|
|
14
|
-
"--limit", String(limit)
|
|
15
|
-
]);
|
|
16
|
-
let output = "";
|
|
17
|
-
let errorOutput = "";
|
|
18
|
-
pythonProcess.stdout.on("data", (data) => {
|
|
19
|
-
output += data.toString();
|
|
20
|
-
});
|
|
21
|
-
pythonProcess.stderr.on("data", (data) => {
|
|
22
|
-
errorOutput += data.toString();
|
|
23
|
-
});
|
|
24
|
-
pythonProcess.on("close", (code) => {
|
|
25
|
-
if (code !== 0) {
|
|
26
|
-
// It's possible for python to emit stderr warnings but still succeed
|
|
27
|
-
// But exit code != 0 is definitely an error
|
|
28
|
-
console.error(`[UCIScraper] Process exited with code ${code}: ${errorOutput}`);
|
|
29
|
-
resolve([]); // Fail gracefully by returning empty
|
|
30
|
-
return;
|
|
31
|
-
}
|
|
32
|
-
try {
|
|
33
|
-
const results = JSON.parse(output);
|
|
34
|
-
if (results.error) {
|
|
35
|
-
console.error(`[UCIScraper] Internal error: ${results.error}`);
|
|
36
|
-
resolve([]);
|
|
37
|
-
}
|
|
38
|
-
else {
|
|
39
|
-
resolve(results);
|
|
40
|
-
}
|
|
41
|
-
}
|
|
42
|
-
catch (e) {
|
|
43
|
-
console.error(`[UCIScraper] JSON parse error: ${e.message}. Output: ${output.substring(0, 100)}...`);
|
|
44
|
-
resolve([]);
|
|
45
|
-
}
|
|
46
|
-
});
|
|
47
|
-
});
|
|
48
|
-
}
|
|
49
|
-
}
|
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
export class MockErrorTracker {
|
|
2
|
-
exceptions = [];
|
|
3
|
-
messages = [];
|
|
4
|
-
captureException(error, context) {
|
|
5
|
-
console.error(`[ErrorTracker] Exception captured: ${error.message}`);
|
|
6
|
-
this.exceptions.push({ error, context });
|
|
7
|
-
}
|
|
8
|
-
captureMessage(message, level = "info") {
|
|
9
|
-
console.error(`[ErrorTracker] Message captured (${level}): ${message}`);
|
|
10
|
-
this.messages.push({ message, level });
|
|
11
|
-
}
|
|
12
|
-
}
|
|
13
|
-
export class ObservabilityService {
|
|
14
|
-
errorTracker;
|
|
15
|
-
jobSuccessCounter = new Map();
|
|
16
|
-
jobFailureCounter = new Map();
|
|
17
|
-
jobDurationHistogram = new Map();
|
|
18
|
-
constructor(errorTracker = new MockErrorTracker()) {
|
|
19
|
-
this.errorTracker = errorTracker;
|
|
20
|
-
}
|
|
21
|
-
recordJobSuccess(type, durationMs) {
|
|
22
|
-
// Increment success counter
|
|
23
|
-
this.jobSuccessCounter.set(type, (this.jobSuccessCounter.get(type) || 0) + 1);
|
|
24
|
-
// Record duration
|
|
25
|
-
this.updateMetricRecord(type, durationMs);
|
|
26
|
-
}
|
|
27
|
-
recordJobFailure(type, error) {
|
|
28
|
-
// Increment failure counter
|
|
29
|
-
this.jobFailureCounter.set(type, (this.jobFailureCounter.get(type) || 0) + 1);
|
|
30
|
-
// Track error
|
|
31
|
-
this.errorTracker.captureException(error, { jobType: type });
|
|
32
|
-
}
|
|
33
|
-
getPrometheusMetrics() {
|
|
34
|
-
let out = "# HELP jobs_processed_total Total number of jobs successfully processed\n";
|
|
35
|
-
out += "# TYPE jobs_processed_total counter\n";
|
|
36
|
-
for (const [type, count] of this.jobSuccessCounter) {
|
|
37
|
-
out += `jobs_processed_total{type="${type}"} ${count}\n`;
|
|
38
|
-
}
|
|
39
|
-
out += "\n# HELP jobs_failed_total Total number of failed jobs\n";
|
|
40
|
-
out += "# TYPE jobs_failed_total counter\n";
|
|
41
|
-
for (const [type, count] of this.jobFailureCounter) {
|
|
42
|
-
out += `jobs_failed_total{type="${type}"} ${count}\n`;
|
|
43
|
-
}
|
|
44
|
-
out += "\n# HELP job_duration_seconds_sum Latency of job processing in seconds\n";
|
|
45
|
-
out += "# TYPE job_duration_seconds_sum counter\n";
|
|
46
|
-
for (const [type, record] of this.jobDurationHistogram) {
|
|
47
|
-
out += `job_duration_seconds_sum{type="${type}"} ${record.sum / 1000}\n`;
|
|
48
|
-
out += `job_duration_seconds_count{type="${type}"} ${record.count}\n`;
|
|
49
|
-
out += `job_duration_seconds_max{type="${type}"} ${record.max / 1000}\n`;
|
|
50
|
-
}
|
|
51
|
-
return out;
|
|
52
|
-
}
|
|
53
|
-
getStats() {
|
|
54
|
-
const stats = {};
|
|
55
|
-
for (const [type, record] of this.jobDurationHistogram) {
|
|
56
|
-
stats[type] = {
|
|
57
|
-
successCount: this.jobSuccessCounter.get(type) || 0,
|
|
58
|
-
failureCount: this.jobFailureCounter.get(type) || 0,
|
|
59
|
-
avgDuration: (record.sum / record.count).toFixed(2) + "ms",
|
|
60
|
-
maxDuration: record.max + "ms"
|
|
61
|
-
};
|
|
62
|
-
}
|
|
63
|
-
return stats;
|
|
64
|
-
}
|
|
65
|
-
updateMetricRecord(type, value) {
|
|
66
|
-
let record = this.jobDurationHistogram.get(type);
|
|
67
|
-
if (!record) {
|
|
68
|
-
record = { count: 0, sum: 0, min: value, max: value };
|
|
69
|
-
this.jobDurationHistogram.set(type, record);
|
|
70
|
-
}
|
|
71
|
-
record.count++;
|
|
72
|
-
record.sum += value;
|
|
73
|
-
record.min = Math.min(record.min, value);
|
|
74
|
-
record.max = Math.max(record.max, value);
|
|
75
|
-
}
|
|
76
|
-
}
|
|
@@ -1,75 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
export class TargetDetector {
    // Interpreter used to launch the engine ("py" launcher on Windows).
    pythonPath = "python";
    // Resolved absolute path to target_engine.py.
    scriptPath;
    constructor(buildDir = process.cwd()) {
        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        // Probe the same candidate locations the other services use, in priority order.
        const candidates = [
            path.resolve(dataRoot, "python", "target_engine.py"),
            path.resolve(buildDir, "python", "target_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "target_engine.py"),
            path.resolve(buildDir, "..", "python", "target_engine.py"),
        ];
        const found = candidates.find((candidate) => fs.existsSync(candidate));
        // Fall back to the first candidate even when nothing exists on disk, so a
        // later spawn() failure still reports a meaningful path.
        this.scriptPath = found !== undefined ? found : candidates[0];
        if (process.platform === "win32") {
            this.pythonPath = "py";
        }
    }
    /**
     * Detect probable target columns in a dataset
     */
    async detectTarget(filePath) {
        return this.runPython("detect", [filePath]);
    }
    /**
     * Validate a specific column as a target
     */
    async validateTarget(filePath, targetColumn) {
        return this.runPython("validate", [filePath, targetColumn]);
    }
    // Spawn the Python engine, buffer its output, and resolve with the parsed
    // JSON result. Rejects on non-zero exit, on a JSON `error` field, or on
    // unparseable stdout.
    async runPython(action, args) {
        return new Promise((resolve, reject) => {
            const child = spawn(this.pythonPath, [this.scriptPath, action, ...args], {
                env: { ...process.env, PYTHONIOENCODING: 'utf-8' }
            });
            let outBuf = "";
            let errBuf = "";
            child.stdout.on("data", (chunk) => { outBuf += chunk.toString(); });
            child.stderr.on("data", (chunk) => { errBuf += chunk.toString(); });
            child.on("close", (exitCode) => {
                if (exitCode !== 0) {
                    reject(new Error(`Target Detector (${action}) failed: ${errBuf}`));
                    return;
                }
                try {
                    const parsed = JSON.parse(outBuf);
                    if (parsed.error) {
                        reject(new Error(parsed.error));
                    } else {
                        resolve(parsed);
                    }
                } catch (e) {
                    reject(new Error(`Failed to parse target detector output: ${outBuf}\nError: ${e}`));
                }
            });
        });
    }
}
|
|
Binary file
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
import argparse
|
|
2
|
-
import asyncio
|
|
3
|
-
import json
|
|
4
|
-
import os
|
|
5
|
-
import sys
|
|
6
|
-
import warnings
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import Any, Dict
|
|
9
|
-
|
|
10
|
-
# Suppress noisy HF warnings
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
warnings.filterwarnings("ignore", message=".*legacy.*")

# Make this script's directory importable so the bundled `vesper` package
# resolves regardless of the caller's working directory.
CURRENT_DIR = Path(__file__).resolve().parent
if str(CURRENT_DIR) not in sys.path:
    sys.path.insert(0, str(CURRENT_DIR))

# Project-local imports; these must come after the sys.path tweak above.
from vesper.core.asset_downloader import AssetDownloader
from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def _print(payload: Dict[str, Any]) -> None:
|
|
23
|
-
print(json.dumps(payload, ensure_ascii=False))
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
    """Run one asset-download job described by the JSON payload in *args*.

    Returns ``{"ok": True, "result": ...}``; any failure propagates to the
    caller (``main`` converts it into an error response).
    """
    request = json.loads(args.payload)
    root = request.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
    target_dir = request.get("output_dir")
    worker_count = int(request.get("workers") or 8)

    # Export the HF token (if supplied) so downstream hub calls authenticate.
    auth_token = request.get("token") or request.get("hf_token")
    if auth_token:
        os.environ["HF_TOKEN"] = str(auth_token)

    downloader = AssetDownloader(
        output_root=root,
        workers=worker_count,
        recipes_dir=request.get("recipes_dir"),
    )

    result = await downloader.download_assets(
        dataset_id=str(request.get("dataset_id")),
        source=request.get("source"),
        repo_id=request.get("repo_id"),
        kaggle_ref=request.get("kaggle_ref"),
        urls=request.get("urls"),
        output_format=request.get("output_format", "webdataset"),
        output_dir=str(target_dir) if target_dir else None,
        max_items=request.get("max_items"),
        image_column=request.get("image_column"),
    )
    return {"ok": True, "result": result}
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def main() -> None:
    """CLI entry point: dispatch one action and print a single JSON response.

    Invoked as ``engine.py <action> <json-payload>``. Every outcome — success
    or failure — is reported on stdout as a JSON object with an ``ok`` flag.
    """
    parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
    parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
    parser.add_argument("payload", help="JSON payload")
    args = parser.parse_args()

    try:
        if args.action == "download":
            # Download parses its own payload inside _run_download.
            _print(asyncio.run(_run_download(args)))
            return

        payload = json.loads(args.payload)

        if args.action == "build_recipe":
            recipe = build_download_recipe(payload)
            saved = save_recipe(recipe, payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe, "saved_to": saved})
            return

        if args.action == "get_recipe":
            recipe = get_download_recipe(str(payload.get("dataset_id")), payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe})
            return

        _print({"ok": False, "error": f"Unknown action: {args.action}"})
    except Exception as e:
        message = str(e)
        # Translate common failure signatures into actionable guidance.
        if "401" in message or "403" in message or "Unauthorized" in message:
            message = (
                "Authentication required. This dataset may be gated/private. "
                "Use configure_keys tool to set HF_TOKEN, then retry."
            )
        elif "No image column" in message:
            message += " Hint: specify image_column parameter with the name of the column containing images."
        _print({"ok": False, "error": message})
|
91
|
-
|
|
92
|
-
|
|
93
|
-
# Script entry point (invoked by the Node wrapper with an action + JSON payload).
if __name__ == "__main__":
    main()
|