vesper-wizard 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
package/build/quality/types.js
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
import { Embedder } from "../search/embedder.js";
|
|
2
|
-
import { VectorStore } from "../search/vector-store.js";
|
|
3
|
-
import { MetadataStore } from "../metadata/store.js";
|
|
4
|
-
import path from "path";
|
|
5
|
-
async function main() {
|
|
6
|
-
const dbPath = path.join(process.cwd(), "data", "metadata.db");
|
|
7
|
-
const vectorPath = path.join(process.cwd(), "data", "vectors.json");
|
|
8
|
-
const metadataStore = new MetadataStore(dbPath);
|
|
9
|
-
const vectorStore = new VectorStore(vectorPath);
|
|
10
|
-
const embedder = Embedder.getInstance();
|
|
11
|
-
const datasets = metadataStore.getAllDatasets();
|
|
12
|
-
const indexedIds = new Set(vectorStore.getAllIds());
|
|
13
|
-
// Filter to only new datasets
|
|
14
|
-
const toIndex = datasets.filter(ds => !indexedIds.has(ds.id));
|
|
15
|
-
console.error(`Total datasets: ${datasets.length}, Already indexed: ${indexedIds.size}, To index: ${toIndex.length}`);
|
|
16
|
-
const BATCH_SIZE = 20;
|
|
17
|
-
let processed = 0;
|
|
18
|
-
for (let i = 0; i < toIndex.length; i += BATCH_SIZE) {
|
|
19
|
-
const batch = toIndex.slice(i, i + BATCH_SIZE);
|
|
20
|
-
try {
|
|
21
|
-
// Prepare texts for batch embedding
|
|
22
|
-
const texts = batch.map(ds => [
|
|
23
|
-
ds.name,
|
|
24
|
-
ds.description,
|
|
25
|
-
`Task: ${ds.task}`,
|
|
26
|
-
`Languages: ${ds.languages?.join(", ") || ""}`,
|
|
27
|
-
`Tags: ${ds.tags?.join(" ") || ""}`
|
|
28
|
-
].join(" ").slice(0, 1500));
|
|
29
|
-
// Memory-safe sequential embedding (avoids OOM on large libraries)
|
|
30
|
-
for (let idx = 0; idx < batch.length; idx++) {
|
|
31
|
-
const ds = batch[idx];
|
|
32
|
-
try {
|
|
33
|
-
const vector = await embedder.embed(texts[idx]);
|
|
34
|
-
vectorStore.add(ds.id, vector);
|
|
35
|
-
}
|
|
36
|
-
catch (err) {
|
|
37
|
-
console.error(`Failed to index ${ds.id}:`, err);
|
|
38
|
-
}
|
|
39
|
-
}
|
|
40
|
-
processed += batch.length;
|
|
41
|
-
if (processed % 100 === 0 || i + BATCH_SIZE >= toIndex.length) {
|
|
42
|
-
console.error(`Indexed ${processed}/${toIndex.length} new datasets...`);
|
|
43
|
-
vectorStore.save();
|
|
44
|
-
}
|
|
45
|
-
}
|
|
46
|
-
catch (err) {
|
|
47
|
-
console.error(`Batch around ${i} failed:`, err);
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
vectorStore.save();
|
|
51
|
-
console.error("Vector indexing complete.");
|
|
52
|
-
metadataStore.close();
|
|
53
|
-
}
|
|
54
|
-
main().catch(console.error);
|
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
import Database from "better-sqlite3";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
async function main() {
|
|
5
|
-
const dbPath = path.join(process.cwd(), "data", "metadata.db");
|
|
6
|
-
if (!fs.existsSync(dbPath)) {
|
|
7
|
-
console.error("Database not found. Run 'npm run scrape' first.");
|
|
8
|
-
process.exit(1);
|
|
9
|
-
}
|
|
10
|
-
const db = new Database(dbPath);
|
|
11
|
-
try {
|
|
12
|
-
// Get total count
|
|
13
|
-
const count = db.prepare("SELECT COUNT(*) as count FROM datasets").get();
|
|
14
|
-
console.log(`\nTotal datasets in database: ${count.count}\n`);
|
|
15
|
-
// Check which columns exist
|
|
16
|
-
const tableInfo = db.prepare("PRAGMA table_info(datasets)").all();
|
|
17
|
-
const columns = tableInfo.map(col => col.name);
|
|
18
|
-
const hasNewColumns = columns.includes("is_safe_source");
|
|
19
|
-
// Get basic statistics (works with both old and new schema)
|
|
20
|
-
const stats = db.prepare(`
|
|
21
|
-
SELECT
|
|
22
|
-
COUNT(*) as total,
|
|
23
|
-
SUM(downloads) as total_downloads,
|
|
24
|
-
AVG(quality_score) as avg_quality,
|
|
25
|
-
SUM(CASE WHEN license_category = 'safe' THEN 1 ELSE 0 END) as safe_licenses,
|
|
26
|
-
SUM(CASE WHEN has_train_split = 1 THEN 1 ELSE 0 END) as with_train_split
|
|
27
|
-
FROM datasets
|
|
28
|
-
`).get();
|
|
29
|
-
console.log("Statistics:");
|
|
30
|
-
console.log(` Total downloads: ${stats.total_downloads?.toLocaleString() || 0}`);
|
|
31
|
-
console.log(` Average quality score: ${Math.round(stats.avg_quality || 0)}`);
|
|
32
|
-
console.log(` Safe licenses: ${stats.safe_licenses || 0}`);
|
|
33
|
-
console.log(` With train split: ${stats.with_train_split || 0}`);
|
|
34
|
-
// Show extended stats if new schema is available
|
|
35
|
-
if (hasNewColumns) {
|
|
36
|
-
const extendedStats = db.prepare(`
|
|
37
|
-
SELECT
|
|
38
|
-
SUM(CASE WHEN is_safe_source = 1 THEN 1 ELSE 0 END) as safe_sources,
|
|
39
|
-
SUM(CASE WHEN is_structured = 1 THEN 1 ELSE 0 END) as structured,
|
|
40
|
-
SUM(total_examples) as total_examples
|
|
41
|
-
FROM datasets
|
|
42
|
-
`).get();
|
|
43
|
-
console.log(` Safe sources: ${extendedStats.safe_sources || 0}`);
|
|
44
|
-
console.log(` Structured datasets: ${extendedStats.structured || 0}`);
|
|
45
|
-
console.log(` Total examples: ${extendedStats.total_examples?.toLocaleString() || 0}`);
|
|
46
|
-
}
|
|
47
|
-
else {
|
|
48
|
-
console.log(` WARNING: Database uses old schema. Re-scrape to get extended statistics.`);
|
|
49
|
-
}
|
|
50
|
-
console.log();
|
|
51
|
-
// Top 5 by downloads
|
|
52
|
-
const top5 = db.prepare(`
|
|
53
|
-
SELECT id, name, downloads, quality_score, license_category
|
|
54
|
-
FROM datasets
|
|
55
|
-
ORDER BY downloads DESC
|
|
56
|
-
LIMIT 5
|
|
57
|
-
`).all();
|
|
58
|
-
console.log("Top 5 datasets by downloads:");
|
|
59
|
-
top5.forEach((ds, i) => {
|
|
60
|
-
console.log(` ${i + 1}. ${ds.id}`);
|
|
61
|
-
console.log(` Downloads: ${ds.downloads.toLocaleString()}, Quality: ${ds.quality_score}, License: ${ds.license_category}`);
|
|
62
|
-
});
|
|
63
|
-
console.log();
|
|
64
|
-
}
|
|
65
|
-
catch (error) {
|
|
66
|
-
console.error("Error reading database:", error);
|
|
67
|
-
process.exit(1);
|
|
68
|
-
}
|
|
69
|
-
finally {
|
|
70
|
-
db.close();
|
|
71
|
-
}
|
|
72
|
-
}
|
|
73
|
-
main();
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import Database from "better-sqlite3";
|
|
2
|
-
import path from "path";
|
|
3
|
-
// Checking all plausible databases for jobs
|
|
4
|
-
const dbs = ["metadata.db", "vesper.db", "datasets.db"];
|
|
5
|
-
for (const dbName of dbs) {
|
|
6
|
-
const dbPath = path.resolve("data", dbName);
|
|
7
|
-
try {
|
|
8
|
-
const db = new Database(dbPath);
|
|
9
|
-
const tables = db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='jobs'").all();
|
|
10
|
-
if (tables.length > 0) {
|
|
11
|
-
console.log(`\n--- Checking jobs in ${dbName} ---`);
|
|
12
|
-
const jobs = db.prepare("SELECT * FROM jobs ORDER BY created_at DESC LIMIT 20").all();
|
|
13
|
-
for (const job of jobs) {
|
|
14
|
-
if (JSON.stringify(job).toLowerCase().includes("naruto")) {
|
|
15
|
-
console.log(JSON.stringify(job, null, 2));
|
|
16
|
-
}
|
|
17
|
-
}
|
|
18
|
-
}
|
|
19
|
-
db.close();
|
|
20
|
-
}
|
|
21
|
-
catch (e) {
|
|
22
|
-
// Silently skip if DB doesn't exist or table missing
|
|
23
|
-
}
|
|
24
|
-
}
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
import Database from "better-sqlite3";
|
|
2
|
-
import path from "path";
|
|
3
|
-
// Trying metadata.db which is larger
|
|
4
|
-
const dbPath = path.resolve("data", "metadata.db");
|
|
5
|
-
const db = new Database(dbPath);
|
|
6
|
-
try {
|
|
7
|
-
const query = "naruto";
|
|
8
|
-
const results = db.prepare("SELECT * FROM datasets WHERE name LIKE ? OR description LIKE ?").all(`%${query}%`, `%${query}%`);
|
|
9
|
-
console.log(`Found ${results.length} results for "${query}" in metadata.db:`);
|
|
10
|
-
console.log(JSON.stringify(results, null, 2));
|
|
11
|
-
}
|
|
12
|
-
catch (e) {
|
|
13
|
-
console.error("Error checking database:", e.message);
|
|
14
|
-
}
|
|
15
|
-
finally {
|
|
16
|
-
db.close();
|
|
17
|
-
}
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
/**
|
|
3
|
-
* Cleanup script to remove all Kaggle datasets from Vesper
|
|
4
|
-
*/
|
|
5
|
-
import { fileURLToPath } from "url";
|
|
6
|
-
import path from "path";
|
|
7
|
-
import { MetadataStore } from "../metadata/store.js";
|
|
8
|
-
import { VectorStore } from "../search/vector-store.js";
|
|
9
|
-
const __filename = fileURLToPath(import.meta.url);
|
|
10
|
-
const __dirname = path.dirname(__filename);
|
|
11
|
-
const homeDir = process.env.HOME || process.env.USERPROFILE || path.join(__dirname, "..");
|
|
12
|
-
const dataRoot = path.join(homeDir, ".vesper");
|
|
13
|
-
const dbPath = path.join(dataRoot, "data", "metadata.db");
|
|
14
|
-
const vectorPath = path.join(dataRoot, "data", "vectors.json");
|
|
15
|
-
console.log("🧹 Vesper Kaggle Cleanup");
|
|
16
|
-
console.log("========================\n");
|
|
17
|
-
try {
|
|
18
|
-
const metadataStore = new MetadataStore(dbPath);
|
|
19
|
-
const vectorStore = new VectorStore(vectorPath);
|
|
20
|
-
// Get all Kaggle dataset IDs
|
|
21
|
-
const kaggleIds = metadataStore.getDatasetIdsBySource("kaggle");
|
|
22
|
-
console.log(`Found ${kaggleIds.length} Kaggle datasets in database`);
|
|
23
|
-
if (kaggleIds.length === 0) {
|
|
24
|
-
console.log("✅ No Kaggle datasets to remove");
|
|
25
|
-
process.exit(0);
|
|
26
|
-
}
|
|
27
|
-
// Delete from vector store
|
|
28
|
-
const vectorsDeleted = vectorStore.deleteMany(kaggleIds);
|
|
29
|
-
console.log(`🗑️ Deleted ${vectorsDeleted} vectors from vector store`);
|
|
30
|
-
vectorStore.save();
|
|
31
|
-
// Delete from metadata database
|
|
32
|
-
const datasetsDeleted = metadataStore.deleteBySource("kaggle");
|
|
33
|
-
console.log(`🗑️ Deleted ${datasetsDeleted} datasets from metadata database`);
|
|
34
|
-
metadataStore.close();
|
|
35
|
-
console.log("\n✅ Cleanup complete! Kaggle datasets have been removed.");
|
|
36
|
-
console.log(" You can now search without seeing Kaggle results.");
|
|
37
|
-
}
|
|
38
|
-
catch (error) {
|
|
39
|
-
console.error("❌ Cleanup failed:", error.message);
|
|
40
|
-
process.exit(1);
|
|
41
|
-
}
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
import { PipelineExecutor } from "../cleaning/executor.js";
|
|
2
|
-
import { ScriptGenerator } from "../cleaning/exporter.js";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
import path from "path";
|
|
5
|
-
async function main() {
|
|
6
|
-
console.log(" Vesper Dataset Ops Engine: Full Demo\n");
|
|
7
|
-
const executor = new PipelineExecutor();
|
|
8
|
-
const exporter = new ScriptGenerator();
|
|
9
|
-
const demoFile = path.join(process.cwd(), "vesper_demo_data.csv");
|
|
10
|
-
// 1. Create a Realistic Dirty Dataset
|
|
11
|
-
// - duplicate: Duplicate Customer
|
|
12
|
-
// - age: Mixed types ("25", "twenty"), Outliers (200)
|
|
13
|
-
// - email: PII
|
|
14
|
-
// - empty_col: 100% missing
|
|
15
|
-
// - score: Good data
|
|
16
|
-
const csvContent = `customer_id,age,email,score,empty_col
|
|
17
|
-
C001,25,john.doe@example.com,88.5,
|
|
18
|
-
C002,"30",jane.smith@work.org,92.0,
|
|
19
|
-
C003,200,bob.jones@gmail.com,15.0,
|
|
20
|
-
C001,25,john.doe@example.com,88.5,
|
|
21
|
-
C004,"forty",alice@co.uk,80.0,
|
|
22
|
-
C005,35,,75.0,`;
|
|
23
|
-
fs.writeFileSync(demoFile, csvContent);
|
|
24
|
-
console.log(`📦 Created dirty dataset: ${demoFile}`);
|
|
25
|
-
console.log(`Contains: Duplicates, PII (Emails), Mixed Types (Age), Outliers, Empty Columns.\n`);
|
|
26
|
-
try {
|
|
27
|
-
// 2. Run the Auto-Cleaning Pipeline
|
|
28
|
-
console.log(" Running Auto-Cleaning Pipeline...");
|
|
29
|
-
const result = await executor.runPipeline("demo-dataset", demoFile);
|
|
30
|
-
console.log("\n --- Quality Inspection Report ---");
|
|
31
|
-
console.log(` Duplicates: ${result.initial_quality.duplicate_rows} rows`);
|
|
32
|
-
console.log(` PII Warnings: ${result.initial_quality.pii_warnings?.length || 0}`);
|
|
33
|
-
if (result.initial_quality.schema_warnings.length > 0) {
|
|
34
|
-
console.log(" Schema Issues:");
|
|
35
|
-
result.initial_quality.schema_warnings.forEach(w => console.log(` ⚠️ ${w}`));
|
|
36
|
-
}
|
|
37
|
-
console.log("\n --- Generated Cleaning Plan ---");
|
|
38
|
-
result.plan.operations.forEach((op, i) => {
|
|
39
|
-
console.log(` ${i + 1}. [${op.type}] Reason: ${op.reason}`);
|
|
40
|
-
});
|
|
41
|
-
console.log("\n --- Execution Result ---");
|
|
42
|
-
if (result.cleaning_result.success) {
|
|
43
|
-
console.log(` Success! Cleaned file saved to:`);
|
|
44
|
-
console.log(` ${result.final_output_path}`);
|
|
45
|
-
}
|
|
46
|
-
else {
|
|
47
|
-
console.error(` Failed: ${result.cleaning_result.error}`);
|
|
48
|
-
}
|
|
49
|
-
// 3. Generate Reproducibility Script
|
|
50
|
-
console.log("\n --- Reproducibility ---");
|
|
51
|
-
const pythonScript = exporter.generatePythonScript(result.plan, demoFile);
|
|
52
|
-
const scriptPath = path.join(process.cwd(), "demo_cleaning_script.py");
|
|
53
|
-
fs.writeFileSync(scriptPath, pythonScript);
|
|
54
|
-
console.log(` Generated Python script: ${scriptPath}`);
|
|
55
|
-
console.log(` (You can run this script independently to reproduce these steps!)`);
|
|
56
|
-
console.log("\n Demo Complete.");
|
|
57
|
-
}
|
|
58
|
-
catch (error) {
|
|
59
|
-
console.error("\n Demo failed:", error);
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
main().catch(console.error);
|
package/build/scripts/demo-ui.js
DELETED
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
import { SearchEngine } from "../search/engine.js";
|
|
2
|
-
import { MetadataStore } from "../metadata/store.js";
|
|
3
|
-
import { VectorStore } from "../search/vector-store.js";
|
|
4
|
-
import { Embedder } from "../search/embedder.js";
|
|
5
|
-
import { formatSearchResults, formatDatasetInfo } from "../tools/formatter.js";
|
|
6
|
-
import path from "path";
|
|
7
|
-
/**
|
|
8
|
-
* Demo script to showcase the new formatted UI
|
|
9
|
-
*/
|
|
10
|
-
async function main() {
|
|
11
|
-
const dbPath = path.join(process.cwd(), "data", "metadata.db");
|
|
12
|
-
const vectorPath = path.join(process.cwd(), "data", "vectors.json");
|
|
13
|
-
const store = new MetadataStore(dbPath);
|
|
14
|
-
const vectorStore = new VectorStore(vectorPath);
|
|
15
|
-
const embedder = Embedder.getInstance();
|
|
16
|
-
await embedder.init();
|
|
17
|
-
const engine = new SearchEngine(store, vectorStore, embedder);
|
|
18
|
-
console.log("\n" + "═".repeat(80));
|
|
19
|
-
console.log("VESPER UI DEMO - Formatted Search Results");
|
|
20
|
-
console.log("═".repeat(80) + "\n");
|
|
21
|
-
// Demo 1: Medical datasets
|
|
22
|
-
console.log("Demo 1: Medical Dataset Search\n");
|
|
23
|
-
const medicalResults = await engine.search("diabetes prediction machine learning", {
|
|
24
|
-
limit: 3,
|
|
25
|
-
safeOnly: true
|
|
26
|
-
});
|
|
27
|
-
console.log(formatSearchResults(medicalResults));
|
|
28
|
-
// Demo 2: Detailed dataset info
|
|
29
|
-
if (medicalResults.length > 0) {
|
|
30
|
-
console.log("\n" + "═".repeat(80));
|
|
31
|
-
console.log("Demo 2: Detailed Dataset Information");
|
|
32
|
-
console.log("═".repeat(80) + "\n");
|
|
33
|
-
console.log(formatDatasetInfo(medicalResults[0]));
|
|
34
|
-
}
|
|
35
|
-
// Demo 3: Computer Vision
|
|
36
|
-
console.log("\n" + "═".repeat(80));
|
|
37
|
-
console.log("Demo 3: Computer Vision Dataset Search");
|
|
38
|
-
console.log("═".repeat(80) + "\n");
|
|
39
|
-
const cvResults = await engine.search("image classification cats dogs", {
|
|
40
|
-
limit: 3,
|
|
41
|
-
safeOnly: true
|
|
42
|
-
});
|
|
43
|
-
console.log(formatSearchResults(cvResults));
|
|
44
|
-
// Demo 4: Show quality warnings
|
|
45
|
-
console.log("\n" + "═".repeat(80));
|
|
46
|
-
console.log("Demo 4: Niche Query with Quality Warnings");
|
|
47
|
-
console.log("═".repeat(80) + "\n");
|
|
48
|
-
const nicheResults = await engine.search("ancient manuscript text recognition", {
|
|
49
|
-
limit: 3,
|
|
50
|
-
safeOnly: true
|
|
51
|
-
});
|
|
52
|
-
console.log(formatSearchResults(nicheResults));
|
|
53
|
-
console.log("\n" + "═".repeat(80));
|
|
54
|
-
console.log("Demo Complete!");
|
|
55
|
-
console.log("═".repeat(80) + "\n");
|
|
56
|
-
store.close();
|
|
57
|
-
}
|
|
58
|
-
main().catch(console.error);
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
import { QualityAnalyzer } from "../quality/analyzer.js";
|
|
2
|
-
import { PipelineExecutor } from "../cleaning/executor.js";
|
|
3
|
-
import { DataSplitter } from "../splitting/splitter.js";
|
|
4
|
-
import { MetadataPackager } from "../export/packager.js";
|
|
5
|
-
import { DataExporter } from "../export/exporter.js";
|
|
6
|
-
import fs from "fs";
|
|
7
|
-
import path from "path";
|
|
8
|
-
async function main() {
|
|
9
|
-
console.log(" Vesper Data Ops Engine - End-to-End Pipeline Demo\n");
|
|
10
|
-
const sessionDir = path.join(process.cwd(), "e2e_demo_output");
|
|
11
|
-
if (fs.existsSync(sessionDir))
|
|
12
|
-
fs.rmSync(sessionDir, { recursive: true, force: true });
|
|
13
|
-
fs.mkdirSync(sessionDir);
|
|
14
|
-
const rawFile = path.join(sessionDir, "raw_data.csv");
|
|
15
|
-
// --- STEP 0: Create Raw "Dirty" Data ---
|
|
16
|
-
console.log(" Step 0: Initializing Raw Dataset...");
|
|
17
|
-
let content = "id,name,age,salary,joined_date\n";
|
|
18
|
-
content += "1,Alice,25,50000,2023-01-01\n";
|
|
19
|
-
content += "2,Bob,,60000,2023-01-05\n"; // Missing age
|
|
20
|
-
content += "3,Charlie,35,70000,2023-01-10\n";
|
|
21
|
-
content += "1,Alice,25,50000,2023-01-01\n"; // Duplicate
|
|
22
|
-
content += "4,Diana,40,invalid,2023-02-01\n"; // Type mismatch
|
|
23
|
-
fs.writeFileSync(rawFile, content);
|
|
24
|
-
// --- STEP 1: Quality Analysis ---
|
|
25
|
-
console.log(" Step 1: Running Quality Analysis...");
|
|
26
|
-
const analyzer = new QualityAnalyzer();
|
|
27
|
-
const report = await analyzer.analyze(rawFile);
|
|
28
|
-
console.log(` - Quality Score: ${report.overall_score}/100`);
|
|
29
|
-
console.log(` - Warnings Found: ${report.warnings.length}`);
|
|
30
|
-
// --- STEP 2: Auto-Cleaning ---
|
|
31
|
-
console.log(" Step 2: Generating and Executing Cleaning Pipeline...");
|
|
32
|
-
const executor = new PipelineExecutor();
|
|
33
|
-
const pipelineResult = await executor.runPipeline("demo-dataset", rawFile);
|
|
34
|
-
const cleanedFile = pipelineResult.final_output_path;
|
|
35
|
-
console.log(` - Cleaned file: ${path.basename(cleanedFile)}`);
|
|
36
|
-
// --- STEP 3: Smart Splitting ---
|
|
37
|
-
console.log(" Step 3: Splitting into Train/Val/Test/Holdout (Stratified)...");
|
|
38
|
-
const splitter = new DataSplitter();
|
|
39
|
-
const splitResult = await splitter.split(cleanedFile, {
|
|
40
|
-
type: "random", // Using random since dummy data too small for stratified
|
|
41
|
-
ratios: { train: 0.6, val: 0.2, test: 0.1, holdout: 0.1 },
|
|
42
|
-
shuffle: true,
|
|
43
|
-
random_seed: 42
|
|
44
|
-
});
|
|
45
|
-
console.log(` - Splits created: ${Object.keys(splitResult.paths).join(", ")}`);
|
|
46
|
-
// --- STEP 4: Export to Parquet ---
|
|
47
|
-
console.log(" Step 4: Exporting Final Train Set to Parquet...");
|
|
48
|
-
const exporter = new DataExporter();
|
|
49
|
-
const parquetFile = path.join(sessionDir, "train_final.parquet");
|
|
50
|
-
await exporter.export(splitResult.paths.train, parquetFile, "parquet");
|
|
51
|
-
console.log(` - Exported: ${path.basename(parquetFile)}`);
|
|
52
|
-
// --- STEP 5: Metadata Packaging ---
|
|
53
|
-
console.log(" Step 5: Creating Final Data Package...");
|
|
54
|
-
const packager = new MetadataPackager();
|
|
55
|
-
const packageResult = await packager.createPackage(path.join(sessionDir, "vesper_package"), [
|
|
56
|
-
{ path: parquetFile, format: "parquet", name: "train-set" },
|
|
57
|
-
{ path: splitResult.paths.holdout, format: "csv", name: "holdout-set" }
|
|
58
|
-
], {
|
|
59
|
-
name: "Vesper E2E Demo",
|
|
60
|
-
version: "1.0.0",
|
|
61
|
-
description: "Automatically cleaned and partitioned dataset.",
|
|
62
|
-
license: "MIT",
|
|
63
|
-
author: "Vesper Engine"
|
|
64
|
-
}, { qualityReport: report });
|
|
65
|
-
console.log(" Pipeline Finished Successfully!");
|
|
66
|
-
console.log(` Package Location: ${packageResult.packagePath}`);
|
|
67
|
-
console.log(` Manifest: datapackage.json created.`);
|
|
68
|
-
}
|
|
69
|
-
main().catch(err => {
|
|
70
|
-
console.error(" Pipeline Failed:", err);
|
|
71
|
-
process.exit(1);
|
|
72
|
-
});
|
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
2
|
-
import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
|
|
3
|
-
import { MetadataStore } from "../metadata/store.js";
|
|
4
|
-
import path from "path";
|
|
5
|
-
/**
 * Realistic massive scraper: get the maximum from HuggingFace plus extensive
 * Kaggle coverage.
 *
 * Phase 1: Bulk Discovery (Skeleton Indexing) - calls `scraper.scrapeBulk`
 *          with a limit of 30,000 and saves every returned dataset inside a
 *          single store transaction.
 * Phase 2: Kaggle Discovery - runs only when both KAGGLE_USERNAME and
 *          KAGGLE_KEY are set; each search term is scraped in turn and
 *          previously unseen datasets are saved one at a time.
 *
 * Progress is reported on stderr. If an unexpected error escapes either phase,
 * the store is closed first and the process then exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const scraper = new HuggingFaceScraper();
    const dbPath = path.join(process.cwd(), "data", "metadata.db");
    const store = new MetadataStore(dbPath);
    // Keyed by dataset id so datasets that are already known are not counted as new.
    const allDatasets = new Map();
    let failed = false;
    try {
        // Load existing datasets to avoid duplicates
        console.error(`\nLoading existing datasets from database...`);
        const existing = store.getAllDatasets();
        for (const ds of existing) {
            allDatasets.set(ds.id, ds);
        }
        console.error(`Found ${existing.length} existing datasets in database`);
        // Phase 1: Bulk discovery from HuggingFace
        console.error(`\nPhase 1: Bulk HuggingFace Discovery (Target: 30,000)`);
        // The token is only inspected here to pick the log message; this
        // function does not pass it to the scraper.
        const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
        if (hfToken) {
            console.error(`Using HuggingFace token (rate limits should be higher)`);
        }
        else {
            console.error(`WARNING: No HF_TOKEN found. Bulk scraping may be slower.`);
        }
        const hfLimit = 30000;
        const hfDatasets = await scraper.scrapeBulk(hfLimit);
        let newHfCount = 0;
        for (const ds of hfDatasets) {
            if (!allDatasets.has(ds.id)) {
                allDatasets.set(ds.id, ds);
                newHfCount++;
            }
        }
        console.error(`HuggingFace Bulk Discovery: ${newHfCount} new datasets (${allDatasets.size} total unique)`);
        // Save HF progress
        console.error(`Saving HF discovery results to database...`);
        store.beginTransaction();
        try {
            for (const ds of hfDatasets) {
                store.saveDataset(ds);
            }
            store.commit();
        }
        catch (e) {
            // A failed save is logged but does not abort the run; Phase 2 still executes.
            store.rollback();
            console.error("Failed to save HF discovery progress:", e);
        }
        // Phase 2: Extensive Kaggle scraping across many categories
        const kaggleUser = process.env.KAGGLE_USERNAME;
        const kaggleKey = process.env.KAGGLE_KEY;
        if (kaggleUser && kaggleKey) {
            console.error(`\nPhase 2: Extensive Kaggle scraping`);
            const kaggleScraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
            // Comprehensive Kaggle search terms
            const kaggleSearches = [
                "machine learning", "deep learning", "data science",
                "classification", "regression", "clustering", "anomaly detection",
                "natural language processing", "text classification", "sentiment analysis",
                "image classification", "object detection", "medical imaging",
                "time series", "forecasting", "financial forecasting",
                "healthcare", "medical diagnosis", "e-commerce", "social media"
            ];
            for (const search of kaggleSearches) {
                console.error(` [Kaggle: "${search}"] Fetching...`);
                try {
                    const kaggleDatasets = await kaggleScraper.scrape(search, 100, true);
                    let newKaggleCount = 0;
                    for (const ds of kaggleDatasets) {
                        // Prefix the id before the duplicate check and before saving.
                        ds.id = `kaggle:${ds.id}`;
                        if (!allDatasets.has(ds.id)) {
                            allDatasets.set(ds.id, ds);
                            newKaggleCount++;
                            store.saveDataset(ds); // Save individually to avoid long transactions
                        }
                    }
                    console.error(` ${newKaggleCount} new datasets (${kaggleDatasets.length} total fetched, ${allDatasets.size} total unique)`);
                }
                catch (e) {
                    // One failing search term must not stop the remaining terms.
                    console.error(` ERROR: ${e.message}`);
                }
                // Fixed 2 second pause between search terms.
                await new Promise(resolve => setTimeout(resolve, 2000));
            }
        }
        console.error(`\nBulk Discovery Complete!`);
        console.error(`Total unique datasets in library: ${allDatasets.size}`);
        console.error(`\nNext step: Run 'npm run index' to update vectors.`);
    }
    catch (error) {
        console.error("\nERROR: Massive scraping failed:", error);
        // Do not call process.exit() here: it terminates the process at once,
        // so the finally block below would never run and the store would stay open.
        failed = true;
    }
    finally {
        store.close();
    }
    if (failed) {
        process.exit(1);
    }
}
|
|
103
|
-
// main() handles failures inside its own try block, but the constructors it
// runs before that block can still throw and reject the promise. Report that
// case and exit non-zero instead of leaving the rejection unhandled.
main().catch((error) => {
    console.error("\nERROR: Massive scraping failed:", error);
    process.exit(1);
});
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import { MetadataStore } from "../metadata/store.js";
|
|
2
|
-
import { JobManager } from "../jobs/manager.js";
|
|
3
|
-
import path from "path";
|
|
4
|
-
/**
 * Render one snapshot of the operations dashboard to the console.
 *
 * Opens a MetadataStore on `data/vesper.db` (resolved against the current
 * working directory), obtains the JobManager instance for it, clears the
 * console and prints either a "no metrics" notice or the stats summary as JSON
 * followed by the raw Prometheus export.
 *
 * The store is opened anew on every call, so it is always closed before
 * returning, including when reading or printing the metrics throws.
 *
 * @returns {Promise<void>}
 */
async function showDashboard() {
    const dbPath = path.resolve("data", "vesper.db");
    const store = new MetadataStore(dbPath);
    try {
        // In a real app, this would be a shared instance in a long-running process
        const manager = JobManager.getInstance(store);
        const stats = manager.stats;
        console.clear();
        console.log("==========================================");
        console.log(" VESPER OPERATIONS DASHBOARD ");
        console.log("==========================================\n");
        const prometheus = stats.getPrometheusMetrics();
        const summary = stats.getStats();
        if (Object.keys(summary).length === 0) {
            console.log("No metrics recorded yet. Process some jobs to see data.");
        }
        else {
            console.log("--- SYSTEM METRICS (JSON) ---");
            console.log(JSON.stringify(summary, null, 2));
            console.log("\n--- PROMETHEUS EXPORT (RAW) ---");
            console.log(prometheus);
        }
        console.log("\n==========================================");
        console.log(" Press Ctrl+C to exit dashboard loop ");
        console.log("==========================================");
    }
    finally {
        // Without this, a throwing metrics call would leave the store open.
        store.close();
    }
}
|
|
30
|
-
// Simple loop for "real-time" feel (simulated)
console.log("Starting Dashboard...");
// showDashboard() returns a promise; catch its rejection here so that one
// failed refresh is logged instead of surfacing as an unhandled rejection.
const refreshDashboard = () => {
    showDashboard().catch((error) => {
        console.error("Dashboard refresh failed:", error);
    });
};
refreshDashboard();
setInterval(refreshDashboard, 5000);
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import { JobManager } from "../jobs/manager.js";
|
|
2
|
-
import { MetadataStore } from "../metadata/store.js";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
/**
 * Reproduction script for the job listener flow.
 *
 * Creates a fresh `repro_test.db` store, registers a "processJob" listener
 * that runs a trivial task through the supplied `execute` callback, creates a
 * "prepare" job, waits two seconds, and prints the job's final status fields.
 *
 * The store is closed and the temporary database file removed even when a step
 * throws, so a failed run does not leave `repro_test.db` behind.
 *
 * @returns {Promise<void>}
 */
async function repro() {
    const dbPath = "repro_test.db";
    // Used both before the run (stale file from an earlier run) and after it.
    const removeDbFile = () => {
        if (fs.existsSync(dbPath)) {
            fs.unlinkSync(dbPath);
        }
    };
    removeDbFile();
    const store = new MetadataStore(dbPath);
    try {
        const jobManager = JobManager.getInstance(store);
        console.log("Setting up listener...");
        jobManager.on("processJob", async (job, execute) => {
            console.log(`Listener received job ${job.id}`);
            const task = async () => {
                console.log("Running task...");
                return "success";
            };
            try {
                await execute(task);
                console.log("Execute finished");
            }
            catch (e) {
                console.error("Execute failed in listener:", e.message);
            }
        });
        console.log("Creating job...");
        const job = jobManager.createJob("prepare", 0, { query: "test" });
        console.log(`Job created: ${job.id}`);
        // Wait for a bit
        await new Promise(r => setTimeout(r, 2000));
        const finalJob = store.getJob(job.id);
        console.log("Final job status:", finalJob?.status);
        console.log("Final job status text:", finalJob?.status_text);
    }
    finally {
        try {
            store.close();
        }
        finally {
            // Remove the file even if closing the store itself failed.
            removeDbFile();
        }
    }
}
|
|
37
|
-
// Run the reproduction; a rejection is passed to console.error.
repro().catch((failure) => console.error(failure));
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import path from "path";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
import { spawnSync } from "child_process";
|
|
4
|
-
// Command used for both Python invocations in this script.
const pythonPath = "python";
// The cleaner under test, located relative to the current working directory.
const scriptPath = path.join(process.cwd(), "src", "python", "cleaner.py");
// Scratch directory for the generated fixture and helper script.
const testDir = path.join(process.cwd(), "test_repro");
// `recursive: true` makes this a no-op when the directory already exists,
// which replaces the separate existence check and its check-then-create race.
fs.mkdirSync(testDir, { recursive: true });
|
|
9
|
-
/**
 * Reproduce the CSV export bug.
 *
 * 1. Writes a small Python script into `testDir` and runs it to create a
 *    Parquet file containing list and struct columns.
 * 2. Runs `cleaner.py` on that file with an empty operations list ("[]") and
 *    the "csv" output format, then prints the exit code, stdout and stderr.
 *    When the exit code is 0, stdout is parsed as JSON and its `success`,
 *    `output_path` and `error` fields are reported.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the Parquet fixture could not be created, since running
 *   the cleaner without its input would not reproduce anything.
 */
async function runRepro() {
    console.log("=== Reproducing CSV Export Bug ===\n");
    const parquetFile = path.join(testDir, "test_nested.parquet");
    // 1. Create a Parquet file with nested data (Lists/Structs) using Python
    console.log("Creating nested Parquet file...");
    // JSON.stringify yields a quoted, escaped literal that Python also accepts,
    // so quotes or backslashes in the path cannot break the generated script.
    const createScript = `
import polars as pl
df = pl.DataFrame({
    "id": [1, 2, 3],
    "tags": [["a", "b"], ["c"], []],
    "meta": [{"score": 0.9, "safe": True}, {"score": 0.4, "safe": False}, {"score": 0.1, "safe": True}]
})
df.write_parquet(${JSON.stringify(parquetFile)})
`;
    const createScriptPath = path.join(testDir, "create_data.py");
    fs.writeFileSync(createScriptPath, createScript);
    const created = spawnSync(pythonPath, [createScriptPath], { stdio: 'inherit' });
    if (created.error || created.status !== 0) {
        // `error` is set when the process could not be spawned at all;
        // otherwise report the exit code or the terminating signal.
        const reason = created.error
            ? created.error.message
            : created.signal
                ? `terminated by signal ${created.signal}`
                : `exit code ${created.status}`;
        throw new Error(`Failed to create nested Parquet file: ${reason}`);
    }
    // 2. Call cleaner.py to convert to CSV
    console.log("Calling cleaner.py to convert to CSV...");
    const result = spawnSync(pythonPath, [
        scriptPath,
        parquetFile,
        "[]",
        "csv"
    ]);
    console.log("Exit Code:", result.status);
    console.log("Stdout:", result.stdout?.toString());
    console.log("Stderr:", result.stderr?.toString());
    if (result.error) {
        console.error("Failed to launch cleaner.py:", result.error);
        return;
    }
    if (result.status !== 0) {
        console.error("cleaner.py exited with a non-zero status.");
        return;
    }
    try {
        const data = JSON.parse(result.stdout.toString());
        if (data.success) {
            console.log("SUCCESS! Output file:", data.output_path);
            if (fs.existsSync(data.output_path)) {
                console.log("File exists on disk.");
            }
            else {
                console.error("Reported output file is missing on disk.");
            }
        }
        else {
            console.error("cleaner.py reported failure:", data.error);
        }
    }
    catch (e) {
        console.error("Failed to parse JSON output:", e);
    }
}
|
|
55
|
-
// Run the reproduction once. The statement appeared twice in the original,
// which ran the whole reproduction a second time.
runRepro().catch(console.error);
|