vesper-wizard 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +1 -1
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
package/build/jobs/manager.js
DELETED
|
@@ -1,136 +0,0 @@
|
|
|
1
|
-
import { v4 as uuidv4 } from "uuid";
|
|
2
|
-
import { EventEmitter } from "events";
|
|
3
|
-
import { QueueService } from "./queue.js";
|
|
4
|
-
import { ObservabilityService } from "../monitoring/observability.js";
|
|
5
|
-
/**
 * Coordinates background jobs: persists them through `store`, orders them with a
 * QueueService, and hands each dequeued job to whoever listens for "processJob".
 *
 * Events emitted:
 *  - "jobCreated" (job)        after a job is saved and enqueued
 *  - "processJob" (job, run)   when a worker slot picks a job up; the listener must
 *                              call `run(task)` where `task` is a function returning
 *                              (a promise of) the result URL
 *  - "jobUpdated" (job)        after every persisted update
 */
export class JobManager extends EventEmitter {
    store;
    static instance;
    queue;
    stats;
    maxConcurrency = 2;
    activeWorkers = 0;

    /**
     * @param {object} store - Persistence layer; must expose `saveJob(job)` and `getJob(id)`.
     * @param {object} [stats] - Metrics sink exposing `recordJobSuccess` / `recordJobFailure`;
     *   a fresh ObservabilityService is created when omitted.
     */
    constructor(store, stats) {
        super();
        this.store = store;
        this.queue = new QueueService();
        this.stats = stats || new ObservabilityService();
    }

    /**
     * Return the shared instance, creating it on first use. Arguments are only
     * honoured by the call that actually creates the instance.
     */
    static getInstance(store, stats) {
        if (!JobManager.instance) {
            JobManager.instance = new JobManager(store, stats);
        }
        return JobManager.instance;
    }

    /**
     * Set max parallel workers
     */
    setConcurrency(count) {
        this.maxConcurrency = count;
    }

    /**
     * Create a new job and add it to the priority queue
     *
     * @param {string} type - Job type, forwarded to the stats recorder.
     * @param {number} [priority=0] - Priority value handed to the queue.
     * @param {object} [metadata] - Arbitrary payload; stored as a JSON string.
     * @param {number} [maxAttempts=3] - Stored on the job as `max_attempts`.
     * @returns {object} The job record that was saved and enqueued.
     */
    createJob(type, priority = 0, metadata, maxAttempts = 3) {
        const now = new Date().toISOString();
        const job = {
            id: uuidv4(),
            type,
            status: "pending",
            priority,
            progress: 0,
            status_text: "Job created",
            attempts: 0,
            max_attempts: maxAttempts,
            created_at: now,
            updated_at: now,
            metadata: metadata ? JSON.stringify(metadata) : undefined
        };
        this.store.saveJob(job);
        this.queue.enqueue(job);
        this.emit("jobCreated", job);
        // Start processing background queue
        this.processQueue();
        return job;
    }

    /**
     * Background loop to process queued jobs: keeps dispatching until either
     * every worker slot is taken or the queue is empty.
     */
    async processQueue() {
        for (;;) {
            if (this.activeWorkers >= this.maxConcurrency) {
                return;
            }
            const job = this.queue.dequeue();
            if (!job) {
                return;
            }
            this.#dispatch(job);
        }
    }

    /**
     * Occupy a worker slot for `job` and hand it to the "processJob" listeners.
     * The slot is always released exactly once, whatever the listener does.
     */
    #dispatch(job) {
        this.activeWorkers++;
        this.updateJob(job.id, { status: "running", status_text: "Picked up by worker" });
        const startTime = Date.now();
        const listeners = this.listenerCount("processJob");
        console.error(`[JobManager] Emitting processJob for ${job.id}. Active listeners: ${listeners}`);
        // Guards against the wrapper being invoked twice (e.g. two listeners),
        // which would otherwise release the same slot twice.
        let claimed = false;
        // In a real system, we'd have a registry of handlers for each JobType.
        // For now, we emit an event so the orchestrator can run it.
        const run = async (jobExecutionTask) => {
            console.error(`[JobManager] Wrapper received jobExecutionTask: ${typeof jobExecutionTask}`);
            if (claimed) {
                console.error(`[JobManager] Ignoring duplicate execution request for ${job.id}`);
                return;
            }
            claimed = true;
            try {
                if (typeof jobExecutionTask !== 'function') {
                    console.error(`[JobManager] Error: jobExecutionTask is NOT a function! It is: ${typeof jobExecutionTask}`);
                    this.updateJob(job.id, {
                        status: "failed",
                        status_text: "Internal error: jobExecutionTask is not a function"
                    });
                    // The `finally` below still runs, so the slot is not leaked.
                    return;
                }
                const resultUrl = await jobExecutionTask();
                const duration = Date.now() - startTime;
                this.updateJob(job.id, {
                    status: "completed",
                    progress: 100,
                    status_text: "Completed successfully",
                    result_url: resultUrl || undefined
                });
                this.stats.recordJobSuccess(job.type, duration);
            }
            catch (error) {
                const reason = JobManager.#describe(error);
                if (this.queue.shouldRetry(job)) {
                    this.updateJob(job.id, {
                        status: "retrying",
                        // shouldRetry bumped the in-memory counter; persist it too.
                        attempts: job.attempts,
                        status_text: `Failed: ${reason}. Retrying...`
                    });
                }
                else {
                    this.updateJob(job.id, {
                        status: "failed",
                        status_text: "Max retries exceeded",
                        error: reason
                    });
                    this.stats.recordJobFailure(job.type, error);
                }
            }
            finally {
                this.#release(job.id);
                this.processQueue(); // Look for next job
            }
        };
        const delivered = this.emit("processJob", job, run);
        if (!delivered && !claimed) {
            // Nobody is listening: without this the job would sit in "running"
            // forever and permanently occupy a worker slot.
            claimed = true;
            this.updateJob(job.id, {
                status: "failed",
                status_text: "Internal error: no processJob handler registered"
            });
            this.#release(job.id);
        }
    }

    /** Free the worker slot held for `jobId`. */
    #release(jobId) {
        this.queue.finalize(jobId);
        this.activeWorkers--;
    }

    /** Human-readable text for anything that can be thrown (Error, string, null, ...). */
    static #describe(error) {
        return error?.message || String(error);
    }

    /**
     * Update job status and progress
     *
     * Merges `updates` over the stored job, refreshes `updated_at`, saves the
     * result and emits "jobUpdated". Does nothing when the id is unknown.
     * The caller's `updates` object is left untouched.
     */
    updateJob(id, updates) {
        const job = this.store.getJob(id);
        if (!job) {
            return;
        }
        const changes = { ...updates };
        // Correctly handle metadata update if it's an object
        if (changes.metadata && typeof changes.metadata !== 'string') {
            changes.metadata = JSON.stringify(changes.metadata);
        }
        const updatedJob = {
            ...job,
            ...changes,
            updated_at: new Date().toISOString()
        };
        this.store.saveJob(updatedJob);
        this.emit("jobUpdated", updatedJob);
    }
}
|
package/build/jobs/queue.js
DELETED
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
/**
 * In-memory priority queue for jobs.
 *
 * `queue` holds waiting jobs ordered by priority (high first) and then by
 * `created_at` (oldest first); `running` maps job id -> job for jobs that have
 * been dequeued and not yet finalized or re-queued.
 */
export class QueueService {
    queue = [];
    running = new Map();

    /**
     * Adds a job to the queue, sorted by priority.
     * Sets `job.status` to "queued" on the passed object.
     */
    enqueue(job) {
        job.status = "queued";
        this.queue.push(job);
        this.sortQueue();
    }

    /**
     * Gets the next job from the queue.
     * The job is marked "running" and tracked until `finalize` is called.
     *
     * @returns {object|undefined} The next job, or undefined when the queue is empty.
     */
    dequeue() {
        const job = this.queue.shift();
        if (job) {
            job.status = "running";
            this.running.set(job.id, job);
        }
        return job;
    }

    /**
     * Mark a job as finished (completed or failed permanently).
     */
    finalize(id) {
        this.running.delete(id);
    }

    /**
     * Handles retry logic with exponential backoff (simulated).
     *
     * When the job still has attempts left, its counter is incremented and it is
     * put back on the queue immediately (status "queued").
     *
     * @returns {boolean} true when the job was re-queued, false when attempts are exhausted.
     */
    shouldRetry(job) {
        if (job.attempts < job.max_attempts) {
            job.attempts++;
            // The job goes back to waiting: stop tracking it as running so that
            // getStats() does not count it under both `pending` and `running`.
            this.running.delete(job.id);
            // In a real system, we'd schedule this for later.
            // For now, we'll re-enqueue it immediately.
            this.enqueue(job);
            return true;
        }
        return false;
    }

    /**
     * @returns {{pending: number, running: number, total: number}} Current queue sizes.
     */
    getStats() {
        const pending = this.queue.length;
        const running = this.running.size;
        return { pending, running, total: pending + running };
    }

    /**
     * Re-sort the waiting list in place.
     */
    sortQueue() {
        // A missing or unparsable created_at would make the comparator return NaN,
        // which gives an unspecified order; such jobs are treated as timestamp 0.
        const toTime = (job) => {
            const time = new Date(job.created_at).getTime();
            return Number.isNaN(time) ? 0 : time;
        };
        // Priority high to low, then created_at old to new
        this.queue.sort((a, b) => {
            if (b.priority !== a.priority) {
                return b.priority - a.priority;
            }
            return toTime(a) - toTime(b);
        });
    }
}
|
package/build/jobs/types.js
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|
package/build/lib/supabase.js
DELETED
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
import os from "os";
|
|
5
|
-
/**
 * Thin client around the `dataworld_engine.py` script: each call spawns the
 * Python interpreter with a sub-command and parses the JSON printed on stdout.
 */
export class DataWorldSource {
    pythonPath = "python";
    scriptPath;

    /**
     * @param {string} [buildDir=process.cwd()] - Directory used to look for the
     *   engine script and for a `.venv` interpreter.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        // Lookup order: ~/.vesper/python, <buildDir>/python, <buildDir>/../src/python.
        const candidates = [
            path.resolve(dataRoot, "python", "dataworld_engine.py"),
            path.resolve(buildDir, "python", "dataworld_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "dataworld_engine.py"),
        ];
        // When none exists, keep the first candidate so the spawn error names that path.
        this.scriptPath = candidates.find((candidate) => fs.existsSync(candidate)) ?? candidates[0];

        const isWindows = process.platform === "win32";
        const venvPython = isWindows
            ? path.resolve(buildDir, ".venv", "Scripts", "python.exe")
            : path.resolve(buildDir, ".venv", "bin", "python");
        if (fs.existsSync(venvPython)) {
            this.pythonPath = venvPython;
        }
        else if (isWindows) {
            this.pythonPath = "py";
        }
    }

    /**
     * Run the engine's `discover` sub-command.
     *
     * @param {string} query - Search text passed to the engine.
     * @param {number} [limit=20] - Passed to the engine as a string argument.
     * @returns {Promise<Array>} `results` from the engine output, or [] when absent.
     * @throws {Error} When the engine cannot be run or reports `ok` as falsy.
     */
    async discover(query, limit = 20) {
        const result = await this.run(["discover", query, String(limit)]);
        if (!result?.ok) {
            throw new Error(result?.error || "data.world discover failed");
        }
        return (result.results || []);
    }

    /**
     * Run the engine's `download` sub-command.
     *
     * @param {string} datasetRef - Dataset reference passed to the engine.
     * @param {string} [targetDir] - Optional destination, appended only when truthy.
     * @returns {Promise<{local_path: *, target_dir: *}>} Fields copied from the engine output.
     * @throws {Error} When the engine cannot be run or reports `ok` as falsy.
     */
    async download(datasetRef, targetDir) {
        const args = ["download", datasetRef];
        if (targetDir) {
            args.push(targetDir);
        }
        const result = await this.run(args);
        if (!result?.ok) {
            throw new Error(result?.error || "data.world download failed");
        }
        return {
            local_path: result.local_path,
            target_dir: result.target_dir,
        };
    }

    /**
     * Spawn the engine with `args` and resolve with its parsed JSON output.
     *
     * Rejects when the process cannot be started, exits with a non-zero code,
     * or prints something that is not valid JSON.
     */
    run(args) {
        return new Promise((resolve, reject) => {
            const proc = spawn(this.pythonPath, [this.scriptPath, ...args], {
                env: process.env
            });
            let stdout = "";
            let stderr = "";
            proc.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            proc.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // Without an 'error' listener a missing interpreter raises an unhandled
            // 'error' event (crashing the process) and this promise never settles.
            proc.on("error", (err) => {
                reject(new Error(`Failed to start data.world engine (${this.pythonPath}): ${err.message}`, { cause: err }));
            });
            proc.on("close", (code) => {
                if (code !== 0) {
                    return reject(new Error(`data.world engine exited with code ${code}: ${stderr}`));
                }
                try {
                    const parsed = JSON.parse(stdout.trim());
                    resolve(parsed);
                }
                catch (e) {
                    reject(new Error(`Failed to parse data.world engine output: ${stdout}`, { cause: e }));
                }
            });
        });
    }
}
|
package/build/metadata/domain.js
DELETED
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
// Keyword table used by classifyDomain: each hit on a keyword adds `weight`
// to the score of its domain.
const DOMAIN_KEYWORDS = [
    {
        domain: "medicine",
        keywords: ["medical", "medicine", "clinical", "diagnosis", "patient", "disease", "symptom", "treatment", "drug", "pharmaceutical", "healthcare", "hospital", "doctor", "physician", "medical imaging", "radiology", "pathology", "oncology", "cardiology"],
        weight: 10
    },
    {
        domain: "healthcare",
        keywords: ["health", "healthcare", "wellness", "public health", "epidemiology", "biomedical", "health data", "medical records", "ehr", "electronic health"],
        weight: 8
    },
    {
        domain: "security",
        keywords: ["security", "cybersecurity", "malware", "vulnerability", "threat", "attack", "defense", "encryption", "privacy", "authentication", "authorization", "penetration testing", "intrusion"],
        weight: 10
    },
    {
        domain: "finance",
        keywords: ["finance", "financial", "banking", "trading", "stock", "market", "investment", "credit", "loan", "mortgage", "cryptocurrency", "bitcoin", "blockchain", "accounting"],
        weight: 9
    },
    {
        domain: "education",
        keywords: ["education", "learning", "student", "academic", "curriculum", "pedagogy", "teaching", "school", "university", "course", "tutorial"],
        weight: 8
    },
    {
        domain: "science",
        keywords: ["scientific", "research", "experiment", "laboratory", "physics", "chemistry", "biology", "astronomy", "geology", "environmental"],
        weight: 7
    },
    {
        domain: "technology",
        keywords: ["software", "programming", "code", "algorithm", "system", "application", "development", "engineering", "technical"],
        weight: 5
    },
    {
        domain: "social",
        keywords: ["social", "society", "community", "demographic", "census", "population", "sociology", "anthropology", "culture"],
        weight: 6
    },
    {
        domain: "legal",
        keywords: ["legal", "law", "court", "judicial", "litigation", "contract", "regulation", "compliance", "legislation"],
        weight: 9
    },
    {
        domain: "business",
        keywords: ["business", "corporate", "enterprise", "company", "organization", "management", "marketing", "sales", "customer"],
        weight: 6
    },
    {
        domain: "multimedia",
        keywords: ["image", "video", "audio", "multimedia", "visual", "photography", "music", "sound"],
        weight: 7
    },
    {
        domain: "nlp",
        keywords: ["natural language", "nlp", "text", "language model", "translation", "sentiment", "text classification", "named entity", "ner", "question answering", "qa"],
        weight: 8
    },
    {
        domain: "computer-vision",
        keywords: ["computer vision", "cv", "image classification", "object detection", "segmentation", "visual", "optical", "camera", "facial recognition"],
        weight: 8
    }
];

// Keywords up to this length ("qa", "cv", "ner", "ehr", "law", "nlp") are matched
// as whole words only; as plain substrings they hit unrelated words such as
// "winner", "qatar" or "flaw" and, with weights >= 5, decide the domain alone.
const SHORT_KEYWORD_MAX_LENGTH = 3;

/** Build a predicate `(lowercasedText) => boolean` for one keyword. */
function buildKeywordMatcher(keyword) {
    const needle = keyword.toLowerCase();
    if (needle.length > SHORT_KEYWORD_MAX_LENGTH) {
        // Longer keywords keep substring semantics so inflections still match
        // ("images" contains "image").
        return (text) => text.includes(needle);
    }
    const escaped = needle.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // Whole word, allowing a plural "s" ("laws"). No /g flag, so the regex is stateless.
    const pattern = new RegExp(`\\b${escaped}s?\\b`);
    return (text) => pattern.test(text);
}

// Matchers are compiled once rather than on every classifyDomain call.
const DOMAIN_MATCHERS = DOMAIN_KEYWORDS.map(({ domain, keywords, weight }) => ({
    domain,
    weight,
    matchers: keywords.map(buildKeywordMatcher),
}));

/**
 * Guess the subject domain of a dataset from its textual metadata.
 *
 * Every domain scores `weight * (number of its keywords found)` over the
 * lower-cased concatenation of description, name, tags and task. The
 * highest-scoring domain wins; on equal scores the domain listed first in
 * DOMAIN_KEYWORDS is kept (so "medicine" is preferred over "healthcare").
 *
 * @param {string} description - Free-text description; null/undefined is treated as empty.
 * @param {string[]|string} tags - Tag list (a single string is accepted as one tag);
 *   anything else is ignored.
 * @param {string} name - Dataset name; null/undefined is treated as empty.
 * @param {string} [task] - Task type, also used as a fallback hint when no keyword matches.
 * @returns {string} A domain id from DOMAIN_KEYWORDS, or "general" when nothing matches.
 */
export function classifyDomain(description, tags, name, task) {
    let tagList = [];
    if (Array.isArray(tags)) {
        tagList = tags;
    }
    else if (typeof tags === "string") {
        tagList = [tags];
    }
    const text = `${description ?? ""} ${name ?? ""} ${tagList.join(" ")} ${task || ""}`.toLowerCase();

    // Find domain with highest score
    let maxScore = 0;
    let bestDomain = "unknown";
    for (const { domain, weight, matchers } of DOMAIN_MATCHERS) {
        const matches = matchers.filter((matches) => matches(text)).length;
        const score = matches * weight;
        if (score > maxScore) {
            maxScore = score;
            bestDomain = domain;
        }
    }

    // If no strong match, check task type for hints
    if (maxScore === 0 && task) {
        const taskLower = String(task).toLowerCase();
        if (taskLower.includes("image") || taskLower.includes("vision") || taskLower.includes("detection")) {
            return "computer-vision";
        }
        if (taskLower.includes("text") || taskLower.includes("language") || taskLower.includes("nlp")) {
            return "nlp";
        }
    }

    // Require minimum score threshold to avoid false positives
    if (maxScore < 5) {
        return "general";
    }
    return bestDomain;
}
|
|
128
|
-
// Domain id -> label shown to users.
const DOMAIN_DISPLAY_NAMES = Object.freeze({
    "medicine": "Medicine",
    "healthcare": "Healthcare",
    "security": "Security",
    "finance": "Finance",
    "education": "Education",
    "science": "Science",
    "technology": "Technology",
    "social": "Social",
    "legal": "Legal",
    "business": "Business",
    "multimedia": "Multimedia",
    "nlp": "Natural Language Processing",
    "computer-vision": "Computer Vision",
    "general": "General",
    "unknown": "Unknown"
});

/**
 * Map a domain id to its display label.
 *
 * @param {string} domain - Domain id such as "nlp" or "computer-vision".
 * @returns {string} The label, or "Unknown" for any id that is not in the table.
 */
export function getDomainDisplayName(domain) {
    // Own-property check: a plain `table[domain] || ...` lookup would return
    // inherited members (e.g. the Object function for "constructor").
    return Object.hasOwn(DOMAIN_DISPLAY_NAMES, domain)
        ? DOMAIN_DISPLAY_NAMES[domain]
        : "Unknown";
}
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
// NOTE(review): resolved against the current working directory, so the script is
// only found when the process is started from the package root — confirm intended.
const PYTHON_SCRIPT_PATH = path.resolve("src", "python", "github_adapter.py");

export class GitHubScraper {
    /**
     * Search GitHub repositories for datasets using the Python adapter
     *
     * Best-effort: every failure (interpreter missing, non-zero exit, adapter
     * reporting `error`, unparsable output) is logged to stderr and yields [].
     *
     * @param {string} query - Search text passed to the adapter via `--query`.
     * @param {number} [limit=10] - Passed to the adapter via `--limit`.
     * @returns {Promise<*>} The adapter's parsed JSON output, or [] on failure.
     */
    async scrape(query, limit = 10) {
        return new Promise((resolve) => {
            const pythonProcess = spawn("python", [
                PYTHON_SCRIPT_PATH,
                "--action", "search",
                "--query", query,
                "--limit", String(limit)
            ]);
            let output = "";
            let errorOutput = "";
            pythonProcess.stdout.on("data", (data) => {
                output += data.toString();
            });
            pythonProcess.stderr.on("data", (data) => {
                errorOutput += data.toString();
            });
            // Without this listener a missing `python` executable raises an
            // unhandled 'error' event and the promise never settles.
            pythonProcess.on("error", (err) => {
                console.error(`[GitHubScraper] Failed to start python: ${err.message}`);
                resolve([]);
            });
            pythonProcess.on("close", (code) => {
                if (code !== 0) {
                    console.error(`[GitHubScraper] Process exited with code ${code}: ${errorOutput}`);
                    resolve([]);
                    return;
                }
                try {
                    const results = JSON.parse(output);
                    if (results.error) {
                        console.error(`[GitHubScraper] Internal error: ${results.error}`);
                        resolve([]);
                    }
                    else {
                        resolve(results);
                    }
                }
                catch (e) {
                    console.error(`[GitHubScraper] JSON parse error: ${e.message}. Output: ${output.substring(0, 100)}...`);
                    resolve([]);
                }
            });
        });
    }
}
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
// Adapter scripts are looked up under src/python relative to the current working directory.
const adapterScript = (fileName) => path.resolve("src", "python", fileName);
const WB_SCRIPT_PATH = adapterScript("worldbank_adapter.py");
const NASA_SCRIPT_PATH = adapterScript("nasa_adapter.py");

/** Dataset search backed by the World Bank Python adapter. */
export class WorldBankScraper {
    /**
     * @param {string} query - Search text forwarded to the adapter.
     * @param {number} [limit=10] - Result limit forwarded to the adapter.
     * @returns {Promise<*>} Whatever `runAdapter` resolves with.
     */
    async scrape(query, limit = 10) {
        const datasets = await runAdapter(WB_SCRIPT_PATH, query, limit);
        return datasets;
    }
}

/** Dataset search backed by the NASA Python adapter. */
export class NASAScraper {
    /**
     * @param {string} query - Search text forwarded to the adapter.
     * @param {number} [limit=10] - Result limit forwarded to the adapter.
     * @returns {Promise<*>} Whatever `runAdapter` resolves with.
     */
    async scrape(query, limit = 10) {
        const datasets = await runAdapter(NASA_SCRIPT_PATH, query, limit);
        return datasets;
    }
}
|
|
15
|
-
/**
 * Run a Python adapter script in "search" mode and return its JSON output.
 *
 * Best-effort: every failure (interpreter missing, non-zero exit, adapter
 * reporting `error`, unparsable output) is logged to stderr and yields [].
 *
 * @param {string} scriptPath - Path of the adapter script to execute.
 * @param {string} query - Search text passed via `--query`.
 * @param {number} limit - Passed via `--limit` (converted to a string).
 * @returns {Promise<*>} The adapter's parsed JSON output, or [] on failure.
 */
async function runAdapter(scriptPath, query, limit) {
    return new Promise((resolve) => {
        const scriptName = path.basename(scriptPath);
        const pythonProcess = spawn("python", [
            scriptPath,
            "--action", "search",
            "--query", query,
            "--limit", String(limit)
        ]);
        let output = "";
        let errorOutput = "";
        pythonProcess.stdout.on("data", (data) => { output += data.toString(); });
        pythonProcess.stderr.on("data", (data) => { errorOutput += data.toString(); });
        // Without this listener a missing `python` executable raises an
        // unhandled 'error' event and the promise never settles.
        pythonProcess.on("error", (err) => {
            console.error(`[Adapter] ${scriptName} failed to start: ${err.message}`);
            resolve([]);
        });
        pythonProcess.on("close", (code) => {
            if (code !== 0) {
                console.error(`[Adapter] ${scriptName} exited with code ${code}: ${errorOutput}`);
                resolve([]);
                return;
            }
            try {
                const results = JSON.parse(output);
                if (results.error) {
                    console.error(`[Adapter] ${scriptName} error: ${results.error}`);
                    resolve([]);
                }
                else {
                    resolve(results);
                }
            }
            catch (e) {
                console.error(`[Adapter] ${scriptName} JSON error: ${e.message}`);
                resolve([]);
            }
        });
    });
}
|