vesper-wizard 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/row_count.py +0 -54
  203. package/src/python/splitter_engine.py +0 -283
  204. package/src/python/target_engine.py +0 -154
  205. package/src/python/test_framework_adapters.py +0 -61
  206. package/src/python/test_fusion_engine.py +0 -89
  207. package/src/python/uci_adapter.py +0 -94
  208. package/src/python/vesper/__init__.py +0 -1
  209. package/src/python/vesper/core/__init__.py +0 -1
  210. package/src/python/vesper/core/asset_downloader.py +0 -679
  211. package/src/python/vesper/core/download_recipe.py +0 -104
  212. package/src/python/worldbank_adapter.py +0 -99
  213. package/wizard.cjs +0 -3
@@ -1,136 +0,0 @@
1
- import { v4 as uuidv4 } from "uuid";
2
- import { EventEmitter } from "events";
3
- import { QueueService } from "./queue.js";
4
- import { ObservabilityService } from "../monitoring/observability.js";
5
/**
 * Coordinates background jobs: persists them through `store`, orders them in a
 * QueueService and hands each one to "processJob" listeners while keeping at
 * most `maxConcurrency` jobs in flight.
 *
 * Events emitted:
 *  - "jobCreated" (job)
 *  - "processJob" (job, run) where `run(task)` must be called with an async
 *    function that performs the work and resolves to an optional result URL
 *  - "jobUpdated" (job)
 */
export class JobManager extends EventEmitter {
    store;
    static instance;
    queue;
    stats;
    maxConcurrency = 2;
    activeWorkers = 0;
    /**
     * @param store Object exposing `saveJob(job)` and `getJob(id)`.
     * @param stats Optional metrics sink; a new ObservabilityService is used when omitted.
     */
    constructor(store, stats) {
        super();
        this.store = store;
        this.queue = new QueueService();
        this.stats = stats || new ObservabilityService();
    }
    /**
     * Return the shared instance, creating it on first use. The arguments are
     * only honoured by the call that creates the instance.
     */
    static getInstance(store, stats) {
        if (!JobManager.instance) {
            JobManager.instance = new JobManager(store, stats);
        }
        return JobManager.instance;
    }
    /**
     * Set max parallel workers
     */
    setConcurrency(count) {
        this.maxConcurrency = count;
    }
    /**
     * Create a new job, persist it, add it to the priority queue and kick the
     * processing loop.
     *
     * @param type Job type identifier.
     * @param priority Higher values are dequeued first (default 0).
     * @param metadata Optional value stored as a JSON string on the job.
     * @param maxAttempts Retry budget handed to the queue (default 3).
     * @returns The job record that was created.
     */
    createJob(type, priority = 0, metadata, maxAttempts = 3) {
        const now = new Date().toISOString();
        const job = {
            id: uuidv4(),
            type,
            status: "pending",
            priority,
            progress: 0,
            status_text: "Job created",
            attempts: 0,
            max_attempts: maxAttempts,
            created_at: now,
            updated_at: now,
            metadata: metadata ? JSON.stringify(metadata) : undefined
        };
        this.store.saveJob(job);
        this.queue.enqueue(job);
        this.emit("jobCreated", job);
        // Start processing background queue
        this.processQueue();
        return job;
    }
    /**
     * Take the next queued job (if a worker slot is free) and emit "processJob"
     * so a listener can supply the actual work.
     */
    async processQueue() {
        if (this.activeWorkers >= this.maxConcurrency)
            return;
        const job = this.queue.dequeue();
        if (!job)
            return;
        this.activeWorkers++;
        this.updateJob(job.id, { status: "running", status_text: "Picked up by worker" });
        const startTime = Date.now();
        const listeners = this.listenerCount("processJob");
        console.error(`[JobManager] Emitting processJob for ${job.id}. Active listeners: ${listeners}`);
        // In a real system, we'd have a registry of handlers for each JobType.
        // For now, we emit an event so the orchestrator can run it.
        this.emit("processJob", job, async (jobExecutionTask) => {
            console.error(`[JobManager] Wrapper received jobExecutionTask: ${typeof jobExecutionTask}`);
            try {
                if (typeof jobExecutionTask !== 'function') {
                    console.error(`[JobManager] Error: jobExecutionTask is NOT a function! It is: ${typeof jobExecutionTask}`);
                    this.updateJob(job.id, {
                        status: "failed",
                        status_text: "Internal error: jobExecutionTask is not a function"
                    });
                    // Returning from inside `try` still runs `finally`, so the
                    // worker slot is released instead of leaking.
                    return;
                }
                const resultUrl = await jobExecutionTask();
                const duration = Date.now() - startTime;
                this.updateJob(job.id, {
                    status: "completed",
                    progress: 100,
                    status_text: "Completed successfully",
                    result_url: resultUrl || undefined
                });
                this.stats.recordJobSuccess(job.type, duration);
            }
            catch (error) {
                // Tasks may reject with non-Error values (strings, undefined).
                const message = error?.message || String(error);
                if (this.queue.shouldRetry(job)) {
                    this.updateJob(job.id, {
                        status: "retrying",
                        status_text: `Failed: ${message}. Retrying...`,
                        // shouldRetry bumps the in-memory counter; persist it too.
                        attempts: job.attempts
                    });
                }
                else {
                    this.updateJob(job.id, {
                        status: "failed",
                        status_text: "Max retries exceeded",
                        error: message
                    });
                    this.stats.recordJobFailure(job.type, error);
                }
            }
            finally {
                this.queue.finalize(job.id);
                this.activeWorkers--;
                this.processQueue(); // Look for next job
            }
        });
        // Try to start another worker if capacity allows
        this.processQueue();
    }
    /**
     * Merge `updates` into the stored job, refresh `updated_at`, persist the
     * result and emit "jobUpdated". Does nothing when the job is not in the store.
     */
    updateJob(id, updates) {
        const job = this.store.getJob(id);
        if (!job)
            return;
        // Work on a copy so the caller's object is not rewritten in place.
        const changes = { ...updates };
        // Metadata is stored as a JSON string; serialise object values.
        if (changes.metadata && typeof changes.metadata !== 'string') {
            changes.metadata = JSON.stringify(changes.metadata);
        }
        const updatedJob = {
            ...job,
            ...changes,
            updated_at: new Date().toISOString()
        };
        this.store.saveJob(updatedJob);
        this.emit("jobUpdated", updatedJob);
    }
}
@@ -1,59 +0,0 @@
1
/**
 * In-memory job queue. Waiting jobs live in `queue` (kept ordered by priority,
 * then by creation time); jobs handed out by `dequeue` are tracked in `running`
 * until `finalize` is called for them.
 */
export class QueueService {
    queue = [];
    running = new Map();
    /**
     * Mark the job as "queued", append it and re-sort the waiting list.
     */
    enqueue(job) {
        job.status = "queued";
        this.queue.push(job);
        this.sortQueue();
    }
    /**
     * Remove and return the job at the head of the queue, flagging it as
     * "running". Returns undefined when nothing is waiting.
     */
    dequeue() {
        const next = this.queue.shift();
        if (!next) {
            return next;
        }
        next.status = "running";
        this.running.set(next.id, next);
        return next;
    }
    /**
     * Stop tracking a job as running (it completed or failed permanently).
     */
    finalize(id) {
        this.running.delete(id);
    }
    /**
     * Decide whether a failed job gets another attempt. When it does, the
     * attempt counter is incremented and the job is put straight back on the
     * queue (no delay is applied here).
     *
     * @returns true if the job was re-enqueued, false if its budget is spent.
     */
    shouldRetry(job) {
        const hasBudget = job.attempts < job.max_attempts;
        if (!hasBudget) {
            return false;
        }
        job.attempts++;
        job.status = "retrying";
        // enqueue() overwrites the status with "queued" again.
        this.enqueue(job);
        return true;
    }
    /**
     * Snapshot of how many jobs are waiting, running and in total.
     */
    getStats() {
        const pending = this.queue.length;
        const running = this.running.size;
        return { pending, running, total: pending + running };
    }
    /**
     * Order the waiting list: higher priority first, older `created_at` first
     * among equal priorities.
     */
    sortQueue() {
        const createdAtMs = (entry) => new Date(entry.created_at).getTime();
        const compareJobs = (left, right) => {
            if (right.priority !== left.priority) {
                return right.priority - left.priority;
            }
            return createdAtMs(left) - createdAtMs(right);
        };
        this.queue.sort(compareJobs);
    }
}
@@ -1 +0,0 @@
1
- export {};
@@ -1,3 +0,0 @@
1
- import { createClient } from '@supabase/supabase-js';
2
// Shared Supabase client, configured from the environment.
// For MCP, use the service_role key, not the anon key.
const supabaseUrl = process.env.SUPABASE_URL;
const supabaseServiceRoleKey = process.env.SUPABASE_SERVICE_ROLE_KEY;
export const supabase = createClient(supabaseUrl, supabaseServiceRoleKey);
@@ -1,89 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
- import fs from "fs";
4
- import os from "os";
5
/**
 * Thin bridge to the Python `dataworld_engine.py` script: runs it as a child
 * process and parses the JSON document it prints on stdout.
 */
export class DataWorldSource {
    pythonPath = "python";
    scriptPath;
    /**
     * Locate the engine script and a Python interpreter.
     *
     * Script lookup order: `<home>/.vesper/python`, `<buildDir>/python`,
     * `<buildDir>/../src/python`; if none exists the first candidate is kept.
     * A virtualenv interpreter under `<buildDir>/.venv` is preferred when
     * present; otherwise "py" is used on Windows and "python" elsewhere.
     *
     * @param buildDir Base directory for the lookups (defaults to the cwd).
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "dataworld_engine.py"),
            path.resolve(buildDir, "python", "dataworld_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "dataworld_engine.py")
        ];
        this.scriptPath = candidates.find((candidate) => fs.existsSync(candidate)) ?? candidates[0];
        const isWindows = process.platform === "win32";
        const venvPy = isWindows
            ? path.resolve(buildDir, ".venv", "Scripts", "python.exe")
            : path.resolve(buildDir, ".venv", "bin", "python");
        if (fs.existsSync(venvPy)) {
            this.pythonPath = venvPy;
        }
        else if (isWindows) {
            this.pythonPath = "py";
        }
    }
    /**
     * Run the engine's "discover" command.
     *
     * @returns The `results` array from the engine (empty array if absent).
     * @throws Error when the engine reports `ok` as falsy or cannot be run.
     */
    async discover(query, limit = 20) {
        const result = await this.run(["discover", query, String(limit)]);
        if (!result.ok) {
            throw new Error(result.error || "data.world discover failed");
        }
        return (result.results || []);
    }
    /**
     * Run the engine's "download" command.
     *
     * @param datasetRef Dataset reference passed through to the engine.
     * @param targetDir Optional destination directory, forwarded when truthy.
     * @returns `{ local_path, target_dir }` as reported by the engine.
     * @throws Error when the engine reports `ok` as falsy or cannot be run.
     */
    async download(datasetRef, targetDir) {
        const args = ["download", datasetRef];
        if (targetDir)
            args.push(targetDir);
        const result = await this.run(args);
        if (!result.ok) {
            throw new Error(result.error || "data.world download failed");
        }
        return {
            local_path: result.local_path,
            target_dir: result.target_dir,
        };
    }
    /**
     * Spawn the engine with `args` and resolve with its parsed JSON output.
     * Rejects when the process cannot be started, exits non-zero, or prints
     * something that is not valid JSON.
     */
    run(args) {
        return new Promise((resolve, reject) => {
            const proc = spawn(this.pythonPath, [this.scriptPath, ...args], {
                env: process.env
            });
            let stdout = "";
            let stderr = "";
            proc.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            proc.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // Without this listener a missing interpreter raises an unhandled
            // 'error' event and the promise would never settle.
            proc.on("error", (err) => {
                reject(new Error(`Failed to start data.world engine: ${err.message}`, { cause: err }));
            });
            proc.on("close", (code) => {
                if (code !== 0) {
                    return reject(new Error(`data.world engine exited with code ${code}: ${stderr}`));
                }
                try {
                    resolve(JSON.parse(stdout.trim()));
                }
                catch (e) {
                    reject(new Error(`Failed to parse data.world engine output: ${stdout}`, { cause: e }));
                }
            });
        });
    }
}
@@ -1,147 +0,0 @@
1
// Keyword table used by classifyDomain. Each keyword that occurs in the
// dataset text adds `weight` to its domain's score.
const DOMAIN_KEYWORDS = [
    {
        domain: "medicine",
        keywords: ["medical", "medicine", "clinical", "diagnosis", "patient", "disease", "symptom", "treatment", "drug", "pharmaceutical", "healthcare", "hospital", "doctor", "physician", "medical imaging", "radiology", "pathology", "oncology", "cardiology"],
        weight: 10
    },
    {
        domain: "healthcare",
        keywords: ["health", "healthcare", "wellness", "public health", "epidemiology", "biomedical", "health data", "medical records", "ehr", "electronic health"],
        weight: 8
    },
    {
        domain: "security",
        keywords: ["security", "cybersecurity", "malware", "vulnerability", "threat", "attack", "defense", "encryption", "privacy", "authentication", "authorization", "penetration testing", "intrusion"],
        weight: 10
    },
    {
        domain: "finance",
        keywords: ["finance", "financial", "banking", "trading", "stock", "market", "investment", "credit", "loan", "mortgage", "cryptocurrency", "bitcoin", "blockchain", "accounting"],
        weight: 9
    },
    {
        domain: "education",
        keywords: ["education", "learning", "student", "academic", "curriculum", "pedagogy", "teaching", "school", "university", "course", "tutorial"],
        weight: 8
    },
    {
        domain: "science",
        keywords: ["scientific", "research", "experiment", "laboratory", "physics", "chemistry", "biology", "astronomy", "geology", "environmental"],
        weight: 7
    },
    {
        domain: "technology",
        keywords: ["software", "programming", "code", "algorithm", "system", "application", "development", "engineering", "technical"],
        weight: 5
    },
    {
        domain: "social",
        keywords: ["social", "society", "community", "demographic", "census", "population", "sociology", "anthropology", "culture"],
        weight: 6
    },
    {
        domain: "legal",
        keywords: ["legal", "law", "court", "judicial", "litigation", "contract", "regulation", "compliance", "legislation"],
        weight: 9
    },
    {
        domain: "business",
        keywords: ["business", "corporate", "enterprise", "company", "organization", "management", "marketing", "sales", "customer"],
        weight: 6
    },
    {
        domain: "multimedia",
        keywords: ["image", "video", "audio", "multimedia", "visual", "photography", "music", "sound"],
        weight: 7
    },
    {
        domain: "nlp",
        keywords: ["natural language", "nlp", "text", "language model", "translation", "sentiment", "text classification", "named entity", "ner", "question answering", "qa"],
        weight: 8
    },
    {
        domain: "computer-vision",
        keywords: ["computer vision", "cv", "image classification", "object detection", "segmentation", "visual", "optical", "camera", "facial recognition"],
        weight: 8
    }
];
// Keywords this short are acronyms ("ner", "qa", "cv", "law", "ehr") that turn
// up inside unrelated words ("energy", "flaw"), so they must match a whole
// word (an optional plural "s" is tolerated).
const WHOLE_WORD_KEYWORD_MAX_LENGTH = 3;
/**
 * Test whether `keyword` occurs in the already lower-cased `text`: plain
 * substring match for ordinary keywords, whole-word match for short ones.
 */
function domainKeywordMatches(text, keyword) {
    const needle = keyword.toLowerCase();
    if (needle.length > WHOLE_WORD_KEYWORD_MAX_LENGTH) {
        return text.includes(needle);
    }
    const escaped = needle.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    return new RegExp(`\\b${escaped}s?\\b`).test(text);
}
/**
 * Guess the subject domain of a dataset from its textual metadata.
 *
 * @param description Free-text description (nullish is treated as empty).
 * @param tags Array of tag strings (a single string or nullish is accepted).
 * @param name Dataset name (nullish is treated as empty).
 * @param task Optional task label; also used as a fallback hint when no
 *             keyword matches at all.
 * @returns A domain id from DOMAIN_KEYWORDS, or "general" when nothing scores
 *          at least 5.
 */
export function classifyDomain(description, tags, name, task) {
    let tagText = "";
    if (Array.isArray(tags)) {
        tagText = tags.join(" ");
    }
    else if (typeof tags === "string") {
        tagText = tags;
    }
    // Nullish fields become empty strings so "undefined"/"null" never end up
    // in the text being scored.
    const text = `${description ?? ""} ${name ?? ""} ${tagText} ${task || ""}`.toLowerCase();
    // Score each domain: number of distinct matching keywords times its weight.
    const scores = new Map();
    for (const { domain, keywords, weight } of DOMAIN_KEYWORDS) {
        const matches = keywords.filter((keyword) => domainKeywordMatches(text, keyword)).length;
        scores.set(domain, (scores.get(domain) || 0) + matches * weight);
    }
    // Find domain with highest score; on ties the earlier table entry wins.
    let maxScore = 0;
    let bestDomain = "unknown";
    for (const [domain, score] of scores) {
        if (score > maxScore) {
            maxScore = score;
            bestDomain = domain;
        }
    }
    // If nothing matched, check task type for hints
    if (maxScore === 0 && task) {
        const taskLower = String(task).toLowerCase();
        if (taskLower.includes("image") || taskLower.includes("vision") || taskLower.includes("detection")) {
            return "computer-vision";
        }
        if (taskLower.includes("text") || taskLower.includes("language") || taskLower.includes("nlp")) {
            return "nlp";
        }
    }
    // Require minimum score threshold to avoid false positives
    if (maxScore < 5) {
        return "general";
    }
    // Medicine and healthcare overlap; when one of them wins, the higher score
    // is returned and medicine is preferred on a tie. (The former 1.5x ratio
    // branches always reduced to exactly this comparison.)
    if (bestDomain === "medicine" || bestDomain === "healthcare") {
        const medicineScore = scores.get("medicine") || 0;
        const healthcareScore = scores.get("healthcare") || 0;
        return medicineScore >= healthcareScore ? "medicine" : "healthcare";
    }
    return bestDomain;
}
128
// Display labels keyed by domain id. A Map is used so that only the listed
// ids resolve; a plain object would also answer for inherited members such as
// "constructor" or "__proto__".
const DOMAIN_DISPLAY_NAMES = new Map([
    ["medicine", "Medicine"],
    ["healthcare", "Healthcare"],
    ["security", "Security"],
    ["finance", "Finance"],
    ["education", "Education"],
    ["science", "Science"],
    ["technology", "Technology"],
    ["social", "Social"],
    ["legal", "Legal"],
    ["business", "Business"],
    ["multimedia", "Multimedia"],
    ["nlp", "Natural Language Processing"],
    ["computer-vision", "Computer Vision"],
    ["general", "General"],
    ["unknown", "Unknown"]
]);
/**
 * Map a domain id to its human-readable label.
 *
 * @param domain Domain id such as "nlp" or "computer-vision".
 * @returns The label, or "Unknown" for any id that is not listed.
 */
export function getDomainDisplayName(domain) {
    return DOMAIN_DISPLAY_NAMES.get(domain) ?? "Unknown";
}
@@ -1,47 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
// Adapter script location, resolved against the current working directory.
const PYTHON_SCRIPT_PATH = path.resolve("src", "python", "github_adapter.py");
/**
 * Searches GitHub for dataset repositories by delegating to the Python
 * adapter script.
 */
export class GitHubScraper {
    /**
     * Search GitHub repositories for datasets using the Python adapter.
     *
     * Best-effort: every failure (interpreter missing, non-zero exit, adapter
     * error payload, unparsable output) is logged to stderr and yields `[]`,
     * so the returned promise never rejects.
     *
     * @param query Search text passed to the adapter via `--query`.
     * @param limit Maximum number of results requested (default 10).
     * @returns The parsed adapter output, or an empty array on failure.
     */
    async scrape(query, limit = 10) {
        return new Promise((resolve) => {
            const pythonProcess = spawn("python", [
                PYTHON_SCRIPT_PATH,
                "--action", "search",
                "--query", query,
                "--limit", String(limit)
            ]);
            let output = "";
            let errorOutput = "";
            pythonProcess.stdout.on("data", (data) => {
                output += data.toString();
            });
            pythonProcess.stderr.on("data", (data) => {
                errorOutput += data.toString();
            });
            // A missing interpreter surfaces as an 'error' event; without a
            // listener it would be thrown as an uncaught exception.
            pythonProcess.on("error", (err) => {
                console.error(`[GitHubScraper] Failed to start python: ${err.message}`);
                resolve([]);
            });
            pythonProcess.on("close", (code) => {
                if (code !== 0) {
                    console.error(`[GitHubScraper] Process exited with code ${code}: ${errorOutput}`);
                    resolve([]);
                    return;
                }
                try {
                    const results = JSON.parse(output);
                    if (results.error) {
                        console.error(`[GitHubScraper] Internal error: ${results.error}`);
                        resolve([]);
                    }
                    else {
                        resolve(results);
                    }
                }
                catch (e) {
                    console.error(`[GitHubScraper] JSON parse error: ${e.message}. Output: ${output.substring(0, 100)}...`);
                    resolve([]);
                }
            });
        });
    }
}
@@ -1,49 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
// Python adapter scripts, resolved against the current working directory.
const PYTHON_ADAPTER_DIR = ["src", "python"];
const WB_SCRIPT_PATH = path.resolve(...PYTHON_ADAPTER_DIR, "worldbank_adapter.py");
const NASA_SCRIPT_PATH = path.resolve(...PYTHON_ADAPTER_DIR, "nasa_adapter.py");
/**
 * Dataset search backed by the World Bank Python adapter.
 */
export class WorldBankScraper {
    /**
     * @param query Search text.
     * @param limit Maximum number of results requested (default 10).
     * @returns Whatever runAdapter resolves with for the World Bank script.
     */
    async scrape(query, limit = 10) {
        const datasets = await runAdapter(WB_SCRIPT_PATH, query, limit);
        return datasets;
    }
}
/**
 * Dataset search backed by the NASA Python adapter.
 */
export class NASAScraper {
    /**
     * @param query Search text.
     * @param limit Maximum number of results requested (default 10).
     * @returns Whatever runAdapter resolves with for the NASA script.
     */
    async scrape(query, limit = 10) {
        const datasets = await runAdapter(NASA_SCRIPT_PATH, query, limit);
        return datasets;
    }
}
15
- async function runAdapter(scriptPath, query, limit) {
16
- return new Promise((resolve) => {
17
- const pythonProcess = spawn("python", [
18
- scriptPath,
19
- "--action", "search",
20
- "--query", query,
21
- "--limit", String(limit)
22
- ]);
23
- let output = "";
24
- let errorOutput = "";
25
- pythonProcess.stdout.on("data", (data) => { output += data.toString(); });
26
- pythonProcess.stderr.on("data", (data) => { errorOutput += data.toString(); });
27
- pythonProcess.on("close", (code) => {
28
- if (code !== 0) {
29
- console.error(`[Adapter] ${path.basename(scriptPath)} exited with code ${code}: ${errorOutput}`);
30
- resolve([]);
31
- return;
32
- }
33
- try {
34
- const results = JSON.parse(output);
35
- if (results.error) {
36
- console.error(`[Adapter] ${path.basename(scriptPath)} error: ${results.error}`);
37
- resolve([]);
38
- }
39
- else {
40
- resolve(results);
41
- }
42
- }
43
- catch (e) {
44
- console.error(`[Adapter] ${path.basename(scriptPath)} JSON error: ${e.message}`);
45
- resolve([]);
46
- }
47
- });
48
- });
49
- }