@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
@@ -0,0 +1,64 @@
1
+ import { listFiles } from "@huggingface/hub";
2
+ import { RobustDownloader } from "../utils/downloader.js";
3
/**
 * Locates and downloads dataset files hosted on the HuggingFace Hub.
 */
export class HFDownloader {
    hfToken;
    downloader;
    /**
     * @param token Optional access token. Falls back to the HF_TOKEN and then
     *              the HUGGINGFACE_TOKEN environment variable.
     */
    constructor(token) {
        this.hfToken = token || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
        this.downloader = new RobustDownloader();
    }
    /**
     * Finds the most suitable data file in a repository.
     *
     * Candidates are ranked parquet > csv > jsonl > json, and within parquet/csv
     * names containing "train" and then "data" are preferred. When nothing
     * matches, the first listed file is used.
     *
     * @param repoId Dataset repository name passed to `listFiles`.
     * @returns The relative path within the repo, or null when the repo has no
     *          files or the listing fails (the failure is logged, not thrown).
     */
    async findBestFile(repoId) {
        try {
            const files = [];
            for await (const file of listFiles({
                repo: { type: "dataset", name: repoId },
                recursive: true,
                ...(this.hfToken ? { accessToken: this.hfToken } : {})
            })) {
                if (file.type === "file") {
                    files.push(file.path);
                }
            }
            // Priority logic for data scientists
            const priorities = [
                /train.*\.parquet$/i,
                /data.*\.parquet$/i,
                /.*\.parquet$/i,
                /train.*\.csv$/i,
                /data.*\.csv$/i,
                /.*\.csv$/i,
                /.*\.jsonl$/i,
                /.*\.json$/i
            ];
            for (const pattern of priorities) {
                const match = files.find((f) => pattern.test(f));
                if (match) {
                    return match;
                }
            }
            return files.length > 0 ? files[0] : null;
        }
        catch (error) {
            console.error(`[HF] Failed to list files for ${repoId}:`, error.message);
            return null;
        }
    }
    /**
     * Downloads a file from HF to a local path.
     *
     * @param repoId Dataset repository name (e.g. "owner/name").
     * @param filePath Path of the file inside the repository.
     * @param targetPath Local destination path.
     * @param onProgress Optional callback receiving a rounded percentage; only
     *                   invoked when the total size is known (> 0).
     */
    async download(repoId, filePath, targetPath, onProgress) {
        // Encode each path segment separately: the "/" separators must survive,
        // but spaces, "#", "?" and similar characters would otherwise corrupt the URL.
        const encodedPath = String(filePath)
            .split("/")
            .map((segment) => encodeURIComponent(segment))
            .join("/");
        const url = `https://huggingface.co/datasets/${repoId}/resolve/main/${encodedPath}`;
        await this.downloader.download(url, targetPath, {
            headers: this.hfToken ? { 'Authorization': `Bearer ${this.hfToken}` } : {},
            resume: true,
            onProgress: (bytes, total) => {
                if (total > 0 && onProgress) {
                    onProgress(Math.round((bytes / total) * 100));
                }
            }
        });
    }
}
@@ -0,0 +1,96 @@
1
+ import path from "path";
2
+ import fs from "fs";
3
+ import { HFDownloader } from "./hf-downloader.js";
4
+ import { KaggleDownloader } from "./kaggle-downloader.js";
5
/**
 * Makes sure a dataset's raw file exists on local disk, downloading it from
 * its source when the store holds no usable completed download.
 */
export class DataIngestor {
    projectRoot;
    store;
    rawDataDir;
    hfDownloader;
    kaggleDownloader;
    /**
     * @param projectRoot Root directory; raw files live in `<projectRoot>/data/raw`,
     *                    which is created when missing.
     * @param store Object providing `getDownloadStatus` and `registerDownload`.
     */
    constructor(projectRoot, store) {
        this.projectRoot = projectRoot;
        this.store = store;
        this.rawDataDir = path.join(projectRoot, "data", "raw");
        const rawDirMissing = !fs.existsSync(this.rawDataDir);
        if (rawDirMissing) {
            fs.mkdirSync(this.rawDataDir, { recursive: true });
        }
        this.hfDownloader = new HFDownloader();
        this.kaggleDownloader = new KaggleDownloader();
    }
    /**
     * Returns the local path of a dataset, fetching it first when necessary.
     *
     * @param datasetId Dataset identifier (Kaggle ids may carry a "kaggle:" prefix).
     * @param source Either "huggingface" or "kaggle"; anything else throws.
     * @param onProgress Optional callback called as (message, percent?).
     * @returns Path of the downloaded (or previously downloaded) data file.
     */
    async ensureData(datasetId, source, onProgress) {
        // Step 1: reuse a recorded download when its file is still on disk.
        const record = this.store.getDownloadStatus(datasetId);
        const state = record ? record.status : undefined;
        if (state === "completed" && fs.existsSync(record.local_path)) {
            return record.local_path;
        }
        if (state === "downloading") {
            // No lock is taken here; the download below simply proceeds and
            // relies on the downloader's own resume handling.
            console.log(`[Ingestor] Dataset ${datasetId} status is 'downloading'. Attempting to resume/concurrently monitor.`);
        }
        // Step 2: dispatch on the source.
        switch (source) {
            case "huggingface": {
                onProgress?.("Discovering data files on HuggingFace Hub...");
                const remotePath = await this.hfDownloader.findBestFile(datasetId);
                if (!remotePath) {
                    throw new Error(`No suitable data files found in HuggingFace repo: ${datasetId}`);
                }
                const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
                const targetPath = this.getTargetPath(datasetId, ext);
                const remoteName = path.basename(remotePath);
                this.store.registerDownload(datasetId, targetPath, "downloading");
                try {
                    await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
                        onProgress?.(`Downloading ${remoteName}...`, progress);
                    });
                    const { size } = fs.statSync(targetPath);
                    this.completeDownload(datasetId, targetPath, size);
                    return targetPath;
                }
                catch (e) {
                    this.failDownload(datasetId, e.message);
                    throw e;
                }
            }
            case "kaggle": {
                const slug = datasetId.replace("kaggle:", "");
                const targetDir = path.join(this.rawDataDir, `kaggle_${slug.replace(/\//g, "_")}`);
                this.store.registerDownload(datasetId, targetDir, "downloading");
                try {
                    const primaryFile = await this.kaggleDownloader.download(slug, targetDir, (progress) => {
                        onProgress?.("Downloading Kaggle archive...", progress);
                    });
                    const { size } = fs.statSync(primaryFile);
                    this.completeDownload(datasetId, primaryFile, size);
                    return primaryFile;
                }
                catch (e) {
                    this.failDownload(datasetId, e.message);
                    throw e;
                }
            }
            default:
                throw new Error(`Download logic for ${source} not yet implemented`);
        }
    }
    /**
     * Records a finished download together with its final path and size.
     */
    completeDownload(datasetId, actualPath, sizeBytes) {
        this.store.registerDownload(datasetId, actualPath, 'completed', sizeBytes);
    }
    /**
     * Records a failed download, keeping the previously registered path if any.
     */
    failDownload(datasetId, error) {
        const previous = this.store.getDownloadStatus(datasetId);
        const knownPath = previous?.local_path || "";
        this.store.registerDownload(datasetId, knownPath, 'failed', 0, error);
    }
    /**
     * Builds the local file path for a dataset id: "/" and ":" become "_".
     */
    getTargetPath(datasetId, extension = "csv") {
        const fileStem = datasetId.replace(/[\/:]/g, "_");
        return path.join(this.rawDataDir, `${fileStem}.${extension}`);
    }
}
@@ -0,0 +1,79 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
+ import AdmZip from "adm-zip";
4
+ import { RobustDownloader } from "../utils/downloader.js";
5
/**
 * Downloads Kaggle dataset archives, extracts them and picks the main data file.
 */
export class KaggleDownloader {
    username;
    key;
    downloader;
    /**
     * @param username Kaggle username; falls back to KAGGLE_USERNAME.
     * @param key Kaggle API key; falls back to KAGGLE_KEY.
     */
    constructor(username, key) {
        this.username = username || process.env.KAGGLE_USERNAME || "";
        this.key = key || process.env.KAGGLE_KEY || "";
        this.downloader = new RobustDownloader();
    }
    /**
     * Downloads and extracts a Kaggle dataset.
     *
     * The archive is stored as `<targetDir>/data.zip`, extracted in place and
     * then removed. Files are ranked parquet > csv > jsonl > json.
     *
     * @param repoId Kaggle dataset slug ("owner/name").
     * @param targetDir Directory receiving the extracted files (created if missing).
     * @param onProgress Optional callback receiving a rounded percentage; only
     *                   invoked when the total size is known (> 0).
     * @returns The path to the primary data file.
     * @throws Error when credentials are missing, when the archive cannot be
     *         extracted, or when it holds no supported data file.
     */
    async download(repoId, targetDir, onProgress) {
        if (!this.username || !this.key) {
            throw new Error("Kaggle credentials missing (KAGGLE_USERNAME, KAGGLE_KEY)");
        }
        const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
        const url = `https://www.kaggle.com/api/v1/datasets/download/${repoId}`;
        // Ensure target directory exists
        if (!fs.existsSync(targetDir)) {
            fs.mkdirSync(targetDir, { recursive: true });
        }
        const zipPath = path.join(targetDir, "data.zip");
        // A failure in here intentionally leaves the partial zip in place so the
        // next call can resume it.
        await this.downloader.download(url, zipPath, {
            headers: {
                'Authorization': `Basic ${auth}`
            },
            resume: true,
            onProgress: (bytes, total) => {
                if (total > 0 && onProgress) {
                    onProgress(Math.round((bytes / total) * 100));
                }
            }
        });
        let bestFile = null;
        try {
            // Unzip
            const zip = new AdmZip(zipPath);
            zip.extractAllTo(targetDir, true);
            // Find best file. The archive itself is still in targetDir at this
            // point, but none of the patterns can match a ".zip" name.
            const extractedFiles = this.getAllFiles(targetDir);
            const priorities = [
                /.*\.parquet$/i,
                /.*\.csv$/i,
                /.*\.jsonl$/i,
                /.*\.json$/i
            ];
            for (const pattern of priorities) {
                const match = extractedFiles.find((f) => pattern.test(f));
                if (match) {
                    bestFile = match;
                    break;
                }
            }
        }
        finally {
            // Always drop the archive once the download finished: if extraction
            // failed the zip is unusable, and keeping it would make the next
            // call try to resume onto a broken file.
            if (fs.existsSync(zipPath)) {
                fs.unlinkSync(zipPath);
            }
        }
        if (!bestFile) {
            throw new Error("No suitable data files found in Kaggle archive");
        }
        return bestFile;
    }
    /**
     * Recursively collects the paths of all files below `dir`.
     *
     * @param dir Directory to walk.
     * @param allFiles Accumulator that results are appended to.
     * @returns The accumulator, containing `dir`-prefixed file paths.
     */
    getAllFiles(dir, allFiles = []) {
        for (const entry of fs.readdirSync(dir)) {
            const name = path.join(dir, entry);
            if (fs.statSync(name).isDirectory()) {
                this.getAllFiles(name, allFiles);
            }
            else {
                allFiles.push(name);
            }
        }
        return allFiles;
    }
}
@@ -0,0 +1,41 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
/**
 * Copies prepared dataset files into a per-dataset install directory and
 * records the installed location in the metadata store.
 */
export class InstallService {
    projectRoot;
    metadataStore;
    /**
     * @param projectRoot Base directory under which `datasets/` is created.
     * @param metadataStore Object providing `getDataset` and `updateInstallPath`.
     */
    constructor(projectRoot, metadataStore) {
        this.projectRoot = projectRoot;
        this.metadataStore = metadataStore;
    }
    /**
     * Installs a prepared dataset file.
     *
     * The file is copied to `<installDir>/<sanitized name><original extension>`,
     * where the sanitized name is the dataset name with every character outside
     * a-z / 0-9 replaced by "_" and then lower-cased.
     *
     * @param datasetId The ID of the dataset
     * @param sourcePath The current location of the processed file
     * @param targetDir Optional install directory; defaults to
     *                  `<projectRoot>/datasets/<sanitized name>`
     * @returns The absolute path to the installed file
     * @throws Error when the source file or the dataset metadata is missing
     */
    async install(datasetId, sourcePath, targetDir) {
        const sourceExists = fs.existsSync(sourcePath);
        if (!sourceExists) {
            throw new Error(`Source file not found for installation: ${sourcePath}`);
        }
        const dataset = this.metadataStore.getDataset(datasetId);
        if (!dataset) {
            throw new Error(`Dataset metadata not found for ${datasetId}`);
        }
        // Resolve where the file goes.
        const slug = dataset.name.replace(/[^a-z0-9]/gi, "_").toLowerCase();
        const installDir = targetDir || path.join(this.projectRoot, "datasets", slug);
        const installDirMissing = !fs.existsSync(installDir);
        if (installDirMissing) {
            fs.mkdirSync(installDir, { recursive: true });
        }
        const targetPath = path.join(installDir, `${slug}${path.extname(sourcePath)}`);
        // Copy, then record the absolute location.
        fs.copyFileSync(sourcePath, targetPath);
        const absolutePath = path.resolve(targetPath);
        this.metadataStore.updateInstallPath(datasetId, absolutePath);
        console.log(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
        return absolutePath;
    }
}
@@ -0,0 +1,129 @@
1
+ import { v4 as uuidv4 } from "uuid";
2
+ import { EventEmitter } from "events";
3
+ import { QueueService } from "./queue.js";
4
+ import { ObservabilityService } from "../monitoring/observability.js";
5
/**
 * Coordinates background jobs: persists them through the store, orders them
 * through a QueueService and hands them to listeners via events.
 *
 * Events emitted:
 *  - "jobCreated" (job)
 *  - "processJob" (job, run) where `run(task)` executes `task` and records the outcome
 *  - "jobUpdated" (job)
 */
export class JobManager extends EventEmitter {
    store;
    static instance;
    queue;
    stats;
    maxConcurrency = 2;
    activeWorkers = 0;
    /**
     * @param store Object providing `saveJob(job)` and `getJob(id)`.
     * @param stats Optional metrics sink; a new ObservabilityService is created when omitted.
     */
    constructor(store, stats) {
        super();
        this.store = store;
        this.queue = new QueueService();
        this.stats = stats || new ObservabilityService();
    }
    /**
     * Returns the shared instance, creating it on first use. The arguments are
     * only used by the call that creates the instance.
     */
    static getInstance(store, stats) {
        if (!JobManager.instance) {
            JobManager.instance = new JobManager(store, stats);
        }
        return JobManager.instance;
    }
    /**
     * Set max parallel workers
     */
    setConcurrency(count) {
        this.maxConcurrency = count;
    }
    /**
     * Create a new job and add it to the priority queue
     *
     * @param type Job type label.
     * @param priority Queue priority.
     * @param metadata Optional payload, stored JSON-encoded.
     * @param maxAttempts Retry budget handed to the queue's retry check.
     * @returns The created job record.
     */
    createJob(type, priority = 0, metadata, maxAttempts = 3) {
        const now = new Date().toISOString();
        const job = {
            id: uuidv4(),
            type,
            status: "pending",
            priority,
            progress: 0,
            status_text: "Job created",
            attempts: 0,
            max_attempts: maxAttempts,
            created_at: now,
            updated_at: now,
            metadata: metadata ? JSON.stringify(metadata) : undefined
        };
        this.store.saveJob(job);
        this.queue.enqueue(job);
        this.emit("jobCreated", job);
        // Start processing background queue
        this.processQueue();
        return job;
    }
    /**
     * Pulls queued jobs while worker capacity remains and announces each one
     * through a "processJob" event. The listener receives the job and a `run`
     * callback; calling `run(task)` awaits `task()` and records success, retry
     * or failure.
     */
    async processQueue() {
        if (this.activeWorkers >= this.maxConcurrency)
            return;
        // With no "processJob" listener nobody can ever call the completion
        // callback: the job would sit in "running" forever and its worker slot
        // would never be released. Leave the job queued instead.
        if (this.listenerCount("processJob") === 0)
            return;
        const job = this.queue.dequeue();
        if (!job)
            return;
        this.activeWorkers++;
        this.updateJob(job.id, { status: "running", status_text: "Picked up by worker" });
        const startTime = Date.now();
        // In a real system, we'd have a registry of handlers for each JobType.
        // For now, we emit an event so the orchestrator can run it.
        this.emit("processJob", job, async (task) => {
            try {
                const resultUrl = await task();
                const duration = Date.now() - startTime;
                this.updateJob(job.id, {
                    status: "completed",
                    progress: 100,
                    status_text: "Completed successfully",
                    result_url: resultUrl || undefined
                });
                this.stats.recordJobSuccess(job.type, duration);
            }
            catch (error) {
                // Tasks may throw non-Error values; never report "undefined".
                const message = (error && error.message) || String(error);
                if (this.queue.shouldRetry(job)) {
                    this.updateJob(job.id, {
                        status: "retrying",
                        status_text: `Failed: ${message}. Retrying...`,
                        // shouldRetry bumps the counter on the queued object only;
                        // write it back so the stored record stays in sync.
                        attempts: job.attempts
                    });
                }
                else {
                    this.updateJob(job.id, {
                        status: "failed",
                        status_text: "Max retries exceeded",
                        error: message,
                        attempts: job.attempts
                    });
                    this.stats.recordJobFailure(job.type, error);
                }
            }
            finally {
                this.queue.finalize(job.id);
                this.activeWorkers--;
                this.processQueue(); // Look for next job
            }
        });
        // Try to start another worker if capacity allows
        this.processQueue();
    }
    /**
     * Helper to run a task as a job with automatic status updates.
     *
     * Emits "processJob" with `{ id }` and a callback that invokes `task` with
     * an updater bound to this job id.
     */
    runJob(id, task) {
        this.emit("processJob", { id }, async () => {
            return await task((updates) => this.updateJob(id, updates));
        });
    }
    /**
     * Update job status and progress: merges `updates` into the stored job,
     * refreshes `updated_at`, saves it and emits "jobUpdated". Does nothing
     * when the store has no job with this id.
     */
    updateJob(id, updates) {
        const job = this.store.getJob(id);
        if (!job)
            return;
        const updatedJob = {
            ...job,
            ...updates,
            updated_at: new Date().toISOString()
        };
        this.store.saveJob(updatedJob);
        this.emit("jobUpdated", updatedJob);
    }
}
@@ -0,0 +1,59 @@
1
/**
 * In-memory job queue ordered by priority, plus a registry of the jobs that
 * are currently running.
 */
export class QueueService {
    queue = [];
    running = new Map();
    /**
     * Marks the job as "queued", appends it and re-orders the queue.
     */
    enqueue(job) {
        job.status = "queued";
        this.queue.push(job);
        this.sortQueue();
    }
    /**
     * Removes the job at the head of the queue, marks it "running" and tracks it.
     * @returns The job, or undefined when the queue is empty.
     */
    dequeue() {
        const next = this.queue.shift();
        if (!next) {
            return next;
        }
        next.status = "running";
        this.running.set(next.id, next);
        return next;
    }
    /**
     * Stops tracking a job as running (completed or failed permanently).
     */
    finalize(id) {
        this.running.delete(id);
    }
    /**
     * Decides whether a failed job gets another run. While `attempts` is below
     * `max_attempts` the counter is incremented and the job is put straight
     * back on the queue (no delay or backoff is applied).
     * @returns true when the job was re-enqueued, false otherwise.
     */
    shouldRetry(job) {
        const hasBudget = job.attempts < job.max_attempts;
        if (!hasBudget) {
            return false;
        }
        job.attempts += 1;
        job.status = "retrying";
        // enqueue() immediately overwrites the status with "queued".
        this.enqueue(job);
        return true;
    }
    /**
     * @returns Counts of pending and running jobs and their sum.
     */
    getStats() {
        const pending = this.queue.length;
        const running = this.running.size;
        return { pending, running, total: pending + running };
    }
    /**
     * Orders the queue by priority (high to low), then by created_at (old to new).
     */
    sortQueue() {
        const createdMs = (job) => new Date(job.created_at).getTime();
        this.queue.sort((a, b) => (b.priority !== a.priority
            ? b.priority - a.priority
            : createdMs(a) - createdMs(b)));
    }
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,147 @@
1
/**
 * Keyword table used for domain classification.
 *
 * Written as compact [domain, weight, keywords] rows and expanded into
 * `{ domain, keywords, weight }` records. Each keyword found in a dataset's
 * text contributes `weight` points to its domain.
 */
const DOMAIN_KEYWORDS = [
    ["medicine", 10, [
        "medical", "medicine", "clinical", "diagnosis", "patient", "disease",
        "symptom", "treatment", "drug", "pharmaceutical", "healthcare", "hospital",
        "doctor", "physician", "medical imaging", "radiology", "pathology",
        "oncology", "cardiology",
    ]],
    ["healthcare", 8, [
        "health", "healthcare", "wellness", "public health", "epidemiology",
        "biomedical", "health data", "medical records", "ehr", "electronic health",
    ]],
    ["security", 10, [
        "security", "cybersecurity", "malware", "vulnerability", "threat", "attack",
        "defense", "encryption", "privacy", "authentication", "authorization",
        "penetration testing", "intrusion",
    ]],
    ["finance", 9, [
        "finance", "financial", "banking", "trading", "stock", "market",
        "investment", "credit", "loan", "mortgage", "cryptocurrency", "bitcoin",
        "blockchain", "accounting",
    ]],
    ["education", 8, [
        "education", "learning", "student", "academic", "curriculum", "pedagogy",
        "teaching", "school", "university", "course", "tutorial",
    ]],
    ["science", 7, [
        "scientific", "research", "experiment", "laboratory", "physics",
        "chemistry", "biology", "astronomy", "geology", "environmental",
    ]],
    ["technology", 5, [
        "software", "programming", "code", "algorithm", "system", "application",
        "development", "engineering", "technical",
    ]],
    ["social", 6, [
        "social", "society", "community", "demographic", "census", "population",
        "sociology", "anthropology", "culture",
    ]],
    ["legal", 9, [
        "legal", "law", "court", "judicial", "litigation", "contract",
        "regulation", "compliance", "legislation",
    ]],
    ["business", 6, [
        "business", "corporate", "enterprise", "company", "organization",
        "management", "marketing", "sales", "customer",
    ]],
    ["multimedia", 7, [
        "image", "video", "audio", "multimedia", "visual", "photography", "music",
        "sound",
    ]],
    ["nlp", 8, [
        "natural language", "nlp", "text", "language model", "translation",
        "sentiment", "text classification", "named entity", "ner",
        "question answering", "qa",
    ]],
    ["computer-vision", 8, [
        "computer vision", "cv", "image classification", "object detection",
        "segmentation", "visual", "optical", "camera", "facial recognition",
    ]],
].map(([domain, weight, keywords]) => ({ domain, keywords, weight }));
68
/**
 * Guesses a dataset's domain from its free-text metadata.
 *
 * Every keyword of a DOMAIN_KEYWORDS entry that occurs in the combined,
 * lower-cased text adds that entry's weight to its domain; the highest score
 * wins, with ties going to the domain that is scored first.
 *
 * @param description Dataset description (may be null/undefined).
 * @param tags Array of tag strings; anything that is not an array is ignored.
 * @param name Dataset name (may be null/undefined).
 * @param task Optional task label, also used as a fallback hint.
 * @returns A domain id from DOMAIN_KEYWORDS, "computer-vision"/"nlp" from the
 *          task hint when no keyword matched, or "general" when the best score
 *          is below 5.
 */
export function classifyDomain(description, tags, name, task) {
    // Missing fields contribute nothing rather than the literal "undefined"/"null",
    // and a missing tag list no longer throws.
    const tagText = Array.isArray(tags) ? tags.join(" ") : "";
    const text = `${description ?? ""} ${name ?? ""} ${tagText} ${task || ""}`.toLowerCase();
    // Acronym-sized keywords ("ner", "qa", "cv", "law", "ehr", ...) must match as
    // whole words (an optional plural "s" is allowed); as plain substrings they
    // fire inside unrelated words such as "winner", "qatar" or "flaws". Longer
    // keywords keep substring matching so inflected forms still count.
    const containsKeyword = (keyword) => {
        const needle = keyword.toLowerCase();
        if (needle.length > 3) {
            return text.includes(needle);
        }
        const escaped = needle.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        return new RegExp(`\\b${escaped}s?\\b`).test(text);
    };
    // Score each domain
    const scores = new Map();
    for (const { domain, keywords, weight } of DOMAIN_KEYWORDS) {
        const matches = keywords.filter(containsKeyword).length;
        scores.set(domain, (scores.get(domain) || 0) + matches * weight);
    }
    // Find domain with highest score
    let maxScore = 0;
    let bestDomain = "unknown";
    for (const [domain, score] of scores) {
        if (score > maxScore) {
            maxScore = score;
            bestDomain = domain;
        }
    }
    // If nothing matched, check task type for hints
    if (maxScore === 0 && task) {
        const taskLower = task.toLowerCase();
        if (taskLower.includes("image") || taskLower.includes("vision") || taskLower.includes("detection")) {
            return "computer-vision";
        }
        if (taskLower.includes("text") || taskLower.includes("language") || taskLower.includes("nlp")) {
            return "nlp";
        }
    }
    // Require minimum score threshold to avoid false positives
    if (maxScore < 5) {
        return "general";
    }
    // Medicine and healthcare overlap: pick whichever scored higher, preferring
    // medicine on a tie. (The former 1.5x thresholds selected exactly the same
    // winner in every case, so they are folded into this single comparison.)
    if (bestDomain === "medicine" || bestDomain === "healthcare") {
        const medicineScore = scores.get("medicine") || 0;
        const healthcareScore = scores.get("healthcare") || 0;
        return medicineScore >= healthcareScore ? "medicine" : "healthcare";
    }
    return bestDomain;
}
128
/**
 * Maps a domain id to its human-readable label.
 *
 * @param domain Domain id as produced by domain classification.
 * @returns The display label, or "Unknown" for any id that is not in the table.
 */
export function getDomainDisplayName(domain) {
    const names = {
        "medicine": "Medicine",
        "healthcare": "Healthcare",
        "security": "Security",
        "finance": "Finance",
        "education": "Education",
        "science": "Science",
        "technology": "Technology",
        "social": "Social",
        "legal": "Legal",
        "business": "Business",
        "multimedia": "Multimedia",
        "nlp": "Natural Language Processing",
        "computer-vision": "Computer Vision",
        "general": "General",
        "unknown": "Unknown"
    };
    // Only own keys count: a plain `names[domain]` lookup would hand back
    // inherited members (e.g. for "constructor", "toString" or "__proto__")
    // instead of a label.
    return Object.prototype.hasOwnProperty.call(names, domain) ? names[domain] : "Unknown";
}
@@ -0,0 +1,47 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
// NOTE(review): path.resolve() with relative segments resolves against the
// process working directory, so the adapter is only found when the server is
// started from the package root - confirm this is intended.
const PYTHON_SCRIPT_PATH = path.resolve("src", "python", "github_adapter.py");
/**
 * Searches GitHub for dataset repositories by delegating to the Python adapter.
 */
export class GitHubScraper {
    /**
     * Search GitHub repositories for datasets using the Python adapter.
     *
     * Runs `python <adapter> --action search --query <query> --limit <limit>`
     * and parses its stdout as JSON. This is best-effort: a process that cannot
     * be started, a non-zero exit code, an `error` field in the payload or
     * unparsable output are all logged and produce an empty array.
     *
     * @param query Search query passed to the adapter.
     * @param limit Maximum number of results requested (default 10).
     * @returns The parsed adapter output, or [] on any failure.
     */
    async scrape(query, limit = 10) {
        return new Promise((resolve) => {
            const pythonProcess = spawn("python", [
                PYTHON_SCRIPT_PATH,
                "--action", "search",
                "--query", query,
                "--limit", String(limit)
            ]);
            let output = "";
            let errorOutput = "";
            let settled = false;
            const finish = (value) => {
                if (!settled) {
                    settled = true;
                    resolve(value);
                }
            };
            pythonProcess.stdout.on("data", (data) => {
                output += data.toString();
            });
            pythonProcess.stderr.on("data", (data) => {
                errorOutput += data.toString();
            });
            // Emitted when the process cannot be spawned (e.g. no "python" on
            // PATH). Without a listener this would surface as an unhandled
            // 'error' event and the promise would never settle.
            pythonProcess.on("error", (err) => {
                console.error(`[GitHubScraper] Failed to start Python process: ${err.message}`);
                finish([]);
            });
            pythonProcess.on("close", (code) => {
                // 'close' can still follow an 'error'; that failure was already reported.
                if (settled) {
                    return;
                }
                if (code !== 0) {
                    console.error(`[GitHubScraper] Process exited with code ${code}: ${errorOutput}`);
                    finish([]);
                    return;
                }
                try {
                    const results = JSON.parse(output);
                    if (results.error) {
                        console.error(`[GitHubScraper] Internal error: ${results.error}`);
                        finish([]);
                    }
                    else {
                        finish(results);
                    }
                }
                catch (e) {
                    console.error(`[GitHubScraper] JSON parse error: ${e.message}. Output: ${output.substring(0, 100)}...`);
                    finish([]);
                }
            });
        });
    }
}