@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { listFiles } from "@huggingface/hub";
|
|
2
|
+
import { RobustDownloader } from "../utils/downloader.js";
|
|
3
|
+
export class HFDownloader {
    hfToken;
    downloader;
    /**
     * @param {string} [token] - Optional HF API token; falls back to the
     *   HF_TOKEN / HUGGINGFACE_TOKEN environment variables.
     */
    constructor(token) {
        this.hfToken = token || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
        this.downloader = new RobustDownloader();
    }
    /**
     * Finds the most suitable data file in a repository.
     * Lists every file recursively, then returns the first path matching a
     * priority-ordered pattern list (train/data parquet > any parquet >
     * train/data csv > any csv > jsonl > json), falling back to the first
     * file listed when nothing matches.
     * @param {string} repoId - HF dataset repo id (e.g. "user/name")
     * @returns {Promise<string|null>} relative path within the repo, or null
     *   when the repo is empty or listing fails
     */
    async findBestFile(repoId) {
        // Priority logic for data scientists: columnar formats first.
        const filePriorities = [
            /train.*\.parquet$/i,
            /data.*\.parquet$/i,
            /.*\.parquet$/i,
            /train.*\.csv$/i,
            /data.*\.csv$/i,
            /.*\.csv$/i,
            /.*\.jsonl$/i,
            /.*\.json$/i
        ];
        try {
            const candidates = [];
            const listOptions = {
                repo: { type: "dataset", name: repoId },
                recursive: true,
                // Only attach the token when we actually have one.
                ...(this.hfToken ? { accessToken: this.hfToken } : {})
            };
            for await (const entry of listFiles(listOptions)) {
                if (entry.type === "file") {
                    candidates.push(entry.path);
                }
            }
            for (const pattern of filePriorities) {
                const hit = candidates.find((candidate) => pattern.test(candidate));
                if (hit) {
                    return hit;
                }
            }
            // No preferred extension present; take whatever came first.
            return candidates.length > 0 ? candidates[0] : null;
        }
        catch (error) {
            // Listing failures are non-fatal: callers treat null as "not found".
            console.error(`[HF] Failed to list files for ${repoId}:`, error.message);
            return null;
        }
    }
    /**
     * Downloads a file from HF to a local path, reporting whole-percent
     * progress through the optional callback.
     * @param {string} repoId - HF dataset repo id
     * @param {string} filePath - path within the repo (from findBestFile)
     * @param {string} targetPath - local destination
     * @param {(percent: number) => void} [onProgress]
     */
    async download(repoId, filePath, targetPath, onProgress) {
        const url = `https://huggingface.co/datasets/${repoId}/resolve/main/${filePath}`;
        const headers = this.hfToken ? { 'Authorization': `Bearer ${this.hfToken}` } : {};
        await this.downloader.download(url, targetPath, {
            headers,
            resume: true,
            onProgress: (bytes, total) => {
                // Only report when the server sent a content length.
                if (total > 0 && onProgress) {
                    onProgress(Math.round((bytes / total) * 100));
                }
            }
        });
    }
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import { HFDownloader } from "./hf-downloader.js";
|
|
4
|
+
import { KaggleDownloader } from "./kaggle-downloader.js";
|
|
5
|
+
/**
 * Coordinates downloading raw dataset files from supported sources
 * (HuggingFace, Kaggle) into <projectRoot>/data/raw, recording download
 * state in the metadata store so completed downloads can be reused.
 */
export class DataIngestor {
    // Base directory of the project; raw files live under <root>/data/raw.
    projectRoot;
    // Store providing getDownloadStatus(id) / registerDownload(id, path, status, size?, error?).
    store;
    rawDataDir;
    hfDownloader;
    kaggleDownloader;
    /**
     * Creates the raw-data directory if it does not exist and wires up the
     * source-specific downloaders (credentials come from the environment).
     */
    constructor(projectRoot, store) {
        this.projectRoot = projectRoot;
        this.store = store;
        this.rawDataDir = path.join(this.projectRoot, "data", "raw");
        if (!fs.existsSync(this.rawDataDir)) {
            fs.mkdirSync(this.rawDataDir, { recursive: true });
        }
        this.hfDownloader = new HFDownloader();
        this.kaggleDownloader = new KaggleDownloader();
    }
    /**
     * Ensures a dataset is available locally.
     * Returns immediately with the cached path when a completed download
     * still exists on disk; otherwise triggers a fresh download for the
     * given source.
     * @param datasetId - dataset identifier (Kaggle ids may carry a "kaggle:" prefix)
     * @param source - "huggingface" | "kaggle"; anything else throws
     * @param onProgress - optional (message, percent?) progress callback
     * @returns absolute/local path to the primary data file
     * @throws when no suitable file is found, the download fails, or the
     *   source is unsupported
     */
    async ensureData(datasetId, source, onProgress) {
        // 1. Check database for existing download
        const status = this.store.getDownloadStatus(datasetId);
        if (status && status.status === 'completed' && fs.existsSync(status.local_path)) {
            return status.local_path;
        }
        if (status && status.status === 'downloading') {
            console.log(`[Ingestor] Dataset ${datasetId} status is 'downloading'. Attempting to resume/concurrently monitor.`);
            // In a better system we'd use a lock, but for now we let it resume
            // the RobustDownloader handles the actual file locking/range logic.
        }
        // 2. Trigger source-specific download
        if (source === "huggingface") {
            onProgress?.("Discovering data files on HuggingFace Hub...");
            const remotePath = await this.hfDownloader.findBestFile(datasetId);
            if (!remotePath)
                throw new Error(`No suitable data files found in HuggingFace repo: ${datasetId}`);
            // Derive the local extension from the remote file; default to csv.
            const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
            const targetPath = this.getTargetPath(datasetId, ext);
            // Mark as in-progress before starting so a crash leaves a record.
            this.store.registerDownload(datasetId, targetPath, "downloading");
            try {
                await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
                    onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
                });
                const stats = fs.statSync(targetPath);
                this.completeDownload(datasetId, targetPath, stats.size);
                return targetPath;
            }
            catch (e) {
                // Record the failure, then propagate to the caller.
                this.failDownload(datasetId, e.message);
                throw e;
            }
        }
        else if (source === "kaggle") {
            // Kaggle archives are extracted into their own directory.
            const safeId = datasetId.replace("kaggle:", "").replace(/\//g, "_");
            const targetDir = path.join(this.rawDataDir, `kaggle_${safeId}`);
            this.store.registerDownload(datasetId, targetDir, "downloading");
            try {
                const primaryFile = await this.kaggleDownloader.download(datasetId.replace("kaggle:", ""), targetDir, (progress) => {
                    onProgress?.("Downloading Kaggle archive...", progress);
                });
                const stats = fs.statSync(primaryFile);
                this.completeDownload(datasetId, primaryFile, stats.size);
                return primaryFile;
            }
            catch (e) {
                this.failDownload(datasetId, e.message);
                throw e;
            }
        }
        throw new Error(`Download logic for ${source} not yet implemented`);
    }
    /**
     * Register a successful download
     */
    completeDownload(datasetId, actualPath, sizeBytes) {
        this.store.registerDownload(datasetId, actualPath, 'completed', sizeBytes);
    }
    /**
     * Register a failed download
     */
    failDownload(datasetId, error) {
        // Preserve the previously registered path (if any) alongside the error.
        const existing = this.store.getDownloadStatus(datasetId);
        this.store.registerDownload(datasetId, existing?.local_path || "", 'failed', 0, error);
    }
    /**
     * Generates a safe local filename for a dataset ID
     * ("/" and ":" are replaced with "_").
     */
    getTargetPath(datasetId, extension = "csv") {
        const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
        return path.join(this.rawDataDir, `${safeId}.${extension}`);
    }
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import AdmZip from "adm-zip";
|
|
4
|
+
import { RobustDownloader } from "../utils/downloader.js";
|
|
5
|
+
/**
 * Downloads Kaggle dataset archives using the public Kaggle API and
 * extracts them locally, returning the best candidate data file.
 */
export class KaggleDownloader {
    username;
    key;
    downloader;
    /**
     * @param {string} [username] - Kaggle username; falls back to KAGGLE_USERNAME
     * @param {string} [key] - Kaggle API key; falls back to KAGGLE_KEY
     */
    constructor(username, key) {
        this.username = username || process.env.KAGGLE_USERNAME || "";
        this.key = key || process.env.KAGGLE_KEY || "";
        this.downloader = new RobustDownloader();
    }
    /**
     * Downloads and extracts a Kaggle dataset
     * returns the path to the primary data file (parquet > csv > jsonl > json).
     * @param {string} repoId - "owner/dataset" Kaggle identifier
     * @param {string} targetDir - directory to download and extract into
     * @param {(percent: number) => void} [onProgress]
     * @throws when credentials are missing, extraction fails, or no suitable
     *   data file exists in the archive
     */
    async download(repoId, targetDir, onProgress) {
        if (!this.username || !this.key) {
            throw new Error("Kaggle credentials missing (KAGGLE_USERNAME, KAGGLE_KEY)");
        }
        const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
        const url = `https://www.kaggle.com/api/v1/datasets/download/${repoId}`;
        // Ensure target directory exists
        if (!fs.existsSync(targetDir)) {
            fs.mkdirSync(targetDir, { recursive: true });
        }
        const zipPath = path.join(targetDir, "data.zip");
        await this.downloader.download(url, zipPath, {
            headers: {
                'Authorization': `Basic ${auth}`
            },
            resume: true,
            onProgress: (bytes, total) => {
                if (total > 0 && onProgress) {
                    onProgress(Math.round((bytes / total) * 100));
                }
            }
        });
        let bestFile = null;
        try {
            // Unzip
            const zip = new AdmZip(zipPath);
            zip.extractAllTo(targetDir, true);
            // Find best file among everything extracted (skip the archive itself)
            const extractedFiles = this.getAllFiles(targetDir);
            const priorities = [
                /.*\.parquet$/i,
                /.*\.csv$/i,
                /.*\.jsonl$/i,
                /.*\.json$/i
            ];
            for (const pattern of priorities) {
                const match = extractedFiles.find(f => pattern.test(f) && !f.endsWith(".zip"));
                if (match) {
                    bestFile = match;
                    break;
                }
            }
        }
        finally {
            // Cleanup ZIP. FIX: previously the archive was only removed on the
            // success path, leaking data.zip on disk when extraction or the
            // file scan threw.
            if (fs.existsSync(zipPath)) {
                fs.unlinkSync(zipPath);
            }
        }
        if (!bestFile) {
            throw new Error("No suitable data files found in Kaggle archive");
        }
        return bestFile;
    }
    /**
     * Recursively collects all file paths (not directories) under `dir`.
     * @param {string} dir - directory to walk
     * @param {string[]} [allFiles] - accumulator used during recursion
     * @returns {string[]} paths joined onto `dir`
     */
    getAllFiles(dir, allFiles = []) {
        const files = fs.readdirSync(dir);
        files.forEach(file => {
            const name = path.join(dir, file);
            if (fs.statSync(name).isDirectory()) {
                this.getAllFiles(name, allFiles);
            }
            else {
                allFiles.push(name);
            }
        });
        return allFiles;
    }
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import path from "path";
|
|
3
|
+
/**
 * Copies prepared dataset files into their final install location and
 * records that location in the metadata store.
 */
export class InstallService {
    projectRoot;
    metadataStore;
    constructor(projectRoot, metadataStore) {
        this.projectRoot = projectRoot;
        this.metadataStore = metadataStore;
    }
    /**
     * Installs a prepared dataset file into the ./datasets directory
     * @param datasetId The ID of the dataset
     * @param sourcePath The current location of the processed file
     * @param targetDir Optional override for the install directory
     * @returns The absolute path to the installed file
     * @throws when the source file or the dataset's metadata is missing
     */
    async install(datasetId, sourcePath, targetDir) {
        if (!fs.existsSync(sourcePath)) {
            throw new Error(`Source file not found for installation: ${sourcePath}`);
        }
        const dataset = this.metadataStore.getDataset(datasetId);
        if (!dataset) {
            throw new Error(`Dataset metadata not found for ${datasetId}`);
        }
        // Derive a filesystem-safe directory/file name from the dataset name.
        const sanitizedName = dataset.name.replace(/[^a-z0-9]/gi, "_").toLowerCase();
        const installDir = targetDir || path.join(this.projectRoot, "datasets", sanitizedName);
        // recursive mkdir is a no-op when the directory already exists.
        fs.mkdirSync(installDir, { recursive: true });
        // Keep the source file's extension on the installed copy.
        const targetFilename = `${sanitizedName}${path.extname(sourcePath)}`;
        const targetPath = path.join(installDir, targetFilename);
        fs.copyFileSync(sourcePath, targetPath);
        // Record the absolute install location in the metadata store.
        const absolutePath = path.resolve(targetPath);
        this.metadataStore.updateInstallPath(datasetId, absolutePath);
        console.log(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
        return absolutePath;
    }
}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import { v4 as uuidv4 } from "uuid";
|
|
2
|
+
import { EventEmitter } from "events";
|
|
3
|
+
import { QueueService } from "./queue.js";
|
|
4
|
+
import { ObservabilityService } from "../monitoring/observability.js";
|
|
5
|
+
/**
 * Singleton job orchestrator: persists jobs via the store, schedules them
 * through a priority queue, and dispatches execution by emitting
 * "processJob" events that an external orchestrator handles.
 * Events: "jobCreated", "jobUpdated", "processJob".
 */
export class JobManager extends EventEmitter {
    // Persistence layer providing saveJob(job) / getJob(id).
    store;
    // Process-wide singleton instance (see getInstance).
    static instance;
    queue;
    // Observability sink for success/failure metrics.
    stats;
    // Maximum number of jobs processed concurrently.
    maxConcurrency = 2;
    activeWorkers = 0;
    constructor(store, stats) {
        super();
        this.store = store;
        this.queue = new QueueService();
        this.stats = stats || new ObservabilityService();
    }
    /**
     * Returns the shared JobManager, creating it on first call.
     * NOTE(review): store/stats arguments are ignored after the first call.
     */
    static getInstance(store, stats) {
        if (!JobManager.instance) {
            JobManager.instance = new JobManager(store, stats);
        }
        return JobManager.instance;
    }
    /**
     * Set max parallel workers
     */
    setConcurrency(count) {
        this.maxConcurrency = count;
    }
    /**
     * Create a new job and add it to the priority queue
     * @param type - job type identifier used for dispatch and metrics
     * @param priority - higher values run first (default 0)
     * @param metadata - optional payload, stored JSON-serialized
     * @param maxAttempts - retry budget before the job is marked failed
     * @returns the persisted job record
     */
    createJob(type, priority = 0, metadata, maxAttempts = 3) {
        const now = new Date().toISOString();
        const job = {
            id: uuidv4(),
            type,
            status: "pending",
            priority,
            progress: 0,
            status_text: "Job created",
            attempts: 0,
            max_attempts: maxAttempts,
            created_at: now,
            updated_at: now,
            metadata: metadata ? JSON.stringify(metadata) : undefined
        };
        this.store.saveJob(job);
        this.queue.enqueue(job);
        this.emit("jobCreated", job);
        // Start processing background queue
        this.processQueue();
        return job;
    }
    /**
     * Background loop to process queued jobs
     * Dequeues one job (when under the concurrency cap), emits "processJob"
     * with a completion callback, then recurses to fill remaining capacity.
     */
    async processQueue() {
        if (this.activeWorkers >= this.maxConcurrency)
            return;
        const job = this.queue.dequeue();
        if (!job)
            return;
        this.activeWorkers++;
        this.updateJob(job.id, { status: "running", status_text: "Picked up by worker" });
        const startTime = Date.now();
        // In a real system, we'd have a registry of handlers for each JobType.
        // For now, we emit an event so the orchestrator can run it.
        this.emit("processJob", job, async (task) => {
            try {
                const resultUrl = await task();
                const duration = Date.now() - startTime;
                this.updateJob(job.id, {
                    status: "completed",
                    progress: 100,
                    status_text: "Completed successfully",
                    result_url: resultUrl || undefined
                });
                this.stats.recordJobSuccess(job.type, duration);
            }
            catch (error) {
                // shouldRetry re-enqueues the job when attempts remain.
                if (this.queue.shouldRetry(job)) {
                    this.updateJob(job.id, {
                        status: "retrying",
                        status_text: `Failed: ${error.message}. Retrying...`
                    });
                }
                else {
                    this.updateJob(job.id, {
                        status: "failed",
                        status_text: "Max retries exceeded",
                        error: error.message || String(error)
                    });
                    this.stats.recordJobFailure(job.type, error);
                }
            }
            finally {
                // Release the worker slot and keep draining the queue.
                this.queue.finalize(job.id);
                this.activeWorkers--;
                this.processQueue(); // Look for next job
            }
        });
        // Try to start another worker if capacity allows
        this.processQueue();
    }
    /**
     * Helper to run a task as a job with automatic status updates
     * NOTE(review): bypasses the queue/concurrency accounting — the task is
     * dispatched straight to the "processJob" listener.
     */
    runJob(id, task) {
        this.emit("processJob", { id }, async () => {
            return await task((updates) => this.updateJob(id, updates));
        });
    }
    /**
     * Update job status and progress
     * Merges `updates` into the stored job, refreshes updated_at, persists,
     * and emits "jobUpdated". No-op when the job id is unknown.
     */
    updateJob(id, updates) {
        const job = this.store.getJob(id);
        if (!job)
            return;
        const updatedJob = {
            ...job,
            ...updates,
            updated_at: new Date().toISOString()
        };
        this.store.saveJob(updatedJob);
        this.emit("jobUpdated", updatedJob);
    }
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
export class QueueService {
|
|
2
|
+
queue = [];
|
|
3
|
+
running = new Map();
|
|
4
|
+
/**
|
|
5
|
+
* Adds a job to the queue, sorted by priority.
|
|
6
|
+
*/
|
|
7
|
+
enqueue(job) {
|
|
8
|
+
job.status = "queued";
|
|
9
|
+
this.queue.push(job);
|
|
10
|
+
this.sortQueue();
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* Gets the next job from the queue.
|
|
14
|
+
*/
|
|
15
|
+
dequeue() {
|
|
16
|
+
const job = this.queue.shift();
|
|
17
|
+
if (job) {
|
|
18
|
+
job.status = "running";
|
|
19
|
+
this.running.set(job.id, job);
|
|
20
|
+
}
|
|
21
|
+
return job;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Mark a job as finished (completed or failed permanently).
|
|
25
|
+
*/
|
|
26
|
+
finalize(id) {
|
|
27
|
+
this.running.delete(id);
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Handles retry logic with exponential backoff (simulated).
|
|
31
|
+
*/
|
|
32
|
+
shouldRetry(job) {
|
|
33
|
+
if (job.attempts < job.max_attempts) {
|
|
34
|
+
job.attempts++;
|
|
35
|
+
job.status = "retrying";
|
|
36
|
+
// In a real system, we'd schedule this for later.
|
|
37
|
+
// For now, we'll re-enqueue it immediately.
|
|
38
|
+
this.enqueue(job);
|
|
39
|
+
return true;
|
|
40
|
+
}
|
|
41
|
+
return false;
|
|
42
|
+
}
|
|
43
|
+
getStats() {
|
|
44
|
+
return {
|
|
45
|
+
pending: this.queue.length,
|
|
46
|
+
running: this.running.size,
|
|
47
|
+
total: this.queue.length + this.running.size
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
sortQueue() {
|
|
51
|
+
// Priority high to low, then created_at old to new
|
|
52
|
+
this.queue.sort((a, b) => {
|
|
53
|
+
if (b.priority !== a.priority) {
|
|
54
|
+
return b.priority - a.priority;
|
|
55
|
+
}
|
|
56
|
+
return new Date(a.created_at).getTime() - new Date(b.created_at).getTime();
|
|
57
|
+
});
|
|
58
|
+
}
|
|
59
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Type-declaration module: compiled output intentionally has no runtime exports.
export {};
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
// Weighted keyword table used by classifyDomain. Higher weight means a
// keyword hit in that domain counts more toward the final score.
const DOMAIN_KEYWORDS = [
    {
        domain: "medicine",
        keywords: ["medical", "medicine", "clinical", "diagnosis", "patient", "disease", "symptom", "treatment", "drug", "pharmaceutical", "healthcare", "hospital", "doctor", "physician", "medical imaging", "radiology", "pathology", "oncology", "cardiology"],
        weight: 10
    },
    {
        domain: "healthcare",
        keywords: ["health", "healthcare", "wellness", "public health", "epidemiology", "biomedical", "health data", "medical records", "ehr", "electronic health"],
        weight: 8
    },
    {
        domain: "security",
        keywords: ["security", "cybersecurity", "malware", "vulnerability", "threat", "attack", "defense", "encryption", "privacy", "authentication", "authorization", "penetration testing", "intrusion"],
        weight: 10
    },
    {
        domain: "finance",
        keywords: ["finance", "financial", "banking", "trading", "stock", "market", "investment", "credit", "loan", "mortgage", "cryptocurrency", "bitcoin", "blockchain", "accounting"],
        weight: 9
    },
    {
        domain: "education",
        keywords: ["education", "learning", "student", "academic", "curriculum", "pedagogy", "teaching", "school", "university", "course", "tutorial"],
        weight: 8
    },
    {
        domain: "science",
        keywords: ["scientific", "research", "experiment", "laboratory", "physics", "chemistry", "biology", "astronomy", "geology", "environmental"],
        weight: 7
    },
    {
        domain: "technology",
        keywords: ["software", "programming", "code", "algorithm", "system", "application", "development", "engineering", "technical"],
        weight: 5
    },
    {
        domain: "social",
        keywords: ["social", "society", "community", "demographic", "census", "population", "sociology", "anthropology", "culture"],
        weight: 6
    },
    {
        domain: "legal",
        keywords: ["legal", "law", "court", "judicial", "litigation", "contract", "regulation", "compliance", "legislation"],
        weight: 9
    },
    {
        domain: "business",
        keywords: ["business", "corporate", "enterprise", "company", "organization", "management", "marketing", "sales", "customer"],
        weight: 6
    },
    {
        domain: "multimedia",
        keywords: ["image", "video", "audio", "multimedia", "visual", "photography", "music", "sound"],
        weight: 7
    },
    {
        domain: "nlp",
        keywords: ["natural language", "nlp", "text", "language model", "translation", "sentiment", "text classification", "named entity", "ner", "question answering", "qa"],
        weight: 8
    },
    {
        domain: "computer-vision",
        keywords: ["computer vision", "cv", "image classification", "object detection", "segmentation", "visual", "optical", "camera", "facial recognition"],
        weight: 8
    }
];
/**
 * Classifies a dataset into a domain via weighted substring matching over
 * its description, name, tags and (optionally) task type.
 * @param {string} description
 * @param {string[]} tags
 * @param {string} name
 * @param {string} [task]
 * @returns {string} a domain id from DOMAIN_KEYWORDS, "general" when no
 *   domain scores at least 5, or a task-derived hint when nothing matched
 */
export function classifyDomain(description, tags, name, task) {
    const haystack = `${description} ${name} ${tags.join(" ")} ${task || ""}`.toLowerCase();
    // Score each domain: (number of matching keywords) * (domain weight).
    const scores = new Map(DOMAIN_KEYWORDS.map((entry) => [entry.domain, 0]));
    for (const entry of DOMAIN_KEYWORDS) {
        const matchCount = entry.keywords.filter((keyword) => haystack.includes(keyword.toLowerCase())).length;
        if (matchCount > 0) {
            scores.set(entry.domain, (scores.get(entry.domain) || 0) + matchCount * entry.weight);
        }
    }
    // Find the domain with the highest score.
    let maxScore = 0;
    let bestDomain = "unknown";
    for (const [domain, score] of scores) {
        if (score > maxScore) {
            maxScore = score;
            bestDomain = domain;
        }
    }
    // No keyword hits at all: fall back to hints in the task type.
    if (maxScore === 0 && task) {
        const taskLower = task.toLowerCase();
        if (taskLower.includes("image") || taskLower.includes("vision") || taskLower.includes("detection")) {
            return "computer-vision";
        }
        if (taskLower.includes("text") || taskLower.includes("language") || taskLower.includes("nlp")) {
            return "nlp";
        }
    }
    // Require minimum score threshold to avoid false positives.
    if (maxScore < 5) {
        return "general";
    }
    // medicine and healthcare overlap heavily; pick the clearly dominant one.
    if (bestDomain === "medicine" || bestDomain === "healthcare") {
        const medicineScore = scores.get("medicine") || 0;
        const healthcareScore = scores.get("healthcare") || 0;
        if (medicineScore > healthcareScore * 1.5) {
            return "medicine";
        }
        if (healthcareScore > medicineScore * 1.5) {
            return "healthcare";
        }
        // If both are high, prefer medicine on ties.
        return medicineScore >= healthcareScore ? "medicine" : "healthcare";
    }
    return bestDomain;
}
|
|
128
|
+
/**
 * Maps a domain identifier (as produced by classifyDomain) to its
 * human-readable display label. Unrecognized identifiers yield "Unknown".
 * @param {string} domain
 * @returns {string}
 */
export function getDomainDisplayName(domain) {
    const displayNames = {
        "medicine": "Medicine",
        "healthcare": "Healthcare",
        "security": "Security",
        "finance": "Finance",
        "education": "Education",
        "science": "Science",
        "technology": "Technology",
        "social": "Social",
        "legal": "Legal",
        "business": "Business",
        "multimedia": "Multimedia",
        "nlp": "Natural Language Processing",
        "computer-vision": "Computer Vision",
        "general": "General",
        "unknown": "Unknown"
    };
    const label = displayNames[domain];
    return label || "Unknown";
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import path from "path";
|
|
3
|
+
const PYTHON_SCRIPT_PATH = path.resolve("src", "python", "github_adapter.py");
|
|
4
|
+
export class GitHubScraper {
    /**
     * Search GitHub repositories for datasets using the Python adapter.
     * Resolves to [] on any failure (non-zero exit, adapter-reported
     * error object, or unparseable stdout) — it never rejects.
     * @param {string} query - search query forwarded to the adapter
     * @param {number} [limit=10] - maximum number of results
     * @returns {Promise<object[]>}
     */
    async scrape(query, limit = 10) {
        return new Promise((resolve, reject) => {
            const args = [
                PYTHON_SCRIPT_PATH,
                "--action", "search",
                "--query", query,
                "--limit", String(limit)
            ];
            const child = spawn("python", args);
            let stdoutBuf = "";
            let stderrBuf = "";
            child.stdout.on("data", (chunk) => {
                stdoutBuf += chunk.toString();
            });
            child.stderr.on("data", (chunk) => {
                stderrBuf += chunk.toString();
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    console.error(`[GitHubScraper] Process exited with code ${code}: ${stderrBuf}`);
                    resolve([]);
                    return;
                }
                try {
                    const results = JSON.parse(stdoutBuf);
                    if (results.error) {
                        // The adapter signals failures via an {error} payload.
                        console.error(`[GitHubScraper] Internal error: ${results.error}`);
                        resolve([]);
                    }
                    else {
                        resolve(results);
                    }
                }
                catch (e) {
                    console.error(`[GitHubScraper] JSON parse error: ${e.message}. Output: ${stdoutBuf.substring(0, 100)}...`);
                    resolve([]);
                }
            });
        });
    }
}
|