vesper-wizard 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,104 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import re
|
|
5
|
-
from dataclasses import dataclass, asdict
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import Any, Dict, Optional
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
DEFAULT_RECIPES_DIR = Path.home() / ".vesper" / "recipes"
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
@dataclass
|
|
14
|
-
class DownloadRecipe:
|
|
15
|
-
dataset_id: str
|
|
16
|
-
source: str
|
|
17
|
-
repo_id: str
|
|
18
|
-
image_column: Optional[str]
|
|
19
|
-
download_method: str
|
|
20
|
-
requires_auth: bool
|
|
21
|
-
estimated_asset_size_gb: float
|
|
22
|
-
total_images: int
|
|
23
|
-
fallback_strategy: list[str]
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def _safe_name(value: str) -> str:
|
|
27
|
-
return re.sub(r"[^a-zA-Z0-9._-]+", "_", value).strip("_") or "dataset"
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def build_download_recipe(dataset_info: Dict[str, Any]) -> Dict[str, Any]:
|
|
31
|
-
dataset_id = str(dataset_info.get("dataset_id") or dataset_info.get("id") or "")
|
|
32
|
-
source = str(dataset_info.get("source") or "unknown").lower()
|
|
33
|
-
repo_id = str(dataset_info.get("repo_id") or dataset_info.get("id") or dataset_id)
|
|
34
|
-
|
|
35
|
-
image_column = dataset_info.get("image_column")
|
|
36
|
-
if not image_column:
|
|
37
|
-
features = dataset_info.get("features") or {}
|
|
38
|
-
if isinstance(features, dict):
|
|
39
|
-
for key in features.keys():
|
|
40
|
-
lower = str(key).lower()
|
|
41
|
-
if lower in {"image", "images", "img", "img_path", "image_url", "url"}:
|
|
42
|
-
image_column = key
|
|
43
|
-
break
|
|
44
|
-
|
|
45
|
-
download_method = "url_list"
|
|
46
|
-
if source == "huggingface":
|
|
47
|
-
download_method = "hf_dataset_image_feature"
|
|
48
|
-
elif source == "kaggle":
|
|
49
|
-
download_method = "kaggle_archive"
|
|
50
|
-
elif source in {"dataworld", "openml"}:
|
|
51
|
-
download_method = "direct_file_scan"
|
|
52
|
-
|
|
53
|
-
requires_auth = bool(dataset_info.get("requires_auth", source in {"kaggle", "dataworld"}))
|
|
54
|
-
|
|
55
|
-
total_images = int(dataset_info.get("total_images") or dataset_info.get("total_examples") or 0)
|
|
56
|
-
if total_images <= 0:
|
|
57
|
-
total_images = 1000
|
|
58
|
-
|
|
59
|
-
estimated_asset_size_gb = round(float(dataset_info.get("estimated_asset_size_gb") or (total_images * 0.0004)), 3)
|
|
60
|
-
|
|
61
|
-
fallback_strategy = dataset_info.get("fallback_strategy") or [
|
|
62
|
-
"scan_archive_for_images",
|
|
63
|
-
"extract_url_column_and_download",
|
|
64
|
-
"export_metadata_only_with_actionable_error",
|
|
65
|
-
]
|
|
66
|
-
|
|
67
|
-
recipe = DownloadRecipe(
|
|
68
|
-
dataset_id=dataset_id or repo_id,
|
|
69
|
-
source=source,
|
|
70
|
-
repo_id=repo_id,
|
|
71
|
-
image_column=image_column,
|
|
72
|
-
download_method=download_method,
|
|
73
|
-
requires_auth=requires_auth,
|
|
74
|
-
estimated_asset_size_gb=estimated_asset_size_gb,
|
|
75
|
-
total_images=total_images,
|
|
76
|
-
fallback_strategy=list(fallback_strategy),
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
return asdict(recipe)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def save_recipe(recipe: Dict[str, Any], recipes_dir: Optional[str] = None) -> str:
|
|
83
|
-
root = Path(recipes_dir) if recipes_dir else DEFAULT_RECIPES_DIR
|
|
84
|
-
root.mkdir(parents=True, exist_ok=True)
|
|
85
|
-
|
|
86
|
-
dataset_id = str(recipe.get("dataset_id") or recipe.get("repo_id") or "dataset")
|
|
87
|
-
recipe_dir = root / _safe_name(dataset_id)
|
|
88
|
-
recipe_dir.mkdir(parents=True, exist_ok=True)
|
|
89
|
-
|
|
90
|
-
out_path = recipe_dir / "download_recipe.json"
|
|
91
|
-
out_path.write_text(json.dumps(recipe, indent=2, ensure_ascii=False), encoding="utf-8")
|
|
92
|
-
return str(out_path)
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def get_download_recipe(dataset_id: str, recipes_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
|
|
96
|
-
root = Path(recipes_dir) if recipes_dir else DEFAULT_RECIPES_DIR
|
|
97
|
-
path = root / _safe_name(dataset_id) / "download_recipe.json"
|
|
98
|
-
if not path.exists():
|
|
99
|
-
return None
|
|
100
|
-
|
|
101
|
-
try:
|
|
102
|
-
return json.loads(path.read_text(encoding="utf-8"))
|
|
103
|
-
except Exception:
|
|
104
|
-
return None
|
|
@@ -1,99 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import argparse
|
|
4
|
-
import urllib.request
|
|
5
|
-
import urllib.parse
|
|
6
|
-
from datetime import datetime
|
|
7
|
-
|
|
8
|
-
# WB API for indicators (Series)
|
|
9
|
-
# Source 2 is World Development Indicators
|
|
10
|
-
WB_API_URL = "https://api.worldbank.org/v2/indicator"
|
|
11
|
-
|
|
12
|
-
def search_worldbank(query: str, limit: int = 10):
|
|
13
|
-
"""
|
|
14
|
-
Search World Bank indicators.
|
|
15
|
-
"""
|
|
16
|
-
try:
|
|
17
|
-
# The World Bank Indicators API doesn't have a direct "search" parameter for indicators
|
|
18
|
-
# that works exactly like a search engine. We fetch a page and filter by query terms.
|
|
19
|
-
# Alternatively, we could use the 'qterm' on the documents API, but indicators are more tabular.
|
|
20
|
-
|
|
21
|
-
params = {
|
|
22
|
-
"format": "json",
|
|
23
|
-
"per_page": 299, # Max per page to search through more indicators
|
|
24
|
-
"source": 2
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
query_string = urllib.parse.urlencode(params)
|
|
28
|
-
url = f"{WB_API_URL}?{query_string}"
|
|
29
|
-
|
|
30
|
-
req = urllib.request.Request(url)
|
|
31
|
-
with urllib.request.urlopen(req) as response:
|
|
32
|
-
data = json.load(response)
|
|
33
|
-
|
|
34
|
-
# WB response is [metadata, data_list]
|
|
35
|
-
if len(data) < 2:
|
|
36
|
-
return []
|
|
37
|
-
|
|
38
|
-
indicators = data[1]
|
|
39
|
-
|
|
40
|
-
results = []
|
|
41
|
-
count = 0
|
|
42
|
-
|
|
43
|
-
query_terms = query.lower().split()
|
|
44
|
-
|
|
45
|
-
for ind in indicators:
|
|
46
|
-
name = ind.get('name', '')
|
|
47
|
-
source_note = ind.get('sourceNote', '')
|
|
48
|
-
text = (name + " " + source_note).lower()
|
|
49
|
-
|
|
50
|
-
# Simple keyword matching
|
|
51
|
-
if all(term in text for term in query_terms):
|
|
52
|
-
metadata = {
|
|
53
|
-
"id": f"wb:{ind.get('id')}",
|
|
54
|
-
"source": "worldbank",
|
|
55
|
-
"name": name,
|
|
56
|
-
"description": source_note or "No description available.",
|
|
57
|
-
"downloads": 1000, # Placeholder (high relevance for WB)
|
|
58
|
-
"likes": 100,
|
|
59
|
-
"last_updated": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
|
|
60
|
-
"quality_score": 95, # Institutional data is high quality
|
|
61
|
-
"license": {
|
|
62
|
-
"id": "cc-by-4.0",
|
|
63
|
-
"name": "Creative Commons Attribution 4.0",
|
|
64
|
-
"category": "safe",
|
|
65
|
-
"usage_restrictions": [],
|
|
66
|
-
"warnings": []
|
|
67
|
-
},
|
|
68
|
-
"tags": [ind.get('source', {}).get('value')] if ind.get('source') else [],
|
|
69
|
-
"total_examples": 0, # Time series length varies
|
|
70
|
-
"is_safe_source": True,
|
|
71
|
-
"is_structured": True,
|
|
72
|
-
"metadata_url": f"https://data.worldbank.org/indicator/{ind.get('id')}",
|
|
73
|
-
"domain": "economics"
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
results.append(metadata)
|
|
77
|
-
count += 1
|
|
78
|
-
if count >= limit:
|
|
79
|
-
break
|
|
80
|
-
|
|
81
|
-
return results
|
|
82
|
-
|
|
83
|
-
except Exception as e:
|
|
84
|
-
return {"error": str(e)}
|
|
85
|
-
|
|
86
|
-
def main():
|
|
87
|
-
parser = argparse.ArgumentParser(description="World Bank Adapter")
|
|
88
|
-
parser.add_argument("--action", required=True, choices=["search"])
|
|
89
|
-
parser.add_argument("--query", required=True)
|
|
90
|
-
parser.add_argument("--limit", type=int, default=10)
|
|
91
|
-
|
|
92
|
-
args = parser.parse_args()
|
|
93
|
-
|
|
94
|
-
if args.action == "search":
|
|
95
|
-
results = search_worldbank(args.query, args.limit)
|
|
96
|
-
print(json.dumps(results))
|
|
97
|
-
|
|
98
|
-
if __name__ == "__main__":
|
|
99
|
-
main()
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
export class QualityAnalyzer {
|
|
5
|
-
cache;
|
|
6
|
-
pythonPath = "python"; // Assumes python is in PATH
|
|
7
|
-
scriptPath;
|
|
8
|
-
constructor(cache, buildDir = process.cwd()) {
|
|
9
|
-
// buildDir is the directory containing the compiled JS (e.g., build/)
|
|
10
|
-
// Priority:
|
|
11
|
-
// 1. ~/.vesper/python (stable synced location)
|
|
12
|
-
// 2. build/python (production)
|
|
13
|
-
// 3. src/python (development)
|
|
14
|
-
this.cache = cache;
|
|
15
|
-
const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
16
|
-
const dataRoot = path.join(homeDir, ".vesper");
|
|
17
|
-
const scriptPath0 = path.resolve(dataRoot, "python", "quality_engine.py");
|
|
18
|
-
const scriptPath1 = path.resolve(buildDir, "python", "quality_engine.py");
|
|
19
|
-
const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "quality_engine.py");
|
|
20
|
-
const scriptPath3 = path.resolve(buildDir, "..", "python", "quality_engine.py");
|
|
21
|
-
if (fs.existsSync(scriptPath0)) {
|
|
22
|
-
this.scriptPath = scriptPath0;
|
|
23
|
-
}
|
|
24
|
-
else if (fs.existsSync(scriptPath1)) {
|
|
25
|
-
this.scriptPath = scriptPath1;
|
|
26
|
-
}
|
|
27
|
-
else if (fs.existsSync(scriptPath2)) {
|
|
28
|
-
this.scriptPath = scriptPath2;
|
|
29
|
-
}
|
|
30
|
-
else if (fs.existsSync(scriptPath3)) {
|
|
31
|
-
this.scriptPath = scriptPath3;
|
|
32
|
-
}
|
|
33
|
-
else {
|
|
34
|
-
// Fallback to stable data path, error will be caught during execution
|
|
35
|
-
this.scriptPath = scriptPath0;
|
|
36
|
-
console.error(`[QualityAnalyzer] WARNING: Python script not found!`);
|
|
37
|
-
}
|
|
38
|
-
// Detect Python command (Windows may use 'py' instead of 'python')
|
|
39
|
-
if (process.platform === "win32") {
|
|
40
|
-
this.pythonPath = "py";
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
/**
|
|
44
|
-
* Run quality analysis on a local file (CSV/Parquet/JSON)
|
|
45
|
-
* @param datasetId Used for caching
|
|
46
|
-
*/
|
|
47
|
-
async analyze(filePath, datasetId) {
|
|
48
|
-
if (this.cache && datasetId) {
|
|
49
|
-
const cached = await this.cache.getReport(datasetId);
|
|
50
|
-
if (cached) {
|
|
51
|
-
console.error(`[QualityAnalyzer] Cache hit for ${datasetId}`);
|
|
52
|
-
return cached;
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
const report = await new Promise((resolve, reject) => {
|
|
56
|
-
const process = spawn(this.pythonPath, [this.scriptPath, filePath]);
|
|
57
|
-
let stdout = "";
|
|
58
|
-
let stderr = "";
|
|
59
|
-
process.stdout.on("data", (data) => {
|
|
60
|
-
stdout += data.toString();
|
|
61
|
-
});
|
|
62
|
-
process.stderr.on("data", (data) => {
|
|
63
|
-
stderr += data.toString();
|
|
64
|
-
});
|
|
65
|
-
process.on("close", (code) => {
|
|
66
|
-
if (code !== 0) {
|
|
67
|
-
const errorDetails = `Quality Analyzer failed (code ${code})
|
|
68
|
-
Command: ${this.pythonPath} ${this.scriptPath} ${filePath}
|
|
69
|
-
Script path exists: ${fs.existsSync(this.scriptPath)}
|
|
70
|
-
Error output: ${stderr}`;
|
|
71
|
-
reject(new Error(errorDetails));
|
|
72
|
-
return;
|
|
73
|
-
}
|
|
74
|
-
try {
|
|
75
|
-
const report = JSON.parse(stdout);
|
|
76
|
-
if (report.error) {
|
|
77
|
-
reject(new Error(report.error));
|
|
78
|
-
}
|
|
79
|
-
else {
|
|
80
|
-
resolve(report);
|
|
81
|
-
}
|
|
82
|
-
}
|
|
83
|
-
catch (e) {
|
|
84
|
-
reject(new Error(`Failed to parse analyzer output: ${stdout}`));
|
|
85
|
-
}
|
|
86
|
-
});
|
|
87
|
-
});
|
|
88
|
-
if (this.cache && datasetId) {
|
|
89
|
-
await this.cache.saveReport(datasetId, report);
|
|
90
|
-
}
|
|
91
|
-
return report;
|
|
92
|
-
}
|
|
93
|
-
}
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
export class ImageAnalyzer {
|
|
5
|
-
pythonPath = "python";
|
|
6
|
-
scriptPath;
|
|
7
|
-
constructor(buildDir = process.cwd()) {
|
|
8
|
-
// buildDir is the directory containing the compiled JS (e.g., build/)
|
|
9
|
-
const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
10
|
-
const dataRoot = path.join(homeDir, ".vesper");
|
|
11
|
-
const scriptPath0 = path.resolve(dataRoot, "python", "image_engine.py");
|
|
12
|
-
const scriptPath1 = path.resolve(buildDir, "python", "image_engine.py");
|
|
13
|
-
const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "image_engine.py");
|
|
14
|
-
const scriptPath3 = path.resolve(buildDir, "..", "python", "image_engine.py");
|
|
15
|
-
if (fs.existsSync(scriptPath0)) {
|
|
16
|
-
this.scriptPath = scriptPath0;
|
|
17
|
-
}
|
|
18
|
-
else if (fs.existsSync(scriptPath1)) {
|
|
19
|
-
this.scriptPath = scriptPath1;
|
|
20
|
-
}
|
|
21
|
-
else if (fs.existsSync(scriptPath2)) {
|
|
22
|
-
this.scriptPath = scriptPath2;
|
|
23
|
-
}
|
|
24
|
-
else if (fs.existsSync(scriptPath3)) {
|
|
25
|
-
this.scriptPath = scriptPath3;
|
|
26
|
-
}
|
|
27
|
-
else {
|
|
28
|
-
this.scriptPath = scriptPath0;
|
|
29
|
-
}
|
|
30
|
-
// Detect Python command (Windows may use 'py' instead of 'python')
|
|
31
|
-
if (process.platform === "win32") {
|
|
32
|
-
this.pythonPath = "py";
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
/**
|
|
36
|
-
* Analyze image quality for a single file or a directory
|
|
37
|
-
*/
|
|
38
|
-
async analyze(inputPath) {
|
|
39
|
-
return new Promise((resolve, reject) => {
|
|
40
|
-
const process = spawn(this.pythonPath, [
|
|
41
|
-
this.scriptPath,
|
|
42
|
-
inputPath
|
|
43
|
-
]);
|
|
44
|
-
let stdout = "";
|
|
45
|
-
let stderr = "";
|
|
46
|
-
process.stdout.on("data", (data) => {
|
|
47
|
-
stdout += data.toString();
|
|
48
|
-
});
|
|
49
|
-
process.stderr.on("data", (data) => {
|
|
50
|
-
stderr += data.toString();
|
|
51
|
-
});
|
|
52
|
-
process.on("error", (err) => {
|
|
53
|
-
if (err.code === "ENOENT") {
|
|
54
|
-
// Python not found - return a graceful failure report
|
|
55
|
-
resolve({
|
|
56
|
-
total_images: 1,
|
|
57
|
-
ok_images: 0,
|
|
58
|
-
failed_images: 1,
|
|
59
|
-
details: [{
|
|
60
|
-
status: "error",
|
|
61
|
-
error: "Python not installed or not in PATH. Please install Python to use image analysis features."
|
|
62
|
-
}]
|
|
63
|
-
});
|
|
64
|
-
}
|
|
65
|
-
else {
|
|
66
|
-
reject(new Error(`Failed to start image analysis process: ${err.message}`));
|
|
67
|
-
}
|
|
68
|
-
});
|
|
69
|
-
process.on("close", (code) => {
|
|
70
|
-
if (code !== 0) {
|
|
71
|
-
// Handle case where script fails
|
|
72
|
-
resolve({
|
|
73
|
-
total_images: 1,
|
|
74
|
-
ok_images: 0,
|
|
75
|
-
failed_images: 1,
|
|
76
|
-
details: [{
|
|
77
|
-
status: "error",
|
|
78
|
-
error: `Image Analyzer process failed (code ${code}): ${stderr || "Unknown error"}`
|
|
79
|
-
}]
|
|
80
|
-
});
|
|
81
|
-
return;
|
|
82
|
-
}
|
|
83
|
-
try {
|
|
84
|
-
const result = JSON.parse(stdout);
|
|
85
|
-
if (result.error) {
|
|
86
|
-
resolve({
|
|
87
|
-
total_images: 1,
|
|
88
|
-
ok_images: 0,
|
|
89
|
-
failed_images: 1,
|
|
90
|
-
details: [{
|
|
91
|
-
status: "error",
|
|
92
|
-
error: result.error
|
|
93
|
-
}]
|
|
94
|
-
});
|
|
95
|
-
}
|
|
96
|
-
else {
|
|
97
|
-
resolve(result);
|
|
98
|
-
}
|
|
99
|
-
}
|
|
100
|
-
catch (e) {
|
|
101
|
-
resolve({
|
|
102
|
-
total_images: 1,
|
|
103
|
-
ok_images: 0,
|
|
104
|
-
failed_images: 1,
|
|
105
|
-
details: [{
|
|
106
|
-
status: "error",
|
|
107
|
-
error: `Failed to parse image analyzer output: ${stdout}`
|
|
108
|
-
}]
|
|
109
|
-
});
|
|
110
|
-
}
|
|
111
|
-
});
|
|
112
|
-
});
|
|
113
|
-
}
|
|
114
|
-
}
|
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
export class MediaAnalyzer {
|
|
5
|
-
pythonPath = "python";
|
|
6
|
-
scriptPath;
|
|
7
|
-
constructor(buildDir = process.cwd()) {
|
|
8
|
-
// buildDir is the directory containing the compiled JS (e.g., build/)
|
|
9
|
-
const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
10
|
-
const dataRoot = path.join(homeDir, ".vesper");
|
|
11
|
-
const scriptPath0 = path.resolve(dataRoot, "python", "media_engine.py");
|
|
12
|
-
const scriptPath1 = path.resolve(buildDir, "python", "media_engine.py");
|
|
13
|
-
const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "media_engine.py");
|
|
14
|
-
const scriptPath3 = path.resolve(buildDir, "..", "python", "media_engine.py");
|
|
15
|
-
if (fs.existsSync(scriptPath0)) {
|
|
16
|
-
this.scriptPath = scriptPath0;
|
|
17
|
-
}
|
|
18
|
-
else if (fs.existsSync(scriptPath1)) {
|
|
19
|
-
this.scriptPath = scriptPath1;
|
|
20
|
-
}
|
|
21
|
-
else if (fs.existsSync(scriptPath2)) {
|
|
22
|
-
this.scriptPath = scriptPath2;
|
|
23
|
-
}
|
|
24
|
-
else if (fs.existsSync(scriptPath3)) {
|
|
25
|
-
this.scriptPath = scriptPath3;
|
|
26
|
-
}
|
|
27
|
-
else {
|
|
28
|
-
this.scriptPath = scriptPath0;
|
|
29
|
-
}
|
|
30
|
-
// Detect Python command (Windows may use 'py' instead of 'python')
|
|
31
|
-
if (process.platform === "win32") {
|
|
32
|
-
this.pythonPath = "py";
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
/**
|
|
36
|
-
* Analyze audio/video quality for a single file or a directory
|
|
37
|
-
*/
|
|
38
|
-
async analyze(inputPath) {
|
|
39
|
-
return new Promise((resolve, reject) => {
|
|
40
|
-
const process = spawn(this.pythonPath, [
|
|
41
|
-
this.scriptPath,
|
|
42
|
-
inputPath
|
|
43
|
-
]);
|
|
44
|
-
let stdout = "";
|
|
45
|
-
let stderr = "";
|
|
46
|
-
process.stdout.on("data", (data) => {
|
|
47
|
-
stdout += data.toString();
|
|
48
|
-
});
|
|
49
|
-
process.stderr.on("data", (data) => {
|
|
50
|
-
stderr += data.toString();
|
|
51
|
-
});
|
|
52
|
-
process.on("error", (err) => {
|
|
53
|
-
if (err.code === "ENOENT") {
|
|
54
|
-
// Python not found - return a graceful failure report
|
|
55
|
-
resolve({
|
|
56
|
-
total_files: 1,
|
|
57
|
-
ok_files: 0,
|
|
58
|
-
failed_files: 1,
|
|
59
|
-
details: [{
|
|
60
|
-
status: "error",
|
|
61
|
-
error: "Python not installed or not in PATH. Please install Python to use media analysis features."
|
|
62
|
-
}]
|
|
63
|
-
});
|
|
64
|
-
}
|
|
65
|
-
else {
|
|
66
|
-
reject(new Error(`Failed to start media analysis process: ${err.message}`));
|
|
67
|
-
}
|
|
68
|
-
});
|
|
69
|
-
process.on("close", (code) => {
|
|
70
|
-
if (code !== 0) {
|
|
71
|
-
// Handle case where python exists but script fails
|
|
72
|
-
resolve({
|
|
73
|
-
total_files: 1,
|
|
74
|
-
ok_files: 0,
|
|
75
|
-
failed_files: 1,
|
|
76
|
-
details: [{
|
|
77
|
-
status: "error",
|
|
78
|
-
error: `Media Analyzer process failed (code ${code}): ${stderr || "Unknown error"}`
|
|
79
|
-
}]
|
|
80
|
-
});
|
|
81
|
-
return;
|
|
82
|
-
}
|
|
83
|
-
try {
|
|
84
|
-
const result = JSON.parse(stdout);
|
|
85
|
-
if (result.error) {
|
|
86
|
-
// Return error as part of report instead of rejecting
|
|
87
|
-
resolve({
|
|
88
|
-
total_files: 1,
|
|
89
|
-
ok_files: 0,
|
|
90
|
-
failed_files: 1,
|
|
91
|
-
details: [{
|
|
92
|
-
status: "error",
|
|
93
|
-
error: result.error
|
|
94
|
-
}]
|
|
95
|
-
});
|
|
96
|
-
}
|
|
97
|
-
else {
|
|
98
|
-
resolve(result);
|
|
99
|
-
}
|
|
100
|
-
}
|
|
101
|
-
catch (e) {
|
|
102
|
-
resolve({
|
|
103
|
-
total_files: 1,
|
|
104
|
-
ok_files: 0,
|
|
105
|
-
failed_files: 1,
|
|
106
|
-
details: [{
|
|
107
|
-
status: "error",
|
|
108
|
-
error: `Failed to parse media analyzer output: ${stdout}`
|
|
109
|
-
}]
|
|
110
|
-
});
|
|
111
|
-
}
|
|
112
|
-
});
|
|
113
|
-
});
|
|
114
|
-
}
|
|
115
|
-
}
|
|
@@ -1,162 +0,0 @@
|
|
|
1
|
-
import fs from "fs";
|
|
2
|
-
import { ImageAnalyzer } from "./image-analyzer.js";
|
|
3
|
-
import { MediaAnalyzer } from "./media-analyzer.js";
|
|
4
|
-
export class QualityOrchestrator {
    imageAnalyzer;
    mediaAnalyzer;
    /**
     * Orchestrates per-modality quality analyzers into one unified report.
     * @param {string} [projectRoot=process.cwd()] - Root directory handed to the analyzers.
     */
    constructor(projectRoot = process.cwd()) {
        this.imageAnalyzer = new ImageAnalyzer(projectRoot);
        this.mediaAnalyzer = new MediaAnalyzer(projectRoot);
    }
    /**
     * Zero-safe division: returns 0 instead of NaN/Infinity when the
     * denominator is not positive (e.g. an empty dataset directory).
     * BUG FIX: the previous code divided by total counts unguarded, so an
     * empty modality produced NaN that propagated into the overall score.
     * @param {number} numerator
     * @param {number} denominator
     * @returns {number}
     */
    static #ratio(numerator, denominator) {
        return denominator > 0 ? numerator / denominator : 0;
    }
    /**
     * Detect modalities present in a dataset directory
     *
     * Only the top level of the directory is scanned (no recursion);
     * detection is by file extension. Result order is fixed:
     * text, image, audio, video.
     * @param {string} datasetPath - Directory to inspect.
     * @returns {string[]} Subset of ["text", "image", "audio", "video"].
     */
    detectModalities(datasetPath) {
        if (!fs.existsSync(datasetPath)) {
            return [];
        }
        const files = fs.readdirSync(datasetPath);
        // Ordered table keeps the output deterministic and the checks in one place.
        const extensionChecks = [
            ["text", /\.(csv|json|parquet|txt)$/i],
            ["image", /\.(jpg|jpeg|png|bmp|webp)$/i],
            ["audio", /\.(wav|mp3|flac|ogg|m4a)$/i],
            ["video", /\.(mp4|avi|mkv|mov|wmv)$/i],
        ];
        return extensionChecks
            .filter(([, pattern]) => files.some(f => pattern.test(f)))
            .map(([modality]) => modality);
    }
    /**
     * Generate a unified quality report for a dataset
     *
     * Each detected modality contributes a 0-100 sub-score; the overall score
     * is the rounded mean of the sub-scores that were actually computed.
     * Analyzer failures are logged and skipped so one bad modality does not
     * sink the whole report (best-effort semantics preserved from the original).
     * @param {string} datasetId - Identifier echoed into the report.
     * @param {string} datasetPath - Dataset directory to analyze.
     * @param {object} [textQuality] - Pre-computed text/tabular metrics
     *   (row_count, column_count, missing_percentage, duplicate_percentage).
     * @returns {Promise<object>} The assembled quality report.
     */
    async generateReport(datasetId, datasetPath, textQuality) {
        const modalities = this.detectModalities(datasetPath);
        const report = {
            dataset_id: datasetId,
            modalities,
            overall_quality_score: 0,
            recommendations: [],
            generated_at: new Date().toISOString()
        };
        let totalScore = 0;
        let scoreCount = 0;
        // Text quality (if provided from existing analysis)
        if (textQuality) {
            report.text_quality = {
                row_count: textQuality.row_count || 0,
                column_count: textQuality.column_count || 0,
                missing_percentage: textQuality.missing_percentage || 0,
                duplicate_percentage: textQuality.duplicate_percentage || 0
            };
            // Score 0-100: missing data is penalized twice as hard as duplicates.
            const textScore = Math.max(0, 100 - (report.text_quality.missing_percentage * 2) - (report.text_quality.duplicate_percentage));
            totalScore += textScore;
            scoreCount++;
            if (report.text_quality.missing_percentage > 20) {
                report.recommendations.push("High missing data detected. Consider imputation or removal.");
            }
            if (report.text_quality.duplicate_percentage > 10) {
                report.recommendations.push("Significant duplicates found. Run deduplication.");
            }
        }
        // Image quality
        if (modalities.includes("image")) {
            try {
                const imageReport = await this.imageAnalyzer.analyze(datasetPath);
                report.image_quality = {
                    total_images: imageReport.total_images,
                    corrupted_count: imageReport.corrupted_count,
                    avg_resolution: `${Math.round(imageReport.average_width)}x${Math.round(imageReport.average_height)}`,
                    // Guarded ratio: 0 images previously yielded NaN here.
                    blurry_percentage: QualityOrchestrator.#ratio(imageReport.blurry_count, imageReport.total_images) * 100
                };
                // Corruption costs up to 50 points; blur up to 30 (at 100% blurry).
                const corruptionPenalty = QualityOrchestrator.#ratio(imageReport.corrupted_count, imageReport.total_images) * 50;
                const blurPenalty = report.image_quality.blurry_percentage * 0.3;
                const imageScore = Math.max(0, 100 - corruptionPenalty - blurPenalty);
                totalScore += imageScore;
                scoreCount++;
                if (report.image_quality.corrupted_count > 0) {
                    report.recommendations.push(`Remove ${imageReport.corrupted_count} corrupted images.`);
                }
                if (report.image_quality.blurry_percentage > 15) {
                    report.recommendations.push("High blur detected. Consider filtering blurry images.");
                }
            }
            catch (e) {
                console.error("Image analysis failed:", e);
            }
        }
        // Audio quality
        if (modalities.includes("audio")) {
            try {
                const audioReport = await this.mediaAnalyzer.analyze(datasetPath);
                if ('avg_audio_duration' in audioReport) {
                    const silentFiles = audioReport.details.filter(d => d.status === "ok" && 'is_silent' in d && d.is_silent).length;
                    // Guarded: ok_files === 0 previously produced NaN (0 / 0).
                    const avgSampleRate = QualityOrchestrator.#ratio(audioReport.details
                        .filter(d => d.status === "ok" && 'sample_rate' in d)
                        .reduce((sum, d) => sum + (('sample_rate' in d) ? (d.sample_rate || 0) : 0), 0), audioReport.ok_files);
                    report.audio_quality = {
                        total_files: audioReport.total_files,
                        avg_duration: audioReport.avg_audio_duration || 0,
                        avg_sample_rate: avgSampleRate,
                        silent_percentage: QualityOrchestrator.#ratio(silentFiles, audioReport.total_files) * 100
                    };
                    // Unreadable files cost up to 50 points; silence up to 50 (at 100% silent).
                    const failurePenalty = QualityOrchestrator.#ratio(audioReport.failed_files, audioReport.total_files) * 50;
                    const silentPenalty = report.audio_quality.silent_percentage * 0.5;
                    const audioScore = Math.max(0, 100 - failurePenalty - silentPenalty);
                    totalScore += audioScore;
                    scoreCount++;
                    if (report.audio_quality.silent_percentage > 10) {
                        report.recommendations.push("High percentage of silent audio files detected.");
                    }
                }
            }
            catch (e) {
                console.error("Audio analysis failed:", e);
            }
        }
        // Video quality
        if (modalities.includes("video")) {
            try {
                const videoReport = await this.mediaAnalyzer.analyze(datasetPath);
                if ('avg_video_duration' in videoReport) {
                    const highRiskFiles = videoReport.details.filter(d => d.status === "ok" && d.corruption_risk === "high").length;
                    report.video_quality = {
                        total_files: videoReport.total_files,
                        avg_duration: videoReport.avg_video_duration || 0,
                        avg_fps: videoReport.avg_fps || 0,
                        corruption_risk_high: highRiskFiles
                    };
                    // Unreadable files cost up to 50 points; high corruption risk up to 30.
                    const failurePenalty = QualityOrchestrator.#ratio(videoReport.failed_files, videoReport.total_files) * 50;
                    const corruptionPenalty = QualityOrchestrator.#ratio(highRiskFiles, videoReport.total_files) * 30;
                    const videoScore = Math.max(0, 100 - failurePenalty - corruptionPenalty);
                    totalScore += videoScore;
                    scoreCount++;
                    if (highRiskFiles > 0) {
                        report.recommendations.push(`${highRiskFiles} video files have high corruption risk.`);
                    }
                }
            }
            catch (e) {
                console.error("Video analysis failed:", e);
            }
        }
        // Calculate overall quality score
        report.overall_quality_score = scoreCount > 0 ? Math.round(totalScore / scoreCount) : 0;
        if (report.recommendations.length === 0) {
            report.recommendations.push("Dataset quality is good. No major issues detected.");
        }
        return report;
    }
}
|