@vespermcp/mcp-server 1.0.4 → 1.0.6
This diff shows the publicly released contents of these package versions as they appear in their public registry, and is provided for informational purposes only.
- package/README.md +6 -4
- package/build/cleaning/cleaner.js +27 -2
- package/build/cleaning/executor.js +7 -6
- package/build/cleaning/planner.js +16 -4
- package/build/config/config-manager.js +199 -0
- package/build/export/exporter.js +26 -2
- package/build/index.js +272 -72
- package/build/ingestion/ingestor.js +17 -16
- package/build/ingestion/kaggle-downloader.js +25 -2
- package/build/install/install-service.js +1 -1
- package/build/jobs/manager.js +17 -10
- package/build/metadata/monitoring-service.js +2 -2
- package/build/metadata/scraper.js +8 -8
- package/build/metadata/store.js +17 -2
- package/build/monitoring/observability.js +2 -2
- package/build/preparation/target-detector.js +75 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/export_engine.py +131 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/quality_engine.py +243 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +40 -4
- package/build/quality/image-analyzer.js +73 -5
- package/build/quality/media-analyzer.js +74 -5
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/test-mcp-v5.js +12 -11
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/engine.js +13 -2
- package/build/search/jit-orchestrator.js +6 -40
- package/build/search/vector-store.js +18 -0
- package/build/splitting/splitter.js +27 -2
- package/build/tools/formatter.js +23 -8
- package/build/utils/downloader.js +2 -2
- package/build/utils/selector.js +69 -0
- package/package.json +8 -4
- package/src/python/cleaner.py +33 -3
- package/src/python/export_engine.py +19 -0
- package/src/python/target_engine.py +154 -0
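The hunks below cover four of the newly added Python engines (image_engine.py, media_engine.py, nasa_adapter.py, quality_engine.py). They share one calling convention: each is a standalone script that takes a path or argparse flags on the command line and prints a single JSON report to stdout, which the Node entry point presumably consumes via a child process. A minimal, illustrative sketch of that contract from the caller's side; the run_engine helper, interpreter choice, and any paths used in the notes after each hunk are assumptions, not part of the package:

    import json
    import subprocess
    import sys

    def run_engine(script, *args):
        # Run one of the bundled engines and parse the JSON it prints to stdout.
        # Script location and interpreter are illustrative assumptions.
        proc = subprocess.run([sys.executable, script, *args],
                              capture_output=True, text=True)
        return json.loads(proc.stdout)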
package/build/python/image_engine.py
@@ -0,0 +1,86 @@
import sys
import json
import os
from PIL import Image
import cv2
import numpy as np

def analyze_image(image_path):
    stats = {
        "path": image_path,
        "filename": os.path.basename(image_path),
        "status": "ok",
        "error": None
    }

    try:
        # 1. Basic Metadata with Pillow
        img = Image.open(image_path)
        stats["width"], stats["height"] = img.size
        stats["format"] = img.format
        stats["mode"] = img.mode

        # 2. Advanced Analysis with OpenCV
        cv_img = cv2.imread(image_path)
        if cv_img is None:
            stats["status"] = "corrupted"
            stats["error"] = "OpenCV failed to decode image"
            return stats

        # Blur detection (Laplacian variance)
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
        laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
        stats["blur_score"] = laplacian_var
        # Rule-of-thumb threshold; cast from numpy bool so json.dumps can serialize the report
        stats["is_blurry"] = bool(laplacian_var < 100)

        # Brightness
        stats["brightness"] = np.mean(gray)

        # Aspect Ratio
        stats["aspect_ratio"] = stats["width"] / stats["height"]

    except Exception as e:
        stats["status"] = "failed"
        stats["error"] = str(e)

    return stats

def main():
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No path provided"}))
        sys.exit(1)

    input_path = sys.argv[1]
    results = []

    if os.path.isfile(input_path):
        results.append(analyze_image(input_path))
    elif os.path.isdir(input_path):
        # Analyze first 50 images for performance in this demo
        valid_exts = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
        files = [os.path.join(input_path, f) for f in os.listdir(input_path) if f.lower().endswith(valid_exts)]
        for f in files[:50]:
            results.append(analyze_image(f))
    else:
        print(json.dumps({"error": "Invalid path"}))
        sys.exit(1)

    # Aggregate stats
    if not results:
        print(json.dumps({"error": "No images found"}))
        sys.exit(1)

    report = {
        "total_images": len(results),
        "corrupted_count": len([r for r in results if r["status"] == "corrupted"]),
        "failed_count": len([r for r in results if r["status"] == "failed"]),
        "average_width": np.mean([r["width"] for r in results if "width" in r]),
        "average_height": np.mean([r["height"] for r in results if "height" in r]),
        "blurry_count": len([r for r in results if r.get("is_blurry")]),
        "individual_results": results
    }

    print(json.dumps(report))

if __name__ == "__main__":
    main()
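The image report nests per-file stats under individual_results, so a caller mostly cares about status, is_blurry (the script's fixed Laplacian-variance threshold of 100), and the aggregate counts. A brief, hypothetical consumer, reusing the run_engine sketch above; the dataset path is invented:

    report = run_engine("build/python/image_engine.py", "datasets/pets/images")

    needs_review = [
        r["filename"]
        for r in report["individual_results"]
        if r["status"] != "ok" or r.get("is_blurry")
    ]
    print(f"{len(needs_review)} of {report['total_images']} images flagged "
          f"(corrupted: {report['corrupted_count']}, blurry: {report['blurry_count']})")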
package/build/python/media_engine.py
@@ -0,0 +1,133 @@
import sys
import json
import os
import cv2
import numpy as np

# Audio analysis depends on librosa/soundfile. Fallback if not available.
try:
    import librosa
    AUDIO_SUPPORT = True
except ImportError:
    AUDIO_SUPPORT = False

def analyze_audio(path):
    if not AUDIO_SUPPORT:
        return {"status": "error", "error": "librosa not installed"}

    try:
        # Load audio (mono, default sr)
        y, sr = librosa.load(path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)

        # Audio metrics
        rms = librosa.feature.rms(y=y)
        avg_rms = float(np.mean(rms))

        return {
            "status": "ok",
            "type": "audio",
            "filename": os.path.basename(path),
            "sample_rate": int(sr),
            "duration": float(duration),
            "avg_volume_rms": avg_rms,
            "is_silent": avg_rms < 0.001
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}

def analyze_video(path):
    try:
        cap = cv2.VideoCapture(path)
        if not cap.isOpened():
            return {"status": "error", "error": "Could not open video file"}

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps if fps > 0 else 0

        # Check integrity by reading a few frames
        test_frame_indices = [0, frame_count // 2, frame_count - 1] if frame_count > 0 else []
        failed_frames = 0

        for idx in test_frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret or frame is None:
                failed_frames += 1

        cap.release()

        return {
            "status": "ok",
            "type": "video",
            "filename": os.path.basename(path),
            "width": width,
            "height": height,
            "fps": float(fps),
            "duration": float(duration),
            "frame_count": frame_count,
            "corruption_risk": "high" if failed_frames > 0 else "low"
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}

def main():
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No path provided"}))
        sys.exit(1)

    input_path = sys.argv[1]
    results = []

    # Supported extensions
    AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg", ".m4a")
    VIDEO_EXTS = (".mp4", ".avi", ".mkv", ".mov", ".wmv")

    if os.path.isfile(input_path):
        ext = os.path.splitext(input_path.lower())[1]
        if ext in AUDIO_EXTS:
            results.append(analyze_audio(input_path))
        elif ext in VIDEO_EXTS:
            results.append(analyze_video(input_path))
        else:
            results.append({"status": "error", "error": f"Unsupported file type: {ext}"})
    elif os.path.isdir(input_path):
        files = [os.path.join(input_path, f) for f in os.listdir(input_path)]
        for f in files[:50]:  # Limit for demo
            ext = os.path.splitext(f.lower())[1]
            if ext in AUDIO_EXTS:
                results.append(analyze_audio(f))
            elif ext in VIDEO_EXTS:
                results.append(analyze_video(f))
    else:
        print(json.dumps({"error": "Invalid path"}))
        sys.exit(1)

    # Filtering failed results for report aggregation
    ok_results = [r for r in results if r.get("status") == "ok"]

    report = {
        "total_files": len(results),
        "ok_files": len(ok_results),
        "failed_files": len(results) - len(ok_results),
        "details": results
    }

    # Calculate some averages if files were found
    if ok_results:
        audio_files = [r for r in ok_results if r["type"] == "audio"]
        video_files = [r for r in ok_results if r["type"] == "video"]

        if audio_files:
            report["avg_audio_duration"] = float(np.mean([r["duration"] for r in audio_files]))
        if video_files:
            report["avg_video_duration"] = float(np.mean([r["duration"] for r in video_files]))
            report["avg_fps"] = float(np.mean([r["fps"] for r in video_files]))

    print(json.dumps(report))

if __name__ == "__main__":
    main()
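Audio support degrades gracefully here: if librosa is missing, every audio file comes back with status "error", while video analysis only needs OpenCV. On the consumer side the useful fields are is_silent for audio and corruption_risk for video. A small hypothetical filter over a parsed report, again reusing the run_engine sketch; the path is made up:

    report = run_engine("build/python/media_engine.py", "datasets/podcasts/raw")

    silent_audio = [d["filename"] for d in report["details"]
                    if d.get("type") == "audio" and d.get("is_silent")]
    risky_video = [d["filename"] for d in report["details"]
                   if d.get("type") == "video" and d.get("corruption_risk") == "high"]
    print(f"{report['failed_files']} unreadable, "
          f"{len(silent_audio)} silent audio, {len(risky_video)} suspect video files")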
package/build/python/nasa_adapter.py
@@ -0,0 +1,82 @@
import sys
import json
import argparse
import urllib.request
import urllib.parse
from datetime import datetime

# NASA Data Portal uses Socrata
NASA_API_URL = "https://api.us.socrata.com/api/catalog/v1"
NASA_DOMAIN = "data.nasa.gov"

def search_nasa(query: str, limit: int = 10):
    """
    Search NASA data portal.
    """
    try:
        params = {
            "q": query,
            "limit": limit,
            "domains": NASA_DOMAIN,
            "search_context": NASA_DOMAIN
        }

        query_string = urllib.parse.urlencode(params)
        url = f"{NASA_API_URL}?{query_string}"

        req = urllib.request.Request(url)
        with urllib.request.urlopen(req) as response:
            data = json.load(response)

        results = []
        # Socrata catalog results are in 'results'
        items = data.get('results', [])

        for item in items:
            ds = item.get('resource', {})

            metadata = {
                "id": f"nasa:{ds.get('id')}",
                "source": "nasa",
                "name": ds.get('name'),
                "description": ds.get('description') or "No description available.",
                "downloads": ds.get('download_count', 0),
                "likes": ds.get('view_count', 0) // 10,
                "last_updated": ds.get('updatedAt') or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
                "quality_score": 90,
                "license": {
                    "id": "public_domain",
                    "name": "Public Domain",
                    "category": "safe",
                    "usage_restrictions": [],
                    "warnings": []
                },
                "tags": ds.get('tags', []),
                "total_examples": 0,
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://data.nasa.gov/d/{ds.get('id')}",
                "domain": "science"
            }

            results.append(metadata)

        return results

    except Exception as e:
        return {"error": str(e)}

def main():
    parser = argparse.ArgumentParser(description="NASA Adapter")
    parser.add_argument("--action", required=True, choices=["search"])
    parser.add_argument("--query", required=True)
    parser.add_argument("--limit", type=int, default=10)

    args = parser.parse_args()

    if args.action == "search":
        results = search_nasa(args.query, args.limit)
        print(json.dumps(results))

if __name__ == "__main__":
    main()
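One quirk worth noting for callers: search_nasa returns a list of dataset metadata on success but a plain {"error": ...} dict when the request fails, so the JSON printed by the CLI can take either shape. A hedged parsing sketch; the query string is invented and run_engine is the helper sketched earlier:

    results = run_engine("build/python/nasa_adapter.py",
                         "--action", "search", "--query", "mars rover", "--limit", "5")

    if isinstance(results, dict) and "error" in results:
        print(f"NASA search failed: {results['error']}")
    else:
        for ds in results:
            print(ds["id"], "-", ds["name"], f"({ds['metadata_url']})")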
package/build/python/quality_engine.py
@@ -0,0 +1,243 @@
import sys
import json
import polars as pl
import numpy as np

def analyze_column(df, col_name, dtype):
    stats = {
        "name": col_name,
        "type": str(dtype),
        "inferred_type": str(dtype),  # Default to actual
        "missing_count": 0,
        "missing_percentage": 0.0,
        "unique_count": 0,
        "is_constant": False,
        "is_mixed_type": False
    }

    try:
        col = df[col_name]
        null_count = col.null_count()
        row_count = len(col)

        stats["missing_count"] = null_count
        stats["missing_percentage"] = (null_count / row_count) * 100 if row_count > 0 else 0
        stats["unique_count"] = col.n_unique()
        stats["is_constant"] = stats["unique_count"] <= 1 and row_count > 0

        # Schema Inference & Validation
        is_string = dtype == pl.Utf8 or dtype == pl.Object

        if is_string and row_count > 0:
            # Try inferring Numeric
            # Check if majority can be cast to float
            try:
                # Use strict=False to turn non-numbers into nulls
                numeric_cast = col.str.strip_chars().cast(pl.Float64, strict=False)
                numeric_nulls = numeric_cast.null_count()

                # If valid numbers are significantly more than original nulls, it might be numeric
                valid_numbers = row_count - numeric_nulls
                original_valid = row_count - null_count

                if valid_numbers > 0 and (valid_numbers / original_valid) > 0.9:
                    stats["inferred_type"] = "Numeric (Stored as String)"

                # Mixed type check: If valid numbers exist but plenty of strings too
                elif valid_numbers > 0 and (valid_numbers / original_valid) < 0.9:
                    stats["is_mixed_type"] = True
            except:
                pass

        # Numeric Analysis
        if dtype in [pl.Int64, pl.Int32, pl.Float64, pl.Float32] or stats["inferred_type"].startswith("Numeric"):
            clean_col = col
            if is_string:
                # Cast for analysis if it was inferred
                clean_col = col.str.strip_chars().cast(pl.Float64, strict=False)

            clean_col = clean_col.drop_nulls()

            if len(clean_col) > 0:
                stats["distribution"] = {
                    "min": float(clean_col.min()),
                    "max": float(clean_col.max()),
                    "mean": float(clean_col.mean()),
                    "std": float(clean_col.std()) if len(clean_col) > 1 else 0,
                    "p25": float(clean_col.quantile(0.25)),
                    "p50": float(clean_col.median()),
                    "p75": float(clean_col.quantile(0.75))
                }

        # Categorical Analysis
        if dtype == pl.Utf8 or dtype == pl.Categorical:
            value_counts = col.value_counts(sort=True).head(5)
            # Handle different polars versions return structure for value_counts
            try:
                # Format: struct with name/counts or columns
                rows = value_counts.rows()
                top_values = {}
                for row in rows:
                    val = str(row[0]) if row[0] is not None else "null"
                    count = int(row[1])
                    top_values[val] = count
                stats["top_values"] = top_values
            except:
                pass

    except Exception as e:
        stats["error"] = str(e)

    return stats

def main():
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        sys.exit(1)

    file_path = sys.argv[1]

    try:
        # Robust file reading with extension detection
        file_path_lower = file_path.lower()
        if file_path_lower.endswith(".csv"):
            df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
        elif file_path_lower.endswith(".parquet"):
            try:
                # Try scanning first (faster for large files)
                df = pl.scan_parquet(file_path).limit(10000).collect()
            except:
                df = pl.read_parquet(file_path)
                if len(df) > 10000: df = df.head(10000)
        elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
            # Explicit NDJSON
            df = pl.scan_ndjson(file_path).limit(10000).collect()
        elif file_path_lower.endswith(".json"):
            # Ambiguous .json: Try standard JSON first, then NDJSON fallback
            try:
                # read_json reads standard JSON array [{}, {}]
                df = pl.read_json(file_path)
                if len(df) > 10000: df = df.head(10000)
            except Exception:
                try:
                    # Fallback to NDJSON (common for large datasets mislabeled as .json)
                    df = pl.scan_ndjson(file_path).limit(10000).collect()
                except Exception as e:
                    print(json.dumps({"error": f"Failed to read JSON: {str(e)}"}))
                    sys.exit(1)
        else:
            print(json.dumps({"error": f"Unsupported file extension: {file_path}"}))
            sys.exit(1)

        row_count = len(df)
        column_count = len(df.columns)

        # Duplicate detection (exact)
        try:
            duplicate_count = df.is_duplicated().sum()
        except Exception:
            # Duplicate check might fail on complex nested types (List, Struct)
            duplicate_count = 0

        columns_stats = []
        text_cols = []
        for col in df.columns:
            stats = analyze_column(df, col, df.schema[col])
            columns_stats.append(stats)
            # Check for String type (Polars can return 'String' or 'Utf8' depending on version)
            dtype_str = stats["type"]
            if ("String" in dtype_str or "Utf8" in dtype_str) and stats["unique_count"] > 1:
                text_cols.append(col)

        report = {
            "row_count": row_count,
            "column_count": column_count,
            "duplicate_rows": int(duplicate_count),
            "duplicate_percentage": (duplicate_count / row_count * 100) if row_count > 0 else 0,
            "columns": columns_stats,
            "warnings": [],
            "schema_warnings": [],
            "overall_score": 100
        }

        # Integrity Check 1: Text Duplicates (Fuzzyish Proxy)
        # If duplicated rows are 0, check if main text content is duplicated
        if duplicate_count == 0 and len(text_cols) > 0:
            # Pick longest text column as likely "content"
            # In real impl, we'd use heuristics. For now, first text col.
            target_col = text_cols[0]
            text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
            if text_dupes > 0:
                report["text_duplicates"] = int(text_dupes)
                if text_dupes > (row_count * 0.2):
                    report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")

        # Integrity Check 2: Contamination / Leakage (Basic)
        # (Skipping correlation for now)

        report["class_imbalance_warnings"] = []
        report["pii_warnings"] = []

        # PII Patterns (Regex)
        import re
        pii_patterns = {
            "Email": r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            "Phone": r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',  # Basic US-ish pattern
            "SSN": r'\d{3}-\d{2}-\d{4}',
            "IPv4": r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
        }

        # Bias & PII Analysis
        for col_name, stats in zip(df.columns, columns_stats):
            # Class Imbalance
            if stats["unique_count"] > 1 and stats["unique_count"] < 50:
                try:
                    col = df[col_name]
                    top_val_count = col.value_counts().sort("count", descending=True).row(0)[1]
                    total = len(col)
                    if total > 0:
                        ratio = top_val_count / total
                        if ratio > 0.9:
                            report["class_imbalance_warnings"].append(f"Severe imbalance in '{col_name}': Top class is {(ratio*100):.1f}% of data")
                except:
                    pass

            # PII Detection (on Text Columns only)
            if ("String" in stats["type"] or "Utf8" in stats["type"]):
                try:
                    # Sample for performance (check first 1000 non-null values)
                    sample_text = df[col_name].drop_nulls().head(1000).to_list()
                    # Join a subset to regex against (faster than row-by-row for simple checks)
                    combined_text = " ".join([str(x) for x in sample_text])

                    for pii_type, pattern in pii_patterns.items():
                        if re.search(pattern, combined_text):
                            # Ensure we don't flag column names like "email_address" but actual content
                            # Double check with a strict count if trigger found
                            matches = len(re.findall(pattern, combined_text))
                            if matches > 0:
                                report["pii_warnings"].append(f"Potential {pii_type} detected in column '{col_name}' ({matches} matches in sample)")
                except:
                    pass

        # Basic warnings
        if report["duplicate_percentage"] > 10:
            report["warnings"].append("High duplication rate (>10%)")
        if row_count < 50:
            report["warnings"].append("Dataset is very small (<50 rows)")

        # Schema warnings
        for col in columns_stats:
            if "Numeric" in col.get("inferred_type", "") and "Utf8" in col.get("type", ""):
                report["schema_warnings"].append(f"Column '{col['name']}' looks Numeric but is stored as String")
            if col.get("is_mixed_type"):
                report["schema_warnings"].append(f"Column '{col['name']}' likely contains mixed types (numbers and strings)")

        print(json.dumps(report))

    except Exception as e:
        print(json.dumps({"error": f"Analysis failed: {str(e)}"}))
        sys.exit(1)

if __name__ == "__main__":
    main()
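The quality report collects issues into several lists (warnings, schema_warnings, class_imbalance_warnings, pii_warnings) but, as of this version, leaves overall_score fixed at 100 rather than deducting for them. A minimal sketch of how a caller might fold those lists into a single gate; the scoring weights are invented for illustration and run_engine is the helper sketched earlier:

    report = run_engine("build/python/quality_engine.py", "data/train.csv")

    # Hypothetical local scoring, since the engine's overall_score is always 100
    penalty = (10 * len(report.get("warnings", []))
               + 5 * len(report.get("schema_warnings", []))
               + 15 * len(report.get("pii_warnings", []))
               + 10 * len(report.get("class_imbalance_warnings", [])))
    score = max(0, 100 - penalty)
    print(f"rows={report['row_count']} cols={report['column_count']} score={score}")
    if report.get("pii_warnings"):
        print("PII flagged:", "; ".join(report["pii_warnings"]))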