ma-agents 3.3.0 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.opencode/skills/.ma-agents.json +99 -99
- package/.roo/skills/.ma-agents.json +99 -99
- package/README.md +19 -1
- package/bin/cli.js +55 -0
- package/lib/agents.js +23 -0
- package/lib/bmad-cache/cache-manifest.json +1 -1
- package/lib/bmad-customizations/bmm-demerzel.customize.yaml +36 -0
- package/lib/bmad-customizations/demerzel.md +32 -0
- package/lib/bmad-extension/module-help.csv +13 -0
- package/lib/bmad-extension/skills/bmad-ma-agent-ml/.gitkeep +0 -0
- package/lib/bmad-extension/skills/bmad-ma-agent-ml/SKILL.md +59 -0
- package/lib/bmad-extension/skills/bmad-ma-agent-ml/bmad-skill-manifest.yaml +11 -0
- package/lib/bmad-extension/skills/generate-backlog/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-advise/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-advise/SKILL.md +76 -0
- package/lib/bmad-extension/skills/ml-advise/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-advise/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-analysis/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-analysis/SKILL.md +60 -0
- package/lib/bmad-extension/skills/ml-analysis/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-analysis/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-architecture/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-architecture/SKILL.md +55 -0
- package/lib/bmad-extension/skills/ml-architecture/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-architecture/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-detailed-design/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-detailed-design/SKILL.md +67 -0
- package/lib/bmad-extension/skills/ml-detailed-design/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-detailed-design/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-eda/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-eda/SKILL.md +56 -0
- package/lib/bmad-extension/skills/ml-eda/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/baseline_classifier.py +522 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/class_weights_calculator.py +295 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/clustering_explorer.py +383 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/eda_analyzer.py +654 -0
- package/lib/bmad-extension/skills/ml-eda/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-experiment/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-experiment/SKILL.md +74 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/advanced_trainer_configs.py +430 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/quick_trainer_setup.py +233 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/template_datamodule.py +219 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/template_gnn_module.py +341 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/template_lightning_module.py +158 -0
- package/lib/bmad-extension/skills/ml-experiment/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-experiment/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-hparam/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-hparam/SKILL.md +81 -0
- package/lib/bmad-extension/skills/ml-hparam/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-hparam/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-ideation/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-ideation/SKILL.md +50 -0
- package/lib/bmad-extension/skills/ml-ideation/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-ideation/scripts/validate_ml_prd.py +287 -0
- package/lib/bmad-extension/skills/ml-ideation/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-infra/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-infra/SKILL.md +58 -0
- package/lib/bmad-extension/skills/ml-infra/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-infra/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-retrospective/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-retrospective/SKILL.md +63 -0
- package/lib/bmad-extension/skills/ml-retrospective/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-retrospective/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-revision/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-revision/SKILL.md +82 -0
- package/lib/bmad-extension/skills/ml-revision/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-revision/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-techspec/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-techspec/SKILL.md +80 -0
- package/lib/bmad-extension/skills/ml-techspec/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-techspec/skill.json +7 -0
- package/lib/bmad.js +85 -8
- package/lib/skill-authoring.js +1 -1
- package/package.json +2 -2
- package/test/agent-injection-strategy.test.js +4 -4
- package/test/bmad-version-bump.test.js +34 -34
- package/test/build-bmad-args.test.js +13 -6
- package/test/convert-agents-to-skills.test.js +11 -1
- package/test/extension-module-restructure.test.js +31 -7
- package/test/migration-validation.test.js +14 -11
|
@@ -0,0 +1,654 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
eda_analyzer.py — BMAD DL Lifecycle (inspired by K-Dense claude-scientific-skills)
|
|
4
|
+
ML-focused Exploratory Data Analysis for common deep learning data formats.
|
|
5
|
+
|
|
6
|
+
Supported formats:
|
|
7
|
+
- Image datasets : directory of images (class-labeled subdirs or flat)
|
|
8
|
+
- CSV / TSV : tabular feature/label datasets
|
|
9
|
+
- NumPy : .npy / .npz arrays
|
|
10
|
+
- HDF5 : .h5 / .hdf5 files
|
|
11
|
+
- JSON annotations: COCO-style or flat label files
|
|
12
|
+
|
|
13
|
+
Generates a structured markdown EDA report aligned with TSK-001 requirements:
|
|
14
|
+
class distributions, annotation quality, missing values, split verification.
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
python3 scripts/eda_analyzer.py <data_path> [--output report.md] [--splits train val test]
|
|
18
|
+
python3 scripts/eda_analyzer.py data/images/ --splits train val test
|
|
19
|
+
python3 scripts/eda_analyzer.py data/features.csv
|
|
20
|
+
|
|
21
|
+
Exit codes:
|
|
22
|
+
0 — success, report written
|
|
23
|
+
1 — warnings (partial data)
|
|
24
|
+
2 — error
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import argparse
|
|
30
|
+
import json
|
|
31
|
+
import sys
|
|
32
|
+
from collections import Counter, defaultdict
|
|
33
|
+
from dataclasses import dataclass, field
|
|
34
|
+
from datetime import datetime
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
from typing import Any
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ── Optional imports ───────────────────────────────────────────────────────────
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
import numpy as np
|
|
43
|
+
HAS_NUMPY = True
|
|
44
|
+
except ImportError:
|
|
45
|
+
HAS_NUMPY = False
|
|
46
|
+
|
|
47
|
+
try:
|
|
48
|
+
import csv as _csv
|
|
49
|
+
HAS_CSV = True
|
|
50
|
+
except ImportError:
|
|
51
|
+
HAS_CSV = False # csv is stdlib, always available
|
|
52
|
+
|
|
53
|
+
HAS_PIL = False
|
|
54
|
+
try:
|
|
55
|
+
from PIL import Image
|
|
56
|
+
HAS_PIL = True
|
|
57
|
+
except ImportError:
|
|
58
|
+
pass
|
|
59
|
+
|
|
60
|
+
HAS_H5PY = False
|
|
61
|
+
try:
|
|
62
|
+
import h5py
|
|
63
|
+
HAS_H5PY = True
|
|
64
|
+
except ImportError:
|
|
65
|
+
pass
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ── Data structures ────────────────────────────────────────────────────────────
|
|
69
|
+
|
|
70
|
+
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}
|
|
71
|
+
|
|
72
|
+
@dataclass
class ClassInfo:
    """Per-class statistics for one class of an image-directory dataset."""

    # Class name, taken from the subdirectory name on disk
    # ("(unlabeled)" for flat directories with no class subdirs).
    name: str
    # Number of image files counted for this class.
    count: int
    # Up to a few example filenames, used for spot checks in the report.
    sample_files: list[str] = field(default_factory=list)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class EDAReport:
    """Accumulator for all EDA findings.

    One instance is created per run; the format-specific analyzers mutate
    it in place and the markdown generator reads it. Only the fields for
    the detected format are populated — everything else stays at its
    default.
    """

    # Input path and the human-readable format label set by the analyzer.
    data_path: Path
    format_detected: str
    # Non-fatal findings (imbalance, missing splits, corrupt files, ...).
    warnings: list[str] = field(default_factory=list)
    # Fatal findings; any entry makes main() exit with code 1.
    errors: list[str] = field(default_factory=list)

    # Image dataset
    total_images: int = 0
    classes: list[ClassInfo] = field(default_factory=list)
    # Sampled (width, height) pairs — a bounded sample, not the full set.
    image_sizes: list[tuple[int, int]] = field(default_factory=list)
    corrupt_files: list[str] = field(default_factory=list)
    # Split directory name -> image count (e.g. {"train": 8000, ...}).
    splits_found: dict[str, int] = field(default_factory=dict)

    # Tabular
    num_rows: int = 0
    num_cols: int = 0
    columns: list[str] = field(default_factory=list)
    # Column name -> count of empty/missing cells (only non-zero entries).
    missing_values: dict[str, int] = field(default_factory=dict)
    label_distribution: dict[str, int] = field(default_factory=dict)
    # Column name -> {"mean", "std", "min", "max", "missing"}.
    numeric_stats: dict[str, dict[str, float]] = field(default_factory=dict)

    # NumPy
    array_shapes: list[tuple] = field(default_factory=list)
    array_dtypes: list[str] = field(default_factory=list)
    array_stats: dict[str, Any] = field(default_factory=dict)

    # HDF5
    hdf5_keys: list[str] = field(default_factory=list)
    hdf5_shapes: dict[str, tuple] = field(default_factory=dict)

    # Annotation JSON
    annotation_classes: dict[str, int] = field(default_factory=dict)
    annotation_count: int = 0
    # COCO only: images listed in the file that carry no annotation.
    images_without_annotations: int = 0
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# ── Format detection ───────────────────────────────────────────────────────────
|
|
117
|
+
|
|
118
|
+
def detect_format(path: Path, extensions: set[str] | None = None) -> str:
    """Classify *path* into one of the supported dataset formats.

    Args:
        path: Dataset file or directory.
        extensions: Image-file suffixes (lowercase, with dot) that mark a
            directory as an image dataset. Defaults to IMAGE_EXTENSIONS.

    Returns:
        One of "image_dir", "unknown_dir", "csv", "numpy", "hdf5",
        "json_annotations", or "unknown".
    """
    if path.is_dir():
        exts = IMAGE_EXTENSIONS if extensions is None else extensions
        # Iterate lazily and stop after 50 entries. The original code
        # materialized list(path.rglob("*")) — walking the entire tree —
        # only to peek at the first 50 items.
        for seen, entry in enumerate(path.rglob("*")):
            if seen >= 50:
                break
            if entry.suffix.lower() in exts:
                return "image_dir"
        return "unknown_dir"
    suffix = path.suffix.lower()
    if suffix in (".csv", ".tsv"):
        return "csv"
    if suffix in (".npy", ".npz"):
        return "numpy"
    if suffix in (".h5", ".hdf5"):
        return "hdf5"
    if suffix == ".json":
        return "json_annotations"
    return "unknown"
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# ── Image dataset analysis ─────────────────────────────────────────────────────
|
|
138
|
+
|
|
139
|
+
def analyze_image_dir(path: Path, report: EDAReport, split_names: list[str]) -> None:
    """Analyze a directory-based image dataset.

    Detects split layout (path/train/class/img.jpg vs path/class/img.jpg),
    counts images per split and per class, samples image sizes, and flags
    class imbalance. Mutates *report* in place.
    """
    report.format_detected = "Image Dataset (directory)"

    # Check for split subdirectories
    subdirs = [d for d in path.iterdir() if d.is_dir()]
    # Case-insensitive match of directory names against the expected split names.
    split_dirs = [d for d in subdirs if d.name.lower() in (s.lower() for s in split_names)]

    if split_dirs:
        # Split-structured dataset: path/train/class/img.jpg
        for split_dir in split_dirs:
            class_dirs = [d for d in split_dir.iterdir() if d.is_dir()]
            split_count = sum(
                len([f for f in cd.iterdir() if f.suffix.lower() in IMAGE_EXTENSIONS])
                for cd in class_dirs
            )
            report.splits_found[split_dir.name] = split_count

        # Analyze class distribution from train split (or first split)
        ref_split = next((d for d in split_dirs if "train" in d.name.lower()), split_dirs[0])
        _analyze_class_structure(ref_split, report)
    else:
        # Flat or class-labeled directory: path/class/img.jpg
        _analyze_class_structure(path, report)

    # NOTE(review): for split-structured data this counts only the reference
    # split's classes, not all splits — the per-split totals live in splits_found.
    report.total_images = sum(c.count for c in report.classes)

    # Sample image sizes
    if HAS_PIL:
        _sample_image_sizes(path, report)
    else:
        report.warnings.append(
            "PIL not installed — skipping image size and corruption checks. "
            "Install with: pip install Pillow"
        )

    # Class balance check: >5:1 is flagged severe, >2:1 moderate.
    if report.classes:
        counts = [c.count for c in report.classes]
        max_c, min_c = max(counts), min(counts)
        if min_c > 0 and max_c / min_c > 5:
            report.warnings.append(
                f"Severe class imbalance detected: ratio {max_c/min_c:.1f}:1 "
                f"({report.classes[counts.index(max_c)].name} vs "
                f"{report.classes[counts.index(min_c)].name}). "
                f"Consider oversampling, undersampling, or weighted loss."
            )
        elif min_c > 0 and max_c / min_c > 2:
            report.warnings.append(
                f"Moderate class imbalance: ratio {max_c/min_c:.1f}:1. "
                f"Monitor per-class metrics during training."
            )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _analyze_class_structure(base_dir: Path, report: EDAReport) -> None:
    """Fill ``report.classes`` from a class-per-subdirectory layout.

    When *base_dir* has no subdirectories at all, any loose images are
    recorded under a single "(unlabeled)" pseudo-class and a warning is
    raised.
    """
    subdirs = sorted(entry for entry in base_dir.iterdir() if entry.is_dir())

    if not subdirs:
        # Flat directory — no labels available.
        loose = [p for p in base_dir.iterdir() if p.suffix.lower() in IMAGE_EXTENSIONS]
        if loose:
            report.classes.append(ClassInfo(name="(unlabeled)", count=len(loose)))
            report.warnings.append(
                "No class subdirectories found — images appear unlabeled. "
                "Organize into class subdirectories for supervised training."
            )
        return

    for subdir in subdirs:
        found = [p for p in subdir.iterdir() if p.suffix.lower() in IMAGE_EXTENSIONS]
        report.classes.append(
            ClassInfo(
                name=subdir.name,
                count=len(found),
                sample_files=[p.name for p in found[:3]],
            )
        )
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _sample_image_sizes(base_dir: Path, report: EDAReport, max_samples: int = 100) -> None:
    """Open up to *max_samples* images to collect sizes and detect corrupt files."""
    candidates = [p for p in base_dir.rglob("*") if p.suffix.lower() in IMAGE_EXTENSIONS]
    collected: list[tuple[int, int]] = []

    for candidate in candidates[:max_samples]:
        try:
            with Image.open(candidate) as handle:
                collected.append(handle.size)  # (width, height)
        except Exception:
            # Anything PIL cannot open counts as corrupt/unreadable.
            report.corrupt_files.append(str(candidate.relative_to(base_dir)))

    report.image_sizes = collected
    if report.corrupt_files:
        report.warnings.append(
            f"{len(report.corrupt_files)} corrupt or unreadable image(s) found."
        )

    if not collected:
        return
    distinct = set(collected)
    if len(distinct) > 1:
        report.warnings.append(
            f"Inconsistent image sizes detected: {len(distinct)} unique sizes in sample. "
            f"Most common: {Counter(collected).most_common(1)[0][0]}. "
            f"Consider resizing to a fixed resolution."
        )
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# ── CSV / tabular analysis ─────────────────────────────────────────────────────
|
|
242
|
+
|
|
243
|
+
def analyze_csv(path: Path, report: EDAReport) -> None:
    """Analyze a CSV/TSV feature table: shape, missing values, labels, stats.

    Fixes over the naive version:
      - ``.tsv`` files are parsed with a tab delimiter. ``detect_format``
        routes them here, but ``csv.DictReader`` defaults to commas, which
        silently produced one mangled column.
      - ``DictReader`` yields ``None`` (its default restval) for fields
        missing from short rows; calling ``.strip()`` on that raised
        ``AttributeError``. Values are now coalesced to "" first.
    """
    import csv
    report.format_detected = "CSV/Tabular Dataset"

    delimiter = "\t" if path.suffix.lower() == ".tsv" else ","
    with path.open(newline="", encoding="utf-8", errors="replace") as f:
        reader = csv.DictReader(f, delimiter=delimiter)
        rows = list(reader)

    if not rows:
        report.errors.append("CSV file is empty.")
        return

    report.columns = list(rows[0].keys())
    report.num_rows = len(rows)
    report.num_cols = len(report.columns)

    # Missing values per column (None => short row; "" => empty field).
    for col in report.columns:
        missing = sum(1 for row in rows if not (row.get(col) or "").strip())
        if missing:
            report.missing_values[col] = missing

    # Detect label column (common names)
    label_col = next(
        (c for c in report.columns if c.lower() in ("label", "class", "target", "y", "category")),
        None,
    )
    if label_col:
        # The truthiness filter also drops None values from short rows.
        label_counts = Counter(row[label_col].strip() for row in rows if row.get(label_col))
        report.label_distribution = dict(label_counts.most_common())

        counts = list(label_counts.values())
        if counts and max(counts) / max(min(counts), 1) > 5:
            report.warnings.append(
                f"Class imbalance in '{label_col}': "
                f"ratio {max(counts)/min(counts):.1f}:1"
            )
    else:
        report.warnings.append(
            "No standard label column found (tried: label, class, target, y, category). "
            "Verify your column names."
        )

    # Numeric stats for up to 10 columns (of the first 20; requires numpy).
    if HAS_NUMPY:
        numeric_cols = []
        for col in report.columns[:20]:
            try:
                # Coalesce None -> "" so short rows are skipped, not crashed on.
                vals = [float(row[col]) for row in rows if (row.get(col) or "").strip()]
                if vals:
                    arr = np.array(vals)
                    report.numeric_stats[col] = {
                        "mean": float(arr.mean()),
                        "std": float(arr.std()),
                        "min": float(arr.min()),
                        "max": float(arr.max()),
                        "missing": report.missing_values.get(col, 0),
                    }
                    numeric_cols.append(col)
                    if len(numeric_cols) >= 10:
                        break
            except (TypeError, ValueError):
                # Non-numeric column — skip it.
                pass
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
# ── NumPy analysis ─────────────────────────────────────────────────────────────
|
|
309
|
+
|
|
310
|
+
def _summarize_numeric_array(arr) -> dict[str, Any]:
    """Shape/dtype/min/max/mean/NaN summary for a non-empty numeric ndarray."""
    return {
        "shape": arr.shape,
        "dtype": str(arr.dtype),
        "min": float(arr.min()),
        "max": float(arr.max()),
        "mean": float(arr.mean()),
        # NaN is only representable in float dtypes.
        "nan_count": int(np.isnan(arr).sum()) if arr.dtype.kind == "f" else 0,
    }


def analyze_numpy(path: Path, report: EDAReport) -> None:
    """Summarize a ``.npy``/``.npz`` file: shapes, dtypes, numeric statistics.

    Fixes over the original:
      - the ``.npz`` ``NpzFile`` handle is now closed via a context
        manager (it was previously leaked);
      - the duplicated per-array stats block is factored into
        ``_summarize_numeric_array``;
      - empty arrays are skipped instead of crashing on ``arr.min()``.

    NOTE(review): ``allow_pickle=True`` can execute arbitrary code when
    loading untrusted files — only run this on data you trust.
    """
    if not HAS_NUMPY:
        report.errors.append("numpy not installed. Install with: pip install numpy")
        return

    report.format_detected = "NumPy Array"
    if path.suffix == ".npz":
        with np.load(path, allow_pickle=True) as data:
            for key in data.files:
                arr = data[key]
                report.array_shapes.append((key, arr.shape))
                report.array_dtypes.append(f"{key}: {arr.dtype}")
                if arr.size and arr.dtype.kind in ("f", "i", "u"):
                    report.array_stats[key] = _summarize_numeric_array(arr)
    else:
        arr = np.load(path, allow_pickle=True)
        report.array_shapes.append(("array", arr.shape))
        report.array_dtypes.append(str(arr.dtype))
        if arr.size and arr.dtype.kind in ("f", "i", "u"):
            report.array_stats["array"] = _summarize_numeric_array(arr)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
# ── HDF5 analysis ──────────────────────────────────────────────────────────────
|
|
347
|
+
|
|
348
|
+
def analyze_hdf5(path: Path, report: EDAReport) -> None:
    """Record the name and shape of every dataset in an HDF5 file."""
    if not HAS_H5PY:
        report.errors.append("h5py not installed. Install with: pip install h5py")
        return

    report.format_detected = "HDF5 File"
    with h5py.File(path, "r") as handle:

        def record(name, node):
            # visititems walks groups and datasets; keep datasets only.
            if isinstance(node, h5py.Dataset):
                report.hdf5_keys.append(name)
                report.hdf5_shapes[name] = node.shape

        handle.visititems(record)

    if not report.hdf5_keys:
        report.warnings.append("HDF5 file contains no datasets.")
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
# ── JSON annotation analysis ───────────────────────────────────────────────────
|
|
367
|
+
|
|
368
|
+
def analyze_json_annotations(path: Path, report: EDAReport) -> None:
    """Summarize label annotations stored as JSON.

    Recognizes three layouts, checked in order:
      1. COCO-style dict (has "annotations" and "categories" keys);
      2. flat mapping {"image.jpg": "class_name", ...};
      3. list of record dicts [{"image": ..., "label": ...}, ...].
    """
    report.format_detected = "JSON Annotations"
    data = json.loads(path.read_text(encoding="utf-8"))

    if isinstance(data, dict) and "annotations" in data and "categories" in data:
        # COCO format
        id_to_name = {cat["id"]: cat["name"] for cat in data.get("categories", [])}
        per_class: Counter = Counter()
        for ann in data.get("annotations", []):
            cid = ann.get("category_id")
            per_class[id_to_name.get(cid, f"id_{cid}")] += 1

        report.annotation_count = len(data["annotations"])
        report.annotation_classes = dict(per_class)

        # Flag images listed in the file that carry no annotation at all.
        covered = {ann["image_id"] for ann in data["annotations"]}
        n_images = len(data.get("images", []))
        report.images_without_annotations = n_images - len(covered)
        if report.images_without_annotations > 0:
            pct = report.images_without_annotations / max(n_images, 1) * 100
            report.warnings.append(
                f"{report.images_without_annotations} images ({pct:.1f}%) have no annotations."
            )
        return

    if isinstance(data, dict):
        # Flat label dict format: {"image.jpg": "class_name", ...}
        tally: Counter = Counter(str(value) for value in data.values())
        report.annotation_classes = dict(tally)
        report.annotation_count = len(data)
        return

    if isinstance(data, list) and data and isinstance(data[0], dict):
        # List of dicts: [{"image": "...", "label": "..."}, ...]
        label_key = next(
            (k for k in data[0] if k.lower() in ("label", "class", "category", "target")),
            None,
        )
        if label_key:
            tally = Counter(str(item.get(label_key, "unknown")) for item in data)
            report.annotation_classes = dict(tally)
            report.annotation_count = len(data)
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
# ── Report generation ──────────────────────────────────────────────────────────
|
|
414
|
+
|
|
415
|
+
def generate_markdown_report(report: EDAReport) -> str:
    """Render *report* as a markdown document with sections A–F.

    Sections: A overview, B class distribution, C data quality,
    D split verification, E issues (only when any exist), F TSK-001
    summary checklist. Note: section D may append to report.warnings,
    which is safe because section E is rendered afterwards.
    """
    lines: list[str] = []
    now = datetime.now().strftime("%Y-%m-%d %H:%M")

    # Header and section A: generic overview table.
    lines += [
        f"# EDA Report: {report.data_path.name}",
        f"*Generated: {now} | Format: {report.format_detected}*",
        "",
        "---",
        "",
        "## A. Dataset Overview",
        "",
        f"| Property | Value |",
        f"| :--- | :--- |",
        f"| Path | `{report.data_path}` |",
        f"| Format | {report.format_detected} |",
    ]

    # Format-specific overview
    if report.classes:
        lines += [
            f"| Total Images | {report.total_images:,} |",
            f"| Number of Classes | {len(report.classes)} |",
        ]
    if report.splits_found:
        for split, count in report.splits_found.items():
            lines.append(f"| Split: {split} | {count:,} images |")
    if report.num_rows:
        lines += [
            f"| Rows | {report.num_rows:,} |",
            f"| Columns | {report.num_cols} |",
        ]

    lines += ["", "---", "", "## B. Class Distribution", ""]

    # Section B: exactly one of the three sources is rendered, in priority
    # order: image classes, CSV labels, JSON annotation classes.
    if report.classes:
        total = sum(c.count for c in report.classes) or 1
        lines += [
            "| Class | Count | % of Total |",
            "| :--- | ---: | ---: |",
        ]
        for c in sorted(report.classes, key=lambda x: -x.count):
            pct = c.count / total * 100
            # One bar block per 5 percentage points.
            bar = "█" * int(pct / 5)
            lines.append(f"| {c.name} | {c.count:,} | {pct:.1f}% {bar} |")

    elif report.label_distribution:
        total = sum(report.label_distribution.values()) or 1
        lines += [
            "| Label | Count | % of Total |",
            "| :--- | ---: | ---: |",
        ]
        for label, count in report.label_distribution.items():
            pct = count / total * 100
            lines.append(f"| {label} | {count:,} | {pct:.1f}% |")

    elif report.annotation_classes:
        lines += [
            f"Total annotations: {report.annotation_count:,}",
            "",
            "| Class | Count |",
            "| :--- | ---: |",
        ]
        for cls, count in sorted(report.annotation_classes.items(), key=lambda x: -x[1]):
            lines.append(f"| {cls} | {count:,} |")

    else:
        lines.append("*No class/label information available.*")

    lines += ["", "---", "", "## C. Data Quality Assessment", ""]

    # Image sizes
    if report.image_sizes:
        # NOTE(review): redundant local import — Counter is already imported
        # at module top.
        from collections import Counter as C
        size_counts = C(report.image_sizes)
        most_common_size, count = size_counts.most_common(1)[0]
        pct = count / len(report.image_sizes) * 100
        lines += [
            f"| Property | Value |",
            f"| :--- | :--- |",
            f"| Most common image size | {most_common_size[0]}×{most_common_size[1]}px ({pct:.0f}% of sample) |",
            f"| Unique sizes in sample | {len(size_counts)} |",
            f"| Corrupt/unreadable files | {len(report.corrupt_files)} |",
            "",
        ]

    # Missing values
    if report.missing_values:
        total = report.num_rows or 1
        lines += [
            "**Missing Values:**",
            "",
            "| Column | Missing | % |",
            "| :--- | ---: | ---: |",
        ]
        for col, count in sorted(report.missing_values.items(), key=lambda x: -x[1]):
            lines.append(f"| {col} | {count} | {count/total*100:.1f}% |")
        lines.append("")
    elif report.num_rows:
        # Tabular data analyzed and nothing was missing.
        lines.append("✓ No missing values detected.")
        lines.append("")

    # NumPy stats
    if report.array_stats:
        lines += ["**Array Statistics:**", ""]
        for key, stats in report.array_stats.items():
            lines += [
                f"*{key}*: shape={stats['shape']}, dtype={stats['dtype']} ",
                f"min={stats['min']:.4f}, max={stats['max']:.4f}, mean={stats['mean']:.4f}",
                f"NaN count: {stats.get('nan_count', 0)}",
                "",
            ]

    # HDF5
    if report.hdf5_keys:
        lines += [
            "**HDF5 Datasets:**", "",
            "| Dataset | Shape |",
            "| :--- | :--- |",
        ]
        for key in report.hdf5_keys:
            lines.append(f"| {key} | {report.hdf5_shapes[key]} |")
        lines.append("")

    # Split verification
    lines += ["---", "", "## D. Split Verification", ""]
    if report.splits_found:
        total_split = sum(report.splits_found.values()) or 1
        lines += [
            "| Split | Count | % |",
            "| :--- | ---: | ---: |",
        ]
        for split, count in report.splits_found.items():
            pct = count / total_split * 100
            lines.append(f"| {split} | {count:,} | {pct:.1f}% |")

        # Check for reasonable split ratios
        counts = list(report.splits_found.values())
        if len(counts) >= 2:
            # Fall back to the first split when none is literally named "train".
            train_count = report.splits_found.get("train", counts[0])
            if train_count / total_split < 0.5:
                report.warnings.append(
                    f"Training split is only {train_count/total_split*100:.0f}% of total data. "
                    f"Typical splits: 70-80% train, 10-15% val, 10-15% test."
                )
    else:
        lines.append("⚠ No explicit split directories found.")
        lines.append(
            "Ensure you implement train/val/test splits in your DataLoader. "
            "Recommended: 70/15/15 or 80/10/10."
        )

    # Warnings and errors (rendered after D so D's warnings are included).
    if report.warnings or report.errors:
        lines += ["", "---", "", "## E. Issues & Recommendations", ""]
        for w in report.warnings:
            lines.append(f"⚠ **Warning:** {w}")
        for e in report.errors:
            lines.append(f"✗ **Error:** {e}")
        lines.append("")

    # Section F: TSK-001 checklist summary.
    lines += [
        "---",
        "",
        "## F. EDA Summary for TSK-001",
        "",
        "| Check | Status |",
        "| :--- | :--- |",
        f"| Class distribution analyzed | {'✓' if report.classes or report.label_distribution or report.annotation_classes else '✗'} |",
        f"| Missing/corrupt data checked | {'✓' if not report.errors else '⚠'} |",
        f"| Class imbalance assessed | {'✓' if report.classes or report.label_distribution else 'N/A'} |",
        f"| Split structure verified | {'✓' if report.splits_found else '⚠ manual check needed'} |",
        f"| Issues found | {len(report.warnings)} warning(s), {len(report.errors)} error(s) |",
        "",
        "*Complete this EDA before proceeding to TSK-002 (DataLoader implementation).*",
    ]

    return "\n".join(lines)
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
# ── Main ───────────────────────────────────────────────────────────────────────
|
|
596
|
+
|
|
597
|
+
def analyze(data_path: Path, split_names: list[str] | None = None) -> EDAReport:
    """Detect the dataset format at *data_path* and run the matching analyzer."""
    if split_names is None:
        split_names = ["train", "val", "test", "validation"]
    report = EDAReport(data_path=data_path, format_detected="unknown")

    fmt = detect_format(data_path)
    if fmt == "image_dir":
        # Image directories take the split-name list; all others don't.
        analyze_image_dir(data_path, report, split_names)
        return report

    handlers = {
        "csv": analyze_csv,
        "numpy": analyze_numpy,
        "hdf5": analyze_hdf5,
        "json_annotations": analyze_json_annotations,
    }
    handler = handlers.get(fmt)
    if handler is not None:
        handler(data_path, report)
    else:
        report.errors.append(
            f"Unrecognized format for '{data_path}'. "
            f"Supported: image directory, CSV, .npy/.npz, .h5/.hdf5, .json (annotations)"
        )

    return report
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def main() -> int:
    """CLI entry point: parse arguments, run the analysis, write the report.

    Exit codes: 0 success, 1 errors were recorded in the report, 2 bad path.
    """
    parser = argparse.ArgumentParser(description="ML EDA Analyzer for BMAD DL Lifecycle")
    parser.add_argument("data_path", type=Path, help="Path to dataset file or directory")
    parser.add_argument("--output", type=Path, default=None,
                        help="Output markdown report path (default: <data_path>_eda_report.md)")
    parser.add_argument("--splits", nargs="+", default=["train", "val", "test"],
                        help="Expected split directory names")
    args = parser.parse_args()

    data_path: Path = args.data_path
    if not data_path.exists():
        print(f"Error: Path not found: {data_path}", file=sys.stderr)
        return 2

    report = analyze(data_path, args.splits)
    markdown = generate_markdown_report(report)

    # Default report name sits beside the input: <stem>_eda_report.md
    destination = args.output if args.output else data_path.parent / f"{data_path.stem}_eda_report.md"
    destination.write_text(markdown, encoding="utf-8")

    print(f"\n✓ EDA report written to: {destination}")
    print(f" Format: {report.format_detected}")
    if report.classes:
        print(f" Classes: {len(report.classes)}, Total samples: {report.total_images:,}")
    if report.warnings:
        print(f" ⚠ {len(report.warnings)} warning(s) — review report for details")
    if report.errors:
        print(f" ✗ {len(report.errors)} error(s)")
        return 1

    return 0
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
# Script entry point: propagate main()'s exit code to the shell.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "ML Exploratory Data Analysis (EDA)",
|
|
3
|
+
"description": "Performs statistical analysis, establishes performance baselines, and interprets results with the Scientist Demerzel.",
|
|
4
|
+
"version": "1.0.0",
|
|
5
|
+
"author": "Demerzel (ML Scientist)",
|
|
6
|
+
"tags": ["Machine Learning", "Exploratory Data Analysis", "EDA", "Baselines"]
|
|
7
|
+
}
|
|
File without changes
|