ma-agents 3.3.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/.opencode/skills/.ma-agents.json +99 -99
  2. package/.roo/skills/.ma-agents.json +99 -99
  3. package/README.md +19 -1
  4. package/bin/cli.js +55 -0
  5. package/lib/agents.js +23 -0
  6. package/lib/bmad-cache/cache-manifest.json +1 -1
  7. package/lib/bmad-customizations/bmm-demerzel.customize.yaml +36 -0
  8. package/lib/bmad-customizations/demerzel.md +32 -0
  9. package/lib/bmad-extension/module-help.csv +13 -0
  10. package/lib/bmad-extension/skills/bmad-ma-agent-ml/.gitkeep +0 -0
  11. package/lib/bmad-extension/skills/bmad-ma-agent-ml/SKILL.md +59 -0
  12. package/lib/bmad-extension/skills/bmad-ma-agent-ml/bmad-skill-manifest.yaml +11 -0
  13. package/lib/bmad-extension/skills/generate-backlog/.gitkeep +0 -0
  14. package/lib/bmad-extension/skills/ml-advise/.gitkeep +0 -0
  15. package/lib/bmad-extension/skills/ml-advise/SKILL.md +76 -0
  16. package/lib/bmad-extension/skills/ml-advise/bmad-skill-manifest.yaml +3 -0
  17. package/lib/bmad-extension/skills/ml-advise/skill.json +7 -0
  18. package/lib/bmad-extension/skills/ml-analysis/.gitkeep +0 -0
  19. package/lib/bmad-extension/skills/ml-analysis/SKILL.md +60 -0
  20. package/lib/bmad-extension/skills/ml-analysis/bmad-skill-manifest.yaml +3 -0
  21. package/lib/bmad-extension/skills/ml-analysis/skill.json +7 -0
  22. package/lib/bmad-extension/skills/ml-architecture/.gitkeep +0 -0
  23. package/lib/bmad-extension/skills/ml-architecture/SKILL.md +55 -0
  24. package/lib/bmad-extension/skills/ml-architecture/bmad-skill-manifest.yaml +3 -0
  25. package/lib/bmad-extension/skills/ml-architecture/skill.json +7 -0
  26. package/lib/bmad-extension/skills/ml-detailed-design/.gitkeep +0 -0
  27. package/lib/bmad-extension/skills/ml-detailed-design/SKILL.md +67 -0
  28. package/lib/bmad-extension/skills/ml-detailed-design/bmad-skill-manifest.yaml +3 -0
  29. package/lib/bmad-extension/skills/ml-detailed-design/skill.json +7 -0
  30. package/lib/bmad-extension/skills/ml-eda/.gitkeep +0 -0
  31. package/lib/bmad-extension/skills/ml-eda/SKILL.md +56 -0
  32. package/lib/bmad-extension/skills/ml-eda/bmad-skill-manifest.yaml +3 -0
  33. package/lib/bmad-extension/skills/ml-eda/scripts/baseline_classifier.py +522 -0
  34. package/lib/bmad-extension/skills/ml-eda/scripts/class_weights_calculator.py +295 -0
  35. package/lib/bmad-extension/skills/ml-eda/scripts/clustering_explorer.py +383 -0
  36. package/lib/bmad-extension/skills/ml-eda/scripts/eda_analyzer.py +654 -0
  37. package/lib/bmad-extension/skills/ml-eda/skill.json +7 -0
  38. package/lib/bmad-extension/skills/ml-experiment/.gitkeep +0 -0
  39. package/lib/bmad-extension/skills/ml-experiment/SKILL.md +74 -0
  40. package/lib/bmad-extension/skills/ml-experiment/assets/advanced_trainer_configs.py +430 -0
  41. package/lib/bmad-extension/skills/ml-experiment/assets/quick_trainer_setup.py +233 -0
  42. package/lib/bmad-extension/skills/ml-experiment/assets/template_datamodule.py +219 -0
  43. package/lib/bmad-extension/skills/ml-experiment/assets/template_gnn_module.py +341 -0
  44. package/lib/bmad-extension/skills/ml-experiment/assets/template_lightning_module.py +158 -0
  45. package/lib/bmad-extension/skills/ml-experiment/bmad-skill-manifest.yaml +3 -0
  46. package/lib/bmad-extension/skills/ml-experiment/skill.json +7 -0
  47. package/lib/bmad-extension/skills/ml-hparam/.gitkeep +0 -0
  48. package/lib/bmad-extension/skills/ml-hparam/SKILL.md +81 -0
  49. package/lib/bmad-extension/skills/ml-hparam/bmad-skill-manifest.yaml +3 -0
  50. package/lib/bmad-extension/skills/ml-hparam/skill.json +7 -0
  51. package/lib/bmad-extension/skills/ml-ideation/.gitkeep +0 -0
  52. package/lib/bmad-extension/skills/ml-ideation/SKILL.md +50 -0
  53. package/lib/bmad-extension/skills/ml-ideation/bmad-skill-manifest.yaml +3 -0
  54. package/lib/bmad-extension/skills/ml-ideation/scripts/validate_ml_prd.py +287 -0
  55. package/lib/bmad-extension/skills/ml-ideation/skill.json +7 -0
  56. package/lib/bmad-extension/skills/ml-infra/.gitkeep +0 -0
  57. package/lib/bmad-extension/skills/ml-infra/SKILL.md +58 -0
  58. package/lib/bmad-extension/skills/ml-infra/bmad-skill-manifest.yaml +3 -0
  59. package/lib/bmad-extension/skills/ml-infra/skill.json +7 -0
  60. package/lib/bmad-extension/skills/ml-retrospective/.gitkeep +0 -0
  61. package/lib/bmad-extension/skills/ml-retrospective/SKILL.md +63 -0
  62. package/lib/bmad-extension/skills/ml-retrospective/bmad-skill-manifest.yaml +3 -0
  63. package/lib/bmad-extension/skills/ml-retrospective/skill.json +7 -0
  64. package/lib/bmad-extension/skills/ml-revision/.gitkeep +0 -0
  65. package/lib/bmad-extension/skills/ml-revision/SKILL.md +82 -0
  66. package/lib/bmad-extension/skills/ml-revision/bmad-skill-manifest.yaml +3 -0
  67. package/lib/bmad-extension/skills/ml-revision/skill.json +7 -0
  68. package/lib/bmad-extension/skills/ml-techspec/.gitkeep +0 -0
  69. package/lib/bmad-extension/skills/ml-techspec/SKILL.md +80 -0
  70. package/lib/bmad-extension/skills/ml-techspec/bmad-skill-manifest.yaml +3 -0
  71. package/lib/bmad-extension/skills/ml-techspec/skill.json +7 -0
  72. package/lib/bmad.js +85 -8
  73. package/lib/skill-authoring.js +1 -1
  74. package/package.json +2 -2
  75. package/test/agent-injection-strategy.test.js +4 -4
  76. package/test/bmad-version-bump.test.js +34 -34
  77. package/test/build-bmad-args.test.js +13 -6
  78. package/test/convert-agents-to-skills.test.js +11 -1
  79. package/test/extension-module-restructure.test.js +31 -7
  80. package/test/migration-validation.test.js +14 -11
@@ -0,0 +1,654 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ eda_analyzer.py — BMAD DL Lifecycle (inspired by K-Dense claude-scientific-skills)
4
+ ML-focused Exploratory Data Analysis for common deep learning data formats.
5
+
6
+ Supported formats:
7
+ - Image datasets : directory of images (class-labeled subdirs or flat)
8
+ - CSV / TSV : tabular feature/label datasets
9
+ - NumPy : .npy / .npz arrays
10
+ - HDF5 : .h5 / .hdf5 files
11
+ - JSON annotations: COCO-style or flat label files
12
+
13
+ Generates a structured markdown EDA report aligned with TSK-001 requirements:
14
+ class distributions, annotation quality, missing values, split verification.
15
+
16
+ Usage:
17
+ python3 scripts/eda_analyzer.py <data_path> [--output report.md] [--splits train val test]
18
+ python3 scripts/eda_analyzer.py data/images/ --splits train val test
19
+ python3 scripts/eda_analyzer.py data/features.csv
20
+
21
+ Exit codes:
22
+ 0 — success, report written
23
+ 1 — warnings (partial data)
24
+ 2 — error
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import argparse
30
+ import json
31
+ import sys
32
+ from collections import Counter, defaultdict
33
+ from dataclasses import dataclass, field
34
+ from datetime import datetime
35
+ from pathlib import Path
36
+ from typing import Any
37
+
38
+
39
# ── Optional imports ───────────────────────────────────────────────────────────
# Third-party libraries are probed at import time; each analyzer degrades
# gracefully (warning or error in the report) when its dependency is missing.

try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False

# csv is part of the Python standard library and is always importable, so the
# original try/except around it was dead code. The flag is kept for backward
# compatibility with any caller that checks it.
HAS_CSV = True

HAS_PIL = False
try:
    from PIL import Image
    HAS_PIL = True
except ImportError:
    pass

HAS_H5PY = False
try:
    import h5py
    HAS_H5PY = True
except ImportError:
    pass


# ── Data structures ────────────────────────────────────────────────────────────

# Recognized raster-image file extensions (lowercase; compare with suffix.lower()).
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}
71
+
72
@dataclass
class ClassInfo:
    """Per-class summary for an image dataset: label name, image count, and example files."""
    name: str    # class label (subdirectory name, or "(unlabeled)" for flat dirs)
    count: int   # number of image files found for this class
    # Up to three example filenames, used to illustrate the class in the report.
    sample_files: list[str] = field(default_factory=list)
77
+
78
+
79
@dataclass
class EDAReport:
    """Mutable container for every EDA finding.

    One instance is created per run by analyze(); each analyze_* helper fills
    in the field group matching the detected format, and
    generate_markdown_report() renders the result.
    """
    data_path: Path       # dataset file or directory being analyzed
    format_detected: str  # human-readable format label (e.g. "CSV/Tabular Dataset")
    warnings: list[str] = field(default_factory=list)  # non-fatal issues surfaced in the report
    errors: list[str] = field(default_factory=list)    # fatal issues (unreadable/unsupported data)

    # Image dataset
    total_images: int = 0
    classes: list[ClassInfo] = field(default_factory=list)
    image_sizes: list[tuple[int, int]] = field(default_factory=list)  # sampled (width, height) pairs
    corrupt_files: list[str] = field(default_factory=list)            # paths relative to the dataset root
    splits_found: dict[str, int] = field(default_factory=dict)        # split dir name -> image count

    # Tabular
    num_rows: int = 0
    num_cols: int = 0
    columns: list[str] = field(default_factory=list)
    missing_values: dict[str, int] = field(default_factory=dict)      # column -> count of blank cells
    label_distribution: dict[str, int] = field(default_factory=dict)  # label value -> row count
    numeric_stats: dict[str, dict[str, float]] = field(default_factory=dict)  # column -> mean/std/min/max/missing

    # NumPy
    array_shapes: list[tuple] = field(default_factory=list)  # (key, shape) pairs
    array_dtypes: list[str] = field(default_factory=list)
    array_stats: dict[str, Any] = field(default_factory=dict)

    # HDF5
    hdf5_keys: list[str] = field(default_factory=list)  # dataset paths inside the file
    hdf5_shapes: dict[str, tuple] = field(default_factory=dict)

    # Annotation JSON
    annotation_classes: dict[str, int] = field(default_factory=dict)  # class name -> annotation count
    annotation_count: int = 0
    images_without_annotations: int = 0  # COCO only: images with no annotation entries
114
+
115
+
116
+ # ── Format detection ───────────────────────────────────────────────────────────
117
+
118
def detect_format(path: Path) -> str:
    """Classify *path* into one of the supported dataset formats.

    Returns one of: "image_dir", "unknown_dir", "csv", "numpy", "hdf5",
    "json_annotations", or "unknown".
    """
    if path.is_dir():
        # Peek at the first 50 directory entries only. The original built
        # list(path.rglob("*")) — materializing the entire tree — just to
        # inspect 50 entries, which is wasteful on large datasets.
        for i, entry in enumerate(path.rglob("*")):
            if i >= 50:
                break
            if entry.suffix.lower() in IMAGE_EXTENSIONS:
                return "image_dir"
        return "unknown_dir"
    suffix = path.suffix.lower()
    if suffix in (".csv", ".tsv"):
        return "csv"
    if suffix in (".npy", ".npz"):
        return "numpy"
    if suffix in (".h5", ".hdf5"):
        return "hdf5"
    if suffix == ".json":
        return "json_annotations"
    return "unknown"
135
+
136
+
137
+ # ── Image dataset analysis ─────────────────────────────────────────────────────
138
+
139
def analyze_image_dir(path: Path, report: EDAReport, split_names: list[str]) -> None:
    """Populate *report* for a directory-based image dataset.

    Handles both split-structured layouts (path/train/<class>/img.jpg) and
    flat/class-labeled layouts (path/<class>/img.jpg).
    """
    report.format_detected = "Image Dataset (directory)"

    # Case-insensitive match of immediate subdirectories against the expected split names.
    wanted = {s.lower() for s in split_names}
    split_dirs = [d for d in path.iterdir() if d.is_dir() and d.name.lower() in wanted]

    if split_dirs:
        # Split-structured dataset: count images inside each split's class subdirs.
        for split_dir in split_dirs:
            n_images = 0
            for class_dir in (d for d in split_dir.iterdir() if d.is_dir()):
                n_images += sum(
                    1 for f in class_dir.iterdir() if f.suffix.lower() in IMAGE_EXTENSIONS
                )
            report.splits_found[split_dir.name] = n_images

        # Class distribution is taken from the train split when one exists,
        # otherwise from the first split found.
        reference = next((d for d in split_dirs if "train" in d.name.lower()), split_dirs[0])
        _analyze_class_structure(reference, report)
    else:
        # Flat or class-labeled directory: path/class/img.jpg
        _analyze_class_structure(path, report)

    report.total_images = sum(c.count for c in report.classes)

    # Sample image sizes (requires Pillow).
    if HAS_PIL:
        _sample_image_sizes(path, report)
    else:
        report.warnings.append(
            "PIL not installed — skipping image size and corruption checks. "
            "Install with: pip install Pillow"
        )

    # Flag class imbalance: severe above 5:1, moderate above 2:1.
    if report.classes:
        counts = [c.count for c in report.classes]
        biggest, smallest = max(counts), min(counts)
        if smallest > 0:
            ratio = biggest / smallest
            if ratio > 5:
                report.warnings.append(
                    f"Severe class imbalance detected: ratio {ratio:.1f}:1 "
                    f"({report.classes[counts.index(biggest)].name} vs "
                    f"{report.classes[counts.index(smallest)].name}). "
                    f"Consider oversampling, undersampling, or weighted loss."
                )
            elif ratio > 2:
                report.warnings.append(
                    f"Moderate class imbalance: ratio {ratio:.1f}:1. "
                    f"Monitor per-class metrics during training."
                )
190
+
191
+
192
def _analyze_class_structure(base_dir: Path, report: EDAReport) -> None:
    """Record per-class image counts under *base_dir* into report.classes.

    Each immediate subdirectory is treated as one class; a directory with no
    subdirectories is treated as a single unlabeled pool of images.
    """
    subdirs = sorted(d for d in base_dir.iterdir() if d.is_dir())

    if not subdirs:
        # Flat directory: no class labels available.
        flat_images = [f for f in base_dir.iterdir() if f.suffix.lower() in IMAGE_EXTENSIONS]
        if flat_images:
            report.classes.append(ClassInfo(name="(unlabeled)", count=len(flat_images)))
            report.warnings.append(
                "No class subdirectories found — images appear unlabeled. "
                "Organize into class subdirectories for supervised training."
            )
        return

    for class_dir in subdirs:
        images = [f for f in class_dir.iterdir() if f.suffix.lower() in IMAGE_EXTENSIONS]
        report.classes.append(ClassInfo(
            name=class_dir.name,
            count=len(images),
            sample_files=[f.name for f in images[:3]],
        ))
211
+
212
+
213
def _sample_image_sizes(base_dir: Path, report: EDAReport, max_samples: int = 100) -> None:
    """Open up to *max_samples* images to record sizes and detect corrupt files.

    Fills report.image_sizes and report.corrupt_files, and appends warnings
    for corrupt files and inconsistent resolutions.
    """
    sizes: list[tuple[int, int]] = []
    sampled = 0
    # Iterate the tree lazily and stop once max_samples images have been
    # opened. The original materialized list(base_dir.rglob("*")) — the whole
    # tree — just to take a 100-image sample.
    for img_path in base_dir.rglob("*"):
        if img_path.suffix.lower() not in IMAGE_EXTENSIONS:
            continue
        if sampled >= max_samples:
            break
        sampled += 1
        try:
            with Image.open(img_path) as img:
                sizes.append(img.size)  # (width, height)
        except Exception:
            # Unreadable/corrupt file — record it rather than abort the scan.
            report.corrupt_files.append(str(img_path.relative_to(base_dir)))

    report.image_sizes = sizes
    if report.corrupt_files:
        report.warnings.append(
            f"{len(report.corrupt_files)} corrupt or unreadable image(s) found."
        )

    if sizes:
        unique_sizes = set(sizes)
        if len(unique_sizes) > 1:
            report.warnings.append(
                f"Inconsistent image sizes detected: {len(unique_sizes)} unique sizes in sample. "
                f"Most common: {Counter(sizes).most_common(1)[0][0]}. "
                f"Consider resizing to a fixed resolution."
            )
239
+
240
+
241
+ # ── CSV / tabular analysis ─────────────────────────────────────────────────────
242
+
243
def analyze_csv(path: Path, report: EDAReport) -> None:
    """Populate *report* for a CSV/TSV tabular dataset.

    Records row/column counts, per-column missing values, the label
    distribution (when a conventionally named label column exists), and basic
    numeric statistics for up to 10 numeric columns.
    """
    import csv
    report.format_detected = "CSV/Tabular Dataset"

    # Bug fix: detect_format() routes .tsv files here, but they were parsed
    # with the default comma delimiter. Use tab for .tsv.
    delimiter = "\t" if path.suffix.lower() == ".tsv" else ","

    with path.open(newline="", encoding="utf-8", errors="replace") as f:
        reader = csv.DictReader(f, delimiter=delimiter)
        rows = list(reader)

    if not rows:
        report.errors.append("CSV file is empty.")
        return

    report.columns = list(rows[0].keys())
    report.num_rows = len(rows)
    report.num_cols = len(report.columns)

    # Missing values per column (blank or whitespace-only cells count as missing).
    for col in report.columns:
        missing = sum(1 for row in rows if not row.get(col, "").strip())
        if missing:
            report.missing_values[col] = missing

    # Detect label column (common names)
    label_col = next(
        (c for c in report.columns if c.lower() in ("label", "class", "target", "y", "category")),
        None,
    )
    if label_col:
        label_counts = Counter(row[label_col].strip() for row in rows if row.get(label_col))
        report.label_distribution = dict(label_counts.most_common())

        counts = list(label_counts.values())
        # Counter values are always >= 1, so the division below cannot fail.
        if counts and max(counts) / max(min(counts), 1) > 5:
            report.warnings.append(
                f"Class imbalance in '{label_col}': "
                f"ratio {max(counts)/min(counts):.1f}:1"
            )
    else:
        report.warnings.append(
            "No standard label column found (tried: label, class, target, y, category). "
            "Verify your column names."
        )

    # Numeric stats for up to 10 columns (scanning at most the first 20).
    if HAS_NUMPY:
        numeric_cols = []
        for col in report.columns[:20]:
            try:
                vals = [float(row[col]) for row in rows if row.get(col, "").strip()]
            except ValueError:
                # Non-numeric column — skip it.
                continue
            if not vals:
                continue
            arr = np.array(vals)
            report.numeric_stats[col] = {
                "mean": float(arr.mean()),
                "std": float(arr.std()),
                "min": float(arr.min()),
                "max": float(arr.max()),
                "missing": report.missing_values.get(col, 0),
            }
            numeric_cols.append(col)
            if len(numeric_cols) >= 10:
                break
306
+
307
+
308
+ # ── NumPy analysis ─────────────────────────────────────────────────────────────
309
+
310
def analyze_numpy(path: Path, report: EDAReport) -> None:
    """Populate *report* for a .npy/.npz file: shapes, dtypes, and numeric stats."""
    if not HAS_NUMPY:
        report.errors.append("numpy not installed. Install with: pip install numpy")
        return

    report.format_detected = "NumPy Array"
    # NOTE(review): allow_pickle=True can execute arbitrary code when loading
    # an untrusted pickled array — only point this tool at trusted data.
    if path.suffix == ".npz":
        data = np.load(path, allow_pickle=True)
        for key in data.files:
            _record_array(key, data[key], report)
    else:
        _record_array("array", np.load(path, allow_pickle=True), report)


def _record_array(key, arr, report: EDAReport) -> None:
    """Append one array's shape/dtype (and numeric stats where applicable) to *report*.

    Extracted to remove the duplicated npz/npy branches; the .npy case now also
    gets the "key: dtype" label format used for .npz entries.
    """
    report.array_shapes.append((key, arr.shape))
    report.array_dtypes.append(f"{key}: {arr.dtype}")
    if arr.dtype.kind in ("f", "i", "u"):
        report.array_stats[key] = {
            "shape": arr.shape,
            "dtype": str(arr.dtype),
            "min": float(arr.min()),
            "max": float(arr.max()),
            "mean": float(arr.mean()),
            # NaN is only possible for float dtypes; np.isnan rejects ints.
            "nan_count": int(np.isnan(arr).sum()) if arr.dtype.kind == "f" else 0,
        }
344
+
345
+
346
+ # ── HDF5 analysis ──────────────────────────────────────────────────────────────
347
+
348
def analyze_hdf5(path: Path, report: EDAReport) -> None:
    """Populate *report* with the names and shapes of all datasets in an HDF5 file."""
    if not HAS_H5PY:
        report.errors.append("h5py not installed. Install with: pip install h5py")
        return

    report.format_detected = "HDF5 File"
    with h5py.File(path, "r") as f:
        def collect(name, obj):
            # visititems() walks both groups and datasets; record datasets only.
            if isinstance(obj, h5py.Dataset):
                report.hdf5_keys.append(name)
                report.hdf5_shapes[name] = obj.shape

        f.visititems(collect)

    if not report.hdf5_keys:
        report.warnings.append("HDF5 file contains no datasets.")
364
+
365
+
366
+ # ── JSON annotation analysis ───────────────────────────────────────────────────
367
+
368
def analyze_json_annotations(path: Path, report: EDAReport) -> None:
    """Populate *report* from a JSON annotation file.

    Recognizes three layouts:
      1. COCO-style: {"images": [...], "annotations": [...], "categories": [...]}
      2. Flat label dict: {"image.jpg": "class_name", ...}
      3. List of records: [{"image": ..., "label": ...}, ...]
    """
    report.format_detected = "JSON Annotations"
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError, UnicodeDecodeError) as exc:
        # Robustness fix: a malformed or unreadable file previously raised an
        # unhandled exception; now it surfaces as a report error instead.
        report.errors.append(f"Failed to parse JSON annotations: {exc}")
        return

    # COCO format detection
    if isinstance(data, dict) and "annotations" in data and "categories" in data:
        categories = {c["id"]: c["name"] for c in data.get("categories", [])}
        ann_counts: Counter = Counter()
        for ann in data.get("annotations", []):
            cat_id = ann.get("category_id")
            # Unknown category ids are labeled "id_<n>" rather than dropped.
            ann_counts[categories.get(cat_id, f"id_{cat_id}")] += 1

        report.annotation_count = len(data["annotations"])
        report.annotation_classes = dict(ann_counts)

        # Images without annotations
        annotated_images = {ann["image_id"] for ann in data["annotations"]}
        total_images = len(data.get("images", []))
        report.images_without_annotations = total_images - len(annotated_images)
        if report.images_without_annotations > 0:
            pct = report.images_without_annotations / max(total_images, 1) * 100
            report.warnings.append(
                f"{report.images_without_annotations} images ({pct:.1f}%) have no annotations."
            )
        return

    # Flat label dict format: {"image.jpg": "class_name", ...}
    if isinstance(data, dict):
        counter: Counter = Counter(str(v) for v in data.values())
        report.annotation_classes = dict(counter)
        report.annotation_count = len(data)
        return

    # List of dicts: [{"image": "...", "label": "..."}, ...]
    if isinstance(data, list) and data and isinstance(data[0], dict):
        # Label key is inferred from the first record only — assumes a
        # homogeneous schema across records.
        label_key = next(
            (k for k in data[0] if k.lower() in ("label", "class", "category", "target")),
            None,
        )
        if label_key:
            counter = Counter(str(item.get(label_key, "unknown")) for item in data)
            report.annotation_classes = dict(counter)
            report.annotation_count = len(data)
411
+
412
+
413
+ # ── Report generation ──────────────────────────────────────────────────────────
414
+
415
def generate_markdown_report(report: EDAReport) -> str:
    """Render *report* as a structured markdown document (sections A–F).

    Note: this function appends to report.warnings while rendering section D
    (split-ratio check); those warnings still appear in section E because E is
    rendered afterwards.
    """
    lines: list[str] = []
    now = datetime.now().strftime("%Y-%m-%d %H:%M")

    # ── Header + Section A: dataset overview ──
    lines += [
        f"# EDA Report: {report.data_path.name}",
        f"*Generated: {now} | Format: {report.format_detected}*",
        "",
        "---",
        "",
        "## A. Dataset Overview",
        "",
        f"| Property | Value |",
        f"| :--- | :--- |",
        f"| Path | `{report.data_path}` |",
        f"| Format | {report.format_detected} |",
    ]

    # Format-specific overview
    if report.classes:
        lines += [
            f"| Total Images | {report.total_images:,} |",
            f"| Number of Classes | {len(report.classes)} |",
        ]
    if report.splits_found:
        for split, count in report.splits_found.items():
            lines.append(f"| Split: {split} | {count:,} images |")
    if report.num_rows:
        lines += [
            f"| Rows | {report.num_rows:,} |",
            f"| Columns | {report.num_cols} |",
        ]

    lines += ["", "---", "", "## B. Class Distribution", ""]

    if report.classes:
        total = sum(c.count for c in report.classes) or 1  # "or 1" guards against division by zero
        lines += [
            "| Class | Count | % of Total |",
            "| :--- | ---: | ---: |",
        ]
        # Largest class first.
        for c in sorted(report.classes, key=lambda x: -x.count):
            pct = c.count / total * 100
            bar = "█" * int(pct / 5)  # inline bar chart: one block per 5%
            lines.append(f"| {c.name} | {c.count:,} | {pct:.1f}% {bar} |")

    elif report.label_distribution:
        total = sum(report.label_distribution.values()) or 1
        lines += [
            "| Label | Count | % of Total |",
            "| :--- | ---: | ---: |",
        ]
        for label, count in report.label_distribution.items():
            pct = count / total * 100
            lines.append(f"| {label} | {count:,} | {pct:.1f}% |")

    elif report.annotation_classes:
        lines += [
            f"Total annotations: {report.annotation_count:,}",
            "",
            "| Class | Count |",
            "| :--- | ---: |",
        ]
        for cls, count in sorted(report.annotation_classes.items(), key=lambda x: -x[1]):
            lines.append(f"| {cls} | {count:,} |")

    else:
        lines.append("*No class/label information available.*")

    lines += ["", "---", "", "## C. Data Quality Assessment", ""]

    # Image sizes
    if report.image_sizes:
        # NOTE(review): redundant local import — Counter is already imported at module level.
        from collections import Counter as C
        size_counts = C(report.image_sizes)
        most_common_size, count = size_counts.most_common(1)[0]
        pct = count / len(report.image_sizes) * 100
        lines += [
            f"| Property | Value |",
            f"| :--- | :--- |",
            f"| Most common image size | {most_common_size[0]}×{most_common_size[1]}px ({pct:.0f}% of sample) |",
            f"| Unique sizes in sample | {len(size_counts)} |",
            f"| Corrupt/unreadable files | {len(report.corrupt_files)} |",
            "",
        ]

    # Missing values
    if report.missing_values:
        total = report.num_rows or 1
        lines += [
            "**Missing Values:**",
            "",
            "| Column | Missing | % |",
            "| :--- | ---: | ---: |",
        ]
        for col, count in sorted(report.missing_values.items(), key=lambda x: -x[1]):
            lines.append(f"| {col} | {count} | {count/total*100:.1f}% |")
        lines.append("")
    elif report.num_rows:
        # Tabular data was analyzed and no column had missing cells.
        lines.append("✓ No missing values detected.")
        lines.append("")

    # NumPy stats
    if report.array_stats:
        lines += ["**Array Statistics:**", ""]
        for key, stats in report.array_stats.items():
            lines += [
                f"*{key}*: shape={stats['shape']}, dtype={stats['dtype']} ",
                f"min={stats['min']:.4f}, max={stats['max']:.4f}, mean={stats['mean']:.4f}",
                f"NaN count: {stats.get('nan_count', 0)}",
                "",
            ]

    # HDF5
    if report.hdf5_keys:
        lines += [
            "**HDF5 Datasets:**", "",
            "| Dataset | Shape |",
            "| :--- | :--- |",
        ]
        for key in report.hdf5_keys:
            lines.append(f"| {key} | {report.hdf5_shapes[key]} |")
        lines.append("")

    # ── Section D: split verification ──
    lines += ["---", "", "## D. Split Verification", ""]
    if report.splits_found:
        total_split = sum(report.splits_found.values()) or 1
        lines += [
            "| Split | Count | % |",
            "| :--- | ---: | ---: |",
        ]
        for split, count in report.splits_found.items():
            pct = count / total_split * 100
            lines.append(f"| {split} | {count:,} | {pct:.1f}% |")

        # Check for reasonable split ratios
        counts = list(report.splits_found.values())
        if len(counts) >= 2:
            # Falls back to the first split's count when no dir is literally named "train".
            train_count = report.splits_found.get("train", counts[0])
            if train_count / total_split < 0.5:
                report.warnings.append(
                    f"Training split is only {train_count/total_split*100:.0f}% of total data. "
                    f"Typical splits: 70-80% train, 10-15% val, 10-15% test."
                )
    else:
        lines.append("⚠ No explicit split directories found.")
        lines.append(
            "Ensure you implement train/val/test splits in your DataLoader. "
            "Recommended: 70/15/15 or 80/10/10."
        )

    # ── Section E: warnings and errors (rendered last so section-D warnings are included) ──
    if report.warnings or report.errors:
        lines += ["", "---", "", "## E. Issues & Recommendations", ""]
        for w in report.warnings:
            lines.append(f"⚠ **Warning:** {w}")
        for e in report.errors:
            lines.append(f"✗ **Error:** {e}")
        lines.append("")

    # ── Section F: checklist summary ──
    lines += [
        "---",
        "",
        "## F. EDA Summary for TSK-001",
        "",
        "| Check | Status |",
        "| :--- | :--- |",
        f"| Class distribution analyzed | {'✓' if report.classes or report.label_distribution or report.annotation_classes else '✗'} |",
        f"| Missing/corrupt data checked | {'✓' if not report.errors else '⚠'} |",
        f"| Class imbalance assessed | {'✓' if report.classes or report.label_distribution else 'N/A'} |",
        f"| Split structure verified | {'✓' if report.splits_found else '⚠ manual check needed'} |",
        f"| Issues found | {len(report.warnings)} warning(s), {len(report.errors)} error(s) |",
        "",
        "*Complete this EDA before proceeding to TSK-002 (DataLoader implementation).*",
    ]

    return "\n".join(lines)
593
+
594
+
595
+ # ── Main ───────────────────────────────────────────────────────────────────────
596
+
597
def analyze(data_path: Path, split_names: list[str] | None = None) -> EDAReport:
    """Detect the dataset format at *data_path* and run the matching analyzer.

    Returns a populated EDAReport; unrecognized formats are recorded as an
    error on the report rather than raised.
    """
    split_names = split_names or ["train", "val", "test", "validation"]
    report = EDAReport(data_path=data_path, format_detected="unknown")
    fmt = detect_format(data_path)

    if fmt == "image_dir":
        # Image directories take the split names; all other analyzers do not.
        analyze_image_dir(data_path, report, split_names)
        return report

    dispatch = {
        "csv": analyze_csv,
        "numpy": analyze_numpy,
        "hdf5": analyze_hdf5,
        "json_annotations": analyze_json_annotations,
    }
    handler = dispatch.get(fmt)
    if handler is not None:
        handler(data_path, report)
    else:
        report.errors.append(
            f"Unrecognized format for '{data_path}'. "
            f"Supported: image directory, CSV, .npy/.npz, .h5/.hdf5, .json (annotations)"
        )

    return report
619
+
620
+
621
def main() -> int:
    """CLI entry point: parse arguments, run the EDA, write the markdown report.

    Returns 0 on success, 1 when the report recorded errors, 2 when the input
    path does not exist.
    """
    parser = argparse.ArgumentParser(description="ML EDA Analyzer for BMAD DL Lifecycle")
    parser.add_argument("data_path", type=Path, help="Path to dataset file or directory")
    parser.add_argument(
        "--output", type=Path, default=None,
        help="Output markdown report path (default: <data_path>_eda_report.md)",
    )
    parser.add_argument(
        "--splits", nargs="+", default=["train", "val", "test"],
        help="Expected split directory names",
    )
    args = parser.parse_args()

    data_path: Path = args.data_path
    if not data_path.exists():
        print(f"Error: Path not found: {data_path}", file=sys.stderr)
        return 2

    report = analyze(data_path, args.splits)

    destination = args.output or data_path.parent / f"{data_path.stem}_eda_report.md"
    destination.write_text(generate_markdown_report(report), encoding="utf-8")

    # Console summary of what was written.
    print(f"\n✓ EDA report written to: {destination}")
    print(f"  Format: {report.format_detected}")
    if report.classes:
        print(f"  Classes: {len(report.classes)}, Total samples: {report.total_images:,}")
    if report.warnings:
        print(f"  ⚠ {len(report.warnings)} warning(s) — review report for details")
    if report.errors:
        print(f"  ✗ {len(report.errors)} error(s)")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,7 @@
1
+ {
2
+ "name": "ML Exploratory Data Analysis (EDA)",
3
+ "description": "Performs statistical analysis, establishes performance baselines, and interprets results with the Scientist Demerzel.",
4
+ "version": "1.0.0",
5
+ "author": "Demerzel (ML Scientist)",
6
+ "tags": ["Machine Learning", "Exploratory Data Analysis", "EDA", "Baselines"]
7
+ }