ma-agents 3.3.0 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/.opencode/skills/.ma-agents.json +99 -99
  2. package/.roo/skills/.ma-agents.json +99 -99
  3. package/README.md +56 -15
  4. package/bin/cli.js +63 -8
  5. package/lib/agents.js +23 -0
  6. package/lib/bmad-cache/cache-manifest.json +1 -1
  7. package/lib/bmad-customizations/bmm-demerzel.customize.yaml +36 -0
  8. package/lib/bmad-customizations/demerzel.md +32 -0
  9. package/lib/bmad-extension/module-help.csv +13 -0
  10. package/lib/bmad-extension/skills/bmad-ma-agent-ml/.gitkeep +0 -0
  11. package/lib/bmad-extension/skills/bmad-ma-agent-ml/SKILL.md +59 -0
  12. package/lib/bmad-extension/skills/bmad-ma-agent-ml/bmad-skill-manifest.yaml +11 -0
  13. package/lib/bmad-extension/skills/generate-backlog/.gitkeep +0 -0
  14. package/lib/bmad-extension/skills/ml-advise/.gitkeep +0 -0
  15. package/lib/bmad-extension/skills/ml-advise/SKILL.md +76 -0
  16. package/lib/bmad-extension/skills/ml-advise/bmad-skill-manifest.yaml +3 -0
  17. package/lib/bmad-extension/skills/ml-advise/skill.json +7 -0
  18. package/lib/bmad-extension/skills/ml-analysis/.gitkeep +0 -0
  19. package/lib/bmad-extension/skills/ml-analysis/SKILL.md +60 -0
  20. package/lib/bmad-extension/skills/ml-analysis/bmad-skill-manifest.yaml +3 -0
  21. package/lib/bmad-extension/skills/ml-analysis/skill.json +7 -0
  22. package/lib/bmad-extension/skills/ml-architecture/.gitkeep +0 -0
  23. package/lib/bmad-extension/skills/ml-architecture/SKILL.md +55 -0
  24. package/lib/bmad-extension/skills/ml-architecture/bmad-skill-manifest.yaml +3 -0
  25. package/lib/bmad-extension/skills/ml-architecture/skill.json +7 -0
  26. package/lib/bmad-extension/skills/ml-detailed-design/.gitkeep +0 -0
  27. package/lib/bmad-extension/skills/ml-detailed-design/SKILL.md +67 -0
  28. package/lib/bmad-extension/skills/ml-detailed-design/bmad-skill-manifest.yaml +3 -0
  29. package/lib/bmad-extension/skills/ml-detailed-design/skill.json +7 -0
  30. package/lib/bmad-extension/skills/ml-eda/.gitkeep +0 -0
  31. package/lib/bmad-extension/skills/ml-eda/SKILL.md +56 -0
  32. package/lib/bmad-extension/skills/ml-eda/bmad-skill-manifest.yaml +3 -0
  33. package/lib/bmad-extension/skills/ml-eda/scripts/baseline_classifier.py +522 -0
  34. package/lib/bmad-extension/skills/ml-eda/scripts/class_weights_calculator.py +295 -0
  35. package/lib/bmad-extension/skills/ml-eda/scripts/clustering_explorer.py +383 -0
  36. package/lib/bmad-extension/skills/ml-eda/scripts/eda_analyzer.py +654 -0
  37. package/lib/bmad-extension/skills/ml-eda/skill.json +7 -0
  38. package/lib/bmad-extension/skills/ml-experiment/.gitkeep +0 -0
  39. package/lib/bmad-extension/skills/ml-experiment/SKILL.md +74 -0
  40. package/lib/bmad-extension/skills/ml-experiment/assets/advanced_trainer_configs.py +430 -0
  41. package/lib/bmad-extension/skills/ml-experiment/assets/quick_trainer_setup.py +233 -0
  42. package/lib/bmad-extension/skills/ml-experiment/assets/template_datamodule.py +219 -0
  43. package/lib/bmad-extension/skills/ml-experiment/assets/template_gnn_module.py +341 -0
  44. package/lib/bmad-extension/skills/ml-experiment/assets/template_lightning_module.py +158 -0
  45. package/lib/bmad-extension/skills/ml-experiment/bmad-skill-manifest.yaml +3 -0
  46. package/lib/bmad-extension/skills/ml-experiment/skill.json +7 -0
  47. package/lib/bmad-extension/skills/ml-hparam/.gitkeep +0 -0
  48. package/lib/bmad-extension/skills/ml-hparam/SKILL.md +81 -0
  49. package/lib/bmad-extension/skills/ml-hparam/bmad-skill-manifest.yaml +3 -0
  50. package/lib/bmad-extension/skills/ml-hparam/skill.json +7 -0
  51. package/lib/bmad-extension/skills/ml-ideation/.gitkeep +0 -0
  52. package/lib/bmad-extension/skills/ml-ideation/SKILL.md +50 -0
  53. package/lib/bmad-extension/skills/ml-ideation/bmad-skill-manifest.yaml +3 -0
  54. package/lib/bmad-extension/skills/ml-ideation/scripts/validate_ml_prd.py +287 -0
  55. package/lib/bmad-extension/skills/ml-ideation/skill.json +7 -0
  56. package/lib/bmad-extension/skills/ml-infra/.gitkeep +0 -0
  57. package/lib/bmad-extension/skills/ml-infra/SKILL.md +58 -0
  58. package/lib/bmad-extension/skills/ml-infra/bmad-skill-manifest.yaml +3 -0
  59. package/lib/bmad-extension/skills/ml-infra/skill.json +7 -0
  60. package/lib/bmad-extension/skills/ml-retrospective/.gitkeep +0 -0
  61. package/lib/bmad-extension/skills/ml-retrospective/SKILL.md +63 -0
  62. package/lib/bmad-extension/skills/ml-retrospective/bmad-skill-manifest.yaml +3 -0
  63. package/lib/bmad-extension/skills/ml-retrospective/skill.json +7 -0
  64. package/lib/bmad-extension/skills/ml-revision/.gitkeep +0 -0
  65. package/lib/bmad-extension/skills/ml-revision/SKILL.md +82 -0
  66. package/lib/bmad-extension/skills/ml-revision/bmad-skill-manifest.yaml +3 -0
  67. package/lib/bmad-extension/skills/ml-revision/skill.json +7 -0
  68. package/lib/bmad-extension/skills/ml-techspec/.gitkeep +0 -0
  69. package/lib/bmad-extension/skills/ml-techspec/SKILL.md +80 -0
  70. package/lib/bmad-extension/skills/ml-techspec/bmad-skill-manifest.yaml +3 -0
  71. package/lib/bmad-extension/skills/ml-techspec/skill.json +7 -0
  72. package/lib/bmad.js +85 -8
  73. package/lib/skill-authoring.js +1 -1
  74. package/package.json +2 -2
  75. package/test/agent-injection-strategy.test.js +4 -4
  76. package/test/bmad-version-bump.test.js +34 -34
  77. package/test/build-bmad-args.test.js +13 -6
  78. package/test/convert-agents-to-skills.test.js +11 -1
  79. package/test/extension-module-restructure.test.js +31 -7
  80. package/test/migration-validation.test.js +14 -11
package/lib/bmad-extension/skills/ml-eda/scripts/class_weights_calculator.py
@@ -0,0 +1,295 @@
+ #!/usr/bin/env python3
+ """
+ class_weights_calculator.py — BMAD DL Lifecycle
+ Computes class weights for imbalanced datasets, for use in weighted loss functions.
+
+ Supports:
+ - Image datasets (class-labeled subdirectory layout)
+ - CSV/TSV tabular datasets
+ - JSON annotation files (COCO or flat dict)
+
+ Outputs a ready-to-paste Python dict for PyTorch loss functions, plus a
+ markdown summary. Works entirely with the stdlib (no external deps).
+
+ Usage:
+     python3 scripts/class_weights_calculator.py <data_path> [--label-col LABEL] [--output report.md]
+     python3 scripts/class_weights_calculator.py data/images/                    # image dir
+     python3 scripts/class_weights_calculator.py data/labels.csv --label-col defective
+     python3 scripts/class_weights_calculator.py data/annotations.json
+
+ Exit codes:
+     0 — success
+     2 — error
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import csv
+ import json
+ import sys
+ from collections import Counter
+ from pathlib import Path
+
+ IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}
+
+
+ def count_image_classes(data_dir: Path) -> dict[str, int]:
+     """Count images per class from class-labeled subdirectories."""
+     counts: dict[str, int] = {}
+     subdirs = [d for d in sorted(data_dir.iterdir()) if d.is_dir()]
+     if not subdirs:
+         raise ValueError(f"No subdirectories found in {data_dir}. "
+                          "Expected class-labeled subdirectory layout: data/class_name/img.jpg")
+     for cls_dir in subdirs:
+         images = [f for f in cls_dir.iterdir() if f.suffix.lower() in IMAGE_EXTENSIONS]
+         if images:
+             counts[cls_dir.name] = len(images)
+     return counts
+
+
+ def count_csv_classes(path: Path, label_col: str | None) -> dict[str, int]:
+     """Count class frequencies from a CSV/TSV label column."""
+     delimiter = "\t" if path.suffix.lower() == ".tsv" else ","
+     with path.open(newline="", encoding="utf-8", errors="replace") as f:
+         reader = csv.DictReader(f, delimiter=delimiter)
+         rows = list(reader)
+     if not rows:
+         raise ValueError("CSV is empty")
+
+     columns = list(rows[0].keys())
+     if label_col and label_col in columns:
+         target = label_col
+     else:
+         target = next(
+             (c for c in columns if c.lower() in ("label", "class", "target", "y", "category")),
+             None,
+         )
+     if target is None:
+         raise ValueError(
+             "No label column found. Use --label-col or name your column: "
+             "label, class, target, y, category"
+         )
+     values = ((row.get(target) or "").strip() for row in rows)
+     return dict(Counter(v for v in values if v))
+
+
+ def count_json_classes(path: Path) -> dict[str, int]:
+     """Count class frequencies from JSON annotations (COCO or flat dict)."""
+     data = json.loads(path.read_text(encoding="utf-8"))
+
+     # COCO format
+     if isinstance(data, dict) and "annotations" in data and "categories" in data:
+         categories = {c["id"]: c["name"] for c in data.get("categories", [])}
+         counter: Counter = Counter()
+         for ann in data.get("annotations", []):
+             cat_id = ann.get("category_id")
+             counter[categories.get(cat_id, f"id_{cat_id}")] += 1
+         return dict(counter)
+
+     # Flat dict: {"img.jpg": "class_name"}
+     if isinstance(data, dict):
+         return dict(Counter(str(v) for v in data.values()))
+
+     # List of dicts
+     if isinstance(data, list) and data and isinstance(data[0], dict):
+         label_key = next(
+             (k for k in data[0] if k.lower() in ("label", "class", "category", "target")),
+             None,
+         )
+         if label_key:
+             return dict(Counter(str(item.get(label_key, "unknown")) for item in data))
+
+     raise ValueError("Unrecognized JSON annotation format")
+
+
+ def compute_weights(counts: dict[str, int]) -> dict[str, float]:
+     """
+     Compute balanced class weights: weight_i = n_samples / (n_classes * count_i).
+     This is sklearn's 'balanced' strategy, equivalent to:
+         sklearn.utils.class_weight.compute_class_weight('balanced', ...)
+     """
+     n_samples = sum(counts.values())
+     n_classes = len(counts)
+     return {
+         cls: round(n_samples / (n_classes * count), 6)
+         for cls, count in counts.items()
+     }
+
+
+ def compute_inverse_freq_weights(counts: dict[str, int]) -> dict[str, float]:
+     """Normalized inverse frequency weights: weight_i = (1/count_i) / sum(1/count_j)."""
+     inv = {cls: 1.0 / count for cls, count in counts.items()}
+     total_inv = sum(inv.values())
+     return {cls: round(v / total_inv, 6) for cls, v in inv.items()}
+
+
+ def generate_report(
+     counts: dict[str, int],
+     weights_balanced: dict[str, float],
+     weights_inv: dict[str, float],
+     data_path: Path,
+ ) -> str:
+     total = sum(counts.values())
+     n_classes = len(counts)
+     sorted_classes = sorted(counts, key=lambda c: -counts[c])
+
+     lines: list[str] = [
+         "# Class Weights Report",
+         f"*Dataset: `{data_path.name}` | {total:,} samples | {n_classes} classes*",
+         "",
+         "---",
+         "",
+         "## A. Class Distribution",
+         "",
+         "| Class | Count | % | Imbalance Ratio |",
+         "| :--- | ---: | ---: | ---: |",
+     ]
+     majority = counts[sorted_classes[0]]
+     for cls in sorted_classes:
+         pct = counts[cls] / total * 100
+         ratio = majority / counts[cls]
+         bar = "█" * int(pct / 5)
+         lines.append(f"| {cls} | {counts[cls]:,} | {pct:.1f}% {bar} | {ratio:.1f}:1 |")
+
+     max_ratio = majority / counts[sorted_classes[-1]]
+     if max_ratio > 10:
+         lines += ["", f"⚠ **Severe imbalance** detected: {max_ratio:.0f}:1 ratio. Weighted loss is strongly recommended."]
+     elif max_ratio > 3:
+         lines += ["", f"⚠ **Moderate imbalance** detected: {max_ratio:.0f}:1 ratio. Consider weighted loss."]
+     else:
+         lines += ["", f"✓ Dataset is relatively balanced ({max_ratio:.1f}:1 ratio)."]
+
+     # Balanced weights
+     sorted_by_cls = sorted(weights_balanced)
+     lines += [
+         "",
+         "---",
+         "",
+         "## B. Balanced Class Weights",
+         "",
+         "*Formula: `n_samples / (n_classes × class_count)` — equivalent to sklearn's `class_weight='balanced'`*",
+         "",
+         "| Class | Count | Weight |",
+         "| :--- | ---: | ---: |",
+     ]
+     for cls in sorted_by_cls:
+         lines.append(f"| {cls} | {counts[cls]:,} | {weights_balanced[cls]:.4f} |")
+
+     # Python code snippets
+     lines += [
+         "",
+         "### PyTorch Usage",
+         "",
+         "```python",
+         "import torch",
+         "",
+         "# Option 1: As tensor for nn.CrossEntropyLoss",
+         f"class_names = {sorted_by_cls}",
+         f"weights = torch.tensor({[round(weights_balanced[c], 4) for c in sorted_by_cls]}, dtype=torch.float)",
+         "criterion = torch.nn.CrossEntropyLoss(weight=weights.to(device))",
+         "",
+         "# Option 2: As dict (for custom loss or WeightedRandomSampler)",
+         f"class_weight_dict = {dict(zip(sorted_by_cls, [weights_balanced[c] for c in sorted_by_cls]))}",
+         "```",
+         "",
+     ]
+
+     # Inverse freq weights
+     lines += [
+         "---",
+         "",
+         "## C. Inverse Frequency Weights (Normalized)",
+         "",
+         "*Alternative: normalized so weights sum to 1.0*",
+         "",
+         "| Class | Count | Weight |",
+         "| :--- | ---: | ---: |",
+     ]
+     for cls in sorted_by_cls:
+         lines.append(f"| {cls} | {counts[cls]:,} | {weights_inv[cls]:.4f} |")
+
+     lines += [
+         "",
+         "```python",
+         "# Inverse frequency weights tensor",
+         f"weights_inv = torch.tensor({[round(weights_inv[c], 4) for c in sorted_by_cls]}, dtype=torch.float)",
+         "```",
+         "",
+         "---",
+         "",
+         "## D. Recommendations",
+         "",
+     ]
+     if max_ratio > 10:
+         lines += [
+             "1. Use `CrossEntropyLoss(weight=...)` with balanced weights (Section B).",
+             "2. Consider `WeightedRandomSampler` to oversample minority classes in each batch.",
+             "3. Use per-class metrics (F1, precision, recall per class) — not just accuracy.",
+             "4. Consider Focal Loss for severe imbalance (set `gamma=2`).",
+         ]
+     elif max_ratio > 3:
+         lines += [
+             "1. Apply balanced class weights to your loss function (Section B).",
+             "2. Monitor per-class F1 during training.",
+             "3. Consider data augmentation on minority classes.",
+         ]
+     else:
+         lines += [
+             "1. Dataset is balanced — standard `CrossEntropyLoss` without weights is acceptable.",
+             "2. Monitor per-class metrics to catch any per-class degradation.",
+         ]
+
+     lines += [
+         "",
+         "---",
+         "*Generated by `class_weights_calculator.py` — BMAD DL Lifecycle*",
+     ]
+     return "\n".join(lines)
+
+
+ def main() -> int:
+     parser = argparse.ArgumentParser(description="Compute class weights for imbalanced datasets")
+     parser.add_argument("data_path", type=Path)
+     parser.add_argument("--label-col", type=str, default=None)
+     parser.add_argument("--output", type=Path, default=None)
+     args = parser.parse_args()
+
+     if not args.data_path.exists():
+         print(f"Error: Path not found: {args.data_path}", file=sys.stderr)
+         return 2
+
+     try:
+         if args.data_path.is_dir():
+             counts = count_image_classes(args.data_path)
+         elif args.data_path.suffix.lower() in (".csv", ".tsv"):
+             counts = count_csv_classes(args.data_path, args.label_col)
+         elif args.data_path.suffix.lower() == ".json":
+             counts = count_json_classes(args.data_path)
+         else:
+             print("Error: Unsupported format. Use image dir, CSV, or JSON.", file=sys.stderr)
+             return 2
+     except Exception as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 2
+
+     if not counts:
+         print("Error: No class data found.", file=sys.stderr)
+         return 2
+
+     weights_balanced = compute_weights(counts)
+     weights_inv = compute_inverse_freq_weights(counts)
+
+     report = generate_report(counts, weights_balanced, weights_inv, args.data_path)
+
+     output = args.output or args.data_path.parent / f"{args.data_path.stem}_class_weights.md"
+     output.write_text(report, encoding="utf-8")
+
+     print(f"✓ Class weight report: {output}")
+     print(f"  Classes: {list(counts.keys())}")
+     print(f"  Balanced weights: {weights_balanced}")
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
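
The docstring of `compute_weights()` claims equivalence with scikit-learn's `class_weight='balanced'` strategy. A minimal sketch cross-checking the two on a hypothetical 9:1 dataset (assumes scikit-learn and numpy are installed; the calculator itself needs only the stdlib, and the `counts` values here are made up for illustration):

```python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

counts = {"ok": 900, "defective": 100}  # hypothetical 9:1 imbalance
n_samples, n_classes = sum(counts.values()), len(counts)

# The script's formula: weight_i = n_samples / (n_classes * count_i)
manual = {c: n_samples / (n_classes * n) for c, n in counts.items()}
# -> {'ok': 0.5555..., 'defective': 5.0}

# sklearn's 'balanced' strategy on the same labels
y = np.array(["ok"] * 900 + ["defective"] * 100)
classes = np.array(sorted(counts))  # ['defective', 'ok']
sk = compute_class_weight("balanced", classes=classes, y=y)
assert np.allclose(sk, [manual[c] for c in classes])  # identical weights
```
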
package/lib/bmad-extension/skills/ml-eda/scripts/clustering_explorer.py
@@ -0,0 +1,383 @@
+ #!/usr/bin/env python3
+ """
+ clustering_explorer.py — BMAD DL Lifecycle
+ (Inspired by K-Dense claude-scientific-skills/scikit-learn/clustering_analysis.py)
+
+ Unsupervised cluster analysis for EDA on unlabeled or partially-labeled datasets.
+ Useful during TSK-001 to discover natural groupings before annotation or labeling.
+
+ Runs K-Means, Agglomerative, and DBSCAN; scores with Silhouette, Calinski-Harabasz,
+ and Davies-Bouldin indices; optionally saves a PCA 2D scatter plot.
+
+ Usage:
+     python3 scripts/clustering_explorer.py <data_csv> [--k N] [--find-k] [--output report.md] [--plot clusters.png]
+     python3 scripts/clustering_explorer.py data/features.csv --find-k --plot clusters.png
+
+ Exit codes:
+     0 — success
+     2 — error
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import csv
+ import sys
+ import warnings
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ warnings.filterwarnings("ignore")
+
+ try:
+     import numpy as np
+     HAS_NUMPY = True
+ except ImportError:
+     HAS_NUMPY = False
+
+ try:
+     from sklearn.preprocessing import StandardScaler
+     from sklearn.decomposition import PCA
+     from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
+     from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
+     from sklearn.impute import SimpleImputer
+     HAS_SKLEARN = True
+ except ImportError:
+     HAS_SKLEARN = False
+
+ try:
+     import matplotlib
+     matplotlib.use("Agg")
+     import matplotlib.pyplot as plt
+     import matplotlib.cm as cm
+     HAS_MPL = True
+ except ImportError:
+     HAS_MPL = False
+
+
+ # ── Data structures ────────────────────────────────────────────────────────────
+
+ @dataclass
+ class ClusterResult:
+     name: str
+     n_clusters: int
+     silhouette: float | None
+     calinski: float | None
+     davies: float | None
+     labels: list | None = None
+     n_noise: int = 0
+     notes: str = ""
+
+
+ # ── Data loading ───────────────────────────────────────────────────────────────
+
+ def load_numeric_csv(path: Path) -> tuple["np.ndarray", list[str]]:
+     """Load CSV, drop non-numeric and label columns, return (X, feature_names)."""
+     with path.open(newline="", encoding="utf-8", errors="replace") as f:
+         reader = csv.DictReader(f)
+         rows = list(reader)
+     if not rows:
+         raise ValueError("CSV is empty")
+
+     columns = list(rows[0].keys())
+     # Exclude likely label columns
+     label_cols = {c for c in columns if c.lower() in ("label", "class", "target", "y", "category")}
+
+     numeric_cols: list[str] = []
+     for col in columns:
+         if col in label_cols:
+             continue
+         try:
+             [float(row[col]) for row in rows if (row.get(col) or "").strip()]
+             numeric_cols.append(col)
+         except ValueError:
+             pass
+
+     if not numeric_cols:
+         raise ValueError("No numeric feature columns found")
+
+     X = np.array([
+         [float(row[c]) if (row.get(c) or "").strip() else float("nan") for c in numeric_cols]
+         for row in rows
+     ], dtype=float)
+     return X, numeric_cols
+
+
+ def preprocess(X: "np.ndarray") -> "np.ndarray":
+     imputer = SimpleImputer(strategy="median")
+     scaler = StandardScaler()
+     return scaler.fit_transform(imputer.fit_transform(X))
+
+
+ # ── Optimal K search ───────────────────────────────────────────────────────────
+
+ def find_optimal_k(X: "np.ndarray", k_range: range) -> tuple[int, list[float], list[float]]:
+     inertias, silhouettes = [], []
+     for k in k_range:
+         km = KMeans(n_clusters=k, random_state=42, n_init=10)
+         labels = km.fit_predict(X)
+         inertias.append(km.inertia_)
+         silhouettes.append(silhouette_score(X, labels))
+     best_k = list(k_range)[int(np.argmax(silhouettes))]
+     return best_k, inertias, silhouettes
+
+
+ # ── Clustering ─────────────────────────────────────────────────────────────────
+
+ def run_clustering(X: "np.ndarray", n_clusters: int) -> list[ClusterResult]:
+     results: list[ClusterResult] = []
+
+     algorithms = {
+         "K-Means": KMeans(n_clusters=n_clusters, random_state=42, n_init=10),
+         "Agglomerative": AgglomerativeClustering(n_clusters=n_clusters, linkage="ward"),
+     }
+     for name, algo in algorithms.items():
+         labels = algo.fit_predict(X)
+         try:
+             sil = silhouette_score(X, labels)
+             cal = calinski_harabasz_score(X, labels)
+             dav = davies_bouldin_score(X, labels)
+         except Exception:
+             sil = cal = dav = None
+         results.append(ClusterResult(
+             name=name, n_clusters=n_clusters,
+             silhouette=sil, calinski=cal, davies=dav, labels=labels.tolist(),
+         ))
+
+     # DBSCAN (auto eps via 5th-NN heuristic)
+     try:
+         from sklearn.neighbors import NearestNeighbors
+         nn = NearestNeighbors(n_neighbors=5)
+         nn.fit(X)
+         distances, _ = nn.kneighbors(X)
+         eps = float(np.percentile(distances[:, -1], 90))
+     except Exception:
+         eps = 0.5
+
+     dbscan = DBSCAN(eps=eps, min_samples=5)
+     db_labels = dbscan.fit_predict(X)
+     unique_clusters = set(db_labels) - {-1}
+     n_noise = int((db_labels == -1).sum())
+
+     if len(unique_clusters) > 1:
+         mask = db_labels != -1
+         try:
+             sil = silhouette_score(X[mask], db_labels[mask])
+             cal = calinski_harabasz_score(X[mask], db_labels[mask])
+             dav = davies_bouldin_score(X[mask], db_labels[mask])
+         except Exception:
+             sil = cal = dav = None
+         results.append(ClusterResult(
+             name="DBSCAN", n_clusters=len(unique_clusters),
+             silhouette=sil, calinski=cal, davies=dav,
+             labels=db_labels.tolist(), n_noise=n_noise,
+         ))
+     else:
+         results.append(ClusterResult(
+             name="DBSCAN", n_clusters=len(unique_clusters),
+             silhouette=None, calinski=None, davies=None,
+             labels=db_labels.tolist(), n_noise=n_noise,
+             notes=f"Only {len(unique_clusters)} cluster(s) found — try adjusting eps",
+         ))
+     return results
+
+
+ # ── Visualization ──────────────────────────────────────────────────────────────
+
+ def save_cluster_plot(X: "np.ndarray", results: list[ClusterResult], output_path: Path) -> None:
+     pca = PCA(n_components=2)
+     X_2d = pca.fit_transform(X)
+     var = pca.explained_variance_ratio_
+
+     n_plots = len(results)
+     ncols = min(3, n_plots)
+     nrows = (n_plots + ncols - 1) // ncols
+
+     fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 4 * nrows), squeeze=False)
+     axes = axes.flatten()
+
+     for idx, r in enumerate(results):
+         ax = axes[idx]
+         labels = np.array(r.labels)
+         unique = sorted(set(labels))
+         colors = cm.tab10(np.linspace(0, 1, max(len(unique), 1)))
+
+         for i, lbl in enumerate(unique):
+             mask = labels == lbl
+             color = "gray" if lbl == -1 else colors[i % len(colors)]
+             marker = "x" if lbl == -1 else "o"
+             label = "Noise" if lbl == -1 else f"C{lbl}"
+             ax.scatter(X_2d[mask, 0], X_2d[mask, 1], c=[color], marker=marker,
+                        alpha=0.6, s=20, label=label)
+
+         title = f"{r.name} (k={r.n_clusters})"
+         if r.silhouette is not None:
+             title += f"\nSil={r.silhouette:.3f}"
+         ax.set_title(title, fontsize=9)
+         ax.set_xlabel(f"PC1 ({var[0]:.1%})", fontsize=8)
+         ax.set_ylabel(f"PC2 ({var[1]:.1%})", fontsize=8)
+         ax.tick_params(labelsize=7)
+         if r.n_clusters <= 8:
+             ax.legend(fontsize=7, markerscale=1.2)
+
+     for idx in range(len(results), len(axes)):
+         axes[idx].axis("off")
+
+     plt.suptitle("Cluster Analysis — PCA Projection", fontsize=11, y=1.01)
+     plt.tight_layout()
+     plt.savefig(output_path, dpi=150, bbox_inches="tight")
+     plt.close()
+
+
+ # ── Report generation ──────────────────────────────────────────────────────────
+
+ def generate_report(
+     results: list[ClusterResult],
+     data_path: Path,
+     n_samples: int,
+     n_features: int,
+     k_range: range | None,
+     inertias: list[float] | None,
+     silhouettes_per_k: list[float] | None,
+     optimal_k: int | None,
+     plot_path: Path | None,
+ ) -> str:
+     lines: list[str] = [
+         "# Clustering Explorer Report",
+         f"*Dataset: `{data_path.name}` | {n_samples} samples | {n_features} features*",
+         "",
+         "---",
+         "",
+     ]
+
+     if k_range and optimal_k:
+         lines += [
+             "## A. Optimal K Analysis (K-Means Silhouette)",
+             "",
+             "| K | Inertia | Silhouette Score |",
+             "| ---: | ---: | ---: |",
+         ]
+         for k, inert, sil in zip(k_range, inertias or [], silhouettes_per_k or []):
+             marker = " ←" if k == optimal_k else ""
+             lines.append(f"| {k} | {inert:.1f} | {sil:.4f}{marker} |")
+         lines += ["", f"**Recommended K = {optimal_k}** (highest silhouette score)", "", "---", ""]
+
+     lines += [
+         "## B. Algorithm Comparison",
+         "",
+         "| Algorithm | Clusters | Silhouette ↑ | Calinski-Harabasz ↑ | Davies-Bouldin ↓ | Notes |",
+         "| :--- | ---: | ---: | ---: | ---: | :--- |",
+     ]
+     for r in results:
+         sil = f"{r.silhouette:.4f}" if r.silhouette is not None else "N/A"
+         cal = f"{r.calinski:.1f}" if r.calinski is not None else "N/A"
+         dav = f"{r.davies:.4f}" if r.davies is not None else "N/A"
+         noise = f" ({r.n_noise} noise pts)" if r.n_noise else ""
+         lines.append(f"| {r.name} | {r.n_clusters}{noise} | {sil} | {cal} | {dav} | {r.notes} |")
+
+     # Best algorithm by silhouette
+     scored = [r for r in results if r.silhouette is not None]
+     if scored:
+         best = max(scored, key=lambda r: r.silhouette)
+         lines += ["", f"**Best algorithm by silhouette: {best.name}** (score: {best.silhouette:.4f})", ""]
+
+     if plot_path:
+         lines += [
+             "---",
+             "",
+             "## C. Cluster Visualization",
+             "",
+             f"![Cluster scatter]({plot_path.name})",
+             "",
+             "*2D PCA projection. Colors indicate cluster assignments.*",
+             "",
+         ]
+
+     lines += [
+         "---",
+         "",
+         "## D. Interpretation Guide",
+         "",
+         "| Metric | Good Range | Meaning |",
+         "| :--- | :--- | :--- |",
+         "| Silhouette Score | 0.5 – 1.0 | Points are well-separated from other clusters |",
+         "| Calinski-Harabasz | Higher = better | Dense, well-separated clusters |",
+         "| Davies-Bouldin | 0.0 – 1.0 | Low = compact, well-separated clusters |",
+         "",
+         "**Next steps:**",
+         "1. Use the best cluster assignments as pseudo-labels for semi-supervised training.",
+         "2. Investigate outlier/noise points (DBSCAN noise) — these may be rare defects or data errors.",
+         "3. If clusters align with known classes, your feature space is discriminative — a good sign for DL.",
+         "",
+         "---",
+         "*Generated by `clustering_explorer.py` — BMAD DL Lifecycle (TSK-001)*",
+     ]
+     return "\n".join(lines)
+
+
+ # ── Main ───────────────────────────────────────────────────────────────────────
+
+ def main() -> int:
+     parser = argparse.ArgumentParser(description="Unsupervised clustering explorer for BMAD DL")
+     parser.add_argument("data_csv", type=Path)
+     parser.add_argument("--k", type=int, default=3, help="Number of clusters (default: 3)")
+     parser.add_argument("--find-k", action="store_true", help="Search for optimal K (2 to --k-max)")
+     parser.add_argument("--k-max", type=int, default=10)
+     parser.add_argument("--output", type=Path, default=None)
+     parser.add_argument("--plot", type=Path, default=None, help="Save cluster plot PNG")
+     args = parser.parse_args()
+
+     if not HAS_SKLEARN or not HAS_NUMPY:
+         print("Error: scikit-learn and numpy required. Run: pip install scikit-learn numpy",
+               file=sys.stderr)
+         return 2
+     if not args.data_csv.exists():
+         print(f"Error: File not found: {args.data_csv}", file=sys.stderr)
+         return 2
+
+     try:
+         X_raw, feature_names = load_numeric_csv(args.data_csv)
+     except Exception as e:
+         print(f"Error loading CSV: {e}", file=sys.stderr)
+         return 2
+
+     X = preprocess(X_raw)
+     n_samples, n_features = X.shape
+     print(f"Dataset: {n_samples} samples, {n_features} features")
+
+     # Find optimal K
+     k_range = inertias = silhouettes_k = optimal_k = None
+     n_clusters = args.k
+     if args.find_k:
+         k_max = min(args.k_max, n_samples - 1)
+         k_range = range(2, k_max + 1)
+         print(f"Searching optimal K in range 2–{k_max}...")
+         optimal_k, inertias, silhouettes_k = find_optimal_k(X, k_range)
+         n_clusters = optimal_k
+         print(f"Optimal K = {n_clusters}")
+
+     print(f"Running clustering with k={n_clusters}...")
+     results = run_clustering(X, n_clusters)
+
+     # Plot
+     plot_path: Path | None = None
+     if args.plot:
+         if HAS_MPL:
+             save_cluster_plot(X, results, args.plot)
+             plot_path = args.plot
+             print(f"✓ Cluster plot: {plot_path}")
+         else:
+             print("⚠ matplotlib not available — skipping plot")
+
+     report = generate_report(
+         results, args.data_csv, n_samples, n_features,
+         k_range, inertias, silhouettes_k, optimal_k, plot_path,
+     )
+     output = args.output or args.data_csv.parent / f"{args.data_csv.stem}_clustering_report.md"
+     output.write_text(report, encoding="utf-8")
+     print(f"✓ Report: {output}")
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
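
Two heuristics do the heavy lifting in this script: the `--find-k` silhouette search and the 5th-nearest-neighbor eps estimate for DBSCAN. A minimal sketch of both on synthetic blobs (assumes scikit-learn and numpy are installed; `make_blobs` is only a stand-in for a real feature CSV):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors

X, _ = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42)

# 1) --find-k: pick K by maximizing the K-Means silhouette score.
scores = {}
for k in range(2, 8):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    scores[k] = silhouette_score(X, labels)
best_k = max(scores, key=scores.get)
print(f"best K = {best_k}")  # typically 4 for these blobs

# 2) DBSCAN eps heuristic: 90th percentile of each point's 5th-NN distance.
dist, _ = NearestNeighbors(n_neighbors=5).fit(X).kneighbors(X)
eps = float(np.percentile(dist[:, -1], 90))
print(f"auto eps ≈ {eps:.3f}")
```
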