ma-agents 3.3.0 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.opencode/skills/.ma-agents.json +99 -99
- package/.roo/skills/.ma-agents.json +99 -99
- package/README.md +56 -15
- package/bin/cli.js +63 -8
- package/lib/agents.js +23 -0
- package/lib/bmad-cache/cache-manifest.json +1 -1
- package/lib/bmad-customizations/bmm-demerzel.customize.yaml +36 -0
- package/lib/bmad-customizations/demerzel.md +32 -0
- package/lib/bmad-extension/module-help.csv +13 -0
- package/lib/bmad-extension/skills/bmad-ma-agent-ml/.gitkeep +0 -0
- package/lib/bmad-extension/skills/bmad-ma-agent-ml/SKILL.md +59 -0
- package/lib/bmad-extension/skills/bmad-ma-agent-ml/bmad-skill-manifest.yaml +11 -0
- package/lib/bmad-extension/skills/generate-backlog/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-advise/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-advise/SKILL.md +76 -0
- package/lib/bmad-extension/skills/ml-advise/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-advise/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-analysis/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-analysis/SKILL.md +60 -0
- package/lib/bmad-extension/skills/ml-analysis/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-analysis/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-architecture/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-architecture/SKILL.md +55 -0
- package/lib/bmad-extension/skills/ml-architecture/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-architecture/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-detailed-design/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-detailed-design/SKILL.md +67 -0
- package/lib/bmad-extension/skills/ml-detailed-design/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-detailed-design/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-eda/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-eda/SKILL.md +56 -0
- package/lib/bmad-extension/skills/ml-eda/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/baseline_classifier.py +522 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/class_weights_calculator.py +295 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/clustering_explorer.py +383 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/eda_analyzer.py +654 -0
- package/lib/bmad-extension/skills/ml-eda/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-experiment/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-experiment/SKILL.md +74 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/advanced_trainer_configs.py +430 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/quick_trainer_setup.py +233 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/template_datamodule.py +219 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/template_gnn_module.py +341 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/template_lightning_module.py +158 -0
- package/lib/bmad-extension/skills/ml-experiment/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-experiment/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-hparam/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-hparam/SKILL.md +81 -0
- package/lib/bmad-extension/skills/ml-hparam/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-hparam/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-ideation/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-ideation/SKILL.md +50 -0
- package/lib/bmad-extension/skills/ml-ideation/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-ideation/scripts/validate_ml_prd.py +287 -0
- package/lib/bmad-extension/skills/ml-ideation/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-infra/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-infra/SKILL.md +58 -0
- package/lib/bmad-extension/skills/ml-infra/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-infra/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-retrospective/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-retrospective/SKILL.md +63 -0
- package/lib/bmad-extension/skills/ml-retrospective/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-retrospective/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-revision/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-revision/SKILL.md +82 -0
- package/lib/bmad-extension/skills/ml-revision/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-revision/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-techspec/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-techspec/SKILL.md +80 -0
- package/lib/bmad-extension/skills/ml-techspec/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-techspec/skill.json +7 -0
- package/lib/bmad.js +85 -8
- package/lib/skill-authoring.js +1 -1
- package/package.json +2 -2
- package/test/agent-injection-strategy.test.js +4 -4
- package/test/bmad-version-bump.test.js +34 -34
- package/test/build-bmad-args.test.js +13 -6
- package/test/convert-agents-to-skills.test.js +11 -1
- package/test/extension-module-restructure.test.js +31 -7
- package/test/migration-validation.test.js +14 -11
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
class_weights_calculator.py — BMAD DL Lifecycle
|
|
4
|
+
Computes class weights for imbalanced datasets to use in weighted loss functions.
|
|
5
|
+
|
|
6
|
+
Supports:
|
|
7
|
+
- Image datasets (class-labeled subdirectory layout)
|
|
8
|
+
- CSV/TSV tabular datasets
|
|
9
|
+
- JSON annotation files (COCO or flat dict)
|
|
10
|
+
|
|
11
|
+
Outputs ready-to-paste Python dict for PyTorch loss functions, plus a
|
|
12
|
+
markdown summary. Works entirely with stdlib (no external deps).
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
python3 scripts/class_weights_calculator.py <data_path> [--label-col LABEL] [--output report.md]
|
|
16
|
+
python3 scripts/class_weights_calculator.py data/images/ # image dir
|
|
17
|
+
python3 scripts/class_weights_calculator.py data/labels.csv --label-col defective
|
|
18
|
+
python3 scripts/class_weights_calculator.py data/annotations.json
|
|
19
|
+
|
|
20
|
+
Exit codes:
|
|
21
|
+
0 — success
|
|
22
|
+
2 — error
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import csv
|
|
29
|
+
import json
|
|
30
|
+
import sys
|
|
31
|
+
from collections import Counter
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
|
|
34
|
+
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def count_image_classes(data_dir: Path) -> dict[str, int]:
    """Return the number of image files found in each class subdirectory.

    Expects the standard "one folder per class" layout
    (data/class_name/img.jpg). Classes with zero images are omitted.

    Raises:
        ValueError: if *data_dir* contains no subdirectories at all.
    """
    class_dirs = sorted(entry for entry in data_dir.iterdir() if entry.is_dir())
    if not class_dirs:
        raise ValueError(f"No subdirectories found in {data_dir}. "
                         "Expected class-labeled subdirectory layout: data/class_name/img.jpg")
    # Tally image files per directory, then drop empty classes.
    tallies = {
        directory.name: sum(
            1 for item in directory.iterdir() if item.suffix.lower() in IMAGE_EXTENSIONS
        )
        for directory in class_dirs
    }
    return {name: n for name, n in tallies.items() if n}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def count_csv_classes(path: Path, label_col: str | None) -> dict[str, int]:
    """Count class frequencies from a CSV/TSV label column.

    The label column is *label_col* if present; otherwise the first column
    whose lowercase name is one of: label, class, target, y, category.

    Fixes vs. previous version:
    - ``.tsv`` files are now parsed with a tab delimiter (previously the
      default comma delimiter silently produced a single merged column).
    - Short rows no longer crash: csv.DictReader fills missing cells with
      None, which the old ``row.get(target, "").strip()`` tripped over.

    Args:
        path: CSV or TSV file path.
        label_col: explicit label column name, or None to auto-detect.

    Returns:
        Mapping of class value -> occurrence count (blank cells skipped).

    Raises:
        ValueError: if the file is empty or no label column can be found.
    """
    # csv.DictReader splits on the delimiter; pick it from the file suffix.
    delimiter = "\t" if path.suffix.lower() == ".tsv" else ","
    with path.open(newline="", encoding="utf-8", errors="replace") as f:
        rows = list(csv.DictReader(f, delimiter=delimiter))
    if not rows:
        raise ValueError("CSV is empty")

    columns = list(rows[0].keys())
    if label_col and label_col in columns:
        target = label_col
    else:
        target = next(
            (c for c in columns if c.lower() in ("label", "class", "target", "y", "category")),
            None,
        )
        if target is None:
            raise ValueError(
                "No label column found. Use --label-col or name your column: "
                "label, class, target, y, category"
            )
    # Guard against None cells (short rows) before calling .strip().
    values = ((row.get(target) or "").strip() for row in rows)
    return dict(Counter(v for v in values if v))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def count_json_classes(path: Path) -> dict[str, int]:
    """Derive per-class sample counts from a JSON annotation file.

    Three layouts are recognized, tried in order:
    1. COCO-style dict with "annotations" and "categories" keys — one count
       per annotation, named via its category id (unknown ids become "id_N").
    2. Flat dict mapping file name -> class name — values are tallied.
    3. List of dicts — the first key named label/class/category/target
       (case-insensitive) is tallied; missing keys count as "unknown".

    Raises:
        ValueError: if none of the layouts match.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))

    if isinstance(payload, dict):
        if "annotations" in payload and "categories" in payload:
            # COCO format: resolve category ids to names, tally annotations.
            id_to_name = {cat["id"]: cat["name"] for cat in payload.get("categories", [])}
            tally = Counter(
                id_to_name.get(ann.get("category_id"), f"id_{ann.get('category_id')}")
                for ann in payload.get("annotations", [])
            )
            return dict(tally)
        # Flat dict: {"img.jpg": "class_name"}
        return dict(Counter(str(value) for value in payload.values()))

    if isinstance(payload, list) and payload and isinstance(payload[0], dict):
        # List of dicts: detect the label key from the first record.
        key = next(
            (k for k in payload[0] if k.lower() in ("label", "class", "category", "target")),
            None,
        )
        if key:
            return dict(Counter(str(entry.get(key, "unknown")) for entry in payload))

    raise ValueError("Unrecognized JSON annotation format")
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def compute_weights(counts: dict[str, int]) -> dict[str, float]:
    """Compute balanced class weights: weight_i = n_samples / (n_classes * count_i).

    This is sklearn's 'balanced' strategy, equivalent to
    sklearn.utils.class_weight.compute_class_weight('balanced', ...).
    Weights are rounded to 6 decimal places.
    """
    total = sum(counts.values())
    k = len(counts)
    weights: dict[str, float] = {}
    for label, freq in counts.items():
        weights[label] = round(total / (k * freq), 6)
    return weights
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def compute_inverse_freq_weights(counts: dict[str, int]) -> dict[str, float]:
    """Normalized inverse frequency weights: weight_i = (1/count_i) / sum(1/count_j).

    The returned weights sum to ~1.0 (up to rounding to 6 decimals).
    """
    reciprocals = [(label, 1.0 / freq) for label, freq in counts.items()]
    denom = sum(value for _, value in reciprocals)
    return {label: round(value / denom, 6) for label, value in reciprocals}
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def generate_report(
    counts: dict[str, int],
    weights_balanced: dict[str, float],
    weights_inv: dict[str, float],
    data_path: Path,
) -> str:
    """Render the class-weight analysis as a markdown document.

    Args:
        counts: samples per class (non-empty; all counts > 0).
        weights_balanced: output of compute_weights(counts).
        weights_inv: output of compute_inverse_freq_weights(counts).
        data_path: analyzed dataset path (only its name appears in the report).

    Returns:
        Markdown text with the class distribution, both weight tables,
        ready-to-paste PyTorch snippets, and severity-based recommendations.

    Note: removed an unused local (`weight_list_balanced`) that was computed
    but never referenced.
    """
    total = sum(counts.values())
    n_classes = len(counts)
    # Most-frequent class first, for the distribution table.
    sorted_classes = sorted(counts, key=lambda c: -counts[c])

    lines: list[str] = [
        "# Class Weights Report",
        f"*Dataset: `{data_path.name}` | {total:,} samples | {n_classes} classes*",
        "",
        "---",
        "",
        "## A. Class Distribution",
        "",
        "| Class | Count | % | Imbalance Ratio |",
        "| :--- | ---: | ---: | ---: |",
    ]
    majority = counts[sorted_classes[0]]
    for cls in sorted_classes:
        pct = counts[cls] / total * 100
        ratio = majority / counts[cls]
        # Crude inline bar chart: one block per 5 percentage points.
        bar = "█" * int(pct / 5)
        lines.append(f"| {cls} | {counts[cls]:,} | {pct:.1f}% {bar} | {ratio:.1f}:1 |")

    # Severity thresholds: >10:1 severe, >3:1 moderate, else balanced.
    max_ratio = majority / counts[sorted_classes[-1]]
    if max_ratio > 10:
        lines += ["", f"⚠ **Severe imbalance** detected: {max_ratio:.0f}:1 ratio. Weighted loss is strongly recommended."]
    elif max_ratio > 3:
        lines += ["", f"⚠ **Moderate imbalance** detected: {max_ratio:.0f}:1 ratio. Consider weighted loss."]
    else:
        lines += ["", f"✓ Dataset is relatively balanced ({max_ratio:.1f}:1 ratio)."]

    # Balanced weights (alphabetical class order for stable snippets/tables).
    sorted_by_cls = sorted(weights_balanced)
    lines += [
        "",
        "---",
        "",
        "## B. Balanced Class Weights",
        "",
        "*Formula: `n_samples / (n_classes × class_count)` — equivalent to sklearn's `class_weight='balanced'`*",
        "",
        "| Class | Count | Weight |",
        "| :--- | ---: | ---: |",
    ]
    for cls in sorted_by_cls:
        lines.append(f"| {cls} | {counts[cls]:,} | {weights_balanced[cls]:.4f} |")

    # Python code snippets
    lines += [
        "",
        "### PyTorch Usage",
        "",
        "```python",
        "import torch",
        "",
        "# Option 1: As tensor for nn.CrossEntropyLoss",
        f"class_names = {sorted_by_cls}",
        f"weights = torch.tensor({[round(weights_balanced[c], 4) for c in sorted_by_cls]}, dtype=torch.float)",
        "criterion = torch.nn.CrossEntropyLoss(weight=weights.to(device))",
        "",
        "# Option 2: As dict (for custom loss or WeightedRandomSampler)",
        f"class_weight_dict = {dict(zip(sorted_by_cls, [weights_balanced[c] for c in sorted_by_cls]))}",
        "```",
        "",
    ]

    # Inverse freq weights
    lines += [
        "---",
        "",
        "## C. Inverse Frequency Weights (Normalized)",
        "",
        "*Alternative: normalized so weights sum to 1.0*",
        "",
        "| Class | Count | Weight |",
        "| :--- | ---: | ---: |",
    ]
    for cls in sorted_by_cls:
        lines.append(f"| {cls} | {counts[cls]:,} | {weights_inv[cls]:.4f} |")

    lines += [
        "",
        "```python",
        "# Inverse frequency weights tensor",
        f"weights_inv = torch.tensor({[round(weights_inv[c], 4) for c in sorted_by_cls]}, dtype=torch.float)",
        "```",
        "",
        "---",
        "",
        "## D. Recommendations",
        "",
    ]
    # Recommendations mirror the same severity thresholds as Section A.
    if max_ratio > 10:
        lines += [
            "1. Use `CrossEntropyLoss(weight=...)` with balanced weights (Section B).",
            "2. Consider `WeightedRandomSampler` to oversample minority classes in each batch.",
            "3. Use per-class metrics (F1, precision, recall per class) — not just accuracy.",
            "4. Consider Focal Loss for severe imbalance (set `gamma=2`).",
        ]
    elif max_ratio > 3:
        lines += [
            "1. Apply balanced class weights to your loss function (Section B).",
            "2. Monitor per-class F1 during training.",
            "3. Consider data augmentation on minority classes.",
        ]
    else:
        lines += [
            "1. Dataset is balanced — standard `CrossEntropyLoss` without weights is acceptable.",
            "2. Monitor per-class metrics to catch any per-class degradation.",
        ]

    lines += [
        "",
        "---",
        "*Generated by `class_weights_calculator.py` — BMAD DL Lifecycle*",
    ]
    return "\n".join(lines)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def main() -> int:
    """CLI entry point: detect input type, count classes, write the weight report.

    Returns a process exit code: 0 on success, 2 on any error (missing path,
    unsupported format, unreadable data, or no classes found).
    """
    parser = argparse.ArgumentParser(description="Compute class weights for imbalanced datasets")
    parser.add_argument("data_path", type=Path)  # image dir, .csv/.tsv file, or .json file
    parser.add_argument("--label-col", type=str, default=None)  # explicit CSV label column
    parser.add_argument("--output", type=Path, default=None)  # report path (default: next to input)
    args = parser.parse_args()

    if not args.data_path.exists():
        print(f"Error: Path not found: {args.data_path}", file=sys.stderr)
        return 2

    # Dispatch on input type: directory → image layout; otherwise by suffix.
    try:
        if args.data_path.is_dir():
            counts = count_image_classes(args.data_path)
        elif args.data_path.suffix.lower() in (".csv", ".tsv"):
            counts = count_csv_classes(args.data_path, args.label_col)
        elif args.data_path.suffix.lower() == ".json":
            counts = count_json_classes(args.data_path)
        else:
            print(f"Error: Unsupported format. Use image dir, CSV, or JSON.", file=sys.stderr)
            return 2
    except Exception as e:
        # Loaders raise ValueError for malformed/empty data; report and exit 2.
        print(f"Error: {e}", file=sys.stderr)
        return 2

    if not counts:
        print("Error: No class data found.", file=sys.stderr)
        return 2

    # Both weighting strategies are computed and reported side by side.
    weights_balanced = compute_weights(counts)
    weights_inv = compute_inverse_freq_weights(counts)

    report = generate_report(counts, weights_balanced, weights_inv, args.data_path)

    # Default output: "<input stem>_class_weights.md" alongside the input.
    output = args.output or args.data_path.parent / f"{args.data_path.stem}_class_weights.md"
    output.write_text(report, encoding="utf-8")

    print(f"✓ Class weight report: {output}")
    print(f"  Classes: {list(counts.keys())}")
    print(f"  Balanced weights: {weights_balanced}")
    return 0
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# Script entry point: propagate main()'s exit code (0 success, 2 error) to the shell.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
clustering_explorer.py — BMAD DL Lifecycle
|
|
4
|
+
(Inspired by K-Dense claude-scientific-skills/scikit-learn/clustering_analysis.py)
|
|
5
|
+
|
|
6
|
+
Unsupervised cluster analysis for EDA on unlabeled or partially-labeled datasets.
|
|
7
|
+
Useful during TSK-001 to discover natural groupings before annotation or labeling.
|
|
8
|
+
|
|
9
|
+
Runs K-Means, Agglomerative, and DBSCAN; scores with Silhouette, Calinski-Harabasz,
|
|
10
|
+
and Davies-Bouldin indices; optionally saves a PCA 2D scatter plot.
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
python3 scripts/clustering_explorer.py <data_csv> [--k N] [--find-k] [--output report.md] [--plot clusters.png]
|
|
14
|
+
python3 scripts/clustering_explorer.py data/features.csv --find-k --plot clusters.png
|
|
15
|
+
|
|
16
|
+
Exit codes:
|
|
17
|
+
0 — success
|
|
18
|
+
2 — error
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import csv
|
|
25
|
+
import sys
|
|
26
|
+
import warnings
|
|
27
|
+
from dataclasses import dataclass, field
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
|
|
30
|
+
warnings.filterwarnings("ignore")
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
import numpy as np
|
|
34
|
+
HAS_NUMPY = True
|
|
35
|
+
except ImportError:
|
|
36
|
+
HAS_NUMPY = False
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
from sklearn.preprocessing import StandardScaler
|
|
40
|
+
from sklearn.decomposition import PCA
|
|
41
|
+
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
|
42
|
+
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
|
|
43
|
+
from sklearn.impute import SimpleImputer
|
|
44
|
+
HAS_SKLEARN = True
|
|
45
|
+
except ImportError:
|
|
46
|
+
HAS_SKLEARN = False
|
|
47
|
+
|
|
48
|
+
try:
|
|
49
|
+
import matplotlib
|
|
50
|
+
matplotlib.use("Agg")
|
|
51
|
+
import matplotlib.pyplot as plt
|
|
52
|
+
import matplotlib.cm as cm
|
|
53
|
+
HAS_MPL = True
|
|
54
|
+
except ImportError:
|
|
55
|
+
HAS_MPL = False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ── Data structures ────────────────────────────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
@dataclass
class ClusterResult:
    """Outcome of one clustering algorithm run plus its quality scores.

    Score fields are None when the metric could not be computed
    (e.g. a degenerate labeling, or a scoring failure).
    """

    # Algorithm name shown in the report ("K-Means", "Agglomerative", "DBSCAN").
    name: str
    # Number of clusters (the requested k, or DBSCAN's discovered count).
    n_clusters: int
    # Silhouette score (higher is better), or None if unavailable.
    silhouette: float | None
    # Calinski-Harabasz index (higher is better), or None.
    calinski: float | None
    # Davies-Bouldin index (lower is better), or None.
    davies: float | None
    # Per-sample cluster assignments; DBSCAN marks noise points with -1.
    labels: "list | None" = None
    # Count of DBSCAN noise points (0 for the other algorithms).
    n_noise: int = 0
    # Free-form caveat surfaced in the report's Notes column.
    notes: str = ""
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ── Data loading ───────────────────────────────────────────────────────────────
|
|
73
|
+
|
|
74
|
+
def load_numeric_csv(path: Path) -> tuple["np.ndarray", list[str]]:
    """Load CSV, drop non-numeric and label columns, return (X, feature_names).

    A column counts as numeric if every non-empty cell parses as float.
    Missing/empty cells become NaN (to be imputed downstream in preprocess()).

    Fix vs. previous version: csv.DictReader fills cells missing from short
    rows with None, which the old ``row.get(col, "").strip()`` pattern turned
    into an AttributeError; None cells are now treated as empty (NaN).

    Raises:
        ValueError: if the CSV is empty or has no numeric feature columns.
    """
    with path.open(newline="", encoding="utf-8", errors="replace") as f:
        rows = list(csv.DictReader(f))
    if not rows:
        raise ValueError("CSV is empty")

    columns = list(rows[0].keys())
    # Exclude likely label columns from the feature matrix.
    label_cols = {c for c in columns if c.lower() in ("label", "class", "target", "y", "category")}

    def cell(row: dict, col: str) -> str:
        # DictReader yields None for cells absent from short rows — normalize to "".
        return (row.get(col) or "").strip()

    numeric_cols: list[str] = []
    for col in columns:
        if col in label_cols:
            continue
        try:
            for row in rows:
                value = cell(row, col)
                if value:
                    float(value)  # raises ValueError on any non-numeric cell
            numeric_cols.append(col)
        except ValueError:
            pass  # non-numeric column: skip it

    if not numeric_cols:
        raise ValueError("No numeric feature columns found")

    X = np.array(
        [
            [float(cell(row, c)) if cell(row, c) else float("nan") for c in numeric_cols]
            for row in rows
        ],
        dtype=float,
    )
    return X, numeric_cols
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def preprocess(X: "np.ndarray") -> "np.ndarray":
    """Median-impute missing values, then standardize to zero mean / unit variance."""
    imputed = SimpleImputer(strategy="median").fit_transform(X)
    return StandardScaler().fit_transform(imputed)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ── Optimal K search ───────────────────────────────────────────────────────────
|
|
113
|
+
|
|
114
|
+
def find_optimal_k(X: "np.ndarray", k_range: range) -> tuple[int, list[float], list[float]]:
    """Run K-Means for each k in *k_range* and pick the best-silhouette k.

    Returns (best_k, inertias, silhouettes); the two lists are parallel to
    *k_range*. Ties go to the smallest k (first maximum).
    """
    inertias: list[float] = []
    silhouettes: list[float] = []
    for candidate in k_range:
        model = KMeans(n_clusters=candidate, random_state=42, n_init=10)
        assignments = model.fit_predict(X)
        inertias.append(model.inertia_)
        silhouettes.append(silhouette_score(X, assignments))
    best_idx = max(range(len(silhouettes)), key=silhouettes.__getitem__)
    return list(k_range)[best_idx], inertias, silhouettes
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# ── Clustering ─────────────────────────────────────────────────────────────────
|
|
126
|
+
|
|
127
|
+
def run_clustering(X: "np.ndarray", n_clusters: int) -> list[ClusterResult]:
    """Run K-Means, Agglomerative, and DBSCAN on X and score each run.

    K-Means and Agglomerative use the requested *n_clusters*; DBSCAN chooses
    its own cluster count, with eps auto-tuned from a nearest-neighbor
    heuristic. Returns one ClusterResult per algorithm, in that order.
    """
    results: list[ClusterResult] = []

    algorithms = {
        "K-Means": KMeans(n_clusters=n_clusters, random_state=42, n_init=10),
        "Agglomerative": AgglomerativeClustering(n_clusters=n_clusters, linkage="ward"),
    }
    for name, algo in algorithms.items():
        labels = algo.fit_predict(X)
        try:
            sil = silhouette_score(X, labels)
            cal = calinski_harabasz_score(X, labels)
            dav = davies_bouldin_score(X, labels)
        except Exception:
            # Metrics can fail on degenerate labelings — report them as N/A.
            sil = cal = dav = None
        results.append(ClusterResult(
            name=name, n_clusters=n_clusters,
            silhouette=sil, calinski=cal, davies=dav, labels=labels.tolist(),
        ))

    # DBSCAN (auto eps via 5th-NN heuristic): take the 90th percentile of each
    # point's distance to its 5th neighbor as the radius. NOTE(review): since
    # the query set equals the fit set, the nearest "neighbor" is the point
    # itself, so column -1 is effectively the 4th-nearest other point — confirm
    # whether that offset is intended.
    try:
        from sklearn.neighbors import NearestNeighbors
        nn = NearestNeighbors(n_neighbors=5)
        nn.fit(X)
        distances, _ = nn.kneighbors(X)
        eps = float(np.percentile(distances[:, -1], 90))
    except Exception:
        # Fallback radius if the heuristic fails for any reason.
        eps = 0.5

    dbscan = DBSCAN(eps=eps, min_samples=5)
    db_labels = dbscan.fit_predict(X)
    unique_clusters = set(db_labels) - {-1}  # -1 is DBSCAN's noise label
    n_noise = int((db_labels == -1).sum())

    if len(unique_clusters) > 1:
        # Score only the non-noise points; the metrics need >= 2 clusters.
        mask = db_labels != -1
        try:
            sil = silhouette_score(X[mask], db_labels[mask])
            cal = calinski_harabasz_score(X[mask], db_labels[mask])
            dav = davies_bouldin_score(X[mask], db_labels[mask])
        except Exception:
            sil = cal = dav = None
        results.append(ClusterResult(
            name="DBSCAN", n_clusters=len(unique_clusters),
            silhouette=sil, calinski=cal, davies=dav,
            labels=db_labels.tolist(), n_noise=n_noise,
        ))
    else:
        # Degenerate outcome (0 or 1 clusters): record it with a hint, no scores.
        results.append(ClusterResult(
            name="DBSCAN", n_clusters=len(unique_clusters),
            silhouette=None, calinski=None, davies=None,
            labels=db_labels.tolist(), n_noise=n_noise,
            notes=f"Only {len(unique_clusters)} cluster(s) found — try adjusting eps",
        ))
    return results
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# ── Visualization ──────────────────────────────────────────────────────────────
|
|
186
|
+
|
|
187
|
+
def save_cluster_plot(X: "np.ndarray", results: list[ClusterResult], output_path: Path) -> None:
    """Save a grid of 2D PCA scatter panels, one per clustering result.

    Noise points (label -1) are drawn as gray 'x' markers; each axis label
    carries the explained-variance share of its principal component. The
    figure is written to *output_path* (format inferred from the extension).
    """
    # Project to 2D purely for visualization.
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X)
    var = pca.explained_variance_ratio_

    # Grid layout: at most 3 panels per row.
    n_plots = len(results)
    ncols = min(3, n_plots)
    nrows = (n_plots + ncols - 1) // ncols

    fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 4 * nrows), squeeze=False)
    axes = axes.flatten()

    for idx, r in enumerate(results):
        ax = axes[idx]
        labels = np.array(r.labels)
        unique = sorted(set(labels))
        colors = cm.tab10(np.linspace(0, 1, max(len(unique), 1)))

        # One scatter call per cluster so each gets its own color/legend entry.
        for i, lbl in enumerate(unique):
            mask = labels == lbl
            color = "gray" if lbl == -1 else colors[i % len(colors)]
            marker = "x" if lbl == -1 else "o"
            label = "Noise" if lbl == -1 else f"C{lbl}"
            ax.scatter(X_2d[mask, 0], X_2d[mask, 1], c=[color], marker=marker,
                       alpha=0.6, s=20, label=label)

        title = f"{r.name} (k={r.n_clusters})"
        if r.silhouette is not None:
            title += f"\nSil={r.silhouette:.3f}"
        ax.set_title(title, fontsize=9)
        ax.set_xlabel(f"PC1 ({var[0]:.1%})", fontsize=8)
        ax.set_ylabel(f"PC2 ({var[1]:.1%})", fontsize=8)
        ax.tick_params(labelsize=7)
        if r.n_clusters <= 8:
            # Skip the legend when it would be too crowded to read.
            ax.legend(fontsize=7, markerscale=1.2)

    # Blank out any unused panels in the grid.
    for idx in range(len(results), len(axes)):
        axes[idx].axis("off")

    plt.suptitle("Cluster Analysis — PCA Projection", fontsize=11, y=1.01)
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# ── Report generation ──────────────────────────────────────────────────────────
|
|
233
|
+
|
|
234
|
+
def generate_report(
    results: list[ClusterResult],
    data_path: Path,
    n_samples: int,
    n_features: int,
    k_range: range | None,
    inertias: list[float] | None,
    silhouettes_per_k: list[float] | None,
    optimal_k: int | None,
    plot_path: Path | None,
) -> str:
    """Render the clustering analysis as a markdown document.

    Sections: (A) optimal-K table when a K search was run, (B) algorithm
    comparison with quality metrics, (C) plot reference when a plot was
    saved, (D) a fixed interpretation guide. Returns the markdown text.
    """
    lines: list[str] = [
        "# Clustering Explorer Report",
        f"*Dataset: `{data_path.name}` | {n_samples} samples | {n_features} features*",
        "",
        "---",
        "",
    ]

    # Section A only exists when --find-k produced a search trace.
    if k_range and optimal_k:
        lines += [
            "## A. Optimal K Analysis (K-Means Silhouette)",
            "",
            "| K | Inertia | Silhouette Score |",
            "| ---: | ---: | ---: |",
        ]
        for k, inert, sil in zip(k_range, inertias or [], silhouettes_per_k or []):
            marker = " ←" if k == optimal_k else ""  # arrow marks the chosen K
            lines.append(f"| {k} | {inert:.1f} | {sil:.4f}{marker} |")
        lines += ["", f"**Recommended K = {optimal_k}** (highest silhouette score)", "", "---", ""]

    lines += [
        "## B. Algorithm Comparison",
        "",
        "| Algorithm | Clusters | Silhouette ↑ | Calinski-Harabasz ↑ | Davies-Bouldin ↓ | Notes |",
        "| :--- | ---: | ---: | ---: | ---: | :--- |",
    ]
    for r in results:
        # None scores render as "N/A" rather than blank cells.
        sil = f"{r.silhouette:.4f}" if r.silhouette is not None else "N/A"
        cal = f"{r.calinski:.1f}" if r.calinski is not None else "N/A"
        dav = f"{r.davies:.4f}" if r.davies is not None else "N/A"
        noise = f" ({r.n_noise} noise pts)" if r.n_noise else ""
        lines.append(f"| {r.name} | {r.n_clusters}{noise} | {sil} | {cal} | {dav} | {r.notes} |")

    # Best algorithm by silhouette
    scored = [r for r in results if r.silhouette is not None]
    if scored:
        best = max(scored, key=lambda r: r.silhouette)
        lines += ["", f"**Best algorithm by silhouette: {best.name}** (score: {best.silhouette:.4f})", ""]

    if plot_path:
        lines += [
            "---",
            "",
            "## C. Cluster Visualization",
            "",
            # NOTE(review): this f-string is empty and emits a blank line; it
            # looks like a markdown image link referencing plot_path was lost
            # upstream — confirm against the published package.
            f"",
            "",
            "*2D PCA projection. Colors indicate cluster assignments.*",
            "",
        ]

    lines += [
        "---",
        "",
        "## D. Interpretation Guide",
        "",
        "| Metric | Good Range | Meaning |",
        "| :--- | :--- | :--- |",
        "| Silhouette Score | 0.5 – 1.0 | Points are well-separated from other clusters |",
        "| Calinski-Harabasz | Higher = better | Dense, well-separated clusters |",
        "| Davies-Bouldin | 0.0 – 1.0 | Low = compact, well-separated clusters |",
        "",
        "**Next steps:**",
        "1. Use the best cluster assignments as pseudo-labels for semi-supervised training.",
        "2. Investigate outlier/noise points (DBSCAN noise) — these may be rare defects or data errors.",
        "3. If clusters align with known classes, your feature space is discriminative — good sign for DL.",
        "",
        "---",
        "*Generated by `clustering_explorer.py` — BMAD DL Lifecycle (TSK-001)*",
    ]
    return "\n".join(lines)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ── Main ───────────────────────────────────────────────────────────────────────
|
|
319
|
+
|
|
320
|
+
def main() -> int:
    """CLI entry point: load features, cluster, optionally plot, write report.

    Returns a process exit code: 0 on success, 2 on any error (missing
    dependencies, missing file, or unloadable CSV).
    """
    parser = argparse.ArgumentParser(description="Unsupervised clustering explorer for BMAD DL")
    parser.add_argument("data_csv", type=Path)
    parser.add_argument("--k", type=int, default=3, help="Number of clusters (default: 3)")
    parser.add_argument("--find-k", action="store_true", help="Search for optimal K (2–10)")
    parser.add_argument("--k-max", type=int, default=10)
    parser.add_argument("--output", type=Path, default=None)
    parser.add_argument("--plot", type=Path, default=None, help="Save cluster plot PNG")
    args = parser.parse_args()

    # Hard requirements; matplotlib is optional and only gates plotting below.
    if not HAS_SKLEARN or not HAS_NUMPY:
        print("Error: scikit-learn and numpy required. Run: pip install scikit-learn numpy",
              file=sys.stderr)
        return 2
    if not args.data_csv.exists():
        print(f"Error: File not found: {args.data_csv}", file=sys.stderr)
        return 2

    try:
        # NOTE(review): feature_names is currently unused in this function.
        X_raw, feature_names = load_numeric_csv(args.data_csv)
    except Exception as e:
        print(f"Error loading CSV: {e}", file=sys.stderr)
        return 2

    # Impute + standardize before any distance-based clustering.
    X = preprocess(X_raw)
    n_samples, n_features = X.shape
    print(f"Dataset: {n_samples} samples, {n_features} features")

    # Find optimal K
    k_range = inertias = silhouettes_k = optimal_k = None
    n_clusters = args.k
    if args.find_k:
        # K cannot exceed n_samples - 1; clamp the search range accordingly.
        k_max = min(args.k_max, n_samples - 1)
        k_range = range(2, k_max + 1)
        print(f"Searching optimal K in range 2–{k_max}...")
        optimal_k, inertias, silhouettes_k = find_optimal_k(X, k_range)
        n_clusters = optimal_k
        print(f"Optimal K = {n_clusters}")

    print(f"Running clustering with k={n_clusters}...")
    results = run_clustering(X, n_clusters)

    # Plot (best-effort: skipped with a warning when matplotlib is absent).
    plot_path: Path | None = None
    if args.plot:
        if HAS_MPL:
            save_cluster_plot(X, results, args.plot)
            plot_path = args.plot
            print(f"✓ Cluster plot: {plot_path}")
        else:
            print("⚠ matplotlib not available — skipping plot")

    report = generate_report(
        results, args.data_csv, n_samples, n_features,
        k_range, inertias, silhouettes_k, optimal_k, plot_path,
    )
    # Default output: "<input stem>_clustering_report.md" alongside the input.
    output = args.output or args.data_csv.parent / f"{args.data_csv.stem}_clustering_report.md"
    output.write_text(report, encoding="utf-8")
    print(f"✓ Report: {output}")
    return 0
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
# Script entry point: propagate main()'s exit code (0 success, 2 error) to the shell.
if __name__ == "__main__":
    sys.exit(main())
|