@nahisaho/satori 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -22
- package/package.json +1 -1
- package/src/.github/skills/scientific-adaptive-experiments/SKILL.md +287 -0
- package/src/.github/skills/scientific-federated-learning/SKILL.md +241 -0
- package/src/.github/skills/scientific-multi-task-learning/SKILL.md +238 -0
- package/src/.github/skills/scientific-neural-architecture-search/SKILL.md +206 -0
- package/src/.github/skills/scientific-radiology-ai/SKILL.md +285 -0
- package/src/.github/skills/scientific-semi-supervised-learning/SKILL.md +210 -0
- package/src/.github/skills/scientific-statistical-simulation/SKILL.md +227 -0
- package/src/.github/skills/scientific-streaming-analytics/SKILL.md +221 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-radiology-ai
|
|
3
|
+
description: |
|
|
4
|
+
放射線診断支援 AI スキル。CADe/CADx パイプライン・
|
|
5
|
+
CT/MRI 分類・セグメンテーション・Grad-CAM 説明可能性・
|
|
6
|
+
構造化レポート・AI-RADS グレーディング。
|
|
7
|
+
※ scientific-medical-imaging (DICOM/WSI/Radiomics) の
|
|
8
|
+
放射線診断 AI 特化拡張。
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Scientific Radiology AI
|
|
12
|
+
|
|
13
|
+
放射線画像(CT/MRI/X 線)に対する AI 診断支援
|
|
14
|
+
パイプラインを提供する。MONAI ベースの学習・推論・
|
|
15
|
+
説明可能性・構造化レポート生成を含む。
|
|
16
|
+
|
|
17
|
+
## When to Use
|
|
18
|
+
|
|
19
|
+
- CT/MRI/X 線画像の AI 分類・セグメンテーションを行うとき
|
|
20
|
+
- CADe (検出) / CADx (診断) パイプラインを構築するとき
|
|
21
|
+
- Grad-CAM で AI 判断の説明可能性を付与するとき
|
|
22
|
+
- 構造化放射線レポートを自動生成するとき
|
|
23
|
+
- AI-RADS スコアリングを実装するとき
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
## 1. MONAI 放射線 AI 分類パイプライン
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import numpy as np
|
|
33
|
+
import torch
|
|
34
|
+
import torch.nn as nn
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def build_radiology_classifier(in_channels=1, num_classes=2,
                               spatial_dims=3,
                               architecture="densenet121"):
    """
    Build a MONAI-based classification network for radiology images.

    Only the requested architecture is instantiated; the other candidates
    are never constructed, so their memory cost and constructor errors do
    not affect the call.

    Parameters:
        in_channels: int — number of input channels (CT=1, multimodal MRI=4)
        num_classes: int — number of output classes
        spatial_dims: int — 2 (2D slices) or 3 (3D volumes)
        architecture: str — "densenet121" / "resnet50" / "efficientnet";
            any other value falls back to "densenet121"

    Returns:
        The constructed network (an nn.Module provided by MONAI).
    """
    import monai.networks.nets as nets

    # Zero-argument factories: nothing is built until one is called.
    builders = {
        "densenet121": lambda: nets.DenseNet121(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            out_channels=num_classes),
        "resnet50": lambda: nets.ResNet(
            block="bottleneck", layers=[3, 4, 6, 3],
            block_inplanes=[64, 128, 256, 512],
            spatial_dims=spatial_dims,
            n_input_channels=in_channels,
            num_classes=num_classes),
        "efficientnet": lambda: nets.EfficientNetBN(
            "efficientnet-b0",
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            num_classes=num_classes),
    }

    builder = builders.get(architecture)
    if builder is None:
        # Keep the historical fallback, but make it visible and make the
        # summary line below report the architecture that was really built.
        print(f"Unknown architecture '{architecture}'; "
              f"falling back to densenet121")
        architecture = "densenet121"
        builder = builders[architecture]

    model = builder()
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Radiology classifier: {architecture} | "
          f"{total_params:,} params | {spatial_dims}D")
    return model
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def train_radiology_model(model, train_loader, val_loader,
                          epochs=50, lr=1e-4, device="cuda",
                          checkpoint_path="best_radiology_model.pt"):
    """
    Train a radiology classification model and track per-epoch metrics.

    Uses AdamW with cosine-annealed learning rate and cross-entropy loss.
    Whenever validation accuracy improves, the model's state_dict is saved
    to ``checkpoint_path``.

    Parameters:
        model: nn.Module — classification model
        train_loader: iterable of batches — each batch is a mapping with
            "image" and "label" tensors
        val_loader: iterable of batches — same batch layout as train_loader
        epochs: int — number of training epochs
        lr: float — learning rate
        device: str — target device; a CUDA device silently degrades to
            "cpu" (with a message) when CUDA is unavailable
        checkpoint_path: str — where the best state_dict is written

    Returns:
        pandas.DataFrame with columns epoch, train_loss, train_acc,
        val_loss, val_acc (one row per epoch). Metrics of an empty loader
        are reported as NaN instead of raising ZeroDivisionError.
    """
    import pandas as pd

    # MONAI is only needed for seeding; fall back to plain torch/numpy
    # seeding so the loop also works where MONAI is not installed.
    try:
        from monai.utils import set_determinism
    except ImportError:
        torch.manual_seed(42)
        np.random.seed(42)
    else:
        set_determinism(seed=42)

    if str(device).startswith("cuda") and not torch.cuda.is_available():
        print(f"Device '{device}' requested but CUDA is unavailable; "
              f"using CPU")
        device = "cpu"

    def _ratio(numerator, denominator):
        # NaN marks "no data" without aborting the whole run.
        return numerator / denominator if denominator else float("nan")

    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr,
                                  weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=epochs)
    criterion = nn.CrossEntropyLoss()
    history = []

    best_val_acc = 0
    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss, correct, total, train_batches = 0.0, 0, 0, 0
        for batch in train_loader:
            images = batch["image"].to(device)
            labels = batch["label"].to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += len(labels)
            train_batches += 1

        scheduler.step()

        # ---- validation pass ----
        model.eval()
        val_loss, val_correct, val_total, val_batches = 0.0, 0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                images = batch["image"].to(device)
                labels = batch["label"].to(device)
                outputs = model(images)
                val_loss += criterion(outputs, labels).item()
                val_correct += (outputs.argmax(1) == labels).sum().item()
                val_total += len(labels)
                val_batches += 1

        train_acc = _ratio(correct, total)
        val_acc = _ratio(val_correct, val_total)

        # A NaN val_acc compares False here, so no checkpoint is written
        # for an epoch without validation data.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)

        history.append({
            "epoch": epoch + 1,
            "train_loss": _ratio(train_loss, train_batches),
            "train_acc": train_acc,
            "val_loss": _ratio(val_loss, val_batches),
            "val_acc": val_acc,
        })

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}: train_acc={train_acc:.3f}, "
                  f"val_acc={val_acc:.3f}")

    print(f"Best val_acc: {best_val_acc:.4f}")
    return pd.DataFrame(history)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## 2. Grad-CAM 説明可能性
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
def radiology_gradcam(model, image_tensor, target_layer=None,
                      target_class=None, device="cuda",
                      save_path="gradcam_radiology.png"):
    """
    Compute and visualise a Grad-CAM map for a radiology image.

    Saves a three-panel figure (original, CAM, overlay) to ``save_path``.
    For volumetric input the middle slice along the first spatial axis is
    shown; for multi-channel input the first channel is shown.

    Parameters:
        model: nn.Module — trained classification model
        image_tensor: torch.Tensor — input image, [1, C, H, W] or
            [1, C, D, H, W]
        target_layer: str | None — name of the layer (as listed by
            ``model.named_modules()``) to compute the CAM on; when None,
            the last module whose name contains "features" or "layer4" is
            used, else the second-to-last registered module
        target_class: int | None — class index to explain (None = the
            model's predicted class)
        device: str — target device; falls back to "cpu" when a CUDA
            device is requested but CUDA is unavailable
        save_path: str — output path of the saved figure

    Returns:
        numpy.ndarray — the squeezed Grad-CAM map.
    """
    import matplotlib.pyplot as plt
    from monai.visualize import GradCAM

    if str(device).startswith("cuda") and not torch.cuda.is_available():
        print(f"Device '{device}' requested but CUDA is unavailable; "
              f"using CPU")
        device = "cpu"

    model.to(device).eval()
    image_tensor = image_tensor.to(device)

    if target_layer is None:
        module_names = [name for name, _ in model.named_modules()]
        # Prefer the deepest DenseNet "features" / ResNet "layer4" module.
        preferred = [name for name in module_names
                     if "features" in name or "layer4" in name]
        target_layer = preferred[-1] if preferred else module_names[-2]

    cam = GradCAM(nn_module=model, target_layers=target_layer)

    if target_class is None:
        with torch.no_grad():
            target_class = model(image_tensor).argmax(1).item()

    result = cam(x=image_tensor, class_idx=target_class)
    # detach() guards against a result that is still attached to the graph.
    cam_map = result.detach().squeeze().cpu().numpy()

    # Select batch item 0 / channel 0 explicitly: squeeze() alone leaves the
    # channel axis in place for multi-channel input, which made the slice
    # index below address channels instead of depth.
    image_np = image_tensor[0, 0].detach().cpu().numpy()

    # 2D slice for display
    if cam_map.ndim == 3:
        mid_slice = cam_map.shape[0] // 2
        cam_map_2d = cam_map[mid_slice]
        img_2d = image_np[mid_slice]
    else:
        cam_map_2d = cam_map
        img_2d = image_np

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    axes[0].imshow(img_2d, cmap="gray")
    axes[0].set_title("Original")
    axes[1].imshow(cam_map_2d, cmap="jet")
    axes[1].set_title(f"Grad-CAM (class={target_class})")
    axes[2].imshow(img_2d, cmap="gray")
    axes[2].imshow(cam_map_2d, cmap="jet", alpha=0.4)
    axes[2].set_title("Overlay")
    for ax in axes:
        ax.axis("off")
    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches="tight")
    # Release the figure; repeated calls would otherwise accumulate figures.
    plt.close(fig)
    print(f"Grad-CAM saved → {save_path} (class={target_class})")
    return cam_map
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## 3. 構造化放射線レポート
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
def _ai_rads_grade(prob):
    """Map a probability to the 1-5 grade used in the report table."""
    # Thresholds are checked from highest to lowest; first match wins.
    for lower_bound, grade in ((0.9, 5), (0.7, 4), (0.5, 3), (0.3, 2)):
        if prob >= lower_bound:
            return grade
    return 1


def generate_structured_report(predictions, patient_info=None,
                               modality="CT", body_part="Chest",
                               output_path=None):
    """
    Generate an AI-assisted structured radiology report in Markdown.

    Findings with probability >= 0.5 are listed under "AI Findings"
    (>= 0.8 is labelled High confidence, otherwise Moderate). Every
    finding appears in the summary table, sorted by descending
    probability, together with its 1-5 grade.

    Parameters:
        predictions: dict — maps finding name (str) to probability (float),
            e.g. {"Nodule": 0.92, "Effusion": 0.12}
        patient_info: dict | None — optional "id" / "age" / "sex" entries;
            missing entries are rendered as "N/A"
        modality: str — "CT" / "MRI" / "XR"
        body_part: str — examined body part
        output_path: str | None — when given, the report is also written
            to this file as UTF-8

    Returns:
        str — the Markdown report (also printed to stdout).
    """
    if patient_info is None:
        patient_info = {"id": "ANON", "age": "N/A", "sex": "N/A"}

    findings = [
        f"- {finding}: {prob:.1%} "
        f"({'High' if prob >= 0.8 else 'Moderate'} confidence)"
        for finding, prob in predictions.items()
        if prob >= 0.5
    ]
    findings_text = (chr(10).join(findings) if findings
                     else '- No significant findings detected')

    ranked = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
    table_rows = [
        f"| {finding} | {prob:.1%} | {_ai_rads_grade(prob)} |\n"
        for finding, prob in ranked
    ]

    # Assemble once with join instead of repeated string concatenation.
    report = "".join([
        "## Structured Radiology Report (AI-Assisted)\n",
        "\n",
        f"**Patient**: {patient_info.get('id', 'N/A')} | "
        f"Age: {patient_info.get('age', 'N/A')} | "
        f"Sex: {patient_info.get('sex', 'N/A')}\n",
        f"**Modality**: {modality} | **Body Part**: {body_part}\n",
        "\n",
        "### AI Findings\n",
        "\n",
        f"{findings_text}\n",
        "\n",
        "### AI Confidence Summary\n",
        "\n",
        "| Finding | Probability | AI-RADS |\n",
        "|---------|:-----------:|:-------:|\n",
        *table_rows,
        "\n",
        "### Disclaimer\n",
        "> This report was generated with AI assistance and requires\n",
        "> review by a qualified radiologist before clinical use.\n",
    ])

    if output_path is not None:
        with open(output_path, "w", encoding="utf-8") as fh:
            fh.write(report)

    print(report)
    return report
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## パイプライン統合
|
|
270
|
+
|
|
271
|
+
```
|
|
272
|
+
[DICOM 取得] → medical-imaging → radiology-ai → clinical-report
|
|
273
|
+
(前処理/Radiomics) (AI 診断) (臨床レポート)
|
|
274
|
+
│
|
|
275
|
+
explainable-ai ← deep-learning
|
|
276
|
+
(説明可能性) (基盤学習)
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
## パイプライン出力
|
|
280
|
+
|
|
281
|
+
| ファイル | 説明 | 次スキル |
|
|
282
|
+
|---------|------|---------|
|
|
283
|
+
| `best_radiology_model.pt` | 学習済み分類モデル | → 推論 |
|
|
284
|
+
| `gradcam_radiology.png` | Grad-CAM 可視化 | → レポート |
|
|
285
|
+
| `structured_report.md` | 構造化レポート | → clinical-report |
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-semi-supervised-learning
|
|
3
|
+
description: |
|
|
4
|
+
半教師あり学習スキル。Self-Training・Label Propagation・
|
|
5
|
+
MixMatch/FixMatch・Pseudo-Labeling・ラベル効率評価。
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Scientific Semi-Supervised Learning
|
|
9
|
+
|
|
10
|
+
少量のラベル付きデータと大量の未ラベルデータを活用する
|
|
11
|
+
半教師あり学習パイプラインを提供する。
|
|
12
|
+
|
|
13
|
+
## When to Use
|
|
14
|
+
|
|
15
|
+
- ラベル付きデータが少量しかないとき
|
|
16
|
+
- アノテーションコストが高く全量ラベリングが困難なとき
|
|
17
|
+
- Self-Training で反復的にラベルを拡張するとき
|
|
18
|
+
- グラフベースの Label Propagation を適用するとき
|
|
19
|
+
- Pseudo-Labeling の信頼度閾値を設計するとき
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
## 1. Self-Training パイプライン
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import numpy as np
|
|
29
|
+
import pandas as pd
|
|
30
|
+
from sklearn.base import clone
|
|
31
|
+
from sklearn.metrics import accuracy_score, classification_report
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def self_training_pipeline(X_labeled, y_labeled, X_unlabeled,
                           base_estimator=None, threshold=0.95,
                           max_iterations=10, batch_size=None,
                           X_test=None, y_test=None):
    """
    Semi-supervised self-training with confidence-thresholded pseudo-labels.

    Each iteration fits a fresh clone of ``base_estimator`` on the current
    training set, predicts the unlabeled pool, and moves the samples whose
    top class probability is >= ``threshold`` (optionally only the
    ``batch_size`` most confident ones) into the training set with their
    predicted label. Iteration stops when the pool is empty, when no sample
    is confident enough, or after ``max_iterations``.

    Parameters:
        X_labeled: np.ndarray — labeled features
        y_labeled: np.ndarray — labels
        X_unlabeled: np.ndarray — unlabeled features
        base_estimator: sklearn estimator | None — base classifier with
            ``predict_proba``; defaults to GradientBoostingClassifier
            (n_estimators=100, random_state=42)
        threshold: float — minimum confidence to accept a pseudo-label
        max_iterations: int — maximum number of iterations
        batch_size: int | None — cap on samples added per iteration
        X_test: np.ndarray | None — test features
        y_test: np.ndarray | None — test labels

    Returns:
        (final_model, history) — a classifier refit on the expanded
        training set, and a pandas.DataFrame with one row per iteration
        (iteration, n_labeled, n_pool, n_added, mean_confidence and, when
        test data is given, test_accuracy).
    """
    if base_estimator is None:
        from sklearn.ensemble import GradientBoostingClassifier
        base_estimator = GradientBoostingClassifier(
            n_estimators=100, random_state=42)

    # The arrays are only ever rebound, never mutated in place, so the
    # caller's data stays untouched.
    X_train = np.asarray(X_labeled)
    y_train = np.asarray(y_labeled)
    X_pool = np.asarray(X_unlabeled)
    history = []

    for iteration in range(max_iterations):
        if len(X_pool) == 0:
            print(f"Iteration {iteration}: Pool exhausted")
            break

        model = clone(base_estimator)
        model.fit(X_train, y_train)
        proba = model.predict_proba(X_pool)
        max_proba = proba.max(axis=1)
        # predict_proba columns follow model.classes_; map the column index
        # back to the real label so non 0..k-1 labels are not corrupted.
        pseudo_labels = model.classes_[proba.argmax(axis=1)]

        confident_mask = max_proba >= threshold
        n_confident = confident_mask.sum()

        if batch_size and n_confident > batch_size:
            # More confident samples than allowed: keep only the top ones.
            top_idx = np.argsort(max_proba)[-batch_size:]
            confident_mask = np.zeros(len(X_pool), dtype=bool)
            confident_mask[top_idx] = True
            n_confident = batch_size

        if n_confident == 0:
            print(f"Iteration {iteration}: No confident samples")
            break

        X_train = np.vstack([X_train, X_pool[confident_mask]])
        y_train = np.concatenate([
            y_train, pseudo_labels[confident_mask]])
        X_pool = X_pool[~confident_mask]

        record = {"iteration": iteration,
                  "n_labeled": len(X_train),
                  "n_pool": len(X_pool),
                  "n_added": int(n_confident),
                  "mean_confidence": float(max_proba[confident_mask].mean())}

        if X_test is not None and y_test is not None:
            # Scored with this iteration's model, i.e. before the samples
            # added above have been trained on.
            test_acc = accuracy_score(y_test, model.predict(X_test))
            record["test_accuracy"] = test_acc

        history.append(record)
        print(f"Iter {iteration}: +{n_confident} samples, "
              f"total={len(X_train)}, pool={len(X_pool)}")

    final_model = clone(base_estimator)
    final_model.fit(X_train, y_train)
    return final_model, pd.DataFrame(history)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## 2. Label Propagation
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
def label_propagation_ssl(X_all, y_partial, kernel="rbf",
                          gamma=20, n_neighbors=7,
                          max_iter=1000):
    """
    Graph-based label inference with LabelPropagation and LabelSpreading.

    Both scikit-learn models are fitted on the same data and their
    transductive predictions are returned side by side.

    Parameters:
        X_all: np.ndarray — features of all samples (labeled + unlabeled)
        y_partial: array-like — labels, with -1 marking unlabeled samples
        kernel: str — "rbf" / "knn"
        gamma: float — gamma of the RBF kernel
        n_neighbors: int — k of the KNN kernel
        max_iter: int — maximum number of iterations

    Returns:
        dict keyed by "propagation" and "spreading"; each value is a dict
        with "model", "predictions" (``transduction_``), "n_propagated"
        (count of -1 entries in ``y_partial``) and "label_distributions".
    """
    from sklearn.semi_supervised import (
        LabelPropagation, LabelSpreading)

    # Coerce once so plain lists work, and count the unlabeled samples a
    # single time: the count does not depend on the model.
    y_partial = np.asarray(y_partial)
    n_propagated = int((y_partial == -1).sum())

    models = {
        "propagation": LabelPropagation(
            kernel=kernel, gamma=gamma,
            n_neighbors=n_neighbors, max_iter=max_iter),
        "spreading": LabelSpreading(
            kernel=kernel, gamma=gamma,
            n_neighbors=n_neighbors, max_iter=max_iter, alpha=0.2),
    }

    results = {}
    for name, model in models.items():
        model.fit(X_all, y_partial)
        results[name] = {
            "model": model,
            "predictions": model.transduction_,
            "n_propagated": n_propagated,
            "label_distributions": model.label_distributions_,
        }
        print(f"{name}: propagated {n_propagated} labels")

    return results
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## 3. Pseudo-Labeling 品質評価
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
def evaluate_pseudo_labels(y_true_unlabeled, pseudo_labels,
                           confidences, thresholds=None):
    """
    Evaluate pseudo-label quality across confidence thresholds.

    For every threshold, the samples with confidence >= threshold are
    selected and their pseudo-labels are compared with the true labels.
    Thresholds that select no sample are skipped.

    Parameters:
        y_true_unlabeled: array-like — true labels (for evaluation only),
            one label per sample
        pseudo_labels: array-like — predicted pseudo-labels
        confidences: array-like — confidence of each prediction
        thresholds: list[float] | None — thresholds to evaluate; defaults
            to [0.5, 0.7, 0.8, 0.9, 0.95, 0.99]

    Returns:
        pandas.DataFrame with columns threshold, n_selected, coverage,
        pseudo_accuracy (the columns are present even when no threshold
        selected any sample).
    """
    if thresholds is None:
        thresholds = [0.5, 0.7, 0.8, 0.9, 0.95, 0.99]

    # Coerce so plain Python lists support masking and comparison.
    y_true = np.asarray(y_true_unlabeled)
    y_pseudo = np.asarray(pseudo_labels)
    conf = np.asarray(confidences)

    records = []
    for t in thresholds:
        mask = conf >= t
        if mask.sum() == 0:
            continue
        # Fraction of exact matches among the selected samples.
        acc = float(np.mean(y_true[mask] == y_pseudo[mask]))
        records.append({
            "threshold": t,
            "n_selected": int(mask.sum()),
            "coverage": float(mask.mean()),
            "pseudo_accuracy": acc,
        })
        print(f"τ={t:.2f}: {mask.sum()} samples, "
              f"coverage={mask.mean():.1%}, acc={acc:.3f}")

    return pd.DataFrame(
        records,
        columns=["threshold", "n_selected", "coverage", "pseudo_accuracy"])
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## パイプライン統合
|
|
195
|
+
|
|
196
|
+
```
|
|
197
|
+
[少量ラベル] → semi-supervised-learning → ml-classification
|
|
198
|
+
(ラベル拡張) (本分類)
|
|
199
|
+
│
|
|
200
|
+
active-learning ← data-profiling
|
|
201
|
+
(能動学習) (データ品質)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## パイプライン出力
|
|
205
|
+
|
|
206
|
+
| ファイル | 説明 | 次スキル |
|
|
207
|
+
|---------|------|---------|
|
|
208
|
+
| `self_training_history.csv` | 反復学習履歴 | → 収束分析 |
|
|
209
|
+
| `pseudo_label_quality.csv` | 疑似ラベル品質 | → 閾値選択 |
|
|
210
|
+
| `propagated_labels.npy` | 伝播ラベル | → ml-classification |
|