@nahisaho/satori 0.23.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -29
- package/package.json +1 -1
- package/src/.github/skills/scientific-adaptive-experiments/SKILL.md +287 -0
- package/src/.github/skills/scientific-anomaly-detection/SKILL.md +296 -0
- package/src/.github/skills/scientific-causal-ml/SKILL.md +240 -0
- package/src/.github/skills/scientific-data-profiling/SKILL.md +247 -0
- package/src/.github/skills/scientific-federated-learning/SKILL.md +241 -0
- package/src/.github/skills/scientific-geospatial-analysis/SKILL.md +274 -0
- package/src/.github/skills/scientific-model-monitoring/SKILL.md +247 -0
- package/src/.github/skills/scientific-multi-task-learning/SKILL.md +238 -0
- package/src/.github/skills/scientific-network-visualization/SKILL.md +278 -0
- package/src/.github/skills/scientific-neural-architecture-search/SKILL.md +206 -0
- package/src/.github/skills/scientific-radiology-ai/SKILL.md +285 -0
- package/src/.github/skills/scientific-reproducible-reporting/SKILL.md +330 -0
- package/src/.github/skills/scientific-semi-supervised-learning/SKILL.md +210 -0
- package/src/.github/skills/scientific-statistical-simulation/SKILL.md +227 -0
- package/src/.github/skills/scientific-streaming-analytics/SKILL.md +221 -0
- package/src/.github/skills/scientific-time-series-forecasting/SKILL.md +246 -0
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-reproducible-reporting
|
|
3
|
+
description: |
|
|
4
|
+
再現可能レポーティングスキル。Quarto 科学文書・
|
|
5
|
+
Jupyter Book 多章構成・Papermill パラメトリック実行・
|
|
6
|
+
nbconvert 自動変換・Sphinx-Gallery コード例ドキュメント。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Reproducible Reporting
|
|
10
|
+
|
|
11
|
+
再現可能な科学レポート・文書生成パイプラインを提供し、
|
|
12
|
+
コード → 実行 → 文書化の自動化を実現する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- Quarto で再現可能な科学文書を作成するとき
|
|
17
|
+
- Jupyter Book で多章構成の文書を構築するとき
|
|
18
|
+
- Papermill でパラメトリック実行を自動化するとき
|
|
19
|
+
- nbconvert でノートブックを各種形式に変換するとき
|
|
20
|
+
- CI/CD で解析レポートを自動生成するとき
|
|
21
|
+
- 複数パラメータセットで解析を繰り返し実行するとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. Quarto 科学文書テンプレート
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import os
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def generate_quarto_document(title="Scientific Analysis Report",
                             author="SATORI",
                             format_type="html",
                             output_dir="quarto_project"):
    """
    Generate a Quarto scientific-document project skeleton.

    Writes three files into ``output_dir`` (created if missing):
    ``_quarto.yml`` (project configuration), ``report.qmd`` (main document
    with executable Python cells) and ``references.bib`` (one sample entry).
    Existing files with those names are overwritten.

    Parameters:
        title: str — document title, inserted into the report front matter
        author: str — author name, inserted into the report front matter
        format_type: str — "html" / "pdf" / "docx" / "revealjs"
        output_dir: str — directory that receives the generated files

    Returns:
        str — ``output_dir`` as passed in.

    Note:
        The generated ``_quarto.yml`` references ``nature.csl``, which this
        function does not create. Presumably it has to be supplied (or the
        ``csl`` line removed) before rendering — confirm against your
        Quarto setup.
    """
    os.makedirs(output_dir, exist_ok=True)

    # _quarto.yml
    quarto_config = f"""project:
  type: default
  output-dir: _output

format:
  {format_type}:
    toc: true
    toc-depth: 3
    number-sections: true
    code-fold: true
    code-tools: true
    theme: cosmo

execute:
  echo: true
  warning: false
  cache: true

bibliography: references.bib
csl: nature.csl
"""

    # Main document. Doubled braces are literal braces in the output.
    main_qmd = f"""---
title: "{title}"
author: "{author}"
date: today
format:
  {format_type}:
    code-fold: true
    code-tools: true
jupyter: python3
---

## はじめに

このレポートは SATORI スキルを用いた再現可能な科学解析文書です。

```{{python}}
#| label: setup
#| echo: false

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# パラメータ (Papermill 互換)
n_samples = 1000
random_seed = 42
```

## データ概要

```{{python}}
#| label: data-summary
#| tbl-cap: "データセット概要"

np.random.seed(random_seed)
df = pd.DataFrame({{
    "x": np.random.randn(n_samples),
    "y": np.random.randn(n_samples),
    "group": np.random.choice(["A", "B", "C"], n_samples)
}})
df.describe()
```

## 可視化

```{{python}}
#| label: fig-scatter
#| fig-cap: "散布図"

fig, ax = plt.subplots(figsize=(8, 6))
for g, sub in df.groupby("group"):
    ax.scatter(sub["x"], sub["y"], label=g, alpha=0.6, s=20)
ax.legend()
ax.set_xlabel("X")
ax.set_ylabel("Y")
plt.show()
```

## 結論

解析結果のサマリーを記載する。

## References
"""

    # references.bib (placeholder entry so the bibliography key resolves)
    bib_template = """@article{example2024,
  title={Example Reference},
  author={Author, A.},
  journal={Journal},
  year={2024}
}
"""

    # The templates contain non-ASCII text, so pin UTF-8 instead of relying
    # on the platform's default encoding.
    with open(os.path.join(output_dir, "_quarto.yml"), "w",
              encoding="utf-8") as f:
        f.write(quarto_config)
    with open(os.path.join(output_dir, "report.qmd"), "w",
              encoding="utf-8") as f:
        f.write(main_qmd)
    with open(os.path.join(output_dir, "references.bib"), "w",
              encoding="utf-8") as f:
        f.write(bib_template)

    print(f"Quarto project → {output_dir}/")
    print(f"  Build: cd {output_dir} && quarto render report.qmd")
    return output_dir
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## 2. Papermill パラメトリック実行
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
def papermill_parametric_run(template_notebook, output_dir,
                             parameter_sets, kernel="python3"):
    """
    Execute one template notebook once per parameter set via Papermill.

    Each run writes ``run_NNN.ipynb`` (zero-padded run index) into
    ``output_dir``. A failing run does not stop the batch; its error
    message is recorded in the ``status`` column instead.

    Parameters:
        template_notebook: str — path of the template notebook
        output_dir: str — directory for executed notebooks (created if missing)
        parameter_sets: iterable of dict — one parameter dict per run
        kernel: str — kernel name passed to Papermill

    Returns:
        pandas.DataFrame — one row per run with columns ``run``, ``params``,
        ``output`` and ``status`` ("success" or "error: <message>"). The
        columns are present even when ``parameter_sets`` is empty.
    """
    import os

    import pandas as pd
    import papermill as pm

    os.makedirs(output_dir, exist_ok=True)
    results = []

    for i, params in enumerate(parameter_sets):
        output_path = os.path.join(output_dir, f"run_{i:03d}.ipynb")
        try:
            pm.execute_notebook(
                template_notebook,
                output_path,
                parameters=params,
                kernel_name=kernel)
        except Exception as e:
            # Deliberately broad: one failing parameter set must not abort
            # the remaining runs; the failure is reported in the result row.
            status = f"error: {str(e)}"
        else:
            status = "success"
        results.append({
            "run": i, "params": params,
            "output": output_path, "status": status})

    # Explicit columns keep the "status" lookup valid when no run happened.
    results_df = pd.DataFrame(
        results, columns=["run", "params", "output", "status"])
    n_success = (results_df["status"] == "success").sum()
    print(f"Papermill: {n_success}/{len(results_df)} runs succeeded")
    return results_df
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## 3. Jupyter Book 多章構成
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
def generate_jupyter_book(title="Scientific Analysis Book",
                          chapters=None,
                          output_dir="jupyter_book"):
    """
    Generate a Jupyter Book project skeleton.

    Writes ``_config.yml`` and ``_toc.yml`` into ``output_dir`` (created if
    missing) and a stub ``<file>.md`` for every chapter that does not
    already exist on disk. Existing chapter files are left untouched.

    The first chapter becomes the book root in ``_toc.yml``; the remaining
    chapters are listed under ``chapters:``. The root is never repeated in
    that list, so each document appears in the table of contents once.

    Parameters:
        title: str — book title
        chapters: list[dict] | None — chapter info
            [{"title": ..., "file": ...}]; ``None`` or an empty list
            selects the built-in five-chapter layout
        output_dir: str — directory that receives the generated files

    Returns:
        str — ``output_dir`` as passed in.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)

    if not chapters:
        chapters = [
            {"title": "Introduction", "file": "intro"},
            {"title": "Data Loading", "file": "ch01_data"},
            {"title": "Exploratory Analysis", "file": "ch02_eda"},
            {"title": "Modeling", "file": "ch03_model"},
            {"title": "Results", "file": "ch04_results"},
        ]

    # _config.yml
    config = f"""title: "{title}"
author: SATORI
execute:
  execute_notebooks: auto
  timeout: 600
repository:
  url: ""
launch_buttons:
  binderhub_url: ""
sphinx:
  extra_extensions:
    - sphinx_proof
"""

    # _toc.yml — the root must match a real chapter file and must not be
    # listed again among the chapters.
    root_file = chapters[0]["file"]
    toc_lines = ["format: jb-book", f"root: {root_file}"]
    other_files = [ch["file"] for ch in chapters[1:]
                   if ch["file"] != root_file]
    if other_files:
        # Omit the key entirely for a single-chapter book rather than
        # emitting an empty "chapters:" mapping value.
        toc_lines.append("chapters:")
        toc_lines.extend(f"  - file: {name}" for name in other_files)
    toc = "\n".join(toc_lines) + "\n"

    with open(os.path.join(output_dir, "_config.yml"), "w",
              encoding="utf-8") as f:
        f.write(config)
    with open(os.path.join(output_dir, "_toc.yml"), "w",
              encoding="utf-8") as f:
        f.write(toc)

    # Stub file per chapter; never overwrite content the user already wrote.
    for ch in chapters:
        filepath = os.path.join(output_dir, f"{ch['file']}.md")
        if not os.path.exists(filepath):
            content = f"# {ch['title']}\n\nThis chapter covers {ch['title'].lower()}.\n"
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(content)

    print(f"Jupyter Book → {output_dir}/")
    print(f"  Build: jupyter-book build {output_dir}")
    return output_dir
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## 4. nbconvert 自動変換
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
def batch_convert_notebooks(notebook_dir, output_format="html",
                            output_dir=None, execute=True):
    """
    Convert every ``*.ipynb`` directly inside a directory with nbconvert.

    Notebooks are processed in sorted path order by invoking
    ``jupyter nbconvert`` as a subprocess, one call per notebook. A failed
    conversion is recorded in the result table and does not stop the batch.

    Parameters:
        notebook_dir: str — directory containing the notebooks
        output_format: str — "html" / "pdf" / "markdown" / "script"
        output_dir: str | None — destination directory (created if missing);
            ``None`` passes no ``--output-dir`` to nbconvert
        execute: bool — pass ``--execute`` so notebooks run before conversion

    Returns:
        pandas.DataFrame — one row per notebook with columns ``notebook``
        and ``status`` ("success" or "error: <first 100 chars of stderr>").
        The columns are present even when no notebook was found.
    """
    import glob
    import os
    import subprocess

    import pandas as pd

    notebooks = sorted(glob.glob(os.path.join(notebook_dir, "*.ipynb")))
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Everything except the notebook path is identical for each call.
    base_cmd = ["jupyter", "nbconvert", f"--to={output_format}"]
    if execute:
        base_cmd.append("--execute")
    if output_dir:
        base_cmd.extend(["--output-dir", output_dir])

    results = []
    for nb_path in notebooks:
        try:
            subprocess.run([*base_cmd, nb_path], check=True,
                           capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            status = f"error: {e.stderr[:100]}"
        else:
            status = "success"
        results.append({"notebook": nb_path, "status": status})

    # Explicit columns keep the "status" lookup valid for an empty directory.
    results_df = pd.DataFrame(results, columns=["notebook", "status"])
    n_ok = (results_df["status"] == "success").sum()
    print(f"nbconvert ({output_format}): {n_ok}/{len(notebooks)} converted")
    return results_df
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
## パイプライン統合
|
|
315
|
+
|
|
316
|
+
```
|
|
317
|
+
[解析完了] → reproducible-reporting → presentation-design
|
|
318
|
+
(レポート自動生成) (プレゼン作成)
|
|
319
|
+
│ ↓
|
|
320
|
+
interactive-dashboard academic-writing
|
|
321
|
+
(ダッシュボード) (論文執筆)
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
## パイプライン出力
|
|
325
|
+
|
|
326
|
+
| ファイル | 説明 | 次スキル |
|
|
327
|
+
|---------|------|---------|
|
|
328
|
+
| `quarto_project/` | Quarto プロジェクト | → quarto render |
|
|
329
|
+
| `papermill_runs/` | パラメトリック実行結果 | → 集計 |
|
|
330
|
+
| `jupyter_book/` | Jupyter Book プロジェクト | → jb build |
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-semi-supervised-learning
|
|
3
|
+
description: |
|
|
4
|
+
半教師あり学習スキル。Self-Training・Label Propagation・
|
|
5
|
+
MixMatch/FixMatch・Pseudo-Labeling・ラベル効率評価。
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Scientific Semi-Supervised Learning
|
|
9
|
+
|
|
10
|
+
少量のラベル付きデータと大量の未ラベルデータを活用する
|
|
11
|
+
半教師あり学習パイプラインを提供する。
|
|
12
|
+
|
|
13
|
+
## When to Use
|
|
14
|
+
|
|
15
|
+
- ラベル付きデータが少量しかないとき
|
|
16
|
+
- アノテーションコストが高く全量ラベリングが困難なとき
|
|
17
|
+
- Self-Training で反復的にラベルを拡張するとき
|
|
18
|
+
- グラフベースの Label Propagation を適用するとき
|
|
19
|
+
- Pseudo-Labeling の信頼度閾値を設計するとき
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
## 1. Self-Training パイプライン
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import numpy as np
|
|
29
|
+
import pandas as pd
|
|
30
|
+
from sklearn.base import clone
|
|
31
|
+
from sklearn.metrics import accuracy_score, classification_report
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def self_training_pipeline(X_labeled, y_labeled, X_unlabeled,
                           base_estimator=None, threshold=0.95,
                           max_iterations=10, batch_size=None,
                           X_test=None, y_test=None):
    """
    Self-training semi-supervised learning.

    In each iteration a fresh clone of the base estimator is fitted on the
    current training set, the unlabeled pool is scored, and samples whose
    top class probability is at least ``threshold`` are moved into the
    training set with their predicted class as label. The loop stops when
    the pool is empty, when no sample is confident enough, or after
    ``max_iterations``. A final clone is then fitted on the enlarged set.

    Parameters:
        X_labeled: np.ndarray — labeled features
        y_labeled: np.ndarray — labels
        X_unlabeled: np.ndarray — unlabeled features
        base_estimator: sklearn estimator | None — base classifier; must
            provide ``predict_proba`` and ``classes_``. ``None`` uses
            ``GradientBoostingClassifier(n_estimators=100, random_state=42)``
        threshold: float — minimum top-class probability to accept a
            pseudo-label
        max_iterations: int — maximum number of iterations
        batch_size: int | None — cap on samples added per iteration; when
            more samples pass the threshold, the most confident are kept
        X_test: np.ndarray | None — test features
        y_test: np.ndarray | None — test labels

    Returns:
        tuple — ``(final_model, history)`` where ``history`` is a
        pandas.DataFrame with one row per completed iteration
        (``iteration``, ``n_labeled``, ``n_pool``, ``n_added``,
        ``mean_confidence`` and, when test data is given,
        ``test_accuracy`` of that iteration's model).
    """
    from sklearn.ensemble import GradientBoostingClassifier

    if base_estimator is None:
        base_estimator = GradientBoostingClassifier(
            n_estimators=100, random_state=42)

    X_train = X_labeled.copy()
    y_train = y_labeled.copy()
    X_pool = X_unlabeled.copy()
    history = []

    for iteration in range(max_iterations):
        if len(X_pool) == 0:
            print(f"Iteration {iteration}: Pool exhausted")
            break

        model = clone(base_estimator)
        model.fit(X_train, y_train)
        proba = model.predict_proba(X_pool)
        max_proba = proba.max(axis=1)
        # predict_proba columns follow model.classes_, so the argmax column
        # index has to be mapped back to the actual class label; otherwise
        # labels that are not exactly 0..K-1 would be corrupted.
        pseudo_labels = model.classes_[proba.argmax(axis=1)]

        confident_mask = max_proba >= threshold
        n_confident = confident_mask.sum()

        if batch_size and n_confident > batch_size:
            # More candidates than allowed: keep only the batch_size most
            # confident ones (all of them are above the threshold here).
            top_idx = np.argsort(max_proba)[-batch_size:]
            confident_mask = np.zeros(len(X_pool), dtype=bool)
            confident_mask[top_idx] = True
            n_confident = batch_size

        if n_confident == 0:
            print(f"Iteration {iteration}: No confident samples")
            break

        X_train = np.vstack([X_train, X_pool[confident_mask]])
        y_train = np.concatenate([
            y_train, pseudo_labels[confident_mask]])
        X_pool = X_pool[~confident_mask]

        record = {"iteration": iteration,
                  "n_labeled": len(X_train),
                  "n_pool": len(X_pool),
                  "n_added": int(n_confident),
                  "mean_confidence": float(max_proba[confident_mask].mean())}

        if X_test is not None and y_test is not None:
            # Accuracy of the model fitted before this iteration's additions.
            test_acc = accuracy_score(y_test, model.predict(X_test))
            record["test_accuracy"] = test_acc

        history.append(record)
        print(f"Iter {iteration}: +{n_confident} samples, "
              f"total={len(X_train)}, pool={len(X_pool)}")

    final_model = clone(base_estimator)
    final_model.fit(X_train, y_train)
    return final_model, pd.DataFrame(history)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## 2. Label Propagation
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
def label_propagation_ssl(X_all, y_partial, kernel="rbf",
                          gamma=20, n_neighbors=7,
                          max_iter=1000):
    """
    Graph-based label propagation.

    Fits both ``LabelPropagation`` and ``LabelSpreading`` (the latter with
    ``alpha=0.2``) on the same data and reports each model's transductive
    labels.

    Parameters:
        X_all: np.ndarray — features of all samples (labeled + unlabeled)
        y_partial: np.ndarray — labels, with -1 marking unlabeled samples
        kernel: str — "rbf" / "knn"
        gamma: float — gamma of the RBF kernel
        n_neighbors: int — k of the KNN kernel
        max_iter: int — maximum number of iterations

    Returns:
        dict — keyed by "propagation" and "spreading"; each value holds
        ``model``, ``predictions``, ``n_propagated`` and
        ``label_distributions``.
    """
    from sklearn.semi_supervised import (
        LabelPropagation, LabelSpreading)

    # Settings common to both estimators.
    shared = dict(kernel=kernel, gamma=gamma,
                  n_neighbors=n_neighbors, max_iter=max_iter)
    candidates = (
        ("propagation", LabelPropagation(**shared)),
        ("spreading", LabelSpreading(alpha=0.2, **shared)),
    )

    summary = {}
    for label, estimator in candidates:
        estimator.fit(X_all, y_partial)
        # Number of samples that entered without a label.
        unlabeled_count = (y_partial == -1).sum()
        summary[label] = dict(
            model=estimator,
            predictions=estimator.transduction_,
            n_propagated=int(unlabeled_count),
            label_distributions=estimator.label_distributions_,
        )
        print(f"{label}: propagated {unlabeled_count} labels")

    return summary
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## 3. Pseudo-Labeling 品質評価
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
def evaluate_pseudo_labels(y_true_unlabeled, pseudo_labels,
                           confidences, thresholds=None):
    """
    Evaluate pseudo-label quality across confidence thresholds.

    For each threshold, the samples whose confidence is at least that
    value are selected and the accuracy of their pseudo-labels against the
    true labels is computed. Thresholds that select no sample are skipped.

    Parameters:
        y_true_unlabeled: np.ndarray — true labels (for evaluation)
        pseudo_labels: np.ndarray — predicted pseudo-labels
        confidences: np.ndarray — confidence of each prediction
        thresholds: list[float] | None — thresholds to evaluate;
            ``None`` uses [0.5, 0.7, 0.8, 0.9, 0.95, 0.99]

    Returns:
        pandas.DataFrame — one row per evaluated threshold with
        ``threshold``, ``n_selected``, ``coverage`` and ``pseudo_accuracy``.
    """
    cutoffs = [0.5, 0.7, 0.8, 0.9, 0.95, 0.99] if thresholds is None else thresholds

    rows = []
    for tau in cutoffs:
        selected = confidences >= tau
        n_selected = selected.sum()
        if n_selected == 0:
            # Nothing passes this cutoff, so there is no accuracy to report.
            continue

        coverage = selected.mean()
        accuracy = accuracy_score(y_true_unlabeled[selected],
                                  pseudo_labels[selected])
        rows.append({
            "threshold": tau,
            "n_selected": int(n_selected),
            "coverage": float(coverage),
            "pseudo_accuracy": accuracy,
        })
        print(f"τ={tau:.2f}: {n_selected} samples, "
              f"coverage={coverage:.1%}, acc={accuracy:.3f}")

    return pd.DataFrame(rows)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## パイプライン統合
|
|
195
|
+
|
|
196
|
+
```
|
|
197
|
+
[少量ラベル] → semi-supervised-learning → ml-classification
|
|
198
|
+
(ラベル拡張) (本分類)
|
|
199
|
+
│
|
|
200
|
+
active-learning ← data-profiling
|
|
201
|
+
(能動学習) (データ品質)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## パイプライン出力
|
|
205
|
+
|
|
206
|
+
| ファイル | 説明 | 次スキル |
|
|
207
|
+
|---------|------|---------|
|
|
208
|
+
| `self_training_history.csv` | 反復学習履歴 | → 収束分析 |
|
|
209
|
+
| `pseudo_label_quality.csv` | 疑似ラベル品質 | → 閾値選択 |
|
|
210
|
+
| `propagated_labels.npy` | 伝播ラベル | → ml-classification |
|