@nahisaho/satori 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,330 @@
1
+ ---
2
+ name: scientific-reproducible-reporting
3
+ description: |
4
+ 再現可能レポーティングスキル。Quarto 科学文書・
5
+ Jupyter Book 多章構成・Papermill パラメトリック実行・
6
+ nbconvert 自動変換・Sphinx-Gallery コード例ドキュメント。
7
+ ---
8
+
9
+ # Scientific Reproducible Reporting
10
+
11
+ 再現可能な科学レポート・文書生成パイプラインを提供し、
12
+ コード → 実行 → 文書化の自動化を実現する。
13
+
14
+ ## When to Use
15
+
16
+ - Quarto で再現可能な科学文書を作成するとき
17
+ - Jupyter Book で多章構成の文書を構築するとき
18
+ - Papermill でパラメトリック実行を自動化するとき
19
+ - nbconvert でノートブックを各種形式に変換するとき
20
+ - CI/CD で解析レポートを自動生成するとき
21
+ - 複数パラメータセットで解析を繰り返し実行するとき
22
+
23
+ ---
24
+
25
+ ## Quick Start
26
+
27
+ ## 1. Quarto 科学文書テンプレート
28
+
29
+ ```python
30
+ import os
31
+
32
+
33
def generate_quarto_document(title="Scientific Analysis Report",
                             author="SATORI",
                             format_type="html",
                             output_dir="quarto_project"):
    """
    Generate a Quarto scientific-document project skeleton.

    Creates ``output_dir`` if needed and writes three UTF-8 files into it:
    ``_quarto.yml`` (project configuration), ``report.qmd`` (main document
    with example Python cells) and ``references.bib`` (a one-entry BibTeX
    template). Existing files with those names are overwritten.

    Parameters:
        title: str — document title
        author: str — author name
        format_type: str — "html" / "pdf" / "docx" / "revealjs"
        output_dir: str — output directory

    Returns:
        str — ``output_dir`` as passed in.
    """
    import json

    os.makedirs(output_dir, exist_ok=True)

    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # this keeps the front matter parseable when the title or author
    # contains quotes or backslashes. Plain values render exactly as before.
    yaml_title = json.dumps(str(title), ensure_ascii=False)
    yaml_author = json.dumps(str(author), ensure_ascii=False)

    # _quarto.yml
    # NOTE(review): `csl: nature.csl` is referenced here but no such file is
    # written by this function — presumably the user must supply it before
    # rendering; confirm.
    quarto_config = f"""project:
  type: default
  output-dir: _output

format:
  {format_type}:
    toc: true
    toc-depth: 3
    number-sections: true
    code-fold: true
    code-tools: true
    theme: cosmo

execute:
  echo: true
  warning: false
  cache: true

bibliography: references.bib
csl: nature.csl
"""

    # Main document. Doubled braces are literal braces in the f-string.
    main_qmd = f"""---
title: {yaml_title}
author: {yaml_author}
date: today
format:
  {format_type}:
    code-fold: true
    code-tools: true
jupyter: python3
---

## はじめに

このレポートは SATORI スキルを用いた再現可能な科学解析文書です。

```{{python}}
#| label: setup
#| echo: false

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# パラメータ (Papermill 互換)
n_samples = 1000
random_seed = 42
```

## データ概要

```{{python}}
#| label: data-summary
#| tbl-cap: "データセット概要"

np.random.seed(random_seed)
df = pd.DataFrame({{
    "x": np.random.randn(n_samples),
    "y": np.random.randn(n_samples),
    "group": np.random.choice(["A", "B", "C"], n_samples)
}})
df.describe()
```

## 可視化

```{{python}}
#| label: fig-scatter
#| fig-cap: "散布図"

fig, ax = plt.subplots(figsize=(8, 6))
for g, sub in df.groupby("group"):
    ax.scatter(sub["x"], sub["y"], label=g, alpha=0.6, s=20)
ax.legend()
ax.set_xlabel("X")
ax.set_ylabel("Y")
plt.show()
```

## 結論

解析結果のサマリーを記載する。

## References
"""

    # references.bib (placeholder entry)
    bib_template = """@article{example2024,
  title={Example Reference},
  author={Author, A.},
  journal={Journal},
  year={2024}
}
"""

    files = {
        "_quarto.yml": quarto_config,
        "report.qmd": main_qmd,
        "references.bib": bib_template,
    }
    for filename, content in files.items():
        # Explicit UTF-8: the templates contain Japanese text, and the
        # locale default encoding cannot represent it on every platform.
        with open(os.path.join(output_dir, filename), "w",
                  encoding="utf-8") as f:
            f.write(content)

    print(f"Quarto project → {output_dir}/")
    print(f" Build: cd {output_dir} && quarto render report.qmd")
    return output_dir
158
+ ```
159
+
160
+ ## 2. Papermill パラメトリック実行
161
+
162
+ ```python
163
def papermill_parametric_run(template_notebook, output_dir,
                             parameter_sets, kernel="python3"):
    """
    Execute one notebook template once per parameter set with Papermill.

    Each run is written to ``<output_dir>/run_NNN.ipynb``. A failing run is
    recorded in the result table rather than aborting the remaining runs.

    Parameters:
        template_notebook: str — path to the template notebook
        output_dir: str — output directory (created if missing)
        parameter_sets: iterable[dict] — one parameter dict per run
        kernel: str — kernel name

    Returns:
        pandas.DataFrame with columns ``run``, ``params``, ``output`` and
        ``status`` ("success" or "error: <message>"). The columns are
        present even when there are no runs.
    """
    import pandas as pd

    columns = ["run", "params", "output", "status"]
    # Materialize once so generators work and len() below is well-defined.
    parameter_sets = list(parameter_sets)

    os.makedirs(output_dir, exist_ok=True)
    results = []

    if parameter_sets:
        # Imported only when there is something to execute.
        import papermill as pm

        for i, params in enumerate(parameter_sets):
            output_path = os.path.join(output_dir, f"run_{i:03d}.ipynb")
            try:
                pm.execute_notebook(
                    template_notebook,
                    output_path,
                    parameters=params,
                    kernel_name=kernel)
                status = "success"
            except Exception as e:
                # Deliberate best effort: record the failure and continue.
                status = f"error: {e}"
            results.append({
                "run": i, "params": params,
                "output": output_path, "status": status})

    # Explicit columns keep the "status" lookup valid for an empty result.
    results_df = pd.DataFrame(results, columns=columns)
    n_success = (results_df["status"] == "success").sum()
    print(f"Papermill: {n_success}/{len(parameter_sets)} runs succeeded")
    return results_df
200
+ ```
201
+
202
+ ## 3. Jupyter Book 多章構成
203
+
204
+ ```python
205
def generate_jupyter_book(title="Scientific Analysis Book",
                          chapters=None,
                          output_dir="jupyter_book"):
    """
    Generate a Jupyter Book project skeleton.

    Writes ``_config.yml`` and ``_toc.yml`` into ``output_dir`` (created if
    missing) and a stub ``<file>.md`` for every chapter that does not
    already exist. The chapter whose file is "intro" becomes the book root;
    if there is none, the first chapter is the root. The root is listed only
    once in the table of contents.

    Parameters:
        title: str — book title
        chapters: list[dict] | None — chapter info [{"title": ..., "file": ...}];
            None or an empty list selects the built-in five-chapter layout
        output_dir: str — output directory

    Returns:
        str — ``output_dir`` as passed in.
    """
    import json

    os.makedirs(output_dir, exist_ok=True)

    if not chapters:
        chapters = [
            {"title": "Introduction", "file": "intro"},
            {"title": "Data Loading", "file": "ch01_data"},
            {"title": "Exploratory Analysis", "file": "ch02_eda"},
            {"title": "Modeling", "file": "ch03_model"},
            {"title": "Results", "file": "ch04_results"},
        ]

    # The root must be a document that is actually generated, and it must
    # not be repeated under `chapters:`.
    root = next((ch for ch in chapters if ch["file"] == "intro"), chapters[0])
    other_chapters = [ch for ch in chapters if ch["file"] != root["file"]]

    # _config.yml — a JSON string literal is a valid YAML double-quoted
    # scalar, so titles containing quotes stay parseable.
    config = f"""title: {json.dumps(str(title), ensure_ascii=False)}
author: SATORI
execute:
  execute_notebooks: auto
  timeout: 600
repository:
  url: ""
launch_buttons:
  binderhub_url: ""
sphinx:
  extra_extensions:
    - sphinx_proof
"""

    # _toc.yml
    toc_lines = ["format: jb-book", f"root: {root['file']}"]
    if other_chapters:
        toc_lines.append("chapters:")
        toc_lines.extend(f"  - file: {ch['file']}" for ch in other_chapters)
    toc = "\n".join(toc_lines) + "\n"

    with open(os.path.join(output_dir, "_config.yml"), "w",
              encoding="utf-8") as f:
        f.write(config)
    with open(os.path.join(output_dir, "_toc.yml"), "w",
              encoding="utf-8") as f:
        f.write(toc)

    # Chapter stubs; existing files are left untouched.
    for ch in chapters:
        filepath = os.path.join(output_dir, f"{ch['file']}.md")
        if not os.path.exists(filepath):
            content = f"# {ch['title']}\n\nThis chapter covers {ch['title'].lower()}.\n"
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(content)

    print(f"Jupyter Book → {output_dir}/")
    print(f" Build: jupyter-book build {output_dir}")
    return output_dir
267
+ ```
268
+
269
+ ## 4. nbconvert 自動変換
270
+
271
+ ```python
272
def batch_convert_notebooks(notebook_dir, output_format="html",
                            output_dir=None, execute=True):
    """
    Convert every ``*.ipynb`` file in a directory with ``jupyter nbconvert``.

    Notebooks are processed in sorted path order. A notebook that fails to
    convert is recorded in the result table and does not stop the batch.

    Parameters:
        notebook_dir: str — directory containing the notebooks
        output_format: str — "html" / "pdf" / "markdown" / "script"
        output_dir: str | None — destination (None = nbconvert's default)
        execute: bool — pass ``--execute`` so notebooks run before conversion

    Returns:
        pandas.DataFrame with columns ``notebook`` and ``status``
        ("success" or "error: <message>"). The columns are present even when
        no notebook is found.
    """
    import glob
    import subprocess

    import pandas as pd

    # Escape the directory so characters such as "[" in its name are not
    # interpreted as glob patterns.
    pattern = os.path.join(glob.escape(notebook_dir), "*.ipynb")
    notebooks = sorted(glob.glob(pattern))
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Everything except the notebook path is identical for each invocation.
    base_cmd = ["jupyter", "nbconvert", f"--to={output_format}"]
    if execute:
        base_cmd.append("--execute")
    if output_dir:
        base_cmd.extend(["--output-dir", output_dir])

    results = []
    for nb_path in notebooks:
        try:
            subprocess.run([*base_cmd, nb_path], check=True,
                           capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            status = f"error: {(e.stderr or '')[:100]}"
        except OSError as e:
            # e.g. the `jupyter` executable is not on PATH.
            status = f"error: {e}"
        else:
            status = "success"
        results.append({"notebook": nb_path, "status": status})

    # Explicit columns keep the "status" lookup valid for an empty result.
    results_df = pd.DataFrame(results, columns=["notebook", "status"])
    n_ok = (results_df["status"] == "success").sum()
    print(f"nbconvert ({output_format}): {n_ok}/{len(notebooks)} converted")
    return results_df
310
+ ```
311
+
312
+ ---
313
+
314
+ ## パイプライン統合
315
+
316
+ ```
317
+ [解析完了] → reproducible-reporting → presentation-design
318
+ (レポート自動生成) (プレゼン作成)
319
+ │ ↓
320
+ interactive-dashboard academic-writing
321
+ (ダッシュボード) (論文執筆)
322
+ ```
323
+
324
+ ## パイプライン出力
325
+
326
+ | ファイル | 説明 | 次スキル |
327
+ |---------|------|---------|
328
+ | `quarto_project/` | Quarto プロジェクト | → quarto render |
329
+ | `papermill_runs/` | パラメトリック実行結果 | → 集計 |
330
+ | `jupyter_book/` | Jupyter Book プロジェクト | → jb build |
@@ -0,0 +1,210 @@
1
+ ---
2
+ name: scientific-semi-supervised-learning
3
+ description: |
4
+ 半教師あり学習スキル。Self-Training・Label Propagation・
5
+ MixMatch/FixMatch・Pseudo-Labeling・ラベル効率評価。
6
+ ---
7
+
8
+ # Scientific Semi-Supervised Learning
9
+
10
+ 少量のラベル付きデータと大量の未ラベルデータを活用する
11
+ 半教師あり学習パイプラインを提供する。
12
+
13
+ ## When to Use
14
+
15
+ - ラベル付きデータが少量しかないとき
16
+ - アノテーションコストが高く全量ラベリングが困難なとき
17
+ - Self-Training で反復的にラベルを拡張するとき
18
+ - グラフベースの Label Propagation を適用するとき
19
+ - Pseudo-Labeling の信頼度閾値を設計するとき
20
+
21
+ ---
22
+
23
+ ## Quick Start
24
+
25
+ ## 1. Self-Training パイプライン
26
+
27
+ ```python
28
+ import numpy as np
29
+ import pandas as pd
30
+ from sklearn.base import clone
31
+ from sklearn.metrics import accuracy_score, classification_report
32
+
33
+
34
def self_training_pipeline(X_labeled, y_labeled, X_unlabeled,
                           base_estimator=None, threshold=0.95,
                           max_iterations=10, batch_size=None,
                           X_test=None, y_test=None):
    """
    Self-training semi-supervised learning.

    Repeatedly fits a clone of ``base_estimator`` on the current labeled
    set, pseudo-labels the pool samples whose top class probability is at
    least ``threshold``, and moves them from the pool into the labeled set.
    Stops when the pool is empty, no sample is confident enough, or
    ``max_iterations`` is reached. A final clone is then fitted on the
    augmented labeled set.

    Parameters:
        X_labeled: np.ndarray — labeled features
        y_labeled: np.ndarray — labels
        X_unlabeled: np.ndarray — unlabeled features
        base_estimator: sklearn estimator | None — base classifier exposing
            ``predict_proba`` and ``classes_``; defaults to
            ``GradientBoostingClassifier(n_estimators=100, random_state=42)``
        threshold: float — minimum confidence for accepting a pseudo-label
        max_iterations: int — maximum number of iterations
        batch_size: int | None — cap on samples added per iteration
        X_test: np.ndarray | None — test features
        y_test: np.ndarray | None — test labels

    Returns:
        (final_model, history) — the fitted estimator and a DataFrame with
        one row per iteration (``iteration``, ``n_labeled``, ``n_pool``,
        ``n_added``, ``mean_confidence`` and, when test data is given,
        ``test_accuracy``).
    """
    from sklearn.ensemble import GradientBoostingClassifier

    if base_estimator is None:
        base_estimator = GradientBoostingClassifier(
            n_estimators=100, random_state=42)

    # np.array copies, so the caller's data is never modified.
    X_train = np.array(X_labeled)
    y_train = np.array(y_labeled)
    X_pool = np.array(X_unlabeled)
    history = []

    for iteration in range(max_iterations):
        if len(X_pool) == 0:
            print(f"Iteration {iteration}: Pool exhausted")
            break

        model = clone(base_estimator)
        model.fit(X_train, y_train)
        proba = model.predict_proba(X_pool)
        max_proba = proba.max(axis=1)
        # predict_proba columns follow model.classes_, so the argmax is a
        # column index and must be mapped back to the actual class label.
        pseudo_labels = model.classes_[proba.argmax(axis=1)]

        confident_mask = max_proba >= threshold
        n_confident = confident_mask.sum()

        if batch_size and n_confident > batch_size:
            # More than batch_size samples pass the threshold, so the
            # batch_size most confident ones all satisfy it.
            top_idx = np.argsort(max_proba)[-batch_size:]
            confident_mask = np.zeros(len(X_pool), dtype=bool)
            confident_mask[top_idx] = True
            n_confident = batch_size

        if n_confident == 0:
            print(f"Iteration {iteration}: No confident samples")
            break

        X_train = np.vstack([X_train, X_pool[confident_mask]])
        y_train = np.concatenate([
            y_train, pseudo_labels[confident_mask]])
        X_pool = X_pool[~confident_mask]

        record = {"iteration": iteration,
                  "n_labeled": len(X_train),
                  "n_pool": len(X_pool),
                  "n_added": int(n_confident),
                  "mean_confidence": float(max_proba[confident_mask].mean())}

        if X_test is not None and y_test is not None:
            # Scores the model fitted at the start of this iteration, i.e.
            # before the samples added above were included.
            test_acc = accuracy_score(y_test, model.predict(X_test))
            record["test_accuracy"] = test_acc

        history.append(record)
        print(f"Iter {iteration}: +{n_confident} samples, "
              f"total={len(X_train)}, pool={len(X_pool)}")

    final_model = clone(base_estimator)
    final_model.fit(X_train, y_train)
    return final_model, pd.DataFrame(history)
109
+ ```
110
+
111
+ ## 2. Label Propagation
112
+
113
+ ```python
114
def label_propagation_ssl(X_all, y_partial, kernel="rbf",
                          gamma=20, n_neighbors=7,
                          max_iter=1000):
    """
    Graph-based label propagation.

    Fits both ``LabelPropagation`` and ``LabelSpreading`` (the latter with
    ``alpha=0.2``) on the same data and reports each model's transductive
    labels.

    Parameters:
        X_all: np.ndarray — features of all samples (labeled + unlabeled)
        y_partial: np.ndarray — labels (-1 = unlabeled)
        kernel: str — "rbf" / "knn"
        gamma: float — gamma of the RBF kernel
        n_neighbors: int — k of the KNN kernel
        max_iter: int — maximum number of iterations

    Returns:
        dict keyed by "propagation" and "spreading"; each value is a dict
        with ``model``, ``predictions``, ``n_propagated`` and
        ``label_distributions``.
    """
    from sklearn.semi_supervised import LabelPropagation, LabelSpreading

    # Hyper-parameters shared by both estimators.
    shared = dict(kernel=kernel, gamma=gamma,
                  n_neighbors=n_neighbors, max_iter=max_iter)
    estimators = (
        ("propagation", LabelPropagation(**shared)),
        ("spreading", LabelSpreading(alpha=0.2, **shared)),
    )

    results = {}
    for label, estimator in estimators:
        estimator.fit(X_all, y_partial)
        transduced = estimator.transduction_
        # Number of samples that entered without a label.
        unlabeled_count = (y_partial == -1).sum()
        results[label] = dict(
            model=estimator,
            predictions=transduced,
            n_propagated=int(unlabeled_count),
            label_distributions=estimator.label_distributions_,
        )
        print(f"{label}: propagated {unlabeled_count} labels")

    return results
154
+ ```
155
+
156
+ ## 3. Pseudo-Labeling 品質評価
157
+
158
+ ```python
159
def evaluate_pseudo_labels(y_true_unlabeled, pseudo_labels,
                           confidences, thresholds=None):
    """
    Evaluate pseudo-label quality across confidence thresholds.

    For each threshold, selects the samples whose confidence is at least
    that threshold and measures how often their pseudo-label matches the
    true label. Thresholds that select no sample are skipped.

    Parameters:
        y_true_unlabeled: array-like — true labels (for evaluation only)
        pseudo_labels: array-like — predicted pseudo-labels
        confidences: array-like — confidence of each prediction
        thresholds: list[float] | None — thresholds to evaluate; defaults to
            [0.5, 0.7, 0.8, 0.9, 0.95, 0.99]

    Returns:
        pandas.DataFrame with columns ``threshold``, ``n_selected``,
        ``coverage`` and ``pseudo_accuracy``. The columns are present even
        when no threshold selects any sample.
    """
    if thresholds is None:
        thresholds = [0.5, 0.7, 0.8, 0.9, 0.95, 0.99]

    # Accept plain sequences as well as arrays: boolean-mask indexing and
    # element-wise comparison below need ndarray semantics.
    y_true = np.asarray(y_true_unlabeled)
    y_pseudo = np.asarray(pseudo_labels)
    conf = np.asarray(confidences)

    records = []
    for t in thresholds:
        mask = conf >= t
        n_selected = int(mask.sum())
        if n_selected == 0:
            continue
        acc = accuracy_score(y_true[mask], y_pseudo[mask])
        records.append({
            "threshold": t,
            "n_selected": n_selected,
            "coverage": float(mask.mean()),
            "pseudo_accuracy": acc,
        })
        print(f"τ={t:.2f}: {mask.sum()} samples, "
              f"coverage={mask.mean():.1%}, acc={acc:.3f}")

    return pd.DataFrame(
        records,
        columns=["threshold", "n_selected", "coverage", "pseudo_accuracy"])
190
+ ```
191
+
192
+ ---
193
+
194
+ ## パイプライン統合
195
+
196
+ ```
197
+ [少量ラベル] → semi-supervised-learning → ml-classification
198
+ (ラベル拡張) (本分類)
199
+
200
+ active-learning ← data-profiling
201
+ (能動学習) (データ品質)
202
+ ```
203
+
204
+ ## パイプライン出力
205
+
206
+ | ファイル | 説明 | 次スキル |
207
+ |---------|------|---------|
208
+ | `self_training_history.csv` | 反復学習履歴 | → 収束分析 |
209
+ | `pseudo_label_quality.csv` | 疑似ラベル品質 | → 閾値選択 |
210
+ | `propagated_labels.npy` | 伝播ラベル | → ml-classification |