@nahisaho/satori 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/LICENCE +0 -0
  2. package/README.md +191 -0
  3. package/bin/satori.js +95 -0
  4. package/package.json +29 -0
  5. package/src/.github/skills/scientific-academic-writing/SKILL.md +361 -0
  6. package/src/.github/skills/scientific-academic-writing/assets/acs_article.md +199 -0
  7. package/src/.github/skills/scientific-academic-writing/assets/elsevier_article.md +244 -0
  8. package/src/.github/skills/scientific-academic-writing/assets/ieee_transactions.md +212 -0
  9. package/src/.github/skills/scientific-academic-writing/assets/imrad_standard.md +181 -0
  10. package/src/.github/skills/scientific-academic-writing/assets/nature_article.md +179 -0
  11. package/src/.github/skills/scientific-academic-writing/assets/qiita_technical_article.md +385 -0
  12. package/src/.github/skills/scientific-academic-writing/assets/science_research_article.md +169 -0
  13. package/src/.github/skills/scientific-bioinformatics/SKILL.md +220 -0
  14. package/src/.github/skills/scientific-biosignal-processing/SKILL.md +357 -0
  15. package/src/.github/skills/scientific-causal-inference/SKILL.md +347 -0
  16. package/src/.github/skills/scientific-cheminformatics/SKILL.md +196 -0
  17. package/src/.github/skills/scientific-data-preprocessing/SKILL.md +413 -0
  18. package/src/.github/skills/scientific-data-simulation/SKILL.md +244 -0
  19. package/src/.github/skills/scientific-doe/SKILL.md +360 -0
  20. package/src/.github/skills/scientific-eda-correlation/SKILL.md +141 -0
  21. package/src/.github/skills/scientific-feature-importance/SKILL.md +208 -0
  22. package/src/.github/skills/scientific-image-analysis/SKILL.md +310 -0
  23. package/src/.github/skills/scientific-materials-characterization/SKILL.md +368 -0
  24. package/src/.github/skills/scientific-meta-analysis/SKILL.md +352 -0
  25. package/src/.github/skills/scientific-metabolomics/SKILL.md +326 -0
  26. package/src/.github/skills/scientific-ml-classification/SKILL.md +265 -0
  27. package/src/.github/skills/scientific-ml-regression/SKILL.md +215 -0
  28. package/src/.github/skills/scientific-multi-omics/SKILL.md +303 -0
  29. package/src/.github/skills/scientific-network-analysis/SKILL.md +257 -0
  30. package/src/.github/skills/scientific-pca-tsne/SKILL.md +235 -0
  31. package/src/.github/skills/scientific-pipeline-scaffold/SKILL.md +331 -0
  32. package/src/.github/skills/scientific-process-optimization/SKILL.md +215 -0
  33. package/src/.github/skills/scientific-publication-figures/SKILL.md +208 -0
  34. package/src/.github/skills/scientific-sequence-analysis/SKILL.md +389 -0
  35. package/src/.github/skills/scientific-spectral-signal/SKILL.md +227 -0
  36. package/src/.github/skills/scientific-statistical-testing/SKILL.md +240 -0
  37. package/src/.github/skills/scientific-survival-clinical/SKILL.md +239 -0
  38. package/src/.github/skills/scientific-time-series/SKILL.md +291 -0
@@ -0,0 +1,352 @@
1
+ ---
2
+ name: scientific-meta-analysis
3
+ description: |
4
+ メタ解析スキル。固定効果・ランダム効果モデル(DerSimonian-Laird)、Forest プロット、
5
+ 異質性評価(I²/Q 検定/τ²)、出版バイアス検出(Funnel プロット/Egger/Begg 検定)、
6
+ サブグループ解析、メタ回帰、累積メタ解析のテンプレートを提供。
7
+ ---
8
+
9
+ # Scientific Meta-Analysis
10
+
11
+ 複数の独立した研究結果を統合し、全体的なエビデンスを定量化するためのスキル。
12
+ 効果量(SMD / OR / RR / MD)の統合、異質性評価、出版バイアス検出のパイプラインを
13
+ 提供する。
14
+
15
+ ## When to Use
16
+
17
+ - 複数の研究・実験結果を統合的に評価するとき
18
+ - 効果量(Hedges' g / Cohen's d / SMD)を算出するとき
19
+ - Forest プロット / Funnel プロットを描画するとき
20
+ - 研究間の異質性を定量化するとき(I² / Q / τ²)
21
+ - 出版バイアスの有無を検定するとき
22
+
23
+ ---
24
+
25
+ ## Quick Start
26
+
27
+ ## 1. 効果量の算出
28
+
29
+ ```python
30
+ import numpy as np
31
+ import pandas as pd
32
+ from scipy.stats import norm
33
+
34
def compute_effect_sizes(studies_df, effect_type="SMD"):
    """Compute per-study effect sizes, variances, CIs and weights.

    Parameters:
        studies_df: DataFrame with one row per study. Required columns
            depend on ``effect_type``:
            - "SMD" / "MD": mean1, sd1, n1, mean2, sd2, n2
            - "OR" / "RR":  events1, total1, events2, total2
        effect_type:
            "SMD" - standardized mean difference (Hedges' g)
            "MD"  - raw mean difference (same scale in both arms)
            "OR"  - log odds ratio
            "RR"  - log risk ratio

    Returns:
        A copy of ``studies_df`` with the added columns ``effect_size``,
        ``variance``, ``se``, ``ci_lower``, ``ci_upper`` (95% normal CI)
        and ``weight`` (inverse variance).

    Raises:
        ValueError: if ``effect_type`` is not one of the supported values.
    """
    df = studies_df.copy()

    if effect_type == "SMD":
        n1, n2 = df["n1"], df["n2"]
        dof = n1 + n2 - 2
        pooled_sd = np.sqrt(
            ((n1 - 1) * df["sd1"] ** 2 + (n2 - 1) * df["sd2"] ** 2) / dof
        )
        cohen_d = (df["mean1"] - df["mean2"]) / pooled_sd
        # Hedges' small-sample correction factor (Cohen's d -> Hedges' g).
        J = 1 - 3 / (4 * dof - 1)
        df["effect_size"] = cohen_d * J
        df["variance"] = (n1 + n2) / (n1 * n2) + \
            df["effect_size"] ** 2 / (2 * (n1 + n2))

    elif effect_type == "MD":
        df["effect_size"] = df["mean1"] - df["mean2"]
        df["variance"] = df["sd1"] ** 2 / df["n1"] + df["sd2"] ** 2 / df["n2"]

    elif effect_type in ("OR", "RR"):
        a = df["events1"].astype(float)
        b = df["total1"].astype(float) - a
        c = df["events2"].astype(float)
        d_val = df["total2"].astype(float) - c

        # Continuity correction: add 0.5 to every cell of a study whose
        # table would otherwise make the log ratio or its variance
        # undefined. This replaces ad-hoc epsilon terms, which produce
        # infinite variances / arbitrary log values for zero cells.
        if effect_type == "OR":
            needs_cc = (a == 0) | (b == 0) | (c == 0) | (d_val == 0)
        else:
            needs_cc = (a == 0) | (c == 0)
        cc = np.where(needs_cc, 0.5, 0.0)
        a, b, c, d_val = a + cc, b + cc, c + cc, d_val + cc

        if effect_type == "OR":
            df["effect_size"] = np.log((a * d_val) / (b * c))
            df["variance"] = 1 / a + 1 / b + 1 / c + 1 / d_val
        else:
            n1 = a + b
            n2 = c + d_val
            df["effect_size"] = np.log((a / n1) / (c / n2))
            df["variance"] = 1 / a - 1 / n1 + 1 / c - 1 / n2

    else:
        raise ValueError(
            f"Unknown effect_type {effect_type!r}; "
            "expected one of 'SMD', 'MD', 'OR', 'RR'"
        )

    df["se"] = np.sqrt(df["variance"])
    df["ci_lower"] = df["effect_size"] - 1.96 * df["se"]
    df["ci_upper"] = df["effect_size"] + 1.96 * df["se"]
    df["weight"] = 1 / df["variance"]

    return df
88
+ ```
89
+
90
+ ## 2. 固定効果 / ランダム効果モデル
91
+
92
+ ```python
93
def meta_analysis(studies_df, model="random"):
    """Pool study-level effect sizes with an inverse-variance model.

    Parameters:
        studies_df: DataFrame with columns ``effect_size`` and ``variance``
            (one row per study).
        model:
            "random" - random-effects model with the DerSimonian-Laird
                       estimate of the between-study variance (tau^2).
            any other value (conventionally "fixed") - fixed-effect
                       inverse-variance model.

    Returns:
        dict with keys ``model``, ``summary_effect``, ``se``, ``ci_lower``,
        ``ci_upper`` (95% normal CI), ``z_value``, ``p_value``,
        ``Q_statistic``, ``Q_p_value``, ``I_squared`` (percent),
        ``tau_squared`` and ``k_studies``. ``Q_p_value`` is NaN when there
        is only one study because Q then has zero degrees of freedom.

    Raises:
        ValueError: if ``studies_df`` contains no studies.
    """
    # Imported locally so the function does not depend on scipy.stats
    # having been imported as a side effect elsewhere in the session.
    from scipy.stats import chi2, norm

    es = np.asarray(studies_df["effect_size"], dtype=float)
    var = np.asarray(studies_df["variance"], dtype=float)
    k = len(es)
    if k == 0:
        raise ValueError("meta_analysis requires at least one study")

    w = 1 / var
    sum_w = np.sum(w)

    # Fixed-effect estimate (also the reference point for Cochran's Q).
    theta_fixed = np.sum(w * es) / sum_w
    se_fixed = 1 / np.sqrt(sum_w)

    # Heterogeneity statistics.
    Q = np.sum(w * (es - theta_fixed) ** 2)
    df_Q = k - 1
    # sf() is the upper tail directly; more accurate than 1 - cdf().
    p_Q = chi2.sf(Q, df_Q) if df_Q > 0 else float("nan")
    I2 = max(0, (Q - df_Q) / Q * 100) if Q > 0 else 0

    if model == "random":
        # DerSimonian-Laird tau^2; C is zero for a single study, in which
        # case no between-study variance can be estimated.
        C = sum_w - np.sum(w ** 2) / sum_w
        tau2 = max(0, (Q - df_Q) / C) if C > 0 else 0

        w_re = 1 / (var + tau2)
        summary_effect = np.sum(w_re * es) / np.sum(w_re)
        summary_se = 1 / np.sqrt(np.sum(w_re))
    else:
        tau2 = 0
        summary_effect = theta_fixed
        summary_se = se_fixed

    z = summary_effect / summary_se
    p_val = 2 * norm.sf(abs(z))

    return {
        "model": model,
        "summary_effect": summary_effect,
        "se": summary_se,
        "ci_lower": summary_effect - 1.96 * summary_se,
        "ci_upper": summary_effect + 1.96 * summary_se,
        "z_value": z,
        "p_value": p_val,
        "Q_statistic": Q,
        "Q_p_value": p_Q,
        "I_squared": I2,
        "tau_squared": tau2,
        "k_studies": k,
    }
152
+ ```
153
+
154
+ ## 3. Forest プロット
155
+
156
+ ```python
157
+ import matplotlib.pyplot as plt
158
+
159
def forest_plot(studies_df, meta_result, effect_label="SMD",
                figsize=(10, None)):
    """Draw a forest plot and save it to ``figures/forest_plot.png``.

    Parameters:
        studies_df: DataFrame with columns ``effect_size``, ``ci_lower``,
            ``ci_upper``; optional ``study`` (row label, defaults to
            "Study <i>") and ``weight`` (marker size; all markers are the
            same size when the column is absent).
        meta_result: dict as returned by ``meta_analysis()``; the keys
            ``summary_effect``, ``ci_lower``, ``ci_upper``, ``I_squared``
            and ``p_value`` are read.
        effect_label: x-axis label.
        figsize: (width, height); a height of ``None`` is derived from the
            number of studies.

    Side effects:
        Creates the ``figures`` directory if needed and writes the PNG.
    """
    import os

    k = len(studies_df)
    if figsize[1] is None:
        figsize = (figsize[0], max(4, k * 0.4 + 2))

    fig, ax = plt.subplots(figsize=figsize)

    # Studies are laid out top-down; y=0 is reserved for the summary row.
    y_positions = list(range(k, 0, -1))
    # x in axes fractions, y in data coordinates (for the side labels).
    side_transform = ax.get_yaxis_transform()
    # Fall back to equal marker sizes when no weight column is supplied,
    # instead of failing on the column lookup.
    max_weight = studies_df["weight"].max() if "weight" in studies_df.columns else 1

    # Individual studies: CI line, weighted marker, label and numbers.
    for i, (y, (_, row)) in enumerate(zip(y_positions, studies_df.iterrows())):
        ax.plot([row["ci_lower"], row["ci_upper"]], [y, y],
                "b-", linewidth=1.5)
        size = row.get("weight", 1) / max_weight * 200 + 20
        ax.plot(row["effect_size"], y, "bs", markersize=np.sqrt(size),
                markerfacecolor="steelblue")
        ax.text(-0.05, y, row.get("study", f"Study {i+1}"),
                ha="right", va="center", fontsize=9,
                transform=side_transform)
        ax.text(1.02, y, f"{row['effect_size']:.2f} [{row['ci_lower']:.2f}, "
                f"{row['ci_upper']:.2f}]",
                ha="left", va="center", fontsize=8,
                transform=side_transform)

    # Summary diamond spanning the pooled confidence interval.
    y_summary = 0
    diamond_x = [meta_result["ci_lower"], meta_result["summary_effect"],
                 meta_result["ci_upper"], meta_result["summary_effect"]]
    diamond_y = [y_summary, y_summary + 0.3, y_summary, y_summary - 0.3]
    ax.fill(diamond_x, diamond_y, color="red", alpha=0.7)
    ax.text(-0.05, y_summary, "Summary",
            ha="right", va="center", fontsize=9, fontweight="bold",
            transform=side_transform)

    # Line of no effect.
    ax.axvline(0, color="black", linestyle="-", linewidth=0.5)

    ax.set_xlabel(effect_label)
    ax.set_yticks([])
    ax.set_title(f"Forest Plot (I²={meta_result['I_squared']:.1f}%, "
                 f"p={meta_result['p_value']:.4f})", fontweight="bold")

    plt.tight_layout()
    os.makedirs("figures", exist_ok=True)
    plt.savefig("figures/forest_plot.png", dpi=300, bbox_inches="tight")
    plt.close()
216
+ ```
217
+
218
+ ## 4. 出版バイアス検出
219
+
220
+ ### 4.1 Funnel プロット
221
+
222
+ ```python
223
def funnel_plot(studies_df, meta_result, figsize=(8, 6)):
    """Draw a funnel plot and save it to ``figures/funnel_plot.png``.

    Each study is plotted as effect size (x) against standard error (y,
    inverted so precise studies sit at the top). Visible asymmetry around
    the pooled effect suggests possible publication bias.

    Parameters:
        studies_df: DataFrame with columns ``effect_size`` and ``se``.
        meta_result: dict providing ``summary_effect``.
        figsize: matplotlib figure size.

    Side effects:
        Creates the ``figures`` directory if needed and writes the PNG.
    """
    import os

    fig, ax = plt.subplots(figsize=figsize)
    pooled = meta_result["summary_effect"]

    ax.scatter(studies_df["effect_size"], studies_df["se"],
               c="steelblue", s=50, edgecolors="black", zorder=5)

    # Pooled-effect reference line.
    ax.axvline(pooled, color="red", linestyle="--")

    # 95% pseudo confidence limits: pooled effect +/- 1.96 * SE.
    se_range = np.linspace(0, studies_df["se"].max() * 1.1, 100)
    for direction in (-1, 1):
        ax.plot(pooled + direction * 1.96 * se_range, se_range,
                "gray", linestyle="--", alpha=0.5)

    ax.set_xlabel("Effect Size")
    ax.set_ylabel("Standard Error")
    ax.set_title("Funnel Plot", fontweight="bold")
    ax.invert_yaxis()
    plt.tight_layout()
    os.makedirs("figures", exist_ok=True)
    plt.savefig("figures/funnel_plot.png", dpi=300, bbox_inches="tight")
    plt.close()
250
+ ```
251
+
252
+ ### 4.2 Egger 検定
253
+
254
+ ```python
255
+ import statsmodels.api as sm
256
+
257
def egger_test(studies_df):
    """Egger regression test for funnel-plot asymmetry.

    Regresses the standardized effect (``effect_size / se``) on precision
    (``1 / se``) by ordinary least squares; an intercept that differs from
    zero indicates small-study effects / possible publication bias.

    Parameters:
        studies_df: DataFrame with columns ``effect_size`` and ``se``.

    Returns:
        dict with ``intercept``, ``intercept_se``, ``intercept_p`` and
        ``publication_bias`` (True when ``intercept_p`` < 0.05).

    Raises:
        ValueError: if fewer than three studies are supplied; a
            two-parameter regression then has no residual degrees of
            freedom, so the intercept cannot be tested.
    """
    k = len(studies_df)
    if k < 3:
        raise ValueError(
            f"Egger's test requires at least 3 studies, got {k}"
        )

    se = studies_df["se"]
    precision = 1 / se
    z_score = studies_df["effect_size"] / se

    # has_constant="add" forces the intercept column even when precision
    # itself is constant; the default ("skip") would silently omit it and
    # the "const" lookups below would fail.
    X = sm.add_constant(precision, has_constant="add")
    model = sm.OLS(z_score, X).fit()

    intercept_p = model.pvalues["const"]
    return {
        "intercept": model.params["const"],
        "intercept_se": model.bse["const"],
        "intercept_p": intercept_p,
        "publication_bias": bool(intercept_p < 0.05),
    }
277
+ ```
278
+
279
+ ## 5. サブグループ解析
280
+
281
+ ```python
282
def subgroup_analysis(studies_df, subgroup_col, model="random"):
    """Run a meta-analysis per subgroup and test for between-group differences.

    Subgroups with fewer than two studies are skipped. The between-group
    statistic is ``Q_between = Q_total - sum(Q_within)``, where both terms
    are computed over the same studies (those in the analysed subgroups),
    and is referred to a chi-square distribution with
    ``(number of analysed subgroups - 1)`` degrees of freedom.

    Parameters:
        studies_df: DataFrame accepted by ``meta_analysis()`` plus the
            grouping column.
        subgroup_col: name of the column defining the subgroups.
        model: passed through to ``meta_analysis()``.

    Returns:
        dict with ``subgroup_results`` (DataFrame, one row per analysed
        subgroup), ``Q_between``, ``df_between`` and ``p_between``.
        ``Q_between`` and ``p_between`` are NaN when fewer than two
        subgroups could be analysed.
    """
    from scipy.stats import chi2

    results = []
    analysed = []
    # sort=False keeps subgroups in order of first appearance.
    for sg, subset in studies_df.groupby(subgroup_col, sort=False):
        if len(subset) < 2:
            continue
        ma = meta_analysis(subset, model=model)
        ma["subgroup"] = sg
        ma["k"] = len(subset)
        results.append(ma)
        analysed.append(subset)

    results_df = pd.DataFrame(results)
    df_between = len(results) - 1

    if df_between >= 1:
        # Compare like with like: total Q over exactly the studies that
        # contribute to Q_within, otherwise skipped subgroups inflate
        # Q_between.
        overall = meta_analysis(pd.concat(analysed), model=model)
        Q_within = sum(r["Q_statistic"] for r in results)
        # Guard against tiny negative values from floating-point rounding.
        Q_between = max(0.0, overall["Q_statistic"] - Q_within)
        p_between = chi2.sf(Q_between, df_between)
    else:
        Q_between = float("nan")
        p_between = float("nan")

    return {
        "subgroup_results": results_df,
        "Q_between": Q_between,
        "df_between": df_between,
        "p_between": p_between,
    }
310
+ ```
311
+
312
+ ## 6. 累積メタ解析
313
+
314
+ ```python
315
def cumulative_meta_analysis(studies_df, sort_by="year", model="random"):
    """Re-run the meta-analysis as studies are added one at a time.

    Studies are ordered by ``sort_by``; the first result pools the first
    two studies and each subsequent result includes one more. Useful for
    showing how the pooled evidence accumulated.

    Returns:
        DataFrame with one row per step: the ``meta_analysis()`` output
        plus ``n_studies`` (studies pooled so far) and ``last_added``
        (the ``study`` label of the most recent one, or "Study <n>").
    """
    ordered = studies_df.sort_values(sort_by)
    total = len(ordered)
    steps = []

    n_included = 2
    while n_included <= total:
        pooled = meta_analysis(ordered.iloc[:n_included], model=model)
        newest = ordered.iloc[n_included - 1]
        pooled["n_studies"] = n_included
        pooled["last_added"] = newest.get("study", f"Study {n_included}")
        steps.append(pooled)
        n_included += 1

    return pd.DataFrame(steps)
331
+ ```
332
+
333
+ ## References
334
+
335
+ ### Output Files
336
+
337
+ | ファイル | 形式 |
338
+ |---|---|
339
+ | `results/meta_analysis_summary.csv` | CSV |
340
+ | `results/effect_sizes.csv` | CSV |
341
+ | `results/publication_bias_tests.csv` | CSV |
342
+ | `results/subgroup_analysis.csv` | CSV |
343
+ | `figures/forest_plot.png` | PNG |
344
+ | `figures/funnel_plot.png` | PNG |
345
+ | `figures/cumulative_meta.png` | PNG |
346
+
347
+ #### 依存パッケージ
348
+
349
+ ```
350
+ scipy>=1.10
351
+ statsmodels>=0.14
352
+ ```
@@ -0,0 +1,326 @@
1
+ ---
2
+ name: scientific-metabolomics
3
+ description: |
4
+ メタボロミクス解析スキル。Pareto スケーリング、PLS-DA + VIP スコア、置換検定(Q²)、
5
+ 代謝パスウェイ濃縮解析(Fisher exact test)、代謝物相関ネットワーク、
6
+ Volcano プロット/箱ひげ図による差次代謝物同定パイプライン。
7
+ Scientific Skills Exp-07 で確立したパターン。
8
+ ---
9
+
10
+ # Scientific Metabolomics Analysis
11
+
12
+ LC-MS / GC-MS / NMR ベースのメタボロミクスデータを対象に、品質管理→前処理→
13
+ 単変量解析→多変量解析→パスウェイ解析の標準パイプラインを提供する。
14
+ メタボロミクス固有の統計手法(PLS-DA、VIP スコア)に特化している。
15
+
16
+ ## When to Use
17
+
18
+ - メタボロミクスデータの統計解析パイプラインが必要なとき
19
+ - PLS-DA による群間判別+VIP スコアによるバイオマーカー候補を算出するとき
20
+ - 代謝パスウェイ濃縮解析が必要なとき
21
+ - Pareto スケーリングや代謝物相関ネットワークが必要なとき
22
+
23
+ ---
24
+
25
+ ## Quick Start
26
+
27
+ ## 1. メタボロミクス前処理
28
+
29
+ ```python
30
+ import numpy as np
31
+ import pandas as pd
32
+ from sklearn.impute import KNNImputer
33
+
34
def metabolomics_preprocessing(df, sample_col="Sample_ID", group_col="Group",
                               min_detect_pct=0.5):
    """Standard metabolomics preprocessing pipeline.

    Steps:
        1. Drop metabolites detected (non-missing) in fewer than
           ``min_detect_pct`` of the samples.
        2. Impute remaining missing values with KNN (5 neighbours).
        3. log2(x + 1) transform (values are floored at 1e-10 first).
        4. Pareto scaling: mean-centre and divide by sqrt(std).

    Parameters:
        df: DataFrame whose columns other than ``sample_col`` and
            ``group_col`` are treated as metabolite intensities.
        sample_col: name of the sample identifier column.
        group_col: name of the group label column.
        min_detect_pct: minimum detection rate (0-1) to keep a metabolite.

    Returns:
        (processed_df, metabolite_cols): a processed copy of ``df`` (the
        input is left untouched; filtered-out metabolite columns are
        still present but unprocessed) and the list of retained
        metabolite column names.
    """
    # Work on a copy so the caller's raw data is not overwritten.
    df = df.copy()
    metabolite_cols = [c for c in df.columns if c not in [sample_col, group_col]]

    # Step 1: detection-rate filter.
    detect_rate = df[metabolite_cols].notna().mean()
    keep = detect_rate[detect_rate >= min_detect_pct].index.tolist()
    removed = len(metabolite_cols) - len(keep)
    print(f" Removed {removed} metabolites with <{min_detect_pct*100:.0f}% detection rate")
    metabolite_cols = keep

    # Step 2: KNN imputation.
    imputer = KNNImputer(n_neighbors=5)
    df[metabolite_cols] = imputer.fit_transform(df[metabolite_cols])

    # Step 3: log2 transform.
    df[metabolite_cols] = np.log2(df[metabolite_cols].clip(lower=1e-10) + 1)

    # Step 4: Pareto scaling, applied to all columns at once. The small
    # constant keeps zero-variance columns from dividing by zero.
    values = df[metabolite_cols]
    df[metabolite_cols] = (values - values.mean()) / np.sqrt(values.std() + 1e-10)

    return df, metabolite_cols
66
+ ```
67
+
68
+ ## 2. 単変量解析 — 差次代謝物同定
69
+
70
+ ```python
71
+ from scipy.stats import mannwhitneyu, ttest_ind
72
+ from statsmodels.stats.multitest import multipletests
73
+
74
def univariate_analysis(df, metabolite_cols, group_col, group1, group2,
                        test="mannwhitneyu", correction="fdr_bh"):
    """Identify metabolites that differ between two groups.

    For every metabolite the two groups are compared with a two-sided
    Mann-Whitney U test (``test="mannwhitneyu"``) or, for any other
    ``test`` value, an independent-samples t-test. P-values are adjusted
    with ``multipletests`` using ``correction``.

    Returns:
        DataFrame sorted by ``pvalue`` with columns ``metabolite``,
        ``log2FC`` (mean of group2 minus mean of group1), ``pvalue``,
        ``padj``, ``significant`` and ``neg_log10p``.
    """
    first = df[df[group_col] == group1]
    second = df[df[group_col] == group2]
    use_rank_test = test == "mannwhitneyu"

    def _compare(name):
        # Missing values are dropped per metabolite and per group.
        a = first[name].dropna()
        b = second[name].dropna()
        if use_rank_test:
            _, p = mannwhitneyu(a, b, alternative="two-sided")
        else:
            _, p = ttest_ind(a, b)
        return {"metabolite": name, "log2FC": b.mean() - a.mean(), "pvalue": p}

    table = pd.DataFrame([_compare(name) for name in metabolite_cols])

    # Multiple-testing correction across all metabolites.
    rejected, adjusted, _, _ = multipletests(table["pvalue"], method=correction)
    table["padj"] = adjusted
    table["significant"] = rejected
    # Offset avoids -log10(0) for p-values that underflow to zero.
    table["neg_log10p"] = -np.log10(table["pvalue"] + 1e-300)

    return table.sort_values("pvalue")
108
+ ```
109
+
110
+ ## 3. PLS-DA + VIP スコア
111
+
112
+ ```python
113
+ from sklearn.cross_decomposition import PLSRegression
114
+ from sklearn.preprocessing import LabelEncoder
115
+
116
def plsda_analysis(X, y, n_components=2):
    """Fit a PLS-DA model and compute VIP scores.

    VIP (Variable Importance in Projection) for feature j:
        VIP_j = sqrt(p * sum_a(SS_a * w_ja^2) / sum_a(SS_a))
    where SS_a = q_a^2 * t_a't_a is the y-variance explained by
    component a. Features with VIP > 1 are biomarker candidates.

    Parameters:
        X: metabolite data matrix (n_samples, n_metabolites).
        y: group labels; they are label-encoded to 0..n_classes-1 and
            fitted as a single numeric response.
            NOTE(review): with more than two classes this treats the
            labels as ordinal - confirm that is intended.
        n_components: number of PLS components.

    Returns:
        (pls_model, scores, vip_scores): the fitted ``PLSRegression``,
        the latent scores of ``X`` and an array of one VIP per feature.
    """
    y_encoded = LabelEncoder().fit_transform(y).astype(float)

    pls = PLSRegression(n_components=n_components, scale=True)
    pls.fit(X, y_encoded)

    # Latent-variable scores.
    scores = pls.transform(X)

    T = pls.x_scores_     # (n, n_comp)
    W = pls.x_weights_    # (p, n_comp)
    Q = pls.y_loadings_   # (1, n_comp)
    p = X.shape[1]

    # Explained y sum of squares per component, computed once rather than
    # once per feature.
    ss_per_component = np.sum(Q ** 2 * np.sum(T ** 2, axis=0), axis=0)
    # Weighted sum over components for all features in one product.
    vip = np.sqrt(p * (W ** 2 @ ss_per_component) / ss_per_component.sum())

    return pls, scores, vip
154
+
155
+
156
def plot_plsda_scores(scores, y, group_names=None, figsize=(8, 6)):
    """Draw a PLS-DA score plot and save it to ``figures/plsda_scores.png``.

    The first two score columns are plotted per group together with a
    95% confidence ellipse derived from the group's covariance matrix.

    Parameters:
        scores: array-like (n_samples, >=2) of latent scores.
        y: group label per sample (array-like).
        group_names: optional mapping from label to display name; the
            label's ``str()`` is used when omitted.
        figsize: matplotlib figure size.

    Side effects:
        Creates the ``figures`` directory if needed and writes the PNG.
    """
    import os

    import matplotlib.pyplot as plt
    from matplotlib.patches import Ellipse

    # Arrays are required for boolean-mask indexing; a plain list for
    # ``y`` would otherwise compare as a single False.
    scores = np.asarray(scores)
    y = np.asarray(y)
    chi2_95_2dof = 5.991  # chi2(2, 0.95) = 5.991

    fig, ax = plt.subplots(figsize=figsize)
    unique = np.unique(y)
    colors = plt.cm.Set1(np.linspace(0, 0.5, len(unique)))

    for color, group in zip(colors, unique):
        mask = y == group
        xs, ys = scores[mask, 0], scores[mask, 1]
        label = group_names[group] if group_names else str(group)
        ax.scatter(xs, ys,
                   c=[color], label=label, s=60, alpha=0.7, edgecolors="black")

        # A covariance matrix needs at least two points.
        if xs.size < 2:
            continue

        # 95% confidence ellipse. eigh() returns eigenvalues in ascending
        # order, so the last eigenpair is the major axis. The angle is
        # that of the major axis, hence ``width`` must be the major-axis
        # length (previously the minor length was used, rotating the
        # ellipse by 90 degrees).
        vals, vecs = np.linalg.eigh(np.cov(xs, ys))
        major_vec = vecs[:, -1]
        angle = np.degrees(np.arctan2(major_vec[1], major_vec[0]))
        minor_len, major_len = 2 * np.sqrt(np.clip(vals, 0, None) * chi2_95_2dof)
        ell = Ellipse(xy=(xs.mean(), ys.mean()),
                      width=major_len, height=minor_len, angle=angle,
                      fill=False, color=color, linewidth=2, linestyle="--")
        ax.add_patch(ell)

    ax.set_xlabel("PLS Component 1")
    ax.set_ylabel("PLS Component 2")
    ax.set_title("PLS-DA Score Plot", fontweight="bold")
    ax.legend()
    plt.tight_layout()
    os.makedirs("figures", exist_ok=True)
    plt.savefig("figures/plsda_scores.png", dpi=300, bbox_inches="tight")
    plt.close()
187
+ ```
188
+
189
+ ## 4. 置換検定(PLS-DA バリデーション)
190
+
191
+ ```python
192
def permutation_test_plsda(X, y, n_components=2, n_permutations=100,
                           random_state=None):
    """Permutation test for a PLS-DA model based on cross-validated Q2.

    Q2 = 1 - SS_res / SS_tot is computed from 5-fold cross-validated
    predictions for the true labels and for ``n_permutations`` random
    label shuffles. The p-value is the proportion of permuted Q2 values
    at least as large as the true one, with the +1 correction
    ``(b + 1) / (m + 1)`` so that it can never be exactly zero.

    Parameters:
        X: data matrix (n_samples, n_features).
        y: group labels (label-encoded internally).
        n_components: number of PLS components.
        n_permutations: number of label permutations.
        random_state: optional seed for reproducible permutations. When
            ``None`` the global NumPy random state is used.

    Returns:
        dict with ``Q2_true``, ``Q2_perm_mean``, ``Q2_perm_std``,
        ``p_value`` and ``significant`` (p_value < 0.05).

    Raises:
        ValueError: if ``y`` contains a single class (Q2 is undefined).
    """
    from sklearn.model_selection import cross_val_predict

    y_enc = LabelEncoder().fit_transform(y).astype(float)
    ss_tot = np.sum((y_enc - y_enc.mean()) ** 2)
    if ss_tot == 0:
        raise ValueError("y must contain at least two distinct classes")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def _cv_q2(labels):
        # Cross-validated Q2 for one labelling. Permuting does not change
        # the total sum of squares, so ss_tot is shared.
        model = PLSRegression(n_components=n_components, scale=True)
        predicted = cross_val_predict(model, X, labels, cv=5)
        ss_res = np.sum((labels - predicted.ravel()) ** 2)
        return 1 - ss_res / ss_tot

    # True model.
    q2_true = _cv_q2(y_enc)

    # Null distribution from shuffled labels.
    q2_perm = np.array([_cv_q2(rng.permutation(y_enc))
                        for _ in range(n_permutations)])

    n_extreme = int(np.sum(q2_perm >= q2_true))
    p_value = (n_extreme + 1) / (n_permutations + 1)

    return {
        "Q2_true": q2_true,
        "Q2_perm_mean": np.mean(q2_perm),
        "Q2_perm_std": np.std(q2_perm),
        "p_value": p_value,
        "significant": p_value < 0.05,
    }
227
+ ```
228
+
229
+ ## 5. 代謝パスウェイ濃縮解析
230
+
231
+ ```python
232
+ from scipy.stats import fisher_exact
233
+
234
def pathway_enrichment(significant_metabolites, pathway_annotations,
                       metabolite_col="Metabolite", pathway_col="Pathway",
                       total_metabolites=None):
    """Metabolic pathway over-representation analysis (Fisher's exact test).

    For each pathway a one-sided ("greater") Fisher's exact test is run on
    the 2x2 table of pathway membership versus significance, and p-values
    are adjusted with Benjamini-Hochberg FDR.

    Parameters:
        significant_metabolites: iterable of significant metabolite names.
        pathway_annotations: DataFrame mapping metabolites
            (``metabolite_col``) to pathways (``pathway_col``).
        metabolite_col: name of the metabolite column.
        pathway_col: name of the pathway column.
        total_metabolites: size of the background (all metabolites
            analysed). When ``None`` the annotated metabolites form the
            background and significant metabolites without any annotation
            are ignored.

    Returns:
        DataFrame sorted by ``p_value`` with columns ``Pathway``, ``Hits``,
        ``Pathway_Size``, ``Significant_Total``, ``Odds_Ratio``,
        ``p_value`` and ``padj``.

    Side effects:
        Writes the table to ``results/pathway_enrichment.csv`` (creating
        the ``results`` directory if needed).
    """
    import os

    sig_set = set(significant_metabolites)
    all_annotated = set(pathway_annotations[metabolite_col])
    if total_metabolites is None:
        total_metabolites = len(all_annotated)
        # The background is the annotated set, so significant metabolites
        # outside it must not be counted; otherwise the last table cell
        # can become negative and fisher_exact raises.
        sig_set &= all_annotated

    n = len(sig_set)          # significant
    N = total_metabolites     # total

    results = []
    for pw in pathway_annotations[pathway_col].unique():
        in_pathway = pathway_annotations[pathway_col] == pw
        pw_members = set(pathway_annotations.loc[in_pathway, metabolite_col])
        k = len(sig_set & pw_members)  # hit
        K = len(pw_members)            # pathway size

        # 2x2 contingency table.
        table = [[k, K - k], [n - k, N - K - n + k]]
        odds_ratio, p_value = fisher_exact(table, alternative="greater")

        results.append({
            "Pathway": pw,
            "Hits": k,
            "Pathway_Size": K,
            "Significant_Total": n,
            "Odds_Ratio": odds_ratio,
            "p_value": p_value,
        })

    columns = ["Pathway", "Hits", "Pathway_Size", "Significant_Total",
               "Odds_Ratio", "p_value"]
    results_df = pd.DataFrame(results, columns=columns).sort_values("p_value")
    if results:
        _, padj, _, _ = multipletests(results_df["p_value"], method="fdr_bh")
        results_df["padj"] = padj
    else:
        # No pathways: return an empty, correctly shaped table.
        results_df["padj"] = pd.Series(dtype=float)

    os.makedirs("results", exist_ok=True)
    results_df.to_csv("results/pathway_enrichment.csv", index=False)
    return results_df
281
+ ```
282
+
283
+ ## 6. 代謝物相関ネットワーク
284
+
285
+ ```python
286
def metabolite_correlation_network(df, metabolite_cols, method="spearman",
                                   threshold=0.7):
    """Build a metabolite network from pairwise correlations.

    Every metabolite becomes a node; an undirected edge is added for each
    pair whose absolute correlation is at least ``threshold``.

    Parameters:
        df: DataFrame containing the metabolite columns.
        metabolite_cols: metabolite column names.
        method: correlation method passed to ``DataFrame.corr``.
        threshold: minimum |r| for a pair to be connected.

    Returns:
        (G, corr): the ``networkx.Graph`` (edges carry ``weight`` = |r|
        and ``sign`` = "positive"/"negative") and the correlation matrix.
    """
    import networkx as nx

    corr = df[metabolite_cols].corr(method=method)
    names = list(metabolite_cols)
    count = len(names)

    G = nx.Graph()
    for row in range(count):
        source = names[row]
        G.add_node(source)
        # Upper triangle only: each unordered pair is visited once.
        for col in range(row + 1, count):
            r = corr.iloc[row, col]
            # Written as a negated >= so NaN correlations are skipped.
            if not abs(r) >= threshold:
                continue
            sign = "positive" if r > 0 else "negative"
            G.add_edge(source, names[col], weight=abs(r), sign=sign)

    return G, corr
309
+ ```
310
+
311
+ ## References
312
+
313
+ ### Output Files
314
+
315
+ | ファイル | 形式 |
316
+ |---|---|
317
+ | `results/univariate_results.csv` | CSV |
318
+ | `results/vip_scores.csv` | CSV |
319
+ | `results/pathway_enrichment.csv` | CSV |
320
+ | `figures/plsda_scores.png` | PNG |
321
+ | `figures/vip_barplot.png` | PNG |
322
+ | `figures/metabolite_network.png` | PNG |
323
+
324
+ #### 参照実験
325
+
326
+ - **Exp-07**: PLS-DA + VIP、Pareto スケーリング、パスウェイ濃縮、相関ネットワーク