@nahisaho/satori 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENCE +0 -0
- package/README.md +191 -0
- package/bin/satori.js +95 -0
- package/package.json +29 -0
- package/src/.github/skills/scientific-academic-writing/SKILL.md +361 -0
- package/src/.github/skills/scientific-academic-writing/assets/acs_article.md +199 -0
- package/src/.github/skills/scientific-academic-writing/assets/elsevier_article.md +244 -0
- package/src/.github/skills/scientific-academic-writing/assets/ieee_transactions.md +212 -0
- package/src/.github/skills/scientific-academic-writing/assets/imrad_standard.md +181 -0
- package/src/.github/skills/scientific-academic-writing/assets/nature_article.md +179 -0
- package/src/.github/skills/scientific-academic-writing/assets/qiita_technical_article.md +385 -0
- package/src/.github/skills/scientific-academic-writing/assets/science_research_article.md +169 -0
- package/src/.github/skills/scientific-bioinformatics/SKILL.md +220 -0
- package/src/.github/skills/scientific-biosignal-processing/SKILL.md +357 -0
- package/src/.github/skills/scientific-causal-inference/SKILL.md +347 -0
- package/src/.github/skills/scientific-cheminformatics/SKILL.md +196 -0
- package/src/.github/skills/scientific-data-preprocessing/SKILL.md +413 -0
- package/src/.github/skills/scientific-data-simulation/SKILL.md +244 -0
- package/src/.github/skills/scientific-doe/SKILL.md +360 -0
- package/src/.github/skills/scientific-eda-correlation/SKILL.md +141 -0
- package/src/.github/skills/scientific-feature-importance/SKILL.md +208 -0
- package/src/.github/skills/scientific-image-analysis/SKILL.md +310 -0
- package/src/.github/skills/scientific-materials-characterization/SKILL.md +368 -0
- package/src/.github/skills/scientific-meta-analysis/SKILL.md +352 -0
- package/src/.github/skills/scientific-metabolomics/SKILL.md +326 -0
- package/src/.github/skills/scientific-ml-classification/SKILL.md +265 -0
- package/src/.github/skills/scientific-ml-regression/SKILL.md +215 -0
- package/src/.github/skills/scientific-multi-omics/SKILL.md +303 -0
- package/src/.github/skills/scientific-network-analysis/SKILL.md +257 -0
- package/src/.github/skills/scientific-pca-tsne/SKILL.md +235 -0
- package/src/.github/skills/scientific-pipeline-scaffold/SKILL.md +331 -0
- package/src/.github/skills/scientific-process-optimization/SKILL.md +215 -0
- package/src/.github/skills/scientific-publication-figures/SKILL.md +208 -0
- package/src/.github/skills/scientific-sequence-analysis/SKILL.md +389 -0
- package/src/.github/skills/scientific-spectral-signal/SKILL.md +227 -0
- package/src/.github/skills/scientific-statistical-testing/SKILL.md +240 -0
- package/src/.github/skills/scientific-survival-clinical/SKILL.md +239 -0
- package/src/.github/skills/scientific-time-series/SKILL.md +291 -0
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-meta-analysis
|
|
3
|
+
description: |
|
|
4
|
+
メタ解析スキル。固定効果・ランダム効果モデル(DerSimonian-Laird)、Forest プロット、
|
|
5
|
+
異質性評価(I²/Q 検定/τ²)、出版バイアス検出(Funnel プロット/Egger/Begg 検定)、
|
|
6
|
+
サブグループ解析、メタ回帰、累積メタ解析のテンプレートを提供。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Meta-Analysis
|
|
10
|
+
|
|
11
|
+
複数の独立した研究結果を統合し、全体的なエビデンスを定量化するためのスキル。
|
|
12
|
+
効果量(SMD / OR / RR / MD)の統合、異質性評価、出版バイアス検出のパイプラインを
|
|
13
|
+
提供する。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- 複数の研究・実験結果を統合的に評価するとき
|
|
18
|
+
- 効果量(Hedges' g / Cohen's d / SMD)を算出するとき
|
|
19
|
+
- Forest プロット / Funnel プロットを描画するとき
|
|
20
|
+
- 研究間の異質性を定量化するとき(I² / Q / τ²)
|
|
21
|
+
- 出版バイアスの有無を検定するとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. 効果量の算出
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pandas as pd
|
|
32
|
+
from scipy.stats import norm
|
|
33
|
+
|
|
34
|
+
def compute_effect_sizes(studies_df, effect_type="SMD"):
    """
    Compute each study's effect size and sampling variance.

    Parameters
    ----------
    studies_df : pandas.DataFrame
        One row per study. Required columns depend on ``effect_type``:
        SMD / MD -> mean1, sd1, n1, mean2, sd2, n2
        OR / RR  -> events1, total1, events2, total2
    effect_type : str
        "SMD" - standardized mean difference (Hedges' g)
        "MD"  - raw mean difference (same measurement scale)
        "OR"  - odds ratio, returned on the log scale
        "RR"  - risk ratio, returned on the log scale

    Returns
    -------
    pandas.DataFrame
        A copy of ``studies_df`` with the added columns ``effect_size``,
        ``variance``, ``se``, ``ci_lower``, ``ci_upper`` (normal 95% CI)
        and ``weight`` (inverse variance). The input frame is not modified.

    Raises
    ------
    ValueError
        If ``effect_type`` is not one of the four supported values.
    """
    df = studies_df.copy()

    if effect_type == "SMD":
        n1, n2 = df["n1"], df["n2"]
        dof = n1 + n2 - 2
        pooled_sd = np.sqrt(
            ((n1 - 1) * df["sd1"] ** 2 + (n2 - 1) * df["sd2"] ** 2) / dof
        )
        cohen_d = (df["mean1"] - df["mean2"]) / pooled_sd
        # Hedges' small-sample correction factor turns Cohen's d into g.
        J = 1 - 3 / (4 * dof - 1)
        df["effect_size"] = cohen_d * J
        df["variance"] = (n1 + n2) / (n1 * n2) + \
            df["effect_size"] ** 2 / (2 * (n1 + n2))

    elif effect_type == "MD":
        df["effect_size"] = df["mean1"] - df["mean2"]
        df["variance"] = df["sd1"] ** 2 / df["n1"] + df["sd2"] ** 2 / df["n2"]

    elif effect_type in ("OR", "RR"):
        a = df["events1"].astype(float)
        c = df["events2"].astype(float)
        b = df["total1"] - a
        d_val = df["total2"] - c

        # A zero cell makes the log ratio and its variance undefined. Add 0.5
        # to every cell of the affected studies only (continuity correction)
        # instead of relying on tiny epsilons, which yield extreme effects
        # and infinite variances.
        has_zero_cell = (a == 0) | (b == 0) | (c == 0) | (d_val == 0)
        correction = np.where(has_zero_cell, 0.5, 0.0)
        a, b, c, d_val = (a + correction, b + correction,
                          c + correction, d_val + correction)

        if effect_type == "OR":
            df["effect_size"] = np.log((a * d_val) / (b * c))
            df["variance"] = 1 / a + 1 / b + 1 / c + 1 / d_val
        else:
            n1 = a + b
            n2 = c + d_val
            df["effect_size"] = np.log((a / n1) / (c / n2))
            # Equivalent to (1 - p1) / events1 + (1 - p2) / events2.
            df["variance"] = 1 / a - 1 / n1 + 1 / c - 1 / n2

    else:
        raise ValueError(
            f"Unknown effect_type {effect_type!r}; "
            "expected 'SMD', 'MD', 'OR' or 'RR'"
        )

    df["se"] = np.sqrt(df["variance"])
    df["ci_lower"] = df["effect_size"] - 1.96 * df["se"]
    df["ci_upper"] = df["effect_size"] + 1.96 * df["se"]
    df["weight"] = 1 / df["variance"]

    return df
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## 2. 固定効果 / ランダム効果モデル
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
def meta_analysis(studies_df, model="random"):
    """
    Pool study-level effect sizes with an inverse-variance meta-analysis.

    Parameters
    ----------
    studies_df : pandas.DataFrame
        Must contain the columns ``effect_size`` and ``variance``.
    model : str
        "fixed"  - fixed-effect model (inverse-variance weights)
        "random" - random-effects model with the DerSimonian-Laird
                   estimate of the between-study variance tau^2

    Returns
    -------
    dict
        Keys: model, summary_effect, se, ci_lower, ci_upper, z_value,
        p_value, Q_statistic, Q_p_value, I_squared (percent),
        tau_squared, k_studies. Q and I^2 are always computed from the
        fixed-effect weights. ``Q_p_value`` is NaN when only one study
        is supplied (zero degrees of freedom).

    Raises
    ------
    ValueError
        If ``model`` is unknown, no studies are supplied, or a variance
        is missing or not strictly positive.
    """
    # Imported locally so the snippet works when copied on its own.
    from scipy.stats import chi2, norm

    if model not in ("fixed", "random"):
        raise ValueError(
            f"Unknown model {model!r}; expected 'fixed' or 'random'"
        )

    es = np.asarray(studies_df["effect_size"], dtype=float)
    var = np.asarray(studies_df["variance"], dtype=float)
    k = es.size
    if k == 0:
        raise ValueError("meta_analysis requires at least one study")
    if np.any(~(var > 0)):
        # Also catches NaN: a zero or missing variance would silently turn
        # every pooled estimate into inf/NaN.
        raise ValueError("all study variances must be positive")

    w = 1.0 / var
    sum_w = np.sum(w)

    # Fixed-effect estimate
    theta_fixed = np.sum(w * es) / sum_w
    se_fixed = 1 / np.sqrt(sum_w)

    # Heterogeneity
    Q = np.sum(w * (es - theta_fixed) ** 2)
    df_Q = k - 1
    # sf() keeps precision in the upper tail where 1 - cdf() rounds to 0.
    p_Q = chi2.sf(Q, df_Q) if df_Q > 0 else float("nan")
    I2 = max(0.0, (Q - df_Q) / Q * 100) if Q > 0 else 0.0

    tau2 = 0.0
    if model == "random":
        # DerSimonian-Laird moment estimator of tau^2
        C = sum_w - np.sum(w ** 2) / sum_w
        if C > 0:  # C is 0 for a single study; tau^2 then stays 0
            tau2 = max(0.0, (Q - df_Q) / C)

        # Random-effects weights
        w_re = 1 / (var + tau2)
        summary_effect = np.sum(w_re * es) / np.sum(w_re)
        summary_se = 1 / np.sqrt(np.sum(w_re))
    else:
        summary_effect = theta_fixed
        summary_se = se_fixed

    z = summary_effect / summary_se
    p_val = 2 * norm.sf(abs(z))

    return {
        "model": model,
        "summary_effect": summary_effect,
        "se": summary_se,
        "ci_lower": summary_effect - 1.96 * summary_se,
        "ci_upper": summary_effect + 1.96 * summary_se,
        "z_value": z,
        "p_value": p_val,
        "Q_statistic": Q,
        "Q_p_value": p_Q,
        "I_squared": I2,
        "tau_squared": tau2,
        "k_studies": k,
    }
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## 3. Forest プロット
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
import matplotlib.pyplot as plt
|
|
158
|
+
|
|
159
|
+
def forest_plot(studies_df, meta_result, effect_label="SMD",
                figsize=(10, None)):
    """
    Draw a forest plot and save it to ``figures/forest_plot.png``.

    Parameters
    ----------
    studies_df : pandas.DataFrame
        Columns ``effect_size``, ``ci_lower``, ``ci_upper``; optional
        ``study`` (row label) and ``weight`` (marker size). Studies without
        a ``weight`` column are drawn with equal marker sizes.
    meta_result : dict
        Output of ``meta_analysis()``; the keys ``summary_effect``,
        ``ci_lower``, ``ci_upper``, ``I_squared`` and ``p_value`` are read.
    effect_label : str
        X-axis label.
    figsize : tuple
        ``(width, height)``; a height of ``None`` scales with the number of
        studies.
    """
    import os

    k = len(studies_df)
    width, height = figsize
    if height is None:
        height = max(4, k * 0.4 + 2)

    fig, ax = plt.subplots(figsize=(width, height))

    # Marker area is proportional to the weight relative to the largest one.
    # The fallback covers the whole column, so a missing "weight" column no
    # longer raises KeyError.
    if "weight" in studies_df.columns:
        rel_weights = (studies_df["weight"] / studies_df["weight"].max()).tolist()
    else:
        rel_weights = [1.0] * k

    # x is given in axes coordinates and y in data coordinates, so labels
    # sit in the margins next to their row.
    margin_transform = ax.get_yaxis_transform()

    # Individual studies: first row at the top (y = k), last at y = 1.
    rows = zip(studies_df.iterrows(), range(k, 0, -1), rel_weights)
    for i, ((_, row), y, rel_weight) in enumerate(rows):
        ax.plot([row["ci_lower"], row["ci_upper"]], [y, y],
                "b-", linewidth=1.5)
        size = rel_weight * 200 + 20
        ax.plot(row["effect_size"], y, "bs", markersize=size ** 0.5,
                markerfacecolor="steelblue")
        ax.text(-0.05, y, row.get("study", f"Study {i+1}"),
                ha="right", va="center", fontsize=9,
                transform=margin_transform)
        # Numeric estimate and interval on the right-hand side
        ax.text(1.02, y, f"{row['effect_size']:.2f} [{row['ci_lower']:.2f}, "
                         f"{row['ci_upper']:.2f}]",
                ha="left", va="center", fontsize=8,
                transform=margin_transform)

    # Summary diamond at y = 0, spanning the pooled confidence interval
    y_summary = 0
    diamond_x = [meta_result["ci_lower"], meta_result["summary_effect"],
                 meta_result["ci_upper"], meta_result["summary_effect"]]
    diamond_y = [y_summary, y_summary + 0.3, y_summary, y_summary - 0.3]
    ax.fill(diamond_x, diamond_y, color="red", alpha=0.7)
    ax.text(-0.05, y_summary, "Summary",
            ha="right", va="center", fontsize=9, fontweight="bold",
            transform=margin_transform)

    # Reference line at zero
    ax.axvline(0, color="black", linestyle="-", linewidth=0.5)

    ax.set_xlabel(effect_label)
    ax.set_yticks([])
    ax.set_title(f"Forest Plot (I²={meta_result['I_squared']:.1f}%, "
                 f"p={meta_result['p_value']:.4f})", fontweight="bold")

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.tight_layout()
    fig.savefig("figures/forest_plot.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## 4. 出版バイアス検出
|
|
219
|
+
|
|
220
|
+
### 4.1 Funnel プロット
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
def funnel_plot(studies_df, meta_result, figsize=(8, 6)):
    """
    Draw a funnel plot and save it to ``figures/funnel_plot.png``.

    Each study is plotted as effect size (x) against standard error (y,
    inverted so larger studies sit at the top). Asymmetry around the pooled
    effect suggests possible publication bias.

    Parameters
    ----------
    studies_df : pandas.DataFrame
        Columns ``effect_size`` and ``se``.
    meta_result : dict
        Output of ``meta_analysis()``; only ``summary_effect`` is read.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    """
    import os

    fig, ax = plt.subplots(figsize=figsize)
    pooled = meta_result["summary_effect"]

    ax.scatter(studies_df["effect_size"], studies_df["se"],
               c="steelblue", s=50, edgecolors="black", zorder=5)

    # Reference line at the pooled effect
    ax.axvline(pooled, color="red", linestyle="--")

    # Pseudo 95% confidence limits: pooled effect +/- 1.96 * SE
    se_range = np.linspace(0, studies_df["se"].max() * 1.1, 100)
    for side in (-1, 1):
        ax.plot(pooled + side * 1.96 * se_range, se_range,
                "gray", linestyle="--", alpha=0.5)

    ax.set_xlabel("Effect Size")
    ax.set_ylabel("Standard Error")
    ax.set_title("Funnel Plot", fontweight="bold")
    ax.invert_yaxis()

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.tight_layout()
    fig.savefig("figures/funnel_plot.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### 4.2 Egger 検定
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
import statsmodels.api as sm
|
|
256
|
+
|
|
257
|
+
def egger_test(studies_df):
    """
    Egger regression test for funnel-plot asymmetry.

    Regresses the standardized effect (``effect_size / se``) on the
    precision (``1 / se``) by ordinary least squares. An intercept that
    differs from zero indicates small-study effects such as publication
    bias.

    Parameters
    ----------
    studies_df : pandas.DataFrame
        Columns ``effect_size`` and ``se``.

    Returns
    -------
    dict
        ``intercept``, ``intercept_se``, ``intercept_p`` and
        ``publication_bias`` (True when the intercept p-value is below 0.05).
    """
    std_err = studies_df["se"]
    standardized_effect = studies_df["effect_size"] / std_err

    # Design matrix: constant term plus precision
    design = sm.add_constant(1 / std_err)
    fit = sm.OLS(standardized_effect, design).fit()

    intercept_p = fit.pvalues["const"]
    return {
        "intercept": fit.params["const"],
        "intercept_se": fit.bse["const"],
        "intercept_p": intercept_p,
        "publication_bias": intercept_p < 0.05,
    }
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
## 5. サブグループ解析
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
def subgroup_analysis(studies_df, subgroup_col, model="random"):
    """
    Run a meta-analysis per subgroup and test for between-group differences.

    Parameters
    ----------
    studies_df : pandas.DataFrame
        Columns required by ``meta_analysis()`` plus ``subgroup_col``.
    subgroup_col : str
        Column holding the subgroup label. Rows with a missing label are
        excluded from the whole analysis.
    model : str
        Passed through to ``meta_analysis()``.

    Returns
    -------
    dict
        ``subgroup_results`` - DataFrame with one pooled result per subgroup
        that has at least two studies; ``Q_between``, ``df_between`` and
        ``p_between`` - the between-group heterogeneity test.

    Notes
    -----
    The test partitions the overall Q statistic returned by
    ``meta_analysis()``: Q_between = Q_total - sum(Q_within). Single-study
    subgroups are not pooled, but they still count as groups in the
    partition (their Q_within is 0). Q_total and ``df_between`` therefore
    refer to the same set of studies and groups. With fewer than two
    subgroups the test is undefined and NaN is returned.
    """
    # Imported locally so the snippet works when copied on its own.
    from scipy.stats import chi2

    labelled = studies_df[studies_df[subgroup_col].notna()]
    results = []
    n_groups = 0

    for sg in labelled[subgroup_col].unique():
        subset = labelled[labelled[subgroup_col] == sg]
        n_groups += 1
        if len(subset) >= 2:
            ma = meta_analysis(subset, model=model)
            ma["subgroup"] = sg
            ma["k"] = len(subset)
            results.append(ma)

    results_df = pd.DataFrame(results)

    # Between-group test (Q_between)
    if n_groups >= 2:
        overall = meta_analysis(labelled, model=model)
        Q_within = sum(r["Q_statistic"] for r in results)
        # Clamp tiny negative values caused by floating-point cancellation.
        Q_between = max(0.0, overall["Q_statistic"] - Q_within)
        df_between = n_groups - 1
        p_between = chi2.sf(Q_between, df_between)
    else:
        Q_between = float("nan")
        df_between = 0
        p_between = float("nan")

    return {
        "subgroup_results": results_df,
        "Q_between": Q_between,
        "df_between": df_between,
        "p_between": p_between,
    }
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
## 6. 累積メタ解析
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
def cumulative_meta_analysis(studies_df, sort_by="year", model="random"):
    """
    Re-run the meta-analysis while adding studies one at a time.

    Studies are ordered by ``sort_by``; the first result pools the first two
    studies and every later result adds the next one, showing how the
    evidence accumulated.

    Returns
    -------
    pandas.DataFrame
        One row per step: the ``meta_analysis()`` output plus ``n_studies``
        and ``last_added`` (the ``study`` label of the newest study, or
        ``"Study <n>"`` when that column is absent).
    """
    ordered = studies_df.sort_values(sort_by)
    total = len(ordered)
    steps = []

    n_included = 2
    while n_included <= total:
        pooled = meta_analysis(ordered.iloc[:n_included], model=model)
        newest = ordered.iloc[n_included - 1]
        pooled["n_studies"] = n_included
        pooled["last_added"] = newest.get("study", f"Study {n_included}")
        steps.append(pooled)
        n_included += 1

    return pd.DataFrame(steps)
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
## References
|
|
334
|
+
|
|
335
|
+
### Output Files
|
|
336
|
+
|
|
337
|
+
| ファイル | 形式 |
|
|
338
|
+
|---|---|
|
|
339
|
+
| `results/meta_analysis_summary.csv` | CSV |
|
|
340
|
+
| `results/effect_sizes.csv` | CSV |
|
|
341
|
+
| `results/publication_bias_tests.csv` | CSV |
|
|
342
|
+
| `results/subgroup_analysis.csv` | CSV |
|
|
343
|
+
| `figures/forest_plot.png` | PNG |
|
|
344
|
+
| `figures/funnel_plot.png` | PNG |
|
|
345
|
+
| `figures/cumulative_meta.png` | PNG |
|
|
346
|
+
|
|
347
|
+
#### 依存パッケージ
|
|
348
|
+
|
|
349
|
+
```
|
|
350
|
+
scipy>=1.10
|
|
351
|
+
statsmodels>=0.14
numpy
pandas
matplotlib
|
|
352
|
+
```
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-metabolomics
|
|
3
|
+
description: |
|
|
4
|
+
メタボロミクス解析スキル。Pareto スケーリング、PLS-DA + VIP スコア、置換検定(Q²)、
|
|
5
|
+
代謝パスウェイ濃縮解析(Fisher exact test)、代謝物相関ネットワーク、
|
|
6
|
+
Volcano プロット/箱ひげ図による差次代謝物同定パイプライン。
|
|
7
|
+
Scientific Skills Exp-07 で確立したパターン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Metabolomics Analysis
|
|
11
|
+
|
|
12
|
+
LC-MS / GC-MS / NMR ベースのメタボロミクスデータを対象に、品質管理→前処理→
|
|
13
|
+
単変量解析→多変量解析→パスウェイ解析の標準パイプラインを提供する。
|
|
14
|
+
メタボロミクス固有の統計手法(PLS-DA、VIP スコア)に特化している。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- メタボロミクスデータの統計解析パイプラインが必要なとき
|
|
19
|
+
- PLS-DA による群間判別+VIP スコアによるバイオマーカー候補を算出するとき
|
|
20
|
+
- 代謝パスウェイ濃縮解析が必要なとき
|
|
21
|
+
- Pareto スケーリングや代謝物相関ネットワークが必要なとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. メタボロミクス前処理
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pandas as pd
|
|
32
|
+
from sklearn.impute import KNNImputer
|
|
33
|
+
|
|
34
|
+
def metabolomics_preprocessing(df, sample_col="Sample_ID", group_col="Group",
                               min_detect_pct=0.5):
    """
    Standard metabolomics preprocessing pipeline.

    Steps:
      1. Filter out metabolites with a low detection rate
      2. KNN imputation of missing values
      3. log2 transform
      4. Pareto scaling (centre, then divide by sqrt of the standard deviation)

    Parameters
    ----------
    df : pandas.DataFrame
        Samples in rows. Every column other than ``sample_col`` and
        ``group_col`` is treated as a metabolite.
    sample_col, group_col : str
        Names of the sample-ID and group-label columns.
    min_detect_pct : float
        Minimum fraction of non-missing values a metabolite needs to be kept.

    Returns
    -------
    (pandas.DataFrame, list)
        A processed copy of ``df`` and the list of retained metabolite
        columns. The caller's frame is left untouched. Filtered-out
        metabolites stay in the returned frame unprocessed; use the returned
        column list to select the processed data.
    """
    # Work on a copy so the caller's DataFrame is not overwritten.
    df = df.copy()
    metabolite_cols = [c for c in df.columns if c not in [sample_col, group_col]]

    # Step 1: low-detection-rate filter
    detect_rate = df[metabolite_cols].notna().mean()
    # A column with no observed value at all cannot be imputed, so it is
    # dropped even when min_detect_pct is 0.
    keep = detect_rate[(detect_rate >= min_detect_pct) & (detect_rate > 0)].index.tolist()
    removed = len(metabolite_cols) - len(keep)
    print(f" Removed {removed} metabolites with <{min_detect_pct*100:.0f}% detection rate")
    metabolite_cols = keep

    if not metabolite_cols:
        # Nothing left to impute or scale.
        return df, metabolite_cols

    # Step 2: KNN imputation
    imputer = KNNImputer(n_neighbors=5)
    df[metabolite_cols] = imputer.fit_transform(df[metabolite_cols])

    # Step 3: log2 transform; values are clipped first so the argument of
    # the logarithm stays positive.
    df[metabolite_cols] = np.log2(df[metabolite_cols].clip(lower=1e-10) + 1)

    # Step 4: Pareto scaling, all columns at once
    values = df[metabolite_cols]
    df[metabolite_cols] = (values - values.mean()) / np.sqrt(values.std() + 1e-10)

    return df, metabolite_cols
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## 2. 単変量解析 — 差次代謝物同定
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from scipy.stats import mannwhitneyu, ttest_ind
|
|
72
|
+
from statsmodels.stats.multitest import multipletests
|
|
73
|
+
|
|
74
|
+
def univariate_analysis(df, metabolite_cols, group_col, group1, group2,
                        test="mannwhitneyu", correction="fdr_bh"):
    """
    Identify metabolites that differ between two groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples in rows, with ``group_col`` and the metabolite columns.
    metabolite_cols : list
        Metabolite columns to test.
    group_col : str
        Column holding the group label.
    group1, group2 : object
        The two labels to compare.
    test : str
        "mannwhitneyu" for the two-sided Mann-Whitney U test; any other
        value selects ``scipy.stats.ttest_ind``.
    correction : str
        Method name passed to ``statsmodels`` ``multipletests``.

    Returns
    -------
    pandas.DataFrame
        Columns: metabolite, log2FC, pvalue, padj, significant, neg_log10p,
        sorted by p-value. ``log2FC`` is mean(group2) - mean(group1) of the
        supplied values; it is a log2 fold change only if the input is on
        the log2 scale. Metabolites that cannot be tested (an empty group or
        a NaN p-value) get ``padj`` = NaN and ``significant`` = False, and
        are left out of the multiple-testing correction.
    """
    g1 = df[df[group_col] == group1]
    g2 = df[df[group_col] == group2]

    records = []
    for met in metabolite_cols:
        v1 = g1[met].dropna()
        v2 = g2[met].dropna()

        if len(v1) == 0 or len(v2) == 0:
            pval = np.nan  # no data in one group: nothing to test
        elif test == "mannwhitneyu":
            _, pval = mannwhitneyu(v1, v2, alternative="two-sided")
        else:
            _, pval = ttest_ind(v1, v2)

        records.append({"metabolite": met,
                        "log2FC": v2.mean() - v1.mean(),
                        "pvalue": pval})

    # Explicit columns keep the frame well-formed when nothing was tested.
    results_df = pd.DataFrame(records, columns=["metabolite", "log2FC", "pvalue"])
    results_df["pvalue"] = results_df["pvalue"].astype(float)

    # Multiple-testing correction over the valid p-values only
    results_df["padj"] = np.nan
    results_df["significant"] = False
    testable = results_df["pvalue"].notna()
    if testable.any():
        reject, padj, _, _ = multipletests(results_df.loc[testable, "pvalue"],
                                           method=correction)
        results_df.loc[testable, "padj"] = padj
        results_df.loc[testable, "significant"] = reject

    # The tiny offset keeps p = 0 finite on the -log10 scale.
    results_df["neg_log10p"] = -np.log10(results_df["pvalue"] + 1e-300)

    return results_df.sort_values("pvalue")
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## 3. PLS-DA + VIP スコア
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from sklearn.cross_decomposition import PLSRegression
|
|
114
|
+
from sklearn.preprocessing import LabelEncoder
|
|
115
|
+
|
|
116
|
+
def plsda_analysis(X, y, n_components=2):
    """
    Fit a PLS-DA model and compute VIP scores.

    VIP (Variable Importance in Projection):
        VIP_j = sqrt(p * sum_a(SS_a * w_ja^2) / sum_a(SS_a))
    where SS_a = q_a^2 * t_a't_a is the response variance explained by
    component a. Variables with VIP > 1 are biomarker candidates.

    Parameters
    ----------
    X : array-like
        Metabolite data matrix, (n_samples, n_metabolites).
    y : array-like
        Group labels. They are integer-encoded with ``LabelEncoder`` and
        used as a single numeric response, so with more than two groups the
        codes are treated as an ordinal scale.
    n_components : int
        Number of PLS components.

    Returns
    -------
    (PLSRegression, ndarray, ndarray)
        The fitted model, the X scores of the samples, and one VIP score
        per metabolite.
    """
    le = LabelEncoder()
    y_encoded = le.fit_transform(y).astype(float)

    pls = PLSRegression(n_components=n_components, scale=True)
    pls.fit(X, y_encoded)

    # Scores (latent variables)
    scores = pls.transform(X)

    # VIP scores
    T = pls.x_scores_    # (n, n_comp)
    W = pls.x_weights_   # (p, n_comp)
    Q = pls.y_loadings_  # (1, n_comp)

    # Taken from the fitted weights so X does not need a ``.shape``.
    p = W.shape[0]

    # Explained response sum of squares per component. Computed once rather
    # than once per feature.
    ss_comp = np.sum(Q ** 2, axis=0) * np.sum(T ** 2, axis=0)
    ss_total = np.sum(ss_comp)
    vip = np.sqrt(p * np.sum(ss_comp * W ** 2, axis=1) / ss_total)

    return pls, scores, vip
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def plot_plsda_scores(scores, y, group_names=None, figsize=(8, 6)):
    """
    Draw the PLS-DA score plot and save it to ``figures/plsda_scores.png``.

    The first two score columns are plotted, one colour per group, with a
    95% confidence ellipse around each group.

    Parameters
    ----------
    scores : array-like
        Sample scores with at least two columns.
    y : array-like
        Group label per sample (same length as ``scores``).
    group_names : mapping or sequence, optional
        Looked up as ``group_names[group]`` to obtain legend labels;
        defaults to ``str(group)``.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    """
    import os

    import matplotlib.pyplot as plt
    from matplotlib.patches import Ellipse

    # Plain lists would make ``y == group`` a scalar instead of a mask.
    scores = np.asarray(scores)
    y = np.asarray(y)

    chi2_95_2dof = 5.991  # chi2(2, 0.95) = 5.991

    fig, ax = plt.subplots(figsize=figsize)
    unique = np.unique(y)
    colors = plt.cm.Set1(np.linspace(0, 0.5, len(unique)))

    for color, group in zip(colors, unique):
        mask = y == group
        xs = scores[mask, 0]
        ys = scores[mask, 1]
        label = group_names[group] if group_names else str(group)
        ax.scatter(xs, ys,
                   c=[color], label=label, s=60, alpha=0.7, edgecolors="black")

        # 95% confidence ellipse; a 2-D covariance needs at least 3 points
        # to be non-degenerate.
        if xs.size < 3:
            continue
        cov = np.cov(xs, ys)
        vals, vecs = np.linalg.eigh(cov)  # eigenvalues in ascending order
        vals = np.clip(vals, 0, None)     # guard against round-off negatives
        major_axis = vecs[:, 1]
        angle = np.degrees(np.arctan2(major_axis[1], major_axis[0]))
        # ``width`` is measured along the rotated x-axis, i.e. along the
        # major eigenvector, so it takes the largest eigenvalue.
        width = 2 * np.sqrt(vals[1] * chi2_95_2dof)
        height = 2 * np.sqrt(vals[0] * chi2_95_2dof)
        ell = Ellipse(xy=(xs.mean(), ys.mean()),
                      width=width, height=height, angle=angle,
                      fill=False, color=color, linewidth=2, linestyle="--")
        ax.add_patch(ell)

    ax.set_xlabel("PLS Component 1")
    ax.set_ylabel("PLS Component 2")
    ax.set_title("PLS-DA Score Plot", fontweight="bold")
    ax.legend()

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.tight_layout()
    fig.savefig("figures/plsda_scores.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## 4. 置換検定(PLS-DA バリデーション)
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
def permutation_test_plsda(X, y, n_components=2, n_permutations=100,
                           random_state=None):
    """
    Permutation test for a PLS-DA model.

    The 5-fold cross-validated Q² of the real labels is compared with the
    Q² distribution obtained after shuffling the labels.

    Parameters
    ----------
    X : array-like
        Metabolite data matrix, (n_samples, n_metabolites).
    y : array-like
        Group labels (integer-encoded with ``LabelEncoder``).
    n_components : int
        Number of PLS components.
    n_permutations : int
        Number of label permutations.
    random_state : int or None
        Seed for reproducible permutations. ``None`` uses NumPy's global
        random state, as before.

    Returns
    -------
    dict
        ``Q2_true``, ``Q2_perm_mean``, ``Q2_perm_std``, ``p_value`` and
        ``significant`` (p < 0.05). The p-value is
        (count(Q2_perm >= Q2_true) + 1) / (n_permutations + 1), so it is
        never exactly zero.

    Raises
    ------
    ValueError
        If ``y`` contains fewer than two distinct groups.
    """
    from sklearn.model_selection import cross_val_predict

    y_enc = LabelEncoder().fit_transform(y).astype(float)
    if np.unique(y_enc).size < 2:
        # The total sum of squares would be 0 and Q² undefined.
        raise ValueError("y must contain at least two distinct groups")

    if random_state is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation

    def _cross_validated_q2(target):
        # Q² = 1 - PRESS / total sum of squares, from 5-fold CV predictions.
        model = PLSRegression(n_components=n_components, scale=True)
        predicted = cross_val_predict(model, X, target, cv=5)
        ss_res = np.sum((target - predicted.ravel()) ** 2)
        ss_tot = np.sum((target - target.mean()) ** 2)
        return 1 - ss_res / ss_tot

    # True model
    q2_true = _cross_validated_q2(y_enc)

    # Permuted models
    q2_perm = np.array([_cross_validated_q2(permute(y_enc))
                        for _ in range(n_permutations)])

    # Add-one estimator: the observed labelling counts as one permutation.
    n_as_extreme = int(np.sum(q2_perm >= q2_true))
    p_value = (n_as_extreme + 1) / (n_permutations + 1)

    return {
        "Q2_true": q2_true,
        "Q2_perm_mean": np.mean(q2_perm),
        "Q2_perm_std": np.std(q2_perm),
        "p_value": p_value,
        "significant": p_value < 0.05,
    }
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## 5. 代謝パスウェイ濃縮解析
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
from scipy.stats import fisher_exact
|
|
233
|
+
|
|
234
|
+
def pathway_enrichment(significant_metabolites, pathway_annotations,
                       metabolite_col="Metabolite", pathway_col="Pathway",
                       total_metabolites=None):
    """
    Metabolic pathway enrichment analysis with Fisher's exact test.

    Parameters
    ----------
    significant_metabolites : iterable
        Names of the significant metabolites.
    pathway_annotations : pandas.DataFrame
        Metabolite-to-pathway mapping with the columns ``metabolite_col``
        and ``pathway_col``.
    metabolite_col, pathway_col : str
        Column names in ``pathway_annotations``.
    total_metabolites : int, optional
        Size of the background (all analysed metabolites). When omitted,
        the annotated metabolites form the background and significant
        metabolites without any annotation are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns Pathway, Hits, Pathway_Size, Significant_Total, Odds_Ratio,
        p_value (one-sided, over-representation) and padj
        (Benjamini-Hochberg), sorted by p-value. The table is also written
        to ``results/pathway_enrichment.csv``.

    Raises
    ------
    ValueError
        If ``total_metabolites`` is too small to be consistent with the
        pathway and significant-set sizes.
    """
    import os

    sig_set = set(significant_metabolites)
    all_annotated = set(pathway_annotations[metabolite_col])
    if total_metabolites is None:
        total_metabolites = len(all_annotated)
        # Keep the significant set inside the background; otherwise the last
        # cell of the contingency table can become negative.
        sig_set &= all_annotated

    n = len(sig_set)       # significant
    N = total_metabolites  # total

    columns = ["Pathway", "Hits", "Pathway_Size", "Significant_Total",
               "Odds_Ratio", "p_value"]
    results = []

    for pw in pathway_annotations[pathway_col].unique():
        in_pathway = pathway_annotations[pathway_col] == pw
        pw_members = set(pathway_annotations.loc[in_pathway, metabolite_col])
        k = len(sig_set & pw_members)  # hit
        K = len(pw_members)            # pathway size

        neither = N - K - n + k
        if neither < 0:
            raise ValueError(
                f"total_metabolites={N} is smaller than the number of "
                f"metabolites implied by pathway {pw!r} and the significant set"
            )

        # 2x2 contingency table
        table = [[k, K - k], [n - k, neither]]
        odds_ratio, p_value = fisher_exact(table, alternative="greater")

        results.append({
            "Pathway": pw,
            "Hits": k,
            "Pathway_Size": K,
            "Significant_Total": n,
            "Odds_Ratio": odds_ratio,
            "p_value": p_value,
        })

    # Explicit columns keep sorting valid when there are no pathways.
    results_df = pd.DataFrame(results, columns=columns).sort_values("p_value")
    if len(results_df) > 0:
        _, padj, _, _ = multipletests(results_df["p_value"], method="fdr_bh")
        results_df["padj"] = padj
    else:
        results_df["padj"] = pd.Series(dtype=float)

    # to_csv does not create missing directories.
    os.makedirs("results", exist_ok=True)
    results_df.to_csv("results/pathway_enrichment.csv", index=False)
    return results_df
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## 6. 代謝物相関ネットワーク
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
def metabolite_correlation_network(df, metabolite_cols, method="spearman",
                                   threshold=0.7):
    """
    Build a network from pairwise metabolite correlations.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples in rows, metabolites in columns.
    metabolite_cols : list
        Metabolite columns; each becomes a node.
    method : str
        Correlation method passed to ``DataFrame.corr``.
    threshold : float
        Only pairs with |r| >= threshold become edges.

    Returns
    -------
    (networkx.Graph, pandas.DataFrame)
        The graph and the full correlation matrix. Edges carry ``weight``
        (|r|) and ``sign`` ("positive" or "negative").
    """
    import networkx as nx

    corr = df[metabolite_cols].corr(method=method)
    n_metabolites = len(metabolite_cols)

    graph = nx.Graph()
    for row_idx, source in enumerate(metabolite_cols):
        graph.add_node(source)
        # Visit the upper triangle only, so each pair is seen once.
        for col_idx in range(row_idx + 1, n_metabolites):
            r = corr.iloc[row_idx, col_idx]
            if abs(r) >= threshold:
                direction = "positive" if r > 0 else "negative"
                graph.add_edge(source, metabolite_cols[col_idx],
                               weight=abs(r), sign=direction)

    return graph, corr
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
## References
|
|
312
|
+
|
|
313
|
+
### Output Files
|
|
314
|
+
|
|
315
|
+
| ファイル | 形式 |
|
|
316
|
+
|---|---|
|
|
317
|
+
| `results/univariate_results.csv` | CSV |
|
|
318
|
+
| `results/vip_scores.csv` | CSV |
|
|
319
|
+
| `results/pathway_enrichment.csv` | CSV |
|
|
320
|
+
| `figures/plsda_scores.png` | PNG |
|
|
321
|
+
| `figures/vip_barplot.png` | PNG |
|
|
322
|
+
| `figures/metabolite_network.png` | PNG |
|
|
323
|
+
|
|
324
|
+
#### 参照実験
|
|
325
|
+
|
|
326
|
+
- **Exp-07**: PLS-DA + VIP、Pareto スケーリング、パスウェイ濃縮、相関ネットワーク
|