@nahisaho/satori 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +123 -4
- package/package.json +1 -1
- package/src/.github/skills/scientific-environmental-ecology/SKILL.md +295 -0
- package/src/.github/skills/scientific-epidemiology-public-health/SKILL.md +332 -0
- package/src/.github/skills/scientific-immunoinformatics/SKILL.md +341 -0
- package/src/.github/skills/scientific-infectious-disease/SKILL.md +342 -0
- package/src/.github/skills/scientific-microbiome-metagenomics/SKILL.md +349 -0
- package/src/.github/skills/scientific-population-genetics/SKILL.md +336 -0
- package/src/.github/skills/scientific-single-cell-genomics/SKILL.md +361 -0
- package/src/.github/skills/scientific-spatial-transcriptomics/SKILL.md +281 -0
- package/src/.github/skills/scientific-systems-biology/SKILL.md +310 -0
- package/src/.github/skills/scientific-text-mining-nlp/SKILL.md +358 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-epidemiology-public-health
|
|
3
|
+
description: |
|
|
4
|
+
疫学・公衆衛生解析スキル。観察研究デザイン(コホート/症例対照/横断)・
|
|
5
|
+
リスク指標(RR/OR/HR/NNT)・標準化死亡比(SMR)・年齢調整率・
|
|
6
|
+
空間疫学(GIS / 空間クラスタリング)・因果推論ダイアグラム(DAG)・
|
|
7
|
+
WHO/CDC/EU 公衆衛生データ統合パイプライン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Epidemiology & Public Health
|
|
11
|
+
|
|
12
|
+
疫学研究と公衆衛生データ解析のパイプラインを提供する。
|
|
13
|
+
研究デザイン設計、リスク指標算出、交絡調整、
|
|
14
|
+
空間疫学、健康格差評価、公衆衛生データベース連携を体系的に扱う。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- 観察研究のリスク指標(RR / OR / HR)を算出するとき
|
|
19
|
+
- 年齢調整率・標準化死亡比(SMR)を計算するとき
|
|
20
|
+
- 空間疫学(疾患クラスタリング・GIS マッピング)を行うとき
|
|
21
|
+
- DAG(有向非巡回グラフ)で交絡構造を分析するとき
|
|
22
|
+
- WHO / CDC / EU の公衆衛生データを取得・解析するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. リスク指標算出
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import numpy as np
|
|
32
|
+
import pandas as pd
|
|
33
|
+
from scipy.stats import norm
|
|
34
|
+
|
|
35
|
+
def calculate_risk_measures(a, b, c, d, alpha=0.05):
    """
    Compute epidemiological risk measures from a 2x2 contingency table.

                    Disease+   Disease-
        Exposed+       a          b       -> a+b
        Exposed-       c          d       -> c+d
                      a+c        b+d         N

    Measures:
        - Risk (incidence): cases / total within each exposure group
        - Risk Ratio (RR): R_exposed / R_unexposed (cohort studies)
        - Odds Ratio (OR): (a*d) / (b*c) (case-control studies)
        - Risk Difference (RD): R_exposed - R_unexposed
        - NNT (Number Needed to Treat): 1 / |RD|
        - Attributable Fraction (AF): (RR - 1) / RR

    The RR and OR intervals are Wald intervals built on the log scale; the
    RD interval is a Wald interval on the natural scale.

    If any cell is zero, RR, OR and their log-scale standard errors are
    undefined. In that case 0.5 is added to every cell (Haldane-Anscombe
    continuity correction) for the ratio measures only; the crude risks,
    RD and NNT are always computed from the observed counts.

    Parameters:
        a: exposed subjects with the disease
        b: exposed subjects without the disease
        c: unexposed subjects with the disease
        d: unexposed subjects without the disease
        alpha: two-sided significance level of the confidence intervals

    Returns:
        dict with keys risk_exposed, risk_unexposed, RR, RR_CI, OR, OR_CI,
        RD, RD_CI, NNT and AF (values rounded for reporting).

    Raises:
        ZeroDivisionError: with plain Python numbers, when an exposure group
            is empty (a + b == 0 or c + d == 0).
    """
    z = norm.ppf(1 - alpha / 2)

    # Crude risks always come from the observed counts.
    R1 = a / (a + b)  # exposed
    R0 = c / (c + d)  # unexposed

    # Ratio measures need strictly positive cells; apply the continuity
    # correction only when a zero cell would otherwise make them undefined.
    if 0 in (a, b, c, d):
        ra, rb, rc, rd = (cell + 0.5 for cell in (a, b, c, d))
    else:
        ra, rb, rc, rd = a, b, c, d

    # Risk ratio with a log-scale Wald interval
    RR = (ra / (ra + rb)) / (rc / (rc + rd))
    ln_RR_se = np.sqrt(1 / ra - 1 / (ra + rb) + 1 / rc - 1 / (rc + rd))
    RR_ci = (RR * np.exp(-z * ln_RR_se), RR * np.exp(z * ln_RR_se))

    # Odds ratio with a log-scale Wald interval
    OR = (ra * rd) / (rb * rc)
    ln_OR_se = np.sqrt(1 / ra + 1 / rb + 1 / rc + 1 / rd)
    OR_ci = (OR * np.exp(-z * ln_OR_se), OR * np.exp(z * ln_OR_se))

    # Risk difference
    RD = R1 - R0
    RD_se = np.sqrt(R1 * (1 - R1) / (a + b) + R0 * (1 - R0) / (c + d))
    RD_ci = (RD - z * RD_se, RD + z * RD_se)

    # NNT is infinite when the two groups have identical risk.
    NNT = 1 / abs(RD) if RD != 0 else np.inf

    # Attributable fraction among the exposed
    AF = (RR - 1) / RR if RR > 0 else 0

    results = {
        "risk_exposed": round(R1, 4),
        "risk_unexposed": round(R0, 4),
        "RR": round(RR, 4), "RR_CI": [round(x, 4) for x in RR_ci],
        "OR": round(OR, 4), "OR_CI": [round(x, 4) for x in OR_ci],
        "RD": round(RD, 4), "RD_CI": [round(x, 4) for x in RD_ci],
        "NNT": round(NNT, 1),
        "AF": round(AF, 4),
    }

    print(f" RR={RR:.3f} ({RR_ci[0]:.3f}–{RR_ci[1]:.3f}), "
          f"OR={OR:.3f} ({OR_ci[0]:.3f}–{OR_ci[1]:.3f})")
    return results
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## 2. 年齢調整率・SMR
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
def age_standardization(observed_df, standard_pop, method="direct"):
    """
    Age-adjusted rate (direct method) or standardized mortality ratio
    (indirect method).

    method:
        - "direct": weight the study population's age-specific rates by the
          age structure of the standard population.
              ASR = sum_i(rate_i * std_proportion_i)
          Requires columns age_group, rate, population in observed_df and
          age_group, std_proportion in standard_pop.
        - "indirect": SMR (Standardized Mortality Ratio).
              SMR      = observed deaths / expected deaths
              expected = sum_i(standard_rate_i * study_population_i)
          The frames are merged with suffixes ("_obs", "_std") and the
          columns rate_std, population_obs and deaths_obs are read, so both
          frames must carry rate, population and deaths columns (pandas only
          suffixes column names present in both frames).

    95% CI of the SMR (Byar's approximation):
        lower = SMR * (1 - 1/(9*O) - z/(3*sqrt(O)))**3
        upper = (O+1)/E * (1 - 1/(9*(O+1)) + z/(3*sqrt(O+1)))**3
    With O == 0 the lower-bound formula is undefined; the bound is reported
    as 0.0.

    Returns:
        direct:   dict with ASR, ASR_per_100k, SE and CI_95 (per 100,000).
        indirect: dict with SMR, CI_95, observed and expected.

    Raises:
        ValueError: if method is neither "direct" nor "indirect", or if the
            expected number of deaths is not positive.
    """
    if method == "direct":
        merged = observed_df.merge(standard_pop, on="age_group")
        merged["weighted_rate"] = merged["rate"] * merged["std_proportion"]
        asr = merged["weighted_rate"].sum()

        # Variance of the weighted sum under a binomial approximation
        merged["var_component"] = (merged["std_proportion"] ** 2 *
                                   merged["rate"] * (1 - merged["rate"]) /
                                   merged["population"])
        se = np.sqrt(merged["var_component"].sum())

        return {
            "ASR": round(asr, 6),
            "ASR_per_100k": round(asr * 1e5, 2),
            "SE": round(se, 6),
            "CI_95": [round((asr - 1.96*se)*1e5, 2), round((asr + 1.96*se)*1e5, 2)],
        }

    if method == "indirect":
        merged = observed_df.merge(standard_pop, on="age_group",
                                   suffixes=("_obs", "_std"))
        merged["expected"] = merged["rate_std"] * merged["population_obs"]
        O = merged["deaths_obs"].sum()
        E = merged["expected"].sum()
        if not E > 0:
            raise ValueError("expected number of deaths must be positive")

        SMR = O / E
        z = 1.96

        # Byar's approximation; the lower bound divides by O, so it is
        # pinned to 0.0 when no deaths were observed.
        if O > 0:
            lower = SMR * (1 - 1/(9*O) - z/(3*np.sqrt(O)))**3
        else:
            lower = 0.0
        upper = ((O+1)/E) * (1 - 1/(9*(O+1)) + z/(3*np.sqrt(O+1)))**3

        print(f" SMR={SMR:.3f} ({lower:.3f}–{upper:.3f}), O={O}, E={E:.1f}")
        return {"SMR": round(SMR, 4), "CI_95": [round(lower, 4), round(upper, 4)],
                "observed": O, "expected": round(E, 1)}

    raise ValueError(f"unknown method {method!r}; expected 'direct' or 'indirect'")
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## 3. 空間疫学・疾患クラスタリング
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
def spatial_cluster_detection(cases_gdf, population_gdf, method="kulldorff"):
    """
    Detect spatial disease clusters.

    method:
        - "kulldorff": Kulldorff's spatial scan statistic (SaTScan)
              H0: lambda(s) is constant (uniform risk)
              H1: there is a circular window Z with lambda_in > lambda_out
              LR = (O_Z/E_Z)^{O_Z} * ((O-O_Z)/(O-E_Z))^{O-O_Z}
          Not implemented in this function: a warning is issued and
          cases_gdf is returned unchanged.
        - "moran": Local Moran's I (local spatial autocorrelation)
              I_i = z_i * sum_j(w_ij * z_j)
        - "getis_ord": Getis-Ord Gi* hot-spot detection

    Typical use:
        - detecting geographic aggregation (clusters) of a disease
        - identifying hot spots / cold spots

    Parameters:
        cases_gdf: frame with "cases" and "population" columns; presumably a
            GeoDataFrame of area polygons, since Queen contiguity weights
            are built from it (confirm against caller).
        population_gdf: accepted for interface compatibility; not read by
            the code below.
        method: one of "kulldorff", "moran", "getis_ord".

    Returns:
        cases_gdf itself, with result columns added in place
        ("local_moran_I", "local_moran_p", "cluster_type" for "moran";
        "gi_star", "gi_p", "hotspot" for "getis_ord").

    Raises:
        ValueError: if method is not one of the documented names.
    """
    import warnings

    from libpysal.weights import Queen
    from esda.moran import Moran_Local
    from esda.getisord import G_Local

    if method == "kulldorff":
        # Previously this path fell through silently; make the gap visible
        # while keeping the return value unchanged for existing callers.
        warnings.warn(
            "method='kulldorff' is not implemented; returning cases_gdf unchanged",
            stacklevel=2,
        )
        return cases_gdf
    if method not in ("moran", "getis_ord"):
        raise ValueError(
            f"unknown method {method!r}; expected 'kulldorff', 'moran' or 'getis_ord'"
        )

    W = Queen.from_dataframe(cases_gdf)
    rates = cases_gdf["cases"] / cases_gdf["population"]

    if method == "moran":
        W.transform = "r"
        lisa = Moran_Local(rates.values, W)

        cases_gdf["local_moran_I"] = lisa.Is
        cases_gdf["local_moran_p"] = lisa.p_sim
        cases_gdf["cluster_type"] = classify_lisa(lisa)

        n_hotspots = (cases_gdf["cluster_type"] == "HH").sum()
        n_coldspots = (cases_gdf["cluster_type"] == "LL").sum()
        print(f" LISA: {n_hotspots} hotspots, {n_coldspots} coldspots")

    else:  # "getis_ord"
        W.transform = "b"
        # star=True includes the focal observation, i.e. Gi* rather than Gi,
        # matching the "gi_star" column this branch writes.
        g_local = G_Local(rates.values, W, star=True)

        cases_gdf["gi_star"] = g_local.Zs
        cases_gdf["gi_p"] = g_local.p_sim
        cases_gdf["hotspot"] = (g_local.Zs > 1.96) & (g_local.p_sim < 0.05)

    return cases_gdf
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def classify_lisa(lisa, p_threshold=0.05):
    """Label each LISA observation as HH, LH, LL, HL or NS.

    An observation whose pseudo p-value (lisa.p_sim) exceeds p_threshold is
    "NS"; otherwise its quadrant code (lisa.q) 1/2/3/4 maps to
    "HH"/"LH"/"LL"/"HL". Any other quadrant code contributes no label.

    Returns:
        list of label strings, in observation order.
    """
    quadrant_labels = {1: "HH", 2: "LH", 3: "LL", 4: "HL"}

    labels = []
    for idx, _ in enumerate(lisa.Is):
        if lisa.p_sim[idx] > p_threshold:
            labels.append("NS")
            continue
        quadrant = lisa.q[idx]
        if quadrant in quadrant_labels:
            labels.append(quadrant_labels[quadrant])
    return labels
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## 4. DAG ベース交絡分析
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
def dag_confounding_analysis(dag_edges, exposure, outcome):
    """
    Confounding analysis based on a DAG (directed acyclic graph).

    Pipeline:
        1. Build the DAG from the edge list
        2. Enumerate back-door paths
        3. Identify sufficient adjustment sets
        4. d-separation check (delegated to the adjustment-set helper)

    Pearl's back-door criterion:
        a variable set Z satisfies the criterion iff
        Z blocks every back-door path from X to Y, and
        Z contains no descendant of X.

    Parameters:
        dag_edges: iterable of (parent, child) pairs.
        exposure: node name of the exposure X.
        outcome: node name of the outcome Y.

    Returns:
        dict with n_backdoor_paths, backdoor_paths, adjustment_sets and
        minimal_adjustment (the shortest adjustment set, or [] if none).

    Raises:
        ValueError: if dag_edges contains a directed cycle.
    """
    # Only networkx is needed here; the analysis itself does not use dowhy.
    import networkx as nx

    G = nx.DiGraph()
    G.add_edges_from(dag_edges)
    if not nx.is_directed_acyclic_graph(G):
        raise ValueError("dag_edges must describe a directed acyclic graph")

    # Back-door paths
    backdoor_paths = find_backdoor_paths(G, exposure, outcome)

    # Candidate adjustment sets
    adjustment_sets = find_adjustment_sets(G, exposure, outcome)

    result = {
        "n_backdoor_paths": len(backdoor_paths),
        "backdoor_paths": backdoor_paths,
        "adjustment_sets": adjustment_sets,
        "minimal_adjustment": min(adjustment_sets, key=len) if adjustment_sets else [],
    }

    print(f" DAG: {len(backdoor_paths)} backdoor paths, "
          f"minimal adjustment = {result['minimal_adjustment']}")
    return result
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def find_backdoor_paths(G, source, target):
    """Enumerate back-door paths (X <- ... -> Y) between source and target.

    Every simple path between the two nodes is taken from the undirected
    version of G; a path is kept when its first step enters the source,
    i.e. G has the edge (path[1], source).

    Parameters:
        G: directed graph (networkx DiGraph).
        source: exposure node X.
        target: outcome node Y.

    Returns:
        list of paths, each a list of nodes from source to target.
    """
    # Imported locally so the helper also works when called on its own;
    # the name `nx` is otherwise only bound inside dag_confounding_analysis.
    import networkx as nx

    undirected = G.to_undirected()
    return [
        path
        for path in nx.all_simple_paths(undirected, source, target)
        if G.has_edge(path[1], source)
    ]
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def find_adjustment_sets(G, exposure, outcome):
    """Collect the adjustment sets accepted by blocks_all_backdoor (simple version).

    Every subset of the nodes other than exposure and outcome is tried, in
    order of increasing size, so the work grows as 2**n with the number of
    candidate nodes.

    Returns:
        list of accepted subsets, each as a list of nodes, smallest first.
    """
    from itertools import chain, combinations

    candidates = set(G.nodes()) - {exposure, outcome}
    subsets_by_size = chain.from_iterable(
        combinations(candidates, size) for size in range(len(candidates) + 1)
    )
    return [
        list(subset)
        for subset in subsets_by_size
        if blocks_all_backdoor(G, exposure, outcome, set(subset))
    ]
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def blocks_all_backdoor(G, X, Y, Z):
    """Return True if the set Z satisfies the back-door criterion for (X, Y).

    Two conditions are checked:
        1. Z contains no descendant of X (nor X or Y themselves).
        2. Z d-separates X and Y once every edge leaving X is ignored, so
           only paths that enter X (back-door paths) remain.

    d-separation is decided with the moralisation method: restrict the graph
    to the ancestors of {X, Y} and Z, connect every pair of parents that
    share a child, drop edge directions, remove Z, and test whether X can
    still reach Y.

    Parameters:
        G: directed acyclic graph exposing successors(node) and
            predecessors(node) (e.g. a networkx DiGraph).
        X: exposure node.
        Y: outcome node.
        Z: iterable of candidate adjustment nodes.

    Returns:
        bool: True when Z blocks every back-door path from X to Y.
    """
    from collections import deque
    from itertools import combinations

    Z = set(Z)
    if X in Z or Y in Z:
        return False

    # Condition 1: adjusting for a descendant of X is not allowed.
    descendants = set()
    stack = [X]
    while stack:
        for child in G.successors(stack.pop()):
            if child not in descendants:
                descendants.add(child)
                stack.append(child)
    if Z & descendants:
        return False

    def parents(node):
        # Dropping X from every parent list removes all edges leaving X.
        return [p for p in G.predecessors(node) if p != X]

    # Ancestral closure of {X, Y} and Z in the edge-pruned graph.
    ancestral = set()
    stack = [X, Y, *Z]
    while stack:
        node = stack.pop()
        if node not in ancestral:
            ancestral.add(node)
            stack.extend(parents(node))

    # Moral graph: parent-child links plus links between co-parents.
    adjacency = {node: set() for node in ancestral}
    for node in ancestral:
        node_parents = parents(node)
        for parent in node_parents:
            adjacency[node].add(parent)
            adjacency[parent].add(node)
        for left, right in combinations(node_parents, 2):
            adjacency[left].add(right)
            adjacency[right].add(left)

    # X and Y are d-separated iff Y is unreachable without passing through Z.
    seen = {X}
    queue = deque([X])
    while queue:
        for neighbour in adjacency[queue.popleft()]:
            if neighbour == Y:
                return False
            if neighbour not in seen and neighbour not in Z:
                seen.add(neighbour)
                queue.append(neighbour)
    return True
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## References
|
|
289
|
+
|
|
290
|
+
### Output Files
|
|
291
|
+
|
|
292
|
+
| ファイル | 形式 |
|
|
293
|
+
|---|---|
|
|
294
|
+
| `results/risk_measures.json` | JSON |
|
|
295
|
+
| `results/age_standardized_rates.csv` | CSV |
|
|
296
|
+
| `results/spatial_clusters.geojson` | GeoJSON |
|
|
297
|
+
| `results/dag_analysis.json` | JSON |
|
|
298
|
+
| `figures/disease_map.png` | PNG |
|
|
299
|
+
| `figures/dag_diagram.png` | PNG |
|
|
300
|
+
| `figures/forest_plot.png` | PNG |
|
|
301
|
+
|
|
302
|
+
### 利用可能ツール
|
|
303
|
+
|
|
304
|
+
> [ToolUniverse](https://github.com/mims-harvard/ToolUniverse) SMCP 経由で利用可能な外部ツール。
|
|
305
|
+
|
|
306
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
307
|
+
|---|---|---|
|
|
308
|
+
| WHO | `who_gho_get_data` | WHO GHO データ取得 |
|
|
309
|
+
| WHO | `who_gho_query_health_data` | WHO 健康指標クエリ |
|
|
310
|
+
| CDC | `cdc_data_search_datasets` | CDC データセット検索 |
|
|
311
|
+
| CDC | `cdc_data_get_dataset` | CDC データ取得 |
|
|
312
|
+
| EUHealthInfo | `euhealthinfo_search_surveillance_mortality_rates` | 死亡率データ |
|
|
313
|
+
| EUHealthInfo | `euhealthinfo_search_healthcare_expenditure` | 医療費データ |
|
|
314
|
+
| EUHealthInfo | `euhealthinfo_search_population_health_survey` | 健康調査データ |
|
|
315
|
+
| HealthDisparities | `health_disparities_get_svi_info` | 社会脆弱性指標 |
|
|
316
|
+
| HealthDisparities | `health_disparities_get_county_rankings_info` | 地域健康ランキング |
|
|
317
|
+
| ClinicalTrials | `search_clinical_trials` | 臨床試験検索 |
|
|
318
|
+
| PubMed | `PubMed_Guidelines_Search` | 公衆衛生ガイドライン |
|
|
319
|
+
|
|
320
|
+
### 参照スキル
|
|
321
|
+
|
|
322
|
+
| スキル | 連携内容 |
|
|
323
|
+
|---|---|
|
|
324
|
+
| [scientific-causal-inference](../scientific-causal-inference/SKILL.md) | 因果推論・傾向スコア |
|
|
325
|
+
| [scientific-survival-clinical](../scientific-survival-clinical/SKILL.md) | 生存解析・Cox 回帰 |
|
|
326
|
+
| [scientific-meta-analysis](../scientific-meta-analysis/SKILL.md) | メタアナリシス・系統的レビュー |
|
|
327
|
+
| [scientific-infectious-disease](../scientific-infectious-disease/SKILL.md) | 感染症疫学 |
|
|
328
|
+
| [scientific-bayesian-statistics](../scientific-bayesian-statistics/SKILL.md) | ベイズ空間モデル |
|
|
329
|
+
|
|
330
|
+
#### 依存パッケージ
|
|
331
|
+
|
|
332
|
+
- geopandas, libpysal, esda, networkx, dowhy, lifelines, numpy, pandas, scipy, statsmodels
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-immunoinformatics
|
|
3
|
+
description: |
|
|
4
|
+
免疫情報学スキル。エピトープ予測(MHC-I/II バインディング)・
|
|
5
|
+
T 細胞/B 細胞エピトープマッピング・抗体構造解析(CDR ループ)・
|
|
6
|
+
免疫レパトア解析(TCR/BCR クロノタイプ)・ワクチン候補設計・
|
|
7
|
+
IEDB/IMGT/SAbDab データベース統合パイプライン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Immunoinformatics
|
|
11
|
+
|
|
12
|
+
免疫情報学(Immunoinformatics)に特化した解析パイプラインを提供する。
|
|
13
|
+
エピトープ予測、MHC 結合親和性推定、抗体配列・構造解析、
|
|
14
|
+
免疫レパトア多様性解析、ワクチン候補優先順位付けを体系的に扱う。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- ペプチド-MHC 結合親和性を予測するとき
|
|
19
|
+
- T 細胞 / B 細胞エピトープを同定・マッピングするとき
|
|
20
|
+
- TCR / BCR レパトア(クロノタイプ)多様性を解析するとき
|
|
21
|
+
- 抗体 CDR ループの構造モデリングを行うとき
|
|
22
|
+
- ワクチン候補アンチゲンの優先順位付けを行うとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. MHC-I バインディング予測
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import numpy as np
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
def predict_mhc_binding(peptides, alleles, method="netmhcpan"):
    """
    Predict MHC class I binding affinity for every (peptide, allele) pair.

    method:
        - "netmhcpan": NetMHCpan 4.1 - peptide-MHC binding IC50 prediction
        - "mhcflurry": MHCflurry 2.0 - neural-network based
      NOTE: the value is accepted but not consulted; predictions are always
      produced with MHCflurry's Class1PresentationPredictor.

    Thresholds (applied to the predicted affinity):
        - Strong binder: IC50 < 50 nM  (or %Rank < 0.5)
        - Weak binder:   IC50 < 500 nM (or %Rank < 2.0)

    Parameters:
        peptides: iterable of peptide sequences (8-14 mer)
        alleles: iterable of HLA alleles (e.g., ["HLA-A*02:01", "HLA-B*07:02"])

    Returns:
        DataFrame with columns peptide, allele, affinity_nM, percentile_rank,
        processing_score, presentation_score and binding_level
        ("Strong" / "Weak" / "Non-binder"). Empty input yields an empty
        frame with the same columns.
    """
    columns = [
        "peptide", "allele", "affinity_nM", "percentile_rank",
        "processing_score", "presentation_score",
    ]

    # Materialise once so one-shot iterables survive the nested loop.
    peptides = list(peptides)
    alleles = list(alleles)

    results = []
    if peptides and alleles:
        # Import and model loading are deferred: nothing to load when there
        # is nothing to predict.
        from mhcflurry import Class1PresentationPredictor

        predictor = Class1PresentationPredictor.load()

        # One call per pair so that every combination gets its own row.
        for peptide in peptides:
            for allele in alleles:
                pred = predictor.predict(peptides=[peptide], alleles=[allele],
                                         verbose=0)
                results.append({
                    "peptide": peptide,
                    "allele": allele,
                    "affinity_nM": pred["affinity"].values[0],
                    "percentile_rank": pred["affinity_percentile"].values[0],
                    "processing_score": pred["processing_score"].values[0],
                    "presentation_score": pred["presentation_score"].values[0],
                })

    # Explicit columns keep the frame well-formed even with zero rows.
    df = pd.DataFrame(results, columns=columns)
    df["binding_level"] = np.where(
        df["affinity_nM"] < 50, "Strong",
        np.where(df["affinity_nM"] < 500, "Weak", "Non-binder")
    )

    n_strong = (df["binding_level"] == "Strong").sum()
    n_weak = (df["binding_level"] == "Weak").sum()
    print(f" MHC-I: {n_strong} strong + {n_weak} weak binders / {len(df)} predictions")
    return df
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## 2. B 細胞エピトープ予測
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
def predict_bcell_epitopes(sequence, window_size=20, threshold=0.5):
    """
    Predict linear B-cell epitopes with a simple sliding-window score.

    The full integrated scoring scheme this function is a stand-in for:
        1. BepiPred 2.0: Random Forest based prediction
        2. Parker hydrophilicity scale
        3. Emini surface accessibility
        4. Chou-Fasman beta-turn prediction

        combined_score = 0.4 * bepipred + 0.2 * hydrophilicity +
                         0.2 * surface + 0.2 * beta_turn

    What is actually computed: for each window start i, the mean of a
    windowed hydrophilicity value and a flexibility value at index i.
    Biopython bundles no Parker table, so the Hopp-Woods hydrophilicity
    scale (ProtParamData.hw) is used for the hydrophilicity term.

    NOTE(review): ProteinAnalysis.flexibility() uses its own fixed window,
    so its index i does not cover the same residues as the hydrophilicity
    window when window_size differs - confirm whether alignment matters.

    Parameters:
        sequence: protein sequence (converted with str()).
        window_size: length of the sliding window / reported epitope.
        threshold: minimum score for a window to be reported.

    Returns:
        DataFrame with columns start, end (1-based, inclusive), sequence
        and score; empty (with those columns) when nothing passes.
    """
    from Bio.SeqUtils import ProtParamData
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    sequence = str(sequence)
    pa = ProteinAnalysis(sequence)

    # protein_scale expects a residue -> value mapping, not a scale name.
    hydrophilicity = pa.protein_scale(ProtParamData.hw, window_size)
    flexibility = pa.flexibility()

    epitopes = []
    for i in range(len(sequence) - window_size + 1):
        window = sequence[i:i + window_size]
        score = np.mean([
            hydrophilicity[i] if i < len(hydrophilicity) else 0,
            flexibility[i] if i < len(flexibility) else 0,
        ])
        if score > threshold:
            epitopes.append({
                "start": i + 1,
                "end": i + window_size,
                "sequence": window,
                "score": score,
            })

    df = pd.DataFrame(epitopes, columns=["start", "end", "sequence", "score"])
    print(f" B-cell epitopes: {len(df)} predicted (threshold={threshold})")
    return df
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## 3. TCR/BCR レパトア解析
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
def repertoire_analysis(clonotype_df, chain="TRB",
                        clone_col="cdr3_aa", count_col="clone_count"):
    """
    Diversity analysis of a TCR/BCR repertoire.

    Diversity metrics:
        - Shannon entropy: H = -sum(p_i * log2(p_i))
        - Simpson index:   D = 1 - sum(p_i ** 2)
        - Chao1 estimator: S_est = S_obs + f1**2 / (2 * f2)
          (f2 is floored at 1 so the estimate stays finite without doubletons)
        - Clonality:       1 - H / log2(N)
        - Gini coefficient: inequality of the clone-size distribution

    Parameters:
        clonotype_df: clonotype DataFrame (cdr3_aa, clone_count)
        chain: TCR/BCR chain label (TRA, TRB, IGH, IGL, IGK); only echoed
            into the output.
        clone_col: name of the clonotype column (not read by the metrics).
        count_col: name of the clone-count column.

    Returns:
        (metrics, top10): a dict of the metrics above plus chain,
        n_clonotypes, total_cells and top1_frequency (frequency of the
        largest clone), and the ten largest clonotypes as a DataFrame.
    """
    from scipy.stats import entropy

    counts = clonotype_df[count_col].values
    total = counts.sum()
    freqs = counts / total

    # Shannon entropy
    H = entropy(freqs, base=2)
    # Simpson index
    D = 1 - np.sum(freqs ** 2)
    # Clonality (0 for a single clonotype, where log2(N) would be 0)
    n_clones = len(counts)
    clonality = 1 - H / np.log2(n_clones) if n_clones > 1 else 0

    # Chao1
    f1 = np.sum(counts == 1)  # singletons
    f2 = np.sum(counts == 2)  # doubletons
    chao1 = n_clones + (f1 ** 2) / (2 * max(f2, 1))

    # Gini coefficient
    sorted_freqs = np.sort(freqs)
    n = len(sorted_freqs)
    gini = (2 * np.sum((np.arange(1, n + 1)) * sorted_freqs) / (n * np.sum(sorted_freqs))) - (n + 1) / n

    # Top clones
    top10 = clonotype_df.nlargest(10, count_col)

    metrics = {
        "chain": chain,
        "n_clonotypes": n_clones,
        "total_cells": int(total),
        "shannon_entropy": round(H, 4),
        "simpson_index": round(D, 4),
        "clonality": round(clonality, 4),
        "chao1": round(chao1, 1),
        "gini": round(gini, 4),
        # The input is not assumed to be sorted, so take the maximum rather
        # than the first row.
        "top1_frequency": round(freqs.max(), 4) if len(freqs) > 0 else 0,
    }

    print(f" Repertoire ({chain}): {n_clones} clonotypes, "
          f"Shannon={H:.3f}, Clonality={clonality:.3f}")
    return metrics, top10
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## 4. 抗体構造解析
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
def antibody_structure_analysis(vh_seq, vl_seq, numbering="imgt"):
    """
    Structural annotation of an antibody variable region.

    Intended pipeline:
        1. ANARCI numbering (IMGT / Kabat / Chothia)
        2. CDR loop identification (CDR-H1/H2/H3, CDR-L1/L2/L3)
        3. Framework region (FR1-FR4) extraction
        4. Somatic hypermutation (SHM) rate estimation
        5. Humanisation-potential score
    Only steps 1 and 2 produce output here; the SHM helper is a placeholder
    that is defined but never called.

    CDR boundaries used (labelled "IMGT definition" in the original):
        CDR-H1: 26-33 (8 residues)
        CDR-H2: 51-57 (7 residues)
        CDR-H3: 93-102 (variable length)
    NOTE(review): these fixed positions are applied regardless of the
    `numbering` scheme passed to ANARCI - confirm they match the scheme in
    use. `extract_region` is expected to be provided elsewhere.

    Parameters:
        vh_seq: heavy-chain variable sequence.
        vl_seq: light-chain variable sequence.
        numbering: ANARCI numbering scheme name.

    Returns:
        dict with cdrs, vh_length, vl_length, cdr_h3_length and numbering.
    """
    from anarci import anarci

    # Number each chain separately.
    heavy_numbered = anarci([("VH", vh_seq)], scheme=numbering)
    light_numbered = anarci([("VL", vl_seq)], scheme=numbering)

    # (first, last) position of every CDR loop.
    boundaries = {
        "CDR-H1": (26, 33), "CDR-H2": (51, 57), "CDR-H3": (93, 102),
        "CDR-L1": (27, 32), "CDR-L2": (50, 52), "CDR-L3": (89, 97),
    }

    # Heavy-chain loops carry an "H" in their label; the rest are light-chain.
    cdrs = {
        label: extract_region(
            heavy_numbered if "H" in label else light_numbered, first, last
        )
        for label, (first, last) in boundaries.items()
    }

    def estimate_shm_rate(numbered_seq, germline_db="imgt"):
        """Placeholder for SHM-rate estimation against germline sequences."""
        # Needs a germline database; returns a constant until one is wired in.
        return 0.0

    result = {
        "cdrs": cdrs,
        "vh_length": len(vh_seq),
        "vl_length": len(vl_seq),
        "cdr_h3_length": len(cdrs.get("CDR-H3", "")),
        "numbering": numbering,
    }

    print(f" Antibody: CDR-H3 length={result['cdr_h3_length']}, "
          f"scheme={numbering}")
    return result
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
## 5. ワクチン候補優先順位付け
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
def vaccine_candidate_ranking(antigens_df, weights=None):
    """
    Multi-criteria ranking of vaccine candidate antigens.

    Criteria:
        1. Antigenicity score: VaxiJen 2.0 score (threshold > 0.4)
        2. Allergenicity: AllerTOP non-allergenicity
        3. Toxicity: ToxinPred non-toxicity
        4. MHC coverage: HLA supertype coverage
        5. Conservation: sequence conservation across strains
        6. Surface accessibility: degree of surface exposure

    Composite score = sum_i(w_i * normalized_score_i)

    Each criterion column present in the frame is min-max normalised into
    "<criterion>_norm" (set to 1.0 when all values are equal). Criteria
    without a column contribute 0 to the composite score.

    Parameters:
        antigens_df: one row per antigen, with any of the criterion columns.
            The frame is not modified.
        weights: mapping criterion -> weight; defaults to antigenicity 0.25,
            mhc_coverage 0.25, conservation 0.20, surface_accessibility 0.15,
            non_allergenicity 0.10, non_toxicity 0.05.

    Returns:
        A new DataFrame with the "*_norm" and "composite_score" columns
        added, sorted by composite_score in descending order.
    """
    if weights is None:
        weights = {
            "antigenicity": 0.25,
            "mhc_coverage": 0.25,
            "conservation": 0.20,
            "surface_accessibility": 0.15,
            "non_allergenicity": 0.10,
            "non_toxicity": 0.05,
        }

    # Work on a copy so the caller's frame does not grow helper columns.
    ranked = antigens_df.copy()

    # Min-max normalisation
    for col in weights:
        if col not in ranked.columns:
            continue
        min_val = ranked[col].min()
        max_val = ranked[col].max()
        if max_val > min_val:
            ranked[f"{col}_norm"] = (ranked[col] - min_val) / (max_val - min_val)
        else:
            ranked[f"{col}_norm"] = 1.0

    # Composite score
    ranked["composite_score"] = sum(
        w * ranked.get(f"{col}_norm", 0)
        for col, w in weights.items()
    )

    ranked = ranked.sort_values("composite_score", ascending=False)
    print(f" Vaccine candidates: {len(ranked)} antigens ranked")
    return ranked
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
## References
|
|
296
|
+
|
|
297
|
+
### Output Files
|
|
298
|
+
|
|
299
|
+
| ファイル | 形式 |
|
|
300
|
+
|---|---|
|
|
301
|
+
| `results/mhc_binding_predictions.csv` | CSV |
|
|
302
|
+
| `results/bcell_epitopes.csv` | CSV |
|
|
303
|
+
| `results/repertoire_diversity.json` | JSON |
|
|
304
|
+
| `results/antibody_structure.json` | JSON |
|
|
305
|
+
| `results/vaccine_candidates_ranked.csv` | CSV |
|
|
306
|
+
| `figures/epitope_map.png` | PNG |
|
|
307
|
+
| `figures/repertoire_clonality.png` | PNG |
|
|
308
|
+
|
|
309
|
+
### 利用可能ツール
|
|
310
|
+
|
|
311
|
+
> [ToolUniverse](https://github.com/mims-harvard/ToolUniverse) SMCP 経由で利用可能な外部ツール。
|
|
312
|
+
|
|
313
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
314
|
+
|---|---|---|
|
|
315
|
+
| IEDB | `iedb_search_epitopes` | エピトープ検索 |
|
|
316
|
+
| IEDB | `iedb_get_epitope_mhc` | エピトープ-MHC 結合データ |
|
|
317
|
+
| IEDB | `iedb_search_bcell` | B 細胞エピトープ検索 |
|
|
318
|
+
| IEDB | `iedb_search_mhc` | MHC アレル検索 |
|
|
319
|
+
| IEDB | `iedb_search_antigens` | 抗原検索 |
|
|
320
|
+
| IMGT | `IMGT_get_gene_info` | 免疫遺伝子情報 |
|
|
321
|
+
| IMGT | `IMGT_get_sequence` | 免疫グロブリン配列取得 |
|
|
322
|
+
| IMGT | `IMGT_search_genes` | 免疫遺伝子検索 |
|
|
323
|
+
| SAbDab | `SAbDab_search_structures` | 抗体構造検索 |
|
|
324
|
+
| SAbDab | `SAbDab_get_structure` | 抗体構造取得 |
|
|
325
|
+
| TheraSAbDab | `TheraSAbDab_search_therapeutics` | 治療用抗体検索 |
|
|
326
|
+
| TheraSAbDab | `TheraSAbDab_search_by_target` | 標的別治療用抗体 |
|
|
327
|
+
| UniProt | `UniProt_get_entry_by_accession` | タンパク質情報取得 |
|
|
328
|
+
|
|
329
|
+
### 参照スキル
|
|
330
|
+
|
|
331
|
+
| スキル | 連携内容 |
|
|
332
|
+
|---|---|
|
|
333
|
+
| [scientific-sequence-analysis](../scientific-sequence-analysis/SKILL.md) | 配列アライメント・保存性解析 |
|
|
334
|
+
| [scientific-protein-structure-analysis](../scientific-protein-structure-analysis/SKILL.md) | 抗体 3D 構造解析 |
|
|
335
|
+
| [scientific-protein-design](../scientific-protein-design/SKILL.md) | 抗体エンジニアリング |
|
|
336
|
+
| [scientific-variant-interpretation](../scientific-variant-interpretation/SKILL.md) | HLA タイピング・バリアント解釈 |
|
|
337
|
+
| [scientific-single-cell-genomics](../scientific-single-cell-genomics/SKILL.md) | 免疫細胞サブタイプ解析 |
|
|
338
|
+
|
|
339
|
+
#### 依存パッケージ
|
|
340
|
+
|
|
341
|
+
- mhcflurry, anarci, biopython, immcantation, scirpy, numpy, pandas, scipy
|