@nahisaho/satori 0.11.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -56
- package/package.json +1 -1
- package/src/.github/skills/scientific-biothings-idmapping/SKILL.md +298 -0
- package/src/.github/skills/scientific-cancer-genomics/SKILL.md +287 -0
- package/src/.github/skills/scientific-clinical-reporting/SKILL.md +324 -0
- package/src/.github/skills/scientific-compound-screening/SKILL.md +245 -0
- package/src/.github/skills/scientific-genome-sequence-tools/SKILL.md +304 -0
- package/src/.github/skills/scientific-healthcare-ai/SKILL.md +273 -0
- package/src/.github/skills/scientific-human-protein-atlas/SKILL.md +244 -0
- package/src/.github/skills/scientific-literature-search/SKILL.md +443 -0
- package/src/.github/skills/scientific-metabolic-modeling/SKILL.md +288 -0
- package/src/.github/skills/scientific-metabolomics-databases/SKILL.md +288 -0
- package/src/.github/skills/scientific-molecular-docking/SKILL.md +303 -0
- package/src/.github/skills/scientific-noncoding-rna/SKILL.md +262 -0
- package/src/.github/skills/scientific-pathway-enrichment/SKILL.md +449 -0
- package/src/.github/skills/scientific-pharmacology-targets/SKILL.md +323 -0
- package/src/.github/skills/scientific-protein-domain-family/SKILL.md +369 -0
- package/src/.github/skills/scientific-protein-interaction-network/SKILL.md +352 -0
- package/src/.github/skills/scientific-rare-disease-genetics/SKILL.md +327 -0
- package/src/.github/skills/scientific-structural-proteomics/SKILL.md +317 -0
- package/src/.github/skills/scientific-systematic-review/SKILL.md +361 -0
- package/src/.github/skills/scientific-variant-effect-prediction/SKILL.md +325 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-genome-sequence-tools
|
|
3
|
+
description: |
|
|
4
|
+
ゲノム配列解析総合スキル。Ensembl ゲノムブラウザ、dbSNP 変異データ、
|
|
5
|
+
BLAST 相同性検索、NCBI Nucleotide 配列取得、GDC がんゲノミクスデータの
|
|
6
|
+
統合パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Genome Sequence Tools
|
|
10
|
+
|
|
11
|
+
公的ゲノムデータベース (Ensembl, dbSNP, BLAST, NCBI, GDC) を横断した
|
|
12
|
+
配列検索・変異アノテーション・がんゲノミクスパイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- ゲノム配列・エクソン構造を Ensembl から取得するとき
|
|
17
|
+
- rsID から変異のアレル頻度を調べるとき
|
|
18
|
+
- BLAST で塩基/アミノ酸配列の相同性検索を行うとき
|
|
19
|
+
- NCBI Nucleotide から配列をフェッチするとき
|
|
20
|
+
- GDC がんゲノミクスデータ (体細胞変異, CNV, 発現) を取得するとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. dbSNP 変異情報取得
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_dbsnp_variant(rsid, timeout=30):
    """
    Fetch a dbSNP RefSNP record and tabulate its per-study allele counts.

    Parameters:
        rsid: str — RefSNP identifier, with or without the "rs" prefix
            (e.g. "rs7412" or "7412").
        timeout: float — seconds to wait for the NCBI Variation API
            response (default 30).

    Returns:
        (info, df_freq): ``info`` is a dict with the keys "rsid",
        "create_date" and "update_date"; ``df_freq`` is a pandas DataFrame
        with one row per frequency entry and the columns
        "study", "allele", "count", "total".

    Raises:
        ValueError: if ``rsid`` is not digits optionally prefixed by "rs".
        requests.HTTPError: if the API answers with an error status.

    ToolUniverse:
        dbsnp_get_variant_by_rsid(rsid=rsid)
        dbsnp_get_frequencies(rsid=rsid)
        dbsnp_search_by_gene(gene_symbol=gene_symbol)
    """
    # str.lstrip("rs") removes any leading run of the characters 'r'/'s',
    # not the literal prefix, so strip the prefix explicitly and validate
    # what is left before it is interpolated into the URL path.
    rs_number = str(rsid).strip()
    if rs_number[:2].lower() == "rs":
        rs_number = rs_number[2:]
    if not (rs_number.isascii() and rs_number.isdigit()):
        raise ValueError(f"Invalid rsID: {rsid!r}")

    url = f"https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/{rs_number}"
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    # Extract primary info
    info = {
        "rsid": f"rs{data.get('refsnp_id', '')}",
        "create_date": data.get("create_date", ""),
        "update_date": data.get("update_date", ""),
    }

    # Allele frequencies: one row per (allele annotation, study) entry.
    alleles = (data.get("primary_snapshot_data") or {}).get(
        "allele_annotations"
    ) or []
    freq_data = []
    for allele in alleles:
        for freq_entry in allele.get("frequency") or []:
            # The RefSNP JSON reports the allele under "observation"
            # (deleted/inserted sequence); keep the original "allele" key
            # as the first choice and fall back to the inserted sequence.
            observation = freq_entry.get("observation") or {}
            freq_data.append({
                "study": freq_entry.get("study_name", ""),
                "allele": freq_entry.get("allele")
                or observation.get("inserted_sequence", ""),
                "count": freq_entry.get("allele_count", 0),
                "total": freq_entry.get("total_count", 0),
            })

    # Fix the column set so an empty result still has the expected schema.
    df_freq = pd.DataFrame(
        freq_data, columns=["study", "allele", "count", "total"]
    )
    print(f"dbSNP {info['rsid']}: {len(df_freq)} frequency entries")
    return info, df_freq
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## 2. BLAST 相同性検索
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import time
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def blast_search(sequence, program="blastn", database="nt", max_hits=10,
                 poll_interval=10, max_polls=60, timeout=60):
    """
    Run a homology search through the NCBI BLAST URL API.

    The query is submitted with CMD=Put, the returned request ID (RID) is
    polled with CMD=Get until the job leaves the WAITING state, and the
    final response is returned.

    Parameters:
        sequence: str — query sequence (nucleotide or protein)
        program: str — "blastn", "blastp", "blastx", "tblastn"
        database: str — "nt", "nr", "refseq_rna", etc.
        max_hits: int — value sent as HITLIST_SIZE
        poll_interval: float — seconds to sleep before each status check
        max_polls: int — maximum number of status checks
        timeout: float — per-request HTTP timeout in seconds

    Returns:
        The parsed JSON body when the response Content-Type starts with
        "application/json", otherwise the raw response text.

    Raises:
        ValueError: if the query is empty or no RID is found in the
            submission response.
        RuntimeError: if BLAST reports Status=FAILED or Status=UNKNOWN.
        TimeoutError: if the job is still WAITING after ``max_polls`` checks.
        requests.HTTPError: on an HTTP error status.

    ToolUniverse:
        BLAST_nucleotide_search(sequence=sequence, database=database)
        BLAST_protein_search(sequence=sequence, database=database)
    """
    import re

    query = (sequence or "").strip()
    if not query:
        # Do not spend a server-side job on an empty query.
        raise ValueError("BLAST query sequence is empty")

    put_url = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    params = {
        "CMD": "Put",
        "PROGRAM": program,
        "DATABASE": database,
        "QUERY": query,
        "FORMAT_TYPE": "JSON2",
        "HITLIST_SIZE": max_hits,
    }
    resp = requests.post(put_url, data=params, timeout=timeout)
    resp.raise_for_status()

    # Extract RID
    rid_match = re.search(r"RID = (\S+)", resp.text)
    if not rid_match:
        raise ValueError("BLAST RID not found")
    rid = rid_match.group(1)
    print(f"BLAST submitted: RID={rid}")

    # Poll for results
    for _ in range(max_polls):
        time.sleep(poll_interval)
        check = requests.get(
            put_url,
            params={"CMD": "Get", "RID": rid, "FORMAT_TYPE": "JSON2"},
            timeout=timeout,
        )
        check.raise_for_status()
        body = check.text
        if "Status=WAITING" in body:
            continue
        if "Status=FAILED" in body:
            raise RuntimeError(f"BLAST search failed (RID={rid})")
        if "Status=UNKNOWN" in body:
            raise RuntimeError(f"BLAST RID expired or unknown (RID={rid})")
        break
    else:
        # The loop ran out while the job was still WAITING; returning the
        # status page as if it were a result would mislead the caller.
        raise TimeoutError(
            f"BLAST search not finished after {max_polls} polls (RID={rid})"
        )

    if check.headers.get("Content-Type", "").startswith("application/json"):
        return check.json()
    return check.text
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## 3. NCBI Nucleotide 配列フェッチ
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
def fetch_ncbi_sequence(accession, rettype="fasta", timeout=30):
    """
    Fetch a sequence record from NCBI Nucleotide via E-utilities efetch.

    Parameters:
        accession: str — NCBI accession (e.g., "NM_000546.6")
        rettype: str — "fasta", "gb", "gbwithparts"
        timeout: float — HTTP timeout in seconds (default 30)

    Returns:
        str — the response body as text (retmode is "text").

    Raises:
        ValueError: if ``accession`` is empty.
        requests.HTTPError: if efetch answers with an error status.

    ToolUniverse:
        NCBI_search_nucleotide(query=query)
        NCBI_fetch_accessions(accessions=accessions)
        NCBI_get_sequence(accession=accession)
    """
    accession_id = str(accession).strip() if accession is not None else ""
    if not accession_id:
        raise ValueError("NCBI accession is empty")

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "nucleotide",
        "id": accession_id,
        "rettype": rettype,
        "retmode": "text",
    }
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(url, params=params, timeout=timeout)
    resp.raise_for_status()

    print(f"NCBI Nucleotide '{accession}': {len(resp.text)} chars ({rettype})")
    return resp.text
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## 4. GDC がんゲノミクスデータ
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
def get_gdc_mutations(gene_symbol, project_id=None, size=100, timeout=30):
    """
    Fetch simple somatic mutations (SSMs) for a gene from the NCI GDC API.

    Parameters:
        gene_symbol: str — e.g. "TP53"
        project_id: str | None — e.g. "TCGA-BRCA"; adds a project filter
        size: int — maximum number of SSM hits requested (default 100)
        timeout: float — HTTP timeout in seconds (default 30)

    Returns:
        pandas.DataFrame with one row per (SSM, transcript consequence) and
        the columns "ssm_id", "gene", "aa_change", "consequence_type",
        "genomic_dna_change".

    Raises:
        ValueError: if ``gene_symbol`` is empty.
        requests.HTTPError: if the API answers with an error status.

    ToolUniverse:
        GDC_get_ssm_by_gene(gene_symbol=gene_symbol)
        GDC_get_mutation_frequency(project_id=project_id)
        GDC_get_gene_expression(gene_id=gene_id, project_id=project_id)
        GDC_get_cnv_data(gene_id=gene_id)
        GDC_list_projects()
        GDC_search_cases(filters=filters)
        GDC_list_files(filters=filters)
    """
    import json

    if not gene_symbol:
        raise ValueError("gene_symbol is empty")

    url = "https://api.gdc.cancer.gov/ssms"
    clauses = [
        {"op": "in", "content": {
            "field": "consequence.transcript.gene.symbol",
            "value": [gene_symbol],
        }},
    ]
    if project_id:
        # NOTE(review): confirm "cases.project.project_id" is the right
        # field name for the /ssms endpoint.
        clauses.append({
            "op": "in",
            "content": {
                "field": "cases.project.project_id",
                "value": [project_id],
            },
        })
    filters = {"op": "and", "content": clauses}

    params = {
        "filters": json.dumps(filters),
        "fields": ("ssm_id,consequence.transcript.gene.symbol,"
                   "consequence.transcript.aa_change,"
                   "consequence.transcript.consequence_type,"
                   "genomic_dna_change"),
        "size": size,
        "format": "json",
    }
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(url, params=params, timeout=timeout)
    resp.raise_for_status()
    hits = (resp.json().get("data") or {}).get("hits") or []

    # Flatten: each hit can carry several transcript consequences.
    results = []
    for hit in hits:
        for csq in hit.get("consequence") or []:
            tx = csq.get("transcript") or {}
            results.append({
                "ssm_id": hit.get("ssm_id", ""),
                "gene": (tx.get("gene") or {}).get("symbol", ""),
                "aa_change": tx.get("aa_change", ""),
                "consequence_type": tx.get("consequence_type", ""),
                "genomic_dna_change": hit.get("genomic_dna_change", ""),
            })

    # Fix the column set so an empty result still has the expected schema.
    df = pd.DataFrame(
        results,
        columns=["ssm_id", "gene", "aa_change", "consequence_type",
                 "genomic_dna_change"],
    )
    print(f"GDC SSMs '{gene_symbol}'"
          f"{f' ({project_id})' if project_id else ''}: {len(df)} mutations")
    return df
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## 5. 統合ゲノム変異パイプライン
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
def integrated_variant_pipeline(rsid, gene_symbol=None):
    """
    Combine a dbSNP lookup with an optional GDC somatic-mutation summary.

    Parameters:
        rsid: str — RefSNP identifier passed to get_dbsnp_variant
        gene_symbol: str | None — when given, GDC SSMs for this gene are
            fetched and summarized

    Returns:
        dict with "rsid" and "dbsnp"; when ``gene_symbol`` is truthy also
        "gdc_mutation_count" and "gdc_top_consequences" (the five most
        frequent consequence types, or {} when no mutations were found).

    ToolUniverse (cross-tool):
        dbsnp_get_variant_by_rsid(rsid) → GDC_get_ssm_by_gene(gene_symbol)
    """
    # Step 1: dbSNP (the frequency table is fetched but not used here)
    variant_info, _frequencies = get_dbsnp_variant(rsid)
    summary = {"rsid": rsid, "dbsnp": variant_info}

    # Step 2: GDC somatic mutations (if gene provided)
    if gene_symbol:
        mutations = get_gdc_mutations(gene_symbol)
        summary["gdc_mutation_count"] = len(mutations)
        if mutations.empty:
            top_consequences = {}
        else:
            consequence_counts = mutations["consequence_type"].value_counts()
            top_consequences = consequence_counts.head(5).to_dict()
        summary["gdc_top_consequences"] = top_consequences

    gdc_label = summary.get('gdc_mutation_count', 'N/A')
    print(f"Integrated variant: {rsid}"
          f" | GDC={gdc_label}")
    return summary
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## References
|
|
261
|
+
|
|
262
|
+
### Output Files
|
|
263
|
+
|
|
264
|
+
| ファイル | 形式 |
|
|
265
|
+
|---|---|
|
|
266
|
+
| `results/dbsnp_variant.json` | JSON |
|
|
267
|
+
| `results/dbsnp_frequencies.csv` | CSV |
|
|
268
|
+
| `results/blast_results.json` | JSON |
|
|
269
|
+
| `results/ncbi_sequence.fasta` | FASTA |
|
|
270
|
+
| `results/gdc_mutations.csv` | CSV |
|
|
271
|
+
|
|
272
|
+
### 利用可能ツール
|
|
273
|
+
|
|
274
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
275
|
+
|---|---|---|
|
|
276
|
+
| dbSNP | `dbsnp_get_variant_by_rsid` | rsID 変異情報 |
|
|
277
|
+
| dbSNP | `dbsnp_get_frequencies` | アレル頻度 |
|
|
278
|
+
| dbSNP | `dbsnp_search_by_gene` | 遺伝子→変異 |
|
|
279
|
+
| BLAST | `BLAST_nucleotide_search` | 核酸相同性検索 |
|
|
280
|
+
| BLAST | `BLAST_protein_search` | タンパク質相同性検索 |
|
|
281
|
+
| NCBI | `NCBI_search_nucleotide` | 配列検索 |
|
|
282
|
+
| NCBI | `NCBI_fetch_accessions` | アクセッション取得 |
|
|
283
|
+
| NCBI | `NCBI_get_sequence` | 配列フェッチ |
|
|
284
|
+
| GDC | `GDC_get_ssm_by_gene` | 体細胞変異 |
|
|
285
|
+
| GDC | `GDC_get_mutation_frequency` | 変異頻度 |
|
|
286
|
+
| GDC | `GDC_get_gene_expression` | 発現データ |
|
|
287
|
+
| GDC | `GDC_get_cnv_data` | CNV データ |
|
|
288
|
+
| GDC | `GDC_list_projects` | プロジェクト一覧 |
|
|
289
|
+
| GDC | `GDC_search_cases` | 症例検索 |
|
|
290
|
+
| GDC | `GDC_list_files` | ファイル一覧 |
|
|
291
|
+
|
|
292
|
+
### 参照スキル
|
|
293
|
+
|
|
294
|
+
| スキル | 関連 |
|
|
295
|
+
|---|---|
|
|
296
|
+
| `scientific-variant-interpretation` | 変異アノテーション |
|
|
297
|
+
| `scientific-population-genetics` | 集団遺伝学 |
|
|
298
|
+
| `scientific-cancer-genomics` | がんゲノミクス |
|
|
299
|
+
| `scientific-rare-disease-genetics` | 希少疾患遺伝学 |
|
|
300
|
+
| `scientific-biothings-idmapping` | ID マッピング |
|
|
301
|
+
|
|
302
|
+
### 依存パッケージ
|
|
303
|
+
|
|
304
|
+
`requests`, `pandas`
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-healthcare-ai
|
|
3
|
+
description: |
|
|
4
|
+
ヘルスケア AI スキル。PyHealth 臨床 ML パイプライン、
|
|
5
|
+
フローサイトメトリー (FlowIO) 解析、電子健康記録 (EHR) 処理、
|
|
6
|
+
臨床予測モデル構築のガイダンス。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Healthcare AI
|
|
10
|
+
|
|
11
|
+
臨床データ解析・ヘルスケア機械学習パイプラインを提供する。
|
|
12
|
+
PyHealth フレームワーク、フローサイトメトリー解析ツールを活用。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 臨床予測モデル (再入院予測, 死亡率予測等) を構築するとき
|
|
17
|
+
- EHR (電子健康記録) データの前処理・特徴量エンジニアリングを行うとき
|
|
18
|
+
- フローサイトメトリー (FACS) データを読み込み・解析するとき
|
|
19
|
+
- 臨床タスク向けの ML パイプラインを設計するとき
|
|
20
|
+
- 医療コード (ICD-10, SNOMED, ATC) のマッピングを行うとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. PyHealth 臨床予測パイプライン
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
"""
|
|
30
|
+
PyHealth による臨床予測モデル構築。
|
|
31
|
+
pip install pyhealth
|
|
32
|
+
|
|
33
|
+
K-Dense-AI 参照: pyhealth — 臨床 ML フレームワーク
|
|
34
|
+
"""
|
|
35
|
+
from pyhealth.datasets import MIMIC3Dataset
|
|
36
|
+
from pyhealth.tasks import readmission_prediction_mimic3_fn
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def build_clinical_pipeline(
    mimic3_root,
    tables=("DIAGNOSES_ICD", "PROCEDURES_ICD", "PRESCRIPTIONS"),
    code_mapping=None,
):
    """
    Load a MIMIC-III dataset for a clinical prediction pipeline.

    Parameters:
        mimic3_root: str — path to the MIMIC-III CSV directory
        tables: tuple — tables to load
        code_mapping: dict | None — code-mapping configuration; when None,
            NDC→ATC (level 3), ICD9CM→CCSCM and ICD9PROC→CCSPROC are used

    Returns:
        The constructed MIMIC3Dataset.
    """
    # Step 1: Dataset loading
    mapping = code_mapping
    if mapping is None:
        mapping = {
            "NDC": ("ATC", {"target_kwargs": {"level": 3}}),
            "ICD9CM": "CCSCM",
            "ICD9PROC": "CCSPROC",
        }

    mimic3 = MIMIC3Dataset(root=mimic3_root, tables=tables, code_mapping=mapping)

    patient_count = len(mimic3.patients)
    print(f"MIMIC-III dataset: {patient_count} patients")
    return mimic3
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def apply_clinical_task(dataset, task_fn=None):
    """
    Apply a clinical task function to a dataset and split the samples.

    Parameters:
        dataset: dataset object exposing ``set_task``
        task_fn: callable | None — task function; defaults to
            readmission_prediction_mimic3_fn

    Returns:
        (train, val, test) as produced by split_by_patient with the
        ratios 0.8 / 0.1 / 0.1.
    """
    from pyhealth.datasets import split_by_patient

    chosen_task = readmission_prediction_mimic3_fn if task_fn is None else task_fn

    task_samples = dataset.set_task(chosen_task)
    train, val, test = split_by_patient(task_samples, [0.8, 0.1, 0.1])

    n_train, n_val, n_test = len(train), len(val), len(test)
    print(f"Clinical task samples: "
          f"train={n_train}, val={n_val}, test={n_test}")
    return train, val, test
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## 2. PyHealth モデル学習
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
def train_clinical_model(
    train_dataset,
    val_dataset,
    model_type="Transformer",
    epochs=20,
    batch_size=64,
):
    """
    Train a PyHealth model for a binary clinical prediction task.

    Parameters:
        train_dataset: SampleDataset — training samples (also used to
            construct the model)
        val_dataset: SampleDataset or DataLoader — validation samples
        model_type: str — "Transformer", "RETAIN", "GRU", "CNN"
        epochs: int — number of training epochs
        batch_size: int — batch size for the dataloaders built here

    Returns:
        (model, trainer) — the trained model and its Trainer.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    # Requested name -> (class name in pyhealth.models, extra kwargs).
    # "GRU" is the RNN model with rnn_type="GRU".
    supported = {
        "Transformer": ("Transformer", {}),
        "RETAIN": ("RETAIN", {}),
        "GRU": ("RNN", {"rnn_type": "GRU"}),
        "CNN": ("CNN", {}),
    }
    # Fail loudly instead of silently training a Transformer while the
    # log line below reports the requested model name.
    if model_type not in supported:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            f"expected one of {sorted(supported)}"
        )
    class_name, extra_kwargs = supported[model_type]

    import pyhealth.models as pyhealth_models
    from pyhealth.datasets import get_dataloader
    from pyhealth.trainer import Trainer
    from torch.utils.data import DataLoader

    def _as_loader(data, shuffle):
        # Trainer expects dataloaders, not raw sample datasets.
        if isinstance(data, DataLoader):
            return data
        return get_dataloader(data, batch_size=batch_size, shuffle=shuffle)

    ModelClass = getattr(pyhealth_models, class_name)
    model = ModelClass(
        dataset=train_dataset,
        feature_keys=["conditions", "procedures", "drugs"],
        label_key="label",
        mode="binary",
        **extra_kwargs,
    )

    trainer = Trainer(model=model)
    trainer.train(
        train_dataloader=_as_loader(train_dataset, shuffle=True),
        val_dataloader=_as_loader(val_dataset, shuffle=False),
        epochs=epochs,
        monitor="pr_auc",
    )

    print(f"Clinical model ({model_type}): trained for {epochs} epochs")
    return model, trainer
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## 3. フローサイトメトリー解析
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
def read_fcs_file(fcs_path):
    """
    Read an FCS file and return its events as a DataFrame.
    pip install flowio

    K-Dense-AI reference: flowio — FCS file I/O

    Parameters:
        fcs_path: str — path to the FCS file

    Returns:
        (df, fcs_data): ``df`` has one row per event and one column per
        channel; ``fcs_data`` is the flowio.FlowData object.
    """
    import flowio
    import numpy as np
    import pandas as pd

    fcs_data = flowio.FlowData(fcs_path)
    channel_count = fcs_data.channel_count

    # NOTE(review): the original looked names up as channels["P{i}N"].
    # FlowIO appears to key `channels` by channel number with a "PnN"
    # entry, and to expose the raw TEXT segment (lower-cased keys such as
    # "p1n") as `text` — confirm against the installed FlowIO version.
    # The original lookup is tried first, then these fallbacks.
    channel_meta = getattr(fcs_data, "channels", None)
    if not isinstance(channel_meta, dict):
        channel_meta = {}
    text_segment = getattr(fcs_data, "text", None)
    if not isinstance(text_segment, dict):
        text_segment = {}

    # Extract channel names
    channels = []
    for i in range(1, channel_count + 1):
        name = channel_meta.get(f"P{i}N")
        if not name:
            per_channel = channel_meta.get(i) or channel_meta.get(str(i))
            if isinstance(per_channel, dict):
                name = per_channel.get("PnN")
        if not name:
            name = text_segment.get(f"p{i}n") or text_segment.get(f"P{i}N")
        channels.append(name or f"Channel_{i}")

    # Convert to DataFrame: events are a flat sequence, one value per
    # channel per event, so reshape to (n_events, n_channels).
    events = np.array(fcs_data.events).reshape(-1, channel_count)
    df = pd.DataFrame(events, columns=channels)

    print(f"FCS '{fcs_path}': {len(df)} events x {len(channels)} channels")
    return df, fcs_data
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def gate_fcs_data(df, channel, low=None, high=None):
    """
    Apply a simple one-dimensional (rectangular) gate to FCS event data.

    Parameters:
        df: pd.DataFrame — FCS data
        channel: str — channel (column) name
        low: float | None — inclusive lower bound; None disables it
        high: float | None — inclusive upper bound; None disables it

    Returns:
        pd.DataFrame — the rows of ``df`` inside the gate, keeping their
        original index so gates can be chained.
    """
    import pandas as pd

    values = df[channel]
    # Build the mask on df's own index: a fresh RangeIndex would misalign
    # with frames that were already filtered (e.g. by a previous gate).
    mask = pd.Series(True, index=df.index, dtype=bool)
    if low is not None:
        mask &= values >= low
    if high is not None:
        mask &= values <= high

    # Positional boolean indexing avoids any index re-alignment.
    gated = df[mask.to_numpy()]
    # Guard the percentage against an empty input frame.
    pct = len(gated) / len(df) * 100 if len(df) else 0.0
    print(f"Gate '{channel}' [{low},{high}]: "
          f"{len(gated)}/{len(df)} events ({pct:.1f}%)")
    return gated
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## 4. 医療コードマッピング
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
def map_medical_codes(codes, source_system, target_system):
    """
    Map medical codes from one coding system to another.

    Parameters:
        codes: list[str] — source codes
        source_system: str — "ICD9CM", "ICD10CM", "NDC", "ATC", "SNOMED"
        target_system: str — target coding system

    Returns:
        dict mapping each source code to the mapper's result for it, or
        an empty dict when pyhealth.medcode cannot be imported.
    """
    try:
        from pyhealth.medcode import CrossMap

        crossmap = CrossMap(source_system, target_system)
        translated = {code: crossmap.map(code) for code in codes}

        hits = len([target for target in translated.values() if target])
        print(f"Code mapping {source_system}→{target_system}: "
              f"{hits}/{len(codes)} mapped")
    except ImportError:
        print("pyhealth.medcode not available; install pyhealth")
        return {}
    return translated
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## 5. 臨床モデル評価
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
def evaluate_clinical_model(trainer, test_dataset, batch_size=64):
    """
    Evaluate a trained clinical prediction model on held-out data.

    Parameters:
        trainer: Trainer — trained Trainer exposing ``evaluate``
        test_dataset: SampleDataset or DataLoader — test data; a dataset
            is wrapped in a non-shuffling dataloader, an existing
            DataLoader is used unchanged
        batch_size: int — batch size used when a dataloader is built here

    Returns:
        dict — the metrics returned by ``trainer.evaluate``.
    """
    from torch.utils.data import DataLoader

    if isinstance(test_dataset, DataLoader):
        test_loader = test_dataset
    else:
        # Trainer.evaluate expects a dataloader, not a raw sample dataset.
        from pyhealth.datasets import get_dataloader

        test_loader = get_dataloader(
            test_dataset, batch_size=batch_size, shuffle=False
        )

    metrics = trainer.evaluate(test_loader)

    print("Clinical model evaluation:")
    for metric_name, value in metrics.items():
        print(f" {metric_name}: {value:.4f}")

    return metrics
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## References
|
|
241
|
+
|
|
242
|
+
### Output Files
|
|
243
|
+
|
|
244
|
+
| ファイル | 形式 |
|
|
245
|
+
|---|---|
|
|
246
|
+
| `results/clinical_predictions.csv` | CSV |
|
|
247
|
+
| `results/clinical_metrics.json` | JSON |
|
|
248
|
+
| `results/fcs_processed.csv` | CSV |
|
|
249
|
+
| `results/code_mapping.json` | JSON |
|
|
250
|
+
|
|
251
|
+
### 利用可能ツール
|
|
252
|
+
|
|
253
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
254
|
+
|---|---|---|
|
|
255
|
+
| (K-Dense) | `pyhealth` | 臨床 ML フレームワーク |
|
|
256
|
+
| (K-Dense) | `flowio` | FCS ファイル I/O |
|
|
257
|
+
|
|
258
|
+
> **注**: 本スキルは ToolUniverse ツールを持たず、
|
|
259
|
+
> K-Dense-AI Scientific Skills からの参照のみ。
|
|
260
|
+
|
|
261
|
+
### 参照スキル
|
|
262
|
+
|
|
263
|
+
| スキル | 関連 |
|
|
264
|
+
|---|---|
|
|
265
|
+
| `scientific-clinical-nlp` | 臨床 NLP |
|
|
266
|
+
| `scientific-biostatistics-survival` | 生存時間解析 |
|
|
267
|
+
| `scientific-single-cell-rnaseq` | 単一細胞解析 |
|
|
268
|
+
| `scientific-machine-learning-omics` | ML x オミクス |
|
|
269
|
+
| `scientific-biothings-idmapping` | ID マッピング |
|
|
270
|
+
|
|
271
|
+
### 依存パッケージ
|
|
272
|
+
|
|
273
|
+
`pyhealth`, `flowio`, `numpy`, `pandas`, `scikit-learn`
|