@nahisaho/satori 0.11.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. package/README.md +125 -56
  2. package/package.json +1 -1
  3. package/src/.github/skills/scientific-biothings-idmapping/SKILL.md +298 -0
  4. package/src/.github/skills/scientific-cancer-genomics/SKILL.md +287 -0
  5. package/src/.github/skills/scientific-clinical-reporting/SKILL.md +324 -0
  6. package/src/.github/skills/scientific-compound-screening/SKILL.md +245 -0
  7. package/src/.github/skills/scientific-genome-sequence-tools/SKILL.md +304 -0
  8. package/src/.github/skills/scientific-healthcare-ai/SKILL.md +273 -0
  9. package/src/.github/skills/scientific-human-protein-atlas/SKILL.md +244 -0
  10. package/src/.github/skills/scientific-literature-search/SKILL.md +443 -0
  11. package/src/.github/skills/scientific-metabolic-modeling/SKILL.md +288 -0
  12. package/src/.github/skills/scientific-metabolomics-databases/SKILL.md +288 -0
  13. package/src/.github/skills/scientific-molecular-docking/SKILL.md +303 -0
  14. package/src/.github/skills/scientific-noncoding-rna/SKILL.md +262 -0
  15. package/src/.github/skills/scientific-pathway-enrichment/SKILL.md +449 -0
  16. package/src/.github/skills/scientific-pharmacology-targets/SKILL.md +323 -0
  17. package/src/.github/skills/scientific-protein-domain-family/SKILL.md +369 -0
  18. package/src/.github/skills/scientific-protein-interaction-network/SKILL.md +352 -0
  19. package/src/.github/skills/scientific-rare-disease-genetics/SKILL.md +327 -0
  20. package/src/.github/skills/scientific-structural-proteomics/SKILL.md +317 -0
  21. package/src/.github/skills/scientific-systematic-review/SKILL.md +361 -0
  22. package/src/.github/skills/scientific-variant-effect-prediction/SKILL.md +325 -0
@@ -0,0 +1,304 @@
1
+ ---
2
+ name: scientific-genome-sequence-tools
3
+ description: |
4
+ ゲノム配列解析総合スキル。Ensembl ゲノムブラウザ、dbSNP 変異データ、
5
+ BLAST 相同性検索、NCBI Nucleotide 配列取得、GDC がんゲノミクスデータの
6
+ 統合パイプライン。
7
+ ---
8
+
9
+ # Scientific Genome Sequence Tools
10
+
11
+ 公的ゲノムデータベース (Ensembl, dbSNP, BLAST, NCBI, GDC) を横断した
12
+ 配列検索・変異アノテーション・がんゲノミクスパイプラインを提供する。
13
+
14
+ ## When to Use
15
+
16
+ - ゲノム配列・エクソン構造を Ensembl から取得するとき
17
+ - rsID から変異のアレル頻度を調べるとき
18
+ - BLAST で塩基/アミノ酸配列の相同性検索を行うとき
19
+ - NCBI Nucleotide から配列をフェッチするとき
20
+ - GDC がんゲノミクスデータ (体細胞変異, CNV, 発現) を取得するとき
21
+
22
+ ---
23
+
24
+ ## Quick Start
25
+
26
+ ## 1. dbSNP 変異情報取得
27
+
28
+ ```python
29
+ import requests
30
+ import pandas as pd
31
+
32
+
33
def get_dbsnp_variant(rsid, timeout=30):
    """
    Fetch rsID-based variant information (including allele frequencies) from dbSNP.

    Parameters:
        rsid: str — e.g. "rs7412"; a bare numeric ID ("7412") or an
            upper-case prefix ("RS7412") is accepted as well
        timeout: float — seconds to wait for the NCBI API before giving up

    Returns:
        (info, df_freq) — `info` is a dict with "rsid", "create_date" and
        "update_date"; `df_freq` is a DataFrame with the columns "study",
        "allele", "count", "total" (one row per frequency record, empty but
        with the same columns when no record is reported)

    Raises:
        requests.HTTPError: if the API answers with an error status

    ToolUniverse:
        dbsnp_get_variant_by_rsid(rsid=rsid)
        dbsnp_get_frequencies(rsid=rsid)
        dbsnp_search_by_gene(gene_symbol=gene_symbol)
    """
    # Remove the "rs" prefix explicitly: str.lstrip("rs") strips a character
    # set rather than a prefix and does not recognise "RS...".
    refsnp_id = str(rsid).strip()
    if refsnp_id[:2].lower() == "rs":
        refsnp_id = refsnp_id[2:]

    url = f"https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/{refsnp_id}"
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    # Extract primary info
    info = {
        "rsid": f"rs{data.get('refsnp_id', '')}",
        "create_date": data.get("create_date", ""),
        "update_date": data.get("update_date", ""),
    }

    # Allele frequencies; `or` also covers keys that are present but null.
    snapshot = data.get("primary_snapshot_data") or {}
    freq_data = []
    for annotation in snapshot.get("allele_annotations") or []:
        for freq_entry in annotation.get("frequency") or []:
            # NOTE(review): refsnp frequency records appear to carry the
            # observed allele under observation.inserted_sequence rather than
            # a top-level "allele" key — confirm against the API schema. The
            # original key is still honoured when present.
            observation = freq_entry.get("observation") or {}
            allele = (freq_entry.get("allele")
                      or observation.get("inserted_sequence", ""))
            freq_data.append({
                "study": freq_entry.get("study_name", ""),
                "allele": allele,
                "count": freq_entry.get("allele_count", 0),
                "total": freq_entry.get("total_count", 0),
            })

    # Fix the column set so an empty result still has the expected schema.
    df_freq = pd.DataFrame(
        freq_data, columns=["study", "allele", "count", "total"]
    )
    print(f"dbSNP {info['rsid']}: {len(df_freq)} frequency entries")
    return info, df_freq
74
+ ```
75
+
76
+ ## 2. BLAST 相同性検索
77
+
78
+ ```python
79
+ import time
80
+
81
+
82
def blast_search(sequence, program="blastn", database="nt", max_hits=10,
                 poll_interval=10, max_polls=60, timeout=60):
    """
    Run a homology search through the NCBI BLAST REST (URL) API.

    Parameters:
        sequence: str — query sequence (nucleotide or protein)
        program: str — "blastn", "blastp", "blastx", "tblastn"
        database: str — "nt", "nr", "refseq_rna", etc.
        max_hits: int — value sent as HITLIST_SIZE
        poll_interval: float — seconds to sleep between status checks
        max_polls: int — maximum number of status checks before giving up
        timeout: float — per-request timeout in seconds

    Returns:
        The parsed JSON body when the server answers with an
        "application/json" content type, otherwise the raw response text.

    Raises:
        ValueError: if no RID can be found in the submission response
        RuntimeError: if the server reports Status=FAILED or Status=UNKNOWN
        TimeoutError: if the search is still waiting after `max_polls` checks
        requests.HTTPError: if any request answers with an error status

    ToolUniverse:
        BLAST_nucleotide_search(sequence=sequence, database=database)
        BLAST_protein_search(sequence=sequence, database=database)
    """
    import re

    blast_url = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    submission = requests.post(
        blast_url,
        data={
            "CMD": "Put",
            "PROGRAM": program,
            "DATABASE": database,
            "QUERY": sequence,
            "FORMAT_TYPE": "JSON2",
            "HITLIST_SIZE": max_hits,
        },
        timeout=timeout,
    )
    submission.raise_for_status()

    # Extract RID
    rid_match = re.search(r"RID = (\S+)", submission.text)
    if not rid_match:
        raise ValueError("BLAST RID not found")
    rid = rid_match.group(1)
    print(f"BLAST submitted: RID={rid}")

    # Poll for results
    for _ in range(max_polls):
        time.sleep(poll_interval)
        check = requests.get(
            blast_url,
            params={"CMD": "Get", "RID": rid, "FORMAT_TYPE": "JSON2"},
            timeout=timeout,
        )
        check.raise_for_status()

        content_type = check.headers.get("Content-Type", "")
        if content_type.startswith("application/json"):
            return check.json()

        body = check.text
        # A failed or expired search would otherwise be returned to the
        # caller as if it were a result page.
        if "Status=FAILED" in body or "Status=UNKNOWN" in body:
            raise RuntimeError(f"BLAST search {rid} failed or expired")
        if "Status=WAITING" not in body:
            return body

    # Polling budget exhausted while the search was still queued or running.
    raise TimeoutError(
        f"BLAST search {rid} not finished after {max_polls} status checks"
    )
127
+ ```
128
+
129
+ ## 3. NCBI Nucleotide 配列フェッチ
130
+
131
+ ```python
132
def fetch_ncbi_sequence(accession, rettype="fasta", timeout=30):
    """
    Fetch a sequence record from NCBI Nucleotide via E-utilities (efetch).

    Parameters:
        accession: str — NCBI accession (e.g., "NM_000546.6")
        rettype: str — "fasta", "gb", "gbwithparts"
        timeout: float — seconds to wait for the server before giving up

    Returns:
        str — the response body as text

    Raises:
        requests.HTTPError: if efetch answers with an error status

    ToolUniverse:
        NCBI_search_nucleotide(query=query)
        NCBI_fetch_accessions(accessions=accessions)
        NCBI_get_sequence(accession=accession)
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "nucleotide",
        "id": accession,
        "rettype": rettype,
        "retmode": "text",
    }
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, params=params, timeout=timeout)
    resp.raise_for_status()

    record_text = resp.text
    print(f"NCBI Nucleotide '{accession}': {len(record_text)} chars ({rettype})")
    return record_text
157
+ ```
158
+
159
+ ## 4. GDC がんゲノミクスデータ
160
+
161
+ ```python
162
def get_gdc_mutations(gene_symbol, project_id=None, size=100, timeout=30):
    """
    Fetch simple somatic mutation (SSM) records from the NCI GDC
    (Genomic Data Commons) API.

    Parameters:
        gene_symbol: str — e.g. "TP53"
        project_id: str | None — e.g. "TCGA-BRCA"; adds a project filter
        size: int — maximum number of SSM hits requested (sent as "size")
        timeout: float — seconds to wait for the server before giving up

    Returns:
        pd.DataFrame with the columns "ssm_id", "gene", "aa_change",
        "consequence_type", "genomic_dna_change"; one row per
        (hit, consequence) pair, empty but with the same columns when the
        query has no hits.

    Raises:
        requests.HTTPError: if the API answers with an error status

    ToolUniverse:
        GDC_get_ssm_by_gene(gene_symbol=gene_symbol)
        GDC_get_mutation_frequency(project_id=project_id)
        GDC_get_gene_expression(gene_id=gene_id, project_id=project_id)
        GDC_get_cnv_data(gene_id=gene_id)
        GDC_list_projects()
        GDC_search_cases(filters=filters)
        GDC_list_files(filters=filters)
    """
    import json

    url = "https://api.gdc.cancer.gov/ssms"
    clauses = [
        {"op": "in", "content": {
            "field": "consequence.transcript.gene.symbol",
            "value": [gene_symbol],
        }},
    ]
    if project_id:
        # NOTE(review): verify that "cases.project.project_id" is the field
        # name the /ssms endpoint expects for project filtering.
        clauses.append({
            "op": "in",
            "content": {
                "field": "cases.project.project_id",
                "value": [project_id],
            },
        })
    filters = {"op": "and", "content": clauses}

    params = {
        "filters": json.dumps(filters),
        "fields": ("ssm_id,consequence.transcript.gene.symbol,"
                   "consequence.transcript.aa_change,"
                   "consequence.transcript.consequence_type,"
                   "genomic_dna_change"),
        "size": size,
        "format": "json",
    }
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, params=params, timeout=timeout)
    resp.raise_for_status()
    payload = resp.json().get("data") or {}
    hits = payload.get("hits") or []

    # Flatten: one output row per consequence of each hit.
    results = []
    for hit in hits:
        for csq in hit.get("consequence") or []:
            tx = csq.get("transcript") or {}
            results.append({
                "ssm_id": hit.get("ssm_id", ""),
                "gene": (tx.get("gene") or {}).get("symbol", ""),
                "aa_change": tx.get("aa_change", ""),
                "consequence_type": tx.get("consequence_type", ""),
                "genomic_dna_change": hit.get("genomic_dna_change", ""),
            })

    # Fix the column set so an empty result still has the expected schema.
    df = pd.DataFrame(
        results,
        columns=["ssm_id", "gene", "aa_change",
                 "consequence_type", "genomic_dna_change"],
    )
    scope = f" ({project_id})" if project_id else ""
    print(f"GDC SSMs '{gene_symbol}'{scope}: {len(df)} mutations")
    return df
228
+ ```
229
+
230
+ ## 5. 統合ゲノム変異パイプライン
231
+
232
+ ```python
233
def integrated_variant_pipeline(rsid, gene_symbol=None):
    """
    Variant analysis pipeline that combines dbSNP and GDC lookups.

    Parameters:
        rsid: str — dbSNP identifier passed to get_dbsnp_variant
        gene_symbol: str | None — when given, GDC somatic mutations for this
            gene are fetched with get_gdc_mutations

    Returns:
        dict with "rsid", "dbsnp" (the info dict) and
        "dbsnp_frequency_count" (number of frequency rows); when
        `gene_symbol` is given also "gdc_mutation_count" and
        "gdc_top_consequences" (the five most frequent consequence types
        with their counts).

    ToolUniverse (cross-database):
        dbsnp_get_variant_by_rsid(rsid) → GDC_get_ssm_by_gene(gene_symbol)
    """
    pipeline_result = {"rsid": rsid}

    # Step 1: dbSNP
    info, freq_df = get_dbsnp_variant(rsid)
    pipeline_result["dbsnp"] = info
    # The frequency table used to be fetched and then dropped; report at
    # least its size so the result reflects what was retrieved.
    pipeline_result["dbsnp_frequency_count"] = len(freq_df)

    # Step 2: GDC somatic mutations (if gene provided)
    if gene_symbol:
        gdc_df = get_gdc_mutations(gene_symbol)
        pipeline_result["gdc_mutation_count"] = len(gdc_df)
        if gdc_df.empty:
            # An empty frame may lack the "consequence_type" column.
            top_consequences = {}
        else:
            counts = gdc_df["consequence_type"].value_counts()
            top_consequences = counts.head(5).to_dict()
        pipeline_result["gdc_top_consequences"] = top_consequences

    print(f"Integrated variant: {rsid}"
          f" | GDC={pipeline_result.get('gdc_mutation_count', 'N/A')}")
    return pipeline_result
258
+ ```
259
+
260
+ ## References
261
+
262
+ ### Output Files
263
+
264
+ | ファイル | 形式 |
265
+ |---|---|
266
+ | `results/dbsnp_variant.json` | JSON |
267
+ | `results/dbsnp_frequencies.csv` | CSV |
268
+ | `results/blast_results.json` | JSON |
269
+ | `results/ncbi_sequence.fasta` | FASTA |
270
+ | `results/gdc_mutations.csv` | CSV |
271
+
272
+ ### 利用可能ツール
273
+
274
+ | カテゴリ | 主要ツール | 用途 |
275
+ |---|---|---|
276
+ | dbSNP | `dbsnp_get_variant_by_rsid` | rsID 変異情報 |
277
+ | dbSNP | `dbsnp_get_frequencies` | アレル頻度 |
278
+ | dbSNP | `dbsnp_search_by_gene` | 遺伝子→変異 |
279
+ | BLAST | `BLAST_nucleotide_search` | 核酸相同性検索 |
280
+ | BLAST | `BLAST_protein_search` | タンパク質相同性検索 |
281
+ | NCBI | `NCBI_search_nucleotide` | 配列検索 |
282
+ | NCBI | `NCBI_fetch_accessions` | アクセッション取得 |
283
+ | NCBI | `NCBI_get_sequence` | 配列フェッチ |
284
+ | GDC | `GDC_get_ssm_by_gene` | 体細胞変異 |
285
+ | GDC | `GDC_get_mutation_frequency` | 変異頻度 |
286
+ | GDC | `GDC_get_gene_expression` | 発現データ |
287
+ | GDC | `GDC_get_cnv_data` | CNV データ |
288
+ | GDC | `GDC_list_projects` | プロジェクト一覧 |
289
+ | GDC | `GDC_search_cases` | 症例検索 |
290
+ | GDC | `GDC_list_files` | ファイル一覧 |
291
+
292
+ ### 参照スキル
293
+
294
+ | スキル | 関連 |
295
+ |---|---|
296
+ | `scientific-variant-interpretation` | 変異アノテーション |
297
+ | `scientific-population-genetics` | 集団遺伝学 |
298
+ | `scientific-cancer-genomics` | がんゲノミクス |
299
+ | `scientific-rare-disease-genetics` | 希少疾患遺伝学 |
300
+ | `scientific-biothings-idmapping` | ID マッピング |
301
+
302
+ ### 依存パッケージ
303
+
304
+ `requests`, `pandas`
@@ -0,0 +1,273 @@
1
+ ---
2
+ name: scientific-healthcare-ai
3
+ description: |
4
+ ヘルスケア AI スキル。PyHealth 臨床 ML パイプライン、
5
+ フローサイトメトリー (FlowIO) 解析、電子健康記録 (EHR) 処理、
6
+ 臨床予測モデル構築のガイダンス。
7
+ ---
8
+
9
+ # Scientific Healthcare AI
10
+
11
+ 臨床データ解析・ヘルスケア機械学習パイプラインを提供する。
12
+ PyHealth フレームワーク、フローサイトメトリー解析ツールを活用。
13
+
14
+ ## When to Use
15
+
16
+ - 臨床予測モデル (再入院予測, 死亡率予測等) を構築するとき
17
+ - EHR (電子健康記録) データの前処理・特徴量エンジニアリングを行うとき
18
+ - フローサイトメトリー (FACS) データを読み込み・解析するとき
19
+ - 臨床タスク向けの ML パイプラインを設計するとき
20
+ - 医療コード (ICD-10, SNOMED, ATC) のマッピングを行うとき
21
+
22
+ ---
23
+
24
+ ## Quick Start
25
+
26
+ ## 1. PyHealth 臨床予測パイプライン
27
+
28
+ ```python
29
+ """
30
+ PyHealth による臨床予測モデル構築。
31
+ pip install pyhealth
32
+
33
+ K-Dense-AI 参照: pyhealth — 臨床 ML フレームワーク
34
+ """
35
+ from pyhealth.datasets import MIMIC3Dataset
36
+ from pyhealth.tasks import readmission_prediction_mimic3_fn
37
+
38
+
39
def build_clinical_pipeline(
    mimic3_root,
    tables=("DIAGNOSES_ICD", "PROCEDURES_ICD", "PRESCRIPTIONS"),
    code_mapping=None,
):
    """
    Construct a PyHealth MIMIC3Dataset for a clinical prediction pipeline.

    Parameters:
        mimic3_root: str — path of the directory holding the MIMIC-III CSVs
        tables: tuple — names of the tables to load
        code_mapping: dict | None — code mapping configuration; when None a
            default mapping (NDC→ATC level 3, ICD9CM→CCSCM,
            ICD9PROC→CCSPROC) is used

    Returns:
        The MIMIC3Dataset instance.
    """
    # Step 1: Dataset loading
    effective_mapping = code_mapping
    if effective_mapping is None:
        atc_level3 = ("ATC", {"target_kwargs": {"level": 3}})
        effective_mapping = {
            "NDC": atc_level3,
            "ICD9CM": "CCSCM",
            "ICD9PROC": "CCSPROC",
        }

    dataset_kwargs = {
        "root": mimic3_root,
        "tables": tables,
        "code_mapping": effective_mapping,
    }
    dataset = MIMIC3Dataset(**dataset_kwargs)

    patient_count = len(dataset.patients)
    print(f"MIMIC-III dataset: {patient_count} patients")
    return dataset
68
+
69
+
70
def apply_clinical_task(dataset, task_fn=None):
    """
    Apply a clinical task function to a dataset and split the samples.

    Parameters:
        dataset: object exposing set_task(task_fn)
        task_fn: callable | None — task function; defaults to
            readmission_prediction_mimic3_fn when None

    Returns:
        (train, val, test) — the result of split_by_patient with ratios
        0.8 / 0.1 / 0.1.
    """
    from pyhealth.datasets import split_by_patient

    chosen_task = readmission_prediction_mimic3_fn if task_fn is None else task_fn
    split_ratios = [0.8, 0.1, 0.1]

    task_samples = dataset.set_task(chosen_task)
    train, val, test = split_by_patient(task_samples, split_ratios)

    summary = f"train={len(train)}, val={len(val)}, test={len(test)}"
    print(f"Clinical task samples: {summary}")
    return train, val, test
85
+ ```
86
+
87
+ ## 2. PyHealth モデル学習
88
+
89
+ ```python
90
def train_clinical_model(
    train_dataset,
    val_dataset,
    model_type="Transformer",
    epochs=20,
    batch_size=64,
):
    """
    Train a PyHealth model.

    Parameters:
        train_dataset: SampleDataset (or a split of one, or a DataLoader)
        val_dataset: SampleDataset (or a split of one, or a DataLoader)
        model_type: str — "Transformer", "RETAIN", "GRU", "CNN"
        epochs: int — number of training epochs
        batch_size: int — batch size used when a dataset has to be wrapped
            in a dataloader

    Returns:
        (model, trainer)

    Raises:
        ValueError: if `model_type` is not one of the supported names
    """
    import pyhealth.models as pyhealth_models
    from pyhealth.datasets import get_dataloader
    from pyhealth.trainer import Trainer
    from torch.utils.data import DataLoader, Subset

    # Previously every name other than "Transformer" silently trained a
    # Transformer while the log line reported the requested name.
    # NOTE(review): "GRU" is mapped to pyhealth.models.RNN with
    # rnn_type="GRU" — confirm against the installed PyHealth version.
    class_names = {
        "Transformer": "Transformer",
        "RETAIN": "RETAIN",
        "GRU": "RNN",
        "CNN": "CNN",
    }
    if model_type not in class_names:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            f"expected one of {sorted(class_names)}"
        )
    ModelClass = getattr(pyhealth_models, class_names[model_type])
    extra_kwargs = {"rnn_type": "GRU"} if model_type == "GRU" else {}

    # The model constructor is given the underlying sample dataset, so
    # unwrap dataloaders and torch Subset splits.
    # NOTE(review): assumes split_by_patient returns torch Subset objects.
    model_dataset = train_dataset
    while isinstance(model_dataset, (DataLoader, Subset)):
        model_dataset = model_dataset.dataset

    model = ModelClass(
        dataset=model_dataset,
        feature_keys=["conditions", "procedures", "drugs"],
        label_key="label",
        mode="binary",
        **extra_kwargs,
    )

    def _as_loader(data, shuffle):
        # Trainer.train takes dataloaders; pass ready-made loaders through.
        if isinstance(data, DataLoader):
            return data
        return get_dataloader(data, batch_size=batch_size, shuffle=shuffle)

    trainer = Trainer(model=model)
    trainer.train(
        train_dataloader=_as_loader(train_dataset, shuffle=True),
        val_dataloader=_as_loader(val_dataset, shuffle=False),
        epochs=epochs,
        monitor="pr_auc",
    )

    print(f"Clinical model ({model_type}): trained for {epochs} epochs")
    return model, trainer
131
+ ```
132
+
133
+ ## 3. フローサイトメトリー解析
134
+
135
+ ```python
136
def read_fcs_file(fcs_path):
    """
    Read an FCS file and return its events as a DataFrame.
    pip install flowio

    K-Dense-AI reference: flowio — FCS file I/O

    Parameters:
        fcs_path: str — path of the FCS file

    Returns:
        (df, fcs_data) — `df` has one row per event and one column per
        channel; `fcs_data` is the flowio.FlowData object.
    """
    import flowio
    import numpy as np
    import pandas as pd

    fcs_data = flowio.FlowData(fcs_path)
    channel_count = fcs_data.channel_count

    # Extract channel names.
    # NOTE(review): flowio's `channels` mapping appears to be keyed by
    # channel number (int or str, depending on version) with a nested dict
    # holding the PnN label, and newer releases expose `pnn_labels`; the
    # original "P{i}N" lookup is kept as a last resort. Confirm against the
    # installed flowio version.
    pnn_labels = getattr(fcs_data, "pnn_labels", None)
    if pnn_labels is not None and len(pnn_labels) == channel_count:
        channels = [
            label or f"Channel_{i}"
            for i, label in enumerate(pnn_labels, start=1)
        ]
    else:
        channel_meta = getattr(fcs_data, "channels", None) or {}
        channels = []
        for i in range(1, channel_count + 1):
            entry = channel_meta.get(i, channel_meta.get(str(i)))
            if isinstance(entry, dict):
                name = entry.get("PnN") or entry.get("pnn")
            else:
                name = channel_meta.get(f"P{i}N")
            channels.append(name or f"Channel_{i}")

    # Convert to DataFrame: the flat event list is reshaped to
    # (n_events, channel_count).
    events = np.array(fcs_data.events).reshape(-1, channel_count)
    df = pd.DataFrame(events, columns=channels)

    print(f"FCS '{fcs_path}': {len(df)} events x {len(channels)} channels")
    return df, fcs_data
164
+
165
+
166
def gate_fcs_data(df, channel, low=None, high=None):
    """
    Apply a simple rectangular (range) gate on one channel.

    Parameters:
        df: pd.DataFrame — FCS data
        channel: str — channel (column) name
        low: float | None — inclusive lower bound; None disables it
        high: float | None — inclusive upper bound; None disables it

    Returns:
        pd.DataFrame — the rows of `df` whose `channel` value lies within
        the bounds, with their original index labels.
    """
    # Imported locally so the function does not rely on a module-level `pd`.
    import pandas as pd

    values = df[channel]
    # Build the mask on df's own index: a fresh RangeIndex mask misaligns
    # with frames whose index is not 0..n-1 (e.g. an already gated frame).
    mask = pd.Series(True, index=df.index, dtype=bool)
    if low is not None:
        mask &= values >= low
    if high is not None:
        mask &= values <= high

    gated = df[mask]
    total = len(df)
    # Guard against ZeroDivisionError when the input frame is empty.
    pct = len(gated) / total * 100 if total else 0.0
    print(f"Gate '{channel}' [{low},{high}]: "
          f"{len(gated)}/{total} events ({pct:.1f}%)")
    return gated
187
+ ```
188
+
189
+ ## 4. 医療コードマッピング
190
+
191
+ ```python
192
def map_medical_codes(codes, source_system, target_system):
    """
    Map medical codes from one coding system to another.

    Parameters:
        codes: list[str] — source codes
        source_system: str — "ICD9CM", "ICD10CM", "NDC", "ATC", "SNOMED"
        target_system: str — target coding system

    Returns:
        dict — each source code mapped to whatever CrossMap.map returns for
        it; an empty dict when pyhealth.medcode cannot be imported.
    """
    try:
        from pyhealth.medcode import CrossMap

        cross_map = CrossMap(source_system, target_system)
        results = {code: cross_map.map(code) for code in codes}

        # Count the codes whose mapping result is truthy.
        mapped_count = len([hit for hit in results.values() if hit])
        print(f"Code mapping {source_system}→{target_system}: "
              f"{mapped_count}/{len(codes)} mapped")
    except ImportError:
        print("pyhealth.medcode not available; install pyhealth")
        return {}
    else:
        return results
218
+ ```
219
+
220
+ ## 5. 臨床モデル評価
221
+
222
+ ```python
223
def evaluate_clinical_model(trainer, test_dataset, batch_size=64):
    """
    Evaluate a clinical prediction model.

    Parameters:
        trainer: Trainer — trained Trainer
        test_dataset: SampleDataset (or a split of one, or a DataLoader) —
            test data
        batch_size: int — batch size used when `test_dataset` has to be
            wrapped in a dataloader

    Returns:
        The metrics mapping returned by trainer.evaluate.
    """
    from torch.utils.data import DataLoader

    # Trainer.evaluate takes a dataloader; wrap plain datasets and pass
    # ready-made loaders through unchanged.
    if isinstance(test_dataset, DataLoader):
        test_loader = test_dataset
    else:
        from pyhealth.datasets import get_dataloader

        test_loader = get_dataloader(
            test_dataset, batch_size=batch_size, shuffle=False
        )

    metrics = trainer.evaluate(test_loader)

    print("Clinical model evaluation:")
    for metric_name, value in metrics.items():
        try:
            shown = format(value, ".4f")
        except (TypeError, ValueError):
            # Non-numeric metric values must not crash the report.
            shown = str(value)
        print(f"  {metric_name}: {shown}")

    return metrics
238
+ ```
239
+
240
+ ## References
241
+
242
+ ### Output Files
243
+
244
+ | ファイル | 形式 |
245
+ |---|---|
246
+ | `results/clinical_predictions.csv` | CSV |
247
+ | `results/clinical_metrics.json` | JSON |
248
+ | `results/fcs_processed.csv` | CSV |
249
+ | `results/code_mapping.json` | JSON |
250
+
251
+ ### 利用可能ツール
252
+
253
+ | カテゴリ | 主要ツール | 用途 |
254
+ |---|---|---|
255
+ | (K-Dense) | `pyhealth` | 臨床 ML フレームワーク |
256
+ | (K-Dense) | `flowio` | FCS ファイル I/O |
257
+
258
+ > **注**: 本スキルは ToolUniverse ツールを持たず、
259
+ > K-Dense-AI Scientific Skills からの参照のみ。
260
+
261
+ ### 参照スキル
262
+
263
+ | スキル | 関連 |
264
+ |---|---|
265
+ | `scientific-clinical-nlp` | 臨床 NLP |
266
+ | `scientific-biostatistics-survival` | 生存時間解析 |
267
+ | `scientific-single-cell-rnaseq` | 単一細胞解析 |
268
+ | `scientific-machine-learning-omics` | ML x オミクス |
269
+ | `scientific-biothings-idmapping` | ID マッピング |
270
+
271
+ ### 依存パッケージ
272
+
273
+ `pyhealth`, `flowio`, `numpy`, `pandas`, `scikit-learn`