@nahisaho/satori 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,245 @@
1
+ ---
2
+ name: scientific-compound-screening
3
+ description: |
4
+ 化合物スクリーニングスキル。ZINC データベースを活用した購入可能化合物検索、
5
+ SMILES/名前ベースの類似性検索、カタログフィルタリング、
6
+ バーチャルスクリーニング前処理パイプライン。
7
+ ---
8
+
9
+ # Scientific Compound Screening
10
+
11
+ ZINC データベースを活用した化合物ライブラリ検索・
12
+ バーチャルスクリーニング前処理パイプラインを提供する。
13
+
14
+ ## When to Use
15
+
16
+ - 購入可能な化合物ライブラリを検索するとき
17
+ - SMILES 構造式から類似化合物を探すとき
18
+ - 化合物名からデータベースレコードを取得するとき
19
+ - ベンダーカタログの絞り込みを行うとき
20
+ - バーチャルスクリーニング用の化合物セットを準備するとき
21
+
22
+ ---
23
+
24
+ ## Quick Start
25
+
26
+ ## 1. ZINC 化合物名検索
27
+
28
+ ```python
29
+ import requests
30
+ import pandas as pd
31
+
32
+ ZINC_API = "https://zinc15.docking.org"
33
+
34
+
35
def zinc_search_by_name(name, max_results=20):
    """
    Search the ZINC database for substances by compound name.

    Parameters:
        name: str — compound name (e.g., "aspirin")
        max_results: int — sent to the server as the ``count`` query value

    Returns:
        pandas.DataFrame — one row per record in the JSON response, with the
        columns zinc_id, name, smiles, mwt, logp and purchasable. A field
        that is absent from a record is stored as "". When the response
        holds no records the frame is empty and has no columns.

    Raises:
        requests.HTTPError — when the response carries an error status.

    ToolUniverse:
        ZINC_search_by_name(name=name)
    """
    response = requests.get(
        f"{ZINC_API}/substances/search",
        params={"q": name, "count": max_results},
    )
    response.raise_for_status()

    # Output column -> key read from each ZINC record.
    column_sources = {
        "zinc_id": "zinc_id",
        "name": "name",
        "smiles": "smiles",
        "mwt": "mwt",
        "logp": "logp",
        "purchasable": "purchasability",
    }
    rows = [
        {column: record.get(key, "") for column, key in column_sources.items()}
        for record in response.json()
    ]

    table = pd.DataFrame(rows)
    print(f"ZINC search '{name}': {len(table)} compounds")
    return table
66
+ ```
67
+
68
+ ## 2. ZINC SMILES 類似性検索
69
+
70
+ ```python
71
def zinc_search_by_smiles(smiles, similarity=0.7, max_results=20):
    """
    Run a structure-similarity search against ZINC from a SMILES string.

    Parameters:
        smiles: str — SMILES string of the query structure
        similarity: float — Tanimoto similarity threshold (0-1)
        max_results: int — sent to the server as the ``count`` query value

    Returns:
        pandas.DataFrame — one row per record in the JSON response, with the
        columns zinc_id, smiles, similarity, mwt, logp and purchasable. A
        field that is absent from a record is stored as "". When the
        response holds no records the frame is empty and has no columns.

    Raises:
        requests.HTTPError — when the response carries an error status.

    ToolUniverse:
        ZINC_search_by_smiles(smiles=smiles)
    """
    query = {"smiles": smiles, "similarity": similarity, "count": max_results}
    response = requests.get(f"{ZINC_API}/substances/search", params=query)
    response.raise_for_status()

    # (output column, key in the ZINC record) pairs, in output order.
    column_sources = (
        ("zinc_id", "zinc_id"),
        ("smiles", "smiles"),
        ("similarity", "similarity"),
        ("mwt", "mwt"),
        ("logp", "logp"),
        ("purchasable", "purchasability"),
    )
    rows = []
    for record in response.json():
        rows.append({column: record.get(key, "") for column, key in column_sources})

    table = pd.DataFrame(rows)
    print(
        f"ZINC SMILES search: {len(table)} similar compounds "
        f"(threshold={similarity})"
    )
    return table
107
+ ```
108
+
109
+ ## 3. ZINC 化合物詳細取得
110
+
111
+ ```python
112
def zinc_get_substance(zinc_id):
    """
    Fetch the full record of a single ZINC substance by its ZINC ID.

    Parameters:
        zinc_id: str — ZINC ID (e.g., "ZINC000000000001")

    Returns:
        tuple (info, data) — ``info`` is a dict summarising the record
        (zinc_id, name, smiles, inchikey, mwt, logp, num_rotatable_bonds,
        num_hba, num_hbd, tpsa, purchasable; missing fields become "") and
        ``data`` is the decoded JSON response as returned by the server.

    Raises:
        requests.HTTPError — when the response carries an error status.

    ToolUniverse:
        ZINC_get_substance(zinc_id=zinc_id)
    """
    response = requests.get(f"{ZINC_API}/substances/{zinc_id}.json")
    response.raise_for_status()
    record = response.json()

    # Keys copied into the summary under the same name.
    same_name_keys = (
        "zinc_id",
        "name",
        "smiles",
        "inchikey",
        "mwt",
        "logp",
        "num_rotatable_bonds",
        "num_hba",
        "num_hbd",
        "tpsa",
    )
    summary = {key: record.get(key, "") for key in same_name_keys}
    # The record's "purchasability" field is exposed as "purchasable".
    summary["purchasable"] = record.get("purchasability", "")

    print(f"ZINC {zinc_id}: {summary['name']} (MW={summary['mwt']})")
    return summary, record
143
+ ```
144
+
145
+ ## 4. ZINC カタログ一覧
146
+
147
+ ```python
148
def zinc_get_catalogs():
    """
    List the catalogs (vendors) available in ZINC.

    Returns:
        pandas.DataFrame — one row per catalog in the JSON response, with
        the columns catalog_name, short_name, num_substances and url.
        Missing text fields become "" and a missing num_substances becomes
        0. When the response holds no catalogs the frame is empty and has
        no columns.

    Raises:
        requests.HTTPError — when the response carries an error status.

    ToolUniverse:
        ZINC_get_catalogs()
    """
    response = requests.get(f"{ZINC_API}/catalogs.json")
    response.raise_for_status()

    # (output column, key in the catalog record, fallback value)
    column_specs = (
        ("catalog_name", "name", ""),
        ("short_name", "short_name", ""),
        ("num_substances", "num_substances", 0),
        ("url", "url", ""),
    )
    rows = [
        {column: entry.get(key, fallback) for column, key, fallback in column_specs}
        for entry in response.json()
    ]

    catalogs = pd.DataFrame(rows)
    print(f"ZINC catalogs: {len(catalogs)} vendors")
    return catalogs
172
+ ```
173
+
174
+ ## 5. バーチャルスクリーニング前処理パイプライン
175
+
176
+ ```python
177
def virtual_screening_prep(query_smiles, lipinski=True, max_compounds=100,
                           similarity=0.6):
    """
    Prepare a compound set for virtual screening from a query structure.

    Runs a ZINC similarity search, optionally filters the hits on molecular
    weight and logP, and returns them ordered by descending similarity.

    Parameters:
        query_smiles: str — SMILES string of the query structure
        lipinski: bool — when True, keep only hits with mwt <= 500 and
            logp <= 5. These are the two Rule-of-Five criteria available in
            the search result; hydrogen-bond donor/acceptor counts are not
            part of that result and are therefore not checked.
        max_compounds: int — maximum number of hits requested from ZINC
        similarity: float — Tanimoto threshold passed to the similarity
            search (default 0.6, the value previously hard-coded here)

    Returns:
        pandas.DataFrame — the (filtered) hits sorted by similarity, highest
        first. The empty search result is returned unchanged when ZINC
        reports no similar compounds.

    ToolUniverse (cross-tool):
        ZINC_search_by_smiles(smiles=query_smiles) → ZINC_get_substance(zinc_id)
    """
    # Step 1: similar compound search
    hits = zinc_search_by_smiles(query_smiles, similarity=similarity,
                                 max_results=max_compounds)

    if hits.empty:
        print("No similar compounds found")
        return hits

    # Step 2: Lipinski filter (molecular weight and logP only)
    if lipinski:
        # Missing or non-numeric values become NaN; NaN fails both
        # comparisons below, so hits without usable mwt/logp are dropped.
        hits["mwt"] = pd.to_numeric(hits["mwt"], errors="coerce")
        hits["logp"] = pd.to_numeric(hits["logp"], errors="coerce")
        before = len(hits)
        passes = (hits["mwt"] <= 500) & (hits["logp"] <= 5)
        # Take an explicit copy: assigning a column on a boolean-filtered
        # slice (Step 3) would otherwise raise SettingWithCopyWarning.
        hits = hits.loc[passes].copy()
        print(f"Lipinski filter: {before} → {len(hits)} compounds")

    # Step 3: sort by similarity
    hits["similarity"] = pd.to_numeric(hits["similarity"], errors="coerce")
    hits = hits.sort_values("similarity", ascending=False)

    print(f"VS prep: {len(hits)} compounds ready for screening")
    return hits
210
+ ```
211
+
212
+ ## References
213
+
214
+ ### Output Files
215
+
216
+ | ファイル | 形式 |
217
+ |---|---|
218
+ | `results/zinc_search.csv` | CSV |
219
+ | `results/zinc_similar.csv` | CSV |
220
+ | `results/zinc_substance.json` | JSON |
221
+ | `results/zinc_catalogs.csv` | CSV |
222
+ | `results/vs_library.csv` | CSV |
223
+
224
+ ### 利用可能ツール
225
+
226
+ | カテゴリ | 主要ツール | 用途 |
227
+ |---|---|---|
228
+ | ZINC | `ZINC_search_by_name` | 化合物名検索 |
229
+ | ZINC | `ZINC_search_by_smiles` | SMILES 類似性検索 |
230
+ | ZINC | `ZINC_get_substance` | 化合物詳細 |
231
+ | ZINC | `ZINC_get_catalogs` | カタログ一覧 |
232
+
233
+ ### 参照スキル
234
+
235
+ | スキル | 関連 |
236
+ |---|---|
237
+ | `scientific-compound-similarity` | 化合物類似性 |
238
+ | `scientific-pharmacology-targets` | 薬理学ターゲット |
239
+ | `scientific-molecular-docking` | 分子ドッキング |
240
+ | `scientific-drug-target-interaction` | DTI 解析 |
241
+ | `scientific-admet-toxicity` | ADMET 毒性 |
242
+
243
+ ### 依存パッケージ
244
+
245
+ `requests`, `pandas`
@@ -0,0 +1,304 @@
1
+ ---
2
+ name: scientific-genome-sequence-tools
3
+ description: |
4
+ ゲノム配列解析総合スキル。Ensembl ゲノムブラウザ、dbSNP 変異データ、
5
+ BLAST 相同性検索、NCBI Nucleotide 配列取得、GDC がんゲノミクスデータの
6
+ 統合パイプライン。
7
+ ---
8
+
9
+ # Scientific Genome Sequence Tools
10
+
11
+ 公的ゲノムデータベース (Ensembl, dbSNP, BLAST, NCBI, GDC) を横断した
12
+ 配列検索・変異アノテーション・がんゲノミクスパイプラインを提供する。
13
+
14
+ ## When to Use
15
+
16
+ - ゲノム配列・エクソン構造を Ensembl から取得するとき
17
+ - rsID から変異のアレル頻度を調べるとき
18
+ - BLAST で塩基/アミノ酸配列の相同性検索を行うとき
19
+ - NCBI Nucleotide から配列をフェッチするとき
20
+ - GDC がんゲノミクスデータ (体細胞変異, CNV, 発現) を取得するとき
21
+
22
+ ---
23
+
24
+ ## Quick Start
25
+
26
+ ## 1. dbSNP 変異情報取得
27
+
28
+ ```python
29
+ import requests
30
+ import pandas as pd
31
+
32
+
33
def get_dbsnp_variant(rsid):
    """
    Fetch a RefSNP record from the NCBI Variation API by rsID.

    Parameters:
        rsid: str | int — e.g. "rs7412"; the "rs" prefix is optional and
            matched case-insensitively, and surrounding whitespace is
            ignored.

    Returns:
        tuple (info, df_freq) — ``info`` is a dict with rsid, create_date
        and update_date; ``df_freq`` is a pandas.DataFrame with one row per
        frequency entry found under
        primary_snapshot_data.allele_annotations[*].frequency, with the
        columns study, allele, count and total (empty, without columns,
        when the record has no frequency entries).

    Raises:
        requests.HTTPError — when the response carries an error status.

    ToolUniverse:
        dbsnp_get_variant_by_rsid(rsid=rsid)
        dbsnp_get_frequencies(rsid=rsid)
        dbsnp_search_by_gene(gene_symbol=gene_symbol)
    """
    # Remove only a literal "rs" prefix. str.lstrip("rs") strips any leading
    # run of the characters 'r' and 's', so an identifier such as "ss123"
    # would silently be looked up as rs123.
    rs_number = str(rsid).strip()
    if rs_number[:2].lower() == "rs":
        rs_number = rs_number[2:]

    url = f"https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/{rs_number}"
    resp = requests.get(url)
    resp.raise_for_status()
    data = resp.json()

    # Primary record info
    info = {
        "rsid": f"rs{data.get('refsnp_id', '')}",
        "create_date": data.get("create_date", ""),
        "update_date": data.get("update_date", ""),
    }

    # Allele frequencies: flatten every frequency entry of every allele
    # annotation into one row.
    annotations = data.get("primary_snapshot_data", {}).get(
        "allele_annotations", []
    )
    freq_rows = [
        {
            "study": entry.get("study_name", ""),
            "allele": entry.get("allele", ""),
            "count": entry.get("allele_count", 0),
            "total": entry.get("total_count", 0),
        }
        for annotation in annotations
        for entry in annotation.get("frequency", [])
    ]

    df_freq = pd.DataFrame(freq_rows)
    print(f"dbSNP {info['rsid']}: {len(df_freq)} frequency entries")
    return info, df_freq
74
+ ```
75
+
76
+ ## 2. BLAST 相同性検索
77
+
78
+ ```python
79
+ import time
80
+
81
+
82
def blast_search(sequence, program="blastn", database="nt", max_hits=10,
                 poll_interval=10, max_polls=60):
    """
    Run a homology search through the NCBI BLAST URL API.

    Submits the query, then polls the same endpoint until the search is no
    longer reported as waiting.

    Parameters:
        sequence: str — query sequence (nucleotide or protein)
        program: str — "blastn", "blastp", "blastx", "tblastn"
        database: str — "nt", "nr", "refseq_rna", etc.
        max_hits: int — sent as HITLIST_SIZE with the submission
        poll_interval: float — seconds to sleep before each status poll
            (default 10, the value previously hard-coded)
        max_polls: int — maximum number of polls before giving up
            (default 60, the value previously hard-coded)

    Returns:
        The decoded JSON body when the final response declares an
        ``application/json`` content type, otherwise the response text.

    Raises:
        requests.HTTPError — when the submission or a poll returns an error
            status.
        ValueError — when no request ID (RID) is found in the submission
            response.
        RuntimeError — when the server reports the search as FAILED or
            UNKNOWN (e.g. an expired RID).
        TimeoutError — when the search is still waiting after ``max_polls``
            polls. Previously the waiting page itself was returned as if it
            were the result.

    ToolUniverse:
        BLAST_nucleotide_search(sequence=sequence, database=database)
        BLAST_protein_search(sequence=sequence, database=database)
    """
    import re

    blast_url = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    submission = {
        "CMD": "Put",
        "PROGRAM": program,
        "DATABASE": database,
        "QUERY": sequence,
        "FORMAT_TYPE": "JSON2",
        "HITLIST_SIZE": max_hits,
    }
    resp = requests.post(blast_url, data=submission)
    resp.raise_for_status()

    # The submission response embeds the request ID as "RID = <id>".
    rid_match = re.search(r"RID = (\S+)", resp.text)
    if not rid_match:
        raise ValueError("BLAST RID not found")
    rid = rid_match.group(1)
    print(f"BLAST submitted: RID={rid}")

    # Poll for results
    for _ in range(max_polls):
        time.sleep(poll_interval)
        check = requests.get(blast_url, params={
            "CMD": "Get", "RID": rid, "FORMAT_TYPE": "JSON2"
        })
        check.raise_for_status()
        status_page = check.text
        if "Status=WAITING" in status_page:
            continue
        # Terminal error states must not be handed back as search results.
        if "Status=FAILED" in status_page:
            raise RuntimeError(f"BLAST search failed: RID={rid}")
        if "Status=UNKNOWN" in status_page:
            raise RuntimeError(f"BLAST RID unknown or expired: RID={rid}")
        break
    else:
        # Loop ran out without ever leaving the waiting state.
        raise TimeoutError(
            f"BLAST search not finished after {max_polls} polls: RID={rid}"
        )

    content_type = check.headers.get("Content-Type", "")
    if content_type.startswith("application/json"):
        return check.json()
    return check.text
127
+ ```
128
+
129
+ ## 3. NCBI Nucleotide 配列フェッチ
130
+
131
+ ```python
132
def fetch_ncbi_sequence(accession, rettype="fasta"):
    """
    Download a sequence record from NCBI Nucleotide via E-utilities efetch.

    Parameters:
        accession: str — NCBI accession (e.g., "NM_000546.6")
        rettype: str — "fasta", "gb", "gbwithparts"

    Returns:
        str — the response body as text (requested with retmode "text").

    Raises:
        requests.HTTPError — when the response carries an error status.

    ToolUniverse:
        NCBI_search_nucleotide(query=query)
        NCBI_fetch_accessions(accessions=accessions)
        NCBI_get_sequence(accession=accession)
    """
    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    query = dict(db="nucleotide", id=accession, rettype=rettype, retmode="text")

    response = requests.get(efetch_url, params=query)
    response.raise_for_status()
    record_text = response.text

    print(f"NCBI Nucleotide '{accession}': {len(record_text)} chars ({rettype})")
    return record_text
157
+ ```
158
+
159
+ ## 4. GDC がんゲノミクスデータ
160
+
161
+ ```python
162
def get_gdc_mutations(gene_symbol, project_id=None):
    """
    Query the NCI GDC (Genomic Data Commons) for simple somatic mutations
    (SSMs) of a gene.

    Parameters:
        gene_symbol: str — e.g. "TP53"
        project_id: str | None — e.g. "TCGA-BRCA"; when given, the query is
            additionally restricted to that project

    Returns:
        pandas.DataFrame — one row per (hit, consequence) pair with the
        columns ssm_id, gene, aa_change, consequence_type and
        genomic_dna_change; missing fields become "". At most 100 hits are
        requested. The frame is empty, without columns, when nothing is
        returned.

    Raises:
        requests.HTTPError — when the response carries an error status.

    ToolUniverse:
        GDC_get_ssm_by_gene(gene_symbol=gene_symbol)
        GDC_get_mutation_frequency(project_id=project_id)
        GDC_get_gene_expression(gene_id=gene_id, project_id=project_id)
        GDC_get_cnv_data(gene_id=gene_id)
        GDC_list_projects()
        GDC_search_cases(filters=filters)
        GDC_list_files(filters=filters)
    """
    import json

    # Each (field, value) pair becomes one "in" clause of the AND filter.
    criteria = [("consequence.transcript.gene.symbol", gene_symbol)]
    if project_id:
        criteria.append(("cases.project.project_id", project_id))
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": field, "value": [value]}}
            for field, value in criteria
        ],
    }

    requested_fields = [
        "ssm_id",
        "consequence.transcript.gene.symbol",
        "consequence.transcript.aa_change",
        "consequence.transcript.consequence_type",
        "genomic_dna_change",
    ]
    response = requests.get(
        "https://api.gdc.cancer.gov/ssms",
        params={
            "filters": json.dumps(filters),
            "fields": ",".join(requested_fields),
            "size": 100,
            "format": "json",
        },
    )
    response.raise_for_status()
    payload = response.json()

    rows = []
    for hit in payload.get("data", {}).get("hits", []):
        ssm_id = hit.get("ssm_id", "")
        dna_change = hit.get("genomic_dna_change", "")
        for consequence in hit.get("consequence", []):
            transcript = consequence.get("transcript", {})
            rows.append({
                "ssm_id": ssm_id,
                "gene": transcript.get("gene", {}).get("symbol", ""),
                "aa_change": transcript.get("aa_change", ""),
                "consequence_type": transcript.get("consequence_type", ""),
                "genomic_dna_change": dna_change,
            })

    mutations = pd.DataFrame(rows)
    project_note = f" ({project_id})" if project_id else ""
    print(f"GDC SSMs '{gene_symbol}'{project_note}: {len(mutations)} mutations")
    return mutations
228
+ ```
229
+
230
+ ## 5. 統合ゲノム変異パイプライン
231
+
232
+ ```python
233
def integrated_variant_pipeline(rsid, gene_symbol=None):
    """
    Combine a dbSNP lookup with an optional GDC somatic-mutation summary.

    Parameters:
        rsid: str — rsID passed to get_dbsnp_variant
        gene_symbol: str | None — when given, GDC mutations for this gene
            are fetched and summarised as well

    Returns:
        dict — always holds "rsid" and "dbsnp" (the info dict from
        get_dbsnp_variant). When gene_symbol is given it also holds
        "gdc_mutation_count" (row count of the GDC table) and
        "gdc_top_consequences" (the five most frequent consequence_type
        values with their counts, or {} when the table is empty).

    ToolUniverse (cross-tool):
        dbsnp_get_variant_by_rsid(rsid) → GDC_get_ssm_by_gene(gene_symbol)
    """
    summary = {"rsid": rsid}

    # Step 1: dbSNP — only the info dict is kept; the frequency table that
    # comes with it is not used by this pipeline.
    variant_info, _ = get_dbsnp_variant(rsid)
    summary["dbsnp"] = variant_info

    # Step 2: GDC somatic mutations (only when a gene is provided)
    if gene_symbol:
        mutations = get_gdc_mutations(gene_symbol)
        summary["gdc_mutation_count"] = len(mutations)
        if mutations.empty:
            top_consequences = {}
        else:
            counts = mutations["consequence_type"].value_counts()
            top_consequences = counts.head(5).to_dict()
        summary["gdc_top_consequences"] = top_consequences

    gdc_count = summary.get("gdc_mutation_count", "N/A")
    print(f"Integrated variant: {rsid} | GDC={gdc_count}")
    return summary
258
+ ```
259
+
260
+ ## References
261
+
262
+ ### Output Files
263
+
264
+ | ファイル | 形式 |
265
+ |---|---|
266
+ | `results/dbsnp_variant.json` | JSON |
267
+ | `results/dbsnp_frequencies.csv` | CSV |
268
+ | `results/blast_results.json` | JSON |
269
+ | `results/ncbi_sequence.fasta` | FASTA |
270
+ | `results/gdc_mutations.csv` | CSV |
271
+
272
+ ### 利用可能ツール
273
+
274
+ | カテゴリ | 主要ツール | 用途 |
275
+ |---|---|---|
276
+ | dbSNP | `dbsnp_get_variant_by_rsid` | rsID 変異情報 |
277
+ | dbSNP | `dbsnp_get_frequencies` | アレル頻度 |
278
+ | dbSNP | `dbsnp_search_by_gene` | 遺伝子→変異 |
279
+ | BLAST | `BLAST_nucleotide_search` | 核酸相同性検索 |
280
+ | BLAST | `BLAST_protein_search` | タンパク質相同性検索 |
281
+ | NCBI | `NCBI_search_nucleotide` | 配列検索 |
282
+ | NCBI | `NCBI_fetch_accessions` | アクセッション取得 |
283
+ | NCBI | `NCBI_get_sequence` | 配列フェッチ |
284
+ | GDC | `GDC_get_ssm_by_gene` | 体細胞変異 |
285
+ | GDC | `GDC_get_mutation_frequency` | 変異頻度 |
286
+ | GDC | `GDC_get_gene_expression` | 発現データ |
287
+ | GDC | `GDC_get_cnv_data` | CNV データ |
288
+ | GDC | `GDC_list_projects` | プロジェクト一覧 |
289
+ | GDC | `GDC_search_cases` | 症例検索 |
290
+ | GDC | `GDC_list_files` | ファイル一覧 |
291
+
292
+ ### 参照スキル
293
+
294
+ | スキル | 関連 |
295
+ |---|---|
296
+ | `scientific-variant-interpretation` | 変異アノテーション |
297
+ | `scientific-population-genetics` | 集団遺伝学 |
298
+ | `scientific-cancer-genomics` | がんゲノミクス |
299
+ | `scientific-rare-disease-genetics` | 希少疾患遺伝学 |
300
+ | `scientific-biothings-idmapping` | ID マッピング |
301
+
302
+ ### 依存パッケージ
303
+
304
+ `requests`, `pandas`