@nahisaho/satori 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
1
+ ---
2
+ name: scientific-arrayexpress-expression
3
+ description: |
4
+ ArrayExpress 発現アーカイブスキル。BioStudies/ArrayExpress
5
+ REST API によるマイクロアレイ・RNA-seq 発現実験検索・メタ
6
+ データ取得・データ再解析。ToolUniverse 連携: arrayexpress。
7
+ tu_tools:
8
+ - key: arrayexpress
9
+ name: ArrayExpress
10
+ description: ArrayExpress 発現実験検索・メタデータ・ファイル取得
11
+ ---
12
+
13
+ # Scientific ArrayExpress Expression
14
+
15
+ EBI ArrayExpress / BioStudies REST API を活用した発現データ
16
+ アーカイブ検索・再解析パイプラインを提供する。
17
+
18
+ ## When to Use
19
+
20
+ - ArrayExpress/BioStudies の発現実験を検索するとき
21
+ - マイクロアレイ/RNA-seq 発現データのメタデータを取得するとき
22
+ - SDRF サンプル情報テーブルを解析するとき
23
+ - E-MTAB/E-GEOD アクセッションからデータ再解析するとき
24
+ - 発現データアーカイブを横断検索するとき
25
+ - GEO と ArrayExpress の両方でデータを探すとき
26
+
27
+ ---
28
+
29
+ ## Quick Start
30
+
31
+ ## 1. BioStudies 発現実験検索
32
+
33
+ ```python
34
+ import requests
35
+ import pandas as pd
36
+
37
+ BIOSTUDIES_BASE = "https://www.ebi.ac.uk/biostudies/api/v1"
38
+ AE_BASE = "https://www.ebi.ac.uk/arrayexpress/json/v3"
39
+
40
+
41
def arrayexpress_search_experiments(query, organism=None,
                                    experiment_type=None,
                                    limit=50):
    """
    ArrayExpress — search expression experiments (BioStudies API).

    Sends a GET request to ``{BIOSTUDIES_BASE}/search`` restricted to
    ``type=study`` and flattens every returned hit into one table row.

    Parameters:
        query: str — search query (e.g. "breast cancer RNA-seq")
        organism: str — organism filter (e.g. "Homo sapiens");
            left out of the request when falsy
        experiment_type: str — experiment type filter
            (e.g. "RNA-seq of coding RNA"); left out when falsy
        limit: int — maximum number of results, sent as ``pageSize``

    Returns:
        pandas.DataFrame — one row per hit with the columns accession,
        title, organism, experiment_type, release_date, files_count and
        links_count. The columns exist even when there are no hits.

    Raises:
        requests.HTTPError — when the response carries an error status.
    """
    columns = [
        "accession", "title", "organism", "experiment_type",
        "release_date", "files_count", "links_count",
    ]

    url = f"{BIOSTUDIES_BASE}/search"
    params = {
        "query": query,
        "type": "study",
        "pageSize": limit,
    }
    if organism:
        params["organism"] = organism
    if experiment_type:
        params["experimenttype"] = experiment_type

    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    results = []
    for h in data.get("hits") or []:
        # Null attribute values are normalised to "" so the table stays
        # string-typed.
        attrs = {a.get("name", ""): a.get("value") or ""
                 for a in h.get("attributes") or []}
        # NOTE(review): search hits may expose snake_case keys
        # (`release_date`, `files`, `links`) rather than the camelCase
        # keys read originally — both spellings are accepted here;
        # confirm against the live API response.
        results.append({
            "accession": h.get("accession", ""),
            "title": attrs.get("Title") or h.get("title", ""),
            "organism": attrs.get("Organism", ""),
            "experiment_type": attrs.get("Experiment type", ""),
            "release_date": (h.get("releaseDate")
                             or h.get("release_date", "")),
            "files_count": h.get("filesCount", h.get("files", 0)),
            "links_count": h.get("linksCount", h.get("links", 0)),
        })

    # Passing the column list keeps the schema stable for an empty
    # result, so callers can always select columns by name.
    df = pd.DataFrame(results, columns=columns)
    print(f"ArrayExpress search: {len(df)} experiments "
          f"(query={query})")
    return df
87
+ ```
88
+
89
+ ## 2. 実験メタデータ・SDRF 取得
90
+
91
+ ```python
92
def arrayexpress_get_experiment(accession):
    """
    ArrayExpress — fetch study metadata, file listing and SDRF table.

    Parameters:
        accession: str — study accession (e.g. "E-MTAB-12345")

    Returns:
        tuple (metadata, files_df, sdrf_df):
            metadata: dict — accession, title, description (truncated
                to 500 characters), organism, experiment_type and
                release_date
            files_df: pandas.DataFrame — columns filename, type, size;
                the columns exist even when no files are listed
            sdrf_df: pandas.DataFrame — parsed tab-separated SDRF file,
                or an empty frame when it could not be fetched/parsed

    Raises:
        requests.HTTPError — when the study request returns an error
            status. SDRF failures are reported but never raised.
    """
    from io import StringIO

    url = f"{BIOSTUDIES_BASE}/studies/{accession}"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # Metadata: null attribute values become "" so slicing is safe.
    attrs = {a.get("name", ""): a.get("value") or ""
             for a in data.get("attributes") or []}
    metadata = {
        "accession": accession,
        "title": attrs.get("Title", ""),
        "description": attrs.get("Description", "")[:500],
        "organism": attrs.get("Organism", ""),
        "experiment_type": attrs.get("Experiment type", ""),
        "release_date": data.get("releaseDate", ""),
    }

    def _file_row(entry):
        # Normalise one file descriptor into a table row.
        return {
            "filename": entry.get("path", ""),
            "type": entry.get("type", ""),
            "size": entry.get("size", 0),
        }

    # File listing: each item is either one descriptor or a list of
    # descriptors; anything that is not a dict is ignored.
    files = []
    section = data.get("section") or {}
    for entry in section.get("files") or []:
        if isinstance(entry, list):
            files.extend(_file_row(f) for f in entry
                         if isinstance(f, dict))
        elif isinstance(entry, dict):
            files.append(_file_row(entry))

    files_df = pd.DataFrame(files, columns=["filename", "type", "size"])

    # SDRF retrieval is best-effort: a missing or malformed file leaves
    # sdrf_df empty, but the reason is reported instead of swallowed.
    sdrf_url = (f"https://www.ebi.ac.uk/biostudies/files/"
                f"{accession}/{accession}.sdrf.txt")
    sdrf_df = pd.DataFrame()
    try:
        sdrf_resp = requests.get(sdrf_url, timeout=30)
        if sdrf_resp.status_code == 200:
            sdrf_df = pd.read_csv(StringIO(sdrf_resp.text), sep="\t")
    except (requests.RequestException,
            pd.errors.ParserError,
            pd.errors.EmptyDataError) as exc:
        print(f" Warning: SDRF unavailable for {accession} — {exc}")

    print(f"ArrayExpress {accession}: {len(files_df)} files, "
          f"{len(sdrf_df)} SDRF rows")
    return metadata, files_df, sdrf_df
150
+ ```
151
+
152
+ ## 3. 発現データダウンロード・処理
153
+
154
+ ```python
155
def arrayexpress_download_matrix(accession, output_dir="results"):
    """
    ArrayExpress — download processed expression matrix files.

    Looks up the study via ``arrayexpress_get_experiment``, selects the
    files whose name matches processed/normalized/expression/counts
    (case-insensitive) and saves each one under its base name in
    ``output_dir``. A non-empty SDRF table is saved as ``sdrf.csv``.

    Parameters:
        accession: str — study accession
        output_dir: str — output directory (created if missing)

    Returns:
        dict — "metadata" (study metadata dict), "files" (list of saved
        file paths as strings) and "sdrf" (SDRF DataFrame).
    """
    from pathlib import Path
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    metadata, files_df, sdrf_df = arrayexpress_get_experiment(accession)

    # A study without files can yield a frame with no "filename"
    # column; indexing it directly would raise KeyError.
    if "filename" in files_df.columns:
        is_expression = files_df["filename"].str.contains(
            r"processed|normalized|expression|counts",
            case=False, na=False)
        expr_names = files_df.loc[is_expression, "filename"].tolist()
    else:
        expr_names = []

    downloaded = []
    for fname in expr_names:
        url = (f"https://www.ebi.ac.uk/biostudies/files/"
               f"{accession}/{fname}")
        try:
            resp = requests.get(url, timeout=120)
            if resp.status_code == 200:
                # Only the base name is kept so nested paths land
                # directly inside output_dir.
                fpath = output_dir / fname.split("/")[-1]
                fpath.write_bytes(resp.content)
                downloaded.append(str(fpath))
        except (requests.RequestException, OSError) as exc:
            # Best-effort: one failing file must not abort the rest,
            # but the failure is reported rather than hidden.
            print(f" Warning: {fname} — {exc}")
            continue

    # Save the SDRF table
    if not sdrf_df.empty:
        sdrf_df.to_csv(output_dir / "sdrf.csv", index=False)

    print(f"ArrayExpress download: {len(downloaded)} files → "
          f"{output_dir}")
    return {
        "metadata": metadata,
        "files": downloaded,
        "sdrf": sdrf_df,
    }
201
+ ```
202
+
203
+ ## 4. ArrayExpress 統合パイプライン
204
+
205
+ ```python
206
def arrayexpress_pipeline(query, organism="Homo sapiens",
                          output_dir="results"):
    """
    ArrayExpress integrated pipeline.

    Runs an experiment search, writes the hits to ``experiments.csv``
    and, when at least one hit exists, writes the file list of the
    first hit to ``experiment_files.csv`` and its non-empty SDRF table
    to ``sdrf.csv``.

    Parameters:
        query: str — search query
        organism: str — organism filter passed to the search
        output_dir: str — output directory (created if missing)

    Returns:
        dict — {"experiments": DataFrame of search hits}
    """
    from pathlib import Path

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: search and persist the hit list.
    experiments = arrayexpress_search_experiments(query,
                                                  organism=organism)
    experiments.to_csv(output_dir / "experiments.csv", index=False)

    # Step 2: details of the top-ranked experiment, if there is one.
    has_hits = not experiments.empty
    if has_hits:
        first_accession = experiments["accession"].iloc[0]
        _meta, file_table, sdrf_table = arrayexpress_get_experiment(
            first_accession)
        file_table.to_csv(output_dir / "experiment_files.csv",
                          index=False)
        if not sdrf_table.empty:
            sdrf_table.to_csv(output_dir / "sdrf.csv", index=False)

    print(f"ArrayExpress pipeline: {output_dir}")
    return {"experiments": experiments}
235
+ ```
236
+
237
+ ---
238
+
239
+ ## ToolUniverse 連携
240
+
241
+ | TU Key | ツール名 | 連携内容 |
242
+ |--------|---------|---------|
243
+ | `arrayexpress` | ArrayExpress | 発現実験検索・メタデータ・ファイル取得 |
244
+
245
+ ## パイプライン統合
246
+
247
+ ```
248
+ ebi-databases → arrayexpress-expression → gene-expression-transcriptomics
249
+ (EBI Search) (ArrayExpress/BioStudies) (DESeq2/GSEA)
250
+ │ │ ↓
251
+ geo-expression ─────────┘ pathway-enrichment
252
+ (GEO データ) │ (KEGG/Reactome)
253
+
254
+ multi-omics
255
+ (統合解析)
256
+ ```
257
+
258
+ ## パイプライン出力
259
+
260
+ | ファイル | 説明 | 次スキル |
261
+ |---------|------|---------|
262
+ | `results/experiments.csv` | 実験一覧 | → geo-expression |
263
+ | `results/sdrf.csv` | サンプル情報 | → gene-expression-transcriptomics |
264
+ | `results/experiment_files.csv` | ファイルリスト | → data-preprocessing |
@@ -0,0 +1,313 @@
1
+ ---
2
+ name: scientific-crossref-metadata
3
+ description: |
4
+ CrossRef メタデータスキル。CrossRef REST API による
5
+ DOI 解決・論文メタデータ・引用数・ジャーナル情報・
6
+ 助成金情報検索。ToolUniverse 連携: crossref。
7
+ tu_tools:
8
+ - key: crossref
9
+ name: CrossRef
10
+ description: DOI 解決・論文メタデータ・引用数・ジャーナル情報
11
+ ---
12
+
13
+ # Scientific CrossRef Metadata
14
+
15
+ CrossRef REST API を活用した学術文献 DOI 解決・メタデータ検索・
16
+ 引用分析・ジャーナル情報・助成金レジストリ検索パイプラインを
17
+ 提供する。
18
+
19
+ ## When to Use
20
+
21
+ - DOI から論文メタデータを取得するとき
22
+ - 学術文献をタイトル・著者で検索するとき
23
+ - ジャーナルの ISSN やインパクト情報を調べるとき
24
+ - 論文の被引用数・参考文献数を確認するとき
25
+ - 研究助成金の情報を検索するとき
26
+ - 参考文献リストのメタデータを一括取得するとき
27
+ - 特定出版社やジャーナルの出版傾向を分析するとき
28
+
29
+ ---
30
+
31
+ ## Quick Start
32
+
33
+ ## 1. DOI 解決・論文メタデータ
34
+
35
+ ```python
36
+ import requests
37
+ import pandas as pd
38
+
39
+ CR_BASE = "https://api.crossref.org"
40
+ CR_HEADERS = {
41
+ "User-Agent": "SATORI/0.18.0 (mailto:your@email.com)",
42
+ }
43
+
44
+
45
def crossref_resolve_doi(doi):
    """
    CrossRef — fetch the metadata record of a DOI.

    Parameters:
        doi: str — DOI (e.g. "10.1038/s41586-020-2649-2")

    Returns:
        dict — doi, title, authors (first 10, "; "-joined), journal,
        publisher, type, published_date, citation_count,
        reference_count, issn, url, abstract (first 500 characters),
        funder and license.

    Raises:
        requests.HTTPError — when the request returns an error status
            (e.g. the DOI is unknown).
    """
    from urllib.parse import quote

    # DOIs may contain characters such as "#", "?", "<" or ">" that
    # would otherwise be read as URL syntax; "/" is kept literal.
    url = f"{CR_BASE}/works/{quote(doi.strip(), safe='/')}"
    resp = requests.get(url, headers=CR_HEADERS, timeout=30)
    resp.raise_for_status()
    item = resp.json().get("message", {})

    authors = []
    for a in item.get("author", []):
        name = f"{a.get('given', '')} {a.get('family', '')}".strip()
        # Entries without given/family parts fall back to the "name"
        # field instead of producing an empty author.
        authors.append(name or a.get("name", ""))

    result = {
        "doi": item.get("DOI", ""),
        "title": " ".join(item.get("title", [])),
        "authors": "; ".join(authors[:10]),
        "journal": " ".join(
            item.get("container-title", [])),
        "publisher": item.get("publisher", ""),
        "type": item.get("type", ""),
        # Print date is preferred; online date is the fallback.
        "published_date": _cr_date(
            item.get("published-print") or
            item.get("published-online")),
        "citation_count": item.get(
            "is-referenced-by-count", 0),
        "reference_count": item.get("reference-count", 0),
        "issn": ", ".join(item.get("ISSN", [])),
        "url": item.get("URL", ""),
        "abstract": (item.get("abstract") or "")[:500],
        "funder": "; ".join(
            f.get("name") or ""
            for f in item.get("funder", [])),
        "license": _cr_license(item),
    }

    print(f"CrossRef DOI: {doi}")
    print(f" {result['title'][:80]}")
    print(f" Citations: {result['citation_count']}")
    return result
89
+
90
+
91
def _cr_date(date_obj):
    """
    Render a CrossRef date object as a "-"-joined string.

    Parameters:
        date_obj: dict | None — object carrying a "date-parts" list
            whose first element is [year, month, day] (month and day
            are optional)

    Returns:
        str — the parts of the first date joined by "-" without zero
        padding (e.g. "2020-9-17"), or "" when there is no usable date.
    """
    if not date_obj:
        return ""
    # "date-parts" may be missing, empty, or hold null entries; each of
    # those cases yields "" instead of raising or emitting "None".
    date_parts = date_obj.get("date-parts") or [[]]
    parts = date_parts[0] or []
    return "-".join(str(p) for p in parts if p is not None)
96
+
97
+
98
def _cr_license(item):
    """
    Extract the license of a CrossRef work record.

    Parameters:
        item: dict — work record; its "license" entry, when present,
            is a list of license objects

    Returns:
        str — the "URL" of the first license object; when that is
        missing, its "content-version" value (what this helper returned
        before); "" when the record lists no license.
    """
    licenses = item.get("license") or []
    if not licenses:
        return ""
    first = licenses[0]
    # The URL identifies the license itself; "content-version" only
    # says which manuscript version the license applies to.
    return first.get("URL") or first.get("content-version", "")
103
+ ```
104
+
105
+ ## 2. 論文検索
106
+
107
+ ```python
108
def crossref_search_works(query, limit=50,
                          sort="relevance",
                          filter_type=None,
                          from_date=None):
    """
    CrossRef — search works.

    Parameters:
        query: str — search query
        limit: int — maximum number of results (capped at 1000 before
            being sent as ``rows``)
        sort: str — sort key ("relevance", "published",
            "is-referenced-by-count")
        filter_type: str — work type filter (e.g. "journal-article")
        from_date: str — earliest publication date (e.g. "2020-01-01")

    Returns:
        pandas.DataFrame — one row per work with doi, title, authors
        (first 5, "; "-joined), journal, year, citations and type.

    Raises:
        requests.HTTPError — when the request returns an error status.
    """
    url = f"{CR_BASE}/works"
    params = {
        "query": query,
        "rows": min(limit, 1000),
        "sort": sort,
    }

    # Individual filters are combined into one comma-separated value.
    filters = []
    if filter_type:
        filters.append(f"type:{filter_type}")
    if from_date:
        filters.append(f"from-pub-date:{from_date}")
    if filters:
        params["filter"] = ",".join(filters)

    resp = requests.get(url, params=params,
                        headers=CR_HEADERS, timeout=30)
    resp.raise_for_status()
    data = resp.json().get("message", {})

    results = []
    for item in data.get("items", []):
        authors = []
        for a in item.get("author", []):
            name = f"{a.get('given', '')} {a.get('family', '')}".strip()
            # Entries without given/family parts fall back to "name".
            authors.append(name or a.get("name", ""))
        published = _cr_date(
            item.get("published-print") or
            item.get("published-online"))
        results.append({
            "doi": item.get("DOI", ""),
            "title": " ".join(item.get("title", [])),
            "authors": "; ".join(authors[:5]),
            "journal": " ".join(
                item.get("container-title", [])),
            # Only the leading component of the "-"-joined date is the
            # year; the column previously held the whole date string.
            "year": published.split("-")[0],
            "citations": item.get(
                "is-referenced-by-count", 0),
            "type": item.get("type", ""),
        })

    df = pd.DataFrame(results)
    total = data.get("total-results", 0)
    print(f"CrossRef search: {len(df)}/{total} works "
          f"(query='{query}')")
    return df
169
+ ```
170
+
171
+ ## 3. ジャーナル情報・助成金検索
172
+
173
+ ```python
174
def crossref_journal_info(issn):
    """
    CrossRef — look up a journal record by ISSN.

    Parameters:
        issn: str — ISSN (e.g. "0028-0836")

    Returns:
        dict — issn (as passed in), title, publisher, subjects
        ("; "-joined subject names) and the total/current/backfile
        DOI counts.

    Raises:
        requests.HTTPError — when the request returns an error status.
    """
    response = requests.get(f"{CR_BASE}/journals/{issn}",
                            headers=CR_HEADERS, timeout=30)
    response.raise_for_status()
    message = response.json().get("message", {})

    doi_counts = message.get("counts", {})
    subject_names = [subject.get("name", "")
                     for subject in message.get("subjects", [])]

    # Built key by key; insertion order matches the documented fields.
    result = {"issn": issn}
    result["title"] = message.get("title", "")
    result["publisher"] = message.get("publisher", "")
    result["subjects"] = "; ".join(subject_names)
    for field, source_key in (("total_dois", "total-dois"),
                              ("current_dois", "current-dois"),
                              ("backfile_dois", "backfile-dois")):
        result[field] = doi_counts.get(source_key, 0)

    print(f"CrossRef journal: {result['title']} "
          f"({result['total_dois']} DOIs)")
    return result
202
+
203
+
204
def crossref_search_funders(query, limit=20):
    """
    CrossRef — search funding organisations.

    Parameters:
        query: str — organisation name (e.g. "NIH", "JSPS")
        limit: int — maximum number of results, sent as ``rows``

    Returns:
        pandas.DataFrame — one row per funder with funder_id, name,
        location, alt_names (first 3, "; "-joined) and work_count.

    Raises:
        requests.HTTPError — when the request returns an error status.
    """
    response = requests.get(
        f"{CR_BASE}/funders",
        params={"query": query, "rows": limit},
        headers=CR_HEADERS,
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json().get("message", {})

    def _as_row(funder):
        # Flatten one funder record into a table row.
        alternative_names = funder.get("alt-names", [])[:3]
        return {
            "funder_id": funder.get("id", ""),
            "name": funder.get("name", ""),
            "location": funder.get("location", ""),
            "alt_names": "; ".join(alternative_names),
            "work_count": funder.get("work-count", 0),
        }

    df = pd.DataFrame([_as_row(funder)
                       for funder in payload.get("items", [])])
    print(f"CrossRef funders: {len(df)} (query='{query}')")
    return df
233
+ ```
234
+
235
+ ## 4. CrossRef 統合パイプライン
236
+
237
+ ```python
238
def crossref_pipeline(query, dois=None,
                      output_dir="results"):
    """
    CrossRef integrated pipeline.

    Writes the search hits to ``works.csv``; when ``dois`` is given,
    resolves each DOI and writes the successes to ``doi_resolved.csv``;
    when the search returned rows, writes summary statistics of the
    "citations" column to ``citation_stats.csv``.

    Parameters:
        query: str — search query
        dois: list[str] — DOIs to resolve directly (optional)
        output_dir: str — output directory (created if missing)

    Returns:
        dict — {"works": DataFrame of search hits}
    """
    from pathlib import Path

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: search.
    works = crossref_search_works(query)
    works.to_csv(output_dir / "works.csv", index=False)

    # Step 2: bulk DOI resolution; a failing DOI is reported and
    # skipped so the remaining ones are still processed.
    if dois:
        resolved = []
        for doi in dois:
            try:
                resolved.append(crossref_resolve_doi(doi))
            except Exception as e:
                print(f" Warning: {doi} — {e}")
        pd.DataFrame(resolved).to_csv(
            output_dir / "doi_resolved.csv", index=False)

    # Step 3: citation statistics over the search hits.
    if not works.empty:
        citation_column = works["citations"]
        summary_row = {
            "total_works": len(works),
            "total_citations": citation_column.sum(),
            "mean_citations": citation_column.mean(),
            "median_citations": citation_column.median(),
            "max_citations": citation_column.max(),
        }
        summary = pd.DataFrame([summary_row])
        summary.to_csv(output_dir / "citation_stats.csv", index=False)

    print(f"CrossRef pipeline: {output_dir}")
    return {"works": works}
284
+ ```
285
+
286
+ ---
287
+
288
+ ## ToolUniverse 連携
289
+
290
+ | TU Key | ツール名 | 連携内容 |
291
+ |--------|---------|---------|
292
+ | `crossref` | CrossRef | DOI 解決・メタデータ・引用・ジャーナル情報 |
293
+
294
+ ## パイプライン統合
295
+
296
+ ```
297
+ literature-search → crossref-metadata → citation-checker
298
+ (PubMed/NCBI) (CrossRef REST API) (引用品質検証)
299
+ │ │ ↓
300
+ semantic-scholar ──────┘ deep-research
301
+ (S2 Academic Graph) │ (知識統合)
302
+
303
+ bibliometrics
304
+ (書誌計量分析)
305
+ ```
306
+
307
+ ## パイプライン出力
308
+
309
+ | ファイル | 説明 | 次スキル |
310
+ |---------|------|---------|
311
+ | `results/works.csv` | 論文検索結果 | → semantic-scholar |
312
+ | `results/doi_resolved.csv` | DOI メタデータ | → citation-checker |
313
+ | `results/citation_stats.csv` | 引用統計 | → bibliometrics |