@nahisaho/satori 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,476 @@
1
+ ---
2
+ name: scientific-preprint-archive
3
+ description: |
4
+ プレプリント・オープンアクセスアーカイブ検索スキル。bioRxiv/medRxiv
5
+ プレプリント検索、arXiv 論文取得、PMC フルテキスト、DOAJ OA ジャーナル、
6
+ Unpaywall OA リンク、CORE/HAL/Zenodo/OpenAIRE/OSF/Fatcat/DBLP
7
+ 統合文献アクセスパイプライン。
8
+ ---
9
+
10
+ # Scientific Preprint Archive
11
+
12
+ bioRxiv / medRxiv / arXiv / PMC / DOAJ / Unpaywall / CORE / HAL /
13
+ Zenodo / OpenAIRE / OSF Preprints / Fatcat / DBLP を統合した
14
+ プレプリント・オープンアクセス文献検索パイプラインを提供する。
15
+
16
+ ## When to Use
17
+
18
+ - 最新のプレプリントを bioRxiv / medRxiv から検索するとき
19
+ - arXiv の機械学習・計算科学論文を取得するとき
20
+ - PMC フルテキスト XML を取得してテキストマイニングするとき
21
+ - OA 版のリンクを Unpaywall で見つけるとき
22
+ - CORE / Zenodo / OpenAIRE など複数アーカイブを横断検索するとき
23
+ - 系統的レビューの文献収集で網羅的プレプリント検索が必要なとき
24
+ - DBLP から計算機科学文献メタデータを取得するとき
25
+
26
+ ---
27
+
28
+ ## Quick Start
29
+
30
+ ## 1. bioRxiv / medRxiv プレプリント検索
31
+
32
+ ```python
33
+ import requests
34
+ import pandas as pd
35
+ from datetime import datetime, timedelta
36
+
37
+ BIORXIV_API = "https://api.biorxiv.org"
38
+
39
+
40
def search_biorxiv(query, server="biorxiv", days=30, cursor=0, timeout=30):
    """
    Search recent bioRxiv/medRxiv preprints by keyword.

    Requests the ``details`` listing for the date window
    ``[today - days, today]`` at the given ``cursor`` and keeps the papers
    whose title or abstract contains ``query`` (case-insensitive substring
    match). Filtering happens client-side, and only the listing returned for
    this one ``cursor`` is scanned; call again with a different ``cursor``
    to scan further into the window.

    Parameters:
        query: str — text to look for in title/abstract
        server: str — "biorxiv" or "medrxiv"
        days: int — how many days back the window reaches
        cursor: int — pagination offset
        timeout: float — seconds to wait for the HTTP response

    Returns:
        pandas.DataFrame with columns doi, title, authors, date, category,
        server, abstract (abstract truncated to 300 characters). The columns
        are present even when nothing matches.

    Raises:
        requests.HTTPError: the API answered with a non-success status.

    ToolUniverse:
        bioRxiv_search_preprints(query=query, server=server)
        bioRxiv_get_preprint_details(doi=doi)
        medRxiv_search_preprints(query=query)
    """
    # Take "now" once so both ends of the window come from the same instant.
    now = datetime.now()
    end_date = now.strftime("%Y-%m-%d")
    start_date = (now - timedelta(days=days)).strftime("%Y-%m-%d")

    url = f"{BIORXIV_API}/details/{server}/{start_date}/{end_date}/{cursor}"
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    columns = ["doi", "title", "authors", "date", "category", "server", "abstract"]
    needle = query.lower()  # loop-invariant, so lower-case it once

    results = []
    for paper in data.get("collection", []):
        # "or ''" also covers keys that are present but null.
        title = paper.get("title") or ""
        abstract = paper.get("abstract") or ""
        if needle in title.lower() or needle in abstract.lower():
            results.append({
                "doi": paper.get("doi", ""),
                "title": title,
                "authors": paper.get("authors", ""),
                "date": paper.get("date", ""),
                "category": paper.get("category", ""),
                "server": server,
                "abstract": abstract[:300],
            })

    df = pd.DataFrame(results, columns=columns)
    print(f"{server} search '{query}': {len(df)} preprints (last {days} days)")
    return df
81
+ ```
82
+
83
+ ## 2. arXiv 論文検索
84
+
85
+ ```python
86
+ import urllib.parse
87
+ import xml.etree.ElementTree as ET
88
+
89
+ ARXIV_API = "http://export.arxiv.org/api/query"
90
+
91
+
92
def search_arxiv(query, category=None, max_results=50, sort_by="submittedDate",
                 timeout=30):
    """
    Search arXiv and return the matching papers as a DataFrame.

    Parameters:
        query: str — search text, sent as ``all:<query>``
        category: str — optional arXiv category (e.g., "q-bio.GN", "cs.LG",
            "stat.ML"); when given it is AND-ed onto the query
        max_results: int — maximum number of entries to request
        sort_by: str — "submittedDate", "lastUpdatedDate", "relevance"
        timeout: float — seconds to wait for the HTTP response

    Returns:
        pandas.DataFrame with one row per Atom entry and the fields
        arxiv_id, title, authors, published (YYYY-MM-DD), categories (list),
        abstract (first 300 characters), pdf_url ("" when no PDF link).

    Raises:
        requests.HTTPError: the API answered with a non-success status.
        xml.etree.ElementTree.ParseError: the response body is not valid XML.

    ToolUniverse:
        arXiv_search_papers(query=query, category=category)
        arXiv_get_paper(arxiv_id=arxiv_id)
    """
    ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}

    def _text(parent, path):
        # Missing element or empty element both become "" instead of raising.
        node = parent.find(path, ns)
        return (node.text or "") if node is not None else ""

    search_query = f"all:{query}"
    if category:
        # requests percent-encodes parameter values itself, so the operator
        # must be written with real spaces; a literal "+" would go out as
        # %2B and reach the server as a plus sign rather than a separator.
        search_query += f" AND cat:{category}"

    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
        "sortBy": sort_by,
        "sortOrder": "descending",
    }
    resp = requests.get(ARXIV_API, params=params, timeout=timeout)
    resp.raise_for_status()

    # Parse the raw bytes so the XML declaration decides the encoding.
    root = ET.fromstring(resp.content)

    results = []
    for entry in root.findall("atom:entry", ns):
        categories = [c.get("term") for c in entry.findall("atom:category", ns)]
        pdf_url = next(
            (link.get("href") for link in entry.findall("atom:link", ns)
             if link.get("title") == "pdf"),
            "",
        )
        results.append({
            "arxiv_id": _text(entry, "atom:id").split("/abs/")[-1],
            # Collapse wrapped lines and their indentation into single spaces.
            "title": " ".join(_text(entry, "atom:title").split()),
            "authors": ", ".join(
                _text(author, "atom:name")
                for author in entry.findall("atom:author", ns)
            ),
            "published": _text(entry, "atom:published")[:10],
            "categories": categories,
            "abstract": _text(entry, "atom:summary").strip()[:300],
            "pdf_url": pdf_url,
        })

    df = pd.DataFrame(results)
    cat_str = f" [{category}]" if category else ""
    print(f"arXiv search '{query}'{cat_str}: {len(df)} papers")
    return df
146
+ ```
147
+
148
+ ## 3. PMC フルテキストアクセス
149
+
150
+ ```python
151
+ PMC_API = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
152
+
153
+
154
def get_pmc_fulltext(pmcid, email="user@example.com", timeout=30):
    """
    Fetch a PMC article as XML and return its text grouped by section title.

    Parameters:
        pmcid: str — PMC ID (e.g., "PMC1234567"); the "PMC" prefix is removed
            before the request
        email: str — contact e-mail sent with the NCBI request
        timeout: float — seconds to wait for the HTTP response

    Returns:
        dict mapping each section title to the text of that section's direct
        ``<p>`` children joined by single spaces. Sections without a title
        are skipped; sections that share a title overwrite one another (the
        last one wins). Empty when the response contains no ``<article>``.

    Raises:
        requests.HTTPError: the API answered with a non-success status.
        xml.etree.ElementTree.ParseError: the response body is not valid XML.

    ToolUniverse:
        PMC_get_fulltext(pmcid=pmcid)
    """
    def _flatten(elem):
        # elem.text stops at the first child element, so inline markup
        # (italics, citations, ...) would cut the text short. itertext()
        # walks the whole subtree; split/join normalises the whitespace.
        return " ".join("".join(elem.itertext()).split())

    params = {
        "db": "pmc",
        "id": pmcid.replace("PMC", ""),
        "rettype": "xml",
        "email": email,
    }
    resp = requests.get(f"{PMC_API}/efetch.fcgi", params=params, timeout=timeout)
    resp.raise_for_status()

    # Parse the raw bytes so the XML declaration decides the encoding.
    root = ET.fromstring(resp.content)
    article = root.find(".//article")

    sections = {}
    if article is not None:
        for sec in article.findall(".//sec"):
            title = sec.find("title")
            if title is None:
                continue
            heading = _flatten(title)
            if not heading:
                continue
            paragraphs = [_flatten(p) for p in sec.findall("p")]
            sections[heading] = " ".join(text for text in paragraphs if text)

    print(f"PMC {pmcid}: {len(sections)} sections retrieved")
    return sections
186
+ ```
187
+
188
+ ## 4. Unpaywall OA リンク検索
189
+
190
+ ```python
191
+ UNPAYWALL_API = "https://api.unpaywall.org/v2"
192
+
193
+
194
def find_oa_version(doi, email="user@example.com", timeout=30):
    """
    Look up the open-access status and OA locations of a DOI via Unpaywall.

    Parameters:
        doi: str — DOI to look up
        email: str — e-mail address identifying the API user
        timeout: float — seconds to wait for the HTTP response

    Returns:
        dict with keys doi, is_oa, oa_status, best_oa_url, journal,
        publisher, n_oa_locations and locations (list of dicts with url,
        host_type, version). ``best_oa_url`` is the best location's PDF URL,
        falling back to its landing-page URL, or "" when there is no best
        location.

    Raises:
        requests.HTTPError: the API answered with a non-success status.

    ToolUniverse:
        Unpaywall_get_oa_status(doi=doi)
    """
    from urllib.parse import quote

    # DOIs may contain characters such as "#" or "?" that would otherwise be
    # read as URL delimiters; "/" stays literal because it is part of the DOI.
    url = f"{UNPAYWALL_API}/{quote(doi, safe='/')}"
    resp = requests.get(url, params={"email": email}, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    # A key that is present with a null value bypasses dict.get's default,
    # so guard with "or" before calling .get() / len() on the result.
    best = data.get("best_oa_location") or {}
    oa_locations = data.get("oa_locations") or []

    result = {
        "doi": doi,
        "is_oa": data.get("is_oa", False),
        "oa_status": data.get("oa_status", ""),
        # Same PDF-then-landing-page preference as the per-location entries.
        "best_oa_url": best.get("url_for_pdf") or best.get("url") or "",
        "journal": data.get("journal_name", ""),
        "publisher": data.get("publisher", ""),
        "n_oa_locations": len(oa_locations),
        "locations": [
            {
                "url": loc.get("url_for_pdf") or loc.get("url"),
                "host_type": loc.get("host_type"),
                "version": loc.get("version"),
            }
            for loc in oa_locations
        ],
    }
    print(f"Unpaywall {doi}: OA={result['is_oa']}, status={result['oa_status']}")
    return result
229
+ ```
230
+
231
+ ## 5. CORE 統合検索
232
+
233
+ ```python
234
+ CORE_API = "https://api.core.ac.uk/v3"
235
+
236
+
237
def search_core(query, api_key, limit=25, timeout=30):
    """
    Search works across repositories through the CORE API.

    Parameters:
        query: str — search query
        api_key: str — CORE API key; when empty, no Authorization header is
            sent
        limit: int — maximum number of works to request
        timeout: float — seconds to wait for the HTTP response

    Returns:
        pandas.DataFrame with one row per work and the fields core_id, title,
        authors (comma-separated names), year, doi, download_url, abstract
        (first 300 characters).

    Raises:
        requests.HTTPError: the API answered with a non-success status.

    ToolUniverse:
        CORE_search_works(query=query)
        CORE_get_work(core_id=core_id)
    """
    # An empty key would produce the malformed header "Bearer "; leave the
    # header out instead.
    # NOTE(review): whether CORE serves unauthenticated requests is not
    # visible here — confirm against the API terms.
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    params = {"q": query, "limit": limit}
    resp = requests.get(
        f"{CORE_API}/search/works", headers=headers, params=params, timeout=timeout
    )
    resp.raise_for_status()
    data = resp.json()

    results = []
    for work in data.get("results", []):
        # Guard authors/name against null the same way the abstract is
        # guarded: str.join raises TypeError on a None item.
        author_names = ", ".join(
            author.get("name") or "" for author in work.get("authors") or []
        )
        results.append({
            "core_id": work.get("id", ""),
            "title": work.get("title", ""),
            "authors": author_names,
            "year": work.get("yearPublished", ""),
            "doi": work.get("doi", ""),
            "download_url": work.get("downloadUrl", ""),
            "abstract": (work.get("abstract") or "")[:300],
        })

    df = pd.DataFrame(results)
    print(f"CORE search '{query}': {len(df)} works")
    return df
273
+ ```
274
+
275
+ ## 6. Zenodo レコード検索
276
+
277
+ ```python
278
+ ZENODO_API = "https://zenodo.org/api"
279
+
280
+
281
def search_zenodo(query, resource_type=None, size=25, timeout=30):
    """
    Search Zenodo records (datasets, software, publications, ...).

    Parameters:
        query: str — search query
        resource_type: str — optional filter, e.g. "publication", "dataset",
            "software", "poster"; sent as the ``type`` parameter
        size: int — maximum number of records to request
        timeout: float — seconds to wait for the HTTP response

    Returns:
        pandas.DataFrame with one row per hit and the fields zenodo_id, doi,
        title, creators (comma-separated names), resource_type,
        publication_date, access_right.

    Raises:
        requests.HTTPError: the API answered with a non-success status.

    ToolUniverse:
        Zenodo_search_records(query=query, type=resource_type)
    """
    params = {"q": query, "size": size}
    if resource_type:
        params["type"] = resource_type
    resp = requests.get(f"{ZENODO_API}/records", params=params, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    results = []
    for hit in (data.get("hits") or {}).get("hits") or []:
        meta = hit.get("metadata") or {}
        # "or" guards cover keys that are present but null; str.join would
        # raise TypeError on a None name.
        creators = ", ".join(
            creator.get("name") or "" for creator in meta.get("creators") or []
        )
        results.append({
            "zenodo_id": hit.get("id", ""),
            "doi": meta.get("doi", ""),
            "title": meta.get("title", ""),
            "creators": creators,
            "resource_type": (meta.get("resource_type") or {}).get("type", ""),
            "publication_date": meta.get("publication_date", ""),
            "access_right": meta.get("access_right", ""),
        })

    df = pd.DataFrame(results)
    print(f"Zenodo search '{query}': {len(df)} records")
    return df
318
+ ```
319
+
320
+ ## 7. DOAJ OA ジャーナル検索
321
+
322
+ ```python
323
+ DOAJ_API = "https://doaj.org/api"
324
+
325
+
326
def search_doaj_articles(query, page=1, page_size=25, timeout=30):
    """
    Search open-access journal articles indexed by DOAJ.

    Parameters:
        query: str — search query; it is placed in the URL path
        page: int — 1-based result page to request
        page_size: int — number of results per page
        timeout: float — seconds to wait for the HTTP response

    Returns:
        pandas.DataFrame with one row per article and the fields doi, title,
        journal, year, authors (comma-separated names), keywords (list).
        ``doi`` is "" when the record lists no identifier of type "doi".

    Raises:
        requests.HTTPError: the API answered with a non-success status.

    ToolUniverse:
        DOAJ_search_articles(query=query)
    """
    from urllib.parse import quote

    # The query travels in the path, so escape everything (including "/")
    # to keep it a single path segment.
    url = f"{DOAJ_API}/search/articles/{quote(query, safe='')}"
    # Paging options must actually be sent; previously they were built but
    # never passed, so page/page_size had no effect.
    params = {"page": page, "pageSize": page_size}
    resp = requests.get(url, params=params, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    results = []
    for item in data.get("results", []):
        bib = item.get("bibjson") or {}
        # Pick the identifier explicitly typed as a DOI: the list can be
        # empty, and its first entry is not guaranteed to be the DOI.
        doi = next(
            (
                ident.get("id") or ""
                for ident in bib.get("identifier") or []
                if (ident.get("type") or "").lower() == "doi"
            ),
            "",
        )
        results.append({
            "doi": doi,
            "title": bib.get("title", ""),
            "journal": (bib.get("journal") or {}).get("title", ""),
            "year": bib.get("year", ""),
            "authors": ", ".join(
                author.get("name") or "" for author in bib.get("author") or []
            ),
            "keywords": bib.get("keywords", []),
        })

    df = pd.DataFrame(results)
    print(f"DOAJ search '{query}': {len(df)} OA articles")
    return df
355
+ ```
356
+
357
+ ## 8. OpenAIRE 研究成果物検索
358
+
359
+ ```python
360
+ OPENAIRE_API = "https://api.openaire.eu/search"
361
+
362
+
363
def search_openaire(query, result_type="publication", size=25, timeout=30):
    """
    Search OpenAIRE research products by keyword.

    Parameters:
        query: str — keywords to search for
        result_type: str — product type; an "s" is appended to form the
            endpoint name (e.g. "publication" -> ".../publications")
        size: int — maximum number of results to request
        timeout: float — seconds to wait for the HTTP response

    Returns:
        pandas.DataFrame with one row per result and the fields title, date,
        publisher. A field is "" when it is absent from the record.

    Raises:
        requests.HTTPError: the API answered with a non-success status.

    ToolUniverse:
        OpenAIRE_search_publications(query=query)
    """
    def _text(node):
        # Values are read from the "$" key of an object.
        # NOTE(review): the list case below assumes repeated elements arrive
        # as a list of such objects — confirm against live responses.
        if isinstance(node, list):
            node = node[0] if node else None
        if isinstance(node, dict):
            return node.get("$", "")
        return ""

    params = {
        "keywords": query,
        "size": size,
        "format": "json",
    }
    resp = requests.get(
        f"{OPENAIRE_API}/{result_type}s", params=params, timeout=timeout
    )
    resp.raise_for_status()
    data = resp.json()

    # "or {}" keeps the chain alive when a level is present but null.
    container = (data.get("response") or {}).get("results") or {}
    results_list = container.get("result") or []
    if isinstance(results_list, dict):
        # NOTE(review): assumes a lone result may come as an object rather
        # than a one-element list — confirm against live responses.
        results_list = [results_list]

    results = []
    for item in results_list:
        entity = (item.get("metadata") or {}).get("oaf:entity") or {}
        meta = entity.get("oaf:result") or {}
        results.append({
            "title": _text(meta.get("title")),
            "date": _text(meta.get("dateofacceptance")),
            "publisher": _text(meta.get("publisher")),
        })

    df = pd.DataFrame(results)
    print(f"OpenAIRE search '{query}': {len(df)} results")
    return df
394
+ ```
395
+
396
+ ## 9. 統合マルチアーカイブ検索パイプライン
397
+
398
+ ```python
399
def multi_archive_search(query, archives=None, **kwargs):
    """
    Run the same query against several preprint / open-access archives.

    Each archive is searched independently on a best-effort basis: if one
    search raises, the error is printed and that archive gets an empty
    DataFrame, so the remaining archives are still searched.

    Parameters:
        query: str — search query passed to every archive
        archives: list — any of ["biorxiv", "medrxiv", "arxiv", "core",
            "zenodo", "doaj", "openaire"]; defaults to
            ["biorxiv", "arxiv", "core"]. Unrecognised names are reported
            and skipped.
        **kwargs: ``core_api_key`` is forwarded to the CORE search as its
            API key (defaults to "").

    Returns:
        dict mapping each recognised archive name to the DataFrame returned
        by its search function (empty DataFrame on failure).
    """
    if archives is None:
        archives = ["biorxiv", "arxiv", "core"]

    search_funcs = {
        "biorxiv": lambda q: search_biorxiv(q, server="biorxiv"),
        "medrxiv": lambda q: search_biorxiv(q, server="medrxiv"),
        "arxiv": lambda q: search_arxiv(q),
        "core": lambda q: search_core(q, api_key=kwargs.get("core_api_key", "")),
        "zenodo": lambda q: search_zenodo(q),
        "doaj": lambda q: search_doaj_articles(q),
        "openaire": lambda q: search_openaire(q),
    }

    all_results = {}
    for archive in archives:
        search = search_funcs.get(archive)
        if search is None:
            # Report typos / unsupported names instead of dropping them
            # without a trace.
            print(f" ? {archive}: unknown archive (skipped)")
            continue
        try:
            df = search(query)
        except Exception as e:
            # Deliberately broad: one failing archive must not abort the
            # whole cross-archive search.
            print(f" ✗ {archive}: {e}")
            df = pd.DataFrame()
        else:
            print(f" ✓ {archive}: {len(df)} results")
        all_results[archive] = df

    total = sum(len(df) for df in all_results.values())
    # Count the archives actually searched, not the names requested.
    print(f"\nTotal: {total} results across {len(all_results)} archives")
    return all_results
434
+ ```
435
+
436
+ ---
437
+
438
+ ## 利用可能ツール
439
+
440
+ 以下のツールが ToolUniverse SMCP 経由で利用可能:
441
+
442
+ | ToolUniverse カテゴリ | 主なツール |
443
+ |---|---|
444
+ | `biorxiv` | `bioRxiv_search_preprints`, `bioRxiv_get_preprint_details` |
445
+ | `medrxiv` | `medRxiv_search_preprints` |
446
+ | `arxiv` | `arXiv_search_papers`, `arXiv_get_paper` |
447
+ | `pmc` | `PMC_get_fulltext` |
448
+ | `doaj` | `DOAJ_search_articles` |
449
+ | `unpaywall` | `Unpaywall_get_oa_status` |
450
+ | `hal` | `HAL_search` |
451
+ | `core` | `CORE_search_works`, `CORE_get_work` |
452
+ | `zenodo` | `Zenodo_search_records` |
453
+ | `openaire` | `OpenAIRE_search_publications` |
454
+ | `osf_preprints` | `OSF_search_preprints` |
455
+ | `fatcat` | `Fatcat_search_releases` |
456
+ | `dblp` | `DBLP_search_publications` |
457
+
458
+ ## パイプライン出力
459
+
460
+ | 出力ファイル | 説明 | 連携先スキル |
461
+ |---|---|---|
462
+ | `results/preprint_search.csv` | 横断検索結果 | → literature-search, systematic-review |
463
+ | `results/oa_availability.json` | OA ステータス・リンク | → deep-research |
464
+ | `results/fulltext_corpus/` | フルテキストコーパス | → text-mining-nlp, biomedical-pubtator |
465
+ | `results/arxiv_papers.csv` | arXiv 論文メタデータ | → deep-learning, graph-neural-networks |
466
+
467
+ ## パイプライン統合
468
+
469
+ ```
470
+ literature-search ──→ preprint-archive ──→ systematic-review
471
+ (PubMed/OpenAlex) (bioRxiv/arXiv/CORE) (PRISMA 2020)
472
+
473
+ ├──→ text-mining-nlp (NER/KG)
474
+ ├──→ biomedical-pubtator (PubTator NER)
475
+ └──→ deep-research (エビデンス統合)
476
+ ```