@nahisaho/satori 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +123 -4
- package/package.json +1 -1
- package/src/.github/skills/scientific-environmental-ecology/SKILL.md +295 -0
- package/src/.github/skills/scientific-epidemiology-public-health/SKILL.md +332 -0
- package/src/.github/skills/scientific-immunoinformatics/SKILL.md +341 -0
- package/src/.github/skills/scientific-infectious-disease/SKILL.md +342 -0
- package/src/.github/skills/scientific-microbiome-metagenomics/SKILL.md +349 -0
- package/src/.github/skills/scientific-population-genetics/SKILL.md +336 -0
- package/src/.github/skills/scientific-single-cell-genomics/SKILL.md +361 -0
- package/src/.github/skills/scientific-spatial-transcriptomics/SKILL.md +281 -0
- package/src/.github/skills/scientific-systems-biology/SKILL.md +310 -0
- package/src/.github/skills/scientific-text-mining-nlp/SKILL.md +358 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-text-mining-nlp
|
|
3
|
+
description: |
|
|
4
|
+
科学テキストマイニング・NLP スキル。生物医学 NER(遺伝子/疾患/薬物/化合物)・
|
|
5
|
+
関係抽出(PPI / DDI / GDA)・文献ベースナレッジグラフ構築・
|
|
6
|
+
エビデンス要約・トピックモデリング・引用ネットワーク解析パイプライン。
|
|
7
|
+
PubTator / SemanticScholar / EuropePMC データ統合。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Text Mining & NLP
|
|
11
|
+
|
|
12
|
+
科学文献に対する自然言語処理(NLP)パイプラインを提供する。
|
|
13
|
+
生物医学エンティティ認識、関係抽出、ナレッジグラフ構築、
|
|
14
|
+
トピックモデリング、自動エビデンス要約を体系的に扱う。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- 大量の科学文献から遺伝子・疾患・薬物名を自動抽出するとき
|
|
19
|
+
- タンパク質-タンパク質相互作用(PPI)等の関係を文献から抽出するとき
|
|
20
|
+
- 文献ベースのナレッジグラフを構築するとき
|
|
21
|
+
- 研究トレンドのトピックモデリングを行うとき
|
|
22
|
+
- 引用ネットワーク分析で影響力のある論文を同定するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. 生物医学 NER(Named Entity Recognition)
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import numpy as np
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
def biomedical_ner(texts, model="biobert", entity_types=None):
    """
    Run biomedical named-entity recognition over a collection of texts.

    Args:
        texts: Sequence of document strings. A document's position in the
            sequence is reported as its ``doc_id``.
        model: NER backend.
            - "biobert": Hugging Face ``ner`` pipeline loaded from the
              ``dmis-lab/biobert-large-cased-v1.1-ner`` checkpoint.
            - "scispacy": spaCy ``en_core_sci_lg`` plus the scispaCy
              entity linker configured for UMLS.
            "pubtator" (PubTator3 API) is not implemented in this function
            and is rejected like any other unknown value.
        entity_types: Entity categories of interest; defaults to
            ``["Gene", "Disease", "Chemical", "Species", "Mutation"]``.
            NOTE(review): the value is accepted but not used to filter the
            output — confirm whether filtering is intended.

    Returns:
        pandas.DataFrame with one row per recognized entity.
        - scispacy columns: doc_id, text, label, start, end, kb_id,
          confidence (kb_id / confidence come from the first linker
          candidate, or None when there is none).
        - biobert columns: doc_id, text, label, score, start, end.
        The columns are present even when no entity was found.

    Raises:
        ValueError: If ``model`` is not "scispacy" or "biobert".
    """
    supported_models = ("scispacy", "biobert")
    # Fail fast (before any heavy model import) instead of silently
    # falling through and returning None.
    if model not in supported_models:
        raise ValueError(
            f"Unsupported NER model {model!r}; expected one of {supported_models}"
        )

    if entity_types is None:
        entity_types = ["Gene", "Disease", "Chemical", "Species", "Mutation"]

    if model == "scispacy":
        import spacy
        # Imported before add_pipe("scispacy_linker"), following scispaCy's
        # documented usage; the name itself is not referenced below.
        from scispacy.linking import EntityLinker  # noqa: F401

        nlp = spacy.load("en_core_sci_lg")
        nlp.add_pipe("scispacy_linker", config={
            "resolve_abbreviations": True,
            "linker_name": "umls"
        })

        columns = ["doc_id", "text", "label", "start", "end",
                   "kb_id", "confidence"]
        rows = []
        for i, text in enumerate(texts):
            doc = nlp(text)
            for ent in doc.ents:
                # Keep only the first knowledge-base candidate, if any.
                candidates = ent._.kb_ents
                kb_id, kb_score = candidates[0] if candidates else (None, None)
                rows.append({
                    "doc_id": i,
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "kb_id": kb_id,
                    "confidence": kb_score,
                })

        # Explicit columns keep the schema stable when no entity is found.
        df = pd.DataFrame(rows, columns=columns)
        print(f" NER: {len(df)} entities from {len(texts)} documents")
        return df

    # model == "biobert"
    from transformers import pipeline

    ner_pipeline = pipeline("ner", model="dmis-lab/biobert-large-cased-v1.1-ner",
                            aggregation_strategy="simple")

    columns = ["doc_id", "text", "label", "score", "start", "end"]
    rows = []
    for i, text in enumerate(texts):
        for ent in ner_pipeline(text):
            rows.append({
                "doc_id": i,
                "text": ent["word"],
                "label": ent["entity_group"],
                "score": ent["score"],
                # Character offsets, when the pipeline reports them, so both
                # backends expose entity positions.
                "start": ent.get("start"),
                "end": ent.get("end"),
            })

    return pd.DataFrame(rows, columns=columns)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## 2. 関係抽出
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
def relation_extraction(texts, relation_type="ppi", model="biobert_re"):
    """
    Extract pairwise entity relations from scientific texts.

    Pipeline:
        1. Run NER (scispaCy backend of ``biomedical_ner``) over all texts.
        2. Enumerate every unordered pair of entities found in the same
           document as a candidate.
        3. Classify each candidate with a Hugging Face
           ``text-classification`` pipeline on the entity-marked text.
        4. Keep predictions whose top score is greater than 0.7.

    Args:
        texts: Sequence of document strings.
        relation_type: Relation family to extract. A checkpoint is
            configured for "ppi" (protein-protein interaction) and "ddi"
            (drug-drug interaction). "gda", "chem_disease" and "chem_gene"
            have no checkpoint configured here and are rejected.
        model: NOTE(review): accepted for interface compatibility but not
            used by this function — confirm intended use.

    Returns:
        pandas.DataFrame with columns doc_id, entity_a, entity_b, relation,
        confidence (present even when no relation was found).

    Raises:
        ValueError: If no checkpoint is configured for ``relation_type``.
    """
    from itertools import combinations

    # NOTE(review): both entries point at "dmis-lab/biobert-v1.1", which
    # looks like the base BioBERT checkpoint rather than a relation
    # classifier — confirm a fine-tuned classification head is available.
    checkpoints = {
        "ppi": "dmis-lab/biobert-v1.1",
        "ddi": "dmis-lab/biobert-v1.1",
    }
    # Validate up front; previously an unlisted type left the checkpoint
    # variable unbound and failed later with a NameError.
    if relation_type not in checkpoints:
        raise ValueError(
            f"Unsupported relation_type {relation_type!r}; "
            f"expected one of {sorted(checkpoints)}"
        )

    from transformers import pipeline

    classifier = pipeline("text-classification", model=checkpoints[relation_type])

    # Run NER once for the whole batch so the spaCy model and linker are
    # loaded a single time instead of once per document.
    ner_results = biomedical_ner(texts, model="scispacy")
    if "doc_id" in ner_results.columns:
        entities_by_doc = dict(tuple(ner_results.groupby("doc_id")))
    else:
        # NER produced a frame without columns (no entities at all).
        entities_by_doc = {}

    columns = ["doc_id", "entity_a", "entity_b", "relation", "confidence"]
    relations = []
    for i, text in enumerate(texts):
        doc_entities = entities_by_doc.get(i)
        if doc_entities is None:
            continue

        rows = [row for _, row in doc_entities.iterrows()]
        # Each unordered pair exactly once, in document order.
        for ent_a, ent_b in combinations(rows, 2):
            marked_text = mark_entities(text, ent_a, ent_b)
            # Truncate at the tokenizer level (512 tokens); slicing the
            # string limits characters, not tokens.
            pred = classifier(marked_text, truncation=True, max_length=512)

            if pred[0]["score"] > 0.7:
                relations.append({
                    "doc_id": i,
                    "entity_a": ent_a["text"],
                    "entity_b": ent_b["text"],
                    "relation": pred[0]["label"],
                    "confidence": pred[0]["score"],
                })

    df = pd.DataFrame(relations, columns=columns)
    print(f" RE: {len(df)} relations from {len(texts)} documents")
    return df
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def mark_entities(text, ent_a, ent_b):
    """
    Return ``text`` with the two entities wrapped in marker tags.

    Entity A is wrapped as ``@ENTITY_A@ <text> @/ENTITY_A@`` and entity B as
    ``@ENTITY_B@ <text> @/ENTITY_B@``.

    Args:
        text: The source document string.
        ent_a: Mapping-like entity (dict or DataFrame row) with a "text"
            entry and, optionally, "start"/"end" character offsets.
        ent_b: Same shape as ``ent_a``.

    Returns:
        The marked-up string. When both entities carry offsets that match
        ``text`` and do not overlap, only those exact mentions are wrapped.
        Otherwise every occurrence of each entity string is wrapped via
        ``str.replace`` (the simple fallback).
    """

    def _span(ent):
        """Return validated (start, end) offsets for ``ent``, or None."""
        try:
            start, end = int(ent.get("start")), int(ent.get("end"))
        except (AttributeError, TypeError, ValueError):
            # No .get(), missing offsets (None) or NaN offsets.
            return None
        if 0 <= start < end <= len(text) and text[start:end] == ent["text"]:
            return start, end
        return None

    span_a, span_b = _span(ent_a), _span(ent_b)
    disjoint = (
        span_a is not None
        and span_b is not None
        and (span_a[1] <= span_b[0] or span_b[1] <= span_a[0])
    )
    if disjoint:
        marked = text
        # Insert right-to-left so the earlier span's offsets stay valid.
        for (start, end), tag in sorted(
            [(span_a, "ENTITY_A"), (span_b, "ENTITY_B")], reverse=True
        ):
            marked = (
                f"{marked[:start]}@{tag}@ {marked[start:end]} @/{tag}@"
                f"{marked[end:]}"
            )
        return marked

    # Fallback: no usable offsets, mark by plain string replacement.
    marked = text.replace(ent_a["text"], f"@ENTITY_A@ {ent_a['text']} @/ENTITY_A@")
    marked = marked.replace(ent_b["text"], f"@ENTITY_B@ {ent_b['text']} @/ENTITY_B@")
    return marked
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## 3. ナレッジグラフ構築
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
def build_knowledge_graph(entities_df, relations_df, min_confidence=0.7):
    """
    Build a literature-derived knowledge graph and detect communities.

    Nodes are entities keyed by their surface text; edges are relations.

    Steps performed:
        1. Add one node per distinct entity text (later rows with the same
           text overwrite the node's ``type`` / ``kb_id`` attributes).
        2. Drop relations whose confidence is below ``min_confidence``.
        3. Aggregate the remaining relations per
           (entity_a, entity_b, relation): one edge carrying the number of
           supporting rows (``frequency``) and the maximum ``confidence``.
        4. Run Louvain community detection on an undirected simple copy.

    NOTE(review): no identifier normalization (e.g. UMLS CUI / MeSH) is
    performed here; entities are merged by exact text only.

    Args:
        entities_df: DataFrame with "text" and "label" columns and an
            optional "kb_id" column.
        relations_df: DataFrame with "entity_a", "entity_b", "relation" and
            "confidence" columns. A frame without a "confidence" column
            (e.g. an empty result) contributes no edges.
        min_confidence: Inclusive lower bound on relation confidence.

    Returns:
        (G, communities): the ``networkx.MultiDiGraph`` and a list of node
        sets, one per community.
    """
    import networkx as nx
    from networkx.algorithms.community import louvain_communities

    G = nx.MultiDiGraph()

    # Entity nodes.
    for _, ent in entities_df.iterrows():
        G.add_node(ent["text"], type=ent["label"],
                   kb_id=ent.get("kb_id", None))

    # Aggregate relations: key -> (frequency, max confidence).
    aggregated = {}
    if "confidence" in relations_df.columns:
        rel_filtered = relations_df[relations_df["confidence"] >= min_confidence]
        for _, rel in rel_filtered.iterrows():
            key = (rel["entity_a"], rel["entity_b"], rel["relation"])
            count, best = aggregated.get(key, (0, rel["confidence"]))
            aggregated[key] = (count + 1, max(best, rel["confidence"]))

    # One edge per aggregated relation, carrying the final totals rather
    # than a running count.
    for (source, target, relation), (frequency, confidence) in aggregated.items():
        G.add_edge(source, target,
                   relation=relation,
                   confidence=confidence,
                   frequency=frequency)

    # Community detection on an undirected simple view of the graph.
    G_simple = nx.Graph(G)
    if G_simple.number_of_edges() == 0:
        # Without edges every node is its own community; skipping Louvain
        # avoids a zero total edge weight in the modularity computation.
        communities = [{node} for node in G_simple.nodes()]
    else:
        communities = louvain_communities(G_simple, resolution=1.0)

    print(f" KG: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges, "
          f"{len(communities)} communities")
    return G, communities
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## 4. トピックモデリング
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
def topic_modeling(abstracts, n_topics=10, method="bertopic"):
    """
    Fit a topic model on a collection of abstracts.

    Args:
        abstracts: Sequence of document strings.
        n_topics: Target number of topics (``nr_topics`` for BERTopic,
            ``n_components`` for LDA / NMF).
        method: Which model to fit.
            - "bertopic": BERTopic with the ``allenai-specter``
              SentenceTransformer as the embedding model.
            - "lda": Latent Dirichlet Allocation on a bag-of-words matrix
              (CountVectorizer).
            - "nmf": Non-negative Matrix Factorization on a TF-IDF matrix.

    Returns:
        - "bertopic": ``(topic_model, topics, probs)`` as returned by
          ``BERTopic.fit_transform``.
        - "lda" / "nmf": ``(fitted_estimator, topics)`` where ``topics``
          maps ``"Topic_<i>"`` to that component's 10 highest-weight terms,
          listed in ascending weight order (strongest term last).

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    supported_methods = ("bertopic", "lda", "nmf")
    # Fail fast instead of silently returning None for unknown methods.
    if method not in supported_methods:
        raise ValueError(
            f"Unsupported method {method!r}; expected one of {supported_methods}"
        )

    if method == "bertopic":
        from bertopic import BERTopic
        from sentence_transformers import SentenceTransformer

        embedding_model = SentenceTransformer("allenai-specter")
        topic_model = BERTopic(embedding_model=embedding_model,
                               nr_topics=n_topics,
                               calculate_probabilities=True)

        topics, probs = topic_model.fit_transform(abstracts)

        topic_info = topic_model.get_topic_info()
        # NOTE(review): the "- 1" presumably discounts BERTopic's outlier
        # topic row — confirm it is always present.
        print(f" Topics: {len(topic_info) - 1} topics from {len(abstracts)} documents")
        return topic_model, topics, probs

    # "lda" and "nmf" share the vectorize -> fit -> top-terms flow.
    from sklearn.decomposition import NMF, LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    if method == "lda":
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")
        estimator = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    else:  # "nmf"
        vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english")
        estimator = NMF(n_components=n_topics, random_state=42)

    dtm = vectorizer.fit_transform(abstracts)
    estimator.fit(dtm)

    feature_names = vectorizer.get_feature_names_out()
    topics = {
        f"Topic_{i}": [feature_names[j] for j in weights.argsort()[-10:]]
        for i, weights in enumerate(estimator.components_)
    }
    return estimator, topics
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## 5. 引用ネットワーク分析
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
def citation_network_analysis(papers_df, citations_df):
    """
    Build a directed citation graph and compute per-paper influence metrics.

    An edge ``citing -> cited`` is added for every row of ``citations_df``.
    Papers that appear only in ``citations_df`` become nodes without
    ``title`` / ``year`` attributes.

    Metrics computed per node:
        - in_degree: number of incoming citation edges.
        - out_degree: number of outgoing citation edges.
        - pagerank: ``networkx.pagerank`` with ``alpha=0.85``.
        - hub_score / authority_score: ``networkx.hits`` with
          ``max_iter=100``; both are 0.0 for every node when the graph has
          no citation edges.

    Args:
        papers_df: DataFrame with "paper_id", "title" and "year" columns.
        citations_df: DataFrame with "citing" and "cited" columns.

    Returns:
        (G, metrics_df): the ``networkx.DiGraph`` and a DataFrame with
        columns paper_id, in_degree, out_degree, pagerank, hub_score,
        authority_score, sorted by pagerank in descending order.
    """
    import networkx as nx

    G = nx.DiGraph()

    # The .empty guards also cover frames created without any columns.
    if not papers_df.empty:
        for paper_id, title, year in zip(papers_df["paper_id"],
                                         papers_df["title"],
                                         papers_df["year"]):
            G.add_node(paper_id, title=title, year=year)

    if not citations_df.empty:
        G.add_edges_from(zip(citations_df["citing"], citations_df["cited"]))

    # PageRank
    pagerank = nx.pagerank(G, alpha=0.85)

    # HITS is not meaningful on an edgeless graph (all-zero adjacency), so
    # report zero hub / authority scores instead of calling the solver.
    if G.number_of_edges() == 0:
        hubs = {}
        authorities = {}
    else:
        hubs, authorities = nx.hits(G, max_iter=100)

    # Collect the metrics in node order.
    nodes = list(G.nodes())
    in_degrees = dict(G.in_degree())
    out_degrees = dict(G.out_degree())
    metrics_df = pd.DataFrame({
        "paper_id": nodes,
        "in_degree": [in_degrees[n] for n in nodes],
        "out_degree": [out_degrees[n] for n in nodes],
        "pagerank": [pagerank.get(n, 0) for n in nodes],
        "hub_score": [hubs.get(n, 0.0) for n in nodes],
        "authority_score": [authorities.get(n, 0.0) for n in nodes],
    })
    metrics_df = metrics_df.sort_values("pagerank", ascending=False)

    print(f" Citation network: {G.number_of_nodes()} papers, "
          f"{G.number_of_edges()} citations")
    return G, metrics_df
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
## References
|
|
314
|
+
|
|
315
|
+
### Output Files
|
|
316
|
+
|
|
317
|
+
| ファイル | 形式 |
|
|
318
|
+
|---|---|
|
|
319
|
+
| `results/ner_entities.csv` | CSV |
|
|
320
|
+
| `results/relations.csv` | CSV |
|
|
321
|
+
| `results/knowledge_graph.json` | JSON |
|
|
322
|
+
| `results/topic_model_info.csv` | CSV |
|
|
323
|
+
| `results/citation_metrics.csv` | CSV |
|
|
324
|
+
| `figures/kg_visualization.png` | PNG |
|
|
325
|
+
| `figures/topic_distribution.png` | PNG |
|
|
326
|
+
| `figures/citation_network.png` | PNG |
|
|
327
|
+
|
|
328
|
+
### 利用可能ツール
|
|
329
|
+
|
|
330
|
+
> [ToolUniverse](https://github.com/mims-harvard/ToolUniverse) SMCP 経由で利用可能な外部ツール。
|
|
331
|
+
|
|
332
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
333
|
+
|---|---|---|
|
|
334
|
+
| PubTator | `PubTator3_LiteratureSearch` | NER 付き文献検索 |
|
|
335
|
+
| PubTator | `PubTator3_EntityAutocomplete` | エンティティ補完 |
|
|
336
|
+
| PubMed | `PubMed_search_articles` | PubMed 文献検索 |
|
|
337
|
+
| PubMed | `PubMed_get_cited_by` | 被引用論文取得 |
|
|
338
|
+
| PubMed | `PubMed_get_related` | 関連論文取得 |
|
|
339
|
+
| SemanticScholar | `SemanticScholar_search_papers` | 学術論文検索 |
|
|
340
|
+
| EuropePMC | `EuropePMC_search_articles` | EuropePMC 検索 |
|
|
341
|
+
| EuropePMC | `EuropePMC_get_fulltext` | 全文テキスト取得 |
|
|
342
|
+
| EuropePMC | `EuropePMC_get_references` | 引用文献取得 |
|
|
343
|
+
| OpenAlex | `openalex_search_works` | OpenAlex 検索 |
|
|
344
|
+
| DBLP | `DBLP_search_publications` | CS 文献検索 |
|
|
345
|
+
|
|
346
|
+
### 参照スキル
|
|
347
|
+
|
|
348
|
+
| スキル | 連携内容 |
|
|
349
|
+
|---|---|
|
|
350
|
+
| [scientific-deep-research](../scientific-deep-research/SKILL.md) | 深層文献調査 |
|
|
351
|
+
| [scientific-citation-checker](../scientific-citation-checker/SKILL.md) | 引用検証 |
|
|
352
|
+
| [scientific-network-analysis](../scientific-network-analysis/SKILL.md) | ネットワーク解析 |
|
|
353
|
+
| [scientific-meta-analysis](../scientific-meta-analysis/SKILL.md) | 系統的文献レビュー |
|
|
354
|
+
| [scientific-graph-neural-networks](../scientific-graph-neural-networks/SKILL.md) | ナレッジグラフ推論 |
|
|
355
|
+
|
|
356
|
+
### 依存パッケージ
|
|
357
|
+
|
|
358
|
+
- scispacy, spacy, transformers, bertopic, sentence-transformers, networkx, scikit-learn, pandas, numpy
|