@fbraza/pi-cite 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -0
- package/package.json +5 -1
- package/skills/literature/SKILL.md +208 -0
- package/skills/literature/references/full-text-access-guide.md +34 -0
- package/skills/literature/references/preclinical-extraction-guide.md +215 -0
- package/skills/literature/references/pubmed_api_reference.md +298 -0
- package/skills/literature/references/pubmed_common_queries.md +453 -0
- package/skills/literature/references/pubmed_routine.md +93 -0
- package/skills/literature/references/pubmed_search_syntax.md +436 -0
- package/skills/literature/references/scihub_routine.md +40 -0
- package/skills/literature/references/semanticscholar_routine.md +50 -0
- package/skills/literature/scripts/export_all.py +53 -0
- package/skills/literature/scripts/extract_experiments.py +401 -0
- package/skills/literature/scripts/generate_table.py +96 -0
- package/skills/literature/scripts/scihub_pdf_resolver.py +289 -0
- package/skills/literature/scripts/synthesis.py +93 -0
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Preclinical Experiment Extraction Module
|
|
3
|
+
|
|
4
|
+
Parse abstracts to extract structured in vitro and in vivo experiment details.
|
|
5
|
+
Uses keyword-based extraction to identify cell lines, assays, animal models,
|
|
6
|
+
endpoints, and key findings from each paper.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
import os
|
|
11
|
+
from typing import List, Dict, Tuple
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Keyword dictionaries
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
# In vitro indicators
|
|
20
|
+
# Matched case-insensitively and as whole words (see _keyword_match, which
# wraps every keyword in \b...\b). Because of that anchoring a bare stem such
# as "transfect" only matches the exact word "transfect", so the inflected
# forms that actually appear in abstracts are listed explicitly.
IN_VITRO_KEYWORDS = [
    "cell line", "cell lines", "cell culture", "in vitro", "cultured cells",
    "transfect", "transduct", "knockdown", "overexpress", "overexpression",
    "siRNA", "shRNA", "CRISPR", "sgRNA",
    "co-culture", "monolayer", "spheroid", "organoid",
    # Inflected forms of the stems above (whole-word matching needs them).
    "transfected", "transfection", "transduced", "transduction",
    "overexpressed", "overexpressing",
]
|
|
26
|
+
|
|
27
|
+
# Common cell line names.
# Unlike the keyword lists, these names are matched case-sensitively (and as
# whole words) by _classify_experiment_type and _extract_in_vitro, so spelling
# variants such as "MCF-7" / "MCF7" are listed as separate entries.
CELL_LINE_NAMES = [
    "MCF-7", "MCF7", "MDA-MB-231", "MDA-MB-468", "T47D", "BT-474", "BT474",
    "BT-549", "BT549", "MDA-MB-453", "CAL-51", "HCC1937", "HCC1806",
    "SK-BR-3", "SKBR3", "ZR-75", "4T1", "EMT6",
    "HeLa", "HEK293", "HEK-293", "293T", "HEK293T",
    "A549", "H1299", "H460", "H1975", "PC9", "HCC827",
    "HCT116", "HT29", "SW480", "SW620", "LoVo", "Caco-2",
    "U87", "U251", "T98G", "LN229",
    "PC3", "PC-3", "LNCaP", "DU145", "22Rv1", "VCaP",
    "K562", "HL60", "HL-60", "Jurkat", "THP-1", "U937",
    "HepG2", "Hep3B", "Huh7", "SMMC-7721",
    "PANC-1", "MiaPaCa-2", "BxPC-3", "AsPC-1",
    "A375", "SK-MEL-28", "B16", "B16F10",
    "OVCAR3", "SKOV3", "A2780",
    "CHO", "NIH3T3", "3T3", "COS-7",
    "Raji", "Ramos", "Daudi",
    "SH-SY5Y", "Neuro-2a", "N2a",
    "RAW264.7", "RAW 264.7", "J774",
]
|
|
47
|
+
|
|
48
|
+
# Assay keyword categories
|
|
49
|
+
# Maps an assay category label to the keywords that indicate it. A category is
# reported by _extract_in_vitro as soon as any one of its keywords matches
# (case-insensitive, whole-word match via _keyword_match).
ASSAY_KEYWORDS = {
    "viability": [
        "viability", "MTT", "CCK-8", "CCK8", "WST", "cell counting",
        "CellTiter", "MTS", "XTT", "alamarBlue", "resazurin",
        "cytotoxicity", "IC50", "EC50", "dose-response",
    ],
    "proliferation": [
        "proliferation", "colony formation", "clonogenic", "BrdU", "EdU",
        "Ki-67", "Ki67", "cell growth", "growth curve", "doubling time",
    ],
    "apoptosis": [
        "apoptosis", "annexin", "caspase", "TUNEL", "cell death",
        "sub-G1", "programmed cell death", "Bcl-2", "BAX",
        "cleaved PARP", "cytochrome c release",
    ],
    "migration_invasion": [
        "migration", "invasion", "wound healing", "transwell", "Boyden",
        "scratch assay", "chemotaxis", "Matrigel",
    ],
    "gene_expression": [
        "qPCR", "RT-PCR", "real-time PCR", "qRT-PCR",
        "mRNA expression", "RNA-seq", "RNAseq", "transcriptom",
        # "transcriptom" is a stem; whole-word matching never hits it inside
        # "transcriptome"/"transcriptomic(s)", so the full words are listed.
        "transcriptome", "transcriptomic", "transcriptomics",
        "gene expression", "Northern blot",
    ],
    "protein_analysis": [
        "Western blot", "immunoblot", "ELISA", "immunoprecipitation",
        "phosphorylation", "Co-IP", "pull-down", "mass spectrometry",
        "proteomics", "immunofluorescence",
    ],
    "flow_cytometry": [
        "flow cytometry", "FACS", "cell cycle", "cell sorting",
        "intracellular staining", "surface marker",
    ],
    "reporter": [
        "luciferase", "reporter assay", "GFP", "fluorescent reporter",
        "dual-luciferase", "beta-galactosidase",
    ],
    "cell_signaling": [
        "signaling assay", "signaling pathway analysis",
        "phospho-", "kinase activity", "kinase assay",
        "pathway activation assay", "phosphoproteomics",
    ],
}
|
|
92
|
+
|
|
93
|
+
# In vivo indicators
|
|
94
|
+
# Used by _classify_experiment_type to flag a paper as in vivo and by
# _extract_in_vivo as the context filter for finding sentences. Entries are
# matched case-insensitively and as whole words (see _keyword_match).
IN_VIVO_KEYWORDS = [
    "in vivo", "mouse", "mice", "murine", "rat", "rats",
    "xenograft", "allograft", "PDX", "patient-derived xenograft",
    "orthotopic", "subcutaneous", "tumor-bearing",
    "nude mice", "BALB/c", "C57BL/6", "SCID", "NSG", "NOD",
    "transgenic", "knockout mice", "knock-in",
    "animal model", "animal experiment", "preclinical model",
]
|
|
102
|
+
|
|
103
|
+
# Animal model categories
|
|
104
|
+
# Maps an animal-model category label to the keywords that indicate it.
# _extract_in_vivo reports a category when any of its keywords matches
# (case-insensitive, whole-word match via _keyword_match).
ANIMAL_MODEL_KEYWORDS = {
    "xenograft": [
        "xenograft", "subcutaneous tumor", "subcutaneous implant",
        "orthotopic implant", "orthotopic model",
        "human tumor", "nude mice xenograft",
    ],
    "pdx": [
        "PDX", "patient-derived xenograft", "patient-derived model",
    ],
    "syngeneic": [
        "syngeneic", "allograft", "immunocompetent",
        "4T1", "CT26", "B16", "MC38", "LLC", "EMT6",
    ],
    "transgenic": [
        "transgenic", "knockout", "knock-in", "conditional knockout",
        "Cre-lox", "GEMM", "genetically engineered",
    ],
    "metastasis": [
        "metastasis model", "tail vein injection", "intracardiac",
        "metastatic", "lung metastasis", "liver metastasis",
        "spontaneous metastasis",
    ],
}
|
|
127
|
+
|
|
128
|
+
# In vivo endpoint categories
|
|
129
|
+
# Maps an in vivo endpoint category label to the keywords that indicate it.
# _extract_in_vivo reports a category when any of its keywords matches
# (case-insensitive, whole-word match via _keyword_match).
ENDPOINT_KEYWORDS = {
    "tumor_growth": [
        "tumor volume", "tumor growth", "tumor weight", "tumor size",
        "tumor regression", "tumor inhibition", "anti-tumor",
        "tumor growth inhibition", "TGI",
    ],
    "survival": [
        "survival", "overall survival", "Kaplan-Meier",
        "median survival", "survival rate", "lifespan",
    ],
    "biomarker": [
        "biomarker", "serum level", "plasma level", "circulating",
        "pharmacodynamic", "PD marker",
    ],
    "imaging": [
        "bioluminescence", "in vivo imaging", "PET", "MRI", "CT scan",
        "IVIS", "fluorescence imaging", "ultrasound",
    ],
    "histology": [
        "histology", "immunohistochemistry", "IHC", "H&E",
        "histopathology", "tissue staining", "TUNEL staining",
    ],
    "pharmacokinetics": [
        "pharmacokinetic", "PK", "half-life", "bioavailability",
        "AUC", "Cmax", "clearance", "distribution",
    ],
    "toxicity": [
        "toxicity", "body weight", "adverse", "tolerability",
        "maximum tolerated dose", "MTD", "safety", "organ toxicity",
    ],
}
|
|
160
|
+
|
|
161
|
+
# Finding / result keywords
|
|
162
|
+
# Words that mark a sentence as reporting a result. _extract_findings and
# _extract_finding_sentences keep a sentence only if at least one of these
# matches (case-insensitive, whole-word match via _keyword_match).
FINDING_KEYWORDS = [
    "significantly", "inhibited", "reduced", "suppressed", "attenuated",
    "enhanced", "increased", "promoted", "induced", "abolished",
    "demonstrated", "showed", "revealed", "observed",
    "decreased", "elevated", "impaired", "restored", "abrogated",
    "potentiated", "synergistic", "additive", "antagonistic",
]
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# ---------------------------------------------------------------------------
|
|
172
|
+
# Public API
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
|
|
175
|
+
def extract_all_experiments(
    results: List[Dict],
    output_dir: str = "preclinical_results"
) -> List[Dict]:
    """
    Extract in vitro and in vivo experiment details from all paper abstracts.

    Each paper's title and abstract are concatenated and scanned with the
    keyword dictionaries of this module. One flat record per paper is
    collected, written to ``<output_dir>/experiment_extraction.csv`` and
    returned. Progress and a per-type summary are printed to stdout.

    Parameters
    ----------
    results : List[Dict]
        Search results from preclinical_search. The keys read here are
        ``title``, ``abstract``, ``pmid``, ``doi``, ``authors`` and
        ``publication_date``; every one of them may be missing.
    output_dir : str
        Output directory for experiment_extraction.csv (created if missing)

    Returns
    -------
    List[Dict]
        List of experiment extraction dicts, one per paper

    Verification
    ------------
    Prints "✓ Experiment extraction completed successfully!"
    """
    print("\n" + "=" * 70)
    print("EXTRACTING EXPERIMENTS FROM ABSTRACTS")
    print("=" * 70)

    experiments = []
    type_counts = {"in_vitro": 0, "in_vivo": 0, "both": 0}

    for paper in results:
        # ``or ""`` instead of a .get() default: a record whose title or
        # abstract is present but None would otherwise inject the literal
        # word "None" into the text that gets scanned and quoted.
        title = paper.get("title") or ""
        abstract = paper.get("abstract") or ""
        text = f"{title} {abstract}"

        # Classify experiment type
        exp_type = _classify_experiment_type(text)

        # Extract details
        in_vitro = _extract_in_vitro(text)
        in_vivo = _extract_in_vivo(text)
        findings = _extract_findings(text)

        # str.join on an empty list already yields "", so no guard is needed.
        experiments.append({
            "pmid": paper.get("pmid", ""),
            "doi": paper.get("doi", ""),
            "title": paper.get("title", ""),
            "authors": paper.get("authors", ""),
            "publication_date": paper.get("publication_date", ""),
            "experiment_type": exp_type,
            # In vitro details
            "cell_lines": "; ".join(in_vitro["cell_lines"]),
            "assays": "; ".join(in_vitro["assays"]),
            "in_vitro_findings": " | ".join(in_vitro["findings"][:3]),
            # In vivo details
            "animal_models": "; ".join(in_vivo["animal_models"]),
            "endpoints": "; ".join(in_vivo["endpoints"]),
            "in_vivo_findings": " | ".join(in_vivo["findings"][:3]),
            # General findings
            "key_findings": " | ".join(findings[:3]),
        })

        # Anything other than the three known labels counts as unclassified.
        if exp_type in type_counts:
            type_counts[exp_type] += 1

    # Summary
    unclassified = len(experiments) - sum(type_counts.values())
    print(f"\n Processed {len(experiments)} papers:")
    print(f" In vitro only: {type_counts['in_vitro']}")
    print(f" In vivo only: {type_counts['in_vivo']}")
    print(f" Both: {type_counts['both']}")
    print(f" Unclassified: {unclassified}")

    # Save CSV
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "experiment_extraction.csv")
    pd.DataFrame(experiments).to_csv(output_file, index=False, encoding="utf-8")
    print(f"\n Saved extraction results to {output_file}")

    print("\n✓ Experiment extraction completed successfully!")
    return experiments
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
# ---------------------------------------------------------------------------
|
|
268
|
+
# Internal helpers
|
|
269
|
+
# ---------------------------------------------------------------------------
|
|
270
|
+
|
|
271
|
+
def _keyword_match(text: str, keywords: list, case_sensitive: bool = False) -> bool:
|
|
272
|
+
"""Check if any keyword matches as a whole word in text."""
|
|
273
|
+
for kw in keywords:
|
|
274
|
+
if case_sensitive:
|
|
275
|
+
pattern = r'\b' + re.escape(kw) + r'\b'
|
|
276
|
+
if re.search(pattern, text):
|
|
277
|
+
return True
|
|
278
|
+
else:
|
|
279
|
+
pattern = r'\b' + re.escape(kw.lower()) + r'\b'
|
|
280
|
+
if re.search(pattern, text.lower()):
|
|
281
|
+
return True
|
|
282
|
+
return False
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _classify_experiment_type(text: str) -> str:
    """Label a text as "in_vitro", "in_vivo", "both" or "unclassified".

    In vitro evidence is either an IN_VITRO_KEYWORDS hit or, failing that, a
    case-sensitive whole-word hit on one of CELL_LINE_NAMES. In vivo evidence
    is an IN_VIVO_KEYWORDS hit.
    """
    vitro_hit = _keyword_match(text, IN_VITRO_KEYWORDS)
    vivo_hit = _keyword_match(text, IN_VIVO_KEYWORDS)

    # Fall back to explicit cell line names only when no generic in vitro
    # keyword was found; these are matched without lower-casing.
    if not vitro_hit:
        for name in CELL_LINE_NAMES:
            if re.search(r'\b' + re.escape(name) + r'\b', text):
                vitro_hit = True
                break

    if vitro_hit:
        return "both" if vivo_hit else "in_vitro"
    return "in_vivo" if vivo_hit else "unclassified"
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _extract_in_vitro(text: str) -> Dict:
    """Collect in vitro details found in text.

    Returns a dict with three keys: "cell_lines" (sorted names from
    CELL_LINE_NAMES that occur as whole words, case-sensitive), "assays"
    (sorted ASSAY_KEYWORDS categories with at least one keyword hit) and
    "findings" (result sentences that also mention an in vitro keyword).
    """
    # A set comprehension keeps each matched name once before sorting.
    cell_lines = sorted({
        name
        for name in CELL_LINE_NAMES
        if re.search(r'\b' + re.escape(name) + r'\b', text)
    })

    assays = sorted(
        category
        for category, terms in ASSAY_KEYWORDS.items()
        if _keyword_match(text, terms)
    )

    return {
        "cell_lines": cell_lines,
        "assays": assays,
        "findings": _extract_finding_sentences(text, IN_VITRO_KEYWORDS),
    }
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _extract_in_vivo(text: str) -> Dict:
    """Collect in vivo details found in text.

    Returns a dict with three keys: "animal_models" (sorted
    ANIMAL_MODEL_KEYWORDS categories with a keyword hit), "endpoints" (sorted
    ENDPOINT_KEYWORDS categories with a keyword hit) and "findings" (result
    sentences that also mention an in vivo keyword).
    """
    def matched_categories(table: Dict) -> List[str]:
        # Category labels whose keyword list has at least one hit, sorted.
        return sorted(
            label for label, terms in table.items() if _keyword_match(text, terms)
        )

    return {
        "animal_models": matched_categories(ANIMAL_MODEL_KEYWORDS),
        "endpoints": matched_categories(ENDPOINT_KEYWORDS),
        "findings": _extract_finding_sentences(text, IN_VIVO_KEYWORDS),
    }
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _extract_findings(text: str) -> List[str]:
    """Return up to five sentences of text that contain a finding keyword.

    The text is split after '.', '!' or '?' followed by whitespace. Sentences
    with fewer than eight words are ignored, and kept sentences longer than
    300 characters are cut to 297 characters plus "...".
    """
    collected = []

    for raw in re.split(r'(?<=[.!?])\s+', text):
        candidate = raw.strip()
        long_enough = len(candidate.split()) >= 8
        if not (long_enough and _keyword_match(candidate, FINDING_KEYWORDS)):
            continue
        collected.append(
            candidate if len(candidate) <= 300 else candidate[:297] + "..."
        )

    return collected[:5]
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def _extract_finding_sentences(text: str, context_keywords: List[str]) -> List[str]:
    """Return up to three finding sentences that also hit a context keyword.

    A sentence qualifies when it has at least eight words and matches both
    FINDING_KEYWORDS and context_keywords. Qualifying sentences longer than
    300 characters are cut to 297 characters plus "...".
    """
    kept = []

    for raw in re.split(r'(?<=[.!?])\s+', text):
        candidate = raw.strip()
        if len(candidate.split()) < 8:
            continue

        reports_result = _keyword_match(candidate, FINDING_KEYWORDS)
        in_context = _keyword_match(candidate, context_keywords)
        if not (reports_result and in_context):
            continue

        kept.append(candidate if len(candidate) <= 300 else candidate[:297] + "...")

    return kept[:3]
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Generate the unified per-paper summary table for the literature skill."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
from typing import Dict, Iterable, List
|
|
7
|
+
|
|
8
|
+
from synthesis import classify_evidence_quality, classify_study_type
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _authors_year(paper: Dict) -> str:
|
|
12
|
+
authors = paper.get("authors") or []
|
|
13
|
+
if isinstance(authors, str):
|
|
14
|
+
authors = [a.strip() for a in authors.split(";") if a.strip()]
|
|
15
|
+
lead = authors[0] if authors else "Unknown"
|
|
16
|
+
if len(authors) > 1 and "et al." not in lead:
|
|
17
|
+
lead = f"{lead} et al."
|
|
18
|
+
year = paper.get("year") or paper.get("publication_date", "")[:4] or "n.d."
|
|
19
|
+
return f"{lead} ({year})"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _identifier(paper: Dict) -> str:
|
|
23
|
+
if paper.get("pmid"):
|
|
24
|
+
return f"PMID:{paper['pmid']}"
|
|
25
|
+
if paper.get("doi"):
|
|
26
|
+
return paper["doi"]
|
|
27
|
+
if paper.get("s2_id"):
|
|
28
|
+
return paper["s2_id"]
|
|
29
|
+
return "NA"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _truncate(text: str, limit: int = 160) -> str:
|
|
33
|
+
text = " ".join(str(text or "").split())
|
|
34
|
+
if len(text) <= limit:
|
|
35
|
+
return text
|
|
36
|
+
return text[: limit - 1].rstrip() + "…"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def build_table_rows(papers: List[Dict], experiments: List[Dict] | None = None, mode: str = "general") -> List[Dict]:
    """Build one summary-table row (a dict keyed by column title) per paper.

    Parameters
    ----------
    papers : List[Dict]
        Paper records. Keys read here: ``pmid``, ``doi``, ``tldr``, ``title``,
        ``abstract`` and ``publication_types``, plus whatever the identifier,
        author/year and classification helpers read.
    experiments : List[Dict] | None
        Optional experiment-extraction records. Each is joined to its paper
        through ``pmid`` (or ``doi`` when there is no pmid).
    mode : str
        "preclinical" adds the Experiment Type, Model System, Assay/Endpoint
        and Finding Direction columns; any other value yields the base columns.

    Returns
    -------
    List[Dict]
        Rows in the order of ``papers``, numbered from 1 in the "#" column.
    """
    def _join_key(record: Dict) -> str:
        # Normalise to a stripped string so that an int PMID on one side and
        # a str PMID on the other (e.g. after a CSV round trip) still join.
        raw = record.get("pmid") or record.get("doi")
        return str(raw).strip() if raw else ""

    experiment_map = {}
    for exp in experiments or []:
        key = _join_key(exp)
        if key:
            experiment_map[key] = exp

    rows = []
    for idx, paper in enumerate(papers, start=1):
        # Papers without any identifier get an empty experiment record.
        exp = experiment_map.get(_join_key(paper), {})
        pub_types = paper.get("publication_types")
        row = {
            "#": idx,
            "PMID/DOI": _identifier(paper),
            "Authors (year)": _authors_year(paper),
            "Key Message": _truncate(paper.get("tldr") or paper.get("title") or ""),
            "Key Results": _truncate(paper.get("abstract") or exp.get("key_findings") or ""),
            "Key Methods": _truncate(
                "; ".join(filter(None, [
                    ", ".join(pub_types[:3]) if isinstance(pub_types, list) else "",
                    exp.get("assays", ""),
                    exp.get("endpoints", ""),
                ]))
            ),
            "Study Type": classify_study_type(paper),
            "Evidence Quality": classify_evidence_quality(paper),
        }
        if mode == "preclinical":
            row.update({
                "Experiment Type": exp.get("experiment_type", ""),
                # List cell lines AND animal models (as Assay/Endpoint does
                # for its two sources) so papers with both kinds of
                # experiment do not lose their in vivo model.
                "Model System": "; ".join(filter(None, [exp.get("cell_lines", ""), exp.get("animal_models", "")])),
                "Assay/Endpoint": "; ".join(filter(None, [exp.get("assays", ""), exp.get("endpoints", "")])),
                "Finding Direction": exp.get("key_findings", ""),
            })
        rows.append(row)
    return rows
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def rows_to_markdown(rows: Iterable[Dict]) -> str:
    """Render rows as a Markdown table under a "Paper Summary Table" heading.

    Column order is taken from the keys of the first row; a key missing from
    a later row renders as an empty cell. Newlines inside a cell become
    spaces and literal "|" characters are escaped, because an unescaped pipe
    would be read as a column separator (finding fields are joined with
    " | "). With no rows a short placeholder document is returned.
    """
    def _cell(value) -> str:
        # Keep each cell on one line and inside its own column.
        return str(value).replace("\n", " ").replace("|", "\\|")

    rows = list(rows)
    if not rows:
        return "## Paper Summary Table\n\n_No papers available._\n"
    headers = list(rows[0].keys())
    out = [
        "## Paper Summary Table",
        "",
        "| " + " | ".join(_cell(header) for header in headers) + " |",
        "|" + "---|" * len(headers),
    ]
    for row in rows:
        out.append("| " + " | ".join(_cell(row.get(header, "")) for header in headers) + " |")
    return "\n".join(out) + "\n"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def write_csv(rows: Iterable[Dict], output_path: str) -> None:
    """Write rows to output_path as UTF-8 CSV with a header line.

    Column order is taken from the keys of the first row. Nothing is written
    (and no file is created) when rows is empty. The parent directory of
    output_path is created when it does not exist yet.
    """
    import os  # local import: only this function needs it

    rows = list(rows)
    if not rows:
        return
    headers = list(rows[0].keys())

    # Avoid FileNotFoundError when the target directory has not been created.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(output_path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=headers)
        writer.writeheader()
        writer.writerows(rows)
|