biolit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolit-0.1.0/PKG-INFO +153 -0
- biolit-0.1.0/README.md +132 -0
- biolit-0.1.0/biolit/__init__.py +2 -0
- biolit-0.1.0/biolit/cli.py +197 -0
- biolit-0.1.0/biolit/fetchers/__init__.py +12 -0
- biolit-0.1.0/biolit/fetchers/geo.py +92 -0
- biolit-0.1.0/biolit/fetchers/preprints.py +50 -0
- biolit-0.1.0/biolit/fetchers/pubmed.py +98 -0
- biolit-0.1.0/biolit/fetchers/unpaywall.py +47 -0
- biolit-0.1.0/biolit/llm/__init__.py +28 -0
- biolit-0.1.0/biolit/llm/anthropic_client.py +29 -0
- biolit-0.1.0/biolit/llm/base.py +18 -0
- biolit-0.1.0/biolit/llm/ollama_client.py +25 -0
- biolit-0.1.0/biolit/llm/openai_client.py +28 -0
- biolit-0.1.0/biolit/parsers/__init__.py +7 -0
- biolit-0.1.0/biolit/parsers/jats.py +126 -0
- biolit-0.1.0/biolit/parsers/pdf.py +87 -0
- biolit-0.1.0/biolit/parsers/utils.py +46 -0
- biolit-0.1.0/biolit/pipeline.py +385 -0
- biolit-0.1.0/biolit/utils.py +72 -0
- biolit-0.1.0/biolit.egg-info/PKG-INFO +153 -0
- biolit-0.1.0/biolit.egg-info/SOURCES.txt +32 -0
- biolit-0.1.0/biolit.egg-info/dependency_links.txt +1 -0
- biolit-0.1.0/biolit.egg-info/entry_points.txt +2 -0
- biolit-0.1.0/biolit.egg-info/requires.txt +6 -0
- biolit-0.1.0/biolit.egg-info/top_level.txt +1 -0
- biolit-0.1.0/pyproject.toml +38 -0
- biolit-0.1.0/setup.cfg +4 -0
- biolit-0.1.0/tests/test_cli.py +102 -0
- biolit-0.1.0/tests/test_geo.py +110 -0
- biolit-0.1.0/tests/test_parsers.py +72 -0
- biolit-0.1.0/tests/test_pipeline.py +366 -0
- biolit-0.1.0/tests/test_test1_eml.py +56 -0
- biolit-0.1.0/tests/test_utils.py +56 -0
biolit-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: biolit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM-assisted biomedical literature screening and structured extraction for PubMed and GEO.
|
|
5
|
+
Author-email: Rachel Schwartz <raschwaaa@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/rachadele/pubmed-screener
|
|
8
|
+
Project-URL: Homepage, https://github.com/rachadele/pubmed-screener#readme
|
|
9
|
+
Keywords: pubmed,geo,literature-review,llm,bioinformatics,genomics,mcp
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
13
|
+
Requires-Python: >=3.8
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: anthropic
|
|
16
|
+
Requires-Dist: openai
|
|
17
|
+
Requires-Dist: requests
|
|
18
|
+
Requires-Dist: python-dotenv
|
|
19
|
+
Requires-Dist: lxml
|
|
20
|
+
Requires-Dist: pdfminer.six
|
|
21
|
+
|
|
22
|
+
# biolit
|
|
23
|
+
|
|
24
|
+
LLM-assisted biomedical literature screening and structured extraction. Accepts PubMed alert emails, plain PMID lists, or GEO accession lists. Supports multiple LLM providers and optional full-text retrieval.
|
|
25
|
+
|
|
26
|
+
## Setup
|
|
27
|
+
|
|
28
|
+
**Requirements:** Python 3.8+
|
|
29
|
+
|
|
30
|
+
Install the package (creates the `biolit` command):
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install -e .
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Copy `.env.example` to `.env` and add your API key:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
cp .env.example .env
|
|
40
|
+
# edit .env and set ANTHROPIC_API_KEY (or OPENAI_API_KEY)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
The tool accepts several input formats, auto-detected by file extension or content:
|
|
46
|
+
|
|
47
|
+
| Input | How to pass | Example |
|
|
48
|
+
|---|---|---|
|
|
49
|
+
| PubMed alert email | positional `.eml` file | `alert.eml` |
|
|
50
|
+
| PMID list (file) | positional plain-text file, one PMID per line | `pmids.txt` |
|
|
51
|
+
| GEO accession list (file) | positional plain-text file, one accession per line | `geo_accessions.txt` |
|
|
52
|
+
| PMIDs (inline) | `--pmids` flag, comma-separated | `--pmids 41795042,41792186` |
|
|
53
|
+
| GEO accessions (inline) | `--accessions` flag, comma-separated | `--accessions GSE53987,GSE12345` |
|
|
54
|
+
|
|
55
|
+
Use `--default` to run with schizophrenia genomics defaults (no prompts):
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
biolit alert.eml --default
|
|
59
|
+
biolit pmids.txt --default
|
|
60
|
+
biolit geo_accessions.txt --default
|
|
61
|
+
biolit --pmids 41795042,41792186 --default
|
|
62
|
+
biolit --accessions GSE53987 --default
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Or specify criterion and fields as flags:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
biolit pmids.txt \
|
|
69
|
+
--criterion "Is this about treatment-resistant schizophrenia?" \
|
|
70
|
+
--fields "methodology, sample_size, treatment, outcomes"
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or interactively (prompted if not provided):
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
biolit alert.eml
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### GEO accession input
|
|
80
|
+
|
|
81
|
+
Pass a file of GEO series accessions (GSE, GDS, GSM, or GPL prefixes) to screen GEO records directly. The tool fetches each record's MINiML XML, extracts the summary, overall design, experiment type, and organism, then runs the same LLM screening and extraction pipeline.
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
biolit geo_accessions.txt \
|
|
85
|
+
--criterion "Does this study perturb a transcription factor?" \
|
|
86
|
+
--fields "organism, experiment_type, tf_perturbed, perturbation_method, summary"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
GEO results include `geo_accession` and `pmids` (linked PubMed IDs) columns in place of `pmid`.
|
|
90
|
+
|
|
91
|
+
### Full-text retrieval (PubMed inputs only)
|
|
92
|
+
|
|
93
|
+
Use `--fulltext` to screen and extract from full text instead of just the abstract. The pipeline tries each source in order:
|
|
94
|
+
|
|
95
|
+
1. PMC JATS XML (open access)
|
|
96
|
+
2. Preprint XML (bioRxiv / medRxiv)
|
|
97
|
+
3. Unpaywall PDF (requires `--unpaywall-email`)
|
|
98
|
+
4. Abstract fallback
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
biolit alert.eml --default --fulltext --unpaywall-email you@example.com
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Limit which sections are sent to the LLM:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
biolit alert.eml --default --fulltext --sections methods,results
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### LLM providers
|
|
111
|
+
|
|
112
|
+
The tool supports Anthropic (default), OpenAI, and local Ollama models:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# OpenAI
|
|
116
|
+
biolit pmids.txt --default --provider openai --model gpt-4o
|
|
117
|
+
|
|
118
|
+
# Ollama (local)
|
|
119
|
+
biolit pmids.txt --default --provider ollama --model llama3
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
You can also set `LLM_PROVIDER` and `LLM_MODEL` as environment variables.
|
|
123
|
+
|
|
124
|
+
## Output
|
|
125
|
+
|
|
126
|
+
Each run creates a timestamped directory (e.g. `run_20260313_142000/`) containing:
|
|
127
|
+
|
|
128
|
+
- `results.csv` — one row per relevant record
|
|
129
|
+
- `artifacts/<id>/` — per-record folder with the text sent to the LLM, metadata, and any retrieved full-text files
|
|
130
|
+
|
|
131
|
+
With `--default` on PubMed inputs, the CSV columns are:
|
|
132
|
+
|
|
133
|
+
| Column | Description |
|
|
134
|
+
|---|---|
|
|
135
|
+
| `title` | Paper title |
|
|
136
|
+
| `url` | PubMed link |
|
|
137
|
+
| `pmid` | PubMed ID |
|
|
138
|
+
| `doi` | DOI |
|
|
139
|
+
| `text_source` | Where the text came from (`abstract`, `pmc_fulltext`, `preprint_fulltext`, `unpaywall_pdf`) |
|
|
140
|
+
| `methodology` | General method (e.g. GWAS, scRNA-seq, proteomics) |
|
|
141
|
+
| `sample_type` | Tissue/sample type and origin |
|
|
142
|
+
| `causal_claims` | Statements about causes of schizophrenia inferred from the data |
|
|
143
|
+
| `genetics_claims` | Claims about specific genes, loci, or pathways |
|
|
144
|
+
| `summary` | 2-3 sentence plain-language summary for triage |
|
|
145
|
+
|
|
146
|
+
For GEO inputs, `pmid` is replaced by `geo_accession` and `pmids`.
|
|
147
|
+
|
|
148
|
+
The CSV can be imported directly into Google Sheets (File → Import).
|
|
149
|
+
|
|
150
|
+
## Known Limitations
|
|
151
|
+
|
|
152
|
+
- Papers without abstracts or accessible full text are skipped silently.
|
|
153
|
+
- Full-text retrieval (`--fulltext`) applies to PubMed inputs only; GEO records use the record metadata directly.
|
biolit-0.1.0/README.md
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# biolit
|
|
2
|
+
|
|
3
|
+
LLM-assisted biomedical literature screening and structured extraction. Accepts PubMed alert emails, plain PMID lists, or GEO accession lists. Supports multiple LLM providers and optional full-text retrieval.
|
|
4
|
+
|
|
5
|
+
## Setup
|
|
6
|
+
|
|
7
|
+
**Requirements:** Python 3.8+
|
|
8
|
+
|
|
9
|
+
Install the package (creates the `biolit` command):
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install -e .
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Copy `.env.example` to `.env` and add your API key:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
cp .env.example .env
|
|
19
|
+
# edit .env and set ANTHROPIC_API_KEY (or OPENAI_API_KEY)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
The tool accepts several input formats, auto-detected by file extension or content:
|
|
25
|
+
|
|
26
|
+
| Input | How to pass | Example |
|
|
27
|
+
|---|---|---|
|
|
28
|
+
| PubMed alert email | positional `.eml` file | `alert.eml` |
|
|
29
|
+
| PMID list (file) | positional plain-text file, one PMID per line | `pmids.txt` |
|
|
30
|
+
| GEO accession list (file) | positional plain-text file, one accession per line | `geo_accessions.txt` |
|
|
31
|
+
| PMIDs (inline) | `--pmids` flag, comma-separated | `--pmids 41795042,41792186` |
|
|
32
|
+
| GEO accessions (inline) | `--accessions` flag, comma-separated | `--accessions GSE53987,GSE12345` |
|
|
33
|
+
|
|
34
|
+
Use `--default` to run with schizophrenia genomics defaults (no prompts):
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
biolit alert.eml --default
|
|
38
|
+
biolit pmids.txt --default
|
|
39
|
+
biolit geo_accessions.txt --default
|
|
40
|
+
biolit --pmids 41795042,41792186 --default
|
|
41
|
+
biolit --accessions GSE53987 --default
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Or specify criterion and fields as flags:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
biolit pmids.txt \
|
|
48
|
+
--criterion "Is this about treatment-resistant schizophrenia?" \
|
|
49
|
+
--fields "methodology, sample_size, treatment, outcomes"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Or interactively (prompted if not provided):
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
biolit alert.eml
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### GEO accession input
|
|
59
|
+
|
|
60
|
+
Pass a file of GEO series accessions (GSE, GDS, GSM, or GPL prefixes) to screen GEO records directly. The tool fetches each record's MINiML XML, extracts the summary, overall design, experiment type, and organism, then runs the same LLM screening and extraction pipeline.
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
biolit geo_accessions.txt \
|
|
64
|
+
--criterion "Does this study perturb a transcription factor?" \
|
|
65
|
+
--fields "organism, experiment_type, tf_perturbed, perturbation_method, summary"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
GEO results include `geo_accession` and `pmids` (linked PubMed IDs) columns in place of `pmid`.
|
|
69
|
+
|
|
70
|
+
### Full-text retrieval (PubMed inputs only)
|
|
71
|
+
|
|
72
|
+
Use `--fulltext` to screen and extract from full text instead of just the abstract. The pipeline tries each source in order:
|
|
73
|
+
|
|
74
|
+
1. PMC JATS XML (open access)
|
|
75
|
+
2. Preprint XML (bioRxiv / medRxiv)
|
|
76
|
+
3. Unpaywall PDF (requires `--unpaywall-email`)
|
|
77
|
+
4. Abstract fallback
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
biolit alert.eml --default --fulltext --unpaywall-email you@example.com
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Limit which sections are sent to the LLM:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
biolit alert.eml --default --fulltext --sections methods,results
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### LLM providers
|
|
90
|
+
|
|
91
|
+
The tool supports Anthropic (default), OpenAI, and local Ollama models:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# OpenAI
|
|
95
|
+
biolit pmids.txt --default --provider openai --model gpt-4o
|
|
96
|
+
|
|
97
|
+
# Ollama (local)
|
|
98
|
+
biolit pmids.txt --default --provider ollama --model llama3
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
You can also set `LLM_PROVIDER` and `LLM_MODEL` as environment variables.
|
|
102
|
+
|
|
103
|
+
## Output
|
|
104
|
+
|
|
105
|
+
Each run creates a timestamped directory (e.g. `run_20260313_142000/`) containing:
|
|
106
|
+
|
|
107
|
+
- `results.csv` — one row per relevant record
|
|
108
|
+
- `artifacts/<id>/` — per-record folder with the text sent to the LLM, metadata, and any retrieved full-text files
|
|
109
|
+
|
|
110
|
+
With `--default` on PubMed inputs, the CSV columns are:
|
|
111
|
+
|
|
112
|
+
| Column | Description |
|
|
113
|
+
|---|---|
|
|
114
|
+
| `title` | Paper title |
|
|
115
|
+
| `url` | PubMed link |
|
|
116
|
+
| `pmid` | PubMed ID |
|
|
117
|
+
| `doi` | DOI |
|
|
118
|
+
| `text_source` | Where the text came from (`abstract`, `pmc_fulltext`, `preprint_fulltext`, `unpaywall_pdf`) |
|
|
119
|
+
| `methodology` | General method (e.g. GWAS, scRNA-seq, proteomics) |
|
|
120
|
+
| `sample_type` | Tissue/sample type and origin |
|
|
121
|
+
| `causal_claims` | Statements about causes of schizophrenia inferred from the data |
|
|
122
|
+
| `genetics_claims` | Claims about specific genes, loci, or pathways |
|
|
123
|
+
| `summary` | 2-3 sentence plain-language summary for triage |
|
|
124
|
+
|
|
125
|
+
For GEO inputs, `pmid` is replaced by `geo_accession` and `pmids`.
|
|
126
|
+
|
|
127
|
+
The CSV can be imported directly into Google Sheets (File → Import).
|
|
128
|
+
|
|
129
|
+
## Known Limitations
|
|
130
|
+
|
|
131
|
+
- Papers without abstracts or accessible full text are skipped silently.
|
|
132
|
+
- Full-text retrieval (`--fulltext`) applies to PubMed inputs only; GEO records use the record metadata directly.
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""CLI entry point for biolit."""
|
|
2
|
+
import argparse
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from dotenv import load_dotenv
|
|
7
|
+
|
|
8
|
+
from biolit.llm import get_llm_client
|
|
9
|
+
from biolit.pipeline import run, run_geo
|
|
10
|
+
from biolit.parsers.utils import DEFAULT_MAX_CHARS
|
|
11
|
+
from biolit.utils import read_eml_body, extract_pmids, read_pmids_file, read_geo_file
|
|
12
|
+
|
|
13
|
+
load_dotenv()
|
|
14
|
+
|
|
15
|
+
DEFAULT_CRITERION = (
|
|
16
|
+
"Is this paper SPECIFICALLY about schizophrenia AND does it use genetics "
|
|
17
|
+
"or genomics methods (e.g. GWAS, WGS, scRNA-seq, proteomics, gene expression)?"
|
|
18
|
+
)
|
|
19
|
+
DEFAULT_FIELDS = "methodology, sample_type, causal_claims, genetics_claims, summary"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse arguments, build an LLM client, and dispatch.

    Input resolution (in priority order): ``--accessions`` (GEO), then
    ``--pmids`` (PubMed), then the positional ``input_file`` (a ``.eml``
    PubMed alert, or a plain-text file auto-detected as GEO accessions vs
    PMIDs by peeking at its first non-comment line). Exits with status 1
    on a missing input, an LLM client setup failure, or an empty ID list.

    Args:
        argv: Argument list for testing; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Screen PubMed alert emails with a configurable criterion and output fields. "
            "Supports multiple LLM providers and optional full-text fetching."
        )
    )
    parser.add_argument(
        "input_file", nargs="?", default=None,
        help="PubMed alert .eml file, plain-text file of PMIDs, or plain-text file of GEO accessions",
    )
    parser.add_argument(
        "--pmids", default=None,
        help="Comma-separated PMIDs (alternative to input_file)",
    )
    parser.add_argument(
        "--accessions", default=None,
        help="Comma-separated GEO accessions (alternative to input_file)",
    )

    # Screening / extraction
    parser.add_argument("--criterion", default=None,
                        help="Relevance screening criterion (yes/no question)")
    parser.add_argument("--fields", default=None,
                        help="Fields to extract (comma-separated names)")
    parser.add_argument("--default", action="store_true",
                        help="Use default schizophrenia genomics criterion and fields")
    parser.add_argument("--output", default="results.csv",
                        help="Output CSV path (default: results.csv)")

    # LLM provider
    parser.add_argument(
        "--provider", default=os.environ.get("LLM_PROVIDER", "anthropic"),
        choices=["anthropic", "openai", "ollama"],
        help="LLM provider (default: anthropic, or LLM_PROVIDER env var)",
    )
    parser.add_argument(
        "--model", default=os.environ.get("LLM_MODEL"),
        help="Model name for the chosen provider (uses provider default if omitted)",
    )
    parser.add_argument(
        "--openai-base-url", default=None,
        help="Custom base URL for OpenAI-compatible endpoints (e.g. Azure, local vLLM)",
    )
    parser.add_argument(
        "--ollama-url", default="http://localhost:11434",
        help="Ollama server URL (default: http://localhost:11434)",
    )

    # Full-text
    parser.add_argument(
        "--fulltext", action="store_true",
        help="Attempt to fetch full text from PMC, preprint servers, and Unpaywall",
    )
    parser.add_argument(
        "--unpaywall-email", default=os.environ.get("UNPAYWALL_EMAIL"),
        help="Email for Unpaywall API (required when --fulltext is used; or set UNPAYWALL_EMAIL)",
    )
    parser.add_argument(
        "--sections", default=None,
        help=(
            "Comma-separated list of sections to send to the LLM when full text is available "
            "(e.g. 'methods,results'). Default: all sections."
        ),
    )
    parser.add_argument(
        "--max-chars", type=int, default=DEFAULT_MAX_CHARS,
        help=f"Maximum characters of paper text sent to the LLM (default: {DEFAULT_MAX_CHARS})",
    )

    args = parser.parse_args(argv)

    # Resolve criterion and fields.
    # NOTE(review): --default silently overrides any explicit --criterion /
    # --fields also passed on the command line.
    if args.default:
        criterion = DEFAULT_CRITERION
        fields = DEFAULT_FIELDS
    else:
        # Fall back to interactive prompts for anything not given as a flag.
        criterion = args.criterion
        if not criterion:
            criterion = input("Screening criterion (yes/no question about relevance): ").strip()
        fields = args.fields
        if not fields:
            fields = input(
                "Fields to extract (comma-separated, e.g. methodology, sample_type, summary): "
            ).strip()

    # Build LLM client; get_llm_client raises EnvironmentError / ImportError
    # when an API key or provider SDK is missing.
    try:
        extra: dict[str, str] = {}
        if args.provider == "openai" and args.openai_base_url:
            extra["base_url"] = args.openai_base_url
        if args.provider == "ollama":
            extra["base_url"] = args.ollama_url
        client = get_llm_client(args.provider, args.model, **extra)
    except (EnvironmentError, ImportError) as e:
        print(f"Error: {e}")
        sys.exit(1)

    print(f"Using LLM: {client}\n")

    # Resolve input — CLI flags take priority over file
    if not args.input_file and not args.pmids and not args.accessions:
        print("Error: provide an input_file, --pmids, or --accessions.")
        sys.exit(1)

    if args.accessions:
        input_type = "geo"
        accessions = [a.strip() for a in args.accessions.split(",") if a.strip()]
        print(f"Using {len(accessions)} GEO accessions from --accessions\n")
    elif args.pmids:
        input_type = "pubmed"
        pmids = [p.strip() for p in args.pmids.split(",") if p.strip()]
        print(f"Using {len(pmids)} PMIDs from --pmids\n")
    elif args.input_file.endswith(".eml"):
        input_type = "pubmed"
        body = read_eml_body(args.input_file)
        pmids = extract_pmids(body)
        print(f"Found {len(pmids)} PMIDs in {args.input_file}\n")
    else:
        # Plain-text file: sniff the first usable line to decide GEO vs PMID.
        first_value = _peek_first_value(args.input_file)
        if first_value and first_value.upper().startswith(("GSE", "GDS", "GSM", "GPL")):
            input_type = "geo"
            accessions = read_geo_file(args.input_file)
            print(f"Read {len(accessions)} GEO accessions from {args.input_file}\n")
        else:
            input_type = "pubmed"
            pmids = read_pmids_file(args.input_file)
            print(f"Read {len(pmids)} PMIDs from {args.input_file}\n")

    # Dispatch: GEO records skip full-text options (record metadata only).
    if input_type == "geo":
        if not accessions:
            print("No accessions found. Exiting.")
            sys.exit(1)
        run_geo(
            client=client,
            accessions=accessions,
            criterion=criterion,
            fields_description=fields,
            output_path=args.output,
            max_chars=args.max_chars,
        )
    else:
        if not pmids:
            print("No PMIDs found. Exiting.")
            sys.exit(1)
        sections_wanted = (
            [s.strip() for s in args.sections.split(",") if s.strip()]
            if args.sections
            else None
        )
        run(
            client=client,
            pmids=pmids,
            criterion=criterion,
            fields_description=fields,
            output_path=args.output,
            fulltext=args.fulltext,
            unpaywall_email=args.unpaywall_email,
            sections_wanted=sections_wanted,
            max_chars=args.max_chars,
        )
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _peek_first_value(path: str) -> str | None:
|
|
186
|
+
"""Return the first non-blank, non-comment line of a file."""
|
|
187
|
+
with open(path, "r", encoding="utf-8") as f:
|
|
188
|
+
for line in f:
|
|
189
|
+
line = line.strip()
|
|
190
|
+
if line and not line.startswith("#"):
|
|
191
|
+
return line
|
|
192
|
+
return None
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Allow direct execution (python -m biolit.cli / python cli.py) in addition
# to the installed `biolit` console-script entry point.
if __name__ == "__main__":
    main()
|
|
197
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Fetcher sub-package."""
|
|
2
|
+
from biolit.fetchers.pubmed import fetch_pubmed_metadata, fetch_pmc_fulltext
|
|
3
|
+
from biolit.fetchers.preprints import fetch_preprint
|
|
4
|
+
from biolit.fetchers.unpaywall import fetch_via_unpaywall
|
|
5
|
+
|
|
6
|
+
# Public API of the fetchers sub-package: the names re-exported for
# `from biolit.fetchers import ...`.
__all__ = [
    "fetch_pubmed_metadata",
    "fetch_pmc_fulltext",
    "fetch_preprint",
    "fetch_via_unpaywall",
]
|
|
12
|
+
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Fetcher for NCBI GEO series records (GSE accessions) via the MINiML XML API."""
|
|
2
|
+
import time
|
|
3
|
+
import xml.etree.ElementTree as ET
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
GEO_MINIML_URL = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
|
|
8
|
+
_RATE_DELAY = 0.4
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def fetch_geo_record(accession: str) -> dict | None:
    """Fetch a GEO record and convert it to a paper-shaped dict.

    Downloads the MINiML XML for *accession* (e.g. ``GSE12345``) from the
    NCBI acc.cgi endpoint and hands it to :func:`_parse_miniml`, which
    extracts title, summary, overall design, experiment type, organism,
    and any linked PubMed IDs. The resulting dict matches the pipeline's
    ``paper`` format, so the same LLM screening/extraction calls apply.

    Raises:
        RuntimeError: when the HTTP request fails for any reason.

    Returns:
        A paper-shaped dict, or ``None`` when the response is not a
        parseable Series record.
    """
    query = {"acc": accession, "targ": "self", "form": "xml", "view": "brief"}
    try:
        resp = requests.get(GEO_MINIML_URL, params=query, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        raise RuntimeError(f"GEO fetch failed for {accession}: {e}") from e

    # Brief pause between consecutive requests to stay polite to NCBI.
    time.sleep(_RATE_DELAY)
    return _parse_miniml(accession, resp.content)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _parse_miniml(accession: str, xml_bytes: bytes) -> dict | None:
|
|
36
|
+
"""Parse a GEO MINiML XML response into a paper-shaped dict."""
|
|
37
|
+
try:
|
|
38
|
+
root = ET.fromstring(xml_bytes)
|
|
39
|
+
except ET.ParseError:
|
|
40
|
+
return None
|
|
41
|
+
|
|
42
|
+
# MINiML uses a namespace; strip it for simpler findtext calls
|
|
43
|
+
ns = ""
|
|
44
|
+
if root.tag.startswith("{"):
|
|
45
|
+
ns = root.tag.split("}")[0] + "}"
|
|
46
|
+
|
|
47
|
+
def _text(elem, tag: str, default: str = "") -> str:
|
|
48
|
+
node = elem.find(f"{ns}{tag}")
|
|
49
|
+
return (node.text or "").strip() if node is not None else default
|
|
50
|
+
|
|
51
|
+
def _all_text(elem, tag: str) -> list[str]:
|
|
52
|
+
return [(n.text or "").strip() for n in elem.findall(f"{ns}{tag}") if n.text]
|
|
53
|
+
|
|
54
|
+
series = root.find(f"{ns}Series")
|
|
55
|
+
if series is None:
|
|
56
|
+
return None
|
|
57
|
+
|
|
58
|
+
title = _text(series, "Title")
|
|
59
|
+
summary = _text(series, "Summary")
|
|
60
|
+
overall_design = _text(series, "Overall-Design")
|
|
61
|
+
experiment_type = _text(series, "Type")
|
|
62
|
+
pmids = _all_text(series, "Pubmed-ID")
|
|
63
|
+
|
|
64
|
+
# Collect organism from child Sample elements if not on Series directly
|
|
65
|
+
organisms = list({
|
|
66
|
+
_text(s, "Organism") or _text(s, "organism")
|
|
67
|
+
for s in root.findall(f".//{ns}Sample")
|
|
68
|
+
} - {""})
|
|
69
|
+
|
|
70
|
+
# Build an abstract-like blob from the structured fields
|
|
71
|
+
abstract_parts = []
|
|
72
|
+
if summary:
|
|
73
|
+
abstract_parts.append(f"Summary: {summary}")
|
|
74
|
+
if overall_design:
|
|
75
|
+
abstract_parts.append(f"Overall design: {overall_design}")
|
|
76
|
+
if experiment_type:
|
|
77
|
+
abstract_parts.append(f"Experiment type: {experiment_type}")
|
|
78
|
+
if organisms:
|
|
79
|
+
abstract_parts.append(f"Organism(s): {', '.join(organisms)}")
|
|
80
|
+
abstract = "\n\n".join(abstract_parts)
|
|
81
|
+
|
|
82
|
+
return {
|
|
83
|
+
"pmid": pmids[0] if pmids else None,
|
|
84
|
+
"accession": accession,
|
|
85
|
+
"doi": None,
|
|
86
|
+
"title": title,
|
|
87
|
+
"abstract": abstract,
|
|
88
|
+
"mesh_terms": ([experiment_type] if experiment_type else []) + organisms,
|
|
89
|
+
"url": f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={accession}",
|
|
90
|
+
"pmids": pmids,
|
|
91
|
+
"text_source": "geo_record",
|
|
92
|
+
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Fetch full text from bioRxiv / medRxiv preprint servers."""
|
|
2
|
+
import requests
|
|
3
|
+
|
|
4
|
+
_API_BASE = "https://api.biorxiv.org/details/{server}/{doi}/na/json"
|
|
5
|
+
_JATS_TEMPLATE = "https://www.biorxiv.org/content/{doi}.source.xml"
|
|
6
|
+
_MEDRXIV_JATS_TEMPLATE = "https://www.medrxiv.org/content/{doi}.source.xml"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _is_preprint_doi(doi: str) -> str | None:
|
|
10
|
+
"""Return 'biorxiv' or 'medrxiv' if the DOI belongs to a preprint server."""
|
|
11
|
+
if not doi:
|
|
12
|
+
return None
|
|
13
|
+
doi_lower = doi.lower()
|
|
14
|
+
if "10.1101/" in doi_lower:
|
|
15
|
+
# Both bioRxiv and medRxiv share the 10.1101 prefix; try biorxiv first
|
|
16
|
+
return "biorxiv"
|
|
17
|
+
return None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def fetch_preprint(doi: str) -> bytes | None:
    """Try to retrieve JATS XML for a bioRxiv or medRxiv preprint by DOI.

    Returns raw XML bytes, or None if the paper is not a preprint / fetch fails.
    """
    if not doi or not _is_preprint_doi(doi):
        return None

    # Ask each server's details API whether it knows this DOI; on a hit,
    # download the source JATS XML from the matching content host. Any
    # failure for one server falls through to the next. Dict order keeps
    # biorxiv first, matching the original server preference.
    templates = {"biorxiv": _JATS_TEMPLATE, "medrxiv": _MEDRXIV_JATS_TEMPLATE}
    for server_name, jats_template in templates.items():
        try:
            details = requests.get(
                _API_BASE.format(server=server_name, doi=doi), timeout=15
            )
            details.raise_for_status()
            if not details.json().get("collection", []):
                continue
            jats = requests.get(jats_template.format(doi=doi), timeout=30)
            jats.raise_for_status()
            return jats.content
        except Exception:
            continue
    return None
|
|
50
|
+
|