pubmed-research-classifier 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ # Byte-compiled / cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ .Python
6
+ .mypy_cache/
7
+ .pytest_cache/
8
+ .ruff_cache/
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+ env/
14
+
15
+ # IDE
16
+ .idea/
17
+ .vscode/
18
+ *.swp
19
+ *.swo
20
+ .DS_Store
21
+
22
+ # Env / secrets
23
+ .env
24
+ .env.local
25
+
26
+ # Large data files (JSONL candidate downloads — can be hundreds of MB)
27
+ data/*.jsonl
28
+
29
+ # Keep the curated labels file (small, valuable)
30
+ !dataset/labels.csv
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 EMBO
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,224 @@
1
+ Metadata-Version: 2.4
2
+ Name: pubmed-research-classifier
3
+ Version: 0.1.0
4
+ Summary: Classify PubMed articles as research or non-research using a trained MLP + ModernBERT.
5
+ Project-URL: Repository, https://github.com/embo-press/pubmed-research-classifier
6
+ Author-email: Jorge Abreu <jorge.abreu@embo.org>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2026 EMBO
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+ License-File: LICENSE
29
+ Keywords: classification,nlp,pubmed,scientometrics
30
+ Classifier: Intended Audience :: Science/Research
31
+ Classifier: Programming Language :: Python :: 3
32
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
33
+ Requires-Python: >=3.9
34
+ Requires-Dist: joblib>=1.3
35
+ Requires-Dist: numpy>=1.24
36
+ Requires-Dist: scikit-learn>=1.3
37
+ Requires-Dist: torch>=2.0
38
+ Provides-Extra: embed
39
+ Requires-Dist: sentence-transformers>=3.0; extra == 'embed'
40
+ Description-Content-Type: text/markdown
41
+
42
+ # pubmed-research-classifier
43
+
44
+ Classify PubMed articles as **research** or **non-research** using a trained
45
+ MLP on top of [EMBO/ModernBERT-neg-sampling-PubMed](https://huggingface.co/EMBO/ModernBERT-neg-sampling-PubMed) embeddings.
46
+
47
+ Model weights, StandardScaler, and publication-type vocabulary are bundled in
48
+ the package — no external model downloads are needed for the embedding-mode API.
49
+
50
+ ## Installation
51
+
52
+ ```bash
53
+ # Embedding mode only (no sentence-transformers required)
54
+ pip install pubmed-research-classifier
55
+
56
+ # Text mode (package embeds internally)
57
+ pip install "pubmed-research-classifier[embed]"
58
+ ```
59
+
60
+ ## Quick start
61
+
62
+ ### Text mode
63
+
64
+ ```python
65
+ from pubmed_research_classifier import classify
66
+
67
+ result = classify({
68
+ "title": "Structural basis of CRISPR-Cas9 activity",
69
+ "abstract": "We report crystal structures of Cas9 ...",
70
+ "pub_types": ["Journal Article"],
71
+ "n_authors": 8,
72
+ "n_refs": 42,
73
+ })
74
+ # {"label": "research", "p_nr": 0.018}
75
+ ```
76
+
77
+ ### Embedding mode
78
+
79
+ Pre-compute embeddings with `EMBO/ModernBERT-neg-sampling-PubMed`
80
+ using `normalize_embeddings=True`, then pass them directly:
81
+
82
+ ```python
83
+ from pubmed_research_classifier import classify
84
+ import numpy as np
85
+
86
+ result = classify({
87
+ "title_emb": title_embedding, # np.ndarray, shape (768,)
88
+ "abstract_emb": abstract_embedding, # np.ndarray, shape (768,); zeros if absent
89
+ "has_abstract": True,
90
+ "length_title": 52,
91
+ "length_abstract": 1240,
92
+ "pub_types": ["Journal Article"],
93
+ "n_authors": 8,
94
+ "n_refs": 42,
95
+ })
96
+ ```
97
+
98
+ ### Batch — millions of records
99
+
100
+ ```python
101
+ results = classify(records, batch_size=128)
102
+ # Returns a list in the same order as the input.
103
+ ```
104
+
105
+ ## Input fields
106
+
107
+ | Field | Type | Mode | Notes |
108
+ |---|---|---|---|
109
+ | `title` | str | text | |
110
+ | `abstract` | str or None | text | empty/None → treated as absent |
111
+ | `title_emb` | array (768,) | embed | L2-normalised |
112
+ | `abstract_emb` | array (768,) | embed | L2-normalised; zeros if absent |
113
+ | `has_abstract` | bool | embed | |
114
+ | `length_title` | int | embed | auto-derived from `title` in text mode |
115
+ | `length_abstract` | int | embed | auto-derived from `abstract` in text mode |
116
+ | `pub_types` | list[str] or str | both | PubMed PT tags; comma-sep string accepted |
117
+ | `n_authors` | int | both | |
118
+ | `n_refs` | int | both | |
119
+ | `has_funding` | bool | both | optional; inferred from "Research Support" PTs if omitted |
120
+
121
+ ## Output
122
+
123
+ ```python
124
+ {"label": "research", "p_nr": 0.018}
125
+ {"label": "non-research", "p_nr": 0.921}
126
+ ```
127
+
128
+ `p_nr` is P(non-research) from the model.
129
+ Default threshold: 0.75 (configurable via `classify(..., threshold=0.75)`).
130
+
131
+ ## Obtaining `has_funding` from PubMed XML
132
+
133
+ `has_funding` is `True` when the article's PubMed XML record contains at least
134
+ one `<Grant>` element inside a `<GrantList>`. It is **not** the same as the
135
+ "Research Support, …" publication type tags (those are a separate, coarser
136
+ signal also used by the model via `pub_types`).
137
+
138
+ If you fetch articles via the NCBI E-utilities API (efetch, XML format), you
139
+ can extract it like this:
140
+
141
+ ```python
142
+ import xml.etree.ElementTree as ET
143
+
144
+ def has_funding_from_xml(article_xml: str) -> bool:
145
+ """Return True if the PubMed XML contains at least one <Grant> entry."""
146
+ root = ET.fromstring(article_xml)
147
+ return len(root.findall(".//Grant")) > 0
148
+ ```
149
+
150
+ Or, if you are working with a parsed `xml.etree.ElementTree.Element` object
151
+ (e.g. the `<PubmedArticle>` node returned by your ETL pipeline):
152
+
153
+ ```python
154
+ has_funding = len(article_element.findall(".//Grant")) > 0
155
+ ```
156
+
157
+ If you do not have access to the raw XML and only have the metadata fields,
158
+ omit `has_funding` entirely — the package will fall back to checking whether
159
+ any of the `pub_types` start with `"Research Support"`, which is a reasonable
160
+ proxy and is already captured separately in the model's publication-type
161
+ features.
162
+
163
+ ## Publishing a new version to PyPI
164
+
165
+ The built artifacts live in `pubmed-research-classifier/dist/`.
166
+
167
+ ### Workflow for every new release
168
+
169
+ 1. **Update the model weights** — copy new `mlp_best.pt`, `scaler.joblib`,
170
+ and/or `mlp_config.json` into
171
+ `src/pubmed_research_classifier/_data/` and overwrite the old files.
172
+
173
+ 2. **Bump the version** in two places:
174
+
175
+ ```toml
176
+ # pyproject.toml
177
+ version = "0.2.0"
178
+ ```
179
+
180
+ ```python
181
+ # src/pubmed_research_classifier/__init__.py
182
+ __version__ = "0.2.0"
183
+ ```
184
+
185
+ 3. **Rebuild the wheel and sdist:**
186
+
187
+ ```bash
188
+ cd pubmed-research-classifier
189
+ pip install build # first time only
190
+ python -m build
191
+ # produces dist/pubmed_research_classifier-0.2.0-py3-none-any.whl
192
+ # and dist/pubmed_research_classifier-0.2.0.tar.gz
193
+ ```
194
+
195
+ 4. **Upload to PyPI:**
196
+
197
+ ```bash
198
+ pip install twine # first time only
199
+ twine upload dist/pubmed_research_classifier-0.2.0*
200
+ # Username: __token__
201
+ # Password: <your PyPI API token>
202
+ ```
203
+
204
+ PyPI API tokens are managed at <https://pypi.org/manage/account/token/>.
205
+ Use a project-scoped token (not account-wide) for safety.
206
+
207
+ 5. **Verify the release:**
208
+
209
+ ```bash
210
+ pip install "pubmed-research-classifier==0.2.0" --force-reinstall
211
+ python -c "from pubmed_research_classifier import classify; print('ok')"
212
+ ```
213
+
214
+ ### First-time PyPI setup
215
+
216
+ If the package does not yet exist on PyPI, the first upload creates it
217
+ automatically. You will need a PyPI account and an account-scoped API token
217
+ (a project-scoped token can only be created after the first upload). Test releases can go to
219
+ <https://test.pypi.org> first:
220
+
221
+ ```bash
222
+ twine upload --repository testpypi dist/pubmed_research_classifier-0.1.0*
223
+ pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ pubmed-research-classifier
224
+ ```
@@ -0,0 +1,183 @@
1
+ # pubmed-research-classifier
2
+
3
+ Classify PubMed articles as **research** or **non-research** using a trained
4
+ MLP on top of [EMBO/ModernBERT-neg-sampling-PubMed](https://huggingface.co/EMBO/ModernBERT-neg-sampling-PubMed) embeddings.
5
+
6
+ Model weights, StandardScaler, and publication-type vocabulary are bundled in
7
+ the package — no external model downloads are needed for the embedding-mode API.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ # Embedding mode only (no sentence-transformers required)
13
+ pip install pubmed-research-classifier
14
+
15
+ # Text mode (package embeds internally)
16
+ pip install "pubmed-research-classifier[embed]"
17
+ ```
18
+
19
+ ## Quick start
20
+
21
+ ### Text mode
22
+
23
+ ```python
24
+ from pubmed_research_classifier import classify
25
+
26
+ result = classify({
27
+ "title": "Structural basis of CRISPR-Cas9 activity",
28
+ "abstract": "We report crystal structures of Cas9 ...",
29
+ "pub_types": ["Journal Article"],
30
+ "n_authors": 8,
31
+ "n_refs": 42,
32
+ })
33
+ # {"label": "research", "p_nr": 0.018}
34
+ ```
35
+
36
+ ### Embedding mode
37
+
38
+ Pre-compute embeddings with `EMBO/ModernBERT-neg-sampling-PubMed`
39
+ using `normalize_embeddings=True`, then pass them directly:
40
+
41
+ ```python
42
+ from pubmed_research_classifier import classify
43
+ import numpy as np
44
+
45
+ result = classify({
46
+ "title_emb": title_embedding, # np.ndarray, shape (768,)
47
+ "abstract_emb": abstract_embedding, # np.ndarray, shape (768,); zeros if absent
48
+ "has_abstract": True,
49
+ "length_title": 52,
50
+ "length_abstract": 1240,
51
+ "pub_types": ["Journal Article"],
52
+ "n_authors": 8,
53
+ "n_refs": 42,
54
+ })
55
+ ```
56
+
57
+ ### Batch — millions of records
58
+
59
+ ```python
60
+ results = classify(records, batch_size=128)
61
+ # Returns a list in the same order as the input.
62
+ ```
63
+
64
+ ## Input fields
65
+
66
+ | Field | Type | Mode | Notes |
67
+ |---|---|---|---|
68
+ | `title` | str | text | |
69
+ | `abstract` | str or None | text | empty/None → treated as absent |
70
+ | `title_emb` | array (768,) | embed | L2-normalised |
71
+ | `abstract_emb` | array (768,) | embed | L2-normalised; zeros if absent |
72
+ | `has_abstract` | bool | embed | |
73
+ | `length_title` | int | embed | auto-derived from `title` in text mode |
74
+ | `length_abstract` | int | embed | auto-derived from `abstract` in text mode |
75
+ | `pub_types` | list[str] or str | both | PubMed PT tags; comma-sep string accepted |
76
+ | `n_authors` | int | both | |
77
+ | `n_refs` | int | both | |
78
+ | `has_funding` | bool | both | optional; inferred from "Research Support" PTs if omitted |
79
+
80
+ ## Output
81
+
82
+ ```python
83
+ {"label": "research", "p_nr": 0.018}
84
+ {"label": "non-research", "p_nr": 0.921}
85
+ ```
86
+
87
+ `p_nr` is P(non-research) from the model.
88
+ Default threshold: 0.75 (configurable via `classify(..., threshold=0.75)`).
89
+
90
+ ## Obtaining `has_funding` from PubMed XML
91
+
92
+ `has_funding` is `True` when the article's PubMed XML record contains at least
93
+ one `<Grant>` element inside a `<GrantList>`. It is **not** the same as the
94
+ "Research Support, …" publication type tags (those are a separate, coarser
95
+ signal also used by the model via `pub_types`).
96
+
97
+ If you fetch articles via the NCBI E-utilities API (efetch, XML format), you
98
+ can extract it like this:
99
+
100
+ ```python
101
+ import xml.etree.ElementTree as ET
102
+
103
+ def has_funding_from_xml(article_xml: str) -> bool:
104
+ """Return True if the PubMed XML contains at least one <Grant> entry."""
105
+ root = ET.fromstring(article_xml)
106
+ return len(root.findall(".//Grant")) > 0
107
+ ```
108
+
109
+ Or, if you are working with a parsed `xml.etree.ElementTree.Element` object
110
+ (e.g. the `<PubmedArticle>` node returned by your ETL pipeline):
111
+
112
+ ```python
113
+ has_funding = len(article_element.findall(".//Grant")) > 0
114
+ ```
115
+
116
+ If you do not have access to the raw XML and only have the metadata fields,
117
+ omit `has_funding` entirely — the package will fall back to checking whether
118
+ any of the `pub_types` start with `"Research Support"`, which is a reasonable
119
+ proxy and is already captured separately in the model's publication-type
120
+ features.
121
+
122
+ ## Publishing a new version to PyPI
123
+
124
+ The built artifacts live in `pubmed-research-classifier/dist/`.
125
+
126
+ ### Workflow for every new release
127
+
128
+ 1. **Update the model weights** — copy new `mlp_best.pt`, `scaler.joblib`,
129
+ and/or `mlp_config.json` into
130
+ `src/pubmed_research_classifier/_data/` and overwrite the old files.
131
+
132
+ 2. **Bump the version** in two places:
133
+
134
+ ```toml
135
+ # pyproject.toml
136
+ version = "0.2.0"
137
+ ```
138
+
139
+ ```python
140
+ # src/pubmed_research_classifier/__init__.py
141
+ __version__ = "0.2.0"
142
+ ```
143
+
144
+ 3. **Rebuild the wheel and sdist:**
145
+
146
+ ```bash
147
+ cd pubmed-research-classifier
148
+ pip install build # first time only
149
+ python -m build
150
+ # produces dist/pubmed_research_classifier-0.2.0-py3-none-any.whl
151
+ # and dist/pubmed_research_classifier-0.2.0.tar.gz
152
+ ```
153
+
154
+ 4. **Upload to PyPI:**
155
+
156
+ ```bash
157
+ pip install twine # first time only
158
+ twine upload dist/pubmed_research_classifier-0.2.0*
159
+ # Username: __token__
160
+ # Password: <your PyPI API token>
161
+ ```
162
+
163
+ PyPI API tokens are managed at <https://pypi.org/manage/account/token/>.
164
+ Use a project-scoped token (not account-wide) for safety.
165
+
166
+ 5. **Verify the release:**
167
+
168
+ ```bash
169
+ pip install "pubmed-research-classifier==0.2.0" --force-reinstall
170
+ python -c "from pubmed_research_classifier import classify; print('ok')"
171
+ ```
172
+
173
+ ### First-time PyPI setup
174
+
175
+ If the package does not yet exist on PyPI, the first upload creates it
176
+ automatically. You will need a PyPI account and an account-scoped API token
176
+ (a project-scoped token can only be created after the first upload). Test releases can go to
178
+ <https://test.pypi.org> first:
179
+
180
+ ```bash
181
+ twine upload --repository testpypi dist/pubmed_research_classifier-0.1.0*
182
+ pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ pubmed-research-classifier
183
+ ```
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "pubmed-research-classifier"
7
+ version = "0.1.0"
8
+ description = "Classify PubMed articles as research or non-research using a trained MLP + ModernBERT."
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ requires-python = ">=3.9"
12
+ authors = [{ name = "Jorge Abreu", email = "jorge.abreu@embo.org" }]
13
+ keywords = ["pubmed", "nlp", "scientometrics", "classification"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "Intended Audience :: Science/Research",
17
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
18
+ ]
19
+ dependencies = [
20
+ "torch>=2.0",
21
+ "numpy>=1.24",
22
+ "scikit-learn>=1.3",
23
+ "joblib>=1.3",
24
+ ]
25
+
26
+ [project.optional-dependencies]
27
+ embed = ["sentence-transformers>=3.0"]
28
+
29
+ [project.urls]
30
+ Repository = "https://github.com/embo-press/pubmed-research-classifier"
31
+
32
+ [tool.pytest.ini_options]
33
+ testpaths = ["tests"]
34
+
35
+ [tool.hatch.build.targets.wheel]
36
+ packages = ["src/pubmed_research_classifier"]
37
+ # Ensure binary and JSON data files inside the package are included.
38
+ artifacts = [
39
+ "*.pt",
40
+ "*.joblib",
41
+ "*.json",
42
+ ]
@@ -0,0 +1,52 @@
1
+ """pubmed-research-classifier — classify PubMed articles as research or non-research.
2
+
3
+ Quick start
4
+ -----------
5
+ Text mode (package embeds internally — requires the ``embed`` extra)::
6
+
7
+ pip install "pubmed-research-classifier[embed]"
8
+
9
+ from pubmed_research_classifier import classify
10
+
11
+ result = classify({
12
+ "title": "Structural basis of CRISPR-Cas9 activity",
13
+ "abstract": "We report crystal structures ...",
14
+ "pub_types": ["Journal Article"],
15
+ "n_authors": 8,
16
+ "n_refs": 42,
17
+ })
18
+ # {"label": "research", "p_nr": 0.018}
19
+
20
+ Embedding mode (pre-compute with EMBO/ModernBERT-neg-sampling-PubMed)::
21
+
22
+ pip install pubmed-research-classifier
23
+
24
+ from pubmed_research_classifier import classify
25
+ import numpy as np
26
+
27
+ result = classify({
28
+ "title_emb": np.zeros(768), # shape (768,), L2-normalised
29
+ "abstract_emb": np.zeros(768),
30
+ "has_abstract": True,
31
+ "length_title": 52,
32
+ "length_abstract": 1240,
33
+ "pub_types": ["Journal Article"],
34
+ "n_authors": 8,
35
+ "n_refs": 42,
36
+ })
37
+
38
+ Batch (millions of records)::
39
+
40
+ results = classify(list_of_records, batch_size=128)
41
+
42
+ Model
43
+ -----
44
+ Trained MLP + ModernBERT embeddings (EMBO/ModernBERT-neg-sampling-PubMed).
45
+ Weights, scaler, and vocabulary are bundled in the package — no external
46
+ downloads required (except for the embedding model in text mode).
47
+ """
48
+
49
+ from ._inference import classify_records as classify
50
+
51
+ __all__ = ["classify"]
52
+ __version__ = "0.1.0"
@@ -0,0 +1,14 @@
1
+ {
2
+ "n_scalars": 200,
3
+ "use_funding": true,
4
+ "use_pub_types": true,
5
+ "proj_dim": 64,
6
+ "hidden_dims": [
7
+ 128,
8
+ 32,
9
+ 16
10
+ ],
11
+ "dropout": 0.2,
12
+ "use_title_emb": true,
13
+ "use_abstract_emb": true
14
+ }