paperscraper 0.2.16__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {paperscraper-0.2.16 → paperscraper-0.3.0}/PKG-INFO +97 -34
  2. {paperscraper-0.2.16 → paperscraper-0.3.0}/README.md +93 -32
  3. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/__init__.py +1 -1
  4. paperscraper-0.3.0/paperscraper/citations/__init__.py +3 -0
  5. paperscraper-0.3.0/paperscraper/citations/citations.py +63 -0
  6. paperscraper-0.3.0/paperscraper/citations/tests/test_citations.py +19 -0
  7. paperscraper-0.3.0/paperscraper/pdf.py +527 -0
  8. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/scholar/scholar.py +1 -28
  9. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/scholar/tests/test_scholar.py +2 -6
  10. paperscraper-0.3.0/paperscraper/tests/test_pdf.py +302 -0
  11. paperscraper-0.3.0/paperscraper/xrxiv/tests/__init__.py +0 -0
  12. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/PKG-INFO +97 -34
  13. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/SOURCES.txt +3 -0
  14. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/requires.txt +1 -0
  15. {paperscraper-0.2.16 → paperscraper-0.3.0}/setup.py +1 -0
  16. paperscraper-0.2.16/paperscraper/citations/__init__.py +0 -2
  17. paperscraper-0.2.16/paperscraper/pdf.py +0 -164
  18. paperscraper-0.2.16/paperscraper/tests/test_pdf.py +0 -161
  19. {paperscraper-0.2.16 → paperscraper-0.3.0}/LICENSE +0 -0
  20. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/arxiv/__init__.py +0 -0
  21. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/arxiv/arxiv.py +0 -0
  22. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/arxiv/utils.py +0 -0
  23. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/core.py +0 -0
  24. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/entity/__init__.py +0 -0
  25. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/entity/core.py +0 -0
  26. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/entity/paper.py +0 -0
  27. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/entity/researcher.py +0 -0
  28. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/self_citations.py +0 -0
  29. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/self_references.py +0 -0
  30. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/tests/__init__.py +0 -0
  31. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/tests/test_self_references.py +0 -0
  32. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/utils.py +0 -0
  33. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/__init__.py +0 -0
  34. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/arxiv.py +0 -0
  35. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/biorxiv.py +0 -0
  36. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/chemrxiv.py +0 -0
  37. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/medrxiv.py +0 -0
  38. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/utils/__init__.py +0 -0
  39. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
  40. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -0
  41. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/utils/chemrxiv/utils.py +0 -0
  42. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/impact.py +0 -0
  43. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/load_dumps.py +0 -0
  44. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/plotting.py +0 -0
  45. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/postprocessing.py +0 -0
  46. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/__init__.py +0 -0
  47. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/pubmed.py +0 -0
  48. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/tests/__init__.py +0 -0
  49. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
  50. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/utils.py +0 -0
  51. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/scholar/__init__.py +0 -0
  52. /paperscraper-0.2.16/paperscraper/scholar/tests/__init__.py → /paperscraper-0.3.0/paperscraper/scholar/core.py +0 -0
  53. {paperscraper-0.2.16/paperscraper → paperscraper-0.3.0/paperscraper/scholar}/tests/__init__.py +0 -0
  54. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/server_dumps/__init__.py +0 -0
  55. {paperscraper-0.2.16/paperscraper/xrxiv → paperscraper-0.3.0/paperscraper}/tests/__init__.py +0 -0
  56. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/tests/test_dump.py +0 -0
  57. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/tests/test_impactor.py +0 -0
  58. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/utils.py +0 -0
  59. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/xrxiv/__init__.py +0 -0
  60. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/xrxiv/tests/test_xrxiv.py +0 -0
  61. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/xrxiv/xrxiv_api.py +0 -0
  62. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/xrxiv/xrxiv_query.py +0 -0
  63. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/dependency_links.txt +0 -0
  64. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/not-zip-safe +0 -0
  65. {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/top_level.txt +0 -0
  66. {paperscraper-0.2.16 → paperscraper-0.3.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: paperscraper
3
- Version: 0.2.16
3
+ Version: 0.3.0
4
4
  Summary: paperscraper: Package to scrape papers.
5
5
  Home-page: https://github.com/jannisborn/paperscraper
6
6
  Author: Jannis Born, Matteo Manica
@@ -34,6 +34,7 @@ Requires-Dist: thefuzz
34
34
  Requires-Dist: pytest
35
35
  Requires-Dist: tldextract
36
36
  Requires-Dist: semanticscholar
37
+ Requires-Dist: pydantic
37
38
  Dynamic: author
38
39
  Dynamic: author-email
39
40
  Dynamic: classifier
@@ -42,6 +43,7 @@ Dynamic: description-content-type
42
43
  Dynamic: home-page
43
44
  Dynamic: keywords
44
45
  Dynamic: license
46
+ Dynamic: license-file
45
47
  Dynamic: requires-dist
46
48
  Dynamic: summary
47
49
 
@@ -56,12 +58,27 @@ MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.or
56
58
  [![codecov](https://codecov.io/github/jannisborn/paperscraper/branch/main/graph/badge.svg?token=Clwi0pu61a)](https://codecov.io/github/jannisborn/paperscraper)
57
59
  # paperscraper
58
60
 
59
- `paperscraper` is a `python` package for scraping publication metadata or full PDF files from
61
+ `paperscraper` is a `python` package for scraping publication metadata or full text files (PDF or XML) from
60
62
  **PubMed** or preprint servers such as **arXiv**, **medRxiv**, **bioRxiv** and **chemRxiv**.
61
63
  It provides a streamlined interface to scrape metadata, allows to retrieve citation counts
62
64
  from Google Scholar, impact factors from journals and comes with simple postprocessing functions
63
65
  and plotting routines for meta-analysis.
64
66
 
67
+ ## Table of Contents
68
+
69
+ 1. [Getting Started](#getting-started)
70
+ - [Download X-rxiv Dumps](#download-x-rxiv-dumps)
71
+ - [Arxiv Local Dump](#arxiv-local-dump)
72
+ 2. [Examples](#examples)
73
+ - [Publication Keyword Search](#publication-keyword-search)
74
+ - [Full-Text Retrieval (PDFs & XMLs)](#full-text-retrieval-pdfs--xmls)
75
+ - [Citation Search](#citation-search)
76
+ - [Journal Impact Factor](#journal-impact-factor)
77
+ 3. [Plotting](#plotting)
78
+ - [Barplots](#barplots)
79
+ - [Venn Diagrams](#venn-diagrams)
80
+ 4. [Citation](#citation)
81
+ 5. [Contributions](#contributions)
65
82
 
66
83
  ## Getting started
67
84
 
@@ -90,6 +107,21 @@ medrxiv(start_date="2023-04-01", end_date="2023-04-08")
90
107
  ```
91
108
  But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
92
109
 
110
+ #### Arxiv local dump
111
+ If you prefer local search rather than using the arxiv API:
112
+
113
+ ```py
114
+ from paperscraper.get_dumps import arxiv
115
+ arxiv(start_date='2024-01-01', end_date=None) # scrapes all metadata from 2024 until today.
116
+ ```
117
+
118
+ Afterwards you can search the local arxiv dump just like the other x-rxiv dumps.
119
+ The direct endpoint is `paperscraper.arxiv.get_arxiv_papers_local`. You can also specify the
120
+ backend directly in the `get_and_dump_arxiv_papers` function:
121
+ ```py
122
+ from paperscraper.arxiv import get_and_dump_arxiv_papers
123
+ get_and_dump_arxiv_papers(..., backend='local')
124
+ ```
93
125
 
94
126
  ## Examples
95
127
 
@@ -158,10 +190,15 @@ from paperscraper.scholar import get_and_dump_scholar_papers
158
190
  topic = 'Machine Learning'
159
191
  get_and_dump_scholar_papers(topic)
160
192
  ```
193
+ *NOTE*: The scholar endpoint does not require authentication, but since it regularly prompts with captchas, it's difficult to apply at large scale.
194
+
195
+ ### Full-Text Retrieval (PDFs & XMLs)
161
196
 
162
- ### Scrape PDFs
197
+ `paperscraper` allows you to download full text of publications using DOIs. The basic functionality works reliably for preprint servers (arXiv, bioRxiv, medRxiv, chemRxiv), but retrieving papers from PubMed dumps is more challenging due to publisher restrictions and paywalls.
163
198
 
164
- `paperscraper` also allows you to download the PDF files.
199
+ #### Standard Usage
200
+
201
+ The main download functions work for all paper types with automatic fallbacks:
165
202
 
166
203
  ```py
167
204
  from paperscraper.pdf import save_pdf
@@ -169,31 +206,71 @@ paper_data = {'doi': "10.48550/arXiv.2207.03928"}
169
206
  save_pdf(paper_data, filepath='gt4sd_paper.pdf')
170
207
  ```
171
208
 
172
- If you want to batch download all PDFs for your previous metadata search, use the wrapper.
173
- Here we scrape the PDFs for the metadata obtained in the previous example.
209
+ To batch download full texts from your metadata search results:
174
210
 
175
211
  ```py
176
212
  from paperscraper.pdf import save_pdf_from_dump
177
213
 
178
- # Save PDFs in current folder and name the files by their DOI
214
+ # Save PDFs/XMLs in current folder and name the files by their DOI
179
215
  save_pdf_from_dump('medrxiv_covid_ai_imaging.jsonl', pdf_path='.', key_to_save='doi')
180
216
  ```
181
- *NOTE*: This works robustly for preprint servers, but if you use it on a PubMed dump, dont expect to obtain all PDFs.
182
- Many publishers detect and block scraping and many publications are simply behind paywalls.
217
+
218
+ #### Automatic Fallback Mechanisms
219
+
220
+ When the standard text retrieval fails, `paperscraper` automatically tries these fallbacks:
221
+
222
+ - **BioC-PMC**: For biomedical papers in [PubMed Central](https://pmc.ncbi.nlm.nih.gov/) (open-access repository), it retrieves open-access full-text XML from the [BioC-PMC API](https://www.ncbi.nlm.nih.gov/research/bionlp/APIs/BioC-PMC/).
223
+ - **eLife Papers**: For [eLife](https://elifesciences.org/) journal papers, it fetches XML files from eLife's open [GitHub repository](https://github.com/elifesciences/elife-article-xml).
224
+
225
+ These fallbacks are tried automatically without requiring any additional configuration.
226
+
227
+ #### Enhanced Retrieval with Publisher APIs
228
+
229
+ For more comprehensive access to papers from major publishers, you can provide API keys for:
230
+
231
+ - **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
232
+ - **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
233
+
234
+ To use publisher APIs:
235
+
236
+ 1. Create a file with your API keys:
237
+ ```
238
+ WILEY_TDM_API_TOKEN=your_wiley_token_here
239
+ ELSEVIER_TDM_API_KEY=your_elsevier_key_here
240
+ ```
241
+
242
+ 2. Pass the file path when calling retrieval functions:
243
+
244
+ ```py
245
+ from paperscraper.pdf import save_pdf_from_dump
246
+
247
+ save_pdf_from_dump(
248
+ 'pubmed_query_results.jsonl',
249
+ pdf_path='./papers',
250
+ key_to_save='doi',
251
+ api_keys='path/to/your/api_keys.txt'
252
+ )
253
+ ```
254
+
255
+ For obtaining API keys:
256
+ - Wiley TDM API: Visit [Wiley Text and Data Mining](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) (free for academic users with institutional subscription)
257
+ - Elsevier TDM API: Visit [Elsevier's Text and Data Mining](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) (free for academic users with institutional subscription)
258
+
259
+ *NOTE*: While these fallback mechanisms improve retrieval success rates, they cannot guarantee access to all papers due to various access restrictions.
183
260
 
184
261
 
185
262
  ### Citation search
186
263
 
187
- A plus of the Scholar endpoint is that the number of citations of a paper can be fetched:
264
+ You can fetch the number of citations of a paper from its title or DOI:
188
265
 
189
266
  ```py
190
- from paperscraper.scholar import get_citations_from_title
267
+ from paperscraper.citations import get_citations_from_title, get_citations_by_doi
191
268
  title = 'Über formal unentscheidbare Sätze der Principia Mathematica und verwandter Systeme I.'
192
- get_citations_from_title(title)
193
- ```
269
+ print(get_citations_from_title(title))
194
270
 
195
- *NOTE*: The scholar endpoint does not require authentication but since it regularly
196
- prompts with captchas, it's difficult to apply large scale.
271
+ doi = '10.1021/acs.jcim.3c00132'
272
+ get_citations_by_doi(doi)
273
+ ```
197
274
 
198
275
  ### Journal impact factor
199
276
 
@@ -231,28 +308,13 @@ i.search("quantum information", threshold=90, return_all=True)
231
308
  # ]
232
309
  ```
233
310
 
234
- ## Arxiv local dump
235
- If you prefer local search rather than using the arxiv API:
236
-
237
- ```py
238
- from paperscraper.get_dumps import arxiv
239
- arxiv(start_date='2024-01-01', end_date=None) # scrapes all metadata from 2024 until today.
240
- ```
241
-
242
- Afterwards you can search the local arxiv dump just like the other x-rxiv dumps.
243
- The direct endpoint is `paperscraper.arxiv.get_arxiv_papers_local`. You can also specify the
244
- backend directly in the `get_and_dump_arxiv_papers` function:
245
- ```py
246
- from paperscraper.arxiv import get_and_dump_arxiv_papers
247
- get_and_dump_arxiv_papers(..., backend='local')
248
- ```
249
311
 
250
- ### Plotting
312
+ ## Plotting
251
313
 
252
314
  When multiple query searches are performed, two types of plots can be generated
253
315
  automatically: Venn diagrams and bar plots.
254
316
 
255
- #### Barplots
317
+ ### Barplots
256
318
 
257
319
  Compare the temporal evolution of different queries across different servers.
258
320
 
@@ -310,7 +372,7 @@ plot_comparison(
310
372
  ![molreps](https://github.com/jannisborn/paperscraper/blob/main/assets/molreps.png?raw=true "MolReps")
311
373
 
312
374
 
313
- #### Venn Diagrams
375
+ ### Venn Diagrams
314
376
 
315
377
  ```py
316
378
  from paperscraper.plotting import (
@@ -369,6 +431,7 @@ If you use `paperscraper`, please cite a paper that motivated our development of
369
431
 
370
432
  ## Contributions
371
433
  Thanks to the following contributors:
434
+ - [@mathinic](https://github.com/mathinic): Since `v0.3.0`, PubMed full-text retrieval is improved by additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
372
435
  - [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
373
436
  - [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
374
437
  - [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
@@ -9,12 +9,27 @@ MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.or
9
9
  [![codecov](https://codecov.io/github/jannisborn/paperscraper/branch/main/graph/badge.svg?token=Clwi0pu61a)](https://codecov.io/github/jannisborn/paperscraper)
10
10
  # paperscraper
11
11
 
12
- `paperscraper` is a `python` package for scraping publication metadata or full PDF files from
12
+ `paperscraper` is a `python` package for scraping publication metadata or full text files (PDF or XML) from
13
13
  **PubMed** or preprint servers such as **arXiv**, **medRxiv**, **bioRxiv** and **chemRxiv**.
14
14
  It provides a streamlined interface to scrape metadata, allows to retrieve citation counts
15
15
  from Google Scholar, impact factors from journals and comes with simple postprocessing functions
16
16
  and plotting routines for meta-analysis.
17
17
 
18
+ ## Table of Contents
19
+
20
+ 1. [Getting Started](#getting-started)
21
+ - [Download X-rxiv Dumps](#download-x-rxiv-dumps)
22
+ - [Arxiv Local Dump](#arxiv-local-dump)
23
+ 2. [Examples](#examples)
24
+ - [Publication Keyword Search](#publication-keyword-search)
25
+ - [Full-Text Retrieval (PDFs & XMLs)](#full-text-retrieval-pdfs--xmls)
26
+ - [Citation Search](#citation-search)
27
+ - [Journal Impact Factor](#journal-impact-factor)
28
+ 3. [Plotting](#plotting)
29
+ - [Barplots](#barplots)
30
+ - [Venn Diagrams](#venn-diagrams)
31
+ 4. [Citation](#citation)
32
+ 5. [Contributions](#contributions)
18
33
 
19
34
  ## Getting started
20
35
 
@@ -43,6 +58,21 @@ medrxiv(start_date="2023-04-01", end_date="2023-04-08")
43
58
  ```
44
59
  But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
45
60
 
61
+ #### Arxiv local dump
62
+ If you prefer local search rather than using the arxiv API:
63
+
64
+ ```py
65
+ from paperscraper.get_dumps import arxiv
66
+ arxiv(start_date='2024-01-01', end_date=None) # scrapes all metadata from 2024 until today.
67
+ ```
68
+
69
+ Afterwards you can search the local arxiv dump just like the other x-rxiv dumps.
70
+ The direct endpoint is `paperscraper.arxiv.get_arxiv_papers_local`. You can also specify the
71
+ backend directly in the `get_and_dump_arxiv_papers` function:
72
+ ```py
73
+ from paperscraper.arxiv import get_and_dump_arxiv_papers
74
+ get_and_dump_arxiv_papers(..., backend='local')
75
+ ```
46
76
 
47
77
  ## Examples
48
78
 
@@ -111,10 +141,15 @@ from paperscraper.scholar import get_and_dump_scholar_papers
111
141
  topic = 'Machine Learning'
112
142
  get_and_dump_scholar_papers(topic)
113
143
  ```
144
+ *NOTE*: The scholar endpoint does not require authentication, but since it regularly prompts with captchas, it's difficult to apply at large scale.
145
+
146
+ ### Full-Text Retrieval (PDFs & XMLs)
114
147
 
115
- ### Scrape PDFs
148
+ `paperscraper` allows you to download full text of publications using DOIs. The basic functionality works reliably for preprint servers (arXiv, bioRxiv, medRxiv, chemRxiv), but retrieving papers from PubMed dumps is more challenging due to publisher restrictions and paywalls.
116
149
 
117
- `paperscraper` also allows you to download the PDF files.
150
+ #### Standard Usage
151
+
152
+ The main download functions work for all paper types with automatic fallbacks:
118
153
 
119
154
  ```py
120
155
  from paperscraper.pdf import save_pdf
@@ -122,31 +157,71 @@ paper_data = {'doi': "10.48550/arXiv.2207.03928"}
122
157
  save_pdf(paper_data, filepath='gt4sd_paper.pdf')
123
158
  ```
124
159
 
125
- If you want to batch download all PDFs for your previous metadata search, use the wrapper.
126
- Here we scrape the PDFs for the metadata obtained in the previous example.
160
+ To batch download full texts from your metadata search results:
127
161
 
128
162
  ```py
129
163
  from paperscraper.pdf import save_pdf_from_dump
130
164
 
131
- # Save PDFs in current folder and name the files by their DOI
165
+ # Save PDFs/XMLs in current folder and name the files by their DOI
132
166
  save_pdf_from_dump('medrxiv_covid_ai_imaging.jsonl', pdf_path='.', key_to_save='doi')
133
167
  ```
134
- *NOTE*: This works robustly for preprint servers, but if you use it on a PubMed dump, dont expect to obtain all PDFs.
135
- Many publishers detect and block scraping and many publications are simply behind paywalls.
168
+
169
+ #### Automatic Fallback Mechanisms
170
+
171
+ When the standard text retrieval fails, `paperscraper` automatically tries these fallbacks:
172
+
173
+ - **BioC-PMC**: For biomedical papers in [PubMed Central](https://pmc.ncbi.nlm.nih.gov/) (open-access repository), it retrieves open-access full-text XML from the [BioC-PMC API](https://www.ncbi.nlm.nih.gov/research/bionlp/APIs/BioC-PMC/).
174
+ - **eLife Papers**: For [eLife](https://elifesciences.org/) journal papers, it fetches XML files from eLife's open [GitHub repository](https://github.com/elifesciences/elife-article-xml).
175
+
176
+ These fallbacks are tried automatically without requiring any additional configuration.
177
+
178
+ #### Enhanced Retrieval with Publisher APIs
179
+
180
+ For more comprehensive access to papers from major publishers, you can provide API keys for:
181
+
182
+ - **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
183
+ - **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
184
+
185
+ To use publisher APIs:
186
+
187
+ 1. Create a file with your API keys:
188
+ ```
189
+ WILEY_TDM_API_TOKEN=your_wiley_token_here
190
+ ELSEVIER_TDM_API_KEY=your_elsevier_key_here
191
+ ```
192
+
193
+ 2. Pass the file path when calling retrieval functions:
194
+
195
+ ```py
196
+ from paperscraper.pdf import save_pdf_from_dump
197
+
198
+ save_pdf_from_dump(
199
+ 'pubmed_query_results.jsonl',
200
+ pdf_path='./papers',
201
+ key_to_save='doi',
202
+ api_keys='path/to/your/api_keys.txt'
203
+ )
204
+ ```
205
+
206
+ For obtaining API keys:
207
+ - Wiley TDM API: Visit [Wiley Text and Data Mining](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) (free for academic users with institutional subscription)
208
+ - Elsevier TDM API: Visit [Elsevier's Text and Data Mining](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) (free for academic users with institutional subscription)
209
+
210
+ *NOTE*: While these fallback mechanisms improve retrieval success rates, they cannot guarantee access to all papers due to various access restrictions.
136
211
 
137
212
 
138
213
  ### Citation search
139
214
 
140
- A plus of the Scholar endpoint is that the number of citations of a paper can be fetched:
215
+ You can fetch the number of citations of a paper from its title or DOI:
141
216
 
142
217
  ```py
143
- from paperscraper.scholar import get_citations_from_title
218
+ from paperscraper.citations import get_citations_from_title, get_citations_by_doi
144
219
  title = 'Über formal unentscheidbare Sätze der Principia Mathematica und verwandter Systeme I.'
145
- get_citations_from_title(title)
146
- ```
220
+ print(get_citations_from_title(title))
147
221
 
148
- *NOTE*: The scholar endpoint does not require authentication but since it regularly
149
- prompts with captchas, it's difficult to apply large scale.
222
+ doi = '10.1021/acs.jcim.3c00132'
223
+ get_citations_by_doi(doi)
224
+ ```
150
225
 
151
226
  ### Journal impact factor
152
227
 
@@ -184,28 +259,13 @@ i.search("quantum information", threshold=90, return_all=True)
184
259
  # ]
185
260
  ```
186
261
 
187
- ## Arxiv local dump
188
- If you prefer local search rather than using the arxiv API:
189
-
190
- ```py
191
- from paperscraper.get_dumps import arxiv
192
- arxiv(start_date='2024-01-01', end_date=None) # scrapes all metadata from 2024 until today.
193
- ```
194
-
195
- Afterwards you can search the local arxiv dump just like the other x-rxiv dumps.
196
- The direct endpoint is `paperscraper.arxiv.get_arxiv_papers_local`. You can also specify the
197
- backend directly in the `get_and_dump_arxiv_papers` function:
198
- ```py
199
- from paperscraper.arxiv import get_and_dump_arxiv_papers
200
- get_and_dump_arxiv_papers(..., backend='local')
201
- ```
202
262
 
203
- ### Plotting
263
+ ## Plotting
204
264
 
205
265
  When multiple query searches are performed, two types of plots can be generated
206
266
  automatically: Venn diagrams and bar plots.
207
267
 
208
- #### Barplots
268
+ ### Barplots
209
269
 
210
270
  Compare the temporal evolution of different queries across different servers.
211
271
 
@@ -263,7 +323,7 @@ plot_comparison(
263
323
  ![molreps](https://github.com/jannisborn/paperscraper/blob/main/assets/molreps.png?raw=true "MolReps")
264
324
 
265
325
 
266
- #### Venn Diagrams
326
+ ### Venn Diagrams
267
327
 
268
328
  ```py
269
329
  from paperscraper.plotting import (
@@ -322,6 +382,7 @@ If you use `paperscraper`, please cite a paper that motivated our development of
322
382
 
323
383
  ## Contributions
324
384
  Thanks to the following contributors:
385
+ - [@mathinic](https://github.com/mathinic): Since `v0.3.0`, PubMed full-text retrieval is improved by additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
325
386
  - [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
326
387
  - [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
327
388
  - [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
@@ -1,7 +1,7 @@
1
1
  """Initialize the module."""
2
2
 
3
3
  __name__ = "paperscraper"
4
- __version__ = "0.2.16"
4
+ __version__ = "0.3.0"
5
5
 
6
6
  import logging
7
7
  import os
@@ -0,0 +1,3 @@
1
+ from .citations import get_citations_by_doi, get_citations_from_title
2
+ from .core import SelfLinkClient
3
+ from .self_references import self_references, self_references_paper
@@ -0,0 +1,63 @@
1
+ import logging
2
+ import sys
3
+ from time import sleep
4
+
5
+ from scholarly import scholarly
6
+ from semanticscholar import SemanticScholar, SemanticScholarException
7
+
8
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+ sch = SemanticScholar()
11
+
12
+
13
def get_citations_by_doi(doi: str) -> int:
    """
    Get the number of citations of a paper according to Semantic Scholar.

    If the connection is refused, the lookup is retried once after a
    10 second pause.

    Args:
        doi: the DOI of the paper.

    Returns:
        The number of citations, or 0 if the paper could not be found.

    Raises:
        ConnectionRefusedError: If the connection is refused again on the retry.
    """
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            return len(sch.get_paper(doi)["citations"])
        except SemanticScholarException.ObjectNotFoundException:
            logger.warning(f"Could not find paper {doi}, assuming 0 citation.")
            return 0
        except ConnectionRefusedError as e:
            if attempt == max_attempts:
                # Surface the real failure. The previous `return` inside
                # `finally` replaced it with an UnboundLocalError.
                raise
            logger.warning(f"Waiting for 10 sec since {doi} gave: {e}")
            sleep(10)
    # Not reachable: every iteration above either returns or raises.
    return 0
36
+
37
+
38
def get_citations_from_title(title: str) -> int:
    """
    Get the number of citations of a paper by searching for its exact title.

    The title is stripped and wrapped in double quotes before being passed to
    `scholarly.search_pubs`. All matches are collected and the citation count
    of the first one is returned.

    Args:
        title (str): Title of paper to be searched on Scholar.

    Raises:
        TypeError: If something other than a str is passed.

    Returns:
        int: Number of citations of the first match, or 0 if nothing matched.
    """
    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    # Search for exact match
    title = '"' + title.strip() + '"'

    counts = [int(pub["num_citations"]) for pub in scholarly.search_pubs(title)]
    if not counts:
        logger.warning(f"Found no match for {title}.")
        return 0
    if len(counts) > 1:
        logger.warning(f"Found {len(counts)} matches for {title}, returning first one.")
    return counts[0]
@@ -0,0 +1,19 @@
1
+ import logging
2
+
3
+ from paperscraper.citations import get_citations_by_doi
4
+
5
+ logging.disable(logging.INFO)
6
+
7
+
8
class TestCitations:
    """Tests for `get_citations_by_doi` (calls the function without mocking)."""

    def test_citations(self):
        """A known DOI reports more than 50 citations; an invalid DOI reports 0."""
        num = get_citations_by_doi("10.1038/s42256-023-00639-z")
        assert isinstance(num, int) and num > 50

        # Try invalid DOI
        num = get_citations_by_doi("10.1035348/s42256-023-00639-z")
        assert isinstance(num, int) and num == 0
+ assert isinstance(num, int) and num == 0