paperscraper 0.2.8__tar.gz → 0.2.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {paperscraper-0.2.8 → paperscraper-0.2.10}/PKG-INFO +37 -7
- {paperscraper-0.2.8 → paperscraper-0.2.10}/README.md +32 -5
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/__init__.py +1 -1
- paperscraper-0.2.10/paperscraper/impact.py +111 -0
- paperscraper-0.2.10/paperscraper/tests/__init__.py +0 -0
- paperscraper-0.2.10/paperscraper/tests/test_impactor.py +69 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/PKG-INFO +37 -7
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/SOURCES.txt +3 -1
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/requires.txt +3 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/setup.py +7 -3
- paperscraper-0.2.8/paperscraper/journal_if.py +0 -155
- {paperscraper-0.2.8 → paperscraper-0.2.10}/LICENSE +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/arxiv/__init__.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/arxiv/arxiv.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/arxiv/utils.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/__init__.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/biorxiv.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/chemrxiv.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/medrxiv.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/__init__.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/chemrxiv/utils.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/load_dumps.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/pdf.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/plotting.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/postprocessing.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/pubmed/__init__.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/pubmed/pubmed.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/pubmed/utils.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/scholar/__init__.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/scholar/scholar.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/server_dumps/__init__.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/utils.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/xrxiv/__init__.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/xrxiv/xrxiv_api.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/xrxiv/xrxiv_query.py +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/dependency_links.txt +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/not-zip-safe +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/top_level.txt +0 -0
- {paperscraper-0.2.8 → paperscraper-0.2.10}/setup.cfg +0 -0
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: paperscraper
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.10
|
|
4
4
|
Summary: paperscraper: Package to scrape papers.
|
|
5
5
|
Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
|
|
6
6
|
Author: Jannis Born, Matteo Manica
|
|
7
7
|
Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
|
|
8
8
|
License: MIT
|
|
9
|
-
Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
|
|
9
|
+
Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv,Google Scholar
|
|
10
10
|
Classifier: Development Status :: 3 - Alpha
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -27,6 +27,9 @@ Requires-Dist: seaborn
|
|
|
27
27
|
Requires-Dist: matplotlib
|
|
28
28
|
Requires-Dist: matplotlib_venn
|
|
29
29
|
Requires-Dist: bs4
|
|
30
|
+
Requires-Dist: impact-factor>=1.1.0
|
|
31
|
+
Requires-Dist: thefuzz
|
|
32
|
+
Requires-Dist: pytest
|
|
30
33
|
|
|
31
34
|
[](https://github.com/PhosphorylatedRabbits/paperscraper/actions/workflows/build.yml)
|
|
32
35
|
[![License:
|
|
@@ -179,14 +182,41 @@ get_citations_from_title(title)
|
|
|
179
182
|
*NOTE*: The scholar endpoint does not require authentication but since it regularly
|
|
180
183
|
prompts with captchas, it's difficult to apply at large scale.
|
|
181
184
|
|
|
182
|
-
|
|
185
|
+
### Journal impact factor
|
|
183
186
|
|
|
184
|
-
You can also retrieve the impact factor for all journals
|
|
187
|
+
You can also retrieve the impact factor for all journals:
|
|
185
188
|
```py
|
|
186
|
-
from paperscraper.
|
|
187
|
-
i = Impactor()
|
|
189
|
+
>>> from paperscraper.impact import Impactor
|
|
190
|
+
>>> i = Impactor()
|
|
191
|
+
>>> i.search("Nat Comms", threshold=85, sort_by='impact')
|
|
192
|
+
[
|
|
193
|
+
{'journal': 'Nature Communications', 'factor': 17.694, 'score': 94},
|
|
194
|
+
{'journal': 'Natural Computing', 'factor': 1.504, 'score': 88}
|
|
195
|
+
]
|
|
196
|
+
```
|
|
197
|
+
This performs a fuzzy search with a threshold of 85. `threshold` defaults to 100 in which case an exact search
|
|
198
|
+
is performed. You can also search by journal abbreviation, [E-ISSN](https://portal.issn.org) or [NLM ID](https://portal.issn.org).
|
|
199
|
+
```py
|
|
200
|
+
i.search("Nat Rev Earth Environ") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
|
|
201
|
+
i.search("101771060") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
|
|
202
|
+
i.search('2662-138X') # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
|
|
203
|
+
|
|
204
|
+
# Filter results by impact factor
|
|
205
|
+
i.search("Neural network", threshold=85, min_impact=1.5, max_impact=20)
|
|
206
|
+
# [
|
|
207
|
+
# {'journal': 'IEEE Transactions on Neural Networks and Learning Systems', 'factor': 14.255, 'score': 93},
|
|
208
|
+
# {'journal': 'NEURAL NETWORKS', 'factor': 9.657, 'score': 91},
|
|
209
|
+
# {'journal': 'WORK-A Journal of Prevention Assessment & Rehabilitation', 'factor': 1.803, 'score': 86},
|
|
210
|
+
# {'journal': 'NETWORK-COMPUTATION IN NEURAL SYSTEMS', 'factor': 1.5, 'score': 92}
|
|
211
|
+
# ]
|
|
212
|
+
|
|
213
|
+
# Show all fields
|
|
214
|
+
i.search("quantum information", threshold=90, return_all=True)
|
|
215
|
+
# [
|
|
216
|
+
# {'factor': 10.758, 'jcr': 'Q1', 'journal_abbr': 'npj Quantum Inf', 'eissn': '2056-6387', 'journal': 'npj Quantum Information', 'nlm_id': '101722857', 'issn': '', 'score': 92},
|
|
217
|
+
# {'factor': 1.577, 'jcr': 'Q3', 'journal_abbr': 'Nation', 'eissn': '0027-8378', 'journal': 'NATION', 'nlm_id': '9877123', 'issn': '0027-8378', 'score': 91}
|
|
218
|
+
# ]
|
|
188
219
|
```
|
|
189
|
-
Then, `i.journal_to_if` should give you a dictionary wit journal to IF mappings for >9000 journals as of 2014.
|
|
190
220
|
|
|
191
221
|
### Plotting
|
|
192
222
|
|
|
@@ -149,14 +149,41 @@ get_citations_from_title(title)
|
|
|
149
149
|
*NOTE*: The scholar endpoint does not require authentication but since it regularly
|
|
150
150
|
prompts with captchas, it's difficult to apply at large scale.
|
|
151
151
|
|
|
152
|
-
|
|
152
|
+
### Journal impact factor
|
|
153
153
|
|
|
154
|
-
You can also retrieve the impact factor for all journals
|
|
154
|
+
You can also retrieve the impact factor for all journals:
|
|
155
155
|
```py
|
|
156
|
-
from paperscraper.
|
|
157
|
-
i = Impactor()
|
|
156
|
+
>>> from paperscraper.impact import Impactor
|
|
157
|
+
>>> i = Impactor()
|
|
158
|
+
>>> i.search("Nat Comms", threshold=85, sort_by='impact')
|
|
159
|
+
[
|
|
160
|
+
{'journal': 'Nature Communications', 'factor': 17.694, 'score': 94},
|
|
161
|
+
{'journal': 'Natural Computing', 'factor': 1.504, 'score': 88}
|
|
162
|
+
]
|
|
163
|
+
```
|
|
164
|
+
This performs a fuzzy search with a threshold of 85. `threshold` defaults to 100 in which case an exact search
|
|
165
|
+
is performed. You can also search by journal abbreviation, [E-ISSN](https://portal.issn.org) or [NLM ID](https://portal.issn.org).
|
|
166
|
+
```py
|
|
167
|
+
i.search("Nat Rev Earth Environ") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
|
|
168
|
+
i.search("101771060") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
|
|
169
|
+
i.search('2662-138X') # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
|
|
170
|
+
|
|
171
|
+
# Filter results by impact factor
|
|
172
|
+
i.search("Neural network", threshold=85, min_impact=1.5, max_impact=20)
|
|
173
|
+
# [
|
|
174
|
+
# {'journal': 'IEEE Transactions on Neural Networks and Learning Systems', 'factor': 14.255, 'score': 93},
|
|
175
|
+
# {'journal': 'NEURAL NETWORKS', 'factor': 9.657, 'score': 91},
|
|
176
|
+
# {'journal': 'WORK-A Journal of Prevention Assessment & Rehabilitation', 'factor': 1.803, 'score': 86},
|
|
177
|
+
# {'journal': 'NETWORK-COMPUTATION IN NEURAL SYSTEMS', 'factor': 1.5, 'score': 92}
|
|
178
|
+
# ]
|
|
179
|
+
|
|
180
|
+
# Show all fields
|
|
181
|
+
i.search("quantum information", threshold=90, return_all=True)
|
|
182
|
+
# [
|
|
183
|
+
# {'factor': 10.758, 'jcr': 'Q1', 'journal_abbr': 'npj Quantum Inf', 'eissn': '2056-6387', 'journal': 'npj Quantum Information', 'nlm_id': '101722857', 'issn': '', 'score': 92},
|
|
184
|
+
# {'factor': 1.577, 'jcr': 'Q3', 'journal_abbr': 'Nation', 'eissn': '0027-8378', 'journal': 'NATION', 'nlm_id': '9877123', 'issn': '0027-8378', 'score': 91}
|
|
185
|
+
# ]
|
|
158
186
|
```
|
|
159
|
-
Then, `i.journal_to_if` should give you a dictionary wit journal to IF mappings for >9000 journals as of 2014.
|
|
160
187
|
|
|
161
188
|
### Plotting
|
|
162
189
|
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from impact_factor.core import Factor
|
|
6
|
+
from thefuzz import fuzz
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
logger.setLevel(logging.INFO)
|
|
10
|
+
logging.disable(logging.INFO)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Impactor:
    """Look up journal impact factors with exact or fuzzy matching.

    The journal table is loaded once at construction time and kept in
    ``self.metadata`` as an all-string DataFrame; ``search`` works on copies
    and never modifies it.
    """

    def __init__(self) -> None:
        """
        Initialize the Impactor class with an instance of the Factor class.
        This allows access to the database of journal impact factors.
        """
        self.fa = Factor()
        # "%" is passed as the query so that every journal is returned.
        self.all_journals = self.fa.search("%")
        # Everything is stored as str so that all columns can be string-matched.
        self.metadata = pd.DataFrame(self.all_journals, dtype=str)
        logger.info(f"Loaded metadata for {len(self.metadata)} journals")

    def search(
        self,
        query: str,
        threshold: int = 100,
        sort_by: Optional[str] = None,
        min_impact: float = 0.0,
        max_impact: float = float("inf"),
        return_all: bool = False,
    ) -> List[Dict[str, Any]]:
        """
        Search for journals matching the given query with an optional fuzziness
        level and sorting.

        Args:
            query: The journal name or abbreviation to search for.
            threshold: The threshold for fuzzy matching. If set to 100, exact matching
                is performed. If set below 100, fuzzy matching is used. Defaults to 100.
            sort_by: Criterion for sorting results, one of 'impact', 'journal' and 'score'.
                Any other value (including None) leaves the database order untouched.
            min_impact: Minimum impact factor for journals to be considered, defaults to 0.
            max_impact: Maximum impact factor for journals to be considered, defaults to infinity.
            return_all: If True, returns all columns of the DataFrame for each match.

        Returns:
            List[dict]: A list of dictionaries containing the journal information.

        Raises:
            TypeError: If `query` is not a str or `threshold` is not an int.
            ValueError: If `threshold` is outside the range [0, 100].
        """
        # Validation of parameters
        if not isinstance(query, str) or not isinstance(threshold, int):
            raise TypeError(
                f"Query must be a str and threshold must be an int, not {type(query)} and {type(threshold)}"
            )
        if threshold < 0 or threshold > 100:
            raise ValueError(
                f"Fuzziness threshold must be between 0 and 100, not {threshold}"
            )

        if str.isdigit(query) and threshold >= 100:
            # When querying with NLM ID, exact matching does not work since impact_factor
            # strips off leading zeros, so we use fuzzy matching instead
            threshold = 99

        # Search with or without fuzzy matching
        if threshold >= 100:
            needle = query.lower()
            # A row matches when any of its cells equals the query (case-insensitive).
            exact_mask = self.metadata.apply(
                lambda row: needle in row.astype(str).str.lower().values, axis=1
            )
            matched_df = self.metadata[exact_mask].copy()
            # Exact matches get a default score of 100
            matched_df["score"] = 100
        else:

            def calculate_fuzziness_score(row):
                # Best partial-ratio of the query against any cell of the row.
                return max(fuzz.partial_ratio(query, str(value)) for value in row.values)

            # Score every row exactly once; the same scores drive both the
            # filtering and the reported "score" column.
            scores = self.metadata.apply(calculate_fuzziness_score, axis=1)
            fuzzy_mask = scores >= threshold
            matched_df = self.metadata[fuzzy_mask].copy()
            matched_df["score"] = scores[fuzzy_mask]

        # The factor column is stored as text; convert it BEFORE sorting so that
        # 'impact' ordering is numeric ("17.694" > "9.657") and not lexicographic.
        matched_df["factor"] = pd.to_numeric(matched_df["factor"])

        # Sorting based on the specified criterion
        if sort_by == "score":
            matched_df = matched_df.sort_values(by="score", ascending=False)
        elif sort_by == "journal":
            matched_df = matched_df.sort_values(by="journal")
        elif sort_by == "impact":
            matched_df = matched_df.sort_values(by="factor", ascending=False)

        # Keep only journals whose impact factor lies in [min_impact, max_impact]
        matched_df = matched_df[
            (matched_df["factor"] >= min_impact) & (matched_df["factor"] <= max_impact)
        ]

        # Prepare the final result
        results = [
            row.to_dict()
            if return_all
            else {
                "journal": row["journal"],
                "factor": row["factor"],
                "score": row["score"],
            }
            for _, row in matched_df.iterrows()
        ]

        return results
|
|
File without changes
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from paperscraper.impact import Impactor
|
|
6
|
+
|
|
7
|
+
logging.disable(logging.INFO)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestImpactor:
    """Tests for ``Impactor.search`` run against the real journal database."""

    @pytest.fixture(scope="class")
    def impactor(self):
        """Build a single Impactor for the whole class.

        Constructing an Impactor loads the full journal table and ``search``
        only works on copies of it, so sharing one instance is safe.
        """
        return Impactor()

    def test_basic_search(self, impactor: Impactor):
        """A search returns at least one hit and each hit has the basic keys."""
        results = impactor.search("Nat Comm", threshold=99, sort_by="score")
        assert len(results) > 0  # Ensure we get some results
        assert all(
            "journal" in r and "factor" in r and "score" in r for r in results
        )  # Basic fields are present

    def test_fuzzy_search(self, impactor: Impactor):
        """A fuzzy query on an abbreviation finds the expected journal."""
        results = impactor.search("Nat Comm", threshold=99)
        assert any(
            r["journal"] == "Nature Communications" for r in results
        )  # Check for a specific journal

    def test_sort_by_score(self, impactor: Impactor):
        """``sort_by='score'`` yields scores in descending order."""
        results = impactor.search("nature chem", threshold=80, sort_by="score")
        scores = [r["score"] for r in results]
        assert scores == sorted(
            scores, reverse=True
        )  # Ensure results are sorted by score

    def test_impact_factor_filtering(self, impactor: Impactor):
        """``min_impact`` drops every journal below the given factor."""
        results = impactor.search("Quantum information", threshold=70, min_impact=8)
        assert all(
            8 <= r["factor"] for r in results
        )  # Check if all results have a factor >= 8

    def test_return_all_fields(self, impactor: Impactor):
        """``return_all=True`` returns more than the three basic fields."""
        results = impactor.search("nature chem", return_all=True)
        assert all(
            len(r) > 3 for r in results
        )  # Check if more than the basic fields are returned

    def test_quantum_information_search(self, impactor):
        """Pin the exact hits, order and scores for a known query."""
        expected_results = [
            {"journal": "InfoMat", "factor": 24.798, "score": 71},
            {"journal": "Information Fusion", "factor": 17.564, "score": 71},
            {"journal": "npj Quantum Information", "factor": 10.758, "score": 95},
        ]

        # 'impact' is the documented sort key; the previous value 'factor' is
        # not recognised by search() and was silently ignored.
        results = impactor.search(
            "Quantum information", threshold=70, sort_by="impact", min_impact=8
        )

        # Ensure that the results match the expected results
        assert len(results) == len(expected_results), "Number of results does not match"
        for expected, actual in zip(expected_results, results):
            assert (
                expected["journal"] == actual["journal"]
            ), f"Journal name does not match for {expected['journal']}"
            assert (
                abs(expected["factor"] - actual["factor"]) < 0.001
            ), f"Impact factor does not match for {expected['journal']}"
            assert (
                expected["score"] == actual["score"]
            ), f"Score does not match for {expected['journal']}"
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: paperscraper
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.10
|
|
4
4
|
Summary: paperscraper: Package to scrape papers.
|
|
5
5
|
Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
|
|
6
6
|
Author: Jannis Born, Matteo Manica
|
|
7
7
|
Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
|
|
8
8
|
License: MIT
|
|
9
|
-
Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
|
|
9
|
+
Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv,Google Scholar
|
|
10
10
|
Classifier: Development Status :: 3 - Alpha
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -27,6 +27,9 @@ Requires-Dist: seaborn
|
|
|
27
27
|
Requires-Dist: matplotlib
|
|
28
28
|
Requires-Dist: matplotlib_venn
|
|
29
29
|
Requires-Dist: bs4
|
|
30
|
+
Requires-Dist: impact-factor>=1.1.0
|
|
31
|
+
Requires-Dist: thefuzz
|
|
32
|
+
Requires-Dist: pytest
|
|
30
33
|
|
|
31
34
|
[](https://github.com/PhosphorylatedRabbits/paperscraper/actions/workflows/build.yml)
|
|
32
35
|
[![License:
|
|
@@ -179,14 +182,41 @@ get_citations_from_title(title)
|
|
|
179
182
|
*NOTE*: The scholar endpoint does not require authentication but since it regularly
|
|
180
183
|
prompts with captchas, it's difficult to apply at large scale.
|
|
181
184
|
|
|
182
|
-
|
|
185
|
+
### Journal impact factor
|
|
183
186
|
|
|
184
|
-
You can also retrieve the impact factor for all journals
|
|
187
|
+
You can also retrieve the impact factor for all journals:
|
|
185
188
|
```py
|
|
186
|
-
from paperscraper.
|
|
187
|
-
i = Impactor()
|
|
189
|
+
>>> from paperscraper.impact import Impactor
|
|
190
|
+
>>> i = Impactor()
|
|
191
|
+
>>> i.search("Nat Comms", threshold=85, sort_by='impact')
|
|
192
|
+
[
|
|
193
|
+
{'journal': 'Nature Communications', 'factor': 17.694, 'score': 94},
|
|
194
|
+
{'journal': 'Natural Computing', 'factor': 1.504, 'score': 88}
|
|
195
|
+
]
|
|
196
|
+
```
|
|
197
|
+
This performs a fuzzy search with a threshold of 85. `threshold` defaults to 100 in which case an exact search
|
|
198
|
+
is performed. You can also search by journal abbreviation, [E-ISSN](https://portal.issn.org) or [NLM ID](https://portal.issn.org).
|
|
199
|
+
```py
|
|
200
|
+
i.search("Nat Rev Earth Environ") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
|
|
201
|
+
i.search("101771060") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
|
|
202
|
+
i.search('2662-138X') # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
|
|
203
|
+
|
|
204
|
+
# Filter results by impact factor
|
|
205
|
+
i.search("Neural network", threshold=85, min_impact=1.5, max_impact=20)
|
|
206
|
+
# [
|
|
207
|
+
# {'journal': 'IEEE Transactions on Neural Networks and Learning Systems', 'factor': 14.255, 'score': 93},
|
|
208
|
+
# {'journal': 'NEURAL NETWORKS', 'factor': 9.657, 'score': 91},
|
|
209
|
+
# {'journal': 'WORK-A Journal of Prevention Assessment & Rehabilitation', 'factor': 1.803, 'score': 86},
|
|
210
|
+
# {'journal': 'NETWORK-COMPUTATION IN NEURAL SYSTEMS', 'factor': 1.5, 'score': 92}
|
|
211
|
+
# ]
|
|
212
|
+
|
|
213
|
+
# Show all fields
|
|
214
|
+
i.search("quantum information", threshold=90, return_all=True)
|
|
215
|
+
# [
|
|
216
|
+
# {'factor': 10.758, 'jcr': 'Q1', 'journal_abbr': 'npj Quantum Inf', 'eissn': '2056-6387', 'journal': 'npj Quantum Information', 'nlm_id': '101722857', 'issn': '', 'score': 92},
|
|
217
|
+
# {'factor': 1.577, 'jcr': 'Q3', 'journal_abbr': 'Nation', 'eissn': '0027-8378', 'journal': 'NATION', 'nlm_id': '9877123', 'issn': '0027-8378', 'score': 91}
|
|
218
|
+
# ]
|
|
188
219
|
```
|
|
189
|
-
Then, `i.journal_to_if` should give you a dictionary wit journal to IF mappings for >9000 journals as of 2014.
|
|
190
220
|
|
|
191
221
|
### Plotting
|
|
192
222
|
|
|
@@ -2,7 +2,7 @@ LICENSE
|
|
|
2
2
|
README.md
|
|
3
3
|
setup.py
|
|
4
4
|
paperscraper/__init__.py
|
|
5
|
-
paperscraper/
|
|
5
|
+
paperscraper/impact.py
|
|
6
6
|
paperscraper/load_dumps.py
|
|
7
7
|
paperscraper/pdf.py
|
|
8
8
|
paperscraper/plotting.py
|
|
@@ -31,6 +31,8 @@ paperscraper/pubmed/utils.py
|
|
|
31
31
|
paperscraper/scholar/__init__.py
|
|
32
32
|
paperscraper/scholar/scholar.py
|
|
33
33
|
paperscraper/server_dumps/__init__.py
|
|
34
|
+
paperscraper/tests/__init__.py
|
|
35
|
+
paperscraper/tests/test_impactor.py
|
|
34
36
|
paperscraper/xrxiv/__init__.py
|
|
35
37
|
paperscraper/xrxiv/xrxiv_api.py
|
|
36
38
|
paperscraper/xrxiv/xrxiv_query.py
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
"""Install package."""
|
|
2
|
-
import os
|
|
3
|
-
from setuptools import setup
|
|
4
|
-
from setuptools import find_packages
|
|
5
2
|
import io
|
|
3
|
+
import os
|
|
6
4
|
import re
|
|
7
5
|
|
|
6
|
+
from setuptools import find_packages, setup
|
|
7
|
+
|
|
8
8
|
__version__ = re.search(
|
|
9
9
|
r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
|
|
10
10
|
io.open("paperscraper/__init__.py", encoding="utf_8_sig").read(),
|
|
@@ -36,6 +36,9 @@ setup(
|
|
|
36
36
|
"matplotlib",
|
|
37
37
|
"matplotlib_venn",
|
|
38
38
|
"bs4",
|
|
39
|
+
"impact-factor>=1.1.0",
|
|
40
|
+
"thefuzz",
|
|
41
|
+
"pytest",
|
|
39
42
|
],
|
|
40
43
|
keywords=[
|
|
41
44
|
"Academics",
|
|
@@ -47,6 +50,7 @@ setup(
|
|
|
47
50
|
"Medrxiv",
|
|
48
51
|
"Biorxiv",
|
|
49
52
|
"Chemrxiv",
|
|
53
|
+
"Google Scholar",
|
|
50
54
|
],
|
|
51
55
|
packages=find_packages("."),
|
|
52
56
|
package_data={"paperscraper.server_dumps": ["*"]},
|
|
@@ -1,155 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Class to fetch the impact factor of all citefactor-indexed journals.
|
|
3
|
-
Limitation: Fetches the 2014 IFs.
|
|
4
|
-
|
|
5
|
-
Adapted from: https://github.com/andrew-hill/impactor/blob/master/impactor.py
|
|
6
|
-
Available via MIT License.
|
|
7
|
-
|
|
8
|
-
Adaptions:
|
|
9
|
-
- Converting code from Python2 to Python3.
|
|
10
|
-
- Fetching IFs from *all* journals not just from journals starting with "A".
|
|
11
|
-
|
|
12
|
-
"""
|
|
13
|
-
|
|
14
|
-
import logging
|
|
15
|
-
import pickle
|
|
16
|
-
import re
|
|
17
|
-
import string
|
|
18
|
-
from urllib.request import urlopen
|
|
19
|
-
|
|
20
|
-
# http://www.crummy.com/software/BeautifulSoup/
|
|
21
|
-
from bs4 import BeautifulSoup
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class Impactor(object):
    """
    Class to fetch the impact factor of all citefactor-indexed journals as of 2014.

    On construction the journal table is loaded (from a pickle cache if one is
    given and readable, otherwise scraped from citefactor.org), written back to
    the cache, and condensed into ``journal_to_if``.
    """

    BASE_URL_PREFIX = r"http://www.citefactor.org/journal-impact-factor-list-"
    BASE_URL_SUFFIX = r".html"
    URL_REGEX_PREFIX = r"http://www\.citefactor\.org/journal-impact-factor-list-"
    URL_REGEX_SUFFIX = r"_?[A-Z]?\.html"

    def __init__(self, journal_db_file=None, year=2014):
        """
        Args:
            journal_db_file: Optional path of a pickle file used as a cache for
                the scraped journal table.
            year: Year of the impact factor list; only 2014 is accepted.
        """
        logging.debug("journal_db_file={}, year={}".format(journal_db_file, year))

        self.journal_data = None
        self.journal_db_file = journal_db_file
        self.matches = set()
        self.year = year

        assert year in (2014,), "Can only handle 2014 at the moment."
        self.base_url = self.BASE_URL_PREFIX + str(year) + self.BASE_URL_SUFFIX
        self.url_regex = self.URL_REGEX_PREFIX + str(year) + self.URL_REGEX_SUFFIX
        self.re = re.compile(self.url_regex)
        self.load()
        self.save()
        self.create_if_dict()

    def match(self, search_terms):
        """
        Add the ISSNs of all journals matching ``search_terms`` to ``self.matches``.

        Args:
            search_terms: Iterable of strings. A term that starts with an ISSN
                pattern (``dddd-dddd``) is added verbatim; any other term is
                matched as a case-insensitive substring of the journal name.
                ``None`` or an empty iterable selects every journal.
        """
        # If no terms specified, show all entries
        if search_terms is None or len(search_terms) == 0:
            for j in self.journal_data.values():
                self.matches.add(j["ISSN"])
            # Nothing left to search for (and None is not iterable).
            return
        # Otherwise do search
        issn_re = re.compile(r"\d{4}-\d{4}")
        for s in search_terms:
            if issn_re.match(s):
                self.matches.add(s)
            else:
                needle = s.lower()
                for j in self.journal_data.values():
                    if needle in j["JOURNAL"].lower():
                        self.matches.add(j["ISSN"])

    def load(self):
        """Populate ``self.journal_data`` from the cache file or, failing that, the web."""
        # Try to load from file
        if self.journal_db_file is not None:
            try:
                # NOTE(review): pickle.load executes arbitrary code from the file;
                # only point journal_db_file at a trusted, locally written cache.
                with open(self.journal_db_file, "rb") as f:
                    self.journal_data = pickle.load(f)
                    logging.debug(
                        "loaded journals from {}".format(self.journal_db_file)
                    )
            except Exception:
                # Best effort: a missing or unreadable cache falls back to the web.
                logging.debug(
                    "could not load journals from {}".format(self.journal_db_file),
                    exc_info=True,
                )
        # If cannot load from file, load from URL
        if self.journal_data is None:
            logging.info("Fetching database from citefactor.org...")
            self.journal_data = self.get_all_journal_data()

    def save(self):
        """Write ``self.journal_data`` to the cache file, if one was configured."""
        if self.journal_db_file is not None:
            try:
                with open(self.journal_db_file, "wb") as f:
                    pickle.dump(self.journal_data, f, -1)
                    logging.debug("saved journals to {}".format(self.journal_db_file))
            except Exception:
                # Best effort: failing to write the cache must not abort the run.
                logging.debug(
                    "could not save journals to {}".format(self.journal_db_file),
                    exc_info=True,
                )

    def get_all_urls(self):
        """Return the base URL followed by every linked URL matching ``self.re``."""
        with urlopen(self.base_url) as response:
            main_page_content = response.read()
        soup = BeautifulSoup(main_page_content)
        return [
            self.base_url,
        ] + [anchor["href"] for anchor in soup.find_all("a", href=self.re)]

    def get_journal_table(self, url):
        """
        Return the first table at ``url`` whose caption starts with
        "Impact Factor <year>", or None if no such element is found.
        """
        with urlopen(url) as response:
            content = response.read()
        soup = BeautifulSoup(content)
        t = soup.table
        # The caption carries the configured year (restricted to 2014 in __init__).
        caption_re = re.compile(r"^Impact Factor " + str(self.year))
        while t is not None:
            if (
                t.caption is None
                or t.caption.string is None
                or caption_re.match(t.caption.string) is None
            ):
                t = t.find_next()
                continue
            return t

    def get_table_headers(self, table):
        """Return the cell texts of the table's first row as column headers."""
        return [str(x.string) for x in table.tr.find_all("td")]

    def get_journal_data(self, table):
        """Return a dict mapping ISSN to a {header: cell text} dict per table row."""
        headers = self.get_table_headers(table)
        journals = dict()
        for row in table.find_all("tr")[1:]:
            cells = row.find_all("td")
            j = dict(zip(headers, [str(x.string) for x in cells]))
            journals[j["ISSN"]] = j
        return journals

    def get_all_journal_data(self):
        """Scrape every per-letter page and merge the rows into one ISSN-keyed dict."""
        journals = dict()
        for url in self.get_all_urls():
            # The URL does not change inside the inner loop, so split it once.
            url_parts = url.split("2014")
            for page in string.ascii_uppercase:
                # The first page is named "0-A" rather than "A"
                page = "0-A" if page == "A" else page
                url_page = url_parts[0] + "2014_" + page + url_parts[1]
                table = self.get_journal_table(url_page)
                journals.update(self.get_journal_data(table))
        logging.info(
            "imported {} journal entries from citefactor.org".format(len(journals))
        )
        return journals

    def create_if_dict(self):
        """
        Creates a dictionary with journal names as key (lowercase) and impact factors
        as values.
        """

        def stringparse(x):
            # Normalise a journal name: trim, lowercase, backslash/space -> "_"
            return str(x).strip().lower().replace("\\", "_").replace(" ", "_")

        self.journal_to_if = {
            stringparse(value["JOURNAL"]): value["2013/2014"]
            for value in self.journal_data.values()
        }
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/chemrxiv/__init__.py
RENAMED
|
File without changes
|
{paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|