paperscraper 0.2.8__tar.gz → 0.2.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {paperscraper-0.2.8 → paperscraper-0.2.10}/PKG-INFO +37 -7
  2. {paperscraper-0.2.8 → paperscraper-0.2.10}/README.md +32 -5
  3. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/__init__.py +1 -1
  4. paperscraper-0.2.10/paperscraper/impact.py +111 -0
  5. paperscraper-0.2.10/paperscraper/tests/__init__.py +0 -0
  6. paperscraper-0.2.10/paperscraper/tests/test_impactor.py +69 -0
  7. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/PKG-INFO +37 -7
  8. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/SOURCES.txt +3 -1
  9. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/requires.txt +3 -0
  10. {paperscraper-0.2.8 → paperscraper-0.2.10}/setup.py +7 -3
  11. paperscraper-0.2.8/paperscraper/journal_if.py +0 -155
  12. {paperscraper-0.2.8 → paperscraper-0.2.10}/LICENSE +0 -0
  13. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/arxiv/__init__.py +0 -0
  14. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/arxiv/arxiv.py +0 -0
  15. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/arxiv/utils.py +0 -0
  16. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/__init__.py +0 -0
  17. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/biorxiv.py +0 -0
  18. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/chemrxiv.py +0 -0
  19. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/medrxiv.py +0 -0
  20. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/__init__.py +0 -0
  21. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
  22. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -0
  23. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/get_dumps/utils/chemrxiv/utils.py +0 -0
  24. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/load_dumps.py +0 -0
  25. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/pdf.py +0 -0
  26. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/plotting.py +0 -0
  27. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/postprocessing.py +0 -0
  28. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/pubmed/__init__.py +0 -0
  29. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/pubmed/pubmed.py +0 -0
  30. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/pubmed/utils.py +0 -0
  31. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/scholar/__init__.py +0 -0
  32. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/scholar/scholar.py +0 -0
  33. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/server_dumps/__init__.py +0 -0
  34. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/utils.py +0 -0
  35. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/xrxiv/__init__.py +0 -0
  36. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/xrxiv/xrxiv_api.py +0 -0
  37. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper/xrxiv/xrxiv_query.py +0 -0
  38. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/dependency_links.txt +0 -0
  39. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/not-zip-safe +0 -0
  40. {paperscraper-0.2.8 → paperscraper-0.2.10}/paperscraper.egg-info/top_level.txt +0 -0
  41. {paperscraper-0.2.8 → paperscraper-0.2.10}/setup.cfg +0 -0
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: paperscraper
3
- Version: 0.2.8
3
+ Version: 0.2.10
4
4
  Summary: paperscraper: Package to scrape papers.
5
5
  Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
6
6
  Author: Jannis Born, Matteo Manica
7
7
  Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
8
8
  License: MIT
9
- Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
9
+ Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv,Google Scholar
10
10
  Classifier: Development Status :: 3 - Alpha
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: Intended Audience :: Science/Research
@@ -27,6 +27,9 @@ Requires-Dist: seaborn
27
27
  Requires-Dist: matplotlib
28
28
  Requires-Dist: matplotlib_venn
29
29
  Requires-Dist: bs4
30
+ Requires-Dist: impact-factor>=1.1.0
31
+ Requires-Dist: thefuzz
32
+ Requires-Dist: pytest
30
33
 
31
34
  [![build](https://github.com/PhosphorylatedRabbits/paperscraper/actions/workflows/build.yml/badge.svg)](https://github.com/PhosphorylatedRabbits/paperscraper/actions/workflows/build.yml)
32
35
  [![License:
@@ -179,14 +182,41 @@ get_citations_from_title(title)
179
182
  *NOTE*: The scholar endpoint does not require authentication but since it regularly
180
183
  prompts with captchas, it's difficult to apply large scale.
181
184
 
182
- #### Journal impact factor
185
+ ### Journal impact factor
183
186
 
184
- You can also retrieve the impact factor for all journals indexed by citefactor:
187
+ You can also retrieve the impact factor for all journals:
185
188
  ```py
186
- from paperscraper.journal_if import Impactor
187
- i = Impactor()
189
+ >>> from paperscraper.impact import Impactor
190
+ >>> i = Impactor()
191
+ >>> i.search("Nat Comms", threshold=85, sort_by='impact')
192
+ [
193
+ {'journal': 'Nature Communications', 'factor': 17.694, 'score': 94},
194
+ {'journal': 'Natural Computing', 'factor': 1.504, 'score': 88}
195
+ ]
196
+ ```
197
+ This performs a fuzzy search with a threshold of 85. `threshold` defaults to 100 in which case an exact search
198
+ is performed. You can also search by journal abbreviation, [E-ISSN](https://portal.issn.org) or [NLM ID](https://portal.issn.org).
199
+ ```py
200
+ i.search("Nat Rev Earth Environ") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
201
+ i.search("101771060") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
202
+ i.search('2662-138X') # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
203
+
204
+ # Filter results by impact factor
205
+ i.search("Neural network", threshold=85, min_impact=1.5, max_impact=20)
206
+ # [
207
+ # {'journal': 'IEEE Transactions on Neural Networks and Learning Systems', 'factor': 14.255, 'score': 93},
208
+ # {'journal': 'NEURAL NETWORKS', 'factor': 9.657, 'score': 91},
209
+ # {'journal': 'WORK-A Journal of Prevention Assessment & Rehabilitation', 'factor': 1.803, 'score': 86},
210
+ # {'journal': 'NETWORK-COMPUTATION IN NEURAL SYSTEMS', 'factor': 1.5, 'score': 92}
211
+ # ]
212
+
213
+ # Show all fields
214
+ i.search("quantum information", threshold=90, return_all=True)
215
+ # [
216
+ # {'factor': 10.758, 'jcr': 'Q1', 'journal_abbr': 'npj Quantum Inf', 'eissn': '2056-6387', 'journal': 'npj Quantum Information', 'nlm_id': '101722857', 'issn': '', 'score': 92},
217
+ # {'factor': 1.577, 'jcr': 'Q3', 'journal_abbr': 'Nation', 'eissn': '0027-8378', 'journal': 'NATION', 'nlm_id': '9877123', 'issn': '0027-8378', 'score': 91}
218
+ # ]
188
219
  ```
189
- Then, `i.journal_to_if` should give you a dictionary wit journal to IF mappings for >9000 journals as of 2014.
190
220
 
191
221
  ### Plotting
192
222
 
@@ -149,14 +149,41 @@ get_citations_from_title(title)
149
149
  *NOTE*: The scholar endpoint does not require authentication but since it regularly
150
150
  prompts with captchas, it's difficult to apply large scale.
151
151
 
152
- #### Journal impact factor
152
+ ### Journal impact factor
153
153
 
154
- You can also retrieve the impact factor for all journals indexed by citefactor:
154
+ You can also retrieve the impact factor for all journals:
155
155
  ```py
156
- from paperscraper.journal_if import Impactor
157
- i = Impactor()
156
+ >>> from paperscraper.impact import Impactor
157
+ >>> i = Impactor()
158
+ >>> i.search("Nat Comms", threshold=85, sort_by='impact')
159
+ [
160
+ {'journal': 'Nature Communications', 'factor': 17.694, 'score': 94},
161
+ {'journal': 'Natural Computing', 'factor': 1.504, 'score': 88}
162
+ ]
163
+ ```
164
+ This performs a fuzzy search with a threshold of 85. `threshold` defaults to 100 in which case an exact search
165
+ is performed. You can also search by journal abbreviation, [E-ISSN](https://portal.issn.org) or [NLM ID](https://portal.issn.org).
166
+ ```py
167
+ i.search("Nat Rev Earth Environ") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
168
+ i.search("101771060") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
169
+ i.search('2662-138X') # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
170
+
171
+ # Filter results by impact factor
172
+ i.search("Neural network", threshold=85, min_impact=1.5, max_impact=20)
173
+ # [
174
+ # {'journal': 'IEEE Transactions on Neural Networks and Learning Systems', 'factor': 14.255, 'score': 93},
175
+ # {'journal': 'NEURAL NETWORKS', 'factor': 9.657, 'score': 91},
176
+ # {'journal': 'WORK-A Journal of Prevention Assessment & Rehabilitation', 'factor': 1.803, 'score': 86},
177
+ # {'journal': 'NETWORK-COMPUTATION IN NEURAL SYSTEMS', 'factor': 1.5, 'score': 92}
178
+ # ]
179
+
180
+ # Show all fields
181
+ i.search("quantum information", threshold=90, return_all=True)
182
+ # [
183
+ # {'factor': 10.758, 'jcr': 'Q1', 'journal_abbr': 'npj Quantum Inf', 'eissn': '2056-6387', 'journal': 'npj Quantum Information', 'nlm_id': '101722857', 'issn': '', 'score': 92},
184
+ # {'factor': 1.577, 'jcr': 'Q3', 'journal_abbr': 'Nation', 'eissn': '0027-8378', 'journal': 'NATION', 'nlm_id': '9877123', 'issn': '0027-8378', 'score': 91}
185
+ # ]
158
186
  ```
159
- Then, `i.journal_to_if` should give you a dictionary wit journal to IF mappings for >9000 journals as of 2014.
160
187
 
161
188
  ### Plotting
162
189
 
@@ -1,6 +1,6 @@
1
1
  """Initialize the module."""
2
2
  __name__ = "paperscraper"
3
- __version__ = "0.2.8"
3
+ __version__ = "0.2.10"
4
4
 
5
5
  import logging
6
6
  import os
@@ -0,0 +1,111 @@
1
+ import logging
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import pandas as pd
5
+ from impact_factor.core import Factor
6
+ from thefuzz import fuzz
7
+
8
+ logger = logging.getLogger(__name__)
9
+ logger.setLevel(logging.INFO)
10
+ logging.disable(logging.INFO)
11
+
12
+
13
class Impactor:
    """Search journal impact factors by name, abbreviation or identifier.

    The journal records are loaded once at construction time into
    ``self.metadata``, a string-typed ``pandas.DataFrame`` with one row per
    journal, and every call to :meth:`search` scans that table.
    """

    def __init__(self):
        """
        Initialize the Impactor class with an instance of the Factor class.
        This allows access to the database of journal impact factors.
        """
        self.fa = Factor()
        # NOTE(review): "%" is presumably a match-all wildcard for
        # Factor.search -- confirm against the impact_factor documentation.
        self.all_journals = self.fa.search("%")
        # Everything is kept as str so that matching can compare plain text.
        self.metadata = pd.DataFrame(self.all_journals, dtype=str)
        logger.info(f"Loaded metadata for {len(self.metadata)} journals")

    def search(
        self,
        query: str,
        threshold: int = 100,
        sort_by: Optional[str] = None,
        min_impact: float = 0.0,
        max_impact: float = float("inf"),
        return_all: bool = False,
    ) -> List[Dict[str, Any]]:
        """
        Search for journals matching the given query with an optional fuzziness
        level and sorting.

        Args:
            query: The journal name or abbreviation to search for.
            threshold: The threshold for fuzzy matching. If set to 100, exact matching
                is performed. If set below 100, fuzzy matching is used. Defaults to 100.
            sort_by: Criterion for sorting results, one of 'impact', 'journal' and 'score'.
                Any other value (including None) leaves the results in table order.
            min_impact: Minimum impact factor for journals to be considered, defaults to 0.
            max_impact: Maximum impact factor for journals to be considered, defaults to infinity.
            return_all: If True, returns all columns of the DataFrame for each match.

        Returns:
            List[dict]: A list of dictionaries containing the journal information.

        Raises:
            TypeError: If `query` is not a str or `threshold` is not an int.
            ValueError: If `threshold` is outside the range 0..100.
        """
        # Validation of parameters
        if not isinstance(query, str) or not isinstance(threshold, int):
            raise TypeError(
                f"Query must be a str and threshold must be an int, not {type(query)} and {type(threshold)}"
            )
        if threshold < 0 or threshold > 100:
            raise ValueError(
                f"Fuzziness threshold must be between 0 and 100, not {threshold}"
            )

        if query.isdigit() and threshold >= 100:
            # When querying with NLM ID, exact matching does not work since impact_factor
            # strips off leading zeros, so we use fuzzy matching instead
            threshold = 99

        rows = list(self.metadata.itertuples(index=False, name=None))

        if threshold >= 100:
            # Exact search: some field must equal the query, ignoring case.
            needle = query.lower()
            hits = [any(str(value).lower() == needle for value in row) for row in rows]
            # Exact matches get a default score of 100
            scores = [100] * len(rows)
        else:
            # Fuzzy search: a row's score is its best-matching field. Each row
            # is scored once and the same score drives filter and output.
            scores = [
                max(fuzz.partial_ratio(query, str(value)) for value in row)
                for row in rows
            ]
            hits = [score >= threshold for score in scores]

        # Positional boolean mask, so an unusual index cannot break selection.
        mask = pd.Series(hits, dtype=bool).to_numpy()
        matched_df = self.metadata.assign(score=scores).loc[mask]

        # Convert the impact factor to a number BEFORE filtering and sorting;
        # the table stores it as text, and sorting text ranks "9.657" above
        # "14.255".
        matched_df = matched_df.assign(factor=pd.to_numeric(matched_df["factor"]))
        within_range = (matched_df["factor"] >= min_impact) & (
            matched_df["factor"] <= max_impact
        )
        matched_df = matched_df[within_range]

        # Sorting based on the specified criterion
        if sort_by == "score":
            matched_df = matched_df.sort_values(by="score", ascending=False)
        elif sort_by == "journal":
            matched_df = matched_df.sort_values(by="journal")
        elif sort_by == "impact":
            matched_df = matched_df.sort_values(by="factor", ascending=False)

        # Prepare the final result
        if return_all:
            return [row.to_dict() for _, row in matched_df.iterrows()]
        return [
            {
                "journal": row["journal"],
                "factor": row["factor"],
                "score": row["score"],
            }
            for _, row in matched_df.iterrows()
        ]
File without changes
@@ -0,0 +1,69 @@
1
+ import logging
2
+
3
+ import pytest
4
+
5
+ from paperscraper.impact import Impactor
6
+
7
+ logging.disable(logging.INFO)
8
+
9
+
10
class TestImpactor:
    """Integration tests for ``Impactor.search`` against the real journal database."""

    @pytest.fixture(scope="class")
    def impactor(self):
        # Building an Impactor loads the whole journal table, and search()
        # does not modify it, so one instance is shared by all tests here.
        return Impactor()

    def test_basic_search(self, impactor: Impactor):
        results = impactor.search("Nat Comm", threshold=99, sort_by="score")
        assert len(results) > 0  # Ensure we get some results
        assert all(
            "journal" in r and "factor" in r and "score" in r for r in results
        )  # Basic fields are present

    def test_fuzzy_search(self, impactor: Impactor):
        results = impactor.search("Nat Comm", threshold=99)
        assert any(
            r["journal"] == "Nature Communications" for r in results
        )  # Check for a specific journal

    def test_sort_by_score(self, impactor: Impactor):
        results = impactor.search("nature chem", threshold=80, sort_by="score")
        scores = [r["score"] for r in results]
        assert scores == sorted(
            scores, reverse=True
        )  # Ensure results are sorted by score

    def test_impact_factor_filtering(self, impactor: Impactor):
        results = impactor.search("Quantum information", threshold=70, min_impact=8)
        assert all(
            8 <= r["factor"] for r in results
        )  # Check if all results have a factor >= 8

    def test_return_all_fields(self, impactor: Impactor):
        results = impactor.search("nature chem", return_all=True)
        assert all(
            len(r) > 3 for r in results
        )  # Check if more than the basic fields are returned

    def test_quantum_information_search(self, impactor: Impactor):
        # Expected matches, listed by descending impact factor.
        expected_results = [
            {"journal": "InfoMat", "factor": 24.798, "score": 71},
            {"journal": "Information Fusion", "factor": 17.564, "score": 71},
            {"journal": "npj Quantum Information", "factor": 10.758, "score": 95},
        ]

        # "impact" is the sort key that search() recognises; an unrecognised
        # key applies no sort and leaves the order to the database.
        results = impactor.search(
            "Quantum information", threshold=70, sort_by="impact", min_impact=8
        )

        # Ensure that the results match the expected results
        assert len(results) == len(expected_results), "Number of results does not match"
        for expected, actual in zip(expected_results, results):
            assert (
                expected["journal"] == actual["journal"]
            ), f"Journal name does not match for {expected['journal']}"
            assert (
                abs(expected["factor"] - actual["factor"]) < 0.001
            ), f"Impact factor does not match for {expected['journal']}"
            assert (
                expected["score"] == actual["score"]
            ), f"Score does not match for {expected['journal']}"
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: paperscraper
3
- Version: 0.2.8
3
+ Version: 0.2.10
4
4
  Summary: paperscraper: Package to scrape papers.
5
5
  Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
6
6
  Author: Jannis Born, Matteo Manica
7
7
  Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
8
8
  License: MIT
9
- Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
9
+ Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv,Google Scholar
10
10
  Classifier: Development Status :: 3 - Alpha
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: Intended Audience :: Science/Research
@@ -27,6 +27,9 @@ Requires-Dist: seaborn
27
27
  Requires-Dist: matplotlib
28
28
  Requires-Dist: matplotlib_venn
29
29
  Requires-Dist: bs4
30
+ Requires-Dist: impact-factor>=1.1.0
31
+ Requires-Dist: thefuzz
32
+ Requires-Dist: pytest
30
33
 
31
34
  [![build](https://github.com/PhosphorylatedRabbits/paperscraper/actions/workflows/build.yml/badge.svg)](https://github.com/PhosphorylatedRabbits/paperscraper/actions/workflows/build.yml)
32
35
  [![License:
@@ -179,14 +182,41 @@ get_citations_from_title(title)
179
182
  *NOTE*: The scholar endpoint does not require authentication but since it regularly
180
183
  prompts with captchas, it's difficult to apply large scale.
181
184
 
182
- #### Journal impact factor
185
+ ### Journal impact factor
183
186
 
184
- You can also retrieve the impact factor for all journals indexed by citefactor:
187
+ You can also retrieve the impact factor for all journals:
185
188
  ```py
186
- from paperscraper.journal_if import Impactor
187
- i = Impactor()
189
+ >>> from paperscraper.impact import Impactor
190
+ >>> i = Impactor()
191
+ >>> i.search("Nat Comms", threshold=85, sort_by='impact')
192
+ [
193
+ {'journal': 'Nature Communications', 'factor': 17.694, 'score': 94},
194
+ {'journal': 'Natural Computing', 'factor': 1.504, 'score': 88}
195
+ ]
196
+ ```
197
+ This performs a fuzzy search with a threshold of 85. `threshold` defaults to 100 in which case an exact search
198
+ is performed. You can also search by journal abbreviation, [E-ISSN](https://portal.issn.org) or [NLM ID](https://portal.issn.org).
199
+ ```py
200
+ i.search("Nat Rev Earth Environ") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
201
+ i.search("101771060") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
202
+ i.search('2662-138X') # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
203
+
204
+ # Filter results by impact factor
205
+ i.search("Neural network", threshold=85, min_impact=1.5, max_impact=20)
206
+ # [
207
+ # {'journal': 'IEEE Transactions on Neural Networks and Learning Systems', 'factor': 14.255, 'score': 93},
208
+ # {'journal': 'NEURAL NETWORKS', 'factor': 9.657, 'score': 91},
209
+ # {'journal': 'WORK-A Journal of Prevention Assessment & Rehabilitation', 'factor': 1.803, 'score': 86},
210
+ # {'journal': 'NETWORK-COMPUTATION IN NEURAL SYSTEMS', 'factor': 1.5, 'score': 92}
211
+ # ]
212
+
213
+ # Show all fields
214
+ i.search("quantum information", threshold=90, return_all=True)
215
+ # [
216
+ # {'factor': 10.758, 'jcr': 'Q1', 'journal_abbr': 'npj Quantum Inf', 'eissn': '2056-6387', 'journal': 'npj Quantum Information', 'nlm_id': '101722857', 'issn': '', 'score': 92},
217
+ # {'factor': 1.577, 'jcr': 'Q3', 'journal_abbr': 'Nation', 'eissn': '0027-8378', 'journal': 'NATION', 'nlm_id': '9877123', 'issn': '0027-8378', 'score': 91}
218
+ # ]
188
219
  ```
189
- Then, `i.journal_to_if` should give you a dictionary wit journal to IF mappings for >9000 journals as of 2014.
190
220
 
191
221
  ### Plotting
192
222
 
@@ -2,7 +2,7 @@ LICENSE
2
2
  README.md
3
3
  setup.py
4
4
  paperscraper/__init__.py
5
- paperscraper/journal_if.py
5
+ paperscraper/impact.py
6
6
  paperscraper/load_dumps.py
7
7
  paperscraper/pdf.py
8
8
  paperscraper/plotting.py
@@ -31,6 +31,8 @@ paperscraper/pubmed/utils.py
31
31
  paperscraper/scholar/__init__.py
32
32
  paperscraper/scholar/scholar.py
33
33
  paperscraper/server_dumps/__init__.py
34
+ paperscraper/tests/__init__.py
35
+ paperscraper/tests/test_impactor.py
34
36
  paperscraper/xrxiv/__init__.py
35
37
  paperscraper/xrxiv/xrxiv_api.py
36
38
  paperscraper/xrxiv/xrxiv_query.py
@@ -8,3 +8,6 @@ seaborn
8
8
  matplotlib
9
9
  matplotlib_venn
10
10
  bs4
11
+ impact-factor>=1.1.0
12
+ thefuzz
13
+ pytest
@@ -1,10 +1,10 @@
1
1
  """Install package."""
2
- import os
3
- from setuptools import setup
4
- from setuptools import find_packages
5
2
  import io
3
+ import os
6
4
  import re
7
5
 
6
+ from setuptools import find_packages, setup
7
+
8
8
  __version__ = re.search(
9
9
  r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
10
10
  io.open("paperscraper/__init__.py", encoding="utf_8_sig").read(),
@@ -36,6 +36,9 @@ setup(
36
36
  "matplotlib",
37
37
  "matplotlib_venn",
38
38
  "bs4",
39
+ "impact-factor>=1.1.0",
40
+ "thefuzz",
41
+ "pytest",
39
42
  ],
40
43
  keywords=[
41
44
  "Academics",
@@ -47,6 +50,7 @@ setup(
47
50
  "Medrxiv",
48
51
  "Biorxiv",
49
52
  "Chemrxiv",
53
+ "Google Scholar",
50
54
  ],
51
55
  packages=find_packages("."),
52
56
  package_data={"paperscraper.server_dumps": ["*"]},
@@ -1,155 +0,0 @@
1
- """
2
- Class to fetch the impact factor of all citefactor-indexed journals.
3
- Limitation: Fetches the 2014 IFs.
4
-
5
- Adapted from: https://github.com/andrew-hill/impactor/blob/master/impactor.py
6
- Available via MIT License.
7
-
8
- Adaptions:
9
- - Converting code from Python2 to Python3.
10
- - Fetching IFs from *all* journals not just from journals starting with "A".
11
-
12
- """
13
-
14
- import logging
15
- import pickle
16
- import re
17
- import string
18
- from urllib.request import urlopen
19
-
20
- # http://www.crummy.com/software/BeautifulSoup/
21
- from bs4 import BeautifulSoup
22
-
23
-
24
- class Impactor(object):
25
- """
26
- Class to fetch the impact factor of all citefactor-indexed journals as of 2014.
27
- """
28
-
29
- BASE_URL_PREFIX = r"http://www.citefactor.org/journal-impact-factor-list-"
30
- BASE_URL_SUFFIX = r".html"
31
- URL_REGEX_PREFIX = r"http://www\.citefactor\.org/journal-impact-factor-list-"
32
- URL_REGEX_SUFFIX = r"_?[A-Z]?\.html"
33
-
34
- def __init__(self, journal_db_file=None, year=2014):
35
- logging.debug("journal_db_file={}, year={}".format(journal_db_file, year))
36
-
37
- self.journal_data = None
38
- self.journal_db_file = journal_db_file
39
- self.matches = set()
40
- self.year = year
41
-
42
- assert year in (2014,), "Can only handle 2014 at the moment."
43
- self.base_url = self.BASE_URL_PREFIX + str(year) + self.BASE_URL_SUFFIX
44
- self.url_regex = self.URL_REGEX_PREFIX + str(year) + self.URL_REGEX_SUFFIX
45
- self.re = re.compile(self.url_regex)
46
- self.load()
47
- self.save()
48
- self.create_if_dict()
49
-
50
- def match(self, search_terms):
51
- # If no terms specified, show all entries
52
- if search_terms is None or len(search_terms) == 0:
53
- for j in self.journal_data.values():
54
- self.matches.add(j["ISSN"])
55
- # Otherwise do search
56
- issn_re = re.compile(r"\d{4}-\d{4}")
57
- for s in search_terms:
58
- if issn_re.match(s):
59
- self.matches.add(s)
60
- else:
61
- for j in self.journal_data.values():
62
- if j["JOURNAL"].lower().find(s.lower()) >= 0:
63
- self.matches.add(j["ISSN"])
64
-
65
- def load(self):
66
- # Try to load from file
67
- if self.journal_db_file is not None:
68
- try:
69
- with open(self.journal_db_file, "rb") as f:
70
- self.journal_data = pickle.load(f)
71
- logging.debug(
72
- "loaded journals from {}".format(self.journal_db_file)
73
- )
74
- except Exception:
75
- pass
76
- # If cannot load from file, load from URL
77
- if self.journal_data is None:
78
- logging.info("Fetching database from citefactor.org...")
79
- self.journal_data = self.get_all_journal_data()
80
-
81
- def save(self):
82
- if self.journal_db_file is not None:
83
- try:
84
- with open(self.journal_db_file, "wb") as f:
85
- pickle.dump(self.journal_data, f, -1)
86
- logging.debug("saved journals to {}".format(self.journal_db_file))
87
- except Exception:
88
- pass
89
-
90
- def get_all_urls(self):
91
- main_page_content = urlopen(self.base_url).read()
92
- soup = BeautifulSoup(main_page_content)
93
- soup.prettify() # necessary?
94
- return [
95
- self.base_url,
96
- ] + [anchor["href"] for anchor in soup.find_all("a", href=self.re)]
97
-
98
- def get_journal_table(self, url):
99
- content = urlopen(url).read()
100
- soup = BeautifulSoup(content)
101
- soup.prettify() # necessary?
102
- t = soup.table
103
- caption_re = re.compile(
104
- r"^Impact Factor " + str(self.year)
105
- ) # works for Year==2015 only
106
- while t is not None:
107
- if (
108
- t.caption is None
109
- or t.caption.string is None
110
- or caption_re.match(t.caption.string) is None
111
- ):
112
- t = t.find_next()
113
- continue
114
- return t
115
-
116
- def get_table_headers(self, table):
117
- return [str(x.string) for x in table.tr.find_all("td")]
118
-
119
- def get_journal_data(self, table):
120
- headers = self.get_table_headers(table)
121
- journals = dict()
122
- for row in table.find_all("tr")[1:]:
123
- cells = row.find_all("td")
124
- j = dict(zip(headers, [str(x.string) for x in cells]))
125
- # logging.debug('importing: {}'.format(j))
126
- journals[j["ISSN"]] = j
127
- return journals
128
-
129
- def get_all_journal_data(self):
130
- journals = dict()
131
- for url in self.get_all_urls():
132
-
133
- for page in string.ascii_uppercase:
134
- page = "0-A" if page == "A" else page
135
- url_page = url.split("2014")[0] + "2014_" + page + url.split("2014")[1]
136
- table = self.get_journal_table(url_page)
137
- journals.update(self.get_journal_data(table))
138
- logging.info(
139
- "imported {} journal entries from citefactor.org".format(len(journals))
140
- )
141
- return journals
142
-
143
- def create_if_dict(self):
144
- """
145
- Creates a dictionary with journal names as key (lowercase) and impact factors
146
- as values.
147
- """
148
-
149
- stringparse = (
150
- lambda x: str(x).strip().lower().replace("\\", "_").replace(" ", "_")
151
- )
152
- self.journal_to_if = dict(
153
- (stringparse(value["JOURNAL"]), value["2013/2014"])
154
- for key, value in self.journal_data.items()
155
- )
File without changes
File without changes