roagg 2025.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- roagg-2025.0.8/LICENSE +21 -0
- roagg-2025.0.8/PKG-INFO +6 -0
- roagg-2025.0.8/README.md +102 -0
- roagg-2025.0.8/pyproject.toml +48 -0
- roagg-2025.0.8/setup.cfg +4 -0
- roagg-2025.0.8/src/roagg/__init__.py +0 -0
- roagg-2025.0.8/src/roagg/__main__.py +3 -0
- roagg-2025.0.8/src/roagg/aggregator.py +69 -0
- roagg-2025.0.8/src/roagg/cli.py +91 -0
- roagg-2025.0.8/src/roagg/helpers/ror.py +14 -0
- roagg-2025.0.8/src/roagg/helpers/utils.py +65 -0
- roagg-2025.0.8/src/roagg/models/research_output_item.py +51 -0
- roagg-2025.0.8/src/roagg/providers/datacite.py +178 -0
- roagg-2025.0.8/src/roagg/providers/openaire.py +155 -0
- roagg-2025.0.8/src/roagg/providers/openalex.py +119 -0
- roagg-2025.0.8/src/roagg.egg-info/PKG-INFO +6 -0
- roagg-2025.0.8/src/roagg.egg-info/SOURCES.txt +19 -0
- roagg-2025.0.8/src/roagg.egg-info/dependency_links.txt +1 -0
- roagg-2025.0.8/src/roagg.egg-info/entry_points.txt +5 -0
- roagg-2025.0.8/src/roagg.egg-info/top_level.txt +1 -0
- roagg-2025.0.8/tests/test_utils.py +255 -0
roagg-2025.0.8/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Swedish National Data Service
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
roagg-2025.0.8/PKG-INFO
ADDED
roagg-2025.0.8/README.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Research output aggregator
|
|
2
|
+
> [!NOTE]
|
|
3
|
+
> This script is under development
|
|
4
|
+
|
|
5
|
+
The goal of this project is to create a script to get a summarization for a research organization about the research output.
|
|
6
|
+
First target is to query and process information from DataCite.
|
|
7
|
+
|
|
8
|
+
The goal for this script is to create a list over research output where an organization is mentioned as:
|
|
9
|
+
* publisher
|
|
10
|
+
* creator with affiliation to the organization
|
|
11
|
+
* contributor with affiliation to the organization
|
|
12
|
+
|
|
13
|
+
input: ROR-id and list of variants on the organization name.
|
|
14
|
+
|
|
15
|
+
Properties to collect for each research output:
|
|
16
|
+
|Field |Type |Comment |
|
|
17
|
+
|--------------------------------------|-------|---------------------------------------------------------------------------------------|
|
|
18
|
+
|publicationYear |integer|The year of publication, can be empty in some cases |
|
|
19
|
+
|resourceType |string |The resource type (free text string) |
|
|
20
|
+
|title |string |Title of the resource (first one if multiple) |
|
|
21
|
+
|publisher |string |Publisher (free text) |
|
|
22
|
+
|createdAt                             |string |Created date if available                                                                |
|
|
23
|
+
|updatedAt                             |string |Updated date if available                                                                |
|
|
24
|
+
|isPublisher                           |bool   |True if the publisher matches the requested organisation                                |
|
|
25
|
+
|isFunder |bool |True if the the funder match the requested organisation |
|
|
26
|
+
|haveCreatorAffiliation |bool |True if the the any creator match the requested organisation |
|
|
27
|
+
|haveContributorAffiliation |bool |True if the the any contributor match the requested organisation |
|
|
28
|
+
|isLatestVersion                       |bool   |True if the DataCite metadata indicates this being the latest version                   |
|
|
29
|
+
|isConceptDoi                          |bool   |True if the DataCite metadata indicates this being a concept DOI                        |
|
|
30
|
+
|matchPublisherRor |bool |True if the ROR id for publisher match the ROR in the provided argument |
|
|
31
|
+
|matchCreatorAffiliationRor |bool |True if the ROR id for a creator affiliation match the ROR in the provided argument |
|
|
32
|
+
|matchContributorAffiliationRor |bool |True if the ROR id for a contributor affiliation match the ROR in the provided argument|
|
|
33
|
+
|matchFunderRor |bool |True if the ROR id for funder match the ROR in the provided argument |
|
|
34
|
+
|matchPublisherName |bool |True if any of the names supplied matches the publisher name in the resource |
|
|
35
|
+
|matchCreatorName |bool |True if any of the names supplied matches the creator name in the resource |
|
|
36
|
+
|matchContributorName |bool |True if any of the names supplied matches the contributor name in the resource |
|
|
37
|
+
|matchFunderName |bool |True if any of the names supplied matches the funder name in the resource |
|
|
38
|
+
|inDataCite |bool |True if the DOI was matched in the DataCite |
|
|
39
|
+
|inOpenAire |bool |True if the DOI was matched in OpenAire |
|
|
40
|
+
|inOpenAlex |bool |True if the DOI was matched in OpenAlex |
|
|
41
|
+
|inCrossRef |bool |True if the DOI was matched in CrossRef |
|
|
42
|
+
|dataCiteClientId |string |The client id for the organisation minting the DOI |
|
|
43
|
+
|dataCiteClientName |string |The human readable name of the minting organisation |
|
|
44
|
+
|dataCiteCitationCount |integer|Citation count for the resource provided by the DataCite API |
|
|
45
|
+
|dataCiteReferenceCount |integer|Reference count for the resource provided by the DataCite API |
|
|
46
|
+
|dataCiteViewCount |integer|View count for the resource provided by the DataCite API |
|
|
47
|
+
|dataCiteDownloadCount |integer|Download count for the resource provided by the DataCite API |
|
|
48
|
+
|openAireBestAccessRight |string |Access Rights for the resource indicated indicated by the OpenAire API |
|
|
49
|
+
|openAireIndicatorsUsageCountsDownloads|integer|Download count for the resource indicated by the OpenAire API |
|
|
50
|
+
|openAireIndicatorsUsageCountsViews |integer|View count for the resource provided by the OpenAire API |
|
|
51
|
+
|openAireId |string |Id for the resource in OpenAire |
|
|
52
|
+
|openAlexId |string |Id for the resource in OpenAlex |
|
|
53
|
+
|openAlexCitedByCount |integer|Citation count for the resource provided by the OpenAlex API |
|
|
54
|
+
|openAlexReferencedWorksCount |integer|Reference count for the resource provided by the OpenAlex API |
|
|
55
|
+
|titleWordCount |integer|Number of words in the title (useful for sorting in some cases) |
|
|
56
|
+
|referencedByDoi |string | |
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
`pip install .`
|
|
61
|
+
|
|
62
|
+
## Install dev
|
|
63
|
+
`pip install -e .`
|
|
64
|
+
|
|
65
|
+
## Development stuff to do
|
|
66
|
+
- [x] ROR get name variants from ROR
|
|
67
|
+
- [x] CLI add options to get name list from txt
|
|
68
|
+
- [x] DataCite API build query for matching publisher and affiliation
|
|
69
|
+
- [ ] Crossref API build query for matching publisher and affiliation
|
|
70
|
+
- [ ] Publish as cmd tool on PyPI
|
|
71
|
+
|
|
72
|
+
## Run
|
|
73
|
+
List arguments:
|
|
74
|
+
`roagg --help`
|
|
75
|
+
|
|
76
|
+
## Tests
|
|
77
|
+
Some tests are available, to run them:
|
|
78
|
+
`python -m pytest`
|
|
79
|
+
|
|
80
|
+
### Some example arguments
|
|
81
|
+
Chalmers with ror and name list:
|
|
82
|
+
```bash
|
|
83
|
+
roagg --ror https://ror.org/040wg7k59 --name-txt tests/name-lists/chalmers.txt --output chalmers.csv
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
GU with ror, name list and extra name not in the text file:
|
|
87
|
+
```bash
|
|
88
|
+
roagg --name "Department of Nephrology Gothenburg" --ror https://ror.org/01tm6cn81 --name-txt tests/name-lists/gu.txt --output data/gu.csv
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
KTH with ror and name list:
|
|
92
|
+
```bash
|
|
93
|
+
roagg --ror https://ror.org/026vcq606 --name-txt tests/name-lists/kth.txt --output data/kth.csv
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
KAU with ror:
|
|
97
|
+
```bash
|
|
98
|
+
roagg --ror https://ror.org/05s754026 --output kau.csv
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## License
|
|
102
|
+
[MIT License](LICENSE)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "roagg"
|
|
3
|
+
version = "2025.0.8"
|
|
4
|
+
requires-python = ">=3.10"
|
|
5
|
+
dependencies = []
|
|
6
|
+
|
|
7
|
+
[project.scripts]
|
|
8
|
+
roagg = "roagg.cli:main"
|
|
9
|
+
|
|
10
|
+
[project.entry-points."pipx.run"]
|
|
11
|
+
roagg = "roagg.cli:main"
|
|
12
|
+
|
|
13
|
+
[build-system]
|
|
14
|
+
requires = ["setuptools>=42", "wheel"]
|
|
15
|
+
build-backend = "setuptools.build_meta"
|
|
16
|
+
|
|
17
|
+
[tool.setuptools]
|
|
18
|
+
package-dir = {"" = "src"}
|
|
19
|
+
|
|
20
|
+
[tool.setuptools.packages.find]
|
|
21
|
+
where = ["src"]
|
|
22
|
+
|
|
23
|
+
[tool.pytest.ini_options]
|
|
24
|
+
# Specify test directory
|
|
25
|
+
testpaths = ["tests"]
|
|
26
|
+
|
|
27
|
+
# Minimum version of pytest
|
|
28
|
+
minversion = "6.0"
|
|
29
|
+
|
|
30
|
+
# Add command line options that are always used
|
|
31
|
+
addopts = [
|
|
32
|
+
"-ra", # Show summary of all test outcomes
|
|
33
|
+
"--strict-markers", # Raise error on unknown markers
|
|
34
|
+
"--strict-config", # Raise error on invalid config
|
|
35
|
+
"--showlocals", # Show local variables in tracebacks
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
# Configure test discovery patterns
|
|
39
|
+
python_files = ["test_*.py", "*_test.py"]
|
|
40
|
+
python_classes = ["Test*"]
|
|
41
|
+
python_functions = ["test_*"]
|
|
42
|
+
|
|
43
|
+
# Define custom markers (optional, for organizing tests)
|
|
44
|
+
markers = [
|
|
45
|
+
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
|
|
46
|
+
"integration: marks tests as integration tests",
|
|
47
|
+
"unit: marks tests as unit tests",
|
|
48
|
+
]
|
roagg-2025.0.8/setup.cfg
ADDED
|
File without changes
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import List
|
|
3
|
+
from roagg.helpers.ror import get_names_from_ror
|
|
4
|
+
from roagg.providers.datacite import DataCiteAPI
|
|
5
|
+
from roagg.providers.openaire import OpenAireAPI
|
|
6
|
+
from roagg.providers.openalex import OpenAlexAPI
|
|
7
|
+
import logging
|
|
8
|
+
from roagg.models.research_output_item import ResearchOutputItem
|
|
9
|
+
import json
|
|
10
|
+
import csv
|
|
11
|
+
from dataclasses import fields
|
|
12
|
+
|
|
13
|
+
def aggregate(name: List[str] | None = None, ror: str = "", output: str = "output.csv") -> None:
    """Collect research output records for an organization and write them to CSV.

    Records are fetched from DataCite (the primary source), then enriched in
    place with OpenAire and OpenAlex data matched by DOI.

    Args:
        name: Name variants of the organization (optional).
        ror: ROR ID of the organization, e.g. "https://ror.org/040wg7k59" (optional).
        output: Path of the CSV file to write.
    """
    # Work on a copy: the original used a mutable default argument (`[]`) and
    # then mutated it with extend(), which both leaks state between calls and
    # mutates the caller's list.
    names = list(name) if name else []

    if ror:
        names.extend(get_names_from_ror(ror))

    # remove duplicates
    names = list(set(names))

    datacite = DataCiteAPI(name=names, ror=ror)
    url = datacite.api_request_url()
    # debug print of the query string
    logging.info("DataCite url:")
    logging.info(url)

    records = datacite.all()
    research_output_items = []
    logging.info(f"Checking {len(records)} records...")
    for record in records:
        research_output_items.append(datacite.get_record(record))

    openaire = OpenAireAPI(ror=ror, results=research_output_items)
    openaire_id = openaire.get_openaire_id_from_ror()
    logging.info(f"OpenAire ID from ROR {ror} : {openaire_id}")
    openaire.get_records()

    openalex = OpenAlexAPI(ror=ror, results=research_output_items)
    openalex_id = openalex.get_openalex_id_from_ror()
    logging.info(f"OpenAlex ID from ROR {ror} : {openalex_id}")
    openalex.get_records()

    logging.info(f"Writing: {output}")

    write_csv(research_output_items, output)
    logging.info(f"Writing output to csv: {output} - Done")
|
|
47
|
+
|
|
48
|
+
def write_csv(records: List[ResearchOutputItem], output: str) -> None:
    """Write research output items to a CSV file.

    The header row is derived from the ResearchOutputItem dataclass fields.
    Booleans are serialized as 1/0 and None as an empty string so the file is
    easy to consume from spreadsheet tools.
    """
    field_names = [f.name for f in fields(ResearchOutputItem)]

    def as_cell(value):
        """Format a single value for CSV output."""
        if value is None:
            return ""
        if isinstance(value, bool):
            return int(value)
        return value

    with open(output, 'w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.writer(handle)
        csv_writer.writerow(field_names)
        for item in records:
            csv_writer.writerow([as_cell(getattr(item, name)) for name in field_names])
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from roagg.helpers.utils import get_roagg_version
|
|
8
|
+
from roagg.aggregator import aggregate
|
|
9
|
+
|
|
10
|
+
def validate_ror_id(ror_id: str) -> str:
    """Validate that a ROR ID uses the canonical https://ror.org/ prefix.

    Returns the ID unchanged when valid.

    Raises:
        argparse.ArgumentTypeError: when the prefix is missing.
    """
    if ror_id.startswith('https://ror.org/'):
        return ror_id
    raise argparse.ArgumentTypeError("ROR ID must start with 'https://ror.org/'")
|
|
15
|
+
|
|
16
|
+
def read_names_from_file(filepath: Path) -> List[str]:
    """Read organization names from a text file, one per line.

    Blank lines and surrounding whitespace are ignored. Exits the program
    with status 1 when the file cannot be read.
    """
    try:
        raw = filepath.read_text()
    except IOError as e:
        logging.error(f"Failed to read names file: {e}")
        sys.exit(1)

    names = []
    for line in raw.splitlines():
        stripped = line.strip()
        if stripped:
            names.append(stripped)
    return names
|
|
23
|
+
|
|
24
|
+
def main() -> None:
    """CLI entry point: create a summary CSV of research output for an organization.

    Exits with status 1 when no selection criterion is given or when
    aggregation fails.
    """
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

    parser = argparse.ArgumentParser(
        description="aggregate research outputs for an organization into a CSV file",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        "--version",
        action="version",
        version=get_roagg_version()
    )

    parser.add_argument(
        "--name",
        type=str,
        action='append',
        help="name variant of the organization (can be used multiple times)"
    )
    parser.add_argument(
        "--name-txt",
        type=Path,
        help="path to text file containing organization name variants (one per line)"
    )

    parser.add_argument(
        "--ror",
        type=validate_ror_id,
        help="ROR ID of the organization (must start with https://ror.org/)"
    )

    parser.add_argument(
        "--source",
        default="api",
        choices=["api"],
        help="source for resource aggregation (only api is supported right now)"
    )

    parser.add_argument(
        "--output",
        default="data/output.csv",
        help="name of the output file (default: data/output.csv)"
    )

    args = parser.parse_args()

    # print parser.print_help() if no argument for name, name-txt or ror is provided
    if not any([args.name, args.name_txt, args.ror]):
        parser.print_help()
        sys.exit(1)

    # Copy args.name instead of aliasing it: extending `names` below must not
    # silently mutate the parsed argparse namespace.
    names: List[str] = list(args.name) if args.name else []

    if args.name_txt:
        names.extend(read_names_from_file(args.name_txt))

    try:
        aggregate(names, args.ror, args.output)
    except Exception as e:
        logging.error(f"Aggregation failed: {e}")
        sys.exit(1)
|
|
89
|
+
|
|
90
|
+
# Allow running this module directly as a script (python -m roagg.cli).
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import urllib.request
|
|
2
|
+
import json
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
def get_ror_info(ror: str):
    """Fetch the full organization record for a ROR ID from the ROR v2 API.

    Accepts either a bare identifier or a full https://ror.org/... URL
    (only the last path segment is used).
    """
    identifier = ror.rsplit('/', 1)[-1]
    endpoint = f"https://api.ror.org/v2/organizations/{identifier}"
    with urllib.request.urlopen(endpoint) as response:
        payload = response.read()
    return json.loads(payload)
|
|
10
|
+
|
|
11
|
+
def get_names_from_ror(ror: str) -> List[str]:
    """Return display names, labels and aliases for an organization from ROR.

    Name entries whose type set does not intersect the wanted types
    ('alias', 'ror_display', 'label') are skipped.
    """
    wanted = {'alias', 'ror_display', 'label'}
    entries = get_ror_info(ror)['names']
    return [entry['value'] for entry in entries if wanted & set(entry['types'])]
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import importlib.metadata
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
# Matches a bare DOI such as "10.1234/abc-def" (no resolver prefix), case-insensitively.
doi_pattern = re.compile(r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$', re.IGNORECASE)

def is_valid_doi(s: str) -> bool:
    """Return True when *s* is a well-formed bare DOI."""
    return doi_pattern.match(s) is not None
|
|
8
|
+
|
|
9
|
+
def find_doi_in_text(text: str) -> list[str]:
    """Return every DOI found in *text* (an empty list when none are present).

    The original return annotation claimed ``str | None``, but ``re.findall``
    always returns a list of strings; the annotation is corrected here.
    """
    return re.findall(r'\b10\.\d{4,9}/[-.;()/:\w]+', text)
|
|
11
|
+
|
|
12
|
+
def remove_resolver_prefix_from_doi(doi: str) -> str:
|
|
13
|
+
if doi is None:
|
|
14
|
+
return None
|
|
15
|
+
prefixes = [
|
|
16
|
+
"https://doi.org/",
|
|
17
|
+
"http://doi.org/",
|
|
18
|
+
"doi.org/",
|
|
19
|
+
"https://dx.doi.org/",
|
|
20
|
+
"http://dx.doi.org/",
|
|
21
|
+
"dx.doi.org/"
|
|
22
|
+
]
|
|
23
|
+
for prefix in prefixes:
|
|
24
|
+
if doi.lower().startswith(prefix):
|
|
25
|
+
return doi[len(prefix):]
|
|
26
|
+
return doi
|
|
27
|
+
|
|
28
|
+
def match_patterns(string, patterns):
    """Return True when *string* matches any of *patterns*.

    Patterns without wildcards match case-insensitively as substrings.
    Patterns containing '*' (any run of characters) or '?' (a single
    character) are matched against the whole string, also case-insensitively.
    Returns False when *string* is None.
    """
    if string is None:
        return False

    lowered = string.lower()
    for pattern in patterns:
        # If pattern contains no wildcard, check substring containment.
        # (The original additionally compared with `is`, which tests object
        # identity rather than equality — a bug; containment subsumes the
        # intended equality check.)
        if '*' not in pattern and '?' not in pattern:
            if pattern.lower() in lowered:
                return True

        if re.match(pattern_to_regexp(pattern), string, re.IGNORECASE):
            return True
    return False

def pattern_to_regexp(pattern: str) -> str:
    """Translate a '*'/'?' wildcard pattern into an anchored regular expression.

    All other characters are escaped so they match literally.
    """
    regex = ""
    for char in pattern:
        if char == '*':
            regex += '.*'
        elif char == '?':
            regex += '.'
        else:
            regex += re.escape(char)
    return '^' + regex + '$'
|
|
53
|
+
|
|
54
|
+
def get_roagg_version() -> str:
    """Return the installed roagg package version, or "unknown" when the
    package metadata is not available (e.g. running from a source checkout)."""
    try:
        version = importlib.metadata.version("roagg")
    except importlib.metadata.PackageNotFoundError:
        version = "unknown"
    return version
|
|
60
|
+
|
|
61
|
+
def string_word_count(string: str) -> int:
    """Count whitespace-separated words in *string*; 0 for empty/falsy input."""
    if not string:
        return 0
    words = string.strip().split()
    return len(words)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from dataclasses import dataclass

@dataclass
class ResearchOutputItem:
    """One research output record aggregated from DataCite and enriched
    with OpenAire / OpenAlex data.

    Fields annotated ``| None`` use ``None`` to mean "unknown / not provided";
    the tri-state booleans (e.g. ``inDataCite``) likewise use ``None`` for
    "not checked yet".
    """
    doi: str
    publicationYear: int | None = None  # year of publication, may be missing
    resourceType: str | None = None  # free-text resource type
    title: str | None = None  # first title when the resource has several
    publisher: str | None = None  # free-text publisher name
    createdAt: str = ""  # creation date if available
    updatedAt: str = ""  # last update date if available
    isPublisher: bool = False
    isFunder: bool | None = None
    haveCreatorAffiliation: bool = False
    haveContributorAffiliation: bool = False
    isLatestVersion: bool = True
    isConceptDoi: bool = False
    # match on ROR identifier
    matchPublisherRor: bool = False
    matchCreatorAffiliationRor: bool = False
    matchContributorAffiliationRor: bool = False
    matchFunderRor: bool = False
    # match on free text in name
    matchPublisherName: bool = False
    matchCreatorName: bool = False
    matchContributorName: bool = False
    matchFunderName: bool = False
    # where was the match found (None = source not checked)
    inDataCite: bool | None = None
    inOpenAire: bool | None = None
    inOpenAlex: bool | None = None
    inCrossRef: bool | None = None
    # DataCite specific
    dataCiteClientId: str | None = None
    dataCiteClientName: str | None = None
    dataCiteCitationCount: int | None = None
    dataCiteReferenceCount: int | None = None
    dataCiteViewCount: int | None = None
    dataCiteDownloadCount: int | None = None
    # OpenAire specific
    openAireBestAccessRight: str | None = None
    openAireIndicatorsUsageCountsDownloads: int | None = None
    openAireIndicatorsUsageCountsViews: int | None = None
    openAireId: str | None = None
    # OpenAlex specific
    openAlexId: str | None = None
    openAlexCitedByCount: int | None = None
    openAlexReferencedWorksCount: int | None = None
    # extra fields
    titleWordCount: int | None = None  # word count of the title (useful for sorting)
    referencedByDoi: str | None = None  # first related DOI referencing this resource
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import urllib.request
|
|
3
|
+
import logging
|
|
4
|
+
import json
|
|
5
|
+
from roagg.helpers.utils import get_roagg_version
|
|
6
|
+
from roagg.models.research_output_item import ResearchOutputItem
|
|
7
|
+
from roagg.helpers.utils import match_patterns, string_word_count
|
|
8
|
+
|
|
9
|
+
class DataCiteAPI:
    """Query the DataCite REST API for research output that matches an
    organization's name variants and/or ROR identifier."""

    def __init__(self, page_size: int = 500, name: List[str] | None = None, ror: str = ""):
        """
        Args:
            page_size: Number of records per API page.
            name: Organization name variants (may contain '*' wildcards).
                  Copied to avoid the shared-mutable-default pitfall the
                  original `name=[]` default had.
            ror: Full ROR ID, e.g. "https://ror.org/040wg7k59".
        """
        self.page_size = page_size
        self.name = list(name) if name else []
        self.ror = ror

    def get_query_string(self) -> str:
        """Build the Lucene-style DataCite query over publisher, creator,
        contributor and funder fields. Returns "" when neither names nor
        a ROR id are configured."""
        if not self.name and not self.ror:
            return ""

        query_parts = []

        if self.name:
            # Separate wildcard and exact matches, handle spaces in wildcard queries appropriately
            wildcard = ' OR '.join(n.replace(" ", "\\ ") for n in self.name if '*' in n)
            exact = ' OR '.join(f'"{n}"' for n in self.name if '*' not in n)
            name_fields = [
                "creators.affiliation.name",
                "contributors.affiliation.name",
                "publisher.name"
            ]

            if wildcard and exact:
                name_conditions = f'{wildcard} OR {exact}'
            else:
                name_conditions = wildcard or exact

            query_parts.extend(f"{field}:({name_conditions})" for field in name_fields)

        if self.ror:
            ror_fields = [
                "publisher.publisherIdentifier",
                "creators.affiliation.affiliationIdentifier",
                "contributors.affiliation.affiliationIdentifier",
                "creators.nameIdentifiers.nameIdentifier",
                "contributors.nameIdentifiers.nameIdentifier",
                "fundingReferences.funderIdentifier"
            ]
            query_parts.extend(f'{field}:"{self.ror}"' for field in ror_fields)
            # nameIdentifiers are formatted without the https://ror.org/ prefix
            # by some sources, so also query the bare identifier. removeprefix()
            # is a no-op when the prefix is absent — the original
            # split("https://ror.org/")[1] raised IndexError in that case.
            bare_ror = self.ror.removeprefix("https://ror.org/")
            query_parts.extend(f'{field}:"{bare_ror}"' for field in ror_fields)

        return " OR ".join(query_parts)

    def api_request_url(self, page_size: int | None = None) -> str:
        """Return the full DataCite /dois request URL for the current query.

        Args:
            page_size: Override for the instance page size (e.g. 0 for count-only).
        """
        # Explicit import: the module only imports urllib.request, and
        # urllib.parse being reachable through it is a CPython side effect,
        # not a guarantee.
        import urllib.parse

        if page_size is None:
            page_size = self.page_size
        params = urllib.parse.urlencode({
            'page[size]': page_size,
            'page[cursor]': '1',
            'affiliation': 'true',
            'publisher': 'true',
            'detail': 'true',
            'disable-facets': 'false',
            'query': self.get_query_string()
        })
        return f"https://api.datacite.org/dois?{params}"

    @staticmethod
    def get_api_result(url: str) -> dict:
        """GET *url* and decode the JSON response body.

        Raises:
            RuntimeError: on network, JSON-decode or response-shape errors.
        """
        # Explicit import (see api_request_url): urllib.error is not
        # guaranteed to be bound by `import urllib.request`.
        import urllib.error

        request = urllib.request.Request(url)
        version = get_roagg_version()
        request.add_header('User-Agent', f'ResearchOutputAggregator/{version} (https://github.com/snd-sweden/research-output-aggregator; mailto:team-it@snd.se)')
        try:
            with urllib.request.urlopen(request) as response:
                return json.loads(response.read())
        except (urllib.error.URLError, json.JSONDecodeError, KeyError) as e:
            raise RuntimeError(f"Failed run DataCite query: {e}")

    def get_record(self, item: dict) -> ResearchOutputItem:
        """Map one DataCite API item onto a ResearchOutputItem and apply the
        organization matching rules (publisher, affiliations, versions)."""
        attributes = item.get("attributes", {})
        # Both "publisher" and "types" can be missing or explicitly null in
        # DataCite responses; the original .get("types", None).get(...)
        # raised AttributeError in that case.
        publisher_attr = attributes.get("publisher") or {}
        types_attr = attributes.get("types") or {}
        versionCount = 0 if attributes.get("versionCount") is None else int(attributes.get("versionCount"))
        versionOfCount = 0 if attributes.get("versionOfCount") is None else int(attributes.get("versionOfCount"))

        # Guard against records with no titles entry.
        titles = attributes.get("titles") or [{}]
        title = titles[0].get("title")

        record = ResearchOutputItem(
            doi=attributes.get("doi"),
            dataCiteClientId=item["relationships"]["client"]["data"]["id"],
            resourceType=types_attr.get("resourceTypeGeneral"),
            publisher=publisher_attr.get("name"),
            publicationYear=attributes.get("publicationYear"),
            title=title,
            inDataCite=True,
            dataCiteCitationCount=attributes.get("citationCount", None),
            dataCiteReferenceCount=attributes.get("referenceCount", None),
            dataCiteViewCount=attributes.get("viewCount", None),
            dataCiteDownloadCount=attributes.get("downloadCount", None),
            titleWordCount=string_word_count(title)
        )

        # Fall back to the citeproc / bibtex type when resourceTypeGeneral is empty.
        if not record.resourceType:
            record.resourceType = types_attr.get("citeproc")
        if not record.resourceType:
            record.resourceType = types_attr.get("bibtex")

        record.isPublisher = (
            publisher_attr.get("publisherIdentifier") == self.ror or
            match_patterns(publisher_attr.get("name"), self.name)
        )

        # First related DOI referencing this resource, if any.
        related = [
            r for r in attributes.get("relatedIdentifiers", [])
            if r.get("relationType") in ("IsReferencedBy", "IsSupplementTo", "IsSourceOf")
            and r.get("relatedIdentifierType") == "DOI"
        ]
        record.referencedByDoi = related[0].get("relatedIdentifier") if related else None

        record.createdAt = str(attributes.get("created", "") or "")

        # Most recent of the available timestamps (ISO strings compare lexically).
        record.updatedAt = max([
            str(attributes.get("updated", "") or ""),
            str(attributes.get("created", "") or ""),
            str(attributes.get("registered", "") or "")
        ])

        for relation in attributes.get("relatedIdentifiers", []):
            if relation.get("relationType") in ("IsPreviousVersionOf", "HasVersion"):
                record.isLatestVersion = False

        # A concept DOI has versions but is itself not a version of anything.
        record.isConceptDoi = versionCount > 0 and versionOfCount == 0

        record.haveCreatorAffiliation = self.check_agent_list_match(attributes.get("creators", []))
        record.haveContributorAffiliation = self.check_agent_list_match(attributes.get("contributors", []))
        return record

    def check_agent_list_match(self, items: list) -> bool:
        """Return True when any agent (creator/contributor) in *items* matches
        the organization by full ROR id, bare ROR id, name pattern, or
        affiliation (identifier or name)."""
        partial_ror = self.ror.removeprefix("https://ror.org/") if self.ror else ""
        for agent in items:
            identifiers = [i.get("nameIdentifier") for i in agent.get("nameIdentifiers", [])]
            # Guard against empty self.ror/partial_ror accidentally matching
            # empty identifier values.
            if self.ror and self.ror in identifiers:
                return True
            if partial_ror and partial_ror in identifiers:
                return True
            # Check if the agent name matches any pattern
            if match_patterns(agent.get("name"), self.name):
                return True
            # Check each affiliation
            for affiliation in agent.get("affiliation", []):
                if ((self.ror and affiliation.get("affiliationIdentifier") == self.ror) or
                        match_patterns(affiliation.get("name"), self.name)):
                    return True
        return False

    def all(self) -> list:
        """Fetch every matching record, following DataCite cursor pagination."""
        result = []
        url = self.api_request_url()
        while True:
            response = self.get_api_result(url)
            result.extend(response["data"])
            logging.info(f"Retrieved DataCite {len(result)} of {response['meta']['total']}")
            url = response['links'].get('next')
            if not url:
                break
        return result

    def count(self) -> int:
        """Return the total number of matching records (0 when there is no query)."""
        if not self.get_query_string():
            return 0
        url = self.api_request_url(page_size=0)
        return self.get_api_result(url)["meta"]["total"]
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import urllib.request
|
|
3
|
+
import logging
|
|
4
|
+
import json
|
|
5
|
+
from roagg.models.research_output_item import ResearchOutputItem
|
|
6
|
+
from roagg.helpers.utils import find_doi_in_text, is_valid_doi, string_word_count
|
|
7
|
+
|
|
8
|
+
class OpenAireAPI:
    """Client for the OpenAIRE Graph API.

    Fetches research products (currently limited to datasets) affiliated
    with an organization identified by a ROR id, and merges the OpenAIRE
    metadata (access rights, usage counts) into an existing list of
    ResearchOutputItem results, appending new items for unmatched DOIs.
    """

    openaire_base_url = "https://api.openaire.eu/graph/v1/"

    def __init__(self, page_size: int = 100, ror: str = "",
                 results: "List[ResearchOutputItem] | None" = None):
        """Store query settings.

        :param page_size: number of records requested per API page.
        :param ror: ROR identifier of the organization to query for.
        :param results: existing items to enrich and extend in place.
            A fresh list is created when omitted — the previous
            ``results=[]`` default was a mutable default argument shared
            across all instances constructed without an explicit list.
        """
        self.page_size = page_size
        self.ror = ror
        self.results = results if results is not None else []

    def get_openaire_id_from_ror(self) -> str:
        """Resolve the ROR id to OpenAIRE's internal organization id.

        :return: the OpenAIRE organization id, or "" when no organization
            with this ROR is known to OpenAIRE.
        """
        url = f"{self.openaire_base_url}organizations?pid={self.ror}"
        with urllib.request.urlopen(url) as response:
            json_response = json.loads(response.read())

        if 'results' in json_response and len(json_response['results']) > 0:
            return json_response['results'][0]['id']
        else:
            return ""

    def get_records(self) -> List[ResearchOutputItem]:
        """Fetch all OpenAIRE datasets for the organization and merge them.

        Items already present in ``self.results`` (matched by DOI,
        case-insensitively) are annotated with OpenAIRE access-right and
        usage-count data; retrieved records with at least one DOI but no
        match are appended to ``self.results`` as new entries.

        :return: the raw OpenAIRE result dicts that were retrieved.
        """
        # The file only imports urllib.request; urllib.parse was previously
        # reachable only as a CPython import side effect — make it explicit.
        import urllib.parse

        if not self.ror:
            return []
        openaire_results = []
        openaire_id = self.get_openaire_id_from_ror()

        if not openaire_id:
            logging.info(f"No OpenAire ID found for ROR {self.ror}")
            return []

        params = {
            'pageSize': self.page_size,
            'cursor': '*',
            'type': 'dataset',  # limit to only datasets for now
            'relOrganizationId': openaire_id
        }
        retrieve_count = 0
        while True:
            query_string = urllib.parse.urlencode(params)
            url = f"{self.openaire_base_url}researchProducts?{query_string}"
            with urllib.request.urlopen(url) as response:
                json_response = json.loads(response.read())
            if 'results' in json_response:
                openaire_results.extend(json_response['results'])

            retrieve_count = len(openaire_results)
            logging.info(f"Retrieved OpenAire {retrieve_count} of {json_response['header']['numFound']}")

            # Cursor-based pagination: an empty/missing nextCursor marks the last page.
            if 'nextCursor' in json_response['header'] and json_response['header']['nextCursor']:
                params['cursor'] = json_response['header']['nextCursor']
            else:
                break

        # Create a dictionary for O(1) lookups
        doi_to_item = {item.doi.lower(): item for item in self.results if item.doi}

        for r in openaire_results:
            openAireBestAccessRight = None
            if 'bestAccessRight' in r and r['bestAccessRight'] and 'label' in r['bestAccessRight']:
                openAireBestAccessRight = r['bestAccessRight']['label']

            # Usage counts live under indicators.usageCounts; both fields are
            # optional, so pull the sub-dict once instead of probing twice.
            usage_counts = {}
            if 'indicators' in r and r['indicators'] and 'usageCounts' in r['indicators']:
                usage_counts = r['indicators']['usageCounts']
            openAireIndicatorsUsageCountsDownloads = usage_counts.get('downloads')
            openAireIndicatorsUsageCountsViews = usage_counts.get('views')

            dois = self.get_doi_list_from_resource(r)
            recordMatch = False
            for doi in dois:
                item = doi_to_item.get(doi.lower())
                if item:
                    recordMatch = True
                    item.openAireBestAccessRight = openAireBestAccessRight
                    item.openAireIndicatorsUsageCountsDownloads = openAireIndicatorsUsageCountsDownloads
                    item.openAireIndicatorsUsageCountsViews = openAireIndicatorsUsageCountsViews
                    item.inOpenAire = True
            if not recordMatch and len(dois) > 0:
                publication_date = r.get('publicationDate', None)
                publication_year = None
                if publication_date:
                    publication_year = publication_date[:4] if len(publication_date) >= 4 else None
                item = ResearchOutputItem(
                    doi=dois[0],
                    isPublisher=None,
                    resourceType=r.get('type', None),
                    title=r.get('mainTitle', None),
                    publisher=r.get('publisher', None),
                    publicationYear=publication_year,
                    haveContributorAffiliation=None,
                    haveCreatorAffiliation=None,
                    isLatestVersion=None,
                    isConceptDoi=None,
                    inOpenAire=True,
                    openAireBestAccessRight=openAireBestAccessRight,
                    openAireIndicatorsUsageCountsDownloads=openAireIndicatorsUsageCountsDownloads,
                    openAireIndicatorsUsageCountsViews=openAireIndicatorsUsageCountsViews,
                    openAireId=r.get('id', None),
                    titleWordCount=string_word_count(r.get('mainTitle', None))
                )
                self.results.append(item)
                doi_to_item[item.doi.lower()] = item  # Add to lookup dictionary

        return openaire_results

    def get_doi_list_from_resource(self, resource: dict) -> List[str]:
        """Collect every DOI attached to an OpenAIRE research product.

        Looks at instance PIDs first, then alternate identifiers, and as a
        last resort tries to extract DOIs from instance URLs (normalizing
        known repository URL shapes to doi.org form).

        :param resource: a single OpenAIRE researchProduct dict.
        :return: de-duplicated list of DOI strings (may be empty), in
            first-seen order.
        """
        doi_list = []

        # Tolerate products without an 'instances' key instead of raising KeyError.
        for instance in resource.get('instances', []):
            logging.debug(f"Instance: {instance}")

            if 'pids' in instance and len(instance['pids']) > 0:
                for pid in instance['pids']:
                    if pid['scheme'].lower() == 'doi':
                        doi_list.append(pid['value'])

            if 'alternateIdentifiers' in instance and len(instance['alternateIdentifiers']) > 0:
                for alternateIdentifier in instance['alternateIdentifiers']:
                    if alternateIdentifier['scheme'].lower() == 'doi':
                        doi_list.append(alternateIdentifier['value'])

            if len(doi_list) == 0:
                # Normalize URLs to standard DOI format
                url_replacements = [
                    ("https://doi.pangaea.de/", "https://doi.org/"),
                    ("https://zenodo.org/doi/", "https://doi.org/"),
                    ("https://zenodo.org/records/", "https://doi.org/10.5281/zenodo.")
                ]

                # 'urls' may be absent on an instance; avoid KeyError.
                for url in instance.get('urls', []):
                    normalized_url = url
                    for old_pattern, new_pattern in url_replacements:
                        normalized_url = normalized_url.replace(old_pattern, new_pattern)

                    for doi in find_doi_in_text(normalized_url):
                        if is_valid_doi(doi):
                            doi_list.append(doi)

        # if doi_list is empty print json for instances
        if len(doi_list) == 0:
            logging.warning(f"No DOI found in resource: {json.dumps(resource.get('instances', []), indent=2)}")

        # dict.fromkeys dedupes while keeping order, so the caller's use of
        # dois[0] is deterministic (list(set(...)) reordered arbitrarily).
        return list(dict.fromkeys(doi_list))
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import urllib.request
|
|
3
|
+
import logging
|
|
4
|
+
import json
|
|
5
|
+
from roagg.models.research_output_item import ResearchOutputItem
|
|
6
|
+
from roagg.helpers.utils import string_word_count, remove_resolver_prefix_from_doi
|
|
7
|
+
|
|
8
|
+
class OpenAlexAPI:
    """Client for the OpenAlex works API.

    Fetches works (currently limited to datasets) attributed to an
    institution identified by a ROR id, and merges the OpenAlex metadata
    (citation counts, affiliation flags) into an existing list of
    ResearchOutputItem results, appending new items for unmatched DOIs.
    """

    openalex_base_url = "https://api.openalex.org/"

    def __init__(self, page_size: int = 200, ror: str = "",
                 results: "List[ResearchOutputItem] | None" = None):
        """Store query settings.

        :param page_size: number of records requested per API page.
        :param ror: ROR identifier of the institution to query for.
        :param results: existing items to enrich and extend in place.
            A fresh list is created when omitted — the previous
            ``results=[]`` default was a mutable default argument shared
            across all instances constructed without an explicit list.
        """
        self.page_size = page_size
        self.ror = ror
        self.results = results if results is not None else []

    def get_openalex_id_from_ror(self) -> str:
        """Resolve the ROR id to OpenAlex's internal institution id.

        :return: the OpenAlex institution id URL, or "" when not found.
        """
        url = f"{self.openalex_base_url}institutions/ror:{self.ror}"
        with urllib.request.urlopen(url) as response:
            json_response = json.loads(response.read())

        if 'id' in json_response:
            return json_response['id']
        else:
            return ""

    def get_records(self) -> List[ResearchOutputItem]:
        """Fetch all OpenAlex datasets for the institution and merge them.

        Items already present in ``self.results`` (matched by DOI,
        case-insensitively) are annotated with OpenAlex citation data and
        the creator-affiliation flag; retrieved records with a DOI but no
        match are appended to ``self.results`` as new entries.

        :return: the raw OpenAlex result dicts that were retrieved.
        """
        # The file only imports urllib.request; urllib.parse was previously
        # reachable only as a CPython import side effect — make it explicit.
        import urllib.parse

        if not self.ror:
            return []
        openalex_results = []
        openalex_id = self.get_openalex_id_from_ror()

        if not openalex_id:
            logging.info(f"No OpenAlex ID found for ROR {self.ror}")
            return []

        params = {
            'per-page': self.page_size,
            'cursor': '*',
            'filter': f'institutions.id:{openalex_id},type:dataset'  # limit to only datasets for now
        }
        retrieve_count = 0

        while True:
            query_string = urllib.parse.urlencode(params)
            url = f"{self.openalex_base_url}works?{query_string}"
            with urllib.request.urlopen(url) as response:
                json_response = json.loads(response.read())
            if 'results' in json_response:
                openalex_results.extend(json_response['results'])
            retrieve_count = len(openalex_results)
            logging.info(f"Retrieved OpenAlex {retrieve_count} of {json_response['meta']['count']}")

            # Cursor-based pagination: an empty/missing next_cursor marks the last page.
            if 'next_cursor' in json_response['meta'] and json_response['meta']['next_cursor']:
                params['cursor'] = json_response['meta']['next_cursor']
            else:
                break

        # Create a dictionary for O(1) lookups
        doi_to_item = {item.doi.lower(): item for item in self.results if item.doi}

        for r in openalex_results:
            openAlexCitedByCount = None
            if 'cited_by_count' in r:
                openAlexCitedByCount = r['cited_by_count']

            openAlexReferencedWorksCount = None
            if 'referenced_works_count' in r:
                openAlexReferencedWorksCount = r['referenced_works_count']

            haveCreatorAffiliation = False

            # BUG FIX: OpenAlex works carry author affiliations under the
            # 'authorships' list (each authorship has its own 'institutions');
            # the previous code read a top-level 'institutions' key, which
            # works don't have, so the flag was always False.
            for authorship in r.get('authorships', []):
                for affiliation in authorship.get('institutions', []):
                    if affiliation.get('ror') == self.ror:
                        haveCreatorAffiliation = True
                        break
                if haveCreatorAffiliation:
                    # Also exit the outer loop once a match is found (the
                    # previous 'break' only left the inner loop).
                    break

            doi = remove_resolver_prefix_from_doi(r.get('doi', None))
            if doi is None:
                # Works without a DOI cannot be matched or stored — skip.
                continue

            item = doi_to_item.get(doi.lower())
            if item:
                # Existing item: annotate in place with OpenAlex metadata.
                item.openAlexCitedByCount = openAlexCitedByCount
                item.openAlexReferencedWorksCount = openAlexReferencedWorksCount
                item.inOpenAlex = True
                item.openAlexId = r.get('id', None)
                item.haveCreatorAffiliation = haveCreatorAffiliation
            else:
                publication_date = r.get('publication_date', None)
                publication_year = r.get('publication_year', None)
                if publication_date:
                    # Prefer the year from the full date when available.
                    publication_year = publication_date[:4] if len(publication_date) >= 4 else None
                item = ResearchOutputItem(
                    doi=doi,
                    isPublisher=None,
                    resourceType=r.get('type', None),
                    title=r.get('title', None),
                    publisher=None,
                    publicationYear=publication_year,
                    createdAt=r.get('created_date', None),
                    updatedAt=r.get('updated_date', None),
                    haveContributorAffiliation=None,
                    haveCreatorAffiliation=haveCreatorAffiliation,
                    isLatestVersion=None,
                    isConceptDoi=None,
                    inOpenAlex=True,
                    openAlexCitedByCount=openAlexCitedByCount,
                    openAlexReferencedWorksCount=openAlexReferencedWorksCount,
                    openAlexId=r.get('id', None),
                    titleWordCount=string_word_count(r.get('title', None))
                )
                self.results.append(item)
                doi_to_item[item.doi.lower()] = item  # Add to lookup dictionary

        return openalex_results
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/roagg/__init__.py
|
|
5
|
+
src/roagg/__main__.py
|
|
6
|
+
src/roagg/aggregator.py
|
|
7
|
+
src/roagg/cli.py
|
|
8
|
+
src/roagg.egg-info/PKG-INFO
|
|
9
|
+
src/roagg.egg-info/SOURCES.txt
|
|
10
|
+
src/roagg.egg-info/dependency_links.txt
|
|
11
|
+
src/roagg.egg-info/entry_points.txt
|
|
12
|
+
src/roagg.egg-info/top_level.txt
|
|
13
|
+
src/roagg/helpers/ror.py
|
|
14
|
+
src/roagg/helpers/utils.py
|
|
15
|
+
src/roagg/models/research_output_item.py
|
|
16
|
+
src/roagg/providers/datacite.py
|
|
17
|
+
src/roagg/providers/openaire.py
|
|
18
|
+
src/roagg/providers/openalex.py
|
|
19
|
+
tests/test_utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
roagg
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from roagg.helpers.utils import is_valid_doi, find_doi_in_text, string_word_count
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class TestIsValidDoi:
    """Test cases for the is_valid_doi function.

    Together the edge-case tests pin the accepted shape to
    ``10.<4-9 digits>/<non-empty suffix>`` with no whitespace.
    """

    def test_valid_doi_basic(self):
        """Test basic valid DOI format."""
        assert is_valid_doi("10.1234/example")

    def test_valid_doi_with_longer_prefix(self):
        """Test valid DOI with longer prefix number."""
        assert is_valid_doi("10.123456789/test")

    def test_valid_doi_with_special_characters(self):
        """Test valid DOI with allowed special characters."""
        assert is_valid_doi("10.1000/test-example_123")
        assert is_valid_doi("10.1000/test.example")
        assert is_valid_doi("10.1000/test(123)")
        assert is_valid_doi("10.1000/test;123")
        assert is_valid_doi("10.1000/test:123")
        assert is_valid_doi("10.1000/test/subpath")

    def test_valid_doi_with_uppercase(self):
        """Test valid DOI with uppercase letters."""
        assert is_valid_doi("10.1234/ABC-DEF")
        assert is_valid_doi("10.1234/Test-Example")

    def test_valid_doi_with_numbers(self):
        """Test valid DOI with various numbers in suffix."""
        assert is_valid_doi("10.1234/123456789")

    def test_valid_doi_complex(self):
        """Test complex real-world DOI examples."""
        assert is_valid_doi("10.1000/182")
        assert is_valid_doi("10.1038/nphys1170")
        assert is_valid_doi("10.1016/j.cell.2009.01.002")

    def test_invalid_doi_missing_prefix(self):
        """Test invalid DOI without '10.' prefix."""
        assert not is_valid_doi("11.1234/example")
        assert not is_valid_doi("1.1234/example")
        assert not is_valid_doi("1234/example")

    def test_invalid_doi_short_prefix_number(self):
        """Test invalid DOI with too short prefix number."""
        assert not is_valid_doi("10.123/example")  # Only 3 digits
        assert not is_valid_doi("10.12/example")  # Only 2 digits
        assert not is_valid_doi("10.1/example")  # Only 1 digit

    def test_invalid_doi_missing_slash(self):
        """Test invalid DOI without slash separator."""
        assert not is_valid_doi("10.1234-example")
        assert not is_valid_doi("10.1234.example")

    def test_invalid_doi_missing_suffix(self):
        """Test invalid DOI without suffix."""
        assert not is_valid_doi("10.1234/")
        assert not is_valid_doi("10.1234")

    def test_invalid_doi_empty_string(self):
        """Test invalid DOI with empty string."""
        assert not is_valid_doi("")

    def test_invalid_doi_whitespace(self):
        """Test invalid DOI with whitespace."""
        assert not is_valid_doi("10.1234/ example")
        assert not is_valid_doi("10.1234 /example")
        assert not is_valid_doi(" 10.1234/example")
        assert not is_valid_doi("10.1234/example ")

    def test_invalid_doi_only_prefix(self):
        """Test invalid DOI with only the prefix."""
        assert not is_valid_doi("10.")

    def test_invalid_doi_non_numeric_prefix(self):
        """Test invalid DOI with non-numeric prefix."""
        assert not is_valid_doi("10.abcd/example")

    def test_edge_case_minimum_valid_length(self):
        """Test DOI with minimum valid prefix length (4 digits)."""
        assert is_valid_doi("10.1000/a")

    def test_edge_case_maximum_valid_length(self):
        """Test DOI with maximum valid prefix length (9 digits)."""
        assert is_valid_doi("10.123456789/a")

    def test_edge_case_too_long_prefix(self):
        """Test DOI with prefix longer than 9 digits."""
        assert not is_valid_doi("10.1234567890/example")
94
|
+
class TestFindDoiInText:
    """Test cases for the find_doi_in_text function.

    Covers DOIs embedded in repository URLs, plain prose (start / middle /
    end of text, multiline), and suffixes with special characters.
    """

    def test_pangea_doi_url(self):
        """PANGAEA DOI URL."""
        text = "https://doi.pangaea.de/10.1234/example"
        result = find_doi_in_text(text)
        assert result == ["10.1234/example"]

    def test_zenodo_doi_url(self):
        """ZENODO DOI URL."""
        text = "https://zenodo.org/doi/10.1234/example"
        result = find_doi_in_text(text)
        assert result == ["10.1234/example"]

    def test_single_doi_in_text(self):
        """Test finding a single DOI in text."""
        text = "This is a reference to 10.1234/example in the middle of text."
        result = find_doi_in_text(text)
        assert result == ["10.1234/example"]

    def test_multiple_dois_in_text(self):
        """Test finding multiple DOIs in text."""
        text = "See 10.1234/first and also 10.5678/second for more info."
        result = find_doi_in_text(text)
        assert result == ["10.1234/first", "10.5678/second"]

    def test_doi_at_start_of_text(self):
        """Test finding DOI at the beginning of text."""
        text = "10.1234/example is the DOI for this article."
        result = find_doi_in_text(text)
        assert result == ["10.1234/example"]

    def test_doi_at_end_of_text(self):
        """Test finding DOI at the end of text."""
        text = "The DOI for this article is 10.1234/example"
        result = find_doi_in_text(text)
        assert result == ["10.1234/example"]

    def test_doi_with_special_characters(self):
        """Test finding DOIs with various special characters."""
        text = "DOI: 10.1038/nphys1170 and 10.1016/j.cell.2009.01.002"
        result = find_doi_in_text(text)
        assert len(result) == 2
        assert "10.1038/nphys1170" in result
        assert "10.1016/j.cell.2009.01.002" in result

    def test_doi_with_parentheses(self):
        """Test finding DOI with parentheses."""
        text = "See DOI 10.1234/test(2024) for details."
        result = find_doi_in_text(text)
        assert "10.1234/test(2024)" in result

    def test_no_doi_in_text(self):
        """Test text without any DOI."""
        text = "This text has no DOI at all."
        result = find_doi_in_text(text)
        assert result == []

    def test_empty_text(self):
        """Test with empty string."""
        result = find_doi_in_text("")
        assert result == []

    def test_doi_with_url(self):
        """Test finding DOI within a URL."""
        text = "Visit https://doi.org/10.1234/example for the article."
        result = find_doi_in_text(text)
        assert "10.1234/example" in result

    def test_doi_minimum_prefix_length(self):
        """Test DOI with minimum prefix length (4 digits)."""
        text = "The DOI is 10.1000/test"
        result = find_doi_in_text(text)
        assert result == ["10.1000/test"]

    def test_doi_maximum_prefix_length(self):
        """Test DOI with maximum prefix length (9 digits)."""
        text = "The DOI is 10.123456789/test"
        result = find_doi_in_text(text)
        assert result == ["10.123456789/test"]

    def test_invalid_doi_too_short_prefix(self):
        """Test that DOIs with too short prefix are not found."""
        text = "This is not a valid DOI: 10.123/test"
        result = find_doi_in_text(text)
        assert result == []

    def test_doi_with_underscores(self):
        """Test finding DOI with underscores."""
        text = "DOI: 10.1234/test_example_123"
        result = find_doi_in_text(text)
        assert "10.1234/test_example_123" in result

    def test_doi_in_multiline_text(self):
        """Test finding DOI in multiline text."""
        text = """This is a paper.
        The DOI is 10.1234/example
        It was published in 2024."""
        result = find_doi_in_text(text)
        assert result == ["10.1234/example"]

    def test_doi_with_mixed_case(self):
        """Test finding DOI with mixed case letters."""
        text = "DOI: 10.1234/AbCdEf123"
        result = find_doi_in_text(text)
        assert "10.1234/AbCdEf123" in result

    def test_real_world_dois(self):
        """Test finding real-world DOI examples."""
        text = "See 10.1038/nature12373 and 10.1126/science.1259855 for more information."
        result = find_doi_in_text(text)
        assert len(result) == 2
        assert "10.1038/nature12373" in result
        assert "10.1126/science.1259855" in result
|
210
|
+
class TestStringWordCount:
    """Test cases for the string_word_count function.

    The expected counts match whitespace-based splitting (``str.split()``
    semantics): any run of spaces/tabs/newlines separates words, and
    punctuation stays attached to the adjacent word.
    """

    def test_single_word(self):
        """Test with a single word."""
        assert string_word_count("file.csv") == 1
        assert string_word_count("another_file[2].jpg") == 1

    def test_multiple_words(self):
        """Test with multiple words."""
        assert string_word_count("Hello World from Roagg") == 4

    def test_leading_trailing_spaces(self):
        """Test with leading and trailing spaces."""
        # "Leading and trailing spaces" = 4 words (not 5)
        assert string_word_count(" Leading and trailing spaces ") == 4

    def test_multiple_spaces_between_words(self):
        """Test with multiple spaces between words."""
        # "Multiple spaces between words" = 4 words (not 5)
        assert string_word_count("Multiple spaces between words") == 4

    def test_empty_string(self):
        """Test with an empty string."""
        assert string_word_count("") == 0

    def test_string_with_only_spaces(self):
        """Test with a string containing only spaces."""
        assert string_word_count(" ") == 0

    def test_string_with_newlines_and_tabs(self):
        """Test with newlines and tabs."""
        assert string_word_count("Hello\nWorld\tfrom Roagg") == 4

    def test_string_with_punctuation(self):
        """Test with punctuation - split() doesn't separate on punctuation."""
        # "Hello, world! This is Roagg." = 5 words (punctuation stays attached)
        assert string_word_count("Hello, world! This is Roagg.") == 5

    def test_string_with_special_characters(self):
        """Test with special characters - split() doesn't separate on special chars."""
        # "datafile.csv is ready!" = 3 words
        assert string_word_count("datafile.csv is ready! ") == 3

    def test_string_with_numeric_characters(self):
        """Test with numeric characters - split() doesn't separate on dots."""
        # "Version 2.0 of the software" = 5 words (2.0 is one word)
        assert string_word_count("Version 2.0 of the software ") == 5
|