roagg 2025.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- roagg/__init__.py +0 -0
- roagg/__main__.py +3 -0
- roagg/aggregator.py +69 -0
- roagg/cli.py +91 -0
- roagg/helpers/ror.py +14 -0
- roagg/helpers/utils.py +65 -0
- roagg/models/research_output_item.py +51 -0
- roagg/providers/datacite.py +178 -0
- roagg/providers/openaire.py +155 -0
- roagg/providers/openalex.py +119 -0
- roagg-2025.0.8.dist-info/METADATA +6 -0
- roagg-2025.0.8.dist-info/RECORD +16 -0
- roagg-2025.0.8.dist-info/WHEEL +5 -0
- roagg-2025.0.8.dist-info/entry_points.txt +5 -0
- roagg-2025.0.8.dist-info/licenses/LICENSE +21 -0
- roagg-2025.0.8.dist-info/top_level.txt +1 -0
roagg/__init__.py
ADDED
|
File without changes
|
roagg/__main__.py
ADDED
roagg/aggregator.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import List
|
|
3
|
+
from roagg.helpers.ror import get_names_from_ror
|
|
4
|
+
from roagg.providers.datacite import DataCiteAPI
|
|
5
|
+
from roagg.providers.openaire import OpenAireAPI
|
|
6
|
+
from roagg.providers.openalex import OpenAlexAPI
|
|
7
|
+
import logging
|
|
8
|
+
from roagg.models.research_output_item import ResearchOutputItem
|
|
9
|
+
import json
|
|
10
|
+
import csv
|
|
11
|
+
from dataclasses import fields
|
|
12
|
+
|
|
13
|
+
def aggregate(name: List[str] = None, ror: str = "", output: str = "output.csv") -> None:
    """Collect research outputs for an organization and write them to a CSV.

    DataCite is queried first to seed the result list; OpenAire and OpenAlex
    then enrich (or extend) those records, matching by DOI. The merged list
    is written to *output*.

    :param name: organization name variants to search for (not mutated;
                 ``None``/omitted means no name variants).
    :param ror: ROR ID of the organization (``https://ror.org/...``), optional.
    :param output: path of the CSV file to write.
    """
    # Work on a copy: the original signature used a mutable default argument
    # ([]) and extended it in place, which both leaked state across calls and
    # mutated the caller's list. `name=None` is backward-compatible.
    names = list(name) if name else []
    if ror:
        names.extend(get_names_from_ror(ror))

    # remove duplicates
    names = list(set(names))

    datacite = DataCiteAPI(name=names, ror=ror)
    url = datacite.api_request_url()
    # debug print of the query string
    logging.info("DataCite url:")
    logging.info(url)

    records = datacite.all()
    research_output_items = []
    logging.info(f"Checking {len(records)} records...")
    for record in records:
        research_output_items.append(datacite.get_record(record))

    # OpenAire and OpenAlex mutate `research_output_items` in place
    # (enriching matched DOIs and appending unmatched ones).
    openaire = OpenAireAPI(ror=ror, results=research_output_items)
    openaire_id = openaire.get_openaire_id_from_ror()
    logging.info(f"OpenAire ID from ROR {ror} : {openaire_id}")
    openaire.get_records()

    openalex = OpenAlexAPI(ror=ror, results=research_output_items)
    openalex_id = openalex.get_openalex_id_from_ror()
    logging.info(f"OpenAlex ID from ROR {ror} : {openalex_id}")
    openalex.get_records()

    logging.info(f"Writing: {output}")

    write_csv(research_output_items, output)
    logging.info(f"Writing output to csv: {output} - Done")
|
|
47
|
+
|
|
48
|
+
def write_csv(records: List[ResearchOutputItem], output: str) -> None:
    """Serialize *records* to *output* as CSV, one column per dataclass field.

    None becomes an empty cell and booleans are written as 1/0; everything
    else is passed through to the csv writer unchanged.
    """
    dataclass_fields = fields(ResearchOutputItem)
    header = [f.name for f in dataclass_fields]

    def _to_cell(value):
        """Map a field value to its CSV cell representation."""
        if value is None:
            return ""
        if isinstance(value, bool):
            return 1 if value else 0
        return value

    with open(output, 'w', newline='', encoding='utf-8') as fh:
        csv_writer = csv.writer(fh)
        csv_writer.writerow(header)
        for record in records:
            csv_writer.writerow(
                [_to_cell(getattr(record, f.name)) for f in dataclass_fields]
            )
|
roagg/cli.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from roagg.helpers.utils import get_roagg_version
|
|
8
|
+
from roagg.aggregator import aggregate
|
|
9
|
+
|
|
10
|
+
def validate_ror_id(ror_id: str) -> str:
    """validate ROR ID format (should start with https://ror.org/).

    Intended as an argparse ``type=`` callable: returns the value unchanged
    when valid, raises ``argparse.ArgumentTypeError`` otherwise.
    """
    required_prefix = 'https://ror.org/'
    if ror_id.startswith(required_prefix):
        return ror_id
    raise argparse.ArgumentTypeError("ROR ID must start with 'https://ror.org/'")
|
|
15
|
+
|
|
16
|
+
def read_names_from_file(filepath: Path) -> List[str]:
    """Read organization names from a file, one per line.

    Blank lines are skipped and surrounding whitespace is trimmed.
    Exits the process with status 1 when the file cannot be read.
    """
    try:
        raw_text = filepath.read_text()
    except IOError as e:
        logging.error(f"Failed to read names file: {e}")
        sys.exit(1)

    names = []
    for line in raw_text.splitlines():
        trimmed = line.strip()
        if trimmed:
            names.append(trimmed)
    return names
|
|
23
|
+
|
|
24
|
+
def main() -> None:
    """create a summary CSV file for all research output for an organization."""
    # Log to stderr at INFO so the providers' progress messages are visible.
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

    parser = argparse.ArgumentParser(
        description="aggregate research outputs for an organization into a CSV file",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        "--version",
        action="version",
        version=get_roagg_version()
    )

    # Repeatable flag: each use appends one name variant.
    parser.add_argument(
        "--name",
        type=str,
        action='append',
        help="name variant of the organization (can be used multiple times)"
    )
    parser.add_argument(
        "--name-txt",
        type=Path,
        help="path to text file containing organization name variants (one per line)"
    )

    # validate_ror_id rejects anything not starting with https://ror.org/.
    parser.add_argument(
        "--ror",
        type=validate_ror_id,
        help="ROR ID of the organization (must start with https://ror.org/)"
    )

    parser.add_argument(
        "--source",
        default="api",
        choices=["api"],
        help="source for resource aggregation (only api is supported right now)"
    )

    parser.add_argument(
        "--output",
        default="data/output.csv",
        help="name of the output file (default: data/output.csv)"
    )

    args = parser.parse_args()

    # print parser.print_help() if no argument for name, name-txt or ror is provided
    if not any([args.name, args.name_txt, args.ror]):
        parser.print_help()
        sys.exit(1)

    # Merge name variants from --name flags and the optional --name-txt file.
    names: List[str] = []
    if args.name:
        names = args.name

    if args.name_txt:
        names.extend(read_names_from_file(args.name_txt))

    # Top-level boundary: any failure in aggregation is logged and mapped
    # to a non-zero exit status.
    try:
        aggregate(names, args.ror, args.output)
    except Exception as e:
        logging.error(f"Aggregation failed: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
|
roagg/helpers/ror.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import urllib.request
|
|
2
|
+
import json
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
def get_ror_info(ror: str):
    """Fetch the ROR v2 organization record as a parsed JSON dict.

    Accepts either a full ROR URL (https://ror.org/xxxx) or a bare id;
    only the last path segment is sent to the API.
    """
    identifier = ror.split('/')[-1]
    endpoint = f"https://api.ror.org/v2/organizations/{identifier}"
    with urllib.request.urlopen(endpoint) as resp:
        payload = resp.read()
    return json.loads(payload)
|
|
10
|
+
|
|
11
|
+
def get_names_from_ror(ror: str) -> List[str]:
    """Return the organization's display name, labels and aliases from ROR.

    Only name entries typed 'alias', 'ror_display' or 'label' are kept
    (i.e. acronyms and other types are excluded).
    """
    wanted_types = {'alias', 'ror_display', 'label'}
    selected = []
    for entry in get_ror_info(ror)['names']:
        if wanted_types.intersection(entry['types']):
            selected.append(entry['value'])
    return selected
|
roagg/helpers/utils.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import importlib.metadata
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
# Minimal DOI syntax check: "10.<4-9 digit prefix>/<suffix>", case-insensitive.
doi_pattern = re.compile(r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$', re.IGNORECASE)

def is_valid_doi(s: str) -> bool:
    """Return True when *s* is a bare DOI (no resolver URL prefix)."""
    return doi_pattern.match(s) is not None
|
|
8
|
+
|
|
9
|
+
def find_doi_in_text(text: str) -> list[str]:
    """Return every DOI-like substring found in *text* (possibly empty).

    The original annotation claimed ``str | None``, but ``re.findall``
    always returns a list; the annotation is corrected here (behavior
    is unchanged).
    """
    return re.findall(r'\b10\.\d{4,9}/[-.;()/:\w]+', text)
|
|
11
|
+
|
|
12
|
+
def remove_resolver_prefix_from_doi(doi: str) -> str:
|
|
13
|
+
if doi is None:
|
|
14
|
+
return None
|
|
15
|
+
prefixes = [
|
|
16
|
+
"https://doi.org/",
|
|
17
|
+
"http://doi.org/",
|
|
18
|
+
"doi.org/",
|
|
19
|
+
"https://dx.doi.org/",
|
|
20
|
+
"http://dx.doi.org/",
|
|
21
|
+
"dx.doi.org/"
|
|
22
|
+
]
|
|
23
|
+
for prefix in prefixes:
|
|
24
|
+
if doi.lower().startswith(prefix):
|
|
25
|
+
return doi[len(prefix):]
|
|
26
|
+
return doi
|
|
27
|
+
|
|
28
|
+
def match_patterns(string, patterns) -> bool:
    """True when *string* matches any pattern in *patterns*.

    Patterns without ``*``/``?`` are treated as case-insensitive substring
    tests; patterns with wildcards are translated to anchored regular
    expressions (``*`` -> any run, ``?`` -> any single character).
    None input never matches.

    Fix: the original additionally compared ``pattern.lower() is
    string.lower()`` — an identity (not equality) check on freshly built
    strings, which is never a reliable equality test. The ``in`` substring
    check already subsumes equality, so the broken clause is dropped.
    """
    if string is None:
        return False

    for pattern in patterns:
        # if pattern does not contain * or ? check if string contains pattern
        if '*' not in pattern and '?' not in pattern:
            if pattern.lower() in string.lower():
                return True

        if re.match(pattern_to_regexp(pattern), string, re.IGNORECASE):
            return True
    return False

def pattern_to_regexp(pattern: str) -> str:
    """Translate a glob pattern (* and ?) into an anchored regex string."""
    pieces = []
    for ch in pattern:
        if ch == '*':
            pieces.append('.*')
        elif ch == '?':
            pieces.append('.')
        else:
            pieces.append(re.escape(ch))
    return '^' + ''.join(pieces) + '$'
|
|
53
|
+
|
|
54
|
+
def get_roagg_version() -> str:
    """Get package version from metadata; "unknown" when not installed."""
    try:
        version = importlib.metadata.version("roagg")
    except importlib.metadata.PackageNotFoundError:
        version = "unknown"
    return version
|
|
60
|
+
|
|
61
|
+
def string_word_count(string: str) -> int:
    """Count whitespace-separated words; empty or falsy input counts as 0."""
    # str.split() with no argument already ignores leading/trailing
    # whitespace, matching strip().split().
    return len(string.split()) if string else 0
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
@dataclass
class ResearchOutputItem:
    """One research output (identified by DOI) merged from DataCite,
    OpenAire and OpenAlex. Providers fill in their own fields; tri-state
    booleans use None for "unknown / not reported by this provider".

    NOTE(review): many fields default to None despite non-Optional
    annotations (e.g. ``publicationYear: int = None``) — consider
    ``Optional[...]``/``X | None`` annotations.
    """
    doi: str
    publicationYear: int = None
    resourceType: str = None
    title: str = None
    publisher: str = None
    createdAt: str = ""
    updatedAt: str = ""
    isPublisher: bool = False  # organization is the record's publisher
    isFunder: bool = None
    haveCreatorAffiliation: bool = False
    haveContributorAffiliation: bool = False
    isLatestVersion: bool = True
    isConceptDoi: bool = False  # DataCite: has versions but is not itself a version
    #match on ROR
    matchPublisherRor: bool = False
    matchCreatorAffiliationRor: bool = False
    matchContributorAffiliationRor: bool = False
    matchFunderRor: bool = False
    #match on free text in name
    matchPublisherName: bool = False
    matchCreatorName: bool = False
    matchContributorName: bool = False
    matchFunderName: bool = False
    #where was the match found
    inDataCite: bool = None
    inOpenAire: bool = None
    inOpenAlex: bool = None
    inCrossRef: bool = None
    #datacite specific
    dataCiteClientId: str = None
    dataCiteClientName: str = None
    dataCiteCitationCount: int = None
    dataCiteReferenceCount: int = None
    dataCiteViewCount: int = None
    dataCiteDownloadCount: int = None
    #openaire specific
    openAireBestAccessRight: str = None
    openAireIndicatorsUsageCountsDownloads: int = None
    openAireIndicatorsUsageCountsViews: int = None
    openAireId: str = None
    #openalex specific
    openAlexId: str = None
    openAlexCitedByCount: int = None
    openAlexReferencedWorksCount: int = None
    #extra fields
    titleWordCount: int = None
    referencedByDoi: str = None  # first related DOI that references/uses this record
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import urllib.request
|
|
3
|
+
import logging
|
|
4
|
+
import json
|
|
5
|
+
from roagg.helpers.utils import get_roagg_version
|
|
6
|
+
from roagg.models.research_output_item import ResearchOutputItem
|
|
7
|
+
from roagg.helpers.utils import match_patterns, string_word_count
|
|
8
|
+
|
|
9
|
+
class DataCiteAPI:
    """Query the DataCite REST API for records affiliated with an
    organization (by name variants and/or ROR id) and map them to
    ResearchOutputItem.

    NOTE(review): this module only imports ``urllib.request``;
    ``urllib.parse`` and ``urllib.error`` resolve only because
    urllib.request imports them transitively in CPython — consider
    importing them explicitly.
    """

    def __init__(self, page_size: int = 500, name: List[str] = [], ror: str = ""):
        # NOTE(review): mutable default argument — the shared list leaks
        # across instances if callers rely on the default; in this codebase
        # aggregate() always passes `name` explicitly.
        self.page_size = page_size
        self.name = name
        self.ror = ror

    def get_query_string(self) -> str:
        """Build the Lucene-style DataCite query from name variants and ROR.

        Returns "" when neither names nor ROR are configured.
        """
        if not self.name and not self.ror:
            return ""

        query_parts = []

        if self.name:
            # Separate wildcard and exact matches, handle spaces in wildcard queries appropriately
            wildcard = ' OR '.join(n.replace(" ", "\\ ") for n in self.name if '*' in n)
            exact = ' OR '.join(f'"{n}"' for n in self.name if '*' not in n)
            name_fields = [
                "creators.affiliation.name",
                "contributors.affiliation.name",
                "publisher.name"
            ]

            if wildcard and exact:
                name_conditions = f'{wildcard} OR {exact}'
            else:
                name_conditions = wildcard or exact

            query_parts.extend([f"{field}:({name_conditions})" for field in name_fields])

        if self.ror:
            ror_fields = [
                "publisher.publisherIdentifier",
                "creators.affiliation.affiliationIdentifier",
                "contributors.affiliation.affiliationIdentifier",
                "creators.nameIdentifiers.nameIdentifier",
                "contributors.nameIdentifiers.nameIdentifier",
                "fundingReferences.funderIdentifier"
            ]
            query_parts.extend([f'{field}:"{self.ror}"' for field in ror_fields])
            # nameIdentifiers are formated without https://ror.org/ prefix from some sources, so we need to check both
            # NOTE(review): split(...)[1] raises IndexError when self.ror lacks
            # the https://ror.org/ prefix; the CLI's validate_ror_id guarantees it.
            query_parts.extend([f'{field}:"{self.ror.split("https://ror.org/")[1]}"' for field in ror_fields])

        return " OR ".join(query_parts)

    def api_request_url(self, page_size: int = None) -> str:
        """Return the full /dois request URL (cursor-paginated, page 1)."""
        if page_size is None:
            page_size = self.page_size
        params = urllib.parse.urlencode({
            'page[size]': page_size,
            'page[cursor]': '1',
            'affiliation': 'true',
            'publisher': 'true',
            'detail': 'true',
            'disable-facets': 'false',
            'query': self.get_query_string()
        })
        return f"https://api.datacite.org/dois?{params}"

    @staticmethod
    def get_api_result(url: str) -> dict:
        """GET *url* with the project User-Agent and return the parsed JSON.

        Raises RuntimeError on network, decode, or key errors.
        """
        request = urllib.request.Request(url)
        version = get_roagg_version()
        request.add_header('User-Agent', f'ResearchOutputAggregator/{version} (https://github.com/snd-sweden/research-output-aggregator; mailto:team-it@snd.se)')
        try:
            with urllib.request.urlopen(request) as response:
                return json.loads(response.read())
        except (urllib.error.URLError, json.JSONDecodeError, KeyError) as e:
            raise RuntimeError(f"Failed run DataCite query: {e}")

    def get_record(self, item: dict) -> ResearchOutputItem:
        """Map one raw DataCite DOI record (JSON:API item) to a ResearchOutputItem."""
        attributes = item.get("attributes", {})
        publisher_attr = attributes.get("publisher", {})
        # version counts may be present-but-null; coerce null to 0
        versionCount = 0 if attributes.get("versionCount", {}) is None else int(attributes.get("versionCount", {}))
        versionOfCount = 0 if attributes.get("versionOfCount", {}) is None else int(attributes.get("versionOfCount", {}))

        record = ResearchOutputItem(
            doi=attributes.get("doi"),
            dataCiteClientId=item["relationships"]["client"]["data"]["id"],
            # NOTE(review): .get("types", None).get(...) raises AttributeError
            # if "types" is absent — presumably always present in API output.
            resourceType=attributes.get("types", None).get("resourceTypeGeneral"),
            publisher=publisher_attr.get("name"),
            publicationYear=attributes.get("publicationYear"),
            title=item["attributes"]["titles"][0]["title"],
            inDataCite=True,
            dataCiteCitationCount=attributes.get("citationCount", None),
            dataCiteReferenceCount=attributes.get("referenceCount", None),
            dataCiteViewCount=attributes.get("viewCount", None),
            dataCiteDownloadCount=attributes.get("downloadCount", None),
            titleWordCount=string_word_count(item["attributes"]["titles"][0]["title"])
        )

        # Fall back through alternative type encodings when the general one is empty.
        if record.resourceType is None or record.resourceType == "":
            record.resourceType = attributes.get("types", {}).get("citeproc")
        if record.resourceType is None or record.resourceType == "":
            record.resourceType = attributes.get("types", {}).get("bibtex")

        # The organization is "the publisher" by ROR identifier or name pattern.
        record.isPublisher = (
            publisher_attr.get("publisherIdentifier") == self.ror or
            match_patterns(publisher_attr.get("name"), self.name)
        )

        # First DOI that references / supplements / derives from this record.
        related = [
            r for r in item["attributes"].get("relatedIdentifiers", [])
            if (r.get("relationType") == "IsReferencedBy" or r.get("relationType") == "IsSupplementTo" or r.get("relationType") == "IsSourceOf") and r.get("relatedIdentifierType") == "DOI"
        ]
        if related and len(related) > 0:
            record.referencedByDoi = related[0].get("relatedIdentifier")
        else:
            record.referencedByDoi = None

        record.createdAt = str(attributes.get("created", "") or "")

        # Lexicographic max works because these are ISO-8601-style timestamps
        # — TODO confirm the API always returns ISO format here.
        record.updatedAt = max([
            str(attributes.get("updated", "") or ""),
            str(attributes.get("created", "") or ""),
            str(attributes.get("registered", "") or "")
        ])

        # Any "newer version exists" relation marks this record as superseded.
        for relation in attributes.get("relatedIdentifiers", []):
            if relation.get("relationType") == "IsPreviousVersionOf":
                record.isLatestVersion = False
            if relation.get("relationType") == "HasVersion":
                record.isLatestVersion = False

        # Concept DOI: has versions but is not itself a version of anything.
        record.isConceptDoi = (
            versionCount > 0 and
            versionOfCount == 0
        )

        record.haveCreatorAffiliation = self.check_agent_list_match(attributes.get("creators", []))
        record.haveContributorAffiliation = self.check_agent_list_match(attributes.get("contributors", []))
        return record

    def check_agent_list_match(self, items: list) -> bool:
        """True when any creator/contributor matches the organization by
        ROR id (full or bare), name pattern, or affiliation."""
        partial_ror = self.ror.split("https://ror.org/")[1] if self.ror else ""
        for agent in items:
            # Check if any nameIdentifier matches the ror
            if any(identifier.get("nameIdentifier") == self.ror for identifier in agent.get("nameIdentifiers", [])):
                return True
            # Check if any nameIdentifier matches the partial ror
            if any(identifier.get("nameIdentifier") == partial_ror for identifier in agent.get("nameIdentifiers", [])):
                return True
            # Check if the agent name matches any pattern
            if match_patterns(agent.get("name"), self.name):
                return True
            # Check each affiliation
            for affiliation in agent.get("affiliation", []):
                if (affiliation.get("affiliationIdentifier") == self.ror or
                        match_patterns(affiliation.get("name"), self.name)):
                    return True
        return False

    def all(self) -> list:
        """Fetch every matching DOI record, following cursor pagination."""
        result = []
        url = self.api_request_url()
        while True:
            response = self.get_api_result(url)
            result.extend(response["data"])
            logging.info(f"Retrieved DataCite {len(result)} of {response['meta']['total']}")
            if response['links'].get('next'):
                url = response['links']['next']
            else:
                break
        return result

    def count(self) -> int:
        """Return the total number of matching records (0 for an empty query)."""
        if not self.get_query_string():
            return 0
        # page_size=0 asks only for the meta/total, no record bodies.
        url = self.api_request_url(page_size=0)
        return self.get_api_result(url)["meta"]["total"]
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import urllib.request
|
|
3
|
+
import logging
|
|
4
|
+
import json
|
|
5
|
+
from roagg.models.research_output_item import ResearchOutputItem
|
|
6
|
+
from roagg.helpers.utils import find_doi_in_text, is_valid_doi, string_word_count
|
|
7
|
+
|
|
8
|
+
class OpenAireAPI:
    """Enrich/extend a shared list of ResearchOutputItem with data from the
    OpenAIRE Graph API, matching records by DOI.

    NOTE(review): ``urllib.parse`` is only available here transitively via
    ``import urllib.request`` — consider importing it explicitly.
    """

    openaire_base_url = "https://api.openaire.eu/graph/v1/"

    def __init__(self, page_size: int = 100, ror: str = "", results: List[ResearchOutputItem] = []):
        # NOTE(review): mutable default argument for `results`; aggregate()
        # always passes the shared list explicitly, which get_records()
        # mutates in place.
        self.page_size = page_size
        self.ror = ror
        self.results = results

    def get_openaire_id_from_ror(self) -> str:
        """Resolve the ROR id to an OpenAIRE organization id ("" if none)."""
        url = f"{self.openaire_base_url}organizations?pid={self.ror}"
        with urllib.request.urlopen(url) as response:
            json_response = json.loads(response.read())

        if 'results' in json_response and len(json_response['results']) > 0:
            return json_response['results'][0]['id']
        else:
            return ""

    def get_records(self) -> List[ResearchOutputItem]:
        """Fetch OpenAIRE datasets for the organization and merge into self.results.

        Existing items (matched by DOI, case-insensitive) are enriched in
        place; unmatched OpenAIRE records with a DOI are appended as new
        items. Returns the raw OpenAIRE result dicts.
        """
        if not self.ror:
            return []
        openaire_results = []
        openaire_id = self.get_openaire_id_from_ror()

        if not openaire_id:
            logging.info(f"No OpenAire ID found for ROR {self.ror}")
            return []

        params = {
            'pageSize': self.page_size,
            'cursor': '*',
            'type': 'dataset',  # limit to only datasets for now
            'relOrganizationId': openaire_id
        }
        retrieve_count = 0
        # Cursor pagination: loop until the API stops returning a nextCursor.
        while True:
            query_string = urllib.parse.urlencode(params)
            url = f"{self.openaire_base_url}researchProducts?{query_string}"
            with urllib.request.urlopen(url) as response:
                json_response = json.loads(response.read())
                if 'results' in json_response:
                    openaire_results.extend(json_response['results'])

                retrieve_count = len(openaire_results)
                logging.info(f"Retrieved OpenAire {retrieve_count} of {json_response['header']['numFound']}")

                if 'nextCursor' in json_response['header'] and json_response['header']['nextCursor']:
                    params['cursor'] = json_response['header']['nextCursor']
                else:
                    break

        # Create a dictionary for O(1) lookups
        doi_to_item = {item.doi.lower(): item for item in self.results if item.doi}

        for r in openaire_results:
            openAireBestAccessRight = None
            if 'bestAccessRight' in r and r['bestAccessRight'] and 'label' in r['bestAccessRight']:
                openAireBestAccessRight = r['bestAccessRight']['label']

            openAireIndicatorsUsageCountsDownloads = None
            if 'indicators' in r and r['indicators'] and 'usageCounts' in r['indicators']:
                if 'downloads' in r['indicators']['usageCounts']:
                    openAireIndicatorsUsageCountsDownloads = r['indicators']['usageCounts']['downloads']

            openAireIndicatorsUsageCountsViews = None
            if 'indicators' in r and r['indicators'] and 'usageCounts' in r['indicators']:
                if 'views' in r['indicators']['usageCounts']:
                    openAireIndicatorsUsageCountsViews = r['indicators']['usageCounts']['views']

            # One OpenAIRE record can carry several DOIs; enrich every match.
            dois = self.get_doi_list_from_resource(r)
            recordMatch = False
            for doi in dois:
                item = doi_to_item.get(doi.lower())
                if item:
                    recordMatch = True
                    item.openAireBestAccessRight = openAireBestAccessRight
                    item.openAireIndicatorsUsageCountsDownloads = openAireIndicatorsUsageCountsDownloads
                    item.openAireIndicatorsUsageCountsViews = openAireIndicatorsUsageCountsViews
                    item.inOpenAire = True
            # No existing item matched: append a new OpenAIRE-only item
            # (tri-state fields unknown to OpenAIRE stay None).
            if not recordMatch and len(dois) > 0:
                publication_date = r.get('publicationDate', None)
                publication_year = None
                if publication_date:
                    publication_year = publication_date[:4] if len(publication_date) >= 4 else None
                item = ResearchOutputItem(
                    doi=dois[0],
                    isPublisher=None,
                    resourceType=r.get('type', None),
                    title=r.get('mainTitle', None),
                    publisher=r.get('publisher', None),
                    publicationYear=publication_year,
                    haveContributorAffiliation=None,
                    haveCreatorAffiliation=None,
                    isLatestVersion=None,
                    isConceptDoi=None,
                    inOpenAire=True,
                    openAireBestAccessRight=openAireBestAccessRight,
                    openAireIndicatorsUsageCountsDownloads=openAireIndicatorsUsageCountsDownloads,
                    openAireIndicatorsUsageCountsViews=openAireIndicatorsUsageCountsViews,
                    openAireId=r.get('id', None),
                    titleWordCount=string_word_count(r.get('mainTitle', None))
                )
                self.results.append(item)
                doi_to_item[item.doi.lower()] = item  # Add to lookup dictionary

        return openaire_results

    def get_doi_list_from_resource(self, resource: dict) -> List[str]:
        """Collect the distinct DOIs of one OpenAIRE resource.

        Looks in instance pids, then alternateIdentifiers; if still empty,
        tries to extract DOIs from instance URLs (normalizing known
        repository URL shapes to doi.org form first).
        """
        doi_list = []

        for instance in resource['instances']:
            logging.debug(f"Instance: {instance}")

            if 'pids' in instance and len(instance['pids']) > 0:
                for pid in instance['pids']:
                    if pid['scheme'].lower() == 'doi':
                        doi_list.append(pid['value'])

            if 'alternateIdentifiers' in instance and len(instance['alternateIdentifiers']) > 0:
                for alternateIdentifier in instance['alternateIdentifiers']:
                    if alternateIdentifier['scheme'].lower() == 'doi':
                        doi_list.append(alternateIdentifier['value'])

            if len(doi_list) == 0:
                # Normalize URLs to standard DOI format
                url_replacements = [
                    ("https://doi.pangaea.de/", "https://doi.org/"),
                    ("https://zenodo.org/doi/", "https://doi.org/"),
                    ("https://zenodo.org/records/", "https://doi.org/10.5281/zenodo.")
                ]

                for url in instance['urls']:
                    normalized_url = url
                    for old_pattern, new_pattern in url_replacements:
                        normalized_url = normalized_url.replace(old_pattern, new_pattern)

                    for doi in find_doi_in_text(normalized_url):
                        if is_valid_doi(doi):
                            doi_list.append(doi)

        # if doi_list is empty print json for instances
        if len(doi_list) == 0:
            logging.warning(f"No DOI found in resource: {json.dumps(resource['instances'], indent=2)}")

        return list(set(doi_list))
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import urllib.request
|
|
3
|
+
import logging
|
|
4
|
+
import json
|
|
5
|
+
from roagg.models.research_output_item import ResearchOutputItem
|
|
6
|
+
from roagg.helpers.utils import string_word_count, remove_resolver_prefix_from_doi
|
|
7
|
+
|
|
8
|
+
class OpenAlexAPI:
    """Enrich/extend a shared list of ResearchOutputItem with data from the
    OpenAlex works API, matching records by DOI.

    NOTE(review): ``urllib.parse`` is only available here transitively via
    ``import urllib.request`` — consider importing it explicitly.
    """

    openalex_base_url = "https://api.openalex.org/"

    def __init__(self, page_size: int = 200, ror: str = "", results: List[ResearchOutputItem] = []):
        # NOTE(review): mutable default argument for `results`; aggregate()
        # always passes the shared list explicitly, which get_records()
        # mutates in place.
        self.page_size = page_size
        self.ror = ror
        self.results = results

    def get_openalex_id_from_ror(self) -> str:
        """Resolve the ROR id to an OpenAlex institution id ("" if none)."""
        url = f"{self.openalex_base_url}institutions/ror:{self.ror}"
        with urllib.request.urlopen(url) as response:
            json_response = json.loads(response.read())

        if 'id' in json_response:
            return json_response['id']
        else:
            return ""

    def get_records(self) -> List[ResearchOutputItem]:
        """Fetch OpenAlex datasets for the institution and merge into self.results.

        Existing items (matched by DOI, case-insensitive) are enriched in
        place; unmatched OpenAlex works with a DOI are appended as new items.
        Returns the raw OpenAlex result dicts.
        """
        if not self.ror:
            return []
        openalex_results = []
        openalex_id = self.get_openalex_id_from_ror()

        if not openalex_id:
            logging.info(f"No OpenAlex ID found for ROR {self.ror}")
            return []

        params = {
            'per-page': self.page_size,
            'cursor': '*',
            'filter': f'institutions.id:{openalex_id},type:dataset'  # limit to only datasets for now
        }
        retrieve_count = 0

        # Cursor pagination: loop until meta.next_cursor is absent/empty.
        while True:
            query_string = urllib.parse.urlencode(params)
            url = f"{self.openalex_base_url}works?{query_string}"
            with urllib.request.urlopen(url) as response:
                json_response = json.loads(response.read())
                if 'results' in json_response:
                    openalex_results.extend(json_response['results'])
                    retrieve_count = len(openalex_results)
                logging.info(f"Retrieved OpenAlex {retrieve_count} of {json_response['meta']['count']}")

                if 'next_cursor' in json_response['meta'] and json_response['meta']['next_cursor']:
                    params['cursor'] = json_response['meta']['next_cursor']
                else:
                    break

        # Create a dictionary for O(1) lookups
        doi_to_item = {item.doi.lower(): item for item in self.results if item.doi}

        for r in openalex_results:
            openAlexCitedByCount = None
            if 'cited_by_count' in r:
                openAlexCitedByCount = r['cited_by_count']

            openAlexReferencedWorksCount = None
            if 'referenced_works_count' in r:
                openAlexReferencedWorksCount = r['referenced_works_count']

            haveCreatorAffiliation = False

            # NOTE(review): OpenAlex works expose author affiliations under
            # 'authorships' (each with an 'institutions' list); r.get(
            # 'institutions', []) at the top level looks like it is always
            # empty, so haveCreatorAffiliation may never become True —
            # confirm the intended key.
            for authorship in r.get('institutions', []):
                for affiliation in authorship.get('institutions', []):
                    if affiliation.get('ror') == self.ror:
                        haveCreatorAffiliation = True
                        break

            # OpenAlex returns DOIs as resolver URLs; strip to the bare DOI.
            doi = remove_resolver_prefix_from_doi(r.get('doi', None))
            if doi is None:
                continue

            recordMatch = False

            item = doi_to_item.get(doi.lower())
            if item:
                recordMatch = True
                item.openAlexCitedByCount = openAlexCitedByCount
                item.openAlexReferencedWorksCount = openAlexReferencedWorksCount
                item.inOpenAlex = True
                item.openAlexId = r.get('id', None)
                item.haveCreatorAffiliation = haveCreatorAffiliation
            # No existing item matched: append a new OpenAlex-only item
            # (tri-state fields unknown to OpenAlex stay None).
            if not recordMatch:
                publication_date = r.get('publication_date', None)
                publication_year = r.get('publication_year', None)
                if publication_date:
                    publication_year = publication_date[:4] if len(publication_date) >= 4 else None
                item = ResearchOutputItem(
                    doi=doi,
                    isPublisher=None,
                    resourceType=r.get('type', None),
                    title=r.get('title', None),
                    publisher=None,
                    publicationYear=publication_year,
                    createdAt=r.get('created_date', None),
                    updatedAt=r.get('updated_date', None),
                    haveContributorAffiliation=None,
                    haveCreatorAffiliation=haveCreatorAffiliation,
                    isLatestVersion=None,
                    isConceptDoi=None,
                    inOpenAlex=True,
                    openAlexCitedByCount=openAlexCitedByCount,
                    openAlexReferencedWorksCount=openAlexReferencedWorksCount,
                    openAlexId=r.get('id', None),
                    titleWordCount=string_word_count(r.get('title', None))
                )
                self.results.append(item)
                doi_to_item[item.doi.lower()] = item  # Add to lookup dictionary

        return openalex_results
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
roagg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
roagg/__main__.py,sha256=VmfwfqdlUj_9LkmCrYVET_GCM6zgNkrOZHEE-DqlPIE,68
|
|
3
|
+
roagg/aggregator.py,sha256=iZjchp9ldO6LJqhWlCWTXrcGEVuhXTNEmzRuuKRLRTU,2318
|
|
4
|
+
roagg/cli.py,sha256=FGaqOL17-fmKfa0UDHrOTIbqrDVIAT7WHlo3_1qvbTk,2677
|
|
5
|
+
roagg/helpers/ror.py,sha256=6m3u0ke0Guwe6xBkKZdRRR2882pcx5eN9C7EK0Yr7o0,488
|
|
6
|
+
roagg/helpers/utils.py,sha256=0hsImpjIIKZf4rw8i3_wOWcrgh3MuwJncx2WjvB1j24,1839
|
|
7
|
+
roagg/models/research_output_item.py,sha256=ioYn6iYsUSNPK84WLA3_2p9Hd-PyIhwH2luINa0A63M,1599
|
|
8
|
+
roagg/providers/datacite.py,sha256=gHuUY_OXzguHxB-oA9da7cH4QHNlC4tQIaSq0cDWpwE,8112
|
|
9
|
+
roagg/providers/openaire.py,sha256=aMC_n7sae4y9ihX-fzRljMPJFRWDRxeAUSA9iVfNrAw,7012
|
|
10
|
+
roagg/providers/openalex.py,sha256=_5CD0LzoJ6kLDzQFFAJz-8ZZ7H77kNTFSRWzYtKsm_A,4938
|
|
11
|
+
roagg-2025.0.8.dist-info/licenses/LICENSE,sha256=Qhd6QZcm7nnbZ3A9bCnqMYbWec83s20vAA_NWmEGwsE,1086
|
|
12
|
+
roagg-2025.0.8.dist-info/METADATA,sha256=srldfXcHDD5N50zlhcm6pp8Lnf7agOPL3vs8Kz4_TR4,120
|
|
13
|
+
roagg-2025.0.8.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
14
|
+
roagg-2025.0.8.dist-info/entry_points.txt,sha256=t-eLnlkWaxvQQ_LQRAVmYXK5A-fJdUs-dQRZEQt1DsQ,76
|
|
15
|
+
roagg-2025.0.8.dist-info/top_level.txt,sha256=qPVTgmR5hG3qUkccGNGEo3F4VqjIsg7Caa50G6HqQ1k,6
|
|
16
|
+
roagg-2025.0.8.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Swedish National Data Service
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
roagg
|