roagg 2025.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
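
For orientation, a minimal usage sketch of what this package does (the organization name and ROR ID below are placeholders; the function and its parameters come from roagg/aggregator.py, and the equivalent console script and flags from roagg/cli.py and entry_points.txt):

    from roagg.aggregator import aggregate

    # roughly equivalent to: roagg --name "Example University" --ror https://ror.org/012345678 --output data/output.csv
    aggregate(name=["Example University"], ror="https://ror.org/012345678", output="data/output.csv")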
roagg/__init__.py ADDED
(empty file)
roagg/__main__.py ADDED
@@ -0,0 +1,3 @@
+ if __name__ == "__main__":
+     from roagg.cli import main
+     main()
roagg/aggregator.py ADDED
@@ -0,0 +1,69 @@
+ import csv
+ import logging
+ from dataclasses import fields
+ from typing import List, Optional
+ from roagg.helpers.ror import get_names_from_ror
+ from roagg.providers.datacite import DataCiteAPI
+ from roagg.providers.openaire import OpenAireAPI
+ from roagg.providers.openalex import OpenAlexAPI
+ from roagg.models.research_output_item import ResearchOutputItem
+
+ def aggregate(name: Optional[List[str]] = None, ror: str = "", output: str = "output.csv") -> None:
+     # copy the input to avoid mutating a shared default or the caller's list
+     name = list(name) if name else []
+     if ror:
+         name.extend(get_names_from_ror(ror))
+
+     # remove duplicates
+     name = list(set(name))
+
+     datacite = DataCiteAPI(name=name, ror=ror)
+     url = datacite.api_request_url()
+     # debug print of the query string
+     logging.info("DataCite url:")
+     logging.info(url)
+
+     records = datacite.all()
+     research_output_items = []
+     logging.info(f"Checking {len(records)} records...")
+     for record in records:
+         research_output_items.append(datacite.get_record(record))
+
+     openaire = OpenAireAPI(ror=ror, results=research_output_items)
+     openaire_id = openaire.get_openaire_id_from_ror()
+     logging.info(f"OpenAire ID from ROR {ror}: {openaire_id}")
+     openaire.get_records()
+
+     openalex = OpenAlexAPI(ror=ror, results=research_output_items)
+     openalex_id = openalex.get_openalex_id_from_ror()
+     logging.info(f"OpenAlex ID from ROR {ror}: {openalex_id}")
+     openalex.get_records()
+
+     logging.info(f"Writing: {output}")
+
+     write_csv(research_output_items, output)
+     logging.info(f"Writing output to csv: {output} - Done")
+
+ def write_csv(records: List[ResearchOutputItem], output: str) -> None:
+     # Get field names from the dataclass
+     dataclass_fields = fields(ResearchOutputItem)
+     header = [field.name for field in dataclass_fields]
+
+     def format_value(value):
+         """Format values for CSV output."""
+         if value is None:
+             return ""
+         elif isinstance(value, bool):
+             return 1 if value else 0
+         else:
+             return value
+
+     with open(output, 'w', newline='', encoding='utf-8') as file:
+         writer = csv.writer(file)
+         writer.writerow(header)
+
+         writer.writerows([
+             [format_value(getattr(record, field.name)) for field in dataclass_fields]
+             for record in records
+         ])
roagg/cli.py ADDED
@@ -0,0 +1,91 @@
+ #!/usr/bin/env python3
+ from typing import List
+ import argparse
+ import logging
+ import sys
+ from pathlib import Path
+ from roagg.helpers.utils import get_roagg_version
+ from roagg.aggregator import aggregate
+
+ def validate_ror_id(ror_id: str) -> str:
+     """Validate the ROR ID format (must start with https://ror.org/)."""
+     if not ror_id.startswith('https://ror.org/'):
+         raise argparse.ArgumentTypeError("ROR ID must start with 'https://ror.org/'")
+     return ror_id
+
+ def read_names_from_file(filepath: Path) -> List[str]:
+     """Read organization names from a file, one per line."""
+     try:
+         return [line.strip() for line in filepath.read_text().splitlines() if line.strip()]
+     except IOError as e:
+         logging.error(f"Failed to read names file: {e}")
+         sys.exit(1)
+
+ def main() -> None:
+     """Create a summary CSV file of all research output for an organization."""
+     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+
+     parser = argparse.ArgumentParser(
+         description="aggregate research outputs for an organization into a CSV file",
+         formatter_class=argparse.RawDescriptionHelpFormatter
+     )
+
+     parser.add_argument(
+         "--version",
+         action="version",
+         version=get_roagg_version()
+     )
+
+     parser.add_argument(
+         "--name",
+         type=str,
+         action='append',
+         help="name variant of the organization (can be used multiple times)"
+     )
+     parser.add_argument(
+         "--name-txt",
+         type=Path,
+         help="path to a text file containing organization name variants (one per line)"
+     )
+
+     parser.add_argument(
+         "--ror",
+         type=validate_ror_id,
+         help="ROR ID of the organization (must start with https://ror.org/)"
+     )
+
+     parser.add_argument(
+         "--source",
+         default="api",
+         choices=["api"],
+         help="source for resource aggregation (only api is supported right now)"
+     )
+
+     parser.add_argument(
+         "--output",
+         default="data/output.csv",
+         help="name of the output file (default: data/output.csv)"
+     )
+
+     args = parser.parse_args()
+
+     # show the help text and exit if none of --name, --name-txt or --ror is provided
+     if not any([args.name, args.name_txt, args.ror]):
+         parser.print_help()
+         sys.exit(1)
+
+     names: List[str] = []
+     if args.name:
+         names = list(args.name)
+
+     if args.name_txt:
+         names.extend(read_names_from_file(args.name_txt))
+
+     try:
+         aggregate(names, args.ror or "", args.output)
+     except Exception as e:
+         logging.error(f"Aggregation failed: {e}")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
roagg/helpers/ror.py ADDED
@@ -0,0 +1,14 @@
+ import urllib.request
+ import json
+ from typing import List
+
+ def get_ror_info(ror: str) -> dict:
+     ror_id = ror.split('/')[-1]
+     url = f"https://api.ror.org/v2/organizations/{ror_id}"
+     with urllib.request.urlopen(url) as response:
+         return json.loads(response.read())
+
+ def get_names_from_ror(ror: str) -> List[str]:
+     names = get_ror_info(ror)['names']
+     valid_types = {'alias', 'ror_display', 'label'}
+     return [n['value'] for n in names if valid_types.intersection(n['types'])]
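+
+ # Illustrative ROR v2 payload shape (hypothetical values):
+ # {"names": [{"value": "Example University", "types": ["ror_display", "label"]}, ...]}
+ # get_names_from_ror would return ["Example University"] for that payload.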
roagg/helpers/utils.py ADDED
@@ -0,0 +1,65 @@
+ import importlib.metadata
+ import re
+
+ doi_pattern = re.compile(r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$', re.IGNORECASE)
+
+ def is_valid_doi(s: str) -> bool:
+     return bool(doi_pattern.match(s))
+
+ def find_doi_in_text(text: str) -> list[str]:
+     return re.findall(r'\b10\.\d{4,9}/[-.;()/:\w]+', text)
+
+ def remove_resolver_prefix_from_doi(doi: str | None) -> str | None:
+     if doi is None:
+         return None
+     prefixes = [
+         "https://doi.org/",
+         "http://doi.org/",
+         "doi.org/",
+         "https://dx.doi.org/",
+         "http://dx.doi.org/",
+         "dx.doi.org/"
+     ]
+     for prefix in prefixes:
+         if doi.lower().startswith(prefix):
+             return doi[len(prefix):]
+     return doi
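+
+ # Illustrative: remove_resolver_prefix_from_doi("https://doi.org/10.1234/abc")
+ # returns "10.1234/abc"; a bare DOI such as "10.1234/abc" is returned unchanged.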
+
+ def match_patterns(string, patterns):
+     if string is None:
+         return False
+
+     for pattern in patterns:
+         # if the pattern contains no wildcard, check whether the string contains it
+         if '*' not in pattern and '?' not in pattern:
+             if pattern.lower() in string.lower():
+                 return True
+             continue
+
+         if re.match(pattern_to_regexp(pattern), string, re.IGNORECASE):
+             return True
+     return False
+
+ def pattern_to_regexp(pattern: str) -> str:
+     """Translate a glob-style pattern ('*' and '?') into an anchored regular expression."""
+     regex = ""
+     for char in pattern:
+         if char == '*':
+             regex += '.*'
+         elif char == '?':
+             regex += '.'
+         else:
+             regex += re.escape(char)
+     return '^' + regex + '$'
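+
+ # Illustrative: pattern_to_regexp("univer?it*") returns '^univer.it.*$',
+ # which match_patterns then applies case-insensitively via re.match.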
+
+ def get_roagg_version() -> str:
+     """Get package version from metadata."""
+     try:
+         return importlib.metadata.version("roagg")
+     except importlib.metadata.PackageNotFoundError:
+         return "unknown"
+
+ def string_word_count(string: str) -> int:
+     """Count words in a string after trimming whitespace."""
+     if not string:
+         return 0
+     return len(string.strip().split())
roagg/models/research_output_item.py ADDED
@@ -0,0 +1,51 @@
+ from dataclasses import dataclass
+
+ @dataclass
+ class ResearchOutputItem:
+     doi: str
+     publicationYear: int | None = None
+     resourceType: str | None = None
+     title: str | None = None
+     publisher: str | None = None
+     createdAt: str = ""
+     updatedAt: str = ""
+     isPublisher: bool | None = False
+     isFunder: bool | None = None
+     haveCreatorAffiliation: bool | None = False
+     haveContributorAffiliation: bool | None = False
+     isLatestVersion: bool | None = True
+     isConceptDoi: bool | None = False
+     # match on ROR
+     matchPublisherRor: bool = False
+     matchCreatorAffiliationRor: bool = False
+     matchContributorAffiliationRor: bool = False
+     matchFunderRor: bool = False
+     # match on free text in name
+     matchPublisherName: bool = False
+     matchCreatorName: bool = False
+     matchContributorName: bool = False
+     matchFunderName: bool = False
+     # where was the match found
+     inDataCite: bool | None = None
+     inOpenAire: bool | None = None
+     inOpenAlex: bool | None = None
+     inCrossRef: bool | None = None
+     # datacite specific
+     dataCiteClientId: str | None = None
+     dataCiteClientName: str | None = None
+     dataCiteCitationCount: int | None = None
+     dataCiteReferenceCount: int | None = None
+     dataCiteViewCount: int | None = None
+     dataCiteDownloadCount: int | None = None
+     # openaire specific
+     openAireBestAccessRight: str | None = None
+     openAireIndicatorsUsageCountsDownloads: int | None = None
+     openAireIndicatorsUsageCountsViews: int | None = None
+     openAireId: str | None = None
+     # openalex specific
+     openAlexId: str | None = None
+     openAlexCitedByCount: int | None = None
+     openAlexReferencedWorksCount: int | None = None
+     # extra fields
+     titleWordCount: int | None = None
+     referencedByDoi: str | None = None
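+
+ # Illustrative: ResearchOutputItem(doi="10.1234/abc", title="Example", inDataCite=True)
+ # leaves the provider-specific fields at None, which write_csv emits as empty cells.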
roagg/providers/datacite.py ADDED
@@ -0,0 +1,178 @@
+ from typing import List, Optional
+ import urllib.error
+ import urllib.parse
+ import urllib.request
+ import logging
+ import json
+ from roagg.helpers.utils import get_roagg_version
+ from roagg.models.research_output_item import ResearchOutputItem
+ from roagg.helpers.utils import match_patterns, string_word_count
+
+ class DataCiteAPI:
+     def __init__(self, page_size: int = 500, name: Optional[List[str]] = None, ror: str = ""):
+         self.page_size = page_size
+         self.name = name or []
+         self.ror = ror
+
+     def get_query_string(self) -> str:
+         if not self.name and not self.ror:
+             return ""
+
+         query_parts = []
+
+         if self.name:
+             # Separate wildcard and exact matches; escape spaces in wildcard queries
+             wildcard = ' OR '.join(n.replace(" ", "\\ ") for n in self.name if '*' in n)
+             exact = ' OR '.join(f'"{n}"' for n in self.name if '*' not in n)
+             name_fields = [
+                 "creators.affiliation.name",
+                 "contributors.affiliation.name",
+                 "publisher.name"
+             ]
+
+             if wildcard and exact:
+                 name_conditions = f'{wildcard} OR {exact}'
+             else:
+                 name_conditions = wildcard or exact
+
+             query_parts.extend([f"{field}:({name_conditions})" for field in name_fields])
+
+         if self.ror:
+             ror_fields = [
+                 "publisher.publisherIdentifier",
+                 "creators.affiliation.affiliationIdentifier",
+                 "contributors.affiliation.affiliationIdentifier",
+                 "creators.nameIdentifiers.nameIdentifier",
+                 "contributors.nameIdentifiers.nameIdentifier",
+                 "fundingReferences.funderIdentifier"
+             ]
+             query_parts.extend([f'{field}:"{self.ror}"' for field in ror_fields])
+             # nameIdentifiers are formatted without the https://ror.org/ prefix by some sources, so check both
+             query_parts.extend([f'{field}:"{self.ror.removeprefix("https://ror.org/")}"' for field in ror_fields])
+
+         return " OR ".join(query_parts)
+
+     def api_request_url(self, page_size: int | None = None) -> str:
+         if page_size is None:
+             page_size = self.page_size
+         params = urllib.parse.urlencode({
+             'page[size]': page_size,
+             'page[cursor]': '1',
+             'affiliation': 'true',
+             'publisher': 'true',
+             'detail': 'true',
+             'disable-facets': 'false',
+             'query': self.get_query_string()
+         })
+         return f"https://api.datacite.org/dois?{params}"
+
+     @staticmethod
+     def get_api_result(url: str) -> dict:
+         request = urllib.request.Request(url)
+         version = get_roagg_version()
+         request.add_header('User-Agent', f'ResearchOutputAggregator/{version} (https://github.com/snd-sweden/research-output-aggregator; mailto:team-it@snd.se)')
+         try:
+             with urllib.request.urlopen(request) as response:
+                 return json.loads(response.read())
+         except (urllib.error.URLError, json.JSONDecodeError, KeyError) as e:
+             raise RuntimeError(f"Failed to run DataCite query: {e}")
+
+     def get_record(self, item: dict) -> ResearchOutputItem:
+         attributes = item.get("attributes", {})
+         publisher_attr = attributes.get("publisher", {}) or {}
+         versionCount = int(attributes.get("versionCount") or 0)
+         versionOfCount = int(attributes.get("versionOfCount") or 0)
+         title = (attributes.get("titles") or [{}])[0].get("title")
+
+         record = ResearchOutputItem(
+             doi=attributes.get("doi"),
+             dataCiteClientId=item["relationships"]["client"]["data"]["id"],
+             resourceType=(attributes.get("types") or {}).get("resourceTypeGeneral"),
+             publisher=publisher_attr.get("name"),
+             publicationYear=attributes.get("publicationYear"),
+             title=title,
+             inDataCite=True,
+             dataCiteCitationCount=attributes.get("citationCount"),
+             dataCiteReferenceCount=attributes.get("referenceCount"),
+             dataCiteViewCount=attributes.get("viewCount"),
+             dataCiteDownloadCount=attributes.get("downloadCount"),
+             titleWordCount=string_word_count(title)
+         )
+
+         # fall back to citeproc, then bibtex, when resourceTypeGeneral is missing
+         if not record.resourceType:
+             record.resourceType = (attributes.get("types") or {}).get("citeproc")
+         if not record.resourceType:
+             record.resourceType = (attributes.get("types") or {}).get("bibtex")
+
+         record.isPublisher = (
+             publisher_attr.get("publisherIdentifier") == self.ror or
+             match_patterns(publisher_attr.get("name"), self.name)
+         )
+
+         related = [
+             r for r in attributes.get("relatedIdentifiers", [])
+             if r.get("relationType") in ("IsReferencedBy", "IsSupplementTo", "IsSourceOf")
+             and r.get("relatedIdentifierType") == "DOI"
+         ]
+         record.referencedByDoi = related[0].get("relatedIdentifier") if related else None
+
+         record.createdAt = str(attributes.get("created", "") or "")
+
+         # the most recent of the updated, created and registered timestamps
+         record.updatedAt = max([
+             str(attributes.get("updated", "") or ""),
+             str(attributes.get("created", "") or ""),
+             str(attributes.get("registered", "") or "")
+         ])
+
+         for relation in attributes.get("relatedIdentifiers", []):
+             if relation.get("relationType") in ("IsPreviousVersionOf", "HasVersion"):
+                 record.isLatestVersion = False
+
+         record.isConceptDoi = (
+             versionCount > 0 and
+             versionOfCount == 0
+         )
+
+         record.haveCreatorAffiliation = self.check_agent_list_match(attributes.get("creators", []))
+         record.haveContributorAffiliation = self.check_agent_list_match(attributes.get("contributors", []))
+         return record
+
+     def check_agent_list_match(self, items: list) -> bool:
+         partial_ror = self.ror.removeprefix("https://ror.org/") if self.ror else ""
+         for agent in items:
+             # Check if any nameIdentifier matches the full ROR ID
+             if any(identifier.get("nameIdentifier") == self.ror for identifier in agent.get("nameIdentifiers", [])):
+                 return True
+             # Check if any nameIdentifier matches the partial ROR ID
+             if any(identifier.get("nameIdentifier") == partial_ror for identifier in agent.get("nameIdentifiers", [])):
+                 return True
+             # Check if the agent name matches any pattern
+             if match_patterns(agent.get("name"), self.name):
+                 return True
+             # Check each affiliation
+             for affiliation in agent.get("affiliation", []):
+                 if (affiliation.get("affiliationIdentifier") == self.ror or
+                         match_patterns(affiliation.get("name"), self.name)):
+                     return True
+         return False
+
+     def all(self) -> list:
+         result = []
+         url = self.api_request_url()
+         while True:
+             response = self.get_api_result(url)
+             result.extend(response["data"])
+             logging.info(f"Retrieved DataCite {len(result)} of {response['meta']['total']}")
+             if response['links'].get('next'):
+                 url = response['links']['next']
+             else:
+                 break
+         return result
+
+     def count(self) -> int:
+         if not self.get_query_string():
+             return 0
+         url = self.api_request_url(page_size=0)
+         return self.get_api_result(url)["meta"]["total"]
roagg/providers/openaire.py ADDED
@@ -0,0 +1,155 @@
+ from typing import List, Optional
+ import urllib.parse
+ import urllib.request
+ import logging
+ import json
+ from roagg.models.research_output_item import ResearchOutputItem
+ from roagg.helpers.utils import find_doi_in_text, is_valid_doi, string_word_count
+
+ class OpenAireAPI:
+     openaire_base_url = "https://api.openaire.eu/graph/v1/"
+
+     def __init__(self, page_size: int = 100, ror: str = "", results: Optional[List[ResearchOutputItem]] = None):
+         self.page_size = page_size
+         self.ror = ror
+         # keep the caller's list so matched records are updated in place
+         self.results = results if results is not None else []
+
+     def get_openaire_id_from_ror(self) -> str:
+         url = f"{self.openaire_base_url}organizations?pid={self.ror}"
+         with urllib.request.urlopen(url) as response:
+             json_response = json.loads(response.read())
+
+         if 'results' in json_response and len(json_response['results']) > 0:
+             return json_response['results'][0]['id']
+         else:
+             return ""
+
+     def get_records(self) -> List[ResearchOutputItem]:
+         if not self.ror:
+             return []
+         openaire_results = []
+         openaire_id = self.get_openaire_id_from_ror()
+
+         if not openaire_id:
+             logging.info(f"No OpenAire ID found for ROR {self.ror}")
+             return []
+
+         params = {
+             'pageSize': self.page_size,
+             'cursor': '*',
+             'type': 'dataset',  # limit to only datasets for now
+             'relOrganizationId': openaire_id
+         }
+         retrieve_count = 0
+         while True:
+             query_string = urllib.parse.urlencode(params)
+             url = f"{self.openaire_base_url}researchProducts?{query_string}"
+             with urllib.request.urlopen(url) as response:
+                 json_response = json.loads(response.read())
+                 if 'results' in json_response:
+                     openaire_results.extend(json_response['results'])
+
+                 retrieve_count = len(openaire_results)
+                 logging.info(f"Retrieved OpenAire {retrieve_count} of {json_response['header']['numFound']}")
+
+                 if 'nextCursor' in json_response['header'] and json_response['header']['nextCursor']:
+                     params['cursor'] = json_response['header']['nextCursor']
+                 else:
+                     break
+
+         # Create a dictionary for O(1) lookups
+         doi_to_item = {item.doi.lower(): item for item in self.results if item.doi}
+
+         for r in openaire_results:
+             openAireBestAccessRight = None
+             if r.get('bestAccessRight') and 'label' in r['bestAccessRight']:
+                 openAireBestAccessRight = r['bestAccessRight']['label']
+
+             usage_counts = (r.get('indicators') or {}).get('usageCounts', {})
+             openAireIndicatorsUsageCountsDownloads = usage_counts.get('downloads')
+             openAireIndicatorsUsageCountsViews = usage_counts.get('views')
+
+             dois = self.get_doi_list_from_resource(r)
+             recordMatch = False
+             for doi in dois:
+                 item = doi_to_item.get(doi.lower())
+                 if item:
+                     recordMatch = True
+                     item.openAireBestAccessRight = openAireBestAccessRight
+                     item.openAireIndicatorsUsageCountsDownloads = openAireIndicatorsUsageCountsDownloads
+                     item.openAireIndicatorsUsageCountsViews = openAireIndicatorsUsageCountsViews
+                     item.inOpenAire = True
+             if not recordMatch and len(dois) > 0:
+                 publication_date = r.get('publicationDate')
+                 publication_year = None
+                 if publication_date and len(publication_date) >= 4:
+                     publication_year = int(publication_date[:4])
+                 item = ResearchOutputItem(
+                     doi=dois[0],
+                     isPublisher=None,
+                     resourceType=r.get('type'),
+                     title=r.get('mainTitle'),
+                     publisher=r.get('publisher'),
+                     publicationYear=publication_year,
+                     haveContributorAffiliation=None,
+                     haveCreatorAffiliation=None,
+                     isLatestVersion=None,
+                     isConceptDoi=None,
+                     inOpenAire=True,
+                     openAireBestAccessRight=openAireBestAccessRight,
+                     openAireIndicatorsUsageCountsDownloads=openAireIndicatorsUsageCountsDownloads,
+                     openAireIndicatorsUsageCountsViews=openAireIndicatorsUsageCountsViews,
+                     openAireId=r.get('id'),
+                     titleWordCount=string_word_count(r.get('mainTitle'))
+                 )
+                 self.results.append(item)
+                 doi_to_item[item.doi.lower()] = item  # Add to lookup dictionary
+
+         return openaire_results
+
+     def get_doi_list_from_resource(self, resource: dict) -> List[str]:
+         doi_list = []
+
+         for instance in resource.get('instances', []):
+             logging.debug(f"Instance: {instance}")
+
+             for pid in instance.get('pids', []):
+                 if pid['scheme'].lower() == 'doi':
+                     doi_list.append(pid['value'])
+
+             for alternateIdentifier in instance.get('alternateIdentifiers', []):
+                 if alternateIdentifier['scheme'].lower() == 'doi':
+                     doi_list.append(alternateIdentifier['value'])
+
+             if len(doi_list) == 0:
+                 # Normalize repository URLs to the standard DOI resolver form
+                 url_replacements = [
+                     ("https://doi.pangaea.de/", "https://doi.org/"),
+                     ("https://zenodo.org/doi/", "https://doi.org/"),
+                     ("https://zenodo.org/records/", "https://doi.org/10.5281/zenodo.")
+                 ]
+
+                 for url in instance.get('urls', []):
+                     normalized_url = url
+                     for old_pattern, new_pattern in url_replacements:
+                         normalized_url = normalized_url.replace(old_pattern, new_pattern)
+
+                     for doi in find_doi_in_text(normalized_url):
+                         if is_valid_doi(doi):
+                             doi_list.append(doi)
+
+         # if doi_list is still empty, log the instances for inspection
+         if len(doi_list) == 0:
+             logging.warning(f"No DOI found in resource: {json.dumps(resource.get('instances', []), indent=2)}")
+
+         return list(set(doi_list))
roagg/providers/openalex.py ADDED
@@ -0,0 +1,119 @@
+ from typing import List, Optional
+ import urllib.parse
+ import urllib.request
+ import logging
+ import json
+ from roagg.models.research_output_item import ResearchOutputItem
+ from roagg.helpers.utils import string_word_count, remove_resolver_prefix_from_doi
+
+ class OpenAlexAPI:
+     openalex_base_url = "https://api.openalex.org/"
+
+     def __init__(self, page_size: int = 200, ror: str = "", results: Optional[List[ResearchOutputItem]] = None):
+         self.page_size = page_size
+         self.ror = ror
+         # keep the caller's list so matched records are updated in place
+         self.results = results if results is not None else []
+
+     def get_openalex_id_from_ror(self) -> str:
+         url = f"{self.openalex_base_url}institutions/ror:{self.ror}"
+         with urllib.request.urlopen(url) as response:
+             json_response = json.loads(response.read())
+
+         if 'id' in json_response:
+             return json_response['id']
+         else:
+             return ""
+
+     def get_records(self) -> List[ResearchOutputItem]:
+         if not self.ror:
+             return []
+         openalex_results = []
+         openalex_id = self.get_openalex_id_from_ror()
+
+         if not openalex_id:
+             logging.info(f"No OpenAlex ID found for ROR {self.ror}")
+             return []
+
+         params = {
+             'per-page': self.page_size,
+             'cursor': '*',
+             'filter': f'institutions.id:{openalex_id},type:dataset'  # limit to only datasets for now
+         }
+         retrieve_count = 0
+
+         while True:
+             query_string = urllib.parse.urlencode(params)
+             url = f"{self.openalex_base_url}works?{query_string}"
+             with urllib.request.urlopen(url) as response:
+                 json_response = json.loads(response.read())
+                 if 'results' in json_response:
+                     openalex_results.extend(json_response['results'])
+                 retrieve_count = len(openalex_results)
+                 logging.info(f"Retrieved OpenAlex {retrieve_count} of {json_response['meta']['count']}")
+
+                 if 'next_cursor' in json_response['meta'] and json_response['meta']['next_cursor']:
+                     params['cursor'] = json_response['meta']['next_cursor']
+                 else:
+                     break
+
+         # Create a dictionary for O(1) lookups
+         doi_to_item = {item.doi.lower(): item for item in self.results if item.doi}
+
+         for r in openalex_results:
+             openAlexCitedByCount = r.get('cited_by_count')
+             openAlexReferencedWorksCount = r.get('referenced_works_count')
+
+             haveCreatorAffiliation = False
+             # works list authorships, each carrying the author's institutions
+             for authorship in r.get('authorships', []):
+                 for affiliation in authorship.get('institutions', []):
+                     if affiliation.get('ror') == self.ror:
+                         haveCreatorAffiliation = True
+                         break
+
+             doi = remove_resolver_prefix_from_doi(r.get('doi'))
+             if doi is None:
+                 continue
+
+             recordMatch = False
+
+             item = doi_to_item.get(doi.lower())
+             if item:
+                 recordMatch = True
+                 item.openAlexCitedByCount = openAlexCitedByCount
+                 item.openAlexReferencedWorksCount = openAlexReferencedWorksCount
+                 item.inOpenAlex = True
+                 item.openAlexId = r.get('id')
+                 item.haveCreatorAffiliation = haveCreatorAffiliation
+             if not recordMatch:
+                 publication_date = r.get('publication_date')
+                 publication_year = r.get('publication_year')
+                 if publication_year is None and publication_date and len(publication_date) >= 4:
+                     publication_year = int(publication_date[:4])
+                 item = ResearchOutputItem(
+                     doi=doi,
+                     isPublisher=None,
+                     resourceType=r.get('type'),
+                     title=r.get('title'),
+                     publisher=None,
+                     publicationYear=publication_year,
+                     createdAt=r.get('created_date'),
+                     updatedAt=r.get('updated_date'),
+                     haveContributorAffiliation=None,
+                     haveCreatorAffiliation=haveCreatorAffiliation,
+                     isLatestVersion=None,
+                     isConceptDoi=None,
+                     inOpenAlex=True,
+                     openAlexCitedByCount=openAlexCitedByCount,
+                     openAlexReferencedWorksCount=openAlexReferencedWorksCount,
+                     openAlexId=r.get('id'),
+                     titleWordCount=string_word_count(r.get('title'))
+                 )
+                 self.results.append(item)
+                 doi_to_item[item.doi.lower()] = item  # Add to lookup dictionary
+
+         return openalex_results
roagg-2025.0.8.dist-info/METADATA ADDED
@@ -0,0 +1,6 @@
+ Metadata-Version: 2.4
+ Name: roagg
+ Version: 2025.0.8
+ Requires-Python: >=3.10
+ License-File: LICENSE
+ Dynamic: license-file
roagg-2025.0.8.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ roagg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ roagg/__main__.py,sha256=VmfwfqdlUj_9LkmCrYVET_GCM6zgNkrOZHEE-DqlPIE,68
+ roagg/aggregator.py,sha256=iZjchp9ldO6LJqhWlCWTXrcGEVuhXTNEmzRuuKRLRTU,2318
+ roagg/cli.py,sha256=FGaqOL17-fmKfa0UDHrOTIbqrDVIAT7WHlo3_1qvbTk,2677
+ roagg/helpers/ror.py,sha256=6m3u0ke0Guwe6xBkKZdRRR2882pcx5eN9C7EK0Yr7o0,488
+ roagg/helpers/utils.py,sha256=0hsImpjIIKZf4rw8i3_wOWcrgh3MuwJncx2WjvB1j24,1839
+ roagg/models/research_output_item.py,sha256=ioYn6iYsUSNPK84WLA3_2p9Hd-PyIhwH2luINa0A63M,1599
+ roagg/providers/datacite.py,sha256=gHuUY_OXzguHxB-oA9da7cH4QHNlC4tQIaSq0cDWpwE,8112
+ roagg/providers/openaire.py,sha256=aMC_n7sae4y9ihX-fzRljMPJFRWDRxeAUSA9iVfNrAw,7012
+ roagg/providers/openalex.py,sha256=_5CD0LzoJ6kLDzQFFAJz-8ZZ7H77kNTFSRWzYtKsm_A,4938
+ roagg-2025.0.8.dist-info/licenses/LICENSE,sha256=Qhd6QZcm7nnbZ3A9bCnqMYbWec83s20vAA_NWmEGwsE,1086
+ roagg-2025.0.8.dist-info/METADATA,sha256=srldfXcHDD5N50zlhcm6pp8Lnf7agOPL3vs8Kz4_TR4,120
+ roagg-2025.0.8.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ roagg-2025.0.8.dist-info/entry_points.txt,sha256=t-eLnlkWaxvQQ_LQRAVmYXK5A-fJdUs-dQRZEQt1DsQ,76
+ roagg-2025.0.8.dist-info/top_level.txt,sha256=qPVTgmR5hG3qUkccGNGEo3F4VqjIsg7Caa50G6HqQ1k,6
+ roagg-2025.0.8.dist-info/RECORD,,
roagg-2025.0.8.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
roagg-2025.0.8.dist-info/entry_points.txt ADDED
@@ -0,0 +1,5 @@
+ [console_scripts]
+ roagg = roagg.cli:main
+
+ [pipx.run]
+ roagg = roagg.cli:main
roagg-2025.0.8.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Swedish National Data Service
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
roagg-2025.0.8.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ roagg