roagg 2025.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
roagg-2025.0.8/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Swedish National Data Service
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ Metadata-Version: 2.4
2
+ Name: roagg
3
+ Version: 2025.0.8
4
+ Requires-Python: >=3.10
5
+ License-File: LICENSE
6
+ Dynamic: license-file
@@ -0,0 +1,102 @@
1
+ # Research output aggregator
2
+ > [!NOTE]
3
+ > This script is under development
4
+
5
+ The goal of this project is to create a script to get a summarization for a research organization about the research output.
6
+ First target is to query and process information from DataCite.
7
+
8
+ The goal for this script is to create a list over research output where an organization is mentioned as:
9
+ * publisher
10
+ * creator with affiliation to the organization
11
+ * contributor with affiliation to the organization
12
+
13
+ input: ROR-id and list of variants on the organization name.
14
+
15
+ Properties to collect for each research output:
16
+ |Field |Type |Comment |
17
+ |--------------------------------------|-------|---------------------------------------------------------------------------------------|
18
+ |publicationYear |integer|The year of publication, can be empty in some cases |
19
+ |resourceType |string |The resource type (free text string) |
20
+ |title |string |Title of the resource (first one if multiple) |
21
+ |publisher |string |Publisher (free text) |
22
+ |createdAt                             |string |Created date if available                                                               |
23
+ |updatedAt                             |string |Updated date if available                                                                |
24
+ |isPublisher                           |bool   |True if the publisher matches the requested organisation                                |
25
+ |isFunder                              |bool   |True if the funder matches the requested organisation                                   |
26
+ |haveCreatorAffiliation                |bool   |True if any creator matches the requested organisation                                  |
27
+ |haveContributorAffiliation            |bool   |True if any contributor matches the requested organisation                              |
28
+ |isLatestVersion                       |bool   |True if the DataCite metadata indicates this being the latest version                   |
29
+ |isConceptDoi                          |bool   |True if the DataCite metadata indicates this being a concept DOI                        |
30
+ |matchPublisherRor |bool |True if the ROR id for publisher match the ROR in the provided argument |
31
+ |matchCreatorAffiliationRor |bool |True if the ROR id for a creator affiliation match the ROR in the provided argument |
32
+ |matchContributorAffiliationRor |bool |True if the ROR id for a contributor affiliation match the ROR in the provided argument|
33
+ |matchFunderRor |bool |True if the ROR id for funder match the ROR in the provided argument |
34
+ |matchPublisherName |bool |True if any of the names supplied matches the publisher name in the resource |
35
+ |matchCreatorName |bool |True if any of the names supplied matches the creator name in the resource |
36
+ |matchContributorName |bool |True if any of the names supplied matches the contributor name in the resource |
37
+ |matchFunderName |bool |True if any of the names supplied matches the funder name in the resource |
38
+ |inDataCite |bool |True if the DOI was matched in the DataCite |
39
+ |inOpenAire |bool |True if the DOI was matched in OpenAire |
40
+ |inOpenAlex |bool |True if the DOI was matched in OpenAlex |
41
+ |inCrossRef |bool |True if the DOI was matched in CrossRef |
42
+ |dataCiteClientId |string |The client id for the organisation minting the DOI |
43
+ |dataCiteClientName |string |The human readable name of the minting organisation |
44
+ |dataCiteCitationCount |integer|Citation count for the resource provided by the DataCite API |
45
+ |dataCiteReferenceCount |integer|Reference count for the resource provided by the DataCite API |
46
+ |dataCiteViewCount |integer|View count for the resource provided by the DataCite API |
47
+ |dataCiteDownloadCount |integer|Download count for the resource provided by the DataCite API |
48
+ |openAireBestAccessRight               |string |Access Rights for the resource indicated by the OpenAire API                            |
49
+ |openAireIndicatorsUsageCountsDownloads|integer|Download count for the resource indicated by the OpenAire API |
50
+ |openAireIndicatorsUsageCountsViews |integer|View count for the resource provided by the OpenAire API |
51
+ |openAireId |string |Id for the resource in OpenAire |
52
+ |openAlexId |string |Id for the resource in OpenAlex |
53
+ |openAlexCitedByCount |integer|Citation count for the resource provided by the OpenAlex API |
54
+ |openAlexReferencedWorksCount |integer|Reference count for the resource provided by the OpenAlex API |
55
+ |titleWordCount |integer|Number of words in the title (useful for sorting in some cases) |
56
+ |referencedByDoi |string | |
57
+
58
+
59
+ ## Install
60
+ `pip install .`
61
+
62
+ ## Install dev
63
+ `pip install -e .`
64
+
65
+ ## Development stuff to do
66
+ - [x] ROR get name variants from ROR
67
+ - [x] CLI add options to get name list from txt
68
+ - [x] DataCite API build query for matching publisher and affiliation
69
+ - [ ] Crossref API build query for matching publisher and affiliation
70
+ - [ ] Publish as cmd tool on PyPI
71
+
72
+ ## Run
73
+ List arguments:
74
+ `roagg --help`
75
+
76
+ ## Tests
77
+ Some tests are available, to run them:
78
+ `python -m pytest`
79
+
80
+ ### Some example arguments
81
+ Chalmers with ror and name list:
82
+ ```bash
83
+ roagg --ror https://ror.org/040wg7k59 --name-txt tests/name-lists/chalmers.txt --output chalmers.csv
84
+ ```
85
+
86
+ GU with ror, name list and extra name not in the text file:
87
+ ```bash
88
+ roagg --name "Department of Nephrology Gothenburg" --ror https://ror.org/01tm6cn81 --name-txt tests/name-lists/gu.txt --output data/gu.csv
89
+ ```
90
+
91
+ KTH with ror and name list:
92
+ ```bash
93
+ roagg --ror https://ror.org/026vcq606 --name-txt tests/name-lists/kth.txt --output data/kth.csv
94
+ ```
95
+
96
+ KAU with ror:
97
+ ```bash
98
+ roagg --ror https://ror.org/05s754026 --output kau.csv
99
+ ```
100
+
101
+ ## License
102
+ [MIT License](LICENSE)
@@ -0,0 +1,48 @@
1
+ [project]
2
+ name = "roagg"
3
+ version = "2025.0.8"
4
+ requires-python = ">=3.10"
5
+ dependencies = []
6
+
7
+ [project.scripts]
8
+ roagg = "roagg.cli:main"
9
+
10
+ [project.entry-points."pipx.run"]
11
+ roagg = "roagg.cli:main"
12
+
13
+ [build-system]
14
+ requires = ["setuptools>=42", "wheel"]
15
+ build-backend = "setuptools.build_meta"
16
+
17
+ [tool.setuptools]
18
+ package-dir = {"" = "src"}
19
+
20
+ [tool.setuptools.packages.find]
21
+ where = ["src"]
22
+
23
+ [tool.pytest.ini_options]
24
+ # Specify test directory
25
+ testpaths = ["tests"]
26
+
27
+ # Minimum version of pytest
28
+ minversion = "6.0"
29
+
30
+ # Add command line options that are always used
31
+ addopts = [
32
+ "-ra", # Show summary of all test outcomes
33
+ "--strict-markers", # Raise error on unknown markers
34
+ "--strict-config", # Raise error on invalid config
35
+ "--showlocals", # Show local variables in tracebacks
36
+ ]
37
+
38
+ # Configure test discovery patterns
39
+ python_files = ["test_*.py", "*_test.py"]
40
+ python_classes = ["Test*"]
41
+ python_functions = ["test_*"]
42
+
43
+ # Define custom markers (optional, for organizing tests)
44
+ markers = [
45
+ "slow: marks tests as slow (deselect with '-m \"not slow\"')",
46
+ "integration: marks tests as integration tests",
47
+ "unit: marks tests as unit tests",
48
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,3 @@
1
if __name__ == "__main__":
    # Entry point for ``python -m roagg``: delegate to the CLI main().
    from roagg.cli import main

    main()
@@ -0,0 +1,69 @@
1
+ import sys
2
+ from typing import List
3
+ from roagg.helpers.ror import get_names_from_ror
4
+ from roagg.providers.datacite import DataCiteAPI
5
+ from roagg.providers.openaire import OpenAireAPI
6
+ from roagg.providers.openalex import OpenAlexAPI
7
+ import logging
8
+ from roagg.models.research_output_item import ResearchOutputItem
9
+ import json
10
+ import csv
11
+ from dataclasses import fields
12
+
13
def aggregate(name: List[str] | None = None, ror: str = "", output: str = "output.csv") -> None:
    """Collect research output for an organization and write it to a CSV file.

    Args:
        name: organization name variants to match on. Not mutated; a copy is
            taken (the previous signature used a mutable default ``[]`` and
            extended it in place, leaking names between calls).
        ror: ROR id (``https://ror.org/...``); when given, name variants are
            also fetched from the ROR API.
        output: path of the CSV file to write.
    """
    # Work on a private copy so the caller's list is never modified.
    names = list(name) if name else []
    if ror:
        names.extend(get_names_from_ror(ror))

    # remove duplicates
    names = list(set(names))

    datacite = DataCiteAPI(name=names, ror=ror)
    url = datacite.api_request_url()
    # debug print of the query string
    logging.info("DataCite url:")
    logging.info(url)

    records = datacite.all()
    research_output_items = []
    logging.info(f"Checking {len(records)} records...")
    for record in records:
        research_output_items.append(datacite.get_record(record))

    # OpenAire / OpenAlex enrich the shared research_output_items list in place.
    openaire = OpenAireAPI(ror=ror, results=research_output_items)
    openaire_id = openaire.get_openaire_id_from_ror()
    logging.info(f"OpenAire ID from ROR {ror} : {openaire_id}")
    openaire.get_records()

    openalex = OpenAlexAPI(ror=ror, results=research_output_items)
    openalex_id = openalex.get_openalex_id_from_ror()
    logging.info(f"OpenAlex ID from ROR {ror} : {openalex_id}")
    openalex.get_records()

    logging.info(f"Writing: {output}")

    write_csv(research_output_items, output)
    logging.info(f"Writing output to csv: {output} - Done")
+
48
def write_csv(records: List[ResearchOutputItem], output: str) -> None:
    """Write *records* to *output* as CSV, one row per item.

    Column order follows the ResearchOutputItem field order; None becomes an
    empty cell and booleans are rendered as 1/0.
    """
    columns = [f.name for f in fields(ResearchOutputItem)]

    def _cell(value):
        """Render a single value for CSV output."""
        if value is None:
            return ""
        if isinstance(value, bool):
            return 1 if value else 0
        return value

    with open(output, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow(columns)
        for record in records:
            writer.writerow([_cell(getattr(record, column)) for column in columns])
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env python3
2
+ from typing import List, Optional
3
+ import argparse
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+ from roagg.helpers.utils import get_roagg_version
8
+ from roagg.aggregator import aggregate
9
+
10
def validate_ror_id(ror_id: str) -> str:
    """Validate ROR ID format (should start with https://ror.org/)."""
    if ror_id.startswith('https://ror.org/'):
        return ror_id
    raise argparse.ArgumentTypeError("ROR ID must start with 'https://ror.org/'")
15
+
16
def read_names_from_file(filepath: Path) -> List[str]:
    """Read organization names from a file, one per line.

    Blank lines are dropped and surrounding whitespace is stripped; a read
    failure is logged and terminates the program with exit code 1.
    """
    try:
        raw = filepath.read_text()
    except IOError as e:
        logging.error(f"Failed to read names file: {e}")
        sys.exit(1)
    names = []
    for line in raw.splitlines():
        stripped = line.strip()
        if stripped:
            names.append(stripped)
    return names
23
+
24
def main() -> None:
    """create a summary CSV file for all research output for an organization."""
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

    parser = argparse.ArgumentParser(
        description="aggregate research outputs for an organization into a CSV file",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument("--version", action="version", version=get_roagg_version())
    parser.add_argument(
        "--name",
        type=str,
        action='append',
        help="name variant of the organization (can be used multiple times)"
    )
    parser.add_argument(
        "--name-txt",
        type=Path,
        help="path to text file containing organization name variants (one per line)"
    )
    parser.add_argument(
        "--ror",
        type=validate_ror_id,
        help="ROR ID of the organization (must start with https://ror.org/)"
    )
    parser.add_argument(
        "--source",
        default="api",
        choices=["api"],
        help="source for resource aggregation (only api is supported right now)"
    )
    parser.add_argument(
        "--output",
        default="data/output.csv",
        help="name of the output file (default: data/output.csv)"
    )

    args = parser.parse_args()

    # Without at least one of --name / --name-txt / --ror there is nothing
    # to aggregate: show usage and exit non-zero.
    if not (args.name or args.name_txt or args.ror):
        parser.print_help()
        sys.exit(1)

    names: List[str] = args.name if args.name else []
    if args.name_txt:
        names.extend(read_names_from_file(args.name_txt))

    try:
        aggregate(names, args.ror, args.output)
    except Exception as e:
        logging.error(f"Aggregation failed: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
@@ -0,0 +1,14 @@
1
+ import urllib.request
2
+ import json
3
+ from typing import List
4
+
5
def get_ror_info(ror: str):
    """Fetch the ROR v2 organization record (accepts a full ROR URL or bare id)."""
    # The API path wants only the id part, i.e. the final URL segment.
    ror_id = ror.rsplit('/', 1)[-1]
    url = f"https://api.ror.org/v2/organizations/{ror_id}"
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read())
10
+
11
def get_names_from_ror(ror: str) -> List[str]:
    """Return organization name variants (aliases, labels, display name) from ROR."""
    wanted_types = {'alias', 'ror_display', 'label'}
    variants = []
    for entry in get_ror_info(ror)['names']:
        if wanted_types.intersection(entry['types']):
            variants.append(entry['value'])
    return variants
@@ -0,0 +1,65 @@
1
+ import importlib.metadata
2
+ import re
3
+
4
# Anchored DOI syntax check: "10." prefix, 4-9 digit registrant, then suffix.
doi_pattern = re.compile(r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$', re.IGNORECASE)

def is_valid_doi(s: str) -> bool:
    """Return True if *s* is a syntactically valid bare DOI."""
    return doi_pattern.match(s) is not None
8
+
9
def find_doi_in_text(text: str) -> list[str]:
    """Return all DOI-like substrings found in *text* (possibly empty).

    The previous annotation (``str | None``) was wrong: ``re.findall``
    always returns a list of matches.
    """
    return re.findall(r'\b10\.\d{4,9}/[-.;()/:\w]+', text)
11
+
12
+ def remove_resolver_prefix_from_doi(doi: str) -> str:
13
+ if doi is None:
14
+ return None
15
+ prefixes = [
16
+ "https://doi.org/",
17
+ "http://doi.org/",
18
+ "doi.org/",
19
+ "https://dx.doi.org/",
20
+ "http://dx.doi.org/",
21
+ "dx.doi.org/"
22
+ ]
23
+ for prefix in prefixes:
24
+ if doi.lower().startswith(prefix):
25
+ return doi[len(prefix):]
26
+ return doi
27
+
28
def match_patterns(string, patterns):
    """Return True when *string* matches any of *patterns* (case-insensitive).

    Patterns without wildcards match as substrings. '*' (any run of chars)
    and '?' (any single char) are glob-style wildcards matched against the
    whole string.
    """
    if string is None:
        return False

    for pattern in patterns:
        # Plain pattern (no wildcard): substring containment is sufficient.
        # (The old code additionally compared with "is", which tests object
        # identity, not equality — the "in" test already covers exact matches.)
        if '*' not in pattern and '?' not in pattern:
            if pattern.lower() in string.lower():
                return True

        if re.match(pattern_to_regexp(pattern), string, re.IGNORECASE):
            return True
    return False

def pattern_to_regexp(pattern: str) -> str:
    """Translate a glob pattern ('*', '?') into an anchored regexp string."""
    parts = []
    for char in pattern:
        if char == '*':
            parts.append('.*')
        elif char == '?':
            parts.append('.')
        else:
            parts.append(re.escape(char))
    return '^' + ''.join(parts) + '$'
53
+
54
def get_roagg_version() -> str:
    """Get package version from metadata ("unknown" when not installed)."""
    try:
        version = importlib.metadata.version("roagg")
    except importlib.metadata.PackageNotFoundError:
        version = "unknown"
    return version
60
+
61
def string_word_count(string: str) -> int:
    """Count whitespace-separated words; 0 for empty or None input."""
    # str.split() with no argument already ignores leading/trailing and
    # repeated whitespace, so no explicit strip() is needed.
    return len(string.split()) if string else 0
@@ -0,0 +1,51 @@
1
+ from dataclasses import dataclass
2
+
3
+ @dataclass
4
+ class ResearchOutputItem:
5
+ doi: str
6
+ publicationYear: int = None
7
+ resourceType: str = None
8
+ title: str = None
9
+ publisher: str = None
10
+ createdAt: str = ""
11
+ updatedAt: str = ""
12
+ isPublisher: bool = False
13
+ isFunder: bool = None
14
+ haveCreatorAffiliation: bool = False
15
+ haveContributorAffiliation: bool = False
16
+ isLatestVersion: bool = True
17
+ isConceptDoi: bool = False
18
+ #match on ROR
19
+ matchPublisherRor: bool = False
20
+ matchCreatorAffiliationRor: bool = False
21
+ matchContributorAffiliationRor: bool = False
22
+ matchFunderRor: bool = False
23
+ #match on free text in name
24
+ matchPublisherName: bool = False
25
+ matchCreatorName: bool = False
26
+ matchContributorName: bool = False
27
+ matchFunderName: bool = False
28
+ #where was the match found
29
+ inDataCite: bool = None
30
+ inOpenAire: bool = None
31
+ inOpenAlex: bool = None
32
+ inCrossRef: bool = None
33
+ #datacite specific
34
+ dataCiteClientId: str = None
35
+ dataCiteClientName: str = None
36
+ dataCiteCitationCount: int = None
37
+ dataCiteReferenceCount: int = None
38
+ dataCiteViewCount: int = None
39
+ dataCiteDownloadCount: int = None
40
+ #openaire specific
41
+ openAireBestAccessRight: str = None
42
+ openAireIndicatorsUsageCountsDownloads: int = None
43
+ openAireIndicatorsUsageCountsViews: int = None
44
+ openAireId: str = None
45
+ #openalex specific
46
+ openAlexId: str = None
47
+ openAlexCitedByCount: int = None
48
+ openAlexReferencedWorksCount: int = None
49
+ #extra fields
50
+ titleWordCount: int = None
51
+ referencedByDoi: str = None
@@ -0,0 +1,178 @@
1
+ from typing import List
2
+ import urllib.request
3
+ import logging
4
+ import json
5
+ from roagg.helpers.utils import get_roagg_version
6
+ from roagg.models.research_output_item import ResearchOutputItem
7
+ from roagg.helpers.utils import match_patterns, string_word_count
8
+
9
+ class DataCiteAPI:
10
+ def __init__(self, page_size: int = 500, name: List[str] = [], ror: str = ""):
11
+ self.page_size = page_size
12
+ self.name = name
13
+ self.ror = ror
14
+
15
+ def get_query_string(self) -> str:
16
+ if not self.name and not self.ror:
17
+ return ""
18
+
19
+ query_parts = []
20
+
21
+ if self.name:
22
+ # Separate wildcard and exact matches, handle spaces in wildcard queries appropriately
23
+ wildcard = ' OR '.join(n.replace(" ", "\\ ") for n in self.name if '*' in n)
24
+ exact = ' OR '.join(f'"{n}"' for n in self.name if '*' not in n)
25
+ name_fields = [
26
+ "creators.affiliation.name",
27
+ "contributors.affiliation.name",
28
+ "publisher.name"
29
+ ]
30
+
31
+ if wildcard and exact:
32
+ name_conditions = f'{wildcard} OR {exact}'
33
+ else:
34
+ name_conditions = wildcard or exact
35
+
36
+ query_parts.extend([f"{field}:({name_conditions})" for field in name_fields])
37
+
38
+ if self.ror:
39
+ ror_fields = [
40
+ "publisher.publisherIdentifier",
41
+ "creators.affiliation.affiliationIdentifier",
42
+ "contributors.affiliation.affiliationIdentifier",
43
+ "creators.nameIdentifiers.nameIdentifier",
44
+ "contributors.nameIdentifiers.nameIdentifier",
45
+ "fundingReferences.funderIdentifier"
46
+ ]
47
+ query_parts.extend([f'{field}:"{self.ror}"' for field in ror_fields])
48
+ # nameIdentifiers are formated without https://ror.org/ prefix from some sources, so we need to check both
49
+ query_parts.extend([f'{field}:"{self.ror.split("https://ror.org/")[1]}"' for field in ror_fields])
50
+
51
+ return " OR ".join(query_parts)
52
+
53
+ def api_request_url(self, page_size: int = None) -> str:
54
+ if page_size is None:
55
+ page_size = self.page_size
56
+ params = urllib.parse.urlencode({
57
+ 'page[size]': page_size,
58
+ 'page[cursor]': '1',
59
+ 'affiliation': 'true',
60
+ 'publisher': 'true',
61
+ 'detail': 'true',
62
+ 'disable-facets': 'false',
63
+ 'query': self.get_query_string()
64
+ })
65
+ return f"https://api.datacite.org/dois?{params}"
66
+
67
+ @staticmethod
68
+ def get_api_result(url: str) -> dict:
69
+ request = urllib.request.Request(url)
70
+ version = get_roagg_version()
71
+ request.add_header('User-Agent', f'ResearchOutputAggregator/{version} (https://github.com/snd-sweden/research-output-aggregator; mailto:team-it@snd.se)')
72
+ try:
73
+ with urllib.request.urlopen(request) as response:
74
+ return json.loads(response.read())
75
+ except (urllib.error.URLError, json.JSONDecodeError, KeyError) as e:
76
+ raise RuntimeError(f"Failed run DataCite query: {e}")
77
+
78
+ def get_record(self, item: dict) -> ResearchOutputItem:
79
+ attributes = item.get("attributes", {})
80
+ publisher_attr = attributes.get("publisher", {})
81
+ versionCount = 0 if attributes.get("versionCount", {}) is None else int(attributes.get("versionCount", {}))
82
+ versionOfCount = 0 if attributes.get("versionOfCount", {}) is None else int(attributes.get("versionOfCount", {}))
83
+
84
+ record = ResearchOutputItem(
85
+ doi=attributes.get("doi"),
86
+ dataCiteClientId=item["relationships"]["client"]["data"]["id"],
87
+ resourceType=attributes.get("types", None).get("resourceTypeGeneral"),
88
+ publisher=publisher_attr.get("name"),
89
+ publicationYear=attributes.get("publicationYear"),
90
+ title=item["attributes"]["titles"][0]["title"],
91
+ inDataCite=True,
92
+ dataCiteCitationCount=attributes.get("citationCount", None),
93
+ dataCiteReferenceCount=attributes.get("referenceCount", None),
94
+ dataCiteViewCount=attributes.get("viewCount", None),
95
+ dataCiteDownloadCount=attributes.get("downloadCount", None),
96
+ titleWordCount=string_word_count(item["attributes"]["titles"][0]["title"])
97
+ )
98
+
99
+ if record.resourceType is None or record.resourceType == "":
100
+ record.resourceType = attributes.get("types", {}).get("citeproc")
101
+ if record.resourceType is None or record.resourceType == "":
102
+ record.resourceType = attributes.get("types", {}).get("bibtex")
103
+
104
+
105
+ record.isPublisher = (
106
+ publisher_attr.get("publisherIdentifier") == self.ror or
107
+ match_patterns(publisher_attr.get("name"), self.name)
108
+ )
109
+
110
+ related = [
111
+ r for r in item["attributes"].get("relatedIdentifiers", [])
112
+ if (r.get("relationType") == "IsReferencedBy" or r.get("relationType") == "IsSupplementTo" or r.get("relationType") == "IsSourceOf") and r.get("relatedIdentifierType") == "DOI"
113
+ ]
114
+ if related and len(related) > 0:
115
+ record.referencedByDoi = related[0].get("relatedIdentifier")
116
+ else:
117
+ record.referencedByDoi = None
118
+
119
+ record.createdAt = str(attributes.get("created", "") or "")
120
+
121
+ record.updatedAt = max([
122
+ str(attributes.get("updated", "") or ""),
123
+ str(attributes.get("created", "") or ""),
124
+ str(attributes.get("registered", "") or "")
125
+ ])
126
+
127
+ for relation in attributes.get("relatedIdentifiers", []):
128
+ if relation.get("relationType") == "IsPreviousVersionOf":
129
+ record.isLatestVersion = False
130
+ if relation.get("relationType") == "HasVersion":
131
+ record.isLatestVersion = False
132
+
133
+ record.isConceptDoi = (
134
+ versionCount > 0 and
135
+ versionOfCount == 0
136
+ )
137
+
138
+ record.haveCreatorAffiliation = self.check_agent_list_match(attributes.get("creators", []))
139
+ record.haveContributorAffiliation = self.check_agent_list_match(attributes.get("contributors", []))
140
+ return record
141
+
142
+ def check_agent_list_match(self, items: list) -> bool:
143
+ partial_ror = self.ror.split("https://ror.org/")[1] if self.ror else ""
144
+ for agent in items:
145
+ # Check if any nameIdentifier matches the ror
146
+ if any(identifier.get("nameIdentifier") == self.ror for identifier in agent.get("nameIdentifiers", [])):
147
+ return True
148
+ # Check if any nameIdentifier matches the partial ror
149
+ if any(identifier.get("nameIdentifier") == partial_ror for identifier in agent.get("nameIdentifiers", [])):
150
+ return True
151
+ # Check if the agent name matches any pattern
152
+ if match_patterns(agent.get("name"), self.name):
153
+ return True
154
+ # Check each affiliation
155
+ for affiliation in agent.get("affiliation", []):
156
+ if (affiliation.get("affiliationIdentifier") == self.ror or
157
+ match_patterns(affiliation.get("name"), self.name)):
158
+ return True
159
+ return False
160
+
161
+ def all(self) -> list:
162
+ result = []
163
+ url = self.api_request_url()
164
+ while True:
165
+ response = self.get_api_result(url)
166
+ result.extend(response["data"])
167
+ logging.info(f"Retrieved DataCite {len(result)} of {response['meta']['total']}")
168
+ if response['links'].get('next'):
169
+ url = response['links']['next']
170
+ else:
171
+ break
172
+ return result
173
+
174
+ def count(self) -> int:
175
+ if not self.get_query_string():
176
+ return 0
177
+ url = self.api_request_url(page_size=0)
178
+ return self.get_api_result(url)["meta"]["total"]
@@ -0,0 +1,155 @@
1
+ from typing import List
2
+ import urllib.request
3
+ import logging
4
+ import json
5
+ from roagg.models.research_output_item import ResearchOutputItem
6
+ from roagg.helpers.utils import find_doi_in_text, is_valid_doi, string_word_count
7
+
8
class OpenAireAPI:
    """Client for the OpenAire Graph API.

    Enriches an externally shared list of ResearchOutputItem (matched by
    DOI) and appends items found only in OpenAire.
    """

    openaire_base_url = "https://api.openaire.eu/graph/v1/"

    def __init__(self, page_size: int = 100, ror: str = "", results: "List[ResearchOutputItem] | None" = None):
        self.page_size = page_size
        self.ror = ror
        # Sharing the caller's list is intentional (records are enriched in
        # place); only the *default* must not be a shared mutable [].
        self.results = [] if results is None else results

    def get_openaire_id_from_ror(self) -> str:
        """Resolve the OpenAire organization id for the configured ROR id.

        Returns "" when the organization is not found.
        """
        url = f"{self.openaire_base_url}organizations?pid={self.ror}"
        with urllib.request.urlopen(url) as response:
            json_response = json.loads(response.read())

        if 'results' in json_response and len(json_response['results']) > 0:
            return json_response['results'][0]['id']
        return ""

    def get_records(self) -> "List[ResearchOutputItem]":
        """Fetch all OpenAire datasets for the organization and merge them
        into self.results (by DOI), appending previously unseen items.

        Returns the raw OpenAire result dicts.
        """
        # Explicit import: only urllib.request is imported at module level.
        import urllib.parse

        if not self.ror:
            return []
        openaire_results = []
        openaire_id = self.get_openaire_id_from_ror()

        if not openaire_id:
            logging.info(f"No OpenAire ID found for ROR {self.ror}")
            return []

        params = {
            'pageSize': self.page_size,
            'cursor': '*',
            'type': 'dataset',  # limit to only datasets for now
            'relOrganizationId': openaire_id
        }
        while True:
            query_string = urllib.parse.urlencode(params)
            url = f"{self.openaire_base_url}researchProducts?{query_string}"
            with urllib.request.urlopen(url) as response:
                json_response = json.loads(response.read())
            if 'results' in json_response:
                openaire_results.extend(json_response['results'])

            logging.info(f"Retrieved OpenAire {len(openaire_results)} of {json_response['header']['numFound']}")

            next_cursor = json_response['header'].get('nextCursor')
            if next_cursor:
                params['cursor'] = next_cursor
            else:
                break

        # DOI -> item dictionary for O(1) lookups against existing records.
        doi_to_item = {item.doi.lower(): item for item in self.results if item.doi}

        for r in openaire_results:
            access_right = None
            if r.get('bestAccessRight') and 'label' in r['bestAccessRight']:
                access_right = r['bestAccessRight']['label']

            usage_counts = (r.get('indicators') or {}).get('usageCounts')
            downloads = None
            views = None
            if usage_counts and 'downloads' in usage_counts:
                downloads = usage_counts['downloads']
            if usage_counts and 'views' in usage_counts:
                views = usage_counts['views']

            dois = self.get_doi_list_from_resource(r)
            record_match = False
            for doi in dois:
                item = doi_to_item.get(doi.lower())
                if item:
                    record_match = True
                    item.openAireBestAccessRight = access_right
                    item.openAireIndicatorsUsageCountsDownloads = downloads
                    item.openAireIndicatorsUsageCountsViews = views
                    item.inOpenAire = True
            if not record_match and len(dois) > 0:
                publication_date = r.get('publicationDate', None)
                # NOTE(review): kept as a 4-char *string* slice, matching the
                # original behavior; it is only written to CSV.
                publication_year = None
                if publication_date:
                    publication_year = publication_date[:4] if len(publication_date) >= 4 else None
                item = ResearchOutputItem(
                    doi=dois[0],
                    isPublisher=None,
                    resourceType=r.get('type', None),
                    title=r.get('mainTitle', None),
                    publisher=r.get('publisher', None),
                    publicationYear=publication_year,
                    haveContributorAffiliation=None,
                    haveCreatorAffiliation=None,
                    isLatestVersion=None,
                    isConceptDoi=None,
                    inOpenAire=True,
                    openAireBestAccessRight=access_right,
                    openAireIndicatorsUsageCountsDownloads=downloads,
                    openAireIndicatorsUsageCountsViews=views,
                    openAireId=r.get('id', None),
                    titleWordCount=string_word_count(r.get('mainTitle', None))
                )
                self.results.append(item)
                doi_to_item[item.doi.lower()] = item  # keep lookup in sync

        return openaire_results

    def get_doi_list_from_resource(self, resource: dict) -> List[str]:
        """Collect unique DOIs from a resource's instances: pids, alternate
        identifiers, or (as a fallback) DOIs embedded in instance URLs."""
        doi_list = []

        for instance in resource['instances']:
            logging.debug(f"Instance: {instance}")

            for pid in instance.get('pids') or []:
                if pid['scheme'].lower() == 'doi':
                    doi_list.append(pid['value'])

            for alternate in instance.get('alternateIdentifiers') or []:
                if alternate['scheme'].lower() == 'doi':
                    doi_list.append(alternate['value'])

            if len(doi_list) == 0:
                # Normalize known repository URLs to standard DOI resolver form
                url_replacements = [
                    ("https://doi.pangaea.de/", "https://doi.org/"),
                    ("https://zenodo.org/doi/", "https://doi.org/"),
                    ("https://zenodo.org/records/", "https://doi.org/10.5281/zenodo.")
                ]

                for url in instance['urls']:
                    normalized_url = url
                    for old_pattern, new_pattern in url_replacements:
                        normalized_url = normalized_url.replace(old_pattern, new_pattern)

                    for doi in find_doi_in_text(normalized_url):
                        if is_valid_doi(doi):
                            doi_list.append(doi)

        # if doi_list is empty print json for instances
        if len(doi_list) == 0:
            logging.warning(f"No DOI found in resource: {json.dumps(resource['instances'], indent=2)}")

        return list(set(doi_list))
153
+
154
+
155
+
@@ -0,0 +1,119 @@
1
import json
import logging
import urllib.parse
import urllib.request
from typing import List

from roagg.helpers.utils import string_word_count, remove_resolver_prefix_from_doi
from roagg.models.research_output_item import ResearchOutputItem
7
+
8
class OpenAlexAPI:
    """Fetch an institution's dataset works from the OpenAlex API and merge
    them into a shared list of ``ResearchOutputItem`` records, keyed by DOI.
    """

    openalex_base_url = "https://api.openalex.org/"

    def __init__(self, page_size: int = 200, ror: str = "",
                 results: "List[ResearchOutputItem] | None" = None):
        """
        :param page_size: works requested per page (200 is the OpenAlex maximum).
        :param ror: ROR identifier of the institution (bare id or full URL).
        :param results: existing item list to merge into; a fresh list is
            created when omitted.
        """
        self.page_size = page_size
        self.ror = ror
        # BUGFIX: the previous default ``results=[]`` was a mutable default
        # argument shared by every instance; create a fresh list instead.
        self.results = results if results is not None else []

    @staticmethod
    def _normalize_ror(value: str) -> str:
        """Reduce a ROR identifier to its bare id.

        OpenAlex institution records carry full URLs such as
        ``https://ror.org/02nr0ka47`` while ``self.ror`` may be a bare id;
        normalizing both sides lets either form match.
        """
        return value.rsplit("/", 1)[-1] if value else ""

    def get_openalex_id_from_ror(self) -> str:
        """Resolve the configured ROR id to an OpenAlex institution id.

        :return: the OpenAlex institution id URL, or "" when not found.
        """
        url = f"{self.openalex_base_url}institutions/ror:{self.ror}"
        with urllib.request.urlopen(url) as response:
            json_response = json.loads(response.read())
        return json_response.get('id', "")

    def get_records(self) -> "List[ResearchOutputItem]":
        """Retrieve all dataset works affiliated with the institution.

        Items already present in ``self.results`` (matched case-insensitively
        by DOI) are enriched with OpenAlex metrics; unmatched works are
        appended as new ``ResearchOutputItem`` entries.

        :return: the raw OpenAlex work dicts that were retrieved (empty list
            when no ROR is configured or it has no OpenAlex id).
        """
        if not self.ror:
            return []

        openalex_id = self.get_openalex_id_from_ror()
        if not openalex_id:
            logging.info(f"No OpenAlex ID found for ROR {self.ror}")
            return []

        openalex_results = self._fetch_all_works(openalex_id)
        self._merge_works(openalex_results)
        return openalex_results

    def _fetch_all_works(self, openalex_id: str) -> List[dict]:
        """Page through the works endpoint using cursor pagination."""
        params = {
            'per-page': self.page_size,
            'cursor': '*',
            'filter': f'institutions.id:{openalex_id},type:dataset'  # limit to only datasets for now
        }
        works: List[dict] = []
        while True:
            query_string = urllib.parse.urlencode(params)
            url = f"{self.openalex_base_url}works?{query_string}"
            with urllib.request.urlopen(url) as response:
                json_response = json.loads(response.read())
            if 'results' in json_response:
                works.extend(json_response['results'])
                logging.info(f"Retrieved OpenAlex {len(works)} of {json_response['meta']['count']}")
            next_cursor = json_response.get('meta', {}).get('next_cursor')
            if not next_cursor:
                break
            params['cursor'] = next_cursor
        return works

    def _merge_works(self, openalex_results: List[dict]) -> None:
        """Merge fetched works into ``self.results``, keyed by lowercased DOI."""
        target_ror = self._normalize_ror(self.ror)

        # Dictionary for O(1) DOI lookups against already-known items.
        doi_to_item = {item.doi.lower(): item for item in self.results if item.doi}

        for work in openalex_results:
            cited_by_count = work.get('cited_by_count')
            referenced_works_count = work.get('referenced_works_count')

            # BUGFIX: OpenAlex works list affiliations under 'authorships'
            # (each with an 'institutions' list); the previous code read a
            # non-existent top-level 'institutions' key, so the flag was
            # always False.
            have_creator_affiliation = False
            for authorship in work.get('authorships', []):
                for institution in authorship.get('institutions', []):
                    if self._normalize_ror(institution.get('ror') or "") == target_ror:
                        have_creator_affiliation = True
                        break
                if have_creator_affiliation:
                    break  # previously only the inner loop was exited

            doi = remove_resolver_prefix_from_doi(work.get('doi', None))
            if doi is None:
                continue

            item = doi_to_item.get(doi.lower())
            if item:
                # Enrich the existing record with OpenAlex metrics.
                item.openAlexCitedByCount = cited_by_count
                item.openAlexReferencedWorksCount = referenced_works_count
                item.inOpenAlex = True
                item.openAlexId = work.get('id', None)
                item.haveCreatorAffiliation = have_creator_affiliation
                continue

            publication_date = work.get('publication_date', None)
            publication_year = work.get('publication_year', None)
            if publication_date:
                # Prefer the year encoded in the full publication date.
                publication_year = publication_date[:4] if len(publication_date) >= 4 else None

            item = ResearchOutputItem(
                doi=doi,
                isPublisher=None,
                resourceType=work.get('type', None),
                title=work.get('title', None),
                publisher=None,
                publicationYear=publication_year,
                createdAt=work.get('created_date', None),
                updatedAt=work.get('updated_date', None),
                haveContributorAffiliation=None,
                haveCreatorAffiliation=have_creator_affiliation,
                isLatestVersion=None,
                isConceptDoi=None,
                inOpenAlex=True,
                openAlexCitedByCount=cited_by_count,
                openAlexReferencedWorksCount=referenced_works_count,
                openAlexId=work.get('id', None),
                titleWordCount=string_word_count(work.get('title', None))
            )
            self.results.append(item)
            doi_to_item[item.doi.lower()] = item  # add to lookup dictionary
@@ -0,0 +1,6 @@
1
+ Metadata-Version: 2.4
2
+ Name: roagg
3
+ Version: 2025.0.8
4
+ Requires-Python: >=3.10
5
+ License-File: LICENSE
6
+ Dynamic: license-file
@@ -0,0 +1,19 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/roagg/__init__.py
5
+ src/roagg/__main__.py
6
+ src/roagg/aggregator.py
7
+ src/roagg/cli.py
8
+ src/roagg.egg-info/PKG-INFO
9
+ src/roagg.egg-info/SOURCES.txt
10
+ src/roagg.egg-info/dependency_links.txt
11
+ src/roagg.egg-info/entry_points.txt
12
+ src/roagg.egg-info/top_level.txt
13
+ src/roagg/helpers/ror.py
14
+ src/roagg/helpers/utils.py
15
+ src/roagg/models/research_output_item.py
16
+ src/roagg/providers/datacite.py
17
+ src/roagg/providers/openaire.py
18
+ src/roagg/providers/openalex.py
19
+ tests/test_utils.py
@@ -0,0 +1,5 @@
1
+ [console_scripts]
2
+ roagg = roagg.cli:main
3
+
4
+ [pipx.run]
5
+ roagg = roagg.cli:main
@@ -0,0 +1 @@
1
+ roagg
@@ -0,0 +1,255 @@
1
+ import pytest
2
+ from roagg.helpers.utils import is_valid_doi, find_doi_in_text, string_word_count
3
+
4
+
5
class TestIsValidDoi:
    """Unit tests covering the is_valid_doi predicate."""

    def test_valid_doi_basic(self):
        """A plain prefix/suffix DOI is accepted."""
        assert is_valid_doi("10.1234/example")

    def test_valid_doi_with_longer_prefix(self):
        """A registrant code with many digits is accepted."""
        assert is_valid_doi("10.123456789/test")

    def test_valid_doi_with_special_characters(self):
        """Allowed punctuation in the suffix does not invalidate a DOI."""
        candidates = (
            "10.1000/test-example_123",
            "10.1000/test.example",
            "10.1000/test(123)",
            "10.1000/test;123",
            "10.1000/test:123",
            "10.1000/test/subpath",
        )
        for candidate in candidates:
            assert is_valid_doi(candidate)

    def test_valid_doi_with_uppercase(self):
        """Upper-case letters are legal in the suffix."""
        for candidate in ("10.1234/ABC-DEF", "10.1234/Test-Example"):
            assert is_valid_doi(candidate)

    def test_valid_doi_with_numbers(self):
        """A purely numeric suffix is legal."""
        assert is_valid_doi("10.1234/123456789")

    def test_valid_doi_complex(self):
        """Real-world DOIs validate."""
        for candidate in ("10.1000/182", "10.1038/nphys1170", "10.1016/j.cell.2009.01.002"):
            assert is_valid_doi(candidate)

    def test_invalid_doi_missing_prefix(self):
        """Strings not starting with '10.' are rejected."""
        for candidate in ("11.1234/example", "1.1234/example", "1234/example"):
            assert not is_valid_doi(candidate)

    def test_invalid_doi_short_prefix_number(self):
        """Registrant codes shorter than four digits are rejected."""
        # 3, 2 and 1 digit prefixes respectively
        for candidate in ("10.123/example", "10.12/example", "10.1/example"):
            assert not is_valid_doi(candidate)

    def test_invalid_doi_missing_slash(self):
        """A DOI lacking the '/' separator is rejected."""
        for candidate in ("10.1234-example", "10.1234.example"):
            assert not is_valid_doi(candidate)

    def test_invalid_doi_missing_suffix(self):
        """A DOI without a suffix is rejected."""
        for candidate in ("10.1234/", "10.1234"):
            assert not is_valid_doi(candidate)

    def test_invalid_doi_empty_string(self):
        """The empty string is rejected."""
        assert not is_valid_doi("")

    def test_invalid_doi_whitespace(self):
        """Embedded or surrounding whitespace invalidates a DOI."""
        candidates = (
            "10.1234/ example",
            "10.1234 /example",
            " 10.1234/example",
            "10.1234/example ",
        )
        for candidate in candidates:
            assert not is_valid_doi(candidate)

    def test_invalid_doi_only_prefix(self):
        """Just the '10.' prefix is rejected."""
        assert not is_valid_doi("10.")

    def test_invalid_doi_non_numeric_prefix(self):
        """Letters in the registrant code are rejected."""
        assert not is_valid_doi("10.abcd/example")

    def test_edge_case_minimum_valid_length(self):
        """A four-digit registrant code is the shortest accepted."""
        assert is_valid_doi("10.1000/a")

    def test_edge_case_maximum_valid_length(self):
        """A nine-digit registrant code is the longest accepted."""
        assert is_valid_doi("10.123456789/a")

    def test_edge_case_too_long_prefix(self):
        """A ten-digit registrant code is rejected."""
        assert not is_valid_doi("10.1234567890/example")
92
+
93
+
94
class TestFindDoiInText:
    """Unit tests covering find_doi_in_text."""

    def test_pangea_doi_url(self):
        """A PANGAEA DOI URL yields its DOI."""
        assert find_doi_in_text("https://doi.pangaea.de/10.1234/example") == ["10.1234/example"]

    def test_zenodo_doi_url(self):
        """A Zenodo DOI URL yields its DOI."""
        assert find_doi_in_text("https://zenodo.org/doi/10.1234/example") == ["10.1234/example"]

    def test_single_doi_in_text(self):
        """A DOI embedded mid-sentence is extracted."""
        found = find_doi_in_text("This is a reference to 10.1234/example in the middle of text.")
        assert found == ["10.1234/example"]

    def test_multiple_dois_in_text(self):
        """Every DOI in the text is returned, in order of appearance."""
        found = find_doi_in_text("See 10.1234/first and also 10.5678/second for more info.")
        assert found == ["10.1234/first", "10.5678/second"]

    def test_doi_at_start_of_text(self):
        """A DOI at the very beginning of the text is extracted."""
        assert find_doi_in_text("10.1234/example is the DOI for this article.") == ["10.1234/example"]

    def test_doi_at_end_of_text(self):
        """A DOI at the very end of the text is extracted."""
        assert find_doi_in_text("The DOI for this article is 10.1234/example") == ["10.1234/example"]

    def test_doi_with_special_characters(self):
        """DOIs with dotted, journal-style suffixes are extracted."""
        found = find_doi_in_text("DOI: 10.1038/nphys1170 and 10.1016/j.cell.2009.01.002")
        assert len(found) == 2
        assert "10.1038/nphys1170" in found
        assert "10.1016/j.cell.2009.01.002" in found

    def test_doi_with_parentheses(self):
        """Parentheses are kept as part of the DOI suffix."""
        assert "10.1234/test(2024)" in find_doi_in_text("See DOI 10.1234/test(2024) for details.")

    def test_no_doi_in_text(self):
        """Text without a DOI yields an empty list."""
        assert find_doi_in_text("This text has no DOI at all.") == []

    def test_empty_text(self):
        """The empty string yields an empty list."""
        assert find_doi_in_text("") == []

    def test_doi_with_url(self):
        """A DOI inside a doi.org URL is extracted."""
        found = find_doi_in_text("Visit https://doi.org/10.1234/example for the article.")
        assert "10.1234/example" in found

    def test_doi_minimum_prefix_length(self):
        """A four-digit registrant code is matched."""
        assert find_doi_in_text("The DOI is 10.1000/test") == ["10.1000/test"]

    def test_doi_maximum_prefix_length(self):
        """A nine-digit registrant code is matched."""
        assert find_doi_in_text("The DOI is 10.123456789/test") == ["10.123456789/test"]

    def test_invalid_doi_too_short_prefix(self):
        """Registrant codes shorter than four digits are not matched."""
        assert find_doi_in_text("This is not a valid DOI: 10.123/test") == []

    def test_doi_with_underscores(self):
        """Underscores are kept as part of the DOI suffix."""
        assert "10.1234/test_example_123" in find_doi_in_text("DOI: 10.1234/test_example_123")

    def test_doi_in_multiline_text(self):
        """A DOI on its own line of multiline text is found."""
        text = "This is a paper.\nThe DOI is 10.1234/example\nIt was published in 2024."
        assert find_doi_in_text(text) == ["10.1234/example"]

    def test_doi_with_mixed_case(self):
        """Mixed-case suffixes are matched."""
        assert "10.1234/AbCdEf123" in find_doi_in_text("DOI: 10.1234/AbCdEf123")

    def test_real_world_dois(self):
        """Two real published DOIs are both found."""
        found = find_doi_in_text("See 10.1038/nature12373 and 10.1126/science.1259855 for more information.")
        assert len(found) == 2
        assert "10.1038/nature12373" in found
        assert "10.1126/science.1259855" in found
209
+
210
class TestStringWordCount:
    """Unit tests covering string_word_count (whitespace-based splitting)."""

    def test_single_word(self):
        """A lone token counts as one word."""
        for text in ("file.csv", "another_file[2].jpg"):
            assert string_word_count(text) == 1

    def test_multiple_words(self):
        """Words separated by single spaces are all counted."""
        assert string_word_count("Hello World from Roagg") == 4

    def test_leading_trailing_spaces(self):
        """Surrounding whitespace adds no words: 4 words, not 5."""
        assert string_word_count(" Leading and trailing spaces ") == 4

    def test_multiple_spaces_between_words(self):
        """Runs of spaces between words count once: 4 words, not 5."""
        assert string_word_count("Multiple spaces between words") == 4

    def test_empty_string(self):
        """The empty string contains zero words."""
        assert string_word_count("") == 0

    def test_string_with_only_spaces(self):
        """Whitespace-only input contains zero words."""
        assert string_word_count(" ") == 0

    def test_string_with_newlines_and_tabs(self):
        """Newlines and tabs act as word separators."""
        assert string_word_count("Hello\nWorld\tfrom Roagg") == 4

    def test_string_with_punctuation(self):
        """Punctuation stays attached to its word: 5 words."""
        assert string_word_count("Hello, world! This is Roagg.") == 5

    def test_string_with_special_characters(self):
        """Dots and brackets do not split words: 3 words."""
        assert string_word_count("datafile.csv is ready! ") == 3

    def test_string_with_numeric_characters(self):
        """'2.0' counts as a single word: 5 words in total."""
        assert string_word_count("Version 2.0 of the software ") == 5