citations_collector-0.2.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- citations_collector/__init__.py +18 -0
- citations_collector/_version.py +34 -0
- citations_collector/cli.py +525 -0
- citations_collector/core.py +503 -0
- citations_collector/discovery/__init__.py +17 -0
- citations_collector/discovery/base.py +26 -0
- citations_collector/discovery/crossref.py +210 -0
- citations_collector/discovery/datacite.py +260 -0
- citations_collector/discovery/openalex.py +252 -0
- citations_collector/discovery/opencitations.py +168 -0
- citations_collector/discovery/utils.py +62 -0
- citations_collector/importers/__init__.py +17 -0
- citations_collector/importers/bibtex.py +178 -0
- citations_collector/importers/dandi.py +314 -0
- citations_collector/importers/github.py +147 -0
- citations_collector/importers/zenodo.py +110 -0
- citations_collector/importers/zotero.py +262 -0
- citations_collector/merge_detection.py +216 -0
- citations_collector/models/__init__.py +44 -0
- citations_collector/models/generated.py +525 -0
- citations_collector/pdf.py +260 -0
- citations_collector/persistence/__init__.py +7 -0
- citations_collector/persistence/tsv_io.py +121 -0
- citations_collector/persistence/yaml_io.py +50 -0
- citations_collector/py.typed +0 -0
- citations_collector/unpaywall.py +60 -0
- citations_collector/zotero_sync.py +591 -0
- citations_collector-0.2.3.dist-info/METADATA +456 -0
- citations_collector-0.2.3.dist-info/RECORD +31 -0
- citations_collector-0.2.3.dist-info/WHEEL +4 -0
- citations_collector-0.2.3.dist-info/entry_points.txt +2 -0
citations_collector/importers/dandi.py
@@ -0,0 +1,314 @@
"""Import dandisets from the DANDI Archive API."""

from __future__ import annotations

import contextlib
import logging
from collections.abc import Callable, Iterator
from datetime import date
from typing import Any

import requests

from citations_collector.models import Collection, Item, ItemFlavor, ItemRef, RefType

logger = logging.getLogger(__name__)


class DANDIImporter:
    """
    Import dandisets from the DANDI Archive API.

    Fetches all dandisets (or a subset) from the DANDI Archive API
    and creates a Collection with version DOIs for citation tracking.

    Example:
        importer = DANDIImporter()
        collection = importer.import_all(limit=10)
    """

    BASE_URL = "https://api.dandiarchive.org/api"
    PAGE_SIZE = 100  # DANDI API default

    def __init__(self, api_url: str | None = None) -> None:
        """
        Initialize DANDI importer.

        Args:
            api_url: Optional custom API URL (for testing).
                Defaults to the DANDI Archive production API.
        """
        self.api_url = api_url or self.BASE_URL
        self.session = requests.Session()
        # Identify the client; request timeouts are passed per call below
        self.session.headers["User-Agent"] = "citations-collector/0.1"

    def import_specific(
        self,
        dandiset_ids: list[str],
        include_draft: bool = False,
        progress_callback: Callable[[int, int | None], None] | None = None,
    ) -> Collection:
        """
        Import specific dandisets by their identifiers.

        Args:
            dandiset_ids: List of dandiset identifiers (e.g., ["000003", "000402"])
            include_draft: If True, include draft versions without DOIs.
            progress_callback: Optional callback(current, total) for progress updates.

        Returns:
            Collection with the specified dandisets and their versions.
        """
        items: list[Item] = []
        total = len(dandiset_ids)

        for idx, dandiset_id in enumerate(dandiset_ids):
            # Fetch dandiset metadata
            dandiset = self._fetch_dandiset(dandiset_id)
            if dandiset is None:
                logger.warning(f"Dandiset {dandiset_id} not found, skipping")
                continue

            item = self._dandiset_to_item(dandiset, include_draft=include_draft)
            if item is not None and item.flavors:
                items.append(item)
                logger.debug(f"Imported dandiset {item.item_id} with {len(item.flavors)} versions")

            if progress_callback:
                progress_callback(idx + 1, total)

        logger.info(f"Imported {len(items)} specific dandisets from DANDI Archive")

        return Collection(
            name="DANDI Archive",
            description="Neural data archive with versioned dandisets",
            homepage="https://dandiarchive.org",
            source_type="dandi",
            items=items,
        )

    def import_all(
        self,
        include_draft: bool = False,
        limit: int | None = None,
        progress_callback: Callable[[int, int | None], None] | None = None,
    ) -> Collection:
        """
        Import all dandisets as a Collection.

        Args:
            include_draft: If True, include draft versions without DOIs.
                Default False (only published versions with DOIs).
            limit: Optional limit on number of dandisets to import.
            progress_callback: Optional callback(current, total) for progress updates.

        Returns:
            Collection with:
            - item_id: "dandi:NNNNNN" (e.g., "dandi:000003")
            - flavor_id: version string (e.g., "0.230629.1955")
            - ref: DOI (e.g., "10.48324/dandi.000003/0.230629.1955")
        """
        items: list[Item] = []
        count = 0

        for dandiset in self._iter_dandisets():
            if limit is not None and count >= limit:
                break

            item = self._dandiset_to_item(dandiset, include_draft=include_draft)
            if item is not None and item.flavors:  # Only include if it has versions
                items.append(item)
                count += 1

                if progress_callback:
                    progress_callback(count, limit)

                logger.debug(f"Imported dandiset {item.item_id} with {len(item.flavors)} versions")

        logger.info(f"Imported {len(items)} dandisets from DANDI Archive")

        return Collection(
            name="DANDI Archive",
            description="Neural data archive with versioned dandisets",
            homepage="https://dandiarchive.org",
            source_type="dandi",
            items=items,
        )

    def _fetch_dandiset(self, dandiset_id: str) -> dict | None:
        """
        Fetch a single dandiset by ID.

        Args:
            dandiset_id: The dandiset identifier (e.g., "000003", "000402")

        Returns:
            Dandiset metadata dictionary, or None if not found
        """
        url = f"{self.api_url}/dandisets/{dandiset_id}/"

        try:
            response = self.session.get(url, timeout=60)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except requests.RequestException as e:
            logger.error(f"Failed to fetch dandiset {dandiset_id}: {e}")
            return None

    def _iter_dandisets(self) -> Iterator[dict]:
        """
        Iterate over all dandisets from the API (handles pagination).

        Yields:
            Dandiset metadata dictionaries from the API
        """
        url: str | None = f"{self.api_url}/dandisets/"
        params: dict[str, Any] = {"page_size": self.PAGE_SIZE, "ordering": "identifier"}

        while url:
            try:
                response = self.session.get(url, params=params, timeout=60)
                response.raise_for_status()
                data = response.json()

                yield from data.get("results", [])

                # Follow pagination
                url = data.get("next")
                params = {}  # The "next" URL already includes the params

            except requests.RequestException as e:
                logger.error(f"DANDI API error: {e}")
                break

    def _dandiset_to_item(self, dandiset: dict, include_draft: bool = False) -> Item | None:
        """
        Convert a dandiset API response to an Item.

        Args:
            dandiset: Dandiset metadata from the API
            include_draft: Whether to include draft versions

        Returns:
            Item with flavors for each published version, or None if no versions
        """
        identifier = dandiset.get("identifier", "")
        if not identifier:
            return None

        # Extract the most recent version's info for the name
        draft_version = dandiset.get("draft_version", {})
        most_recent = dandiset.get("most_recent_published_version") or draft_version

        name = most_recent.get("name", f"Dandiset {identifier}")
        item_id = f"dandi:{identifier}"
        homepage = f"https://dandiarchive.org/dandiset/{identifier}"

        # Get all versions
        flavors = self._get_versions(identifier, include_draft=include_draft)

        if not flavors:
            logger.debug(f"Dandiset {identifier} has no published versions, skipping")
            return None

        return Item(
            item_id=item_id,
            name=name,
            homepage=homepage,
            flavors=flavors,
        )

    def _get_versions(self, dandiset_id: str, include_draft: bool = False) -> list[ItemFlavor]:
        """
        Get all versions for a dandiset.

        Args:
            dandiset_id: The dandiset identifier (e.g., "000003")
            include_draft: Whether to include the draft version

        Returns:
            List of ItemFlavor objects for each version with a DOI
        """
        url: str | None = f"{self.api_url}/dandisets/{dandiset_id}/versions/"
        flavors: list[ItemFlavor] = []

        try:
            # Paginate through versions
            params: dict[str, Any] = {"page_size": 100, "ordering": "-created"}

            while url:
                response = self.session.get(url, params=params, timeout=60)
                response.raise_for_status()
                data = response.json()

                for version in data.get("results", []):
                    flavor = self._version_to_flavor(dandiset_id, version, include_draft)
                    if flavor:
                        flavors.append(flavor)

                url = data.get("next")
                params = {}

        except requests.RequestException as e:
            logger.warning(f"Failed to fetch versions for dandiset {dandiset_id}: {e}")

        return flavors

    def _version_to_flavor(
        self, dandiset_id: str, version: dict, include_draft: bool = False
    ) -> ItemFlavor | None:
        """
        Convert a version API response to an ItemFlavor.

        Args:
            dandiset_id: The dandiset identifier
            version: Version metadata from the API
            include_draft: Whether to include draft versions

        Returns:
            ItemFlavor with DOI ref, or None if draft and not included
        """
        version_str = version.get("version", "")
        status = version.get("status", "")

        # The DANDI API uses "Valid" for published versions with DOIs;
        # "Published" status is confusingly used for draft versions.
        # Draft versions have version_str == "draft".
        is_draft = version_str == "draft"

        # Skip draft versions unless requested
        if is_draft and not include_draft:
            return None

        # Published versions (status="Valid") have DOIs in the format:
        # 10.48324/dandi.{dandiset_id}/{version}
        # Draft versions don't have DOIs
        if status == "Valid" and not is_draft:
            doi = f"10.48324/dandi.{dandiset_id}/{version_str}"
            refs = [ItemRef(ref_type=RefType.doi, ref_value=doi)]
        elif include_draft and is_draft:
            # Drafts can still be tracked, but without a DOI
            refs = [
                ItemRef(
                    ref_type=RefType.url,
                    ref_value=f"https://dandiarchive.org/dandiset/{dandiset_id}/draft",
                )
            ]
        else:
            return None

        # Parse release date
        created = version.get("created")
        release_date = None
        if created:
            with contextlib.suppress(ValueError, TypeError):
                # API returns an ISO format datetime
                release_date = date.fromisoformat(created[:10])

        return ItemFlavor(
            flavor_id=version_str,
            name=version.get("name"),
            release_date=release_date,
            refs=refs,
        )
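For orientation, here is a minimal usage sketch of the importer above, based only on the signatures and model fields visible in this diff; the print_progress helper is illustrative and not part of the package:

from citations_collector.importers.dandi import DANDIImporter

def print_progress(current: int, total: int | None) -> None:
    # Matches the progress_callback(current, total) contract documented above
    print(f"{current}/{total if total is not None else '?'}")

importer = DANDIImporter()
# Import two specific dandisets; published versions only (include_draft defaults to False)
collection = importer.import_specific(["000003", "000402"], progress_callback=print_progress)
for item in collection.items:
    print(item.item_id, [flavor.flavor_id for flavor in item.flavors])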
citations_collector/importers/github.py
@@ -0,0 +1,147 @@
"""GitHub repository to Zenodo DOI mapping."""

from __future__ import annotations

import logging
import os
import re

import requests

from citations_collector.models import ItemRef, RefType

logger = logging.getLogger(__name__)


class GitHubMapper:
    """
    Map GitHub repositories to Zenodo DOIs.

    Many open source projects use Zenodo to archive releases and get DOIs.
    This mapper attempts to extract Zenodo DOIs from GitHub repositories
    via multiple strategies:

    1. Check the repository description for a Zenodo badge
    2. Check the README for a Zenodo DOI badge
    3. Look for a .zenodo.json file

    Example:
        "datalad/datalad" → 10.5281/zenodo.808846 (concept DOI)
    """

    GITHUB_API = "https://api.github.com/repos"
    ZENODO_DOI_PATTERN = re.compile(r"10\.5281/zenodo\.(\d+)")

    def __init__(self, github_token: str | None = None) -> None:
        """
        Initialize GitHub mapper.

        Args:
            github_token: Optional GitHub personal access token for higher rate limits.
                If not provided, reads from the GITHUB_TOKEN environment variable.
        """
        # Use the provided token, or fall back to the environment variable
        if github_token is None:
            github_token = os.getenv("GITHUB_TOKEN")

        self.session = requests.Session()
        if github_token:
            self.session.headers["Authorization"] = f"token {github_token}"
            logger.debug("Using GitHub token for authentication")

    def map_to_doi(self, repo: str) -> ItemRef | None:
        """
        Map a GitHub repository to a Zenodo DOI.

        Args:
            repo: GitHub repository in "owner/name" format (e.g., "datalad/datalad")

        Returns:
            ItemRef with DOI if found, None otherwise
        """
        # Query the GitHub API for repository info
        url = f"{self.GITHUB_API}/{repo}"

        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            logger.warning(f"GitHub API error for {repo}: {e}")
            return None

        # Strategy 1: Check the repository description
        # (the API returns null for repos without a description, so coerce to "")
        description = data.get("description") or ""
        doi = self._extract_zenodo_doi(description)
        if doi:
            logger.info(f"Found Zenodo DOI in description for {repo}: {doi}")
            return ItemRef(ref_type=RefType("doi"), ref_value=doi)

        # Strategy 2: Check the README
        readme_url = f"{self.GITHUB_API}/{repo}/readme"
        try:
            readme_response = self.session.get(readme_url, timeout=30)
            readme_response.raise_for_status()
            readme_data = readme_response.json()

            # README content is base64 encoded
            import base64

            content = base64.b64decode(readme_data.get("content", "")).decode("utf-8")
            doi = self._extract_zenodo_doi(content)
            if doi:
                logger.info(f"Found Zenodo DOI in README for {repo}: {doi}")
                return ItemRef(ref_type=RefType("doi"), ref_value=doi)
        except requests.RequestException:
            # README not found or other error, continue to the next strategy
            pass

        # Strategy 3: Check for a .zenodo.json file
        zenodo_json_url = f"{self.GITHUB_API}/{repo}/contents/.zenodo.json"
        try:
            zenodo_response = self.session.get(zenodo_json_url, timeout=30)
            zenodo_response.raise_for_status()
            zenodo_data = zenodo_response.json()

            import base64
            import json

            content = base64.b64decode(zenodo_data.get("content", "")).decode("utf-8")
            zenodo_config = json.loads(content)

            # .zenodo.json might have related_identifiers with a DOI
            for identifier in zenodo_config.get("related_identifiers", []):
                if identifier.get("scheme") == "doi":
                    doi_value = identifier.get("identifier")
                    if doi_value:
                        # Clean up the DOI (remove the https://doi.org/ prefix)
                        doi_clean = doi_value.replace("https://doi.org/", "")
                        logger.info(f"Found Zenodo DOI in .zenodo.json for {repo}: {doi_clean}")
                        return ItemRef(ref_type=RefType("doi"), ref_value=doi_clean)
        except requests.RequestException:
            # .zenodo.json not found or other error
            pass

        logger.info(f"No Zenodo DOI found for GitHub repo {repo}")
        return None

    def _extract_zenodo_doi(self, text: str) -> str | None:
        """
        Extract a Zenodo DOI from text.

        Looks for common patterns like:
        - https://doi.org/10.5281/zenodo.808846
        - 10.5281/zenodo.808846
        - Markdown DOI badges wrapping either form

        Args:
            text: Text to search

        Returns:
            DOI string if found, None otherwise
        """
        match = self.ZENODO_DOI_PATTERN.search(text)
        if match:
            # Return the full DOI
            return f"10.5281/zenodo.{match.group(1)}"
        return None
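A similarly hedged sketch of how the mapper above could be driven; the repository name comes from the class docstring, and everything else follows the signatures in this diff:

from citations_collector.importers.github import GitHubMapper

mapper = GitHubMapper()  # picks up GITHUB_TOKEN from the environment when present
ref = mapper.map_to_doi("datalad/datalad")
if ref is not None:
    # Expected per the docstring example: a 10.5281/zenodo.* concept DOI
    print(ref.ref_type, ref.ref_value)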
citations_collector/importers/zenodo.py
@@ -0,0 +1,110 @@
"""Zenodo concept expansion to version DOIs."""

from __future__ import annotations

import logging
import os

import requests

from citations_collector.models import ItemRef, RefType

logger = logging.getLogger(__name__)


class ZenodoExpander:
    """
    Expand Zenodo concept IDs to all version DOIs.

    Zenodo uses a "concept" DOI that represents all versions of a work,
    and separate version-specific DOIs. This expander queries the Zenodo
    API to get all version DOIs for a concept.

    Example:
        Concept ID 808846 (DataLad) expands to:
        - 10.5281/zenodo.808846 (concept DOI)
        - 10.5281/zenodo.808847 (v0.1.0)
        - 10.5281/zenodo.1234567 (v0.2.0)
        - ... (all other versions)
    """

    BASE_URL = "https://zenodo.org/api/records"

    def __init__(self, zenodo_token: str | None = None) -> None:
        """
        Initialize Zenodo expander.

        Args:
            zenodo_token: Optional Zenodo personal access token for authentication.
                If not provided, reads from the ZENODO_TOKEN environment variable.
        """
        # Use the provided token, or fall back to the environment variable
        if zenodo_token is None:
            zenodo_token = os.getenv("ZENODO_TOKEN")

        self.session = requests.Session()
        if zenodo_token:
            # Zenodo uses Bearer token authentication
            self.session.headers["Authorization"] = f"Bearer {zenodo_token}"
            logger.debug("Using Zenodo token for authentication")

    def expand(self, concept_id: str) -> list[ItemRef]:
        """
        Expand a Zenodo concept ID to all version DOIs.

        Args:
            concept_id: Zenodo concept ID (numeric, e.g. "808846")

        Returns:
            List of ItemRef objects with DOI references for each version
        """
        # Query the Zenodo API for the concept record (latest version)
        url = f"{self.BASE_URL}/{concept_id}"

        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            logger.warning(f"Zenodo API error for concept {concept_id}: {e}")
            return []

        # Extract version DOIs
        refs = []

        # Strategy 1: Get the concept DOI (represents all versions).
        # This is the most comprehensive, as it aggregates citations across versions.
        concept_doi = data.get("conceptdoi")
        concept_doi_clean = None  # Initialized so the comparison below never hits an unbound name
        if concept_doi:
            # Extract just the DOI part (remove the https://doi.org/ prefix if present)
            concept_doi_clean = concept_doi.replace("https://doi.org/", "")
            refs.append(ItemRef(ref_type=RefType("doi"), ref_value=concept_doi_clean))

        # Strategy 2: Get all individual version DOIs.
        # Check if there's a link to all versions.
        links = data.get("links", {})
        versions_url = links.get("versions")

        if versions_url:
            # Query the versions endpoint to get all versions
            try:
                versions_response = self.session.get(versions_url, timeout=30)
                versions_response.raise_for_status()
                versions_data = versions_response.json()

                # Extract DOIs from each version
                for hit in versions_data.get("hits", {}).get("hits", []):
                    version_doi = hit.get("doi")
                    if version_doi:
                        version_doi_clean = version_doi.replace("https://doi.org/", "")
                        # Don't duplicate the concept DOI
                        if version_doi_clean != concept_doi_clean:
                            refs.append(
                                ItemRef(ref_type=RefType("doi"), ref_value=version_doi_clean)
                            )
            except requests.RequestException as e:
                logger.warning(f"Failed to fetch versions for {concept_id}: {e}")
                # Fall back to just the concept DOI

        logger.info(f"Expanded Zenodo concept {concept_id} to {len(refs)} DOI references")
        return refs
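Finally, a minimal sketch of the expander, again assuming only what this diff shows; the concept ID is the DataLad example from the class docstring. In combination with GitHubMapper above, the numeric part of a mapped concept DOI could be fed into expand() to enumerate per-version DOIs:

from citations_collector.importers.zenodo import ZenodoExpander

expander = ZenodoExpander()  # picks up ZENODO_TOKEN from the environment when present
refs = expander.expand("808846")  # DataLad concept ID from the docstring example
for ref in refs:
    print(ref.ref_value)  # concept DOI first (when present), then one DOI per version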