citations-collector 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,314 @@
+"""Import dandisets from DANDI Archive API."""
+
+from __future__ import annotations
+
+import contextlib
+import logging
+from collections.abc import Callable, Iterator
+from datetime import date
+from typing import Any
+
+import requests
+
+from citations_collector.models import Collection, Item, ItemFlavor, ItemRef, RefType
+
+logger = logging.getLogger(__name__)
+
+
+class DANDIImporter:
+    """
+    Import dandisets from DANDI Archive API.
+
+    Fetches all dandisets (or a subset) from the DANDI Archive API
+    and creates a Collection with version DOIs for citation tracking.
+
+    Example:
+        importer = DANDIImporter()
+        collection = importer.import_all(limit=10)
+    """
+
+    BASE_URL = "https://api.dandiarchive.org/api"
+    PAGE_SIZE = 100  # DANDI API default
+
+    def __init__(self, api_url: str | None = None) -> None:
+        """
+        Initialize DANDI importer.
+
+        Args:
+            api_url: Optional custom API URL (for testing).
+                Defaults to DANDI Archive production API.
+        """
+        self.api_url = api_url or self.BASE_URL
+        self.session = requests.Session()
+        # Identify the client; request timeouts are passed per call below
+        self.session.headers["User-Agent"] = "citations-collector/0.1"
+
+    def import_specific(
+        self,
+        dandiset_ids: list[str],
+        include_draft: bool = False,
+        progress_callback: Callable[[int, int | None], None] | None = None,
+    ) -> Collection:
+        """
+        Import specific dandisets by their identifiers.
+
+        Args:
+            dandiset_ids: List of dandiset identifiers (e.g., ["000003", "000402"])
+            include_draft: If True, include draft versions without DOIs.
+            progress_callback: Optional callback(current, total) for progress updates.
+
+        Returns:
+            Collection with the specified dandisets and their versions.
+        """
+        items: list[Item] = []
+        total = len(dandiset_ids)
+
+        for idx, dandiset_id in enumerate(dandiset_ids):
+            # Fetch dandiset metadata
+            dandiset = self._fetch_dandiset(dandiset_id)
+            if dandiset is None:
+                logger.warning(f"Dandiset {dandiset_id} not found, skipping")
+                continue
+
+            item = self._dandiset_to_item(dandiset, include_draft=include_draft)
+            if item is not None and item.flavors:
+                items.append(item)
+                logger.debug(f"Imported dandiset {item.item_id} with {len(item.flavors)} versions")
+
+            if progress_callback:
+                progress_callback(idx + 1, total)
+
+        logger.info(f"Imported {len(items)} specific dandisets from DANDI Archive")
+
+        return Collection(
+            name="DANDI Archive",
+            description="Neural data archive with versioned dandisets",
+            homepage="https://dandiarchive.org",
+            source_type="dandi",
+            items=items,
+        )
+
+    def import_all(
+        self,
+        include_draft: bool = False,
+        limit: int | None = None,
+        progress_callback: Callable[[int, int | None], None] | None = None,
+    ) -> Collection:
+        """
+        Import all dandisets as a Collection.
+
+        Args:
+            include_draft: If True, include draft versions without DOIs.
+                Default False (only published versions with DOIs).
+            limit: Optional limit on number of dandisets to import.
+            progress_callback: Optional callback(current, total) for progress updates.
+
+        Returns:
+            Collection with:
+            - item_id: "dandi:NNNNNN" (e.g., "dandi:000003")
+            - flavor_id: version string (e.g., "0.230629.1955")
+            - ref: DOI (e.g., "10.48324/dandi.000003/0.230629.1955")
+        """
+        items: list[Item] = []
+        count = 0
+
+        for dandiset in self._iter_dandisets():
+            if limit is not None and count >= limit:
+                break
+
+            item = self._dandiset_to_item(dandiset, include_draft=include_draft)
+            if item is not None and item.flavors:  # Only include if has versions
+                items.append(item)
+                count += 1
+
+                if progress_callback:
+                    progress_callback(count, limit)
+
+                logger.debug(f"Imported dandiset {item.item_id} with {len(item.flavors)} versions")
+
+        logger.info(f"Imported {len(items)} dandisets from DANDI Archive")
+
+        return Collection(
+            name="DANDI Archive",
+            description="Neural data archive with versioned dandisets",
+            homepage="https://dandiarchive.org",
+            source_type="dandi",
+            items=items,
+        )
+
+    def _fetch_dandiset(self, dandiset_id: str) -> dict | None:
+        """
+        Fetch a single dandiset by ID.
+
+        Args:
+            dandiset_id: The dandiset identifier (e.g., "000003", "000402")
+
+        Returns:
+            Dandiset metadata dictionary or None if not found
+        """
+        url = f"{self.api_url}/dandisets/{dandiset_id}/"
+
+        try:
+            response = self.session.get(url, timeout=60)
+            response.raise_for_status()
+            return response.json()  # type: ignore[no-any-return]
+        except requests.RequestException as e:
+            logger.error(f"Failed to fetch dandiset {dandiset_id}: {e}")
+            return None
+
+    def _iter_dandisets(self) -> Iterator[dict]:
+        """
+        Iterate over all dandisets from the API (handles pagination).
+
+        Yields:
+            Dandiset metadata dictionaries from API
+        """
+        url: str | None = f"{self.api_url}/dandisets/"
+        params: dict[str, Any] = {"page_size": self.PAGE_SIZE, "ordering": "identifier"}
+
+        while url:
+            try:
+                response = self.session.get(url, params=params, timeout=60)
+                response.raise_for_status()
+                data = response.json()
+
+                yield from data.get("results", [])
+
+                # Follow pagination
+                url = data.get("next")
+                params = {}  # Next URL includes params
+
+            except requests.RequestException as e:
+                logger.error(f"DANDI API error: {e}")
+                break
+
+    def _dandiset_to_item(self, dandiset: dict, include_draft: bool = False) -> Item | None:
+        """
+        Convert a dandiset API response to an Item.
+
+        Args:
+            dandiset: Dandiset metadata from API
+            include_draft: Whether to include draft versions
+
+        Returns:
+            Item with flavors for each published version, or None if no versions
+        """
+        identifier = dandiset.get("identifier", "")
+        if not identifier:
+            return None
+
+        # Extract most recent version info for name
+        draft_version = dandiset.get("draft_version", {})
+        most_recent = dandiset.get("most_recent_published_version") or draft_version
+
+        name = most_recent.get("name", f"Dandiset {identifier}")
+        item_id = f"dandi:{identifier}"
+        homepage = f"https://dandiarchive.org/dandiset/{identifier}"
+
+        # Get all versions
+        flavors = self._get_versions(identifier, include_draft=include_draft)
+
+        if not flavors:
+            logger.debug(f"Dandiset {identifier} has no published versions, skipping")
+            return None
+
+        return Item(
+            item_id=item_id,
+            name=name,
+            homepage=homepage,
+            flavors=flavors,
+        )
+
+    def _get_versions(self, dandiset_id: str, include_draft: bool = False) -> list[ItemFlavor]:
+        """
+        Get all versions for a dandiset.
+
+        Args:
+            dandiset_id: The dandiset identifier (e.g., "000003")
+            include_draft: Whether to include draft version
+
+        Returns:
+            List of ItemFlavor objects for each version with a DOI
+        """
+        url: str | None = f"{self.api_url}/dandisets/{dandiset_id}/versions/"
+        flavors: list[ItemFlavor] = []
+
+        try:
+            # Paginate through versions
+            params: dict[str, Any] = {"page_size": 100, "ordering": "-created"}
+
+            while url:
+                response = self.session.get(url, params=params, timeout=60)
+                response.raise_for_status()
+                data = response.json()
+
+                for version in data.get("results", []):
+                    flavor = self._version_to_flavor(dandiset_id, version, include_draft)
+                    if flavor:
+                        flavors.append(flavor)
+
+                url = data.get("next")
+                params = {}
+
+        except requests.RequestException as e:
+            logger.warning(f"Failed to fetch versions for dandiset {dandiset_id}: {e}")
+
+        return flavors
+
+    def _version_to_flavor(
+        self, dandiset_id: str, version: dict, include_draft: bool = False
+    ) -> ItemFlavor | None:
+        """
+        Convert a version API response to an ItemFlavor.
+
+        Args:
+            dandiset_id: The dandiset identifier
+            version: Version metadata from API
+            include_draft: Whether to include draft versions
+
+        Returns:
+            ItemFlavor with DOI ref, or None if draft and not included
+        """
+        version_str = version.get("version", "")
+        status = version.get("status", "")
+
+        # DANDI API uses "Valid" for published versions with DOIs
+        # "Published" status is confusingly used for draft versions
+        # Draft versions have version_str == "draft"
+        is_draft = version_str == "draft"
+
+        # Skip draft versions unless requested
+        if is_draft and not include_draft:
+            return None
+
+        # Published versions (status="Valid") have DOIs in the format:
+        # 10.48324/dandi.{dandiset_id}/{version}
+        # Draft versions don't have DOIs
+        if status == "Valid" and not is_draft:
+            doi = f"10.48324/dandi.{dandiset_id}/{version_str}"
+            refs = [ItemRef(ref_type=RefType.doi, ref_value=doi)]
+        elif include_draft and is_draft:
+            # For drafts, we can still track them but without DOI
+            refs = [
+                ItemRef(
+                    ref_type=RefType.url,
+                    ref_value=f"https://dandiarchive.org/dandiset/{dandiset_id}/draft",
+                )
+            ]
+        else:
+            return None
+
+        # Parse release date
+        created = version.get("created")
+        release_date = None
+        if created:
+            with contextlib.suppress(ValueError, TypeError):
+                # API returns ISO format datetime
+                release_date = date.fromisoformat(created[:10])
+
+        return ItemFlavor(
+            flavor_id=version_str,
+            name=version.get("name"),
+            release_date=release_date,
+            refs=refs,
+        )
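
The importer above can be exercised end to end with a few lines of driver code. A minimal sketch, assuming the class is importable from a `citations_collector.importers.dandi` module (the file paths are not shown in this diff) and that the DANDI API is reachable; the progress callback matches the `Callable[[int, int | None], None]` signature used above:

    from citations_collector.importers.dandi import DANDIImporter  # assumed module path

    def on_progress(current: int, total: int | None) -> None:
        # total is None when the overall count is unknown (e.g. import_all without a limit)
        print(f"processed {current}/{total if total is not None else '?'}")

    importer = DANDIImporter()
    collection = importer.import_specific(["000003", "000402"], progress_callback=on_progress)

    # Each flavor corresponds to one published version with its DOI reference
    for item in collection.items:
        for flavor in item.flavors:
            dois = [ref.ref_value for ref in flavor.refs]
            print(item.item_id, flavor.flavor_id, dois)

Only attributes defined in this diff (`Collection.items`, `Item.item_id`, `ItemFlavor.flavor_id`, `ItemRef.ref_value`) are used; anything else would be an assumption.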
@@ -0,0 +1,147 @@
+"""GitHub repository to Zenodo DOI mapping."""
+
+from __future__ import annotations
+
+import logging
+import os
+import re
+
+import requests
+
+from citations_collector.models import ItemRef, RefType
+
+logger = logging.getLogger(__name__)
+
+
+class GitHubMapper:
+    """
+    Map GitHub repositories to Zenodo DOIs.
+
+    Many open source projects use Zenodo to archive releases and get DOIs.
+    This mapper attempts to extract Zenodo DOIs from GitHub repositories
+    via multiple strategies:
+
+    1. Check repository description for Zenodo badge
+    2. Check README for Zenodo DOI badge
+    3. Look for .zenodo.json file
+
+    Example:
+        "datalad/datalad" → 10.5281/zenodo.808846 (concept DOI)
+    """
+
+    GITHUB_API = "https://api.github.com/repos"
+    ZENODO_DOI_PATTERN = re.compile(r"10\.5281/zenodo\.(\d+)")
+
+    def __init__(self, github_token: str | None = None) -> None:
+        """
+        Initialize GitHub mapper.
+
+        Args:
+            github_token: Optional GitHub personal access token for higher rate limits.
+                If not provided, reads from GITHUB_TOKEN environment variable.
+        """
+        # Use provided token, or fall back to environment variable
+        if github_token is None:
+            github_token = os.getenv("GITHUB_TOKEN")
+
+        self.session = requests.Session()
+        if github_token:
+            self.session.headers["Authorization"] = f"token {github_token}"
+            logger.debug("Using GitHub token for authentication")
+
+    def map_to_doi(self, repo: str) -> ItemRef | None:
+        """
+        Map GitHub repository to Zenodo DOI.
+
+        Args:
+            repo: GitHub repository in "owner/name" format (e.g., "datalad/datalad")
+
+        Returns:
+            ItemRef with DOI if found, None otherwise
+        """
+        # Query GitHub API for repository info
+        url = f"{self.GITHUB_API}/{repo}"
+
+        try:
+            response = self.session.get(url, timeout=30)
+            response.raise_for_status()
+            data = response.json()
+        except requests.RequestException as e:
+            logger.warning(f"GitHub API error for {repo}: {e}")
+            return None
+
+        # Strategy 1: Check repository description
+        description = data.get("description") or ""  # description can be null in the API response
+        doi = self._extract_zenodo_doi(description)
+        if doi:
+            logger.info(f"Found Zenodo DOI in description for {repo}: {doi}")
+            return ItemRef(ref_type=RefType("doi"), ref_value=doi)
+
+        # Strategy 2: Check README
+        readme_url = f"{self.GITHUB_API}/{repo}/readme"
+        try:
+            readme_response = self.session.get(readme_url, timeout=30)
+            readme_response.raise_for_status()
+            readme_data = readme_response.json()
+
+            # README content is base64 encoded
+            import base64
+
+            content = base64.b64decode(readme_data.get("content", "")).decode("utf-8")
+            doi = self._extract_zenodo_doi(content)
+            if doi:
+                logger.info(f"Found Zenodo DOI in README for {repo}: {doi}")
+                return ItemRef(ref_type=RefType("doi"), ref_value=doi)
+        except requests.RequestException:
+            # README not found or other error, continue to next strategy
+            pass
+
+        # Strategy 3: Check for .zenodo.json file
+        zenodo_json_url = f"{self.GITHUB_API}/{repo}/contents/.zenodo.json"
+        try:
+            zenodo_response = self.session.get(zenodo_json_url, timeout=30)
+            zenodo_response.raise_for_status()
+            zenodo_data = zenodo_response.json()
+
+            import base64
+            import json
+
+            content = base64.b64decode(zenodo_data.get("content", "")).decode("utf-8")
+            zenodo_config = json.loads(content)
+
+            # .zenodo.json might have related_identifiers with DOI
+            for identifier in zenodo_config.get("related_identifiers", []):
+                if identifier.get("scheme") == "doi":
+                    doi_value = identifier.get("identifier")
+                    if doi_value:
+                        # Clean up DOI (remove https://doi.org/ prefix)
+                        doi_clean = doi_value.replace("https://doi.org/", "")
+                        logger.info(f"Found Zenodo DOI in .zenodo.json for {repo}: {doi_clean}")
+                        return ItemRef(ref_type=RefType("doi"), ref_value=doi_clean)
+        except requests.RequestException:
+            # .zenodo.json not found or other error
+            pass
+
+        logger.info(f"No Zenodo DOI found for GitHub repo {repo}")
+        return None
+
+    def _extract_zenodo_doi(self, text: str) -> str | None:
+        """
+        Extract Zenodo DOI from text.
+
+        Looks for common patterns like:
+        - https://doi.org/10.5281/zenodo.808846
+        - 10.5281/zenodo.808846
+        - [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.808846.svg)]
+
+        Args:
+            text: Text to search
+
+        Returns:
+            DOI string if found, None otherwise
+        """
+        match = self.ZENODO_DOI_PATTERN.search(text)
+        if match:
+            # Return full DOI
+            return f"10.5281/zenodo.{match.group(1)}"
+        return None
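
A minimal sketch of driving the mapper above, assuming it is importable from a `citations_collector.mappers.github` module (the module path is not shown in this diff). Authentication is optional, but without a GITHUB_TOKEN the GitHub API's unauthenticated rate limit (on the order of 60 requests per hour) applies, so batch runs benefit from setting it:

    from citations_collector.mappers.github import GitHubMapper  # assumed module path

    mapper = GitHubMapper()  # picks up GITHUB_TOKEN from the environment if set

    for repo in ["datalad/datalad", "dandi/dandi-cli"]:
        ref = mapper.map_to_doi(repo)
        if ref is not None:
            print(f"{repo}: {ref.ref_value}")  # e.g. 10.5281/zenodo.808846
        else:
            print(f"{repo}: no Zenodo DOI found")

Because the regex only matches the `10.5281/zenodo.<number>` pattern, both badge markdown and bare DOIs in the description or README are picked up by the same search.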
@@ -0,0 +1,110 @@
+"""Zenodo concept expansion to version DOIs."""
+
+from __future__ import annotations
+
+import logging
+import os
+
+import requests
+
+from citations_collector.models import ItemRef, RefType
+
+logger = logging.getLogger(__name__)
+
+
+class ZenodoExpander:
+    """
+    Expand Zenodo concept IDs to all version DOIs.
+
+    Zenodo uses a "concept" DOI that represents all versions of a work,
+    and separate version-specific DOIs. This expander queries the Zenodo
+    API to get all version DOIs for a concept.
+
+    Example:
+        Concept ID 808846 (DataLad) expands to:
+        - 10.5281/zenodo.808846 (concept DOI)
+        - 10.5281/zenodo.808847 (v0.1.0)
+        - 10.5281/zenodo.1234567 (v0.2.0)
+        - ... (all other versions)
+    """
+
+    BASE_URL = "https://zenodo.org/api/records"
+
+    def __init__(self, zenodo_token: str | None = None) -> None:
+        """
+        Initialize Zenodo expander.
+
+        Args:
+            zenodo_token: Optional Zenodo personal access token for authentication.
+                If not provided, reads from ZENODO_TOKEN environment variable.
+        """
+        # Use provided token, or fall back to environment variable
+        if zenodo_token is None:
+            zenodo_token = os.getenv("ZENODO_TOKEN")
+
+        self.session = requests.Session()
+        if zenodo_token:
+            # Zenodo uses Bearer token authentication
+            self.session.headers["Authorization"] = f"Bearer {zenodo_token}"
+            logger.debug("Using Zenodo token for authentication")
+
+    def expand(self, concept_id: str) -> list[ItemRef]:
+        """
+        Expand Zenodo concept ID to all version DOIs.
+
+        Args:
+            concept_id: Zenodo concept ID (numeric, e.g. "808846")
+
+        Returns:
+            List of ItemRef objects with DOI references for each version
+        """
+        # Query Zenodo API for the concept record (latest version)
+        url = f"{self.BASE_URL}/{concept_id}"
+
+        try:
+            response = self.session.get(url, timeout=30)
+            response.raise_for_status()
+            data = response.json()
+        except requests.RequestException as e:
+            logger.warning(f"Zenodo API error for concept {concept_id}: {e}")
+            return []
+
+        # Extract version DOIs
+        refs = []
+
+        # Strategy 1: Get concept DOI (represents all versions)
+        # This is the most comprehensive as it aggregates citations across versions
+        concept_doi = data.get("conceptdoi")
+        # Strip any https://doi.org/ prefix; keep None when absent so the duplicate check below is defined
+        concept_doi_clean = concept_doi.replace("https://doi.org/", "") if concept_doi else None
+        if concept_doi_clean:
+            refs.append(ItemRef(ref_type=RefType("doi"), ref_value=concept_doi_clean))
+
+        # Strategy 2: Get all individual version DOIs
+        # Check if there's a link to all versions
+        links = data.get("links", {})
+        versions_url = links.get("versions")
+
+        if versions_url:
+            # Query the versions endpoint to get all versions
+            try:
+                versions_response = self.session.get(versions_url, timeout=30)
+                versions_response.raise_for_status()
+                versions_data = versions_response.json()
+
+                # Extract DOIs from each version
+                for hit in versions_data.get("hits", {}).get("hits", []):
+                    version_doi = hit.get("doi")
+                    if version_doi:
+                        version_doi_clean = version_doi.replace("https://doi.org/", "")
+                        # Don't duplicate concept DOI
+                        if version_doi_clean != concept_doi_clean:
+                            refs.append(
+                                ItemRef(ref_type=RefType("doi"), ref_value=version_doi_clean)
+                            )
+            except requests.RequestException as e:
+                logger.warning(f"Failed to fetch versions for {concept_id}: {e}")
+                # Fall back to just the concept DOI
+
+        logger.info(f"Expanded Zenodo concept {concept_id} to {len(refs)} DOI references")
+        return refs
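
As a closing sketch, the two Zenodo-related helpers can be chained: the GitHub mapper yields a concept DOI whose numeric suffix is the concept ID the expander accepts. Module paths are assumed here (the file names are not part of this diff):

    from citations_collector.mappers.github import GitHubMapper    # assumed module path
    from citations_collector.mappers.zenodo import ZenodoExpander  # assumed module path

    mapper = GitHubMapper()
    expander = ZenodoExpander()

    ref = mapper.map_to_doi("datalad/datalad")
    if ref is not None and ref.ref_value.startswith("10.5281/zenodo."):
        concept_id = ref.ref_value.rsplit(".", 1)[-1]  # e.g. "808846"
        for version_ref in expander.expand(concept_id):
            print(version_ref.ref_value)

Note that expand() fetches the versions listing with a single request and does not follow pagination, so works with very long version histories may be truncated to the API's default page size; the concept DOI, which aggregates citations across versions, is still always included when present.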