citations_collector-0.2.3-py3-none-any.whl

This diff shows the content of publicly released package versions from one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
@@ -0,0 +1,210 @@
+ """CrossRef Event Data citation discovery."""
+
+ from __future__ import annotations
+
+ import logging
+ import re
+ from datetime import datetime
+ from typing import Any, cast
+
+ import requests
+ from requests.adapters import HTTPAdapter
+ from urllib3.util.retry import Retry
+
+ from citations_collector.discovery.base import AbstractDiscoverer
+ from citations_collector.models import CitationRecord, CitationSource, ItemRef
+
+ logger = logging.getLogger(__name__)
+
+
+ def _sanitize_text(text: str | None) -> str | None:
+     """Sanitize text for TSV output - normalize whitespace, remove control chars."""
+     if text is None:
+         return None
+     # Replace newlines, tabs, carriage returns with spaces
+     text = re.sub(r"[\n\r\t]+", " ", text)
+     # Collapse multiple spaces
+     text = re.sub(r" +", " ", text)
+     # Strip leading/trailing whitespace
+     return text.strip() or None
+
+
+ class CrossRefDiscoverer(AbstractDiscoverer):
+     """Discover citations via CrossRef Event Data API."""
+
+     BASE_URL = "https://api.eventdata.crossref.org/v1/events"
+     DOI_API = "https://doi.org"
+
+     def __init__(self, email: str | None = None) -> None:
+         """
+         Initialize CrossRef Event Data discoverer.
+
+         Args:
+             email: Email for polite pool (better rate limits)
+         """
+         self.email = email
+         self.session = requests.Session()
+         if email:
+             self.session.headers["User-Agent"] = f"citations-collector (mailto:{email})"
+
+         # Add retry logic for timeouts and server errors
+         retry_strategy = Retry(
+             total=3,
+             backoff_factor=2,  # 2s, 4s, 8s
+             status_forcelist=[429, 500, 502, 503, 504],
+             allowed_methods=["GET", "HEAD"],
+         )
+         adapter = HTTPAdapter(max_retries=retry_strategy)
+         self.session.mount("http://", adapter)
+         self.session.mount("https://", adapter)
+
+     def discover(self, item_ref: ItemRef, since: datetime | None = None) -> list[CitationRecord]:
+         """
+         Discover citations from CrossRef Event Data.
+
+         Args:
+             item_ref: DOI reference to query
+             since: Optional date for incremental updates (from-updated-date filter)
+
+         Returns:
+             List of citation records
+         """
+         if item_ref.ref_type != "doi":
+             logger.warning(f"CrossRef only supports DOI refs, got {item_ref.ref_type}")
+             return []
+
+         doi = item_ref.ref_value
+         logger.debug(f"CrossRef querying for DOI: {doi}")
+
+         # Query CrossRef Event Data for citations
+         # obj-id is the DOI being cited, subj-id is the citing work
+         params: dict[str, Any] = {"obj-id": doi, "rows": 1000}
+
+         # Add date filter if provided
+         if since:
+             date_str = since.strftime("%Y-%m-%d")
+             params["from-updated-date"] = date_str
+
+         try:
+             response = self.session.get(self.BASE_URL, params=params, timeout=30)
+             response.raise_for_status()
+             data = response.json()
+         except requests.RequestException as e:
+             logger.warning(f"CrossRef Event Data API error for {doi}: {e}")
+             return []
+
+         # Parse citations from events
+         citations = []
+         events = data.get("message", {}).get("events", [])
+
+         for event in events:
+             # Get the citing DOI
+             subj = event.get("subj", {})
+             citing_doi_url = subj.get("pid", "")
+
+             # Extract DOI from URL (e.g., "https://doi.org/10.1234/abc" -> "10.1234/abc")
+             citing_doi = citing_doi_url.replace("https://doi.org/", "").replace(
+                 "http://doi.org/", ""
+             )
+
+             if not citing_doi or not citing_doi.startswith("10."):
+                 continue
+
+             # Fetch metadata for the citing DOI via DOI content negotiation
+             metadata = self._fetch_doi_metadata(citing_doi)
+
+             # Create citation record with metadata
+             citation = CitationRecord(
+                 item_id="",  # Will be filled by caller
+                 item_flavor="",  # Will be filled by caller
+                 citation_doi=citing_doi,
+                 citation_title=cast(str | None, metadata.get("title")),
+                 citation_authors=cast(str | None, metadata.get("authors")),
+                 citation_year=cast(int | None, metadata.get("year")),
+                 citation_journal=cast(str | None, metadata.get("journal")),
+                 citation_relationship="Cites",  # type: ignore[arg-type]
+                 citation_source=CitationSource("crossref"),
+                 citation_status="active",  # type: ignore[arg-type]
+             )
+             citations.append(citation)
+
+         # Warn if Event Data returned events but they didn't yield valid citations
+         if len(events) > 0 and len(citations) == 0:
+             logger.info(
+                 f"CrossRef Event Data returned {len(events)} events for {doi} "
+                 f"but none were valid DOI-based citations (may be news/blog references)"
+             )
+
+         # Also check metadata API if we got 0 citations total
+         if len(citations) == 0:
+             try:
+                 meta_resp = self.session.get(f"https://api.crossref.org/works/{doi}", timeout=10)
+                 if meta_resp.status_code == 200:
+                     meta_data = meta_resp.json()
+                     cited_by_count = meta_data.get("message", {}).get("is-referenced-by-count", 0)
+                     if cited_by_count > 0:
+                         logger.warning(
+                             f"CrossRef metadata shows {cited_by_count} citations for {doi}, "
+                             f"but Event Data API has 0 valid citations. "
+                             f"Full cited-by data requires CrossRef membership: "
+                             f"https://www.crossref.org/services/cited-by/"
+                         )
+             except Exception as e:
+                 logger.debug(f"Failed to check cited-by count for {doi}: {e}")
+
+         return citations
+
+     def _fetch_doi_metadata(self, doi: str) -> dict[str, str | int | None]:
+         """
+         Fetch metadata for a DOI via content negotiation.
+
+         Args:
+             doi: The DOI to fetch metadata for
+
+         Returns:
+             Dictionary with title, authors, year, journal
+         """
+         metadata: dict[str, str | int | None] = {
+             "title": None,
+             "authors": None,
+             "year": None,
+             "journal": None,
+         }
+
+         try:
+             response = self.session.get(
+                 f"{self.DOI_API}/{doi}",
+                 headers={"Accept": "application/json"},
+                 timeout=30,
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             # Extract title (sanitize for TSV)
+             metadata["title"] = _sanitize_text(data.get("title"))
+
+             # Extract authors
+             authors = data.get("author", [])
+             if authors:
+                 author_names = [
+                     f"{a.get('given', '')} {a.get('family', '')}".strip() for a in authors
+                 ]
+                 metadata["authors"] = _sanitize_text("; ".join(author_names))
+
+             # Extract year
+             published = data.get("published", {})
+             date_parts = published.get("date-parts", [[]])
+             if date_parts and len(date_parts[0]) > 0:
+                 metadata["year"] = date_parts[0][0]
+
+             # Extract journal (may be string or list, sanitize for TSV)
+             container = data.get("container-title")
+             if isinstance(container, list):
+                 metadata["journal"] = _sanitize_text(container[0]) if container else None
+             else:
+                 metadata["journal"] = _sanitize_text(container)
+
+         except requests.RequestException as e:
+             logger.debug(f"Failed to fetch metadata for DOI {doi}: {e}")
+
+         return metadata
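Editorial note, not part of the package contents: the diff above never shows CrossRefDiscoverer being called, so the following is a minimal usage sketch. It assumes the module lives at citations_collector.discovery.crossref and that ItemRef accepts ref_type and ref_value as keyword arguments; the diff only shows those attributes being read, not the model's constructor.

from datetime import datetime, timedelta

from citations_collector.discovery.crossref import CrossRefDiscoverer  # assumed module path
from citations_collector.models import ItemRef

# Passing an email opts into CrossRef's polite pool, per the constructor docstring.
discoverer = CrossRefDiscoverer(email="curator@example.org")
ref = ItemRef(ref_type="doi", ref_value="10.1234/example")  # assumed keyword arguments

# Full harvest, then an incremental pass covering roughly the last 30 days
# (discover() turns `since` into the from-updated-date filter).
all_records = discoverer.discover(ref)
recent_records = discoverer.discover(ref, since=datetime.now() - timedelta(days=30))

for record in recent_records:
    # item_id and item_flavor are intentionally left empty by the discoverer
    # and are expected to be filled in by the caller.
    print(record.citation_doi, record.citation_title, record.citation_year)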
@@ -0,0 +1,260 @@
+ """DataCite citation discovery via Event Data API and DOI metadata API."""
+
+ from __future__ import annotations
+
+ import logging
+ import re
+ from contextlib import suppress
+ from datetime import datetime
+ from typing import cast
+
+ import requests
+
+ from citations_collector.discovery.base import AbstractDiscoverer
+ from citations_collector.models import CitationRecord, CitationSource, ItemRef
+
+ logger = logging.getLogger(__name__)
+
+
+ def _sanitize_text(text: str | None) -> str | None:
+     """Sanitize text for TSV output - normalize whitespace, remove control chars."""
+     if text is None:
+         return None
+     # Replace newlines, tabs, carriage returns with spaces
+     text = re.sub(r"[\n\r\t]+", " ", text)
+     # Collapse multiple spaces
+     text = re.sub(r" +", " ", text)
+     # Strip leading/trailing whitespace
+     return text.strip() or None
+
+
+ class DataCiteDiscoverer(AbstractDiscoverer):
+     """
+     Discover citations via DataCite APIs.
+
+     Uses two approaches:
+     1. Event Data API - tracks citation events from various sources
+     2. DOI Metadata API - relationships.citations from DOI records
+
+     This provides broader coverage for DataCite-registered content (datasets, etc.).
+     """
+
+     # DataCite Event Data API for citation events
+     EVENT_DATA_URL = "https://api.datacite.org/events"
+     # DataCite DOI Metadata API
+     DOI_API_URL = "https://api.datacite.org/dois"
+     # DOI content negotiation for metadata
+     DOI_ORG = "https://doi.org"
+
+     def __init__(self) -> None:
+         """Initialize DataCite discoverer."""
+         self.session = requests.Session()
+
+     def discover(self, item_ref: ItemRef, since: datetime | None = None) -> list[CitationRecord]:
+         """
+         Discover citations from DataCite.
+
+         Queries both Event Data API and DOI relationships endpoint.
+
+         Args:
+             item_ref: DOI reference to query
+             since: Optional date for incremental updates
+
+         Returns:
+             List of citation records
+         """
+         if item_ref.ref_type != "doi":
+             logger.warning(f"DataCite only supports DOI refs, got {item_ref.ref_type}")
+             return []
+
+         doi = item_ref.ref_value
+         seen_dois: set[str] = set()
+         citations = []
+
+         # Method 1: Event Data API
+         event_citations = self._discover_from_events(doi, since)
+         for citation in event_citations:
+             if citation.citation_doi and citation.citation_doi not in seen_dois:
+                 seen_dois.add(citation.citation_doi)
+                 citations.append(citation)
+
+         # Method 2: DOI relationships.citations (what SPARC-Citations uses)
+         rel_citations = self._discover_from_relationships(doi)
+         for citation in rel_citations:
+             if citation.citation_doi and citation.citation_doi not in seen_dois:
+                 seen_dois.add(citation.citation_doi)
+                 citations.append(citation)
+
+         return citations
+
+     def _discover_from_events(
+         self, doi: str, since: datetime | None = None
+     ) -> list[CitationRecord]:
+         """Query DataCite Event Data API for citation events."""
+         # DataCite requires full DOI URL and uses "references" relation type
+         doi_url = doi if doi.startswith("http") else f"https://doi.org/{doi}"
+         params: dict[str, str | int] = {
+             "obj-id": doi_url,
+             "relation-type-id": "references",
+             "page[size]": 1000,
+         }
+
+         if since:
+             params["occurred-since"] = since.strftime("%Y-%m-%d")
+
+         try:
+             response = self.session.get(
+                 self.EVENT_DATA_URL,
+                 params=params,
+                 timeout=30,  # type: ignore[arg-type]
+             )
+             response.raise_for_status()
+             data = response.json()
+         except requests.RequestException as e:
+             logger.warning(f"DataCite Event Data API error for {doi}: {e}")
+             return []
+
+         citations = []
+         events = data.get("data", [])
+
+         for event in events:
+             attributes = event.get("attributes", {})
+             subj = attributes.get("subj", {})
+
+             subj_id = subj.get("pid")
+             if not subj_id:
+                 continue
+
+             citing_doi = subj_id.replace("https://doi.org/", "").replace("doi:", "")
+
+             # Get metadata from event or fetch via DOI
+             title = _sanitize_text(subj.get("title"))
+             year = None
+             if "published" in subj:
+                 with suppress(ValueError, TypeError):
+                     year = int(subj["published"][:4])
+
+             # If missing metadata, fetch from DOI
+             if not title:
+                 metadata = self._fetch_doi_metadata(citing_doi)
+                 title = cast(str | None, metadata.get("title"))  # Already sanitized
+                 if not year:
+                     year = cast(int | None, metadata.get("year"))
+                 authors = cast(str | None, metadata.get("authors"))
+                 journal = cast(str | None, metadata.get("journal"))
+             else:
+                 authors = None
+                 journal = None
+
+             citation = CitationRecord(
+                 item_id="",
+                 item_flavor="",
+                 citation_doi=citing_doi,
+                 citation_title=title,
+                 citation_authors=authors,
+                 citation_year=year,
+                 citation_journal=journal,
+                 citation_relationship="Cites",  # type: ignore[arg-type]
+                 citation_source=CitationSource("datacite"),
+                 citation_status="active",  # type: ignore[arg-type]
+             )
+             citations.append(citation)
+
+         return citations
+
+     def _discover_from_relationships(self, doi: str) -> list[CitationRecord]:
+         """Query DataCite DOI API for relationships.citations."""
+         url = f"{self.DOI_API_URL}/{doi}"
+
+         try:
+             response = self.session.get(url, timeout=30)
+             response.raise_for_status()
+             data = response.json()
+         except requests.RequestException as e:
+             logger.debug(f"DataCite DOI API error for {doi}: {e}")
+             return []
+
+         citations = []
+
+         # Navigate to relationships.citations.data
+         relationships = data.get("data", {}).get("relationships", {})
+         citations_data = relationships.get("citations", {}).get("data", [])
+
+         for citation_entry in citations_data:
+             citing_doi = citation_entry.get("id")
+             if not citing_doi:
+                 continue
+
+             # Fetch metadata for the citing DOI
+             metadata = self._fetch_doi_metadata(citing_doi)
+
+             citation = CitationRecord(
+                 item_id="",
+                 item_flavor="",
+                 citation_doi=citing_doi,
+                 citation_title=cast(str | None, metadata.get("title")),
+                 citation_authors=cast(str | None, metadata.get("authors")),
+                 citation_year=cast(int | None, metadata.get("year")),
+                 citation_journal=cast(str | None, metadata.get("journal")),
+                 citation_relationship="Cites",  # type: ignore[arg-type]
+                 citation_source=CitationSource("datacite"),
+                 citation_status="active",  # type: ignore[arg-type]
+             )
+             citations.append(citation)
+
+         return citations
+
+     def _fetch_doi_metadata(self, doi: str) -> dict[str, str | int | None]:
+         """
+         Fetch metadata for a DOI via content negotiation.
+
+         Args:
+             doi: The DOI to fetch metadata for
+
+         Returns:
+             Dictionary with title, authors, year, journal
+         """
+         metadata: dict[str, str | int | None] = {
+             "title": None,
+             "authors": None,
+             "year": None,
+             "journal": None,
+         }
+
+         try:
+             response = self.session.get(
+                 f"{self.DOI_ORG}/{doi}",
+                 headers={"Accept": "application/json"},
+                 timeout=30,
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             # Extract title (sanitize for TSV)
+             metadata["title"] = _sanitize_text(data.get("title"))
+
+             # Extract authors
+             authors = data.get("author", [])
+             if authors:
+                 author_names = [
+                     f"{a.get('given', '')} {a.get('family', '')}".strip() for a in authors
+                 ]
+                 metadata["authors"] = _sanitize_text("; ".join(author_names))
+
+             # Extract year
+             published = data.get("published", {})
+             date_parts = published.get("date-parts", [[]])
+             if date_parts and len(date_parts[0]) > 0:
+                 metadata["year"] = date_parts[0][0]
+
+             # Extract journal (may be string or list, sanitize for TSV)
+             container = data.get("container-title")
+             if isinstance(container, list):
+                 metadata["journal"] = _sanitize_text(container[0]) if container else None
+             else:
+                 metadata["journal"] = _sanitize_text(container)
+
+         except requests.RequestException as e:
+             logger.debug(f"Failed to fetch metadata for DOI {doi}: {e}")
+
+         return metadata
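Editorial note, not part of the package contents: a sketch of running both discoverers over the same DOI and de-duplicating by citing DOI, mirroring the seen_dois bookkeeping that DataCiteDiscoverer.discover already performs internally. The module paths, the ItemRef keyword arguments, and the example DOI are assumptions.

from citations_collector.discovery.crossref import CrossRefDiscoverer  # assumed module path
from citations_collector.discovery.datacite import DataCiteDiscoverer  # assumed module path
from citations_collector.models import CitationRecord, ItemRef

ref = ItemRef(ref_type="doi", ref_value="10.1234/example")  # assumed keyword arguments
discoverers = [CrossRefDiscoverer(email="curator@example.org"), DataCiteDiscoverer()]

seen: set[str] = set()
merged: list[CitationRecord] = []
for discoverer in discoverers:
    for record in discoverer.discover(ref):
        # Keep the first record seen for each citing DOI, whichever source reported it;
        # citation_source still records whether it came from "crossref" or "datacite".
        if record.citation_doi and record.citation_doi not in seen:
            seen.add(record.citation_doi)
            merged.append(record)

print(f"{len(merged)} unique citing DOIs across CrossRef and DataCite")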