citations_collector-0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,252 @@
+ """OpenAlex citation discovery."""
+
+ from __future__ import annotations
+
+ import logging
+ import re
+ import time
+ from datetime import datetime
+ from typing import Any
+
+ import requests
+
+ from citations_collector.discovery.base import AbstractDiscoverer
+ from citations_collector.models import CitationRecord, CitationSource, ItemRef
+
+ logger = logging.getLogger(__name__)
+
+
+ def _sanitize_text(text: str | None) -> str | None:
+     """Sanitize text for TSV output - normalize whitespace, remove control chars."""
+     if text is None:
+         return None
+     # Replace newlines, tabs, carriage returns with spaces
+     text = re.sub(r"[\n\r\t]+", " ", text)
+     # Collapse multiple spaces
+     text = re.sub(r" +", " ", text)
+     # Strip leading/trailing whitespace
+     return text.strip() or None
+
+
+ class OpenAlexDiscoverer(AbstractDiscoverer):
+     """Discover citations via OpenAlex API."""
+
+     BASE_URL = "https://api.openalex.org"
+     RATE_LIMIT_DELAY = 0.1  # 10 requests/second = 0.1s between requests
+
+     def __init__(self, email: str | None = None, api_key: str | None = None) -> None:
+         """
+         Initialize OpenAlex discoverer.
+
+         Args:
+             email: Email for polite pool (added to the User-Agent)
+             api_key: Optional API key for higher rate limits
+         """
+         self.email = email
+         self.api_key = api_key
+         self.session = requests.Session()
+
+         # Set User-Agent with mailto for polite pool
+         user_agent = "citations-collector"
+         if email:
+             user_agent += f" (mailto:{email})"
+         self.session.headers["User-Agent"] = user_agent
+
+         self._last_request_time = 0.0
+
+     def discover(self, item_ref: ItemRef, since: datetime | None = None) -> list[CitationRecord]:
+         """
+         Discover citations from OpenAlex.
+
+         Args:
+             item_ref: DOI reference to query
+             since: Optional date for incremental updates (from_publication_date filter)
+
+         Returns:
+             List of citation records
+         """
+         if item_ref.ref_type != "doi":
+             logger.warning(f"OpenAlex only supports DOI refs, got {item_ref.ref_type}")
+             return []
+
+         doi = item_ref.ref_value
+
+         # First resolve DOI to OpenAlex ID (required for cites: filter)
+         openalex_id = self._resolve_doi_to_id(doi)
+         if not openalex_id:
+             logger.warning(f"Could not resolve DOI {doi} to OpenAlex ID")
+             return []
+
+         # Query OpenAlex for works that cite this work
+         # Filter format: cites:{openalex_id} (e.g., cites:W4409283533)
+         citations = []
+         cursor = "*"  # OpenAlex uses cursor-based pagination
+
+         while cursor:
+             self._rate_limit()
+
+             params: dict[str, Any] = {
+                 "filter": f"cites:{openalex_id}",
+                 "per-page": 200,  # Max per page
+                 "cursor": cursor,
+             }
+
+             if self.email:
+                 params["mailto"] = self.email
+
+             # Add date filter if provided
+             if since:
+                 date_str = since.strftime("%Y-%m-%d")
+                 params["filter"] += f",from_publication_date:{date_str}"
+
+             try:
+                 response = self.session.get(
+                     f"{self.BASE_URL}/works",
+                     params=params,
+                     timeout=30,
+                 )
+                 response.raise_for_status()
+                 data = response.json()
+             except requests.RequestException as e:
+                 logger.warning(f"OpenAlex API error for {doi}: {e}")
+                 break
+
+             # Parse results
+             results = data.get("results", [])
+             for work in results:
+                 citation = self._parse_work(work)
+                 if citation:
+                     citations.append(citation)
+
+             # Check for next page
+             meta = data.get("meta", {})
+             cursor = meta.get("next_cursor")
+
+             # Stop if we've processed all results
+             if not cursor or not results:
+                 break
+
+         return citations
+
+     def _resolve_doi_to_id(self, doi: str) -> str | None:
+         """
+         Resolve a DOI to its OpenAlex ID.
+
+         Args:
+             doi: The DOI to resolve (e.g., "10.1038/s41586-025-08790-w")
+
+         Returns:
+             OpenAlex ID (e.g., "W4409283533") or None if not found
+         """
+         self._rate_limit()
+
+         try:
+             response = self.session.get(
+                 f"{self.BASE_URL}/works/https://doi.org/{doi}",
+                 timeout=30,
+             )
+             response.raise_for_status()
+             work = response.json()
+
+             # Extract ID from URL (e.g., "https://openalex.org/W4409283533" -> "W4409283533")
+             openalex_url = work.get("id")
+             if openalex_url:
+                 openalex_id: str = openalex_url.split("/")[-1]
+                 logger.debug(f"Resolved DOI {doi} to OpenAlex ID {openalex_id}")
+                 return openalex_id
+
+         except requests.RequestException as e:
+             logger.warning(f"Failed to resolve DOI {doi} to OpenAlex ID: {e}")
+
+         return None
+
+     def _rate_limit(self) -> None:
+         """Implement rate limiting to stay under 10 req/sec."""
+         elapsed = time.time() - self._last_request_time
+         if elapsed < self.RATE_LIMIT_DELAY:
+             time.sleep(self.RATE_LIMIT_DELAY - elapsed)
+         self._last_request_time = time.time()
+
+     def _parse_work(self, work: dict[str, Any]) -> CitationRecord | None:
+         """
+         Parse an OpenAlex work into a CitationRecord.
+
+         Args:
+             work: OpenAlex work object
+
+         Returns:
+             CitationRecord or None if missing required fields
+         """
+         # Extract DOI
+         doi = work.get("doi")
+         if not doi:
+             return None
+
+         # Remove https://doi.org/ prefix if present
+         doi = doi.replace("https://doi.org/", "").replace("http://doi.org/", "")
+
+         if not doi.startswith("10."):
+             return None
+
+         # Extract title
+         title = _sanitize_text(work.get("title"))
+
+         # Extract authors
+         authorships = work.get("authorships", [])
+         authors = []
+         for authorship in authorships:
+             author_obj = authorship.get("author", {})
+             display_name = author_obj.get("display_name")
+             if display_name:
+                 authors.append(display_name)
+         authors_str = _sanitize_text("; ".join(authors)) if authors else None
+
+         # Extract year
+         pub_year = work.get("publication_year")
+
+         # Extract journal/venue
+         primary_location = work.get("primary_location") or {}
+         source = primary_location.get("source") or {}
+         journal = _sanitize_text(source.get("display_name")) if source else None
+
+         # Determine citation type based on work type
+         work_type = work.get("type")
+         citation_type = self._map_work_type(work_type)
+
+         return CitationRecord(
+             item_id="",  # Will be filled by caller
+             item_flavor="",  # Will be filled by caller
+             citation_doi=doi,
+             citation_title=title,
+             citation_authors=authors_str,
+             citation_year=pub_year,
+             citation_journal=journal,
+             citation_type=citation_type,  # type: ignore[arg-type]
+             citation_relationship="Cites",  # type: ignore[arg-type]
+             citation_source=CitationSource("openalex"),
+             citation_status="active",  # type: ignore[arg-type]
+         )
+
+     def _map_work_type(self, work_type: str | None) -> str | None:
+         """
+         Map OpenAlex work type to CitationType.
+
+         OpenAlex types: article, book, dataset, paratext, preprint, etc.
+         See: https://docs.openalex.org/api-entities/works/work-object#type
+         """
+         if not work_type:
+             return None
+
+         type_mapping = {
+             "article": "Publication",
+             "book-chapter": "Book",
+             "monograph": "Book",
+             "book": "Book",
+             "dataset": "Dataset",
+             "preprint": "Preprint",
+             "posted-content": "Preprint",
+             "dissertation": "Thesis",
+             "other": "Other",
+         }
+
+         return type_mapping.get(work_type.lower(), "Other")
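
A minimal usage sketch for the discoverer above. It assumes ItemRef accepts string
ref_type/ref_value keywords (as the `!= "doi"` check in discover() suggests); the
email is a placeholder, and the DOI is the example from the _resolve_doi_to_id docstring:

    from datetime import datetime
    from citations_collector.models import ItemRef

    discoverer = OpenAlexDiscoverer(email="you@example.org")  # joins the polite pool
    ref = ItemRef(ref_type="doi", ref_value="10.1038/s41586-025-08790-w")
    # Incremental update: only citing works published on or after this date
    records = discoverer.discover(ref, since=datetime(2024, 1, 1))
    for record in records:
        print(record.citation_doi, record.citation_year)
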
@@ -0,0 +1,168 @@
+ """OpenCitations citation discovery."""
+
+ from __future__ import annotations
+
+ import logging
+ import re
+ from datetime import datetime
+ from typing import cast
+
+ import requests
+
+ from citations_collector.discovery.base import AbstractDiscoverer
+ from citations_collector.models import CitationRecord, CitationSource, ItemRef
+
+ logger = logging.getLogger(__name__)
+
+
+ def _sanitize_text(text: str | None) -> str | None:
+     """Sanitize text for TSV output - normalize whitespace, remove control chars."""
+     if text is None:
+         return None
+     # Replace newlines, tabs, carriage returns with spaces
+     text = re.sub(r"[\n\r\t]+", " ", text)
+     # Collapse multiple spaces
+     text = re.sub(r" +", " ", text)
+     # Strip leading/trailing whitespace
+     return text.strip() or None
+
+
+ class OpenCitationsDiscoverer(AbstractDiscoverer):
+     """Discover citations via OpenCitations COCI API."""
+
+     BASE_URL = "https://opencitations.net/index/coci/api/v1/citations"
+     DOI_API = "https://doi.org"
+
+     def __init__(self) -> None:
+         """Initialize OpenCitations discoverer."""
+         self.session = requests.Session()
+
+     def discover(self, item_ref: ItemRef, since: datetime | None = None) -> list[CitationRecord]:
+         """
+         Discover citations from OpenCitations COCI.
+
+         Args:
+             item_ref: DOI reference to query
+             since: Optional date for incremental updates (creation date filter)
+
+         Returns:
+             List of citation records
+         """
+         if item_ref.ref_type != "doi":
+             logger.warning(f"OpenCitations only supports DOI refs, got {item_ref.ref_type}")
+             return []
+
+         doi = item_ref.ref_value
+         url = f"{self.BASE_URL}/{doi}"
+
+         try:
+             response = self.session.get(url, timeout=30)
+             response.raise_for_status()
+             data = response.json()
+         except requests.RequestException as e:
+             logger.warning(f"OpenCitations API error for {doi}: {e}")
+             return []
+
+         # Parse citations from response
+         citations = []
+         if not isinstance(data, list):
+             data = [data]
+
+         for item in data:
+             citing_doi = item.get("citing")
+             if not citing_doi:
+                 continue
+
+             # Apply date filter if provided
+             if since:
+                 creation_date = item.get("creation", "")
+                 try:
+                     # Parse creation date (formats: YYYY-MM, YYYY-MM-DD, YYYY)
+                     if creation_date:
+                         # Convert to datetime for comparison
+                         if len(creation_date) == 4:  # YYYY
+                             item_date = datetime.strptime(creation_date, "%Y")
+                         elif len(creation_date) == 7:  # YYYY-MM
+                             item_date = datetime.strptime(creation_date, "%Y-%m")
+                         else:  # YYYY-MM-DD
+                             item_date = datetime.strptime(creation_date, "%Y-%m-%d")
+
+                         if item_date < since:
+                             continue  # Skip older citations
+                 except ValueError:
+                     pass  # Include if we can't parse date
+
+             # Fetch metadata for the citing DOI
+             metadata = self._fetch_doi_metadata(citing_doi)
+
+             # Create citation record with metadata
+             citation = CitationRecord(
+                 item_id="",  # Will be filled by caller
+                 item_flavor="",  # Will be filled by caller
+                 citation_doi=citing_doi,
+                 citation_title=cast(str | None, metadata.get("title")),
+                 citation_authors=cast(str | None, metadata.get("authors")),
+                 citation_year=cast(int | None, metadata.get("year")),
+                 citation_journal=cast(str | None, metadata.get("journal")),
+                 citation_relationship="Cites",  # type: ignore[arg-type]
+                 citation_source=CitationSource("opencitations"),
+                 citation_status="active",  # type: ignore[arg-type]
+             )
+             citations.append(citation)
+
+         return citations
+
+     def _fetch_doi_metadata(self, doi: str) -> dict[str, str | int | None]:
+         """
+         Fetch metadata for a DOI via content negotiation.
+
+         Args:
+             doi: The DOI to fetch metadata for
+
+         Returns:
+             Dictionary with title, authors, year, journal
+         """
+         metadata: dict[str, str | int | None] = {
+             "title": None,
+             "authors": None,
+             "year": None,
+             "journal": None,
+         }
+
+         try:
+             response = self.session.get(
+                 f"{self.DOI_API}/{doi}",
+                 headers={"Accept": "application/json"},
+                 timeout=30,
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             # Extract title (sanitize for TSV)
+             metadata["title"] = _sanitize_text(data.get("title"))
+
+             # Extract authors
+             authors = data.get("author", [])
+             if authors:
+                 author_names = [
+                     f"{a.get('given', '')} {a.get('family', '')}".strip() for a in authors
+                 ]
+                 metadata["authors"] = _sanitize_text("; ".join(author_names))
+
+             # Extract year
+             published = data.get("published", {})
+             date_parts = published.get("date-parts", [[]])
+             if date_parts and len(date_parts[0]) > 0:
+                 metadata["year"] = date_parts[0][0]
+
+             # Extract journal (may be string or list, sanitize for TSV)
+             container = data.get("container-title")
+             if isinstance(container, list):
+                 metadata["journal"] = _sanitize_text(container[0]) if container else None
+             else:
+                 metadata["journal"] = _sanitize_text(container)
+
+         except requests.RequestException as e:
+             logger.debug(f"Failed to fetch metadata for DOI {doi}: {e}")
+
+         return metadata
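
A hedged sketch of the flow, reusing the ItemRef import from the previous sketch.
Note that discover() reads only the "citing" and "creation" keys from each COCI
record; the title/authors/year/journal on each returned CitationRecord come from
_fetch_doi_metadata(), i.e. content negotiation against https://doi.org, not from
COCI itself:

    discoverer = OpenCitationsDiscoverer()
    ref = ItemRef(ref_type="doi", ref_value="10.1038/s41586-025-08790-w")
    records = discoverer.discover(ref)  # one CitationRecord per citing DOI
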
@@ -0,0 +1,62 @@
+ """Utility functions for citation discovery."""
+
+ from __future__ import annotations
+
+ from citations_collector.models import CitationRecord
+
+
+ def build_doi_url(doi: str) -> str:
+     """
+     Build resolver URL for DOI.
+
+     Args:
+         doi: DOI string (without doi: prefix)
+
+     Returns:
+         Full DOI resolver URL
+     """
+     return f"https://doi.org/{doi}"
+
+
+ def deduplicate_citations(citations: list[CitationRecord]) -> list[CitationRecord]:
+     """
+     Deduplicate citations by unique key (item_id, item_flavor, citation_doi).
+
+     When duplicates are found across sources, all sources are collected in the
+     citation_sources field.
+
+     Args:
+         citations: List of citation records
+
+     Returns:
+         Deduplicated list with sources merged
+     """
+     # Group citations by unique key
+     grouped: dict[tuple[str, str, str | None], list[CitationRecord]] = {}
+
+     for citation in citations:
+         key = (citation.item_id, citation.item_flavor, citation.citation_doi)
+         if key not in grouped:
+             grouped[key] = []
+         grouped[key].append(citation)
+
+     # Build unique list, merging sources
+     unique = []
+     for _key, group in grouped.items():
+         # Use first citation as base
+         citation = group[0]
+
+         # Collect all sources that found this citation
+         sources = []
+         for c in group:
+             if c.citation_source and c.citation_source not in sources:
+                 sources.append(c.citation_source)
+
+         # Always set citation_sources to the list (even if single source)
+         if sources:
+             citation.citation_sources = sources  # type: ignore[assignment]
+             # Keep citation_source set to first source (required field, backward compat)
+             citation.citation_source = sources[0]  # type: ignore[assignment]
+
+         unique.append(citation)
+
+     return unique
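
A sketch of how results from both discoverers might be combined; openalex_records
and opencitations_records are hypothetical lists returned by the two discover()
calls shown earlier:

    merged = deduplicate_citations(openalex_records + opencitations_records)
    # A citation found by both sources collapses to one record whose
    # citation_sources lists both, while citation_source keeps the first
    # source encountered (required field, backward compat).
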
@@ -0,0 +1,17 @@
+ """Importers and reference expanders for citations-collector."""
+
+ from __future__ import annotations
+
+ from citations_collector.importers.bibtex import BibTeXImporter
+ from citations_collector.importers.dandi import DANDIImporter
+ from citations_collector.importers.github import GitHubMapper
+ from citations_collector.importers.zenodo import ZenodoExpander
+ from citations_collector.importers.zotero import ZoteroImporter
+
+ __all__ = [
+     "BibTeXImporter",
+     "DANDIImporter",
+     "GitHubMapper",
+     "ZenodoExpander",
+     "ZoteroImporter",
+ ]
@@ -0,0 +1,178 @@
+ """Import items from BibTeX files."""
+
+ from __future__ import annotations
+
+ import logging
+ import re
+ from datetime import date
+ from pathlib import Path
+ from typing import Any
+
+ import bibtexparser
+
+ from citations_collector.models.generated import (
+     Collection,
+     Item,
+     ItemFlavor,
+     ItemRef,
+     RefType,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Suppress bibtexparser's duplicate key warnings - we handle deduplication ourselves
+ logging.getLogger("bibtexparser").setLevel(logging.ERROR)
+
+
+ class BibTeXImporter:
+     """Import items from BibTeX files with regex-based parsing."""
+
+     def __init__(
+         self,
+         bibtex_file: Path,
+         bib_field: str,
+         ref_type: RefType,
+         ref_regex: str,
+     ) -> None:
+         """
+         Initialize BibTeX importer.
+
+         Args:
+             bibtex_file: Path to .bib file
+             bib_field: BibTeX field to extract reference from (e.g., 'doi')
+             ref_type: Type of reference (e.g., RefType.doi)
+             ref_regex: Regex with named groups (?P<item_id>...) and (?P<flavor_id>...)
+         """
+         self.bibtex_file = bibtex_file
+         self.bib_field = bib_field
+         self.ref_type = ref_type
+         self.ref_pattern = re.compile(ref_regex)
+
+         # Validate regex has required groups
+         if "item_id" not in self.ref_pattern.groupindex:
+             raise ValueError("ref_regex must contain (?P<item_id>...) named group")
+
+     def import_all(self) -> Collection:
+         """
+         Import all entries from BibTeX file.
+
+         Groups entries by item_id, creating one Item per unique item_id
+         with multiple flavors (versions).
+
+         Returns:
+             Collection with items parsed from BibTeX entries
+         """
+         if not self.bibtex_file.exists():
+             raise FileNotFoundError(f"BibTeX file not found: {self.bibtex_file}")
+
+         # Parse BibTeX file
+         library = bibtexparser.parse_file(str(self.bibtex_file))
+
+         # Group flavors by item_id
+         items_dict: dict[str, dict[str, Any]] = {}
+         skipped = 0
+
+         for entry in library.entries:
+             result = self._entry_to_flavor(entry)
+             if result:
+                 item_id, flavor, name = result
+                 if item_id not in items_dict:
+                     items_dict[item_id] = {"name": name, "flavors": [], "seen_flavors": set()}
+                 # Deduplicate flavors by flavor_id
+                 if flavor.flavor_id not in items_dict[item_id]["seen_flavors"]:
+                     items_dict[item_id]["flavors"].append(flavor)
+                     items_dict[item_id]["seen_flavors"].add(flavor.flavor_id)
+             else:
+                 skipped += 1
+
+         # Build Item objects with merged flavors
+         items = [
+             Item(
+                 item_id=item_id,
+                 name=data["name"],
+                 flavors=data["flavors"],
+             )
+             for item_id, data in items_dict.items()
+         ]
+
+         total_flavors = sum(len(item.flavors) for item in items)
+         logger.info(
+             f"Imported {len(items)} items ({total_flavors} flavors) "
+             f"from {self.bibtex_file.name}, skipped {skipped}"
+         )
+
+         return Collection(
+             name=f"BibTeX: {self.bibtex_file.stem}",
+             description=f"Items imported from {self.bibtex_file}",
+             items=items,
+         )
+
+     def _entry_to_flavor(self, entry: Any) -> tuple[str, ItemFlavor, str] | None:
+         """
+         Convert BibTeX entry to flavor components.
+
+         Args:
+             entry: BibTeX entry from bibtexparser
+
+         Returns:
+             Tuple of (item_id, flavor, name) if reference can be parsed, None otherwise
+         """
+         # Get reference value from specified field
+         ref_value = entry.fields_dict.get(self.bib_field)
+         if not ref_value:
+             logger.debug(f"Entry {entry.key} missing field '{self.bib_field}', skipping")
+             return None
+
+         ref_value_str = ref_value.value if hasattr(ref_value, "value") else str(ref_value)
+
+         # Parse with regex to extract item_id and flavor_id
+         match = self.ref_pattern.match(ref_value_str)
+         if not match:
+             logger.warning(
+                 f"Entry {entry.key}: '{self.bib_field}' value '{ref_value_str}' "
+                 f"doesn't match regex pattern, skipping"
+             )
+             return None
+
+         # Normalize to lowercase for consistency (DOIs are case-insensitive).
+         # The flavor_id group may be absent from the pattern or may not have
+         # participated in the match; fall back to "main" in both cases.
+         item_id = match.group("item_id").lower()
+         flavor_id_group = match.groupdict().get("flavor_id")
+         flavor_id = flavor_id_group.lower() if flavor_id_group else "main"
+
+         # Extract metadata
+         title = self._get_field(entry, "title")
+         year = self._get_field(entry, "year")
+         release_date = self._parse_year(year) if year else None
+
+         # Build ItemRef (normalize DOI to lowercase for consistency)
+         item_ref = ItemRef(
+             ref_type=self.ref_type,
+             ref_value=ref_value_str.lower() if self.ref_type == RefType.doi else ref_value_str,
+         )
+
+         # Build ItemFlavor
+         flavor = ItemFlavor(
+             flavor_id=flavor_id,
+             name=title or f"Version {flavor_id}",
+             release_date=release_date,
+             refs=[item_ref],
+         )
+
+         # Return components for grouping by item_id
+         item_name = title or item_id
+         return (item_id, flavor, item_name)
+
+     def _get_field(self, entry: Any, field_name: str) -> str | None:
+         """Extract field value from BibTeX entry."""
+         field = entry.fields_dict.get(field_name)
+         if not field:
+             return None
+         return field.value if hasattr(field, "value") else str(field)
+
+     def _parse_year(self, year_str: str) -> date | None:
+         """Parse year string to date (Jan 1 of that year)."""
+         try:
+             year = int(year_str)
+             return date(year, 1, 1)
+         except (ValueError, TypeError):
+             logger.warning(f"Could not parse year: {year_str}")
+             return None
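
A hedged example of wiring up the importer. The regex here is hypothetical: it
assumes DANDI-style DOIs such as "10.48324/dandi.000003/0.230629.1955", treating
the dandiset number as the item and the version as the flavor:

    from pathlib import Path
    from citations_collector.models.generated import RefType

    importer = BibTeXImporter(
        bibtex_file=Path("refs.bib"),
        bib_field="doi",
        ref_type=RefType.doi,
        ref_regex=r"10\.48324/dandi\.(?P<item_id>\d+)/(?P<flavor_id>[\w.]+)",
    )
    collection = importer.import_all()  # one Item per dandiset, one flavor per version
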