citations-collector 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
1
+ """Import items from Zotero groups/collections."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ import re
8
+ from datetime import date
9
+ from typing import Any, cast
10
+
11
+ from pyzotero import zotero
12
+
13
+ from citations_collector.models import Collection, Item, ItemFlavor, ItemRef, RefType
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class ZoteroImporter:
    """
    Import items from a Zotero group or collection.

    Extracts DOIs and other identifiers from Zotero item metadata
    to create a Collection for citation tracking.

    Example:
        importer = ZoteroImporter(api_key="your-api-key")
        collection = importer.import_group(group_id=5774211)
    """

    # A bare DOI: "10." + registrant code (4+ digits) + "/" + suffix.
    DOI_PATTERN = re.compile(r"^10\.\d{4,}/[^\s]+$")

    # Prefixes commonly pasted into Zotero's DOI field that must be
    # stripped before validation ("doi:10...." or a resolver URL).
    _DOI_FIELD_PREFIX = re.compile(
        r"^(?:doi:\s*|https?://(?:dx\.)?doi\.org/)", re.IGNORECASE
    )

    def __init__(self, api_key: str | None = None) -> None:
        """
        Initialize Zotero importer.

        Args:
            api_key: Zotero API key. If not provided, reads from
                ZOTERO_API_KEY environment variable.
                Public groups can be read without an API key.
        """
        self.api_key = api_key or os.getenv("ZOTERO_API_KEY")

    def import_group(
        self,
        group_id: int,
        collection_key: str | None = None,
        limit: int | None = None,
    ) -> Collection:
        """
        Import items from a Zotero group.

        Args:
            group_id: Zotero group ID
            collection_key: Optional collection key within the group.
                If None, imports all items in the group.
            limit: Optional limit on number of items to import.

        Returns:
            Collection with items extracted from Zotero.
            Each item gets:
            - item_id: "zotero:{item_key}" or DOI-based ID if available
            - flavor_id: "main" (single flavor per item)
            - ref: DOI extracted from Zotero metadata
        """
        # Initialize pyzotero client for the group library.
        zot = zotero.Zotero(group_id, "group", self.api_key)

        # Fetch raw items (whole group, or a single collection within it).
        if collection_key:
            raw_items = self._fetch_collection_items(zot, collection_key, limit)
            collection_name = self._get_collection_name(zot, collection_key)
        else:
            raw_items = self._fetch_all_items(zot, limit)
            collection_name = self._get_group_name(zot, group_id)

        # Drop attachments/notes and anything without a usable identifier.
        items: list[Item] = [
            item
            for raw_item in raw_items
            if (item := self._zotero_item_to_item(raw_item)) is not None
        ]

        # Lazy %-args so the message is only formatted when emitted.
        logger.info("Imported %d items from Zotero group %s", len(items), group_id)

        return Collection(
            name=collection_name or f"Zotero Group {group_id}",
            description=f"Items imported from Zotero group {group_id}",
            homepage=f"https://www.zotero.org/groups/{group_id}",
            source_type="zotero",
            zotero_group_id=group_id,
            zotero_collection_key=collection_key,
            items=items,
        )

    def _fetch_all_items(
        self, zot: zotero.Zotero, limit: int | None = None
    ) -> list[dict[str, Any]]:
        """Fetch items from a Zotero library (all pages unless limited)."""
        try:
            if limit:
                return cast(list[dict[str, Any]], zot.items(limit=limit))
            # zot.everything() follows pagination to retrieve every item.
            return cast(list[dict[str, Any]], zot.everything(zot.items()))
        except Exception as e:
            # Best-effort: an unreachable/invalid library yields an empty list.
            logger.error("Failed to fetch Zotero items: %s", e)
            return []

    def _fetch_collection_items(
        self, zot: zotero.Zotero, collection_key: str, limit: int | None = None
    ) -> list[dict[str, Any]]:
        """Fetch items from a specific collection (all pages unless limited)."""
        try:
            if limit:
                return cast(list[dict[str, Any]], zot.collection_items(collection_key, limit=limit))
            return cast(
                list[dict[str, Any]], zot.everything(zot.collection_items(collection_key))
            )
        except Exception as e:
            logger.error("Failed to fetch collection %s: %s", collection_key, e)
            return []

    def _get_collection_name(self, zot: zotero.Zotero, collection_key: str) -> str | None:
        """Get the name of a collection, or None if it cannot be read."""
        try:
            collection = zot.collection(collection_key)
            name: str | None = collection.get("data", {}).get("name")
            return name
        except Exception:
            return None

    def _get_group_name(self, zot: zotero.Zotero, group_id: int) -> str | None:
        """Get the name of a group."""
        # pyzotero doesn't have a direct group info method for group libraries.
        # Return None so import_group() falls back to "Zotero Group {id}".
        return None

    def _zotero_item_to_item(self, raw_item: dict) -> Item | None:
        """
        Convert a raw Zotero item to a Collection Item.

        Args:
            raw_item: Raw item payload from pyzotero

        Returns:
            Item with DOI/PMID/URL refs, or None for attachments, notes,
            and items with no usable identifier.
        """
        data = raw_item.get("data", {})
        item_key = raw_item.get("key", "")

        # Attachments and notes are not citable works.
        if data.get("itemType", "") in ("attachment", "note"):
            return None

        doi = self._extract_doi(data)

        # Zotero stores PMIDs in the free-text "extra" field.
        extra = data.get("extra", "")
        pmid_match = re.search(r"PMID:\s*(\d+)", extra)
        pmid_value = pmid_match.group(1) if pmid_match else None

        # Build refs in priority order: DOI, then PMID.
        refs: list[ItemRef] = []
        if doi:
            refs.append(ItemRef(ref_type=RefType.doi, ref_value=doi))
        if pmid_value:
            refs.append(ItemRef(ref_type=RefType.pmid, ref_value=pmid_value))

        url = data.get("url")
        if not refs:
            # Fall back to the URL; without any identifier, skip the item.
            if url:
                refs.append(ItemRef(ref_type=RefType.url, ref_value=url))
            else:
                logger.debug("Skipping Zotero item %s: no DOI, PMID, or URL", item_key)
                return None

        # Prefer a stable DOI-based ID over the Zotero key.
        item_id = f"doi:{doi}" if doi else f"zotero:{item_key}"

        title = data.get("title", f"Untitled ({item_key})")
        release_date = self._parse_date(data.get("date", ""))

        # Single "main" flavor carrying all refs for this item.
        flavor = ItemFlavor(
            flavor_id="main",
            name=title,
            release_date=release_date,
            refs=refs,
        )

        return Item(
            item_id=item_id,
            name=title,
            homepage=url,
            flavors=[flavor],
        )

    def _extract_doi(self, data: dict[str, Any]) -> str | None:
        """
        Extract a bare DOI from Zotero item data.

        Checks the dedicated DOI field (tolerating "doi:" and resolver-URL
        prefixes), then the URL field, then the free-text "extra" field.
        """
        # Dedicated DOI field; normalize common prefixes before validating.
        raw_doi = str(data.get("DOI", "")).strip()
        if raw_doi:
            candidate = self._DOI_FIELD_PREFIX.sub("", raw_doi)
            if self.DOI_PATTERN.match(candidate):
                return candidate

        # URL field containing a doi.org (or dx.doi.org) link.
        url = data.get("url", "")
        if "doi.org/" in url:
            match = re.search(r"doi\.org/(10\.\d{4,}/[^\s]+)", url)
            if match:
                return match.group(1)

        # Free-text "extra" field, e.g. "DOI: 10.1234/abcd".
        extra = data.get("extra", "")
        match = re.search(r"DOI:\s*(10\.\d{4,}/[^\s]+)", extra, re.IGNORECASE)
        if match:
            return match.group(1)

        return None

    def _parse_date(self, date_str: str) -> date | None:
        """Parse the date formats Zotero commonly emits.

        Returns None when the string is empty or unrecognized. Partial
        dates (year only, month + year) are pinned to the first day.
        """
        if not date_str:
            return None

        # ISO timestamps: keep only the leading YYYY-MM-DD part.
        try:
            return date.fromisoformat(date_str[:10])
        except ValueError:
            pass

        # Bare year -> January 1st of that year.
        if re.fullmatch(r"\d{4}", date_str):
            return date(int(date_str), 1, 1)

        # Other common textual formats (incl. month+year partial dates).
        from datetime import datetime

        for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d", "%Y/%m/%d", "%B %Y", "%b %Y"):
            try:
                return datetime.strptime(date_str, fmt).date()
            except ValueError:
                continue

        return None
@@ -0,0 +1,216 @@
1
+ """Detect and mark merged citations (preprints with published versions)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import TYPE_CHECKING
7
+
8
+ import requests
9
+ from rapidfuzz import fuzz
10
+
11
+ if TYPE_CHECKING:
12
+ from citations_collector.models.generated import CitationRecord
13
+
14
+ from citations_collector.models.generated import CitationStatus
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class MergeDetector:
20
+ """Detect preprints that have published versions and mark them as merged."""
21
+
22
+ def __init__(self, email: str = "site-unpaywall@oneukrainian.com", timeout: int = 30):
23
+ """Initialize the merge detector.
24
+
25
+ Args:
26
+ email: Email for CrossRef API (polite pool)
27
+ timeout: HTTP request timeout in seconds
28
+ """
29
+ self.email = email
30
+ self.timeout = timeout
31
+ self.session = requests.Session()
32
+ self.session.headers.update({"User-Agent": f"citations-collector ({email})"})
33
+
34
+ def detect_merged_pairs(self, citations: list[CitationRecord]) -> dict[str, str]:
35
+ """Detect which citations are preprints with published versions.
36
+
37
+ Args:
38
+ citations: List of citation records to analyze
39
+
40
+ Returns:
41
+ Dictionary mapping preprint DOI -> published DOI
42
+ """
43
+ merged_pairs: dict[str, str] = {}
44
+ doi_to_citation = {c.citation_doi: c for c in citations if c.citation_doi}
45
+
46
+ for citation in citations:
47
+ if not citation.citation_doi:
48
+ continue
49
+
50
+ # Check if this is a preprint with a published version
51
+ published_doi = self._get_published_version(citation.citation_doi)
52
+ if published_doi and (
53
+ published_doi in doi_to_citation or self._verify_doi_exists(published_doi)
54
+ ):
55
+ merged_pairs[citation.citation_doi] = published_doi
56
+ logger.info(f"Detected merge: {citation.citation_doi} -> {published_doi}")
57
+
58
+ return merged_pairs
59
+
60
+ def _get_published_version(self, doi: str) -> str | None:
61
+ """Get the published version DOI for a preprint.
62
+
63
+ Args:
64
+ doi: DOI of the potential preprint
65
+
66
+ Returns:
67
+ DOI of published version, or None if not found
68
+ """
69
+ try:
70
+ # Query CrossRef for this DOI's metadata
71
+ url = f"https://api.crossref.org/works/{doi}"
72
+ params = {"mailto": self.email}
73
+ resp = self.session.get(url, params=params, timeout=self.timeout)
74
+ resp.raise_for_status()
75
+ data = resp.json()
76
+
77
+ message = data.get("message", {})
78
+
79
+ # Check for "is-preprint-of" relationship
80
+ relations = message.get("relation", {})
81
+ is_preprint_of = relations.get("is-preprint-of", [])
82
+
83
+ for rel in is_preprint_of:
84
+ if "id" in rel:
85
+ # Extract DOI from the full URL if needed
86
+ rel_id = str(rel["id"])
87
+ if rel_id.startswith("https://doi.org/"):
88
+ return str(rel_id.replace("https://doi.org/", ""))
89
+ elif rel_id.startswith("http://dx.doi.org/"):
90
+ return str(rel_id.replace("http://dx.doi.org/", ""))
91
+ return str(rel_id)
92
+
93
+ # Check if this is a bioRxiv/medRxiv preprint (common case)
94
+ # Sometimes the relationship isn't explicit but the DOI pattern helps
95
+ if self._is_preprint_server(doi):
96
+ # Try to find via title fuzzy matching in our dataset
97
+ # (this is a fallback and should be used carefully)
98
+ pass
99
+
100
+ except requests.RequestException as e:
101
+ logger.warning(f"Failed to check CrossRef for {doi}: {e}")
102
+ except (KeyError, ValueError) as e:
103
+ logger.warning(f"Unexpected CrossRef response format for {doi}: {e}")
104
+
105
+ return None
106
+
107
+ def _is_preprint_server(self, doi: str) -> bool:
108
+ """Check if DOI is from a known preprint server.
109
+
110
+ Args:
111
+ doi: DOI to check
112
+
113
+ Returns:
114
+ True if from a preprint server
115
+ """
116
+ preprint_prefixes = [
117
+ "10.1101/", # bioRxiv, medRxiv
118
+ "10.31219/", # OSF Preprints
119
+ "10.20944/", # Preprints.org
120
+ "10.48550/", # arXiv
121
+ ]
122
+ return any(doi.startswith(prefix) for prefix in preprint_prefixes)
123
+
124
+ def _verify_doi_exists(self, doi: str) -> bool:
125
+ """Verify that a DOI exists and is accessible.
126
+
127
+ Args:
128
+ doi: DOI to verify
129
+
130
+ Returns:
131
+ True if DOI exists
132
+ """
133
+ try:
134
+ url = f"https://api.crossref.org/works/{doi}"
135
+ params = {"mailto": self.email}
136
+ resp = self.session.get(url, params=params, timeout=self.timeout, allow_redirects=False)
137
+ return bool(resp.status_code == 200)
138
+ except requests.RequestException:
139
+ return False
140
+
141
+ def mark_merged_citations(
142
+ self, citations: list[CitationRecord], merged_pairs: dict[str, str]
143
+ ) -> int:
144
+ """Mark citations as merged in place.
145
+
146
+ Args:
147
+ citations: List of citation records to update
148
+ merged_pairs: Dictionary mapping preprint DOI -> published DOI
149
+
150
+ Returns:
151
+ Number of citations marked as merged
152
+ """
153
+ marked_count = 0
154
+ for citation in citations:
155
+ if citation.citation_doi and citation.citation_doi in merged_pairs:
156
+ citation.citation_status = CitationStatus.merged
157
+ citation.citation_merged_into = merged_pairs[citation.citation_doi]
158
+ marked_count += 1
159
+ logger.info(
160
+ f"Marked {citation.citation_doi} as merged into {citation.citation_merged_into}"
161
+ )
162
+ return marked_count
163
+
164
+ def fuzzy_match_by_title(
165
+ self,
166
+ citations: list[CitationRecord],
167
+ threshold: int = 90,
168
+ ) -> dict[str, str]:
169
+ """Find potential merges by fuzzy title matching (fallback method).
170
+
171
+ This is a heuristic approach for cases where CrossRef relationships
172
+ are not explicitly registered.
173
+
174
+ Args:
175
+ citations: List of citation records
176
+ threshold: Minimum similarity score (0-100) for matching
177
+
178
+ Returns:
179
+ Dictionary mapping preprint DOI -> published DOI candidates
180
+ """
181
+ potential_pairs: dict[str, str] = {}
182
+ preprints = [
183
+ c for c in citations if c.citation_doi and self._is_preprint_server(c.citation_doi)
184
+ ]
185
+ published = [
186
+ c for c in citations if c.citation_doi and not self._is_preprint_server(c.citation_doi)
187
+ ]
188
+
189
+ for preprint in preprints:
190
+ if not preprint.citation_title:
191
+ continue
192
+
193
+ best_match = None
194
+ best_score: float = 0.0
195
+
196
+ for pub in published:
197
+ if not pub.citation_title:
198
+ continue
199
+
200
+ # Check if they have similar authors (if available)
201
+ # and similar publication years
202
+ # (This is a heuristic - use with caution)
203
+
204
+ score = fuzz.ratio(preprint.citation_title.lower(), pub.citation_title.lower())
205
+ if score > best_score and score >= threshold:
206
+ best_score = score
207
+ best_match = pub
208
+
209
+ if best_match and preprint.citation_doi and best_match.citation_doi:
210
+ logger.info(
211
+ f"Fuzzy match found (score {best_score}): "
212
+ f"{preprint.citation_doi} ~> {best_match.citation_doi}"
213
+ )
214
+ potential_pairs[preprint.citation_doi] = best_match.citation_doi
215
+
216
+ return potential_pairs
@@ -0,0 +1,44 @@
1
+ """Data models for citations-collector.
2
+
3
+ Generated from LinkML schema at schema/citations.yaml.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from citations_collector.models.generated import (
9
+ CitationRecord,
10
+ CitationRelationship,
11
+ CitationSource,
12
+ CitationStatus,
13
+ CitationType,
14
+ Collection,
15
+ CurationConfig,
16
+ CurationRule,
17
+ DiscoverConfig,
18
+ Item,
19
+ ItemFlavor,
20
+ ItemRef,
21
+ PdfsConfig,
22
+ RefType,
23
+ SourceConfig,
24
+ ZoteroConfig,
25
+ )
26
+
27
+ __all__ = [
28
+ "CitationRecord",
29
+ "CitationRelationship",
30
+ "CitationSource",
31
+ "CitationStatus",
32
+ "CitationType",
33
+ "Collection",
34
+ "CurationConfig",
35
+ "CurationRule",
36
+ "DiscoverConfig",
37
+ "Item",
38
+ "ItemFlavor",
39
+ "ItemRef",
40
+ "PdfsConfig",
41
+ "RefType",
42
+ "SourceConfig",
43
+ "ZoteroConfig",
44
+ ]