citations-collector 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,591 @@
1
+ """Sync citations to Zotero as hierarchical collections."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import dataclass, field
7
+
8
+ from pyzotero import zotero
9
+
10
+ from citations_collector.models import CitationRecord, Collection
11
+
12
# Module-level logger; every sync step reports through it.
logger = logging.getLogger(__name__)

# Marker that starts the tracker line written into a Zotero item's ``extra``
# field; lines beginning with it are parsed back when indexing existing items.
TRACKER_PREFIX = "CitationTracker:"
15
+
16
+
17
@dataclass
class SyncReport:
    """Counters and error messages accumulated during one Zotero sync run.

    All counters start at zero; ``errors`` starts as a fresh empty list for
    every instance. In dry-run mode the counters describe what *would* have
    been done.
    """

    # Collections created (item-level, flavor-level and ``Merged``).
    collections_created: int = 0
    # Citation items newly created.
    items_created: int = 0
    # Existing items that were moved and/or linked to a related item.
    items_updated: int = 0
    # Existing items that needed no change.
    items_skipped: int = 0
    # Linked-URL attachments added to newly created items.
    attachments_created: int = 0
    # Human-readable failure messages collected while syncing.
    errors: list[str] = field(default_factory=list)
27
+
28
+
29
class ZoteroSyncer:
    """Sync citation records to Zotero as hierarchical collections.

    Creates a two-level collection hierarchy under the configured top-level
    collection::

        top_collection/
            {item_id}/
                {flavor}/
                    <active citation items>
                    Merged/
                        <preprints and old versions>

    Active citations are dual-assigned to both the item-level and
    flavor-level collections so they appear when browsing either level.
    Merged citations are only placed in the ``Merged`` subcollection.

    Each citation item includes a tracker key in the ``extra`` field
    (``CitationTracker: {item_id}/{flavor}/{doi_or_url}``) so that
    subsequent syncs can detect items that already exist.
    """

    # Maps ``str(citation.citation_type)`` to a Zotero ``itemType``; anything
    # not listed here is stored as a ``journalArticle``.
    _ITEM_TYPE_MAP = {
        "Preprint": "preprint",
        "Thesis": "thesis",
        "Book": "book",
        "Software": "computerProgram",
        "Dataset": "dataset",
    }

    def __init__(self, api_key: str, group_id: int, collection_key: str) -> None:
        """Create a syncer bound to one Zotero group library.

        Args:
            api_key: Zotero API key passed to the pyzotero client.
            group_id: Numeric id of the group library; also used to build
                item URIs for "related item" links.
            collection_key: Key of the top-level collection under which the
                hierarchy is maintained.
        """
        self.zot = zotero.Zotero(group_id, "group", api_key)
        self.group_id = group_id
        self.top_collection_key = collection_key

    def sync(
        self,
        collection: Collection,
        citations: list[CitationRecord],
        dry_run: bool = False,
    ) -> SyncReport:
        """Sync citations to Zotero hierarchy.

        Args:
            collection: The source collection definition (currently not
                consulted by the sync itself).
            citations: Citation records to sync.
            dry_run: If ``True``, log what would happen and perform no write
                calls. Existing collections and items are still *read* from
                Zotero so the report reflects the real library state.

        Returns:
            A :class:`SyncReport` summarising the operations performed.
            Failures to create a collection are recorded in
            ``report.errors`` (and the affected citations skipped) instead
            of aborting the whole run, mirroring how item failures are
            handled.
        """
        report = SyncReport()

        # Snapshot of what already exists so the run is idempotent.
        existing_collections = self._fetch_subcollections(self.top_collection_key)
        existing_items = self._fetch_existing_items()

        grouped = self._group_citations(citations)

        for item_id, flavors in grouped.items():
            item_collection_name = self._strip_prefix(item_id)

            # Find or create item-level collection
            item_coll_key = self._find_collection(existing_collections, item_collection_name)
            if not item_coll_key:
                if dry_run:
                    # Nothing below this collection can exist yet, so every
                    # flavor and citation is reported as "would create".
                    logger.info("Would create collection: %s", item_collection_name)
                    report.collections_created += 1
                    for flavor_id, buckets in flavors.items():
                        logger.info(" Would create sub-collection: %s", flavor_id)
                        report.collections_created += 1
                        self._log_dry_run_items(buckets, report)
                    continue
                try:
                    item_coll_key = self._create_collection(
                        item_collection_name, self.top_collection_key
                    )
                except Exception as e:
                    self._record_collection_error(item_collection_name, e, report)
                    continue
                report.collections_created += 1
                existing_collections[item_coll_key] = item_collection_name
                # A collection created a moment ago has no children; skip the
                # round trip that would only confirm that.
                item_subcollections: dict[str, str] = {}
            else:
                item_subcollections = self._fetch_subcollections(item_coll_key)

            for flavor_id, buckets in flavors.items():
                # Find or create flavor-level collection
                flavor_coll_key = self._find_collection(item_subcollections, flavor_id)
                flavor_is_new = False
                if not flavor_coll_key:
                    if dry_run:
                        logger.info(
                            " Would create sub-collection: %s under %s",
                            flavor_id,
                            item_collection_name,
                        )
                        report.collections_created += 1
                        self._log_dry_run_items(buckets, report)
                        continue
                    try:
                        flavor_coll_key = self._create_collection(flavor_id, item_coll_key)
                    except Exception as e:
                        self._record_collection_error(flavor_id, e, report)
                        continue
                    report.collections_created += 1
                    item_subcollections[flavor_coll_key] = flavor_id
                    flavor_is_new = True

                # Sync active citations — dual-assign to item + flavor collections
                for citation in buckets.get("active", []):
                    self._sync_single_citation(
                        citation,
                        [item_coll_key, flavor_coll_key],
                        existing_items,
                        dry_run,
                        report,
                        is_merged=False,
                    )

                # Sync merged citations — only in Merged subcollection
                merged_list = buckets.get("merged", [])
                if not merged_list:
                    continue

                merged_coll_key: str | None = None
                if not flavor_is_new:
                    flavor_subcollections = self._fetch_subcollections(flavor_coll_key)
                    merged_coll_key = self._find_collection(flavor_subcollections, "Merged")
                if not merged_coll_key:
                    if dry_run:
                        logger.info(" Would create sub-collection: Merged")
                        report.collections_created += 1
                    else:
                        try:
                            merged_coll_key = self._create_collection("Merged", flavor_coll_key)
                        except Exception as e:
                            self._record_collection_error("Merged", e, report)
                            continue
                        report.collections_created += 1

                # In dry-run with no Merged collection yet there is no key to
                # target; an empty list still lets the item logic report moves.
                target = [merged_coll_key] if merged_coll_key else []
                for citation in merged_list:
                    self._sync_single_citation(
                        citation, target, existing_items, dry_run, report, is_merged=True
                    )

        return report

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _log_dry_run_items(buckets: dict[str, list[CitationRecord]], report: SyncReport) -> None:
        """Log and count every citation in *buckets* as "would create"."""
        for bucket_citations in buckets.values():
            for c in bucket_citations:
                logger.info(
                    " Would create item: %s",
                    c.citation_doi or c.citation_title,
                )
                report.items_created += 1

    @staticmethod
    def _record_collection_error(name: str, error: Exception, report: SyncReport) -> None:
        """Log a failed collection creation and store it in ``report.errors``."""
        logger.error("Error creating collection %s: %s", name, error)
        report.errors.append(f"{name}: {error}")

    def _sync_single_citation(
        self,
        citation: CitationRecord,
        collection_keys: list[str],
        existing_items: dict[str, dict],
        dry_run: bool,
        report: SyncReport,
        is_merged: bool = False,
    ) -> None:
        """Create or update a single citation item.

        An item whose tracker key is already known is skipped when active,
        or moved/linked when merged. Otherwise a new item is created and,
        when the record has a ``pdf_url``, a linked-URL attachment is added.

        Args:
            citation: Citation record to sync
            collection_keys: Target collection keys for this citation
            existing_items: Dict of existing items by tracker key; a newly
                created item is added to it so a repeated record in the same
                run is not created twice
            dry_run: If True, log actions without making API calls
            report: Sync report to update
            is_merged: If True, this citation is marked as merged
        """
        tracker_key = self._make_tracker_key(citation)

        existing_item = existing_items.get(tracker_key)
        if existing_item is not None:
            if is_merged:
                self._update_merged_item(
                    citation, existing_item, collection_keys, existing_items, dry_run, report
                )
            else:
                report.items_skipped += 1
            return

        # Create new item
        if dry_run:
            logger.info(" Would create: %s (%s)", citation.citation_title, citation.citation_doi)
            report.items_created += 1
            return

        try:
            zot_item = self._citation_to_zotero_item(citation, collection_keys)
            resp = self.zot.create_items([zot_item])

            if resp.get("successful"):
                report.items_created += 1
                created = resp["successful"]["0"]
                # Remember the new item only when the tracker key carries a
                # real identifier: records lacking both DOI and URL all share
                # one key and must not shadow each other.
                has_identifier = bool(citation.citation_doi or citation.citation_url)
                if has_identifier and isinstance(created, dict) and "data" in created:
                    existing_items[tracker_key] = created
                # Attach PDF link if available; count it only if it was stored.
                if citation.pdf_url:
                    created_key = created["key"]
                    if self._attach_linked_url(
                        created_key, citation.pdf_url, citation.citation_title
                    ):
                        report.attachments_created += 1
            elif resp.get("failed"):
                err = str(resp["failed"])
                logger.error("Failed to create item %s: %s", citation.citation_doi, err)
                report.errors.append(f"{citation.citation_doi}: {err}")
        except Exception as e:
            logger.error("Error creating item %s: %s", citation.citation_doi, e)
            report.errors.append(f"{citation.citation_doi}: {e}")

    def _update_merged_item(
        self,
        citation: CitationRecord,
        existing_item: dict,
        collection_keys: list[str],
        existing_items: dict[str, dict],
        dry_run: bool,
        report: SyncReport,
    ) -> None:
        """Move an existing merged item and link it to its published version.

        The item is moved when its current collections differ from
        *collection_keys*. A related-item link is added when
        ``citation.citation_merged_into`` resolves to a known item that is
        not yet listed in this item's ``dc:relation``.
        """
        current_collections = existing_item["data"].get("collections", [])
        needs_move = set(current_collections) != set(collection_keys)
        needs_relation = False
        published_item = None
        published_key = None

        # Try to find published version for related items link
        if citation.citation_merged_into:
            published_tracker = self._make_tracker_key_for_doi(
                citation.item_id, citation.item_flavor, citation.citation_merged_into
            )
            published_item = existing_items.get(published_tracker)
            if published_item is not None:
                published_key = published_item["data"]["key"]

                # Check if relation already exists
                relations = existing_item["data"].get("relations", {})
                dc_relation = relations.get("dc:relation", [])
                if isinstance(dc_relation, str):
                    dc_relation = [dc_relation]

                published_uri = f"http://zotero.org/groups/{self.group_id}/items/{published_key}"
                needs_relation = published_uri not in dc_relation

        if not (needs_move or needs_relation):
            report.items_skipped += 1
            return

        if dry_run:
            if needs_move:
                logger.info(
                    " Would move existing item to Merged: %s",
                    citation.citation_title,
                )
            if needs_relation:
                logger.info(
                    " Would add related item link: %s",
                    citation.citation_title,
                )
            report.items_updated += 1
            return

        try:
            if needs_move:
                self._move_item_to_collections(existing_item, collection_keys)
                logger.info("Moved item to Merged: %s", citation.citation_title)

            if needs_relation and published_key:
                merged_key = existing_item["data"]["key"]
                items_by_key = {
                    merged_key: existing_item,
                    published_key: published_item,
                }
                self._add_related_item(merged_key, published_key, items_by_key)

            report.items_updated += 1
        except Exception as e:
            logger.error("Error updating item %s: %s", citation.citation_doi, e)
            report.errors.append(f"{citation.citation_doi}: {e}")

    def _fetch_subcollections(self, parent_key: str) -> dict[str, str]:
        """Fetch subcollections under *parent_key*. Returns ``{key: name}``.

        The query is wrapped in ``zot.everything`` so that every result page
        is read; otherwise collections beyond the first page would look
        missing and be created again. On any error a warning is logged and
        an empty dict is returned (best effort).
        """
        try:
            collections = self.zot.everything(self.zot.collections_sub(parent_key))
        except Exception as e:
            logger.warning("Error fetching subcollections of %s: %s", parent_key, e)
            return {}
        try:
            return {c["key"]: c["data"]["name"] for c in collections}
        except Exception as e:
            logger.warning("Unexpected subcollection data under %s: %s", parent_key, e)
            return {}

    def _fetch_existing_items(self) -> dict[str, dict]:
        """Fetch all items under the top collection tree, indexed by tracker key.

        Walks subcollections recursively since ``collection_items`` only
        returns items directly in the given collection. Attachments and
        notes are ignored, as are items without a tracker line in ``extra``.
        If a request fails, a warning is logged and whatever was collected
        so far is returned.
        """
        items: dict[str, dict] = {}
        try:
            collection_keys = self._collect_all_subcollection_keys(self.top_collection_key)
            collection_keys.append(self.top_collection_key)
            for coll_key in collection_keys:
                coll_items = self.zot.everything(self.zot.collection_items(coll_key))
                for item in coll_items:
                    data = item["data"]
                    if data.get("itemType") in ("attachment", "note"):
                        continue
                    # Only the first tracker line of an item is honoured.
                    for line in data.get("extra", "").split("\n"):
                        if line.startswith(TRACKER_PREFIX):
                            tracker_key = line[len(TRACKER_PREFIX) :].strip()
                            items[tracker_key] = item
                            break
        except Exception as e:
            logger.warning("Error fetching existing items: %s", e)
        return items

    def _collect_all_subcollection_keys(self, parent_key: str) -> list[str]:
        """Recursively collect all subcollection keys under a parent.

        Keys are returned depth-first: each child is followed immediately by
        its own descendants.
        """
        keys: list[str] = []
        for key in self._fetch_subcollections(parent_key):
            keys.append(key)
            keys.extend(self._collect_all_subcollection_keys(key))
        return keys

    def _group_citations(
        self, citations: list[CitationRecord]
    ) -> dict[str, dict[str, dict[str, list[CitationRecord]]]]:
        """Group citations by ``item_id -> flavor -> status_bucket -> [citations]``.

        ``status_bucket`` is either ``"active"`` or ``"merged"``. A record
        with no status counts as active. Other statuses (e.g. ``ignored``)
        are skipped entirely.
        """
        grouped: dict[str, dict[str, dict[str, list[CitationRecord]]]] = {}
        for c in citations:
            status = str(c.citation_status) if c.citation_status else "active"
            if status not in ("active", "merged"):
                continue
            by_flavor = grouped.setdefault(c.item_id, {})
            by_bucket = by_flavor.setdefault(c.item_flavor, {})
            by_bucket.setdefault(status, []).append(c)
        return grouped

    def _get_item_name(self, citations: list[CitationRecord], item_id: str) -> str | None:
        """Return the item name from the first citation matching *item_id*.

        Citations with an empty ``item_name`` are passed over; ``None`` is
        returned when no citation qualifies.
        """
        return next(
            (c.item_name for c in citations if c.item_id == item_id and c.item_name),
            None,
        )

    @staticmethod
    def _strip_prefix(item_id: str) -> str:
        """Strip namespace prefix: ``'dandi:000020'`` -> ``'000020'``.

        Only the part before the first ``:`` is removed; an id without a
        colon is returned unchanged.
        """
        _, sep, remainder = item_id.partition(":")
        return remainder if sep else item_id

    @staticmethod
    def _find_collection(collections: dict[str, str], name: str) -> str | None:
        """Find collection key by name (first match), or ``None``."""
        return next((key for key, coll_name in collections.items() if coll_name == name), None)

    def _create_collection(self, name: str, parent_key: str) -> str:
        """Create a new collection under *parent_key*. Returns the new key.

        Raises:
            RuntimeError: If the API response reports no successful creation.
        """
        payload = {"name": name, "parentCollection": parent_key}
        resp = self.zot.create_collections([payload])
        if not resp.get("successful"):
            raise RuntimeError(f"Failed to create collection '{name}': {resp}")
        return str(resp["successful"]["0"]["key"])

    def _citation_to_zotero_item(
        self, citation: CitationRecord, collection_keys: list[str]
    ) -> dict:
        """Convert a :class:`CitationRecord` to a Zotero item dict.

        The tracker line (and, if known, the discovery source) is written to
        ``extra``. Authors are split on ``"; "`` and each name is divided at
        its last space into first/last name; a name without a space is
        stored as a single-field creator.
        """
        # Determine item type
        item_type = "journalArticle"
        if citation.citation_type:
            item_type = self._ITEM_TYPE_MAP.get(str(citation.citation_type), "journalArticle")

        # Build creators list
        creators = []
        if citation.citation_authors:
            for author in citation.citation_authors.split("; "):
                parts = author.rsplit(" ", 1)
                if len(parts) == 2:
                    creators.append(
                        {"creatorType": "author", "firstName": parts[0], "lastName": parts[1]}
                    )
                else:
                    creators.append({"creatorType": "author", "name": author})

        tracker_key = self._make_tracker_key(citation)
        extra_lines = [f"{TRACKER_PREFIX} {tracker_key}"]
        if citation.citation_source:
            extra_lines.append(f"Discovery Source: {citation.citation_source}")

        # Build base item
        item = {
            "itemType": item_type,
            "title": citation.citation_title or "",
            "creators": creators,
            "DOI": citation.citation_doi or "",
            "url": citation.citation_url or "",
            "date": str(citation.citation_year) if citation.citation_year else "",
            "extra": "\n".join(extra_lines),
            # Copy so the payload does not alias the caller's list.
            "collections": list(collection_keys),
        }

        # Add journal/repository field based on item type
        if citation.citation_journal:
            if item_type == "preprint":
                # Preprints use 'repository' field (e.g., bioRxiv, arXiv)
                item["repository"] = citation.citation_journal
            else:
                # NOTE(review): 'publicationTitle' is sent for every other
                # item type; confirm against the Zotero item-type schema that
                # book/thesis/computerProgram/dataset accept this field.
                item["publicationTitle"] = citation.citation_journal

        return item

    @staticmethod
    def _make_tracker_key(citation: CitationRecord) -> str:
        """Create tracker key for the ``extra`` field.

        The identifier part is the DOI, else the URL, else empty.
        """
        identifier = citation.citation_doi or citation.citation_url or ""
        return f"{citation.item_id}/{citation.item_flavor}/{identifier}"

    @staticmethod
    def _make_tracker_key_for_doi(item_id: str, flavor: str, doi: str) -> str:
        """Create tracker key for a specific DOI."""
        return f"{item_id}/{flavor}/{doi}"

    def _attach_linked_url(self, parent_key: str, url: str, title: str | None = None) -> bool:
        """Attach a linked URL to a Zotero item.

        Best effort: a failure is logged as a warning, never raised.

        Returns:
            ``True`` if the create request completed without raising,
            ``False`` otherwise.
        """
        attachment = {
            "itemType": "attachment",
            "linkMode": "linked_url",
            "url": url,
            "title": title or "PDF",
            "parentItem": parent_key,
            "tags": [],
            "relations": {},
            "contentType": "application/pdf",
        }
        try:
            self.zot.create_items([attachment])
        except Exception as e:
            logger.warning("Failed to attach URL to %s: %s", parent_key, e)
            return False
        return True

    def _move_item_to_collections(
        self, existing_item: dict, new_collection_keys: list[str]
    ) -> None:
        """Move an existing Zotero item to different collections.

        Args:
            existing_item: The existing item dict from Zotero API
            new_collection_keys: List of collection keys to move the item to

        Raises:
            Exception: Whatever the update call raised, after logging it.
        """
        data = existing_item["data"]
        # Partial update: only the collection membership is replaced.
        updated_data = {
            "key": data["key"],
            "version": data["version"],
            "collections": new_collection_keys,
        }

        try:
            self.zot.update_item(updated_data)
        except Exception as e:
            logger.error("Failed to update item collections: %s", e)
            raise
        logger.info(
            "Updated collections for item %s: %s",
            data.get("title", ""),
            new_collection_keys,
        )

    def _add_related_item(
        self, item1_key: str, item2_key: str, existing_items_by_key: dict[str, dict]
    ) -> None:
        """Add bidirectional 'Related Items' link between two Zotero items.

        Both items are re-read from Zotero right before being updated: the
        cached copies may carry an outdated ``version`` (for instance when
        the item was moved to another collection earlier in the same run),
        and an update sent with an outdated version is rejected.
        Best effort: any failure is logged as a warning, never raised.

        Args:
            item1_key: Zotero key of first item (e.g., merged preprint)
            item2_key: Zotero key of second item (e.g., published version)
            existing_items_by_key: Dict of existing items indexed by Zotero
                key; both keys must be present or nothing is done
        """
        try:
            if not existing_items_by_key.get(item1_key) or not existing_items_by_key.get(
                item2_key
            ):
                logger.warning(
                    "Cannot add relation: item keys not found (%s, %s)", item1_key, item2_key
                )
                return

            self._link_related(item1_key, item2_key)
            self._link_related(item2_key, item1_key)
        except Exception as e:
            logger.warning("Failed to add related items: %s", e)

    def _link_related(self, source_key: str, target_key: str) -> None:
        """Add *target_key* to the ``dc:relation`` list of *source_key*.

        Reads the current item state first and sends no update when the
        link is already present.
        """
        item = self.zot.item(source_key)

        relations = item["data"].get("relations", {})
        if not isinstance(relations, dict):
            relations = {}

        # ``dc:relation`` may be a single URI string or a list of URIs.
        related = relations.get("dc:relation", [])
        if isinstance(related, str):
            related = [related]
        elif not isinstance(related, list):
            related = []

        target_uri = f"http://zotero.org/groups/{self.group_id}/items/{target_key}"
        if target_uri in related:
            return

        relations["dc:relation"] = [*related, target_uri]
        self.zot.update_item(
            {
                "key": source_key,
                "version": item["data"]["version"],
                "relations": relations,
            }
        )
        logger.info("Added related item link: %s -> %s", source_key, target_key)