citations_collector-0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,503 @@
+ """Core orchestration for citation collection."""
+
+ from __future__ import annotations
+
+ import logging
+ from collections.abc import Callable
+ from datetime import datetime
+ from pathlib import Path
+
+ from tqdm import tqdm
+ from tqdm.contrib.logging import logging_redirect_tqdm
+
+ from citations_collector.discovery import (
+     CrossRefDiscoverer,
+     DataCiteDiscoverer,
+     OpenAlexDiscoverer,
+     OpenCitationsDiscoverer,
+ )
+ from citations_collector.discovery.utils import deduplicate_citations
+ from citations_collector.models import CitationRecord, Collection
+ from citations_collector.persistence import tsv_io, yaml_io
+
+ logger = logging.getLogger(__name__)
+
+
+ class CitationCollector:
+     """
+     Main orchestration class for citation collection.
+
+     Provides a library-first API for loading collections, discovering citations,
+     and saving results.
+     """
+
+     def __init__(self, collection: Collection, collection_path: Path | None = None) -> None:
+         """
+         Initialize with a Collection object.
+
+         Args:
+             collection: Collection object to manage
+             collection_path: Path to collection YAML file (for resolving relative paths)
+         """
+         self.collection = collection
+         self.collection_path = collection_path
+         self.citations: list[CitationRecord] = []
+         self._skip_yaml_save = False  # Skip the YAML save when items come from an external source
+
+     @classmethod
+     def from_yaml(cls, path: Path) -> CitationCollector:
+         """
+         Load collection from YAML file.
+
+         Args:
+             path: Path to collection YAML file
+
+         Returns:
+             CitationCollector instance
+         """
+         collection = yaml_io.load_collection(path)
+         return cls(collection, collection_path=path)
+
+     def populate_from_source(
+         self, progress_callback: Callable[[int, int | None], None] | None = None
+     ) -> None:
+         """
+         Dynamically populate items from source configuration.
+
+         If collection.source is configured (e.g., type="dandi" with dandiset_ids),
+         fetches the items from the source API and adds them to collection.items.
+
+         This allows collections to stay up-to-date without manually maintaining
+         item lists - the items are fetched dynamically at discovery time.
+
+         Args:
+             progress_callback: Optional callback(current, total) for progress reporting
+         """
+         if not self.collection.source or not self.collection.source.type:
+             return
+
+         source_type = self.collection.source.type
+         logger.info(f"Populating items from source: {source_type}")
+
+         if source_type == "dandi":
+             self._populate_from_dandi(progress_callback)
+         elif source_type == "bibtex":
+             self._populate_from_bibtex(progress_callback)
+         else:
+             logger.warning(f"Unknown source type: {source_type}")
+
+     def _populate_from_dandi(
+         self, progress_callback: Callable[[int, int | None], None] | None = None
+     ) -> None:
+         """Populate items from DANDI Archive using source.dandiset_ids."""
+         from citations_collector.importers import DANDIImporter
+
+         if not self.collection.source or not self.collection.source.dandiset_ids:
+             logger.warning("No dandiset_ids specified in source config")
+             return
+
+         dandiset_ids = self.collection.source.dandiset_ids
+         importer = DANDIImporter()
+
+         # Import specific dandisets
+         imported = importer.import_specific(
+             dandiset_ids=dandiset_ids,
+             progress_callback=progress_callback,
+         )
+
+         # Add imported items to collection (avoiding duplicates)
+         if not imported.items:
+             logger.warning("No items imported from DANDI")
+             return
+
+         if not self.collection.items:
+             self.collection.items = []
+
+         existing_ids = {item.item_id for item in self.collection.items}
+         for item in imported.items:
+             if item.item_id not in existing_ids:
+                 self.collection.items.append(item)
+                 existing_ids.add(item.item_id)  # Guard against duplicates within the import batch
+                 logger.info(f"Added item from DANDI: {item.item_id}")
+             else:
+                 logger.debug(f"Skipping duplicate item: {item.item_id}")
+
+     def _populate_from_bibtex(
+         self, progress_callback: Callable[[int, int | None], None] | None = None
+     ) -> None:
+         """
+         Populate items from BibTeX source.
+
+         Reads the BibTeX file specified in the source config and parses entries,
+         using a regex pattern to extract item_id and flavor_id.
+         """
+         from citations_collector.importers import BibTeXImporter
+
+         source = self.collection.source
+         if not source:
+             return
+
+         bibtex_file = Path(source.bibtex_file) if source.bibtex_file else None
+         if not bibtex_file:
+             logger.error("BibTeX source requires bibtex_file to be specified")
+             return
+
+         if not bibtex_file.is_absolute() and self.collection_path:
+             # Resolve relative paths against the collection YAML's directory
+             bibtex_file = (self.collection_path.parent / bibtex_file).resolve()
+
+         if not bibtex_file.exists():
+             logger.error(f"BibTeX file not found: {bibtex_file}")
+             return
+
+         # Validate required fields
+         if not source.bib_field:
+             logger.error("BibTeX source requires bib_field to be specified")
+             return
+         if not source.ref_type:
+             logger.error("BibTeX source requires ref_type to be specified")
+             return
+         if not source.ref_regex:
+             logger.error("BibTeX source requires ref_regex to be specified")
+             return
+
+         if progress_callback:
+             progress_callback(0, None)
+
+         logger.info(f"Reading BibTeX from {bibtex_file.name}")
+
+         # Import from BibTeX
+         importer = BibTeXImporter(
+             bibtex_file=bibtex_file,
+             bib_field=source.bib_field,
+             ref_type=source.ref_type,
+             ref_regex=source.ref_regex,
+         )
+
+         try:
+             bib_collection = importer.import_all()
+         except Exception as e:
+             logger.error(f"Failed to import from BibTeX: {e}")
+             return
+
+         # Handle the update_items setting
+         bib_items = bib_collection.items or []
+         if source.update_items == "sync":
+             # Replace all items with the BibTeX items
+             self.collection.items = bib_items
+             logger.info(f"Synced {len(bib_items)} items from BibTeX")
+         elif source.update_items == "add":
+             # Add only new items (by item_id)
+             if not self.collection.items:
+                 self.collection.items = []
+             existing_ids = {item.item_id for item in self.collection.items}
+             new_items = [item for item in bib_items if item.item_id not in existing_ids]
+             self.collection.items.extend(new_items)
+             logger.info(f"Added {len(new_items)} new items from BibTeX")
+         else:
+             # update_items false/omitted: use the BibTeX items for discovery,
+             # but do not persist them back to the YAML.
+             self.collection.items = bib_items
+             self._skip_yaml_save = True  # Items are maintained externally
+             logger.info(f"Loaded {len(bib_items)} items from BibTeX (not saving to YAML)")
+
+         if progress_callback:
+             progress_callback(len(bib_items), len(bib_items))
+
+     def expand_refs(
+         self,
+         github_token: str | None = None,
+         zenodo_token: str | None = None,
+         expand_types: list[str] | None = None,
+     ) -> None:
+         """
+         Expand non-DOI references to DOI references.
+
+         This pre-processes the collection to convert references like:
+         - zenodo_concept → multiple DOI refs (concept + all versions)
+         - github → DOI ref (via Zenodo badge extraction)
+
+         Expanded DOIs are added to the item's refs list alongside original refs.
+
+         Args:
+             github_token: Optional GitHub token for API rate limits
+             zenodo_token: Optional Zenodo token for authentication
+             expand_types: Which ref types to expand (default: ["zenodo_concept", "github"])
+         """
+         from citations_collector.importers import GitHubMapper, ZenodoExpander
+
+         if expand_types is None:
+             expand_types = ["zenodo_concept", "github"]
+
+         if not self.collection.items:
+             return
+
+         # Initialize expanders/mappers
+         zenodo_expander = (
+             ZenodoExpander(zenodo_token=zenodo_token) if "zenodo_concept" in expand_types else None
+         )
+         github_mapper = (
+             GitHubMapper(github_token=github_token) if "github" in expand_types else None
+         )
+
+         for item in self.collection.items:
+             for flavor in item.flavors:
+                 # Collect expanded refs
+                 expanded_refs = []
+
+                 for ref in flavor.refs:
+                     # Expand zenodo_concept to all version DOIs
+                     if ref.ref_type == "zenodo_concept" and zenodo_expander:
+                         logger.info(f"Expanding Zenodo concept {ref.ref_value} for {item.item_id}")
+                         doi_refs = zenodo_expander.expand(ref.ref_value)
+                         expanded_refs.extend(doi_refs)
+
+                     # Map github to Zenodo DOI
+                     elif ref.ref_type == "github" and github_mapper:
+                         logger.info(f"Mapping GitHub {ref.ref_value} to DOI for {item.item_id}")
+                         doi_ref = github_mapper.map_to_doi(ref.ref_value)
+                         if doi_ref:
+                             expanded_refs.append(doi_ref)
+
+                 # Add expanded refs to the flavor, avoiding duplicates
+                 existing_ref_values = {(ref.ref_type, ref.ref_value) for ref in flavor.refs}
+                 for expanded_ref in expanded_refs:
+                     ref_key = (expanded_ref.ref_type, expanded_ref.ref_value)
+                     if ref_key not in existing_ref_values:
+                         flavor.refs.append(expanded_ref)
+                         existing_ref_values.add(ref_key)
+
+     def _get_most_recent_discovery_date(self) -> datetime | None:
+         """
+         Get the most recent discovery date from existing citations.
+
+         Used for incremental discovery to avoid re-querying old citations.
+
+         Returns:
+             Most recent discovered_date, or None if no citations exist
+         """
+         if not self.citations:
+             return None
+
+         dates = [c.discovered_date for c in self.citations if c.discovered_date]
+         if not dates:
+             return None
+
+         # Convert date to datetime for API compatibility
+         most_recent = max(dates)
+         return datetime.combine(most_recent, datetime.min.time())
+
+     def _report_discoveries(self, new_citations: list[CitationRecord]) -> None:
+         """
+         Report discovered citations grouped by DOI, showing which sources found each.
+
+         Only reports citations that are NEW (not already in self.citations).
+
+         Args:
+             new_citations: Newly discovered citations to report
+         """
+         if not new_citations:
+             return
+
+         # Build set of existing citation keys to identify truly new ones
+         existing_keys = {(c.item_id, c.item_flavor, c.citation_doi) for c in self.citations}
+
+         # Group new citations by DOI
+         doi_groups: dict[str, list[CitationRecord]] = {}
+         for citation in new_citations:
+             key = (citation.item_id, citation.item_flavor, citation.citation_doi)
+             # Only report if not already in existing citations
+             if key not in existing_keys:
+                 doi = citation.citation_doi or "unknown"
+                 doi_groups.setdefault(doi, []).append(citation)
+
+         # Report each new DOI with sources
+         if doi_groups:
+             logger.info(f"\nDiscovered {len(doi_groups)} new citations:")
+             for doi, group in sorted(doi_groups.items()):
+                 # Collect all sources that found this DOI
+                 sources = set()
+                 for citation in group:
+                     if hasattr(citation, "citation_sources") and citation.citation_sources:
+                         sources.update(citation.citation_sources)
+                     elif citation.citation_source:
+                         sources.add(citation.citation_source)
+
+                 # Format sources
+                 sources_str = ", ".join(sorted(sources)) if sources else "unknown"
+
+                 # Show item_id/flavor and title (truncated)
+                 item_ref = f"{group[0].item_id}/{group[0].item_flavor}"
+                 title = (group[0].citation_title or "")[:60]
+                 title_suffix = "..." if len(group[0].citation_title or "") > 60 else ""
+                 logger.info(f"  {doi} [{sources_str}]")
+                 logger.info(f"    → {item_ref}: {title}{title_suffix}")
+
+     def discover_all(
+         self,
+         sources: list[str] | None = None,
+         incremental: bool = True,
+         since_date: datetime | None = None,
+         email: str | None = None,
+     ) -> None:
+         """
+         Discover citations for all items in collection.
+
+         Args:
+             sources: Which discoverers to use (default: all available).
+                 Available: "crossref", "opencitations", "datacite", "openalex"
+             incremental: Derive since date from existing citations for incremental discovery
+             since_date: Optional explicit since date (overrides incremental)
+             email: Email for CrossRef polite pool
+         """
+         if sources is None:
+             sources = ["crossref", "opencitations", "datacite", "openalex"]
+
+         # Initialize discoverers
+         discoverers: list[
+             tuple[
+                 str,
+                 CrossRefDiscoverer
+                 | OpenCitationsDiscoverer
+                 | DataCiteDiscoverer
+                 | OpenAlexDiscoverer,
+             ]
+         ] = []
+         if "crossref" in sources:
+             discoverers.append(("crossref", CrossRefDiscoverer(email=email)))
+         if "opencitations" in sources:
+             discoverers.append(("opencitations", OpenCitationsDiscoverer()))
+         if "datacite" in sources:
+             discoverers.append(("datacite", DataCiteDiscoverer()))
+         if "openalex" in sources:
+             discoverers.append(("openalex", OpenAlexDiscoverer(email=email)))
+
+         # Determine since date for incremental discovery
+         since = since_date  # Explicit override takes precedence
+         if since is None and incremental:
+             # Derive from existing citations (most recent discovered_date)
+             since = self._get_most_recent_discovery_date()
+
+         # Discover citations for all items/flavors/refs
+         all_citations: list[CitationRecord] = []
+
+         if not self.collection.items:
+             return
+
+         # Count total refs for progress bar
+         total_refs = sum(
+             len(flavor.refs) for item in self.collection.items for flavor in item.flavors
+         )
+
+         # Create progress bar with logging redirection; disable it when DEBUG
+         # logging is enabled so debug messages remain visible
+         with (
+             logging_redirect_tqdm(),
+             tqdm(
+                 total=total_refs * len(discoverers),
+                 desc="Discovering citations",
+                 unit="query",
+                 disable=logger.getEffectiveLevel() <= logging.DEBUG,
+             ) as pbar,
+         ):
+             for item in self.collection.items:
+                 for flavor in item.flavors:
+                     for ref in flavor.refs:
+                         for source_name, discoverer in discoverers:
+                             try:
+                                 citations = discoverer.discover(ref, since=since)
+
+                                 # Fill in item context and track source
+                                 for citation in citations:
+                                     citation.item_id = item.item_id
+                                     citation.item_flavor = flavor.flavor_id
+                                     citation.item_ref_type = ref.ref_type
+                                     citation.item_ref_value = ref.ref_value
+                                     citation.item_name = item.name
+                                     # Track which source found this citation
+                                     citation.citation_source = source_name  # type: ignore[assignment]
+
+                                 all_citations.extend(citations)
+                                 logger.debug(
+                                     f"{source_name}: {len(citations)} citations "
+                                     f"for {item.item_id}/{flavor.flavor_id}"
+                                 )
+
+                             except Exception as e:
+                                 logger.error(
+                                     f"Error discovering from {source_name} "
+                                     f"for {item.item_id}/{flavor.flavor_id}: {e}"
+                                 )
+
+                             # Update progress
+                             pbar.update(1)
+
+         # Deduplicate and merge with existing
+         unique_citations = deduplicate_citations(all_citations)
+
+         # Report new citations grouped by DOI with sources
+         self._report_discoveries(unique_citations)
+
+         self.merge_citations(unique_citations)
+
+     def load_existing_citations(self, path: Path) -> None:
+         """
+         Load existing citations from TSV (preserves curation).
+
+         Args:
+             path: Path to TSV file
+         """
+         self.citations = tsv_io.load_citations(path)
+
+     def merge_citations(self, new_citations: list[CitationRecord]) -> None:
+         """
+         Merge new citations with existing ones, preserving curation status.
+
+         Uses the unique key (item_id, item_flavor, citation_doi).
+
+         Args:
+             new_citations: New citations to merge
+         """
+         # Build index of existing citations
+         existing_index = {(c.item_id, c.item_flavor, c.citation_doi): c for c in self.citations}
+
+         # Merge new citations
+         for new_citation in new_citations:
+             key = (new_citation.item_id, new_citation.item_flavor, new_citation.citation_doi)
+
+             if key in existing_index:
+                 # Citation exists - keep curation fields (status, comment, etc.)
+                 # and only refresh metadata if the record is uncurated
+                 existing = existing_index[key]
+                 if existing.citation_status == "active" and not existing.citation_comment:
+                     # Update title, authors, etc. from the new discovery
+                     if new_citation.citation_title:
+                         existing.citation_title = new_citation.citation_title
+                     if new_citation.citation_authors:
+                         existing.citation_authors = new_citation.citation_authors
+                     if new_citation.citation_year:
+                         existing.citation_year = new_citation.citation_year
+                     if new_citation.citation_journal:
+                         existing.citation_journal = new_citation.citation_journal
+             else:
+                 # New citation - add it
+                 self.citations.append(new_citation)
+                 existing_index[key] = new_citation
+
+     def save(self, yaml_path: Path, tsv_path: Path) -> None:
+         """
+         Save collection and citations.
+
+         Args:
+             yaml_path: Path to output collection YAML
+             tsv_path: Path to output citations TSV
+         """
+         # Only save YAML if items are managed in YAML (not external source)
+         if not self._skip_yaml_save:
+             yaml_io.save_collection(self.collection, yaml_path)
+         else:
+             logger.info("Skipping YAML save - items managed externally")
+         tsv_io.save_citations(self.citations, tsv_path)
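
A minimal end-to-end usage sketch of the class above, for orientation. The import path for CitationCollector and the file names are assumptions (the diff viewer does not show file paths); the method names and call order come from the code itself.

from pathlib import Path

from citations_collector import CitationCollector  # hypothetical import path

collector = CitationCollector.from_yaml(Path("collection.yaml"))
collector.populate_from_source()  # no-op unless collection.source is configured
collector.expand_refs()           # zenodo_concept/github refs → DOI refs
collector.load_existing_citations(Path("citations.tsv"))  # preserves prior curation
collector.discover_all(incremental=True, email="you@example.org")
collector.save(Path("collection.yaml"), Path("citations.tsv"))

Loading the existing TSV before discover_all matters: it both preserves curation on merge and supplies the most recent discovered_date for incremental discovery.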
@@ -0,0 +1,17 @@
+ """Citation discovery from external APIs."""
+
+ from __future__ import annotations
+
+ from citations_collector.discovery.base import AbstractDiscoverer
+ from citations_collector.discovery.crossref import CrossRefDiscoverer
+ from citations_collector.discovery.datacite import DataCiteDiscoverer
+ from citations_collector.discovery.openalex import OpenAlexDiscoverer
+ from citations_collector.discovery.opencitations import OpenCitationsDiscoverer
+
+ __all__ = [
+     "AbstractDiscoverer",
+     "CrossRefDiscoverer",
+     "DataCiteDiscoverer",
+     "OpenAlexDiscoverer",
+     "OpenCitationsDiscoverer",
+ ]
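
These re-exports also make each discoverer usable on its own. A short sketch, assuming ItemRef accepts the ref_type/ref_value fields that the core module reads from it (its constructor is not shown in this diff) and a hypothetical DOI:

from citations_collector.discovery import CrossRefDiscoverer
from citations_collector.models import ItemRef

# Query a single source for citations of one DOI.
ref = ItemRef(ref_type="doi", ref_value="10.1234/example")
for c in CrossRefDiscoverer(email="you@example.org").discover(ref):
    print(c.citation_doi, c.citation_title)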
@@ -0,0 +1,26 @@
+ """Abstract base class for citation discoverers."""
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from datetime import datetime
+
+ from citations_collector.models import CitationRecord, ItemRef
+
+
+ class AbstractDiscoverer(ABC):
+     """Base class for citation discovery APIs."""
+
+     @abstractmethod
+     def discover(self, item_ref: ItemRef, since: datetime | None = None) -> list[CitationRecord]:
+         """
+         Discover citations for a given item reference.
+
+         Args:
+             item_ref: The identifier to query (DOI, RRID, etc.)
+             since: Optional date filter for incremental updates
+
+         Returns:
+             List of discovered citation records
+         """
+         pass
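
Adding a new citation source means subclassing AbstractDiscoverer and implementing discover(). A hedged sketch of a hypothetical backend; CitationRecord is constructed here with only citation_doi, though the real model may require more fields:

from __future__ import annotations

from datetime import datetime

from citations_collector.discovery.base import AbstractDiscoverer
from citations_collector.models import CitationRecord, ItemRef


class MyIndexDiscoverer(AbstractDiscoverer):
    """Hypothetical discoverer for a custom citation index."""

    def discover(self, item_ref: ItemRef, since: datetime | None = None) -> list[CitationRecord]:
        # Handle only the reference types this backend understands.
        if item_ref.ref_type != "doi":
            return []
        # A real implementation would query its API here, honoring `since`
        # for incremental updates, and map each hit to a CitationRecord.
        return [CitationRecord(citation_doi="10.5555/example")]  # hypothetical hit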