citations-collector 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ """citations-collector: Discover and curate scholarly citations of datasets and software."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from citations_collector.core import CitationCollector
6
+ from citations_collector.models import CitationRecord, Collection
7
+
8
+ __all__ = [  # names this package declares as its public API
9
+ "__version__",
10
+ "CitationCollector",
11
+ "CitationRecord",
12
+ "Collection",
13
+ ]
14
+
15
+ try:  # take the version string from the _version module when it can be imported
16
+ from citations_collector._version import version as __version__
17
+ except ImportError:  # _version is not importable: use a placeholder version instead
18
+ __version__ = "0.0.0+unknown"
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = [  # every alias of the version, version tuple and commit id defined below
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
12
+
13
+ TYPE_CHECKING = False  # False at runtime, so the typing imports below never execute
14
+ if TYPE_CHECKING:  # NOTE(review): presumably static type checkers treat this branch as taken - confirm
15
+ from typing import Tuple
16
+ from typing import Union
17
+
18
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
20
+ else:  # runtime branch: plain `object` stands in for the two alias types
21
+ VERSION_TUPLE = object
22
+ COMMIT_ID = object
23
+
24
+ version: str  # annotation-only declarations; the values are assigned further down
25
+ __version__: str
26
+ __version_tuple__: VERSION_TUPLE
27
+ version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
30
+
31
+ __version__ = version = '0.2.3'  # one string bound to both names
32
+ __version_tuple__ = version_tuple = (0, 2, 3)  # the same version as a tuple of components
33
+
34
+ __commit_id__ = commit_id = None  # no commit id is recorded in this file
@@ -0,0 +1,525 @@
1
+ """Click-based CLI for citations-collector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+
9
+ import click
10
+
11
+ from citations_collector.core import CitationCollector
12
+ from citations_collector.importers.dandi import DANDIImporter
13
+ from citations_collector.importers.zotero import ZoteroImporter
14
+ from citations_collector.persistence import yaml_io
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@click.group()
@click.version_option()
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose logging")
def main(verbose: bool) -> None:
    """Discover and curate scholarly citations of datasets and software."""
    # The docstring above doubles as the group's --help text, so it is kept as is.
    # --verbose switches the root logger from INFO to DEBUG.
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")
28
+
29
+
30
@main.command()
@click.argument("collection", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--output",
    "-o",
    type=click.Path(path_type=Path),
    help="Output TSV file (overrides collection YAML output_tsv)",
)
@click.option(
    "--full-refresh",
    is_flag=True,
    help="Ignore incremental mode (discover all citations)",
)
@click.option(
    "--since",
    type=click.DateTime(formats=["%Y-%m-%d"]),
    help="Only discover citations since this date (YYYY-MM-DD). Overrides incremental mode.",
)
@click.option(
    "--email",
    envvar="CROSSREF_EMAIL",
    help="Email for CrossRef polite pool (overrides discover.email in YAML)",
)
@click.option(
    "--sources",
    multiple=True,
    type=click.Choice(["crossref", "opencitations", "datacite", "openalex"]),
    help="Which sources to query (overrides discover.sources in YAML)",
)
@click.option(
    "--expand-refs",
    is_flag=True,
    help="Expand non-DOI refs (zenodo_concept, github) to DOIs before discovery",
)
def discover(
    collection: Path,
    output: Path | None,
    full_refresh: bool,
    since: datetime | None,
    email: str | None,
    sources: tuple[str, ...],
    expand_refs: bool,
) -> None:
    """Discover citations for all items in COLLECTION."""
    # The docstring above is the command's --help text and is kept verbatim.
    click.echo(f"Loading collection from {collection}")

    collector = CitationCollector.from_yaml(collection)
    cfg = collector.collection

    # A collection may declare a source (e.g. the DANDI API) that supplies its items.
    if cfg.source and cfg.source.type:
        click.echo(f"Populating items from {cfg.source.type} source...")
        collector.populate_from_source()

    # Precedence for every setting: CLI value, then YAML value, then built-in default.
    discover_cfg = cfg.discover
    output = output or Path(cfg.output_tsv or "citations.tsv")
    if discover_cfg:
        if not email:
            email = discover_cfg.email
        if not sources and discover_cfg.sources:
            sources = tuple(discover_cfg.sources)

    if expand_refs:
        click.echo("Expanding non-DOI references (zenodo_concept, github) to DOIs...")
        collector.expand_refs()

    # Count what is already on disk so only the newly found citations are reported.
    existing_count = 0
    if output.exists():
        click.echo(f"Loading existing citations from {output}")
        collector.load_existing_citations(output)
        existing_count = len(collector.citations)

    click.echo(f"Discovering citations for {cfg.name}...")
    if email:
        click.echo(f"Using CrossRef polite pool with email: {email}")

    collector.discover_all(
        # An empty selection is passed as None rather than as an empty list.
        sources=list(sources) or None,
        incremental=not full_refresh,
        since_date=since,
        email=email,
    )

    total_count = len(collector.citations)
    click.echo(f"Found {total_count - existing_count} new citations ({total_count} total)")

    collector.save(collection, output)
    click.echo(f"Saved to {output}")
127
+
128
+
129
@main.command("import-dandi")
@click.option(
    "--output",
    "-o",
    required=True,
    type=click.Path(path_type=Path),
    help="Output YAML file for collection",
)
@click.option(
    "--include-draft",
    is_flag=True,
    help="Include draft versions (no DOI)",
)
@click.option(
    "--limit",
    type=int,
    help="Limit number of dandisets to import (only for --all)",
)
@click.option(
    "--dandiset-id",
    "-d",
    "dandiset_ids",
    multiple=True,
    help="Import specific dandiset(s) by ID (e.g., -d 000402 -d 000003)",
)
@click.option(
    "--all",
    "import_all",
    is_flag=True,
    help="Import all dandisets (default if no --dandiset-id specified)",
)
def import_dandi(
    output: Path,
    include_draft: bool,
    limit: int | None,
    dandiset_ids: tuple[str, ...],
    import_all: bool,
) -> None:
    """Import dandisets from DANDI Archive.

    Examples:

        # Import specific dandisets
        citations-collector import-dandi -o microns.yaml -d 000402

        # Import multiple specific dandisets
        citations-collector import-dandi -o multi.yaml -d 000003 -d 000402

        # Import all dandisets (with limit)
        citations-collector import-dandi -o all.yaml --all --limit 10
    """
    # The docstring above is the command's --help text, so parameters are described here:
    #   output        -- YAML file the imported collection is written to
    #   include_draft -- forwarded to the importer as include_draft
    #   limit         -- cap on the number of dandisets; only used when importing all
    #   dandiset_ids  -- explicit dandiset IDs; when non-empty only these are imported
    #   import_all    -- explicit --all flag; only changes which message is printed
    importer = DANDIImporter()

    if dandiset_ids:
        if limit is not None:
            # Previously the limit was dropped without any feedback in this mode.
            click.echo("Warning: --limit is ignored when --dandiset-id is given", err=True)

        click.echo(f"Importing {len(dandiset_ids)} specific dandiset(s)...")

        with click.progressbar(length=len(dandiset_ids), label="Importing") as bar:  # type: ignore[var-annotated]

            def progress(current: int, total: int | None) -> None:
                bar.update(1)

            collection = importer.import_specific(
                dandiset_ids=list(dandiset_ids),
                include_draft=include_draft,
                progress_callback=progress,
            )
    else:
        if not import_all:
            click.echo(
                "No --dandiset-id specified, importing all dandisets "
                "(use --all to suppress this message)"
            )
        else:
            click.echo("Importing all dandisets from DANDI Archive...")

        if limit:
            # The total is known, so a progress bar is meaningful.
            with click.progressbar(length=limit, label="Importing") as bar:  # type: ignore[var-annotated]

                def progress(current: int, total: int | None) -> None:
                    bar.update(1)

                collection = importer.import_all(
                    include_draft=include_draft,
                    limit=limit,
                    progress_callback=progress,
                )
        else:
            # Without a limit the total is unknown. The old code still opened a
            # zero-length bar that never received an update and so sat at 0% for
            # the whole import; skip the bar instead.
            collection = importer.import_all(
                include_draft=include_draft,
                limit=limit,
                progress_callback=None,
            )

    yaml_io.save_collection(collection, output)
    click.echo(f"Imported {len(collection.items or [])} dandisets to {output}")
220
+
221
+
222
@main.command("import-zotero")
@click.option(
    "--output",
    "-o",
    required=True,
    type=click.Path(path_type=Path),
    help="Output YAML file for collection",
)
@click.option(
    "--group-id",
    required=True,
    type=int,
    help="Zotero group ID",
)
@click.option(
    "--collection-key",
    help="Specific collection within group",
)
@click.option(
    "--api-key",
    envvar="ZOTERO_API_KEY",
    help="Zotero API key (optional for public groups)",
)
@click.option(
    "--limit",
    type=int,
    help="Limit number of items to import",
)
def import_zotero(
    output: Path,
    group_id: int,
    collection_key: str | None,
    api_key: str | None,
    limit: int | None,
) -> None:
    """Import items from a Zotero group."""
    # The docstring above is the command's --help text and is kept verbatim.
    click.echo(f"Importing items from Zotero group {group_id}...")

    # Fetch the group (optionally narrowed to one collection) and write it out as YAML.
    imported = ZoteroImporter(api_key=api_key).import_group(
        group_id=group_id,
        collection_key=collection_key,
        limit=limit,
    )
    yaml_io.save_collection(imported, output)

    item_count = len(imported.items or [])
    click.echo(f"Imported {item_count} items to {output}")
269
+
270
+
271
@main.command("sync-zotero")
@click.argument("collection", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--tsv",
    type=click.Path(exists=True, path_type=Path),
    help="Citations TSV file (overrides collection YAML output_tsv)",
)
@click.option(
    "--api-key",
    envvar="ZOTERO_API_KEY",
    help="Zotero API key",
)
@click.option(
    "--group-id",
    type=int,
    help="Zotero group ID (overrides zotero.group_id in YAML)",
)
@click.option(
    "--collection-key",
    help="Zotero collection key (overrides zotero.collection_key in YAML)",
)
@click.option("--dry-run", is_flag=True, help="Show what would be synced without writing")
def sync_zotero(
    collection: Path,
    tsv: Path | None,
    api_key: str | None,
    group_id: int | None,
    collection_key: str | None,
    dry_run: bool,
) -> None:
    """Sync citations to Zotero as hierarchical collections."""
    # The docstring above is the command's --help text and is kept verbatim.
    from citations_collector.persistence import tsv_io
    from citations_collector.zotero_sync import ZoteroSyncer

    cfg = CitationCollector.from_yaml(collection).collection
    zotero_cfg = cfg.zotero

    # CLI values win; the YAML config fills in whatever was not given.
    tsv_path = tsv if tsv else Path(cfg.output_tsv or "citations.tsv")
    if zotero_cfg:
        if not group_id:
            group_id = zotero_cfg.group_id
        if not collection_key:
            collection_key = zotero_cfg.collection_key

    # All three settings are mandatory; report the first one that is missing.
    if not api_key:
        raise click.UsageError("Zotero API key required (--api-key or ZOTERO_API_KEY)")
    if not group_id:
        raise click.UsageError("Zotero group ID required (--group-id or zotero.group_id in YAML)")
    if not collection_key:
        raise click.UsageError(
            "Zotero collection key required (--collection-key or zotero.collection_key in YAML)"
        )

    citations = tsv_io.load_citations(tsv_path)
    click.echo(f"Loaded {len(citations)} citations from {tsv_path}")

    syncer = ZoteroSyncer(api_key=api_key, group_id=group_id, collection_key=collection_key)
    report = syncer.sync(cfg, citations, dry_run=dry_run)

    prefix = "[DRY RUN] " if dry_run else ""

    def report_line(label: str, value: object) -> None:
        click.echo(f"{prefix}{label}: {value}")

    report_line("Collections created", report.collections_created)
    report_line("Items created", report.items_created)
    report_line("Items skipped", report.items_skipped)
    report_line("Attachments created", report.attachments_created)

    if report.errors:
        click.echo(f"Errors: {len(report.errors)}")
        # Only the first ten errors are listed.
        for error in report.errors[:10]:
            click.echo(f"  {error}")
342
+
343
+
344
@main.command("fetch-pdfs")
@click.argument("collection", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--tsv",
    type=click.Path(path_type=Path),
    help="Citations TSV file (overrides collection YAML output_tsv)",
)
@click.option(
    "--output-dir",
    type=click.Path(path_type=Path),
    help="PDF output directory (overrides pdfs.output_dir in YAML)",
)
@click.option(
    "--email",
    envvar="UNPAYWALL_EMAIL",
    help="Email for Unpaywall API (overrides pdfs.unpaywall_email in YAML)",
)
@click.option("--git-annex/--no-git-annex", default=None, help="Use git-annex for PDFs")
@click.option("--dry-run", is_flag=True, help="Report OA status without downloading")
def fetch_pdfs(
    collection: Path,
    tsv: Path | None,
    output_dir: Path | None,
    email: str | None,
    git_annex: bool | None,
    dry_run: bool,
) -> None:
    """Fetch open-access PDFs for citations in COLLECTION."""
    # The docstring above is the command's --help text and is kept verbatim.
    from citations_collector.pdf import PDFAcquirer
    from citations_collector.persistence import tsv_io

    cfg = CitationCollector.from_yaml(collection).collection
    pdfs_cfg = cfg.pdfs

    # CLI values win; the YAML config and then built-in defaults fill the gaps.
    tsv_path = tsv if tsv else Path(cfg.output_tsv or "citations.tsv")
    if not output_dir:
        configured_dir = pdfs_cfg.output_dir if pdfs_cfg else None
        output_dir = Path(configured_dir or "pdfs/")

    configured_email = pdfs_cfg.unpaywall_email if pdfs_cfg else None
    email_resolved: str = email or configured_email or "site-unpaywall@oneukrainian.com"

    # The flag is tri-state: None means "not given", so the config decides.
    git_annex_resolved: bool
    if git_annex is not None:
        git_annex_resolved = git_annex
    else:
        git_annex_resolved = (pdfs_cfg.git_annex or False) if pdfs_cfg else False

    citations = tsv_io.load_citations(tsv_path)
    click.echo(f"Loaded {len(citations)} citations from {tsv_path}")

    acquirer = PDFAcquirer(
        output_dir=output_dir, email=email_resolved, git_annex=git_annex_resolved
    )
    counts = acquirer.acquire_all(citations, dry_run=dry_run)

    prefix = "[DRY RUN] " if dry_run else ""

    def report_line(label: str, key: str) -> None:
        click.echo(f"{prefix}{label}: {counts[key]}")

    report_line("Downloaded", "downloaded")
    report_line("Skipped (existing)", "skipped")
    report_line("No OA available", "no_oa")
    report_line("No DOI", "no_doi")
    if counts["error"]:
        click.echo(f"Errors: {counts['error']}")

    # The TSV is rewritten only on a real run.
    if not dry_run:
        tsv_io.save_citations(citations, tsv_path)
        click.echo(f"Updated {tsv_path}")
418
+
419
+
420
@main.command(name="detect-merges")
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True, path_type=Path),
    help="Path to collection YAML config",
)
@click.option(
    "--tsv",
    type=click.Path(exists=True, path_type=Path),
    help="Override: path to TSV file (default: from config)",
)
@click.option(
    "--email",
    help="Email for CrossRef API (default: from config or fallback)",
)
@click.option(
    "--fuzzy-match",
    is_flag=True,
    help="Also perform fuzzy title matching (heuristic, use with caution)",
)
@click.option(
    "--dry-run",
    is_flag=True,
    help="Show what would be marked without saving",
)
def detect_merges(
    config: Path | None,
    tsv: Path | None,
    email: str | None,
    fuzzy_match: bool,
    dry_run: bool,
) -> None:
    """Detect and mark preprints that have published versions.

    Uses CrossRef API to find "is-preprint-of" relationships and marks
    preprints as merged with citation_status=merged.
    """
    # The docstring above is the command's --help text, so parameters are described here:
    #   config      -- collection YAML; supplies the TSV path and the email defaults
    #   tsv         -- citations TSV; overrides the path taken from the config
    #   email       -- contact email handed to MergeDetector
    #   fuzzy_match -- additionally list title-based candidate pairs (never applied)
    #   dry_run     -- report what would be marked without rewriting the TSV
    # Raises click.UsageError when neither --config nor --tsv is given.
    from citations_collector.core import CitationCollector
    from citations_collector.merge_detection import MergeDetector
    from citations_collector.persistence import tsv_io

    # Load config if provided
    discover_cfg = None
    pdfs_cfg = None
    if config:
        collector = CitationCollector.from_yaml(config)
        cfg = collector.collection
        discover_cfg = cfg.discover
        pdfs_cfg = cfg.pdfs
        if not tsv:
            tsv_path = Path(cfg.output_tsv) if cfg.output_tsv else Path("citations.tsv")
        else:
            tsv_path = tsv
    elif tsv:
        tsv_path = tsv
    else:
        raise click.UsageError("Must provide either --config or --tsv")

    # Resolve the CrossRef contact email. discover.email is the setting the
    # `discover` command documents as the CrossRef polite-pool email, so it is
    # consulted before the Unpaywall email that was the only config fallback.
    email_resolved: str = (
        email
        or (discover_cfg.email if discover_cfg else None)
        or (pdfs_cfg.unpaywall_email if pdfs_cfg else None)
        or "site-unpaywall@oneukrainian.com"
    )

    citations = tsv_io.load_citations(tsv_path)
    click.echo(f"Loaded {len(citations)} citations from {tsv_path}")

    detector = MergeDetector(email=email_resolved)

    # Detect merges via CrossRef relationships
    merged_pairs = detector.detect_merged_pairs(citations)
    click.echo(f"Found {len(merged_pairs)} merged pairs via CrossRef API")

    # Optionally add fuzzy matching
    if fuzzy_match:
        click.echo("Running fuzzy title matching...")
        fuzzy_pairs = detector.fuzzy_match_by_title(citations)
        click.echo(f"Found {len(fuzzy_pairs)} potential pairs via fuzzy matching")

        # Show fuzzy matches for manual review
        if fuzzy_pairs:
            click.echo("\nFuzzy matches (review before accepting):")
            for preprint_doi, pub_doi in fuzzy_pairs.items():
                click.echo(f"  {preprint_doi} -> {pub_doi}")

            # Don't auto-merge fuzzy matches - require manual review
            click.echo(
                "\nFuzzy matches not applied automatically. "
                "Review and add to CrossRef metadata if valid."
            )

    # Mark merged citations
    if merged_pairs:
        prefix = "[DRY RUN] " if dry_run else ""
        # Marking happens on the loaded citations even in a dry run; only the
        # write back to the TSV is skipped.
        marked = detector.mark_merged_citations(citations, merged_pairs)
        click.echo(f"{prefix}Marked {marked} citations as merged")

        if not dry_run:
            tsv_io.save_citations(citations, tsv_path)
            click.echo(f"Updated {tsv_path}")
    else:
        click.echo("No merges detected")
522
+
523
+
524
+ if __name__ == "__main__":  # invoke the CLI group when this file is run as a script
525
+ main()