citations-collector 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- citations_collector/__init__.py +18 -0
- citations_collector/_version.py +34 -0
- citations_collector/cli.py +525 -0
- citations_collector/core.py +503 -0
- citations_collector/discovery/__init__.py +17 -0
- citations_collector/discovery/base.py +26 -0
- citations_collector/discovery/crossref.py +210 -0
- citations_collector/discovery/datacite.py +260 -0
- citations_collector/discovery/openalex.py +252 -0
- citations_collector/discovery/opencitations.py +168 -0
- citations_collector/discovery/utils.py +62 -0
- citations_collector/importers/__init__.py +17 -0
- citations_collector/importers/bibtex.py +178 -0
- citations_collector/importers/dandi.py +314 -0
- citations_collector/importers/github.py +147 -0
- citations_collector/importers/zenodo.py +110 -0
- citations_collector/importers/zotero.py +262 -0
- citations_collector/merge_detection.py +216 -0
- citations_collector/models/__init__.py +44 -0
- citations_collector/models/generated.py +525 -0
- citations_collector/pdf.py +260 -0
- citations_collector/persistence/__init__.py +7 -0
- citations_collector/persistence/tsv_io.py +121 -0
- citations_collector/persistence/yaml_io.py +50 -0
- citations_collector/py.typed +0 -0
- citations_collector/unpaywall.py +60 -0
- citations_collector/zotero_sync.py +591 -0
- citations_collector-0.2.3.dist-info/METADATA +456 -0
- citations_collector-0.2.3.dist-info/RECORD +31 -0
- citations_collector-0.2.3.dist-info/WHEEL +4 -0
- citations_collector-0.2.3.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""citations-collector: Discover and curate scholarly citations of datasets and software."""

from __future__ import annotations

from citations_collector.core import CitationCollector
from citations_collector.models import CitationRecord, Collection

# Names re-exported as the package's public API.
__all__ = [
    "__version__",
    "CitationCollector",
    "CitationRecord",
    "Collection",
]

# Resolve the installed version from the setuptools-scm generated module;
# fall back to a placeholder when running from a source tree where the
# generated _version.py does not exist yet.
try:
    from citations_collector._version import version as __version__
except ImportError:
    __version__ = "0.0.0+unknown"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# TYPE_CHECKING is hard-coded to False so the typing imports below are never
# executed at runtime; static type checkers still evaluate the True branch.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    # At runtime the aliases degrade to plain `object` placeholders.
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '0.2.3'
__version_tuple__ = version_tuple = (0, 2, 3)

__commit_id__ = commit_id = None
|
|
@@ -0,0 +1,525 @@
|
|
|
1
|
+
"""Click-based CLI for citations-collector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
|
|
11
|
+
from citations_collector.core import CitationCollector
|
|
12
|
+
from citations_collector.importers.dandi import DANDIImporter
|
|
13
|
+
from citations_collector.importers.zotero import ZoteroImporter
|
|
14
|
+
from citations_collector.persistence import yaml_io
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@click.group()
@click.version_option()
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose logging")
def main(verbose: bool) -> None:
    """Discover and curate scholarly citations of datasets and software."""
    # Root CLI group: its only job is configuring logging for subcommands.
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@main.command()
@click.argument("collection", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--output",
    "-o",
    type=click.Path(path_type=Path),
    help="Output TSV file (overrides collection YAML output_tsv)",
)
@click.option(
    "--full-refresh",
    is_flag=True,
    help="Ignore incremental mode (discover all citations)",
)
@click.option(
    "--since",
    type=click.DateTime(formats=["%Y-%m-%d"]),
    help="Only discover citations since this date (YYYY-MM-DD). Overrides incremental mode.",
)
@click.option(
    "--email",
    envvar="CROSSREF_EMAIL",
    help="Email for CrossRef polite pool (overrides discover.email in YAML)",
)
@click.option(
    "--sources",
    multiple=True,
    type=click.Choice(["crossref", "opencitations", "datacite", "openalex"]),
    help="Which sources to query (overrides discover.sources in YAML)",
)
@click.option(
    "--expand-refs",
    is_flag=True,
    help="Expand non-DOI refs (zenodo_concept, github) to DOIs before discovery",
)
def discover(
    collection: Path,
    output: Path | None,
    full_refresh: bool,
    since: datetime | None,
    email: str | None,
    sources: tuple[str, ...],
    expand_refs: bool,
) -> None:
    """Discover citations for all items in COLLECTION."""
    click.echo(f"Loading collection from {collection}")

    collector = CitationCollector.from_yaml(collection)
    coll_cfg = collector.collection

    # Collections may declare a dynamic source (e.g. the DANDI API) that
    # supplies their items at run time instead of listing them in YAML.
    if coll_cfg.source and coll_cfg.source.type:
        click.echo(f"Populating items from {coll_cfg.source.type} source...")
        collector.populate_from_source()

    # Precedence for every setting: CLI flag > collection YAML > default.
    disc = coll_cfg.discover
    if not output:
        output = Path(coll_cfg.output_tsv) if coll_cfg.output_tsv else Path("citations.tsv")
    if not email and disc:
        email = disc.email
    if not sources and disc and disc.sources:
        sources = tuple(disc.sources)

    if expand_refs:
        click.echo("Expanding non-DOI references (zenodo_concept, github) to DOIs...")
        collector.expand_refs()

    # Seed the collector with any previously saved citations so the count of
    # newly discovered ones can be reported afterwards.
    prior_total = 0
    if output.exists():
        click.echo(f"Loading existing citations from {output}")
        collector.load_existing_citations(output)
        prior_total = len(collector.citations)

    click.echo(f"Discovering citations for {coll_cfg.name}...")
    if email:
        click.echo(f"Using CrossRef polite pool with email: {email}")

    collector.discover_all(
        sources=list(sources) if sources else None,
        incremental=not full_refresh,
        since_date=since,
        email=email,
    )

    added = len(collector.citations) - prior_total
    click.echo(f"Found {added} new citations ({len(collector.citations)} total)")

    collector.save(collection, output)
    click.echo(f"Saved to {output}")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@main.command("import-dandi")
@click.option(
    "--output",
    "-o",
    required=True,
    type=click.Path(path_type=Path),
    help="Output YAML file for collection",
)
@click.option(
    "--include-draft",
    is_flag=True,
    help="Include draft versions (no DOI)",
)
@click.option(
    "--limit",
    type=int,
    help="Limit number of dandisets to import (only for --all)",
)
@click.option(
    "--dandiset-id",
    "-d",
    "dandiset_ids",
    multiple=True,
    help="Import specific dandiset(s) by ID (e.g., -d 000402 -d 000003)",
)
@click.option(
    "--all",
    "import_all",
    is_flag=True,
    help="Import all dandisets (default if no --dandiset-id specified)",
)
def import_dandi(
    output: Path,
    include_draft: bool,
    limit: int | None,
    dandiset_ids: tuple[str, ...],
    import_all: bool,
) -> None:
    """Import dandisets from DANDI Archive.

    Examples:

        # Import specific dandisets
        citations-collector import-dandi -o microns.yaml -d 000402

        # Import multiple specific dandisets
        citations-collector import-dandi -o multi.yaml -d 000003 -d 000402

        # Import all dandisets (with limit)
        citations-collector import-dandi -o all.yaml --all --limit 10
    """
    importer = DANDIImporter()

    # Determine what to import: explicit IDs win over --all.
    if dandiset_ids:
        # Import specific dandisets
        click.echo(f"Importing {len(dandiset_ids)} specific dandiset(s)...")

        with click.progressbar(length=len(dandiset_ids), label="Importing") as bar:  # type: ignore[var-annotated]

            # Callback invoked by the importer after each dandiset; the
            # arguments are ignored since the bar advances one step per call.
            def progress(current: int, total: int | None) -> None:
                bar.update(1)

            collection = importer.import_specific(
                dandiset_ids=list(dandiset_ids),
                include_draft=include_draft,
                progress_callback=progress,
            )
    else:
        # Import all dandisets. When --all was not given explicitly, tell the
        # user this is the fallback behavior.
        if not import_all:
            click.echo(
                "No --dandiset-id specified, importing all dandisets "
                "(use --all to suppress this message)"
            )
        else:
            click.echo("Importing all dandisets from DANDI Archive...")

        # Without --limit the total count is unknown, so the bar gets
        # length 0 and no progress callback is passed to the importer.
        with click.progressbar(length=limit or 0, label="Importing") as bar:  # type: ignore[var-annotated]

            def progress(current: int, total: int | None) -> None:
                bar.update(1)

            collection = importer.import_all(
                include_draft=include_draft,
                limit=limit,
                progress_callback=progress if limit else None,
            )

    yaml_io.save_collection(collection, output)
    click.echo(f"Imported {len(collection.items or [])} dandisets to {output}")
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@main.command("import-zotero")
@click.option(
    "--output",
    "-o",
    required=True,
    type=click.Path(path_type=Path),
    help="Output YAML file for collection",
)
@click.option(
    "--group-id",
    required=True,
    type=int,
    help="Zotero group ID",
)
@click.option(
    "--collection-key",
    help="Specific collection within group",
)
@click.option(
    "--api-key",
    envvar="ZOTERO_API_KEY",
    help="Zotero API key (optional for public groups)",
)
@click.option(
    "--limit",
    type=int,
    help="Limit number of items to import",
)
def import_zotero(
    output: Path,
    group_id: int,
    collection_key: str | None,
    api_key: str | None,
    limit: int | None,
) -> None:
    """Import items from a Zotero group."""
    click.echo(f"Importing items from Zotero group {group_id}...")

    # Build the importer and fetch the group's items in one chained call;
    # the API key is optional for publicly readable groups.
    imported = ZoteroImporter(api_key=api_key).import_group(
        group_id=group_id,
        collection_key=collection_key,
        limit=limit,
    )

    yaml_io.save_collection(imported, output)
    click.echo(f"Imported {len(imported.items or [])} items to {output}")
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
@main.command("sync-zotero")
@click.argument("collection", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--tsv",
    type=click.Path(exists=True, path_type=Path),
    help="Citations TSV file (overrides collection YAML output_tsv)",
)
@click.option(
    "--api-key",
    envvar="ZOTERO_API_KEY",
    help="Zotero API key",
)
@click.option(
    "--group-id",
    type=int,
    help="Zotero group ID (overrides zotero.group_id in YAML)",
)
@click.option(
    "--collection-key",
    help="Zotero collection key (overrides zotero.collection_key in YAML)",
)
@click.option("--dry-run", is_flag=True, help="Show what would be synced without writing")
def sync_zotero(
    collection: Path,
    tsv: Path | None,
    api_key: str | None,
    group_id: int | None,
    collection_key: str | None,
    dry_run: bool,
) -> None:
    """Sync citations to Zotero as hierarchical collections."""
    from citations_collector.persistence import tsv_io
    from citations_collector.zotero_sync import ZoteroSyncer

    coll = CitationCollector.from_yaml(collection).collection
    zcfg = coll.zotero

    # Resolve settings: CLI flags win over the collection YAML.
    tsv_path = tsv or (Path(coll.output_tsv) if coll.output_tsv else Path("citations.tsv"))
    if not group_id and zcfg:
        group_id = zcfg.group_id
    if not collection_key and zcfg:
        collection_key = zcfg.collection_key

    # Fail fast on anything still missing after resolution.
    if not api_key:
        raise click.UsageError("Zotero API key required (--api-key or ZOTERO_API_KEY)")
    if not group_id:
        raise click.UsageError("Zotero group ID required (--group-id or zotero.group_id in YAML)")
    if not collection_key:
        raise click.UsageError(
            "Zotero collection key required (--collection-key or zotero.collection_key in YAML)"
        )

    citations = tsv_io.load_citations(tsv_path)
    click.echo(f"Loaded {len(citations)} citations from {tsv_path}")

    report = ZoteroSyncer(
        api_key=api_key, group_id=group_id, collection_key=collection_key
    ).sync(coll, citations, dry_run=dry_run)

    # Emit the sync summary; each line is prefixed in dry-run mode.
    prefix = "[DRY RUN] " if dry_run else ""
    click.echo(f"{prefix}Collections created: {report.collections_created}")
    click.echo(f"{prefix}Items created: {report.items_created}")
    click.echo(f"{prefix}Items skipped: {report.items_skipped}")
    click.echo(f"{prefix}Attachments created: {report.attachments_created}")
    if report.errors:
        click.echo(f"Errors: {len(report.errors)}")
        # Only the first ten errors are shown to keep output manageable.
        for err in report.errors[:10]:
            click.echo(f"  {err}")
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
@main.command("fetch-pdfs")
@click.argument("collection", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--tsv",
    type=click.Path(path_type=Path),
    help="Citations TSV file (overrides collection YAML output_tsv)",
)
@click.option(
    "--output-dir",
    type=click.Path(path_type=Path),
    help="PDF output directory (overrides pdfs.output_dir in YAML)",
)
@click.option(
    "--email",
    envvar="UNPAYWALL_EMAIL",
    help="Email for Unpaywall API (overrides pdfs.unpaywall_email in YAML)",
)
@click.option("--git-annex/--no-git-annex", default=None, help="Use git-annex for PDFs")
@click.option("--dry-run", is_flag=True, help="Report OA status without downloading")
def fetch_pdfs(
    collection: Path,
    tsv: Path | None,
    output_dir: Path | None,
    email: str | None,
    git_annex: bool | None,
    dry_run: bool,
) -> None:
    """Fetch open-access PDFs for citations in COLLECTION."""
    from citations_collector.pdf import PDFAcquirer
    from citations_collector.persistence import tsv_io

    coll = CitationCollector.from_yaml(collection).collection
    pcfg = coll.pdfs

    # CLI flags beat YAML settings, which beat the built-in defaults.
    tsv_path = tsv or (Path(coll.output_tsv) if coll.output_tsv else Path("citations.tsv"))
    if not output_dir:
        output_dir = Path((pcfg.output_dir if pcfg else None) or "pdfs/")

    # PDFAcquirer needs non-None values, so resolve with hard fallbacks here.
    email_resolved: str = (
        email
        or (pcfg.unpaywall_email if pcfg else None)
        or "site-unpaywall@oneukrainian.com"
    )
    # --git-annex is tri-state: an explicit CLI choice wins, otherwise the
    # YAML value applies, and finally the default is False.
    if git_annex is None:
        git_annex_resolved: bool = (pcfg.git_annex if pcfg else None) or False
    else:
        git_annex_resolved = git_annex

    citations = tsv_io.load_citations(tsv_path)
    click.echo(f"Loaded {len(citations)} citations from {tsv_path}")

    counts = PDFAcquirer(
        output_dir=output_dir, email=email_resolved, git_annex=git_annex_resolved
    ).acquire_all(citations, dry_run=dry_run)

    # Report acquisition outcomes; each line is prefixed in dry-run mode.
    prefix = "[DRY RUN] " if dry_run else ""
    click.echo(f"{prefix}Downloaded: {counts['downloaded']}")
    click.echo(f"{prefix}Skipped (existing): {counts['skipped']}")
    click.echo(f"{prefix}No OA available: {counts['no_oa']}")
    click.echo(f"{prefix}No DOI: {counts['no_doi']}")
    if counts["error"]:
        click.echo(f"Errors: {counts['error']}")

    # Dry runs never touch the TSV; real runs persist updated PDF metadata.
    if not dry_run:
        tsv_io.save_citations(citations, tsv_path)
        click.echo(f"Updated {tsv_path}")
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
@main.command(name="detect-merges")
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True, path_type=Path),
    help="Path to collection YAML config",
)
@click.option(
    "--tsv",
    type=click.Path(exists=True, path_type=Path),
    help="Override: path to TSV file (default: from config)",
)
@click.option(
    "--email",
    help="Email for CrossRef API (default: from config or fallback)",
)
@click.option(
    "--fuzzy-match",
    is_flag=True,
    help="Also perform fuzzy title matching (heuristic, use with caution)",
)
@click.option(
    "--dry-run",
    is_flag=True,
    help="Show what would be marked without saving",
)
def detect_merges(
    config: Path | None,
    tsv: Path | None,
    email: str | None,
    fuzzy_match: bool,
    dry_run: bool,
) -> None:
    """Detect and mark preprints that have published versions.

    Uses CrossRef API to find "is-preprint-of" relationships and marks
    preprints as merged with citation_status=merged.
    """
    from citations_collector.core import CitationCollector
    from citations_collector.merge_detection import MergeDetector
    from citations_collector.persistence import tsv_io

    # Load config if provided; it supplies the TSV path default and the
    # unpaywall email used as an API-contact fallback below.
    pdfs_cfg = None
    if config:
        collector = CitationCollector.from_yaml(config)
        cfg = collector.collection
        pdfs_cfg = cfg.pdfs
        # --tsv still overrides the config's output_tsv when both are given.
        if not tsv:
            tsv_path = Path(cfg.output_tsv) if cfg.output_tsv else Path("citations.tsv")
        else:
            tsv_path = tsv
    elif tsv:
        tsv_path = tsv
    else:
        raise click.UsageError("Must provide either --config or --tsv")

    # Resolve email: CLI flag > config's unpaywall email > hard-coded fallback.
    email_resolved: str = (
        email
        or (pdfs_cfg.unpaywall_email if pdfs_cfg else None)
        or "site-unpaywall@oneukrainian.com"
    )

    citations = tsv_io.load_citations(tsv_path)
    click.echo(f"Loaded {len(citations)} citations from {tsv_path}")

    detector = MergeDetector(email=email_resolved)

    # Detect merges via CrossRef "is-preprint-of" relationships.
    merged_pairs = detector.detect_merged_pairs(citations)
    click.echo(f"Found {len(merged_pairs)} merged pairs via CrossRef API")

    # Optionally add fuzzy matching
    if fuzzy_match:
        click.echo("Running fuzzy title matching...")
        fuzzy_pairs = detector.fuzzy_match_by_title(citations)
        click.echo(f"Found {len(fuzzy_pairs)} potential pairs via fuzzy matching")

        # Show fuzzy matches for manual review
        if fuzzy_pairs:
            click.echo("\nFuzzy matches (review before accepting):")
            for preprint_doi, pub_doi in fuzzy_pairs.items():
                click.echo(f"  {preprint_doi} -> {pub_doi}")

            # Don't auto-merge fuzzy matches - require manual review;
            # only the CrossRef-confirmed pairs are applied below.
            click.echo(
                "\nFuzzy matches not applied automatically. "
                "Review and add to CrossRef metadata if valid."
            )

    # Mark merged citations (CrossRef-confirmed pairs only).
    if merged_pairs:
        prefix = "[DRY RUN] " if dry_run else ""
        marked = detector.mark_merged_citations(citations, merged_pairs)
        click.echo(f"{prefix}Marked {marked} citations as merged")

        # Dry runs report the would-be changes without rewriting the TSV.
        if not dry_run:
            tsv_io.save_citations(citations, tsv_path)
            click.echo(f"Updated {tsv_path}")
    else:
        click.echo("No merges detected")
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
# Allow running the CLI module directly (e.g. `python -m citations_collector.cli`).
if __name__ == "__main__":
    main()
|