citations-collector 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- citations_collector-0.2.1/.gitignore +4 -0
- citations_collector-0.2.1/PKG-INFO +354 -0
- citations_collector-0.2.1/README.md +293 -0
- citations_collector-0.2.1/examples/citations-example.tsv +7 -0
- citations_collector-0.2.1/examples/dandi-sample-collection.yaml +64 -0
- citations_collector-0.2.1/examples/github-ci-workflow.yaml +118 -0
- citations_collector-0.2.1/examples/microns-citations.tsv.bak +4 -0
- citations_collector-0.2.1/examples/microns-collection.yaml +50 -0
- citations_collector-0.2.1/examples/repronim-tools.yaml +122 -0
- citations_collector-0.2.1/examples/simple-resources.yaml +30 -0
- citations_collector-0.2.1/examples/studyforrest-preseeded.tsv +24 -0
- citations_collector-0.2.1/examples/studyforrest.yaml +33 -0
- citations_collector-0.2.1/pyproject.toml +129 -0
- citations_collector-0.2.1/schema/citations.schema.json +637 -0
- citations_collector-0.2.1/schema/citations.yaml +518 -0
- citations_collector-0.2.1/src/citations_collector/__init__.py +18 -0
- citations_collector-0.2.1/src/citations_collector/_version.py +34 -0
- citations_collector-0.2.1/src/citations_collector/cli.py +525 -0
- citations_collector-0.2.1/src/citations_collector/core.py +336 -0
- citations_collector-0.2.1/src/citations_collector/discovery/__init__.py +17 -0
- citations_collector-0.2.1/src/citations_collector/discovery/base.py +26 -0
- citations_collector-0.2.1/src/citations_collector/discovery/crossref.py +196 -0
- citations_collector-0.2.1/src/citations_collector/discovery/datacite.py +260 -0
- citations_collector-0.2.1/src/citations_collector/discovery/openalex.py +253 -0
- citations_collector-0.2.1/src/citations_collector/discovery/opencitations.py +168 -0
- citations_collector-0.2.1/src/citations_collector/discovery/utils.py +40 -0
- citations_collector-0.2.1/src/citations_collector/importers/__init__.py +15 -0
- citations_collector-0.2.1/src/citations_collector/importers/dandi.py +314 -0
- citations_collector-0.2.1/src/citations_collector/importers/github.py +147 -0
- citations_collector-0.2.1/src/citations_collector/importers/zenodo.py +110 -0
- citations_collector-0.2.1/src/citations_collector/importers/zotero.py +262 -0
- citations_collector-0.2.1/src/citations_collector/merge_detection.py +216 -0
- citations_collector-0.2.1/src/citations_collector/models/__init__.py +44 -0
- citations_collector-0.2.1/src/citations_collector/models/generated.py +544 -0
- citations_collector-0.2.1/src/citations_collector/pdf.py +152 -0
- citations_collector-0.2.1/src/citations_collector/persistence/__init__.py +7 -0
- citations_collector-0.2.1/src/citations_collector/persistence/tsv_io.py +94 -0
- citations_collector-0.2.1/src/citations_collector/persistence/yaml_io.py +50 -0
- citations_collector-0.2.1/src/citations_collector/py.typed +0 -0
- citations_collector-0.2.1/src/citations_collector/unpaywall.py +60 -0
- citations_collector-0.2.1/src/citations_collector/zotero_sync.py +591 -0
- citations_collector-0.2.1/tests/INTEGRATION_TESTS.md +284 -0
- citations_collector-0.2.1/tests/__init__.py +1 -0
- citations_collector-0.2.1/tests/conftest.py +31 -0
- citations_collector-0.2.1/tests/fixtures/collections/repronim-tools.yaml +131 -0
- citations_collector-0.2.1/tests/fixtures/collections/simple.yaml +10 -0
- citations_collector-0.2.1/tests/fixtures/responses/crossref_empty.json +8 -0
- citations_collector-0.2.1/tests/fixtures/responses/crossref_success.json +27 -0
- citations_collector-0.2.1/tests/fixtures/responses/datacite_empty.json +6 -0
- citations_collector-0.2.1/tests/fixtures/responses/datacite_success.json +35 -0
- citations_collector-0.2.1/tests/fixtures/responses/opencitations_success.json +8 -0
- citations_collector-0.2.1/tests/fixtures/tsv/citations-example.tsv +7 -0
- citations_collector-0.2.1/tests/fixtures/tsv/simple.tsv +2 -0
- citations_collector-0.2.1/tests/test_cli.py +157 -0
- citations_collector-0.2.1/tests/test_core.py +151 -0
- citations_collector-0.2.1/tests/test_discovery.py +328 -0
- citations_collector-0.2.1/tests/test_discovery_openalex.py +334 -0
- citations_collector-0.2.1/tests/test_importers.py +202 -0
- citations_collector-0.2.1/tests/test_importers_dandi.py +391 -0
- citations_collector-0.2.1/tests/test_importers_zotero.py +370 -0
- citations_collector-0.2.1/tests/test_integration.py +316 -0
- citations_collector-0.2.1/tests/test_merge_detection.py +309 -0
- citations_collector-0.2.1/tests/test_multi_source_validation.py +108 -0
- citations_collector-0.2.1/tests/test_pdf.py +166 -0
- citations_collector-0.2.1/tests/test_persistence.py +165 -0
- citations_collector-0.2.1/tests/test_unpaywall.py +99 -0
- citations_collector-0.2.1/tests/test_zotero_sync.py +396 -0
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: citations-collector
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Discover and curate scholarly citations of datasets and software
|
|
5
|
+
Project-URL: Homepage, https://github.com/dandi/citations-collector
|
|
6
|
+
Project-URL: Repository, https://github.com/dandi/citations-collector
|
|
7
|
+
Project-URL: Issues, https://github.com/dandi/citations-collector/issues
|
|
8
|
+
Author-email: DANDI <team@dandiarchive.org>
|
|
9
|
+
License: MIT
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: click>=8.0
|
|
20
|
+
Requires-Dist: pydantic>=2.0
|
|
21
|
+
Requires-Dist: pyyaml>=6.0
|
|
22
|
+
Requires-Dist: pyzotero>=1.5
|
|
23
|
+
Requires-Dist: rapidfuzz>=3.0
|
|
24
|
+
Requires-Dist: requests>=2.28
|
|
25
|
+
Provides-Extra: ci
|
|
26
|
+
Requires-Dist: mypy>=1.0; extra == 'ci'
|
|
27
|
+
Requires-Dist: pre-commit>=3.0; extra == 'ci'
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'ci'
|
|
29
|
+
Requires-Dist: pytest-timeout>=2.0; extra == 'ci'
|
|
30
|
+
Requires-Dist: pytest>=7.0; extra == 'ci'
|
|
31
|
+
Requires-Dist: responses>=0.23; extra == 'ci'
|
|
32
|
+
Requires-Dist: ruff>=0.1.0; extra == 'ci'
|
|
33
|
+
Requires-Dist: tox-gh-actions>=3.0; extra == 'ci'
|
|
34
|
+
Requires-Dist: tox-uv>=1.0; extra == 'ci'
|
|
35
|
+
Requires-Dist: tox>=4.0; extra == 'ci'
|
|
36
|
+
Requires-Dist: types-pyyaml; extra == 'ci'
|
|
37
|
+
Requires-Dist: types-requests; extra == 'ci'
|
|
38
|
+
Provides-Extra: dandi
|
|
39
|
+
Requires-Dist: dandi>=0.60.0; extra == 'dandi'
|
|
40
|
+
Provides-Extra: devel
|
|
41
|
+
Requires-Dist: mypy>=1.0; extra == 'devel'
|
|
42
|
+
Requires-Dist: pre-commit>=3.0; extra == 'devel'
|
|
43
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'devel'
|
|
44
|
+
Requires-Dist: pytest-timeout>=2.0; extra == 'devel'
|
|
45
|
+
Requires-Dist: pytest>=7.0; extra == 'devel'
|
|
46
|
+
Requires-Dist: responses>=0.23; extra == 'devel'
|
|
47
|
+
Requires-Dist: ruff>=0.1.0; extra == 'devel'
|
|
48
|
+
Requires-Dist: tox-uv>=1.0; extra == 'devel'
|
|
49
|
+
Requires-Dist: tox>=4.0; extra == 'devel'
|
|
50
|
+
Requires-Dist: types-pyyaml; extra == 'devel'
|
|
51
|
+
Requires-Dist: types-requests; extra == 'devel'
|
|
52
|
+
Provides-Extra: linkml
|
|
53
|
+
Requires-Dist: linkml-runtime>=1.7.0; extra == 'linkml'
|
|
54
|
+
Requires-Dist: linkml>=1.7.0; extra == 'linkml'
|
|
55
|
+
Provides-Extra: test
|
|
56
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'test'
|
|
57
|
+
Requires-Dist: pytest-timeout>=2.0; extra == 'test'
|
|
58
|
+
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
59
|
+
Requires-Dist: responses>=0.23; extra == 'test'
|
|
60
|
+
Description-Content-Type: text/markdown
|
|
61
|
+
|
|
62
|
+
# citations-collector
|
|
63
|
+
|
|
64
|
+
Discover and curate scholarly citations of datasets and software.
|
|
65
|
+
|
|
66
|
+
## Features
|
|
67
|
+
|
|
68
|
+
- **Citation Discovery**: Query CrossRef, OpenCitations, DataCite, and OpenAlex for citing papers
|
|
69
|
+
- **Hierarchical Collections**: Organize citations by project/version (e.g., DANDI dandisets)
|
|
70
|
+
- **Git-Friendly**: YAML collections + TSV citation records for version control
|
|
71
|
+
- **Curation Workflow**: Mark citations as ignored, merge preprints with published versions
|
|
72
|
+
- **PDF Acquisition**: Automatically download open-access PDFs via Unpaywall with optional git-annex tracking
|
|
73
|
+
- **Merge Detection**: Auto-detect preprints with published versions using CrossRef relationships
|
|
74
|
+
- **Zotero Integration**: Sync citations to hierarchical Zotero collections with automatic merged item relocation
|
|
75
|
+
- **Incremental Updates**: Efficiently discover only new citations since last run
|
|
76
|
+
|
|
77
|
+
## Installation
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Using uv (recommended)
|
|
81
|
+
uv venv
|
|
82
|
+
source .venv/bin/activate
|
|
83
|
+
uv pip install citations-collector
|
|
84
|
+
|
|
85
|
+
# Or using pip
|
|
86
|
+
pip install citations-collector
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Quick Start
|
|
90
|
+
|
|
91
|
+
### 1. Create a Collection
|
|
92
|
+
|
|
93
|
+
Create `collection.yaml`:
|
|
94
|
+
|
|
95
|
+
```yaml
|
|
96
|
+
name: My Research Tools
|
|
97
|
+
description: Software tools used in our lab
|
|
98
|
+
items:
|
|
99
|
+
- item_id: my-tool
|
|
100
|
+
name: "My Analysis Tool"
|
|
101
|
+
flavors:
|
|
102
|
+
- flavor_id: "1.0.0"
|
|
103
|
+
refs:
|
|
104
|
+
- ref_type: doi
|
|
105
|
+
ref_value: "10.5281/zenodo.1234567"
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### 2. Discover Citations
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# Discover citations for all items in collection
|
|
112
|
+
citations-collector discover collection.yaml --output citations.tsv
|
|
113
|
+
|
|
114
|
+
# Use CrossRef polite pool (better rate limits)
|
|
115
|
+
citations-collector discover collection.yaml --email your@email.org
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### 3. View Results
|
|
119
|
+
|
|
120
|
+
Citations are saved to `citations.tsv` - a tab-separated file you can open in Excel or edit manually for curation.
|
|
121
|
+
|
|
122
|
+
## Advanced Workflows
|
|
123
|
+
|
|
124
|
+
### PDF Acquisition
|
|
125
|
+
|
|
126
|
+
Automatically download open-access PDFs using Unpaywall:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# Fetch PDFs for discovered citations
|
|
130
|
+
citations-collector fetch-pdfs --config collection.yaml
|
|
131
|
+
|
|
132
|
+
# Use git-annex for provenance tracking
|
|
133
|
+
citations-collector fetch-pdfs --config collection.yaml --git-annex
|
|
134
|
+
|
|
135
|
+
# Dry run to see what would be downloaded
|
|
136
|
+
citations-collector fetch-pdfs --config collection.yaml --dry-run
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
PDFs are stored at `pdfs/{doi}/article.pdf` with accompanying `article.bib` BibTeX files.
|
|
140
|
+
|
|
141
|
+
### Merge Detection
|
|
142
|
+
|
|
143
|
+
Detect preprints that have published versions:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
# Detect merges via CrossRef relationships
|
|
147
|
+
citations-collector detect-merges --config collection.yaml
|
|
148
|
+
|
|
149
|
+
# Also run fuzzy title matching (use with caution)
|
|
150
|
+
citations-collector detect-merges --config collection.yaml --fuzzy-match
|
|
151
|
+
|
|
152
|
+
# Preview without updating
|
|
153
|
+
citations-collector detect-merges --config collection.yaml --dry-run
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Detected preprints are marked with `citation_status=merged` and `citation_merged_into={published_doi}`.
|
|
157
|
+
|
|
158
|
+
### Zotero Sync
|
|
159
|
+
|
|
160
|
+
Sync citations to Zotero for collaborative browsing:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
# Sync to Zotero (requires API key in config or env)
|
|
164
|
+
citations-collector sync-zotero --config collection.yaml
|
|
165
|
+
|
|
166
|
+
# Dry run to preview structure
|
|
167
|
+
citations-collector sync-zotero --config collection.yaml --dry-run
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Zotero hierarchy:
|
|
171
|
+
```
|
|
172
|
+
Top Collection/
|
|
173
|
+
├── {item_id}/
|
|
174
|
+
│ ├── {flavor}/
|
|
175
|
+
│ │ ├── <active citations>
|
|
176
|
+
│ │ └── Merged/
|
|
177
|
+
│ │ └── <preprints and old versions>
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### Unified Configuration
|
|
181
|
+
|
|
182
|
+
Create a unified `collection.yaml` with all settings:
|
|
183
|
+
|
|
184
|
+
```yaml
|
|
185
|
+
name: My Research Collection
|
|
186
|
+
description: Tools and datasets from our lab
|
|
187
|
+
|
|
188
|
+
# Source items to track
|
|
189
|
+
source:
|
|
190
|
+
items:
|
|
191
|
+
- item_id: dandi-000055
|
|
192
|
+
name: "AJILE12: Long-term naturalistic human intracranial neural recordings"
|
|
193
|
+
flavors:
|
|
194
|
+
- flavor_id: "0.220113.0400"
|
|
195
|
+
refs:
|
|
196
|
+
- ref_type: doi
|
|
197
|
+
ref_value: "10.48324/dandi.000055/0.220113.0400"
|
|
198
|
+
|
|
199
|
+
# Citation discovery settings
|
|
200
|
+
discover:
|
|
201
|
+
sources:
|
|
202
|
+
- crossref
|
|
203
|
+
- opencitations
|
|
204
|
+
email: your@email.org # For CrossRef polite pool
|
|
205
|
+
incremental: true
|
|
206
|
+
|
|
207
|
+
# PDF acquisition settings (optional)
|
|
208
|
+
pdfs:
|
|
209
|
+
output_dir: pdfs/
|
|
210
|
+
unpaywall_email: your@email.org
|
|
211
|
+
git_annex: false
|
|
212
|
+
|
|
213
|
+
# Zotero sync settings (optional)
|
|
214
|
+
zotero:
|
|
215
|
+
library_type: group
|
|
216
|
+
library_id: "12345"
|
|
217
|
+
api_key: "YOUR_API_KEY" # Or set ZOTERO_API_KEY env var
|
|
218
|
+
top_collection_key: "ABCD1234"
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Then run the full workflow:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
# 1. Discover citations
|
|
225
|
+
citations-collector discover collection.yaml
|
|
226
|
+
|
|
227
|
+
# 2. Fetch open-access PDFs
|
|
228
|
+
citations-collector fetch-pdfs --config collection.yaml
|
|
229
|
+
|
|
230
|
+
# 3. Detect merged preprints
|
|
231
|
+
citations-collector detect-merges --config collection.yaml
|
|
232
|
+
|
|
233
|
+
# 4. Sync to Zotero
|
|
234
|
+
citations-collector sync-zotero --config collection.yaml
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## Library Usage
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
from citations_collector import CitationCollector
|
|
241
|
+
|
|
242
|
+
# Load collection
|
|
243
|
+
collector = CitationCollector.from_yaml("collection.yaml")
|
|
244
|
+
|
|
245
|
+
# Discover citations (incremental by default)
|
|
246
|
+
collector.discover_all(incremental=True, email="your@email.org")
|
|
247
|
+
|
|
248
|
+
# Save results
|
|
249
|
+
collector.save("collection.yaml", "citations.tsv")
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
## Examples
|
|
253
|
+
|
|
254
|
+
See the `examples/` directory for:
|
|
255
|
+
- **dandi-sample-collection.yaml**: DANDI Archive dandisets with versioned DOIs
|
|
256
|
+
- **repronim-tools.yaml**: ReproNim neuroimaging tools with RRIDs
|
|
257
|
+
- **simple-resources.yaml**: Basic collection without versioning
|
|
258
|
+
- **citations-example.tsv**: Example citation records with curation
|
|
259
|
+
|
|
260
|
+
## Development
|
|
261
|
+
|
|
262
|
+
### Setup
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
# Clone repository
|
|
266
|
+
git clone https://github.com/dandi/citations-collector.git
|
|
267
|
+
cd citations-collector
|
|
268
|
+
|
|
269
|
+
# Setup development environment
|
|
270
|
+
uv venv
|
|
271
|
+
source .venv/bin/activate
|
|
272
|
+
uv pip install -e ".[devel]"
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
### Running Tests
|
|
276
|
+
|
|
277
|
+
```bash
|
|
278
|
+
# Run all tests, linting, and type checking
|
|
279
|
+
tox
|
|
280
|
+
|
|
281
|
+
# Run specific environment
|
|
282
|
+
tox -e py312 # Tests on Python 3.12
|
|
283
|
+
tox -e lint # Ruff linting
|
|
284
|
+
tox -e type # Mypy type checking
|
|
285
|
+
tox -e cov # Coverage report
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
### Regenerating LinkML Models
|
|
289
|
+
|
|
290
|
+
When `schema/citations.yaml` changes:
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
# Install linkml tools
|
|
294
|
+
uv pip install -e ".[linkml]"
|
|
295
|
+
|
|
296
|
+
# Regenerate Pydantic models
|
|
297
|
+
gen-pydantic schema/citations.yaml > src/citations_collector/models/generated.py
|
|
298
|
+
|
|
299
|
+
# Regenerate JSON Schema
|
|
300
|
+
gen-json-schema schema/citations.yaml > schema/citations.schema.json
|
|
301
|
+
|
|
302
|
+
# Commit generated files
|
|
303
|
+
git add src/citations_collector/models/generated.py schema/citations.schema.json
|
|
304
|
+
git commit -m "Regenerate LinkML models"
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
## Architecture
|
|
308
|
+
|
|
309
|
+
- **Library-First Design**: All functionality accessible programmatically
|
|
310
|
+
- **LinkML Schema**: Validated data models from `schema/citations.yaml`
|
|
311
|
+
- **Modular Structure**:
|
|
312
|
+
- `discovery/`: Citation API clients (CrossRef, OpenCitations, DataCite, OpenAlex)
|
|
313
|
+
- `persistence/`: YAML/TSV I/O
|
|
314
|
+
- `importers/`: DANDI API, Zenodo, GitHub, and Zotero integrations
|
|
315
|
+
- `unpaywall.py`: Unpaywall API client for OA PDF URLs
|
|
316
|
+
- `pdf.py`: PDF acquisition with git-annex support
|
|
317
|
+
- `merge_detection.py`: Preprint/published version detection
|
|
318
|
+
- `zotero_sync.py`: Zotero hierarchical sync with merged item handling
|
|
319
|
+
- `core.py`: Main orchestration API
|
|
320
|
+
- `cli.py`: Click-based CLI (thin wrapper)
|
|
321
|
+
|
|
322
|
+
## Citation Sources
|
|
323
|
+
|
|
324
|
+
- **CrossRef**: Most comprehensive, best for DOI citations
|
|
325
|
+
- **OpenCitations**: Open index, may lag behind CrossRef
|
|
326
|
+
- **DataCite**: Good for dataset citations
|
|
327
|
+
- **Europe PMC**: PubMed-indexed papers (future)
|
|
328
|
+
- **Semantic Scholar**: AI-powered citation discovery (future)
|
|
329
|
+
|
|
330
|
+
## License
|
|
331
|
+
|
|
332
|
+
MIT License - see LICENSE file for details.
|
|
333
|
+
|
|
334
|
+
## Contributing
|
|
335
|
+
|
|
336
|
+
See CONSTITUTION.md for:
|
|
337
|
+
- Code standards (Ruff, mypy, type hints)
|
|
338
|
+
- Testing requirements (pytest, 100 lines max, mock HTTP)
|
|
339
|
+
- Architecture principles (library-first, reliability, simplicity)
|
|
340
|
+
|
|
341
|
+
Pull requests welcome!
|
|
342
|
+
|
|
343
|
+
## Citation
|
|
344
|
+
|
|
345
|
+
If you use citations-collector in your research, please cite:
|
|
346
|
+
|
|
347
|
+
```bibtex
|
|
348
|
+
@software{citations_collector,
|
|
349
|
+
title = {citations-collector: Discover and curate scholarly citations},
|
|
350
|
+
author = {{DANDI Team}},
|
|
351
|
+
url = {https://github.com/dandi/citations-collector},
|
|
352
|
+
license = {MIT}
|
|
353
|
+
}
|
|
354
|
+
```
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
# citations-collector
|
|
2
|
+
|
|
3
|
+
Discover and curate scholarly citations of datasets and software.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Citation Discovery**: Query CrossRef, OpenCitations, DataCite, and OpenAlex for citing papers
|
|
8
|
+
- **Hierarchical Collections**: Organize citations by project/version (e.g., DANDI dandisets)
|
|
9
|
+
- **Git-Friendly**: YAML collections + TSV citation records for version control
|
|
10
|
+
- **Curation Workflow**: Mark citations as ignored, merge preprints with published versions
|
|
11
|
+
- **PDF Acquisition**: Automatically download open-access PDFs via Unpaywall with optional git-annex tracking
|
|
12
|
+
- **Merge Detection**: Auto-detect preprints with published versions using CrossRef relationships
|
|
13
|
+
- **Zotero Integration**: Sync citations to hierarchical Zotero collections with automatic merged item relocation
|
|
14
|
+
- **Incremental Updates**: Efficiently discover only new citations since last run
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# Using uv (recommended)
|
|
20
|
+
uv venv
|
|
21
|
+
source .venv/bin/activate
|
|
22
|
+
uv pip install citations-collector
|
|
23
|
+
|
|
24
|
+
# Or using pip
|
|
25
|
+
pip install citations-collector
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
### 1. Create a Collection
|
|
31
|
+
|
|
32
|
+
Create `collection.yaml`:
|
|
33
|
+
|
|
34
|
+
```yaml
|
|
35
|
+
name: My Research Tools
|
|
36
|
+
description: Software tools used in our lab
|
|
37
|
+
items:
|
|
38
|
+
- item_id: my-tool
|
|
39
|
+
name: "My Analysis Tool"
|
|
40
|
+
flavors:
|
|
41
|
+
- flavor_id: "1.0.0"
|
|
42
|
+
refs:
|
|
43
|
+
- ref_type: doi
|
|
44
|
+
ref_value: "10.5281/zenodo.1234567"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### 2. Discover Citations
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# Discover citations for all items in collection
|
|
51
|
+
citations-collector discover collection.yaml --output citations.tsv
|
|
52
|
+
|
|
53
|
+
# Use CrossRef polite pool (better rate limits)
|
|
54
|
+
citations-collector discover collection.yaml --email your@email.org
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### 3. View Results
|
|
58
|
+
|
|
59
|
+
Citations are saved to `citations.tsv` - a tab-separated file you can open in Excel or edit manually for curation.
|
|
60
|
+
|
|
61
|
+
## Advanced Workflows
|
|
62
|
+
|
|
63
|
+
### PDF Acquisition
|
|
64
|
+
|
|
65
|
+
Automatically download open-access PDFs using Unpaywall:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Fetch PDFs for discovered citations
|
|
69
|
+
citations-collector fetch-pdfs --config collection.yaml
|
|
70
|
+
|
|
71
|
+
# Use git-annex for provenance tracking
|
|
72
|
+
citations-collector fetch-pdfs --config collection.yaml --git-annex
|
|
73
|
+
|
|
74
|
+
# Dry run to see what would be downloaded
|
|
75
|
+
citations-collector fetch-pdfs --config collection.yaml --dry-run
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
PDFs are stored at `pdfs/{doi}/article.pdf` with accompanying `article.bib` BibTeX files.
|
|
79
|
+
|
|
80
|
+
### Merge Detection
|
|
81
|
+
|
|
82
|
+
Detect preprints that have published versions:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Detect merges via CrossRef relationships
|
|
86
|
+
citations-collector detect-merges --config collection.yaml
|
|
87
|
+
|
|
88
|
+
# Also run fuzzy title matching (use with caution)
|
|
89
|
+
citations-collector detect-merges --config collection.yaml --fuzzy-match
|
|
90
|
+
|
|
91
|
+
# Preview without updating
|
|
92
|
+
citations-collector detect-merges --config collection.yaml --dry-run
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Detected preprints are marked with `citation_status=merged` and `citation_merged_into={published_doi}`.
|
|
96
|
+
|
|
97
|
+
### Zotero Sync
|
|
98
|
+
|
|
99
|
+
Sync citations to Zotero for collaborative browsing:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Sync to Zotero (requires API key in config or env)
|
|
103
|
+
citations-collector sync-zotero --config collection.yaml
|
|
104
|
+
|
|
105
|
+
# Dry run to preview structure
|
|
106
|
+
citations-collector sync-zotero --config collection.yaml --dry-run
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Zotero hierarchy:
|
|
110
|
+
```
|
|
111
|
+
Top Collection/
|
|
112
|
+
├── {item_id}/
|
|
113
|
+
│ ├── {flavor}/
|
|
114
|
+
│ │ ├── <active citations>
|
|
115
|
+
│ │ └── Merged/
|
|
116
|
+
│ │ └── <preprints and old versions>
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Unified Configuration
|
|
120
|
+
|
|
121
|
+
Create a unified `collection.yaml` with all settings:
|
|
122
|
+
|
|
123
|
+
```yaml
|
|
124
|
+
name: My Research Collection
|
|
125
|
+
description: Tools and datasets from our lab
|
|
126
|
+
|
|
127
|
+
# Source items to track
|
|
128
|
+
source:
|
|
129
|
+
items:
|
|
130
|
+
- item_id: dandi-000055
|
|
131
|
+
name: "AJILE12: Long-term naturalistic human intracranial neural recordings"
|
|
132
|
+
flavors:
|
|
133
|
+
- flavor_id: "0.220113.0400"
|
|
134
|
+
refs:
|
|
135
|
+
- ref_type: doi
|
|
136
|
+
ref_value: "10.48324/dandi.000055/0.220113.0400"
|
|
137
|
+
|
|
138
|
+
# Citation discovery settings
|
|
139
|
+
discover:
|
|
140
|
+
sources:
|
|
141
|
+
- crossref
|
|
142
|
+
- opencitations
|
|
143
|
+
email: your@email.org # For CrossRef polite pool
|
|
144
|
+
incremental: true
|
|
145
|
+
|
|
146
|
+
# PDF acquisition settings (optional)
|
|
147
|
+
pdfs:
|
|
148
|
+
output_dir: pdfs/
|
|
149
|
+
unpaywall_email: your@email.org
|
|
150
|
+
git_annex: false
|
|
151
|
+
|
|
152
|
+
# Zotero sync settings (optional)
|
|
153
|
+
zotero:
|
|
154
|
+
library_type: group
|
|
155
|
+
library_id: "12345"
|
|
156
|
+
api_key: "YOUR_API_KEY" # Or set ZOTERO_API_KEY env var
|
|
157
|
+
top_collection_key: "ABCD1234"
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Then run the full workflow:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
# 1. Discover citations
|
|
164
|
+
citations-collector discover collection.yaml
|
|
165
|
+
|
|
166
|
+
# 2. Fetch open-access PDFs
|
|
167
|
+
citations-collector fetch-pdfs --config collection.yaml
|
|
168
|
+
|
|
169
|
+
# 3. Detect merged preprints
|
|
170
|
+
citations-collector detect-merges --config collection.yaml
|
|
171
|
+
|
|
172
|
+
# 4. Sync to Zotero
|
|
173
|
+
citations-collector sync-zotero --config collection.yaml
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Library Usage
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from citations_collector import CitationCollector
|
|
180
|
+
|
|
181
|
+
# Load collection
|
|
182
|
+
collector = CitationCollector.from_yaml("collection.yaml")
|
|
183
|
+
|
|
184
|
+
# Discover citations (incremental by default)
|
|
185
|
+
collector.discover_all(incremental=True, email="your@email.org")
|
|
186
|
+
|
|
187
|
+
# Save results
|
|
188
|
+
collector.save("collection.yaml", "citations.tsv")
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Examples
|
|
192
|
+
|
|
193
|
+
See the `examples/` directory for:
|
|
194
|
+
- **dandi-sample-collection.yaml**: DANDI Archive dandisets with versioned DOIs
|
|
195
|
+
- **repronim-tools.yaml**: ReproNim neuroimaging tools with RRIDs
|
|
196
|
+
- **simple-resources.yaml**: Basic collection without versioning
|
|
197
|
+
- **citations-example.tsv**: Example citation records with curation
|
|
198
|
+
|
|
199
|
+
## Development
|
|
200
|
+
|
|
201
|
+
### Setup
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
# Clone repository
|
|
205
|
+
git clone https://github.com/dandi/citations-collector.git
|
|
206
|
+
cd citations-collector
|
|
207
|
+
|
|
208
|
+
# Setup development environment
|
|
209
|
+
uv venv
|
|
210
|
+
source .venv/bin/activate
|
|
211
|
+
uv pip install -e ".[devel]"
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Running Tests
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
# Run all tests, linting, and type checking
|
|
218
|
+
tox
|
|
219
|
+
|
|
220
|
+
# Run specific environment
|
|
221
|
+
tox -e py312 # Tests on Python 3.12
|
|
222
|
+
tox -e lint # Ruff linting
|
|
223
|
+
tox -e type # Mypy type checking
|
|
224
|
+
tox -e cov # Coverage report
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Regenerating LinkML Models
|
|
228
|
+
|
|
229
|
+
When `schema/citations.yaml` changes:
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
# Install linkml tools
|
|
233
|
+
uv pip install -e ".[linkml]"
|
|
234
|
+
|
|
235
|
+
# Regenerate Pydantic models
|
|
236
|
+
gen-pydantic schema/citations.yaml > src/citations_collector/models/generated.py
|
|
237
|
+
|
|
238
|
+
# Regenerate JSON Schema
|
|
239
|
+
gen-json-schema schema/citations.yaml > schema/citations.schema.json
|
|
240
|
+
|
|
241
|
+
# Commit generated files
|
|
242
|
+
git add src/citations_collector/models/generated.py schema/citations.schema.json
|
|
243
|
+
git commit -m "Regenerate LinkML models"
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Architecture
|
|
247
|
+
|
|
248
|
+
- **Library-First Design**: All functionality accessible programmatically
|
|
249
|
+
- **LinkML Schema**: Validated data models from `schema/citations.yaml`
|
|
250
|
+
- **Modular Structure**:
|
|
251
|
+
- `discovery/`: Citation API clients (CrossRef, OpenCitations, DataCite, OpenAlex)
|
|
252
|
+
- `persistence/`: YAML/TSV I/O
|
|
253
|
+
- `importers/`: DANDI API, Zenodo, GitHub, and Zotero integrations
|
|
254
|
+
- `unpaywall.py`: Unpaywall API client for OA PDF URLs
|
|
255
|
+
- `pdf.py`: PDF acquisition with git-annex support
|
|
256
|
+
- `merge_detection.py`: Preprint/published version detection
|
|
257
|
+
- `zotero_sync.py`: Zotero hierarchical sync with merged item handling
|
|
258
|
+
- `core.py`: Main orchestration API
|
|
259
|
+
- `cli.py`: Click-based CLI (thin wrapper)
|
|
260
|
+
|
|
261
|
+
## Citation Sources
|
|
262
|
+
|
|
263
|
+
- **CrossRef**: Most comprehensive, best for DOI citations
|
|
264
|
+
- **OpenCitations**: Open index, may lag behind CrossRef
|
|
265
|
+
- **DataCite**: Good for dataset citations
|
|
266
|
+
- **Europe PMC**: PubMed-indexed papers (future)
|
|
267
|
+
- **Semantic Scholar**: AI-powered citation discovery (future)
|
|
268
|
+
|
|
269
|
+
## License
|
|
270
|
+
|
|
271
|
+
MIT License - see LICENSE file for details.
|
|
272
|
+
|
|
273
|
+
## Contributing
|
|
274
|
+
|
|
275
|
+
See CONSTITUTION.md for:
|
|
276
|
+
- Code standards (Ruff, mypy, type hints)
|
|
277
|
+
- Testing requirements (pytest, 100 lines max, mock HTTP)
|
|
278
|
+
- Architecture principles (library-first, reliability, simplicity)
|
|
279
|
+
|
|
280
|
+
Pull requests welcome!
|
|
281
|
+
|
|
282
|
+
## Citation
|
|
283
|
+
|
|
284
|
+
If you use citations-collector in your research, please cite:
|
|
285
|
+
|
|
286
|
+
```bibtex
|
|
287
|
+
@software{citations_collector,
|
|
288
|
+
title = {citations-collector: Discover and curate scholarly citations},
|
|
289
|
+
author = {{DANDI Team}},
|
|
290
|
+
url = {https://github.com/dandi/citations-collector},
|
|
291
|
+
license = {MIT}
|
|
292
|
+
}
|
|
293
|
+
```
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
item_id item_flavor item_ref_type item_ref_value item_name citation_doi citation_title citation_authors citation_year citation_journal citation_relationship citation_type citation_source discovered_date citation_status citation_merged_into citation_comment curated_by curated_date
|
|
2
|
+
dandi:000003 0.210812.1448 doi 10.48324/dandi.000003/0.210812.1448 Hippocampal Granule Cells 10.1016/j.neuron.2022.01.001 Analysis of hippocampal circuits Smith J; Jones A; Wang L 2022 Neuron Cites Publication crossref 2024-01-15 active
|
|
3
|
+
dandi:000003 0.210812.1448 doi 10.48324/dandi.000003/0.210812.1448 Hippocampal Granule Cells 10.1101/2021.12.15.472789 Preprint version of the analysis Smith J; Jones A 2021 bioRxiv Cites Preprint crossref 2024-01-15 merged 10.1016/j.neuron.2022.01.001 Merged with published version curator1 2024-01-20
|
|
4
|
+
dandi:000003 0.220126.1853 doi 10.48324/dandi.000003/0.220126.1853 Hippocampal Granule Cells 10.1523/JNEUROSCI.1234-22.2022 Follow-up analysis using newer version Brown K 2023 J Neuroscience Uses Publication opencitations 2024-02-01 active
|
|
5
|
+
fmriprep latest rrid SCR_016216 fMRIPrep 10.1038/s41596-022-00710-2 Standard preprocessing for fMRI Esteban O; et al 2022 Nature Protocols Describes Publication scicrunch 2024-01-10 active
|
|
6
|
+
fmriprep latest doi 10.5281/zenodo.852659 fMRIPrep 10.1016/j.neuroimage.2023.120123 Large-scale fMRI meta-analysis Chen M; et al 2023 NeuroImage Uses Publication datacite 2024-01-12 active
|
|
7
|
+
fmriprep 23.1.0 doi 10.5281/zenodo.8076123 fMRIPrep 10.1101/2024.01.05.574321 Methods comparison study Lee S 2024 bioRxiv Cites Preprint crossref 2024-02-15 pending Needs review - unclear citation
|