citations-collector 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of citations-collector might be problematic. Click here for more details.
- citations_collector-0.2.3/.gitignore +21 -0
- citations_collector-0.2.3/PKG-INFO +456 -0
- citations_collector-0.2.3/README.md +390 -0
- citations_collector-0.2.3/examples/citations-example.tsv +7 -0
- citations_collector-0.2.3/examples/dandi-citations.tsv +4 -0
- citations_collector-0.2.3/examples/dandi-collection.yaml +15 -0
- citations_collector-0.2.3/examples/dandi-sample-collection.yaml +58 -0
- citations_collector-0.2.3/examples/github-ci-workflow.yaml +118 -0
- citations_collector-0.2.3/examples/microns-citations.tsv +103 -0
- citations_collector-0.2.3/examples/microns-collection.yaml +44 -0
- citations_collector-0.2.3/examples/repronim-tools-citations.tsv +2546 -0
- citations_collector-0.2.3/examples/repronim-tools.yaml +122 -0
- citations_collector-0.2.3/examples/simple-resources-citations.tsv +1 -0
- citations_collector-0.2.3/examples/simple-resources.yaml +30 -0
- citations_collector-0.2.3/examples/studyforrest-citations.tsv +472 -0
- citations_collector-0.2.3/examples/studyforrest-preseeded.tsv +24 -0
- citations_collector-0.2.3/examples/studyforrest.yaml +33 -0
- citations_collector-0.2.3/pyproject.toml +133 -0
- citations_collector-0.2.3/schema/citations.schema.json +637 -0
- citations_collector-0.2.3/schema/citations.yaml +538 -0
- citations_collector-0.2.3/src/citations_collector/__init__.py +18 -0
- citations_collector-0.2.3/src/citations_collector/_version.py +34 -0
- citations_collector-0.2.3/src/citations_collector/cli.py +525 -0
- citations_collector-0.2.3/src/citations_collector/core.py +503 -0
- citations_collector-0.2.3/src/citations_collector/discovery/__init__.py +17 -0
- citations_collector-0.2.3/src/citations_collector/discovery/base.py +26 -0
- citations_collector-0.2.3/src/citations_collector/discovery/crossref.py +210 -0
- citations_collector-0.2.3/src/citations_collector/discovery/datacite.py +260 -0
- citations_collector-0.2.3/src/citations_collector/discovery/openalex.py +252 -0
- citations_collector-0.2.3/src/citations_collector/discovery/opencitations.py +168 -0
- citations_collector-0.2.3/src/citations_collector/discovery/utils.py +62 -0
- citations_collector-0.2.3/src/citations_collector/importers/__init__.py +17 -0
- citations_collector-0.2.3/src/citations_collector/importers/bibtex.py +178 -0
- citations_collector-0.2.3/src/citations_collector/importers/dandi.py +314 -0
- citations_collector-0.2.3/src/citations_collector/importers/github.py +147 -0
- citations_collector-0.2.3/src/citations_collector/importers/zenodo.py +110 -0
- citations_collector-0.2.3/src/citations_collector/importers/zotero.py +262 -0
- citations_collector-0.2.3/src/citations_collector/merge_detection.py +216 -0
- citations_collector-0.2.3/src/citations_collector/models/__init__.py +44 -0
- citations_collector-0.2.3/src/citations_collector/models/generated.py +525 -0
- citations_collector-0.2.3/src/citations_collector/pdf.py +260 -0
- citations_collector-0.2.3/src/citations_collector/persistence/__init__.py +7 -0
- citations_collector-0.2.3/src/citations_collector/persistence/tsv_io.py +121 -0
- citations_collector-0.2.3/src/citations_collector/persistence/yaml_io.py +50 -0
- citations_collector-0.2.3/src/citations_collector/py.typed +0 -0
- citations_collector-0.2.3/src/citations_collector/unpaywall.py +60 -0
- citations_collector-0.2.3/src/citations_collector/zotero_sync.py +591 -0
- citations_collector-0.2.3/tests/INTEGRATION_TESTS.md +284 -0
- citations_collector-0.2.3/tests/__init__.py +1 -0
- citations_collector-0.2.3/tests/conftest.py +31 -0
- citations_collector-0.2.3/tests/fixtures/bibtex/dandi.bib +15 -0
- citations_collector-0.2.3/tests/fixtures/collections/repronim-tools.yaml +131 -0
- citations_collector-0.2.3/tests/fixtures/collections/simple.yaml +10 -0
- citations_collector-0.2.3/tests/fixtures/responses/crossref_empty.json +8 -0
- citations_collector-0.2.3/tests/fixtures/responses/crossref_success.json +27 -0
- citations_collector-0.2.3/tests/fixtures/responses/datacite_empty.json +6 -0
- citations_collector-0.2.3/tests/fixtures/responses/datacite_success.json +35 -0
- citations_collector-0.2.3/tests/fixtures/responses/opencitations_success.json +8 -0
- citations_collector-0.2.3/tests/fixtures/tsv/citations-example.tsv +7 -0
- citations_collector-0.2.3/tests/fixtures/tsv/simple.tsv +2 -0
- citations_collector-0.2.3/tests/test_cli.py +157 -0
- citations_collector-0.2.3/tests/test_core.py +151 -0
- citations_collector-0.2.3/tests/test_core_bibtex.py +187 -0
- citations_collector-0.2.3/tests/test_discovery.py +328 -0
- citations_collector-0.2.3/tests/test_discovery_openalex.py +334 -0
- citations_collector-0.2.3/tests/test_e2e_bibtex.py +145 -0
- citations_collector-0.2.3/tests/test_importers.py +202 -0
- citations_collector-0.2.3/tests/test_importers_bibtex.py +221 -0
- citations_collector-0.2.3/tests/test_importers_dandi.py +391 -0
- citations_collector-0.2.3/tests/test_importers_zotero.py +370 -0
- citations_collector-0.2.3/tests/test_integration.py +316 -0
- citations_collector-0.2.3/tests/test_merge_detection.py +309 -0
- citations_collector-0.2.3/tests/test_multi_source_validation.py +108 -0
- citations_collector-0.2.3/tests/test_multisource_citations.py +259 -0
- citations_collector-0.2.3/tests/test_pdf.py +166 -0
- citations_collector-0.2.3/tests/test_pdf_retry.py +350 -0
- citations_collector-0.2.3/tests/test_persistence.py +165 -0
- citations_collector-0.2.3/tests/test_unpaywall.py +99 -0
- citations_collector-0.2.3/tests/test_zotero_sync.py +396 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
.npm/
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.lock
|
|
4
|
+
src/citations_collector/_version.py
|
|
5
|
+
|
|
6
|
+
# Development and testing artifacts
|
|
7
|
+
.cache/
|
|
8
|
+
.local/
|
|
9
|
+
.coverage
|
|
10
|
+
*.log
|
|
11
|
+
dist/
|
|
12
|
+
test-pdfs/
|
|
13
|
+
|
|
14
|
+
# Test data files (keep in tests/fixtures/)
|
|
15
|
+
/*.tsv
|
|
16
|
+
/*.yaml
|
|
17
|
+
/*.bib
|
|
18
|
+
cache
|
|
19
|
+
.coverage
|
|
20
|
+
.local
|
|
21
|
+
.claude/settings.local.json
|
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: citations-collector
|
|
3
|
+
Version: 0.2.3
|
|
4
|
+
Summary: Discover and curate scholarly citations of datasets and software
|
|
5
|
+
Project-URL: Homepage, https://github.com/dandi/citations-collector
|
|
6
|
+
Project-URL: Repository, https://github.com/dandi/citations-collector
|
|
7
|
+
Project-URL: Issues, https://github.com/dandi/citations-collector/issues
|
|
8
|
+
Author-email: DANDI <team@dandiarchive.org>
|
|
9
|
+
License: MIT
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: bibtexparser>=2.0.0b7
|
|
20
|
+
Requires-Dist: click>=8.0
|
|
21
|
+
Requires-Dist: pydantic>=2.0
|
|
22
|
+
Requires-Dist: pyyaml>=6.0
|
|
23
|
+
Requires-Dist: pyzotero>=1.5
|
|
24
|
+
Requires-Dist: rapidfuzz>=3.0
|
|
25
|
+
Requires-Dist: requests>=2.28
|
|
26
|
+
Requires-Dist: tqdm>=4.0
|
|
27
|
+
Requires-Dist: urllib3>=1.26
|
|
28
|
+
Provides-Extra: ci
|
|
29
|
+
Requires-Dist: mypy>=1.0; extra == 'ci'
|
|
30
|
+
Requires-Dist: pre-commit>=3.0; extra == 'ci'
|
|
31
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'ci'
|
|
32
|
+
Requires-Dist: pytest-timeout>=2.0; extra == 'ci'
|
|
33
|
+
Requires-Dist: pytest>=7.0; extra == 'ci'
|
|
34
|
+
Requires-Dist: responses>=0.23; extra == 'ci'
|
|
35
|
+
Requires-Dist: ruff>=0.1.0; extra == 'ci'
|
|
36
|
+
Requires-Dist: tox-gh-actions>=3.0; extra == 'ci'
|
|
37
|
+
Requires-Dist: tox-uv>=1.0; extra == 'ci'
|
|
38
|
+
Requires-Dist: tox>=4.0; extra == 'ci'
|
|
39
|
+
Requires-Dist: types-pyyaml; extra == 'ci'
|
|
40
|
+
Requires-Dist: types-requests; extra == 'ci'
|
|
41
|
+
Requires-Dist: types-tqdm; extra == 'ci'
|
|
42
|
+
Provides-Extra: dandi
|
|
43
|
+
Requires-Dist: dandi>=0.60.0; extra == 'dandi'
|
|
44
|
+
Provides-Extra: devel
|
|
45
|
+
Requires-Dist: mypy>=1.0; extra == 'devel'
|
|
46
|
+
Requires-Dist: pre-commit>=3.0; extra == 'devel'
|
|
47
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'devel'
|
|
48
|
+
Requires-Dist: pytest-timeout>=2.0; extra == 'devel'
|
|
49
|
+
Requires-Dist: pytest>=7.0; extra == 'devel'
|
|
50
|
+
Requires-Dist: responses>=0.23; extra == 'devel'
|
|
51
|
+
Requires-Dist: ruff>=0.1.0; extra == 'devel'
|
|
52
|
+
Requires-Dist: tox-uv>=1.0; extra == 'devel'
|
|
53
|
+
Requires-Dist: tox>=4.0; extra == 'devel'
|
|
54
|
+
Requires-Dist: types-pyyaml; extra == 'devel'
|
|
55
|
+
Requires-Dist: types-requests; extra == 'devel'
|
|
56
|
+
Requires-Dist: types-tqdm; extra == 'devel'
|
|
57
|
+
Provides-Extra: linkml
|
|
58
|
+
Requires-Dist: linkml-runtime>=1.7.0; extra == 'linkml'
|
|
59
|
+
Requires-Dist: linkml>=1.7.0; extra == 'linkml'
|
|
60
|
+
Provides-Extra: test
|
|
61
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'test'
|
|
62
|
+
Requires-Dist: pytest-timeout>=2.0; extra == 'test'
|
|
63
|
+
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
64
|
+
Requires-Dist: responses>=0.23; extra == 'test'
|
|
65
|
+
Description-Content-Type: text/markdown
|
|
66
|
+
|
|
67
|
+
# citations-collector
|
|
68
|
+
|
|
69
|
+
Discover and curate scholarly citations of datasets and software.
|
|
70
|
+
|
|
71
|
+
## Features
|
|
72
|
+
|
|
73
|
+
- **Multi-Source Citation Discovery**: Query CrossRef, OpenCitations, DataCite, and OpenAlex - automatically merges results from multiple sources
|
|
74
|
+
- **External Item Management**: Track items via BibTeX files or dynamically fetch them from the DANDI Archive API
|
|
75
|
+
- **Hierarchical Collections**: Organize citations by project/version (e.g., DANDI dandisets, dataset releases)
|
|
76
|
+
- **Git-Friendly**: YAML collections + TSV citation records for version control
|
|
77
|
+
- **Progress Monitoring**: Real-time progress bars with tqdm for long-running discovery tasks
|
|
78
|
+
- **Intelligent Retry Logic**: Respects rate limits with exponential backoff and Retry-After headers
|
|
79
|
+
- **PDF Acquisition**: Automatically download open-access PDFs via Unpaywall with intelligent HTML/PDF detection
|
|
80
|
+
- **Merge Detection**: Auto-detect preprints that have published versions, using CrossRef relationships
|
|
81
|
+
- **Zotero Integration**: Sync citations to hierarchical Zotero collections with automatic merged item relocation
|
|
82
|
+
- **Incremental Updates**: Efficiently discover only the citations that are new since the last run
|
|
83
|
+
|
|
84
|
+
## Installation
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Using uv (recommended)
|
|
88
|
+
uv venv
|
|
89
|
+
source .venv/bin/activate
|
|
90
|
+
uv pip install citations-collector
|
|
91
|
+
|
|
92
|
+
# Or using pip
|
|
93
|
+
pip install citations-collector
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Quick Start
|
|
97
|
+
|
|
98
|
+
### 1. Create a Collection
|
|
99
|
+
|
|
100
|
+
Create `collection.yaml`:
|
|
101
|
+
|
|
102
|
+
```yaml
|
|
103
|
+
name: My Research Tools
|
|
104
|
+
description: Software tools used in our lab
|
|
105
|
+
items:
|
|
106
|
+
- item_id: my-tool
|
|
107
|
+
name: "My Analysis Tool"
|
|
108
|
+
flavors:
|
|
109
|
+
- flavor_id: "1.0.0"
|
|
110
|
+
refs:
|
|
111
|
+
- ref_type: doi
|
|
112
|
+
ref_value: "10.5281/zenodo.1234567"
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### 2. Discover Citations
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# Discover citations for all items in collection
|
|
119
|
+
citations-collector discover collection.yaml --output citations.tsv
|
|
120
|
+
|
|
121
|
+
# Use CrossRef polite pool (better rate limits)
|
|
122
|
+
citations-collector discover collection.yaml --email your@email.org
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### 3. View Results
|
|
126
|
+
|
|
127
|
+
Citations are saved to `citations.tsv` - a tab-separated file you can open in Excel or edit manually for curation.
|
|
128
|
+
|
|
129
|
+
**TSV columns include**:
|
|
130
|
+
- Item identifiers (`item_id`, `item_flavor`, `item_ref_type`, `item_ref_value`)
|
|
131
|
+
- Citation metadata (`citation_doi`, `citation_title`, `citation_authors`, `citation_year`)
|
|
132
|
+
- `citation_sources`: Comma-separated list when found by multiple discoverers (e.g., "crossref, openalex")
|
|
133
|
+
- Curation fields (`citation_status`, `citation_comment`, `curated_by`)
|
|
134
|
+
- Open access tracking (`oa_status`, `pdf_url`, `pdf_path`)
|
|
135
|
+
|
|
136
|
+
## Source Types
|
|
137
|
+
|
|
138
|
+
### BibTeX Source
|
|
139
|
+
|
|
140
|
+
Maintain your items externally in BibTeX format and use citations-collector for discovery:
|
|
141
|
+
|
|
142
|
+
```yaml
|
|
143
|
+
name: DANDI Archive Dandisets
|
|
144
|
+
source:
|
|
145
|
+
type: bibtex
|
|
146
|
+
bibtex_file: ../dandi.bib # Relative or absolute path
|
|
147
|
+
bib_field: doi
|
|
148
|
+
ref_type: doi
|
|
149
|
+
ref_regex: '10\.48324/(?P<item_id>dandi\.\d{6})/(?P<flavor_id>[\d.]+)'
|
|
150
|
+
# update_items omitted - items read from BibTeX, not saved to YAML
|
|
151
|
+
|
|
152
|
+
items: [] # Empty - populated from BibTeX file
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
The regex extracts `item_id` and `flavor_id` from the reference field. Perfect for maintaining items in existing bibliography managers!
|
|
156
|
+
|
|
157
|
+
### DANDI Source
|
|
158
|
+
|
|
159
|
+
Dynamically fetch dandiset metadata from DANDI Archive:
|
|
160
|
+
|
|
161
|
+
```yaml
|
|
162
|
+
source:
|
|
163
|
+
type: dandi
|
|
164
|
+
dandiset_ids:
|
|
165
|
+
- '000402' # MICrONS dataset
|
|
166
|
+
include_draft: false # Only published versions
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Items and versions are automatically populated from the DANDI API - no manual YAML editing required!
|
|
170
|
+
|
|
171
|
+
## Advanced Workflows
|
|
172
|
+
|
|
173
|
+
### PDF Acquisition
|
|
174
|
+
|
|
175
|
+
Automatically download open-access PDFs using Unpaywall:
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
# Fetch PDFs for discovered citations
|
|
179
|
+
citations-collector fetch-pdfs --config collection.yaml
|
|
180
|
+
|
|
181
|
+
# Use git-annex for provenance tracking
|
|
182
|
+
citations-collector fetch-pdfs --config collection.yaml --git-annex
|
|
183
|
+
|
|
184
|
+
# Dry run to see what would be downloaded
|
|
185
|
+
citations-collector fetch-pdfs --config collection.yaml --dry-run
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
PDFs are stored at `pdfs/{doi}/article.pdf` with accompanying `article.bib` BibTeX files.
|
|
189
|
+
|
|
190
|
+
**Smart Features**:
|
|
191
|
+
- **Content-Type Detection**: Automatically detects when server returns HTML instead of PDF and saves with `.html` extension
|
|
192
|
+
- **Retry Logic**: Respects `Retry-After` headers and uses exponential backoff for rate-limited servers (bioRxiv, etc.)
|
|
193
|
+
- **Rate Limiting**: 2-second delay between downloads to avoid triggering bot detection
|
|
194
|
+
- **Skip Existing**: Won't re-download files that already exist (checks both `.pdf` and `.html` extensions)
|
|
195
|
+
|
|
196
|
+
### Merge Detection
|
|
197
|
+
|
|
198
|
+
Detect preprints that have published versions:
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
# Detect merges via CrossRef relationships
|
|
202
|
+
citations-collector detect-merges --config collection.yaml
|
|
203
|
+
|
|
204
|
+
# Also run fuzzy title matching (use with caution)
|
|
205
|
+
citations-collector detect-merges --config collection.yaml --fuzzy-match
|
|
206
|
+
|
|
207
|
+
# Preview without updating
|
|
208
|
+
citations-collector detect-merges --config collection.yaml --dry-run
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Detected preprints are marked with `citation_status=merged` and `citation_merged_into={published_doi}`.
|
|
212
|
+
|
|
213
|
+
### Zotero Sync
|
|
214
|
+
|
|
215
|
+
Sync citations to Zotero for collaborative browsing:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
# Sync to Zotero (requires API key in config or env)
|
|
219
|
+
citations-collector sync-zotero --config collection.yaml
|
|
220
|
+
|
|
221
|
+
# Dry run to preview structure
|
|
222
|
+
citations-collector sync-zotero --config collection.yaml --dry-run
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Zotero hierarchy:
|
|
226
|
+
```
|
|
227
|
+
Top Collection/
|
|
228
|
+
├── {item_id}/
|
|
229
|
+
│ ├── {flavor}/
|
|
230
|
+
│ │ ├── <active citations>
|
|
231
|
+
│ │ └── Merged/
|
|
232
|
+
│ │ └── <preprints and old versions>
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Unified Configuration
|
|
236
|
+
|
|
237
|
+
Create a unified `collection.yaml` with all settings:
|
|
238
|
+
|
|
239
|
+
```yaml
|
|
240
|
+
name: My Research Collection
|
|
241
|
+
description: Tools and datasets from our lab
|
|
242
|
+
|
|
243
|
+
# Option 1: BibTeX source (external item management)
|
|
244
|
+
source:
|
|
245
|
+
type: bibtex
|
|
246
|
+
bibtex_file: ../my-items.bib
|
|
247
|
+
bib_field: doi
|
|
248
|
+
ref_type: doi
|
|
249
|
+
ref_regex: '(?P<item_id>[\w.-]+)/(?P<flavor_id>[\d.]+)'
|
|
250
|
+
|
|
251
|
+
# Option 2: DANDI source (dynamic API fetch)
|
|
252
|
+
# source:
|
|
253
|
+
# type: dandi
|
|
254
|
+
# dandiset_ids: ['000055']
|
|
255
|
+
# include_draft: false
|
|
256
|
+
|
|
257
|
+
# Option 3: Manual items
|
|
258
|
+
# items:
|
|
259
|
+
# - item_id: my-tool
|
|
260
|
+
# name: "My Analysis Tool"
|
|
261
|
+
# flavors:
|
|
262
|
+
# - flavor_id: "1.0.0"
|
|
263
|
+
# refs:
|
|
264
|
+
# - ref_type: doi
|
|
265
|
+
# ref_value: "10.5281/zenodo.1234567"
|
|
266
|
+
|
|
267
|
+
# Citation discovery settings (uses all 4 sources by default)
|
|
268
|
+
discover:
|
|
269
|
+
email: your@email.org # For CrossRef/OpenAlex polite pool
|
|
270
|
+
incremental: true
|
|
271
|
+
|
|
272
|
+
# PDF acquisition settings (optional)
|
|
273
|
+
pdfs:
|
|
274
|
+
output_dir: pdfs/
|
|
275
|
+
unpaywall_email: your@email.org
|
|
276
|
+
git_annex: false
|
|
277
|
+
|
|
278
|
+
# Zotero sync settings (optional)
|
|
279
|
+
zotero:
|
|
280
|
+
library_type: group
|
|
281
|
+
library_id: "12345"
|
|
282
|
+
api_key: "YOUR_API_KEY" # Or set ZOTERO_API_KEY env var
|
|
283
|
+
top_collection_key: "ABCD1234"
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
Then run the full workflow:
|
|
287
|
+
|
|
288
|
+
```bash
|
|
289
|
+
# 1. Discover citations
|
|
290
|
+
citations-collector discover collection.yaml
|
|
291
|
+
|
|
292
|
+
# 2. Fetch open-access PDFs
|
|
293
|
+
citations-collector fetch-pdfs --config collection.yaml
|
|
294
|
+
|
|
295
|
+
# 3. Detect merged preprints
|
|
296
|
+
citations-collector detect-merges --config collection.yaml
|
|
297
|
+
|
|
298
|
+
# 4. Sync to Zotero
|
|
299
|
+
citations-collector sync-zotero --config collection.yaml
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
## Library Usage
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
from citations_collector import CitationCollector
|
|
306
|
+
|
|
307
|
+
# Load collection
|
|
308
|
+
collector = CitationCollector.from_yaml("collection.yaml")
|
|
309
|
+
|
|
310
|
+
# Discover citations (incremental by default)
|
|
311
|
+
collector.discover_all(incremental=True, email="your@email.org")
|
|
312
|
+
|
|
313
|
+
# Save results
|
|
314
|
+
collector.save("collection.yaml", "citations.tsv")
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## Real-World Examples
|
|
318
|
+
|
|
319
|
+
See the `examples/` directory for production configurations:
|
|
320
|
+
|
|
321
|
+
### DANDI Archive (`dandi-collection.yaml`)
|
|
322
|
+
Tracks all 850+ DANDI dandisets via an external BibTeX file maintained by the `dandi-citations` tool. Items are extracted from versioned DOIs like `10.48324/dandi.000003/0.210812.1448` using regex parsing. Perfect example of external item management - the YAML stays clean while the BibTeX file is the source of truth.
|
|
323
|
+
|
|
324
|
+
**Complete pipeline**: See [dandi/dandi-bib](https://github.com/dandi/dandi-bib) for the full production setup including automation, BibTeX generation, and citation tracking workflows.
|
|
325
|
+
|
|
326
|
+
**Key features**: BibTeX source, regex extraction, external maintenance
|
|
327
|
+
|
|
328
|
+
### MICrONS Dataset (`microns-collection.yaml`)
|
|
329
|
+
Machine Intelligence from Cortical Networks - a cubic millimeter of mouse visual cortex. Demonstrates **dynamic source fetching** from the DANDI API for dandiset 000402, plus manually curated items for the Nature paper and BOSS database entry. Shows how to mix a DANDI source with manual items.
|
|
330
|
+
|
|
331
|
+
**Key features**: DANDI API source, mixed item sources, multi-resource tracking
|
|
332
|
+
|
|
333
|
+
### StudyForrest (`studyforrest.yaml`)
|
|
334
|
+
High-resolution 7-Tesla fMRI dataset with complex naturalistic stimulation. Tracks 11 papers (dataset papers, analysis papers, protocols) all citing back to the main dataset. Real example of tracking a dataset with multiple associated publications.
|
|
335
|
+
|
|
336
|
+
**Key features**: Multiple papers for single dataset, manual item definition
|
|
337
|
+
|
|
338
|
+
### ReproNim Tools (`repronim-tools.yaml`)
|
|
339
|
+
Neuroimaging software tools tracked via RRIDs (Research Resource Identifiers), GitHub repos, Zenodo releases, and DOIs. Demonstrates tracking software across multiple identifier types and versions. **Over 2500 citations** discovered!
|
|
340
|
+
|
|
341
|
+
**Key features**: RRIDs, GitHub repos, Zenodo, multi-identifier tracking
|
|
342
|
+
|
|
343
|
+
### Simple Resources (`simple-resources.yaml`)
|
|
344
|
+
Basic example for getting started - minimal configuration without versioning complexities.
|
|
345
|
+
|
|
346
|
+
### Sample Collections
|
|
347
|
+
- **dandi-sample-collection.yaml**: Small subset of DANDI dandisets for testing
|
|
348
|
+
- **citations-example.tsv**: Curated citation records showing merge detection, status flags
|
|
349
|
+
|
|
350
|
+
## Development
|
|
351
|
+
|
|
352
|
+
### Setup
|
|
353
|
+
|
|
354
|
+
```bash
|
|
355
|
+
# Clone repository
|
|
356
|
+
git clone https://github.com/dandi/citations-collector.git
|
|
357
|
+
cd citations-collector
|
|
358
|
+
|
|
359
|
+
# Setup development environment
|
|
360
|
+
uv venv
|
|
361
|
+
source .venv/bin/activate
|
|
362
|
+
uv pip install -e ".[devel]"
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
### Running Tests
|
|
366
|
+
|
|
367
|
+
```bash
|
|
368
|
+
# Run all tests, linting, and type checking
|
|
369
|
+
tox
|
|
370
|
+
|
|
371
|
+
# Run specific environment
|
|
372
|
+
tox -e py312 # Tests on Python 3.12
|
|
373
|
+
tox -e lint # Ruff linting
|
|
374
|
+
tox -e type # Mypy type checking
|
|
375
|
+
tox -e cov # Coverage report
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
### Regenerating LinkML Models
|
|
379
|
+
|
|
380
|
+
When `schema/citations.yaml` changes:
|
|
381
|
+
|
|
382
|
+
```bash
|
|
383
|
+
# Install linkml tools
|
|
384
|
+
uv pip install -e ".[linkml]"
|
|
385
|
+
|
|
386
|
+
# Regenerate Pydantic models
|
|
387
|
+
gen-pydantic schema/citations.yaml > src/citations_collector/models/generated.py
|
|
388
|
+
|
|
389
|
+
# Regenerate JSON Schema
|
|
390
|
+
gen-json-schema schema/citations.yaml > schema/citations.schema.json
|
|
391
|
+
|
|
392
|
+
# Commit generated files
|
|
393
|
+
git add src/citations_collector/models/generated.py schema/citations.schema.json
|
|
394
|
+
git commit -m "Regenerate LinkML models"
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
## Architecture
|
|
398
|
+
|
|
399
|
+
- **Library-First Design**: All functionality accessible programmatically
|
|
400
|
+
- **LinkML Schema**: Validated data models from `schema/citations.yaml`
|
|
401
|
+
- **Modular Structure**:
|
|
402
|
+
- `discovery/`: Citation API clients (CrossRef, OpenCitations, DataCite, OpenAlex)
|
|
403
|
+
- `persistence/`: YAML/TSV I/O with multi-source support
|
|
404
|
+
- `importers/`: DANDI API, BibTeX, Zenodo, GitHub integrations
|
|
405
|
+
- `unpaywall.py`: Unpaywall API client for OA PDF URLs
|
|
406
|
+
- `pdf.py`: PDF acquisition with retry logic, Retry-After support, and git-annex tracking
|
|
407
|
+
- `merge_detection.py`: Preprint/published version detection
|
|
408
|
+
- `zotero_sync.py`: Zotero hierarchical sync with merged item handling
|
|
409
|
+
- `core.py`: Main orchestration API with progress bars
|
|
410
|
+
- `cli.py`: Click-based CLI (thin wrapper)
|
|
411
|
+
|
|
412
|
+
**Recent Improvements**:
|
|
413
|
+
- Multi-source citation deduplication and tracking
|
|
414
|
+
- Retry logic with exponential backoff and Retry-After header support
|
|
415
|
+
- HTML vs PDF content detection for downloads
|
|
416
|
+
- Progress bars with real-time logging
|
|
417
|
+
- BibTeX import for external item management
|
|
418
|
+
|
|
419
|
+
## Citation Sources
|
|
420
|
+
|
|
421
|
+
All four sources are queried in parallel, and results are automatically deduplicated and merged:
|
|
422
|
+
|
|
423
|
+
- **CrossRef**: Most comprehensive, best for journal articles and conference papers
|
|
424
|
+
- **OpenCitations**: Open citation index, community-maintained
|
|
425
|
+
- **DataCite**: Specialized for dataset citations and research data
|
|
426
|
+
- **OpenAlex**: Broad academic coverage including preprints, with additional metadata
|
|
427
|
+
|
|
428
|
+
**Multi-Source Tracking**: When the same citation is found by multiple sources (e.g., both CrossRef and OpenAlex), it's stored as a single row in the TSV with `citation_sources: "crossref, openalex"`. This helps verify citation coverage and identify which sources are most useful for your domain.
|
|
429
|
+
|
|
430
|
+
**Future Sources**: Europe PMC (PubMed-indexed papers), Semantic Scholar (AI-powered discovery)
|
|
431
|
+
|
|
432
|
+
## License
|
|
433
|
+
|
|
434
|
+
MIT License - see the LICENSE file for details.
|
|
435
|
+
|
|
436
|
+
## Contributing
|
|
437
|
+
|
|
438
|
+
See CONSTITUTION.md for:
|
|
439
|
+
- Code standards (Ruff, mypy, type hints)
|
|
440
|
+
- Testing requirements (pytest, 100 lines max, mock HTTP)
|
|
441
|
+
- Architecture principles (library-first, reliability, simplicity)
|
|
442
|
+
|
|
443
|
+
Pull requests welcome!
|
|
444
|
+
|
|
445
|
+
## Citation
|
|
446
|
+
|
|
447
|
+
If you use citations-collector in your research, please cite:
|
|
448
|
+
|
|
449
|
+
```bibtex
|
|
450
|
+
@software{citations_collector,
|
|
451
|
+
title = {citations-collector: Discover and curate scholarly citations},
|
|
452
|
+
author = {{DANDI Team}},
|
|
453
|
+
url = {https://github.com/dandi/citations-collector},
|
|
454
|
+
license = {MIT}
|
|
455
|
+
}
|
|
456
|
+
```
|