citations-collector 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. citations_collector-0.2.1/.gitignore +4 -0
  2. citations_collector-0.2.1/PKG-INFO +354 -0
  3. citations_collector-0.2.1/README.md +293 -0
  4. citations_collector-0.2.1/examples/citations-example.tsv +7 -0
  5. citations_collector-0.2.1/examples/dandi-sample-collection.yaml +64 -0
  6. citations_collector-0.2.1/examples/github-ci-workflow.yaml +118 -0
  7. citations_collector-0.2.1/examples/microns-citations.tsv.bak +4 -0
  8. citations_collector-0.2.1/examples/microns-collection.yaml +50 -0
  9. citations_collector-0.2.1/examples/repronim-tools.yaml +122 -0
  10. citations_collector-0.2.1/examples/simple-resources.yaml +30 -0
  11. citations_collector-0.2.1/examples/studyforrest-preseeded.tsv +24 -0
  12. citations_collector-0.2.1/examples/studyforrest.yaml +33 -0
  13. citations_collector-0.2.1/pyproject.toml +129 -0
  14. citations_collector-0.2.1/schema/citations.schema.json +637 -0
  15. citations_collector-0.2.1/schema/citations.yaml +518 -0
  16. citations_collector-0.2.1/src/citations_collector/__init__.py +18 -0
  17. citations_collector-0.2.1/src/citations_collector/_version.py +34 -0
  18. citations_collector-0.2.1/src/citations_collector/cli.py +525 -0
  19. citations_collector-0.2.1/src/citations_collector/core.py +336 -0
  20. citations_collector-0.2.1/src/citations_collector/discovery/__init__.py +17 -0
  21. citations_collector-0.2.1/src/citations_collector/discovery/base.py +26 -0
  22. citations_collector-0.2.1/src/citations_collector/discovery/crossref.py +196 -0
  23. citations_collector-0.2.1/src/citations_collector/discovery/datacite.py +260 -0
  24. citations_collector-0.2.1/src/citations_collector/discovery/openalex.py +253 -0
  25. citations_collector-0.2.1/src/citations_collector/discovery/opencitations.py +168 -0
  26. citations_collector-0.2.1/src/citations_collector/discovery/utils.py +40 -0
  27. citations_collector-0.2.1/src/citations_collector/importers/__init__.py +15 -0
  28. citations_collector-0.2.1/src/citations_collector/importers/dandi.py +314 -0
  29. citations_collector-0.2.1/src/citations_collector/importers/github.py +147 -0
  30. citations_collector-0.2.1/src/citations_collector/importers/zenodo.py +110 -0
  31. citations_collector-0.2.1/src/citations_collector/importers/zotero.py +262 -0
  32. citations_collector-0.2.1/src/citations_collector/merge_detection.py +216 -0
  33. citations_collector-0.2.1/src/citations_collector/models/__init__.py +44 -0
  34. citations_collector-0.2.1/src/citations_collector/models/generated.py +544 -0
  35. citations_collector-0.2.1/src/citations_collector/pdf.py +152 -0
  36. citations_collector-0.2.1/src/citations_collector/persistence/__init__.py +7 -0
  37. citations_collector-0.2.1/src/citations_collector/persistence/tsv_io.py +94 -0
  38. citations_collector-0.2.1/src/citations_collector/persistence/yaml_io.py +50 -0
  39. citations_collector-0.2.1/src/citations_collector/py.typed +0 -0
  40. citations_collector-0.2.1/src/citations_collector/unpaywall.py +60 -0
  41. citations_collector-0.2.1/src/citations_collector/zotero_sync.py +591 -0
  42. citations_collector-0.2.1/tests/INTEGRATION_TESTS.md +284 -0
  43. citations_collector-0.2.1/tests/__init__.py +1 -0
  44. citations_collector-0.2.1/tests/conftest.py +31 -0
  45. citations_collector-0.2.1/tests/fixtures/collections/repronim-tools.yaml +131 -0
  46. citations_collector-0.2.1/tests/fixtures/collections/simple.yaml +10 -0
  47. citations_collector-0.2.1/tests/fixtures/responses/crossref_empty.json +8 -0
  48. citations_collector-0.2.1/tests/fixtures/responses/crossref_success.json +27 -0
  49. citations_collector-0.2.1/tests/fixtures/responses/datacite_empty.json +6 -0
  50. citations_collector-0.2.1/tests/fixtures/responses/datacite_success.json +35 -0
  51. citations_collector-0.2.1/tests/fixtures/responses/opencitations_success.json +8 -0
  52. citations_collector-0.2.1/tests/fixtures/tsv/citations-example.tsv +7 -0
  53. citations_collector-0.2.1/tests/fixtures/tsv/simple.tsv +2 -0
  54. citations_collector-0.2.1/tests/test_cli.py +157 -0
  55. citations_collector-0.2.1/tests/test_core.py +151 -0
  56. citations_collector-0.2.1/tests/test_discovery.py +328 -0
  57. citations_collector-0.2.1/tests/test_discovery_openalex.py +334 -0
  58. citations_collector-0.2.1/tests/test_importers.py +202 -0
  59. citations_collector-0.2.1/tests/test_importers_dandi.py +391 -0
  60. citations_collector-0.2.1/tests/test_importers_zotero.py +370 -0
  61. citations_collector-0.2.1/tests/test_integration.py +316 -0
  62. citations_collector-0.2.1/tests/test_merge_detection.py +309 -0
  63. citations_collector-0.2.1/tests/test_multi_source_validation.py +108 -0
  64. citations_collector-0.2.1/tests/test_pdf.py +166 -0
  65. citations_collector-0.2.1/tests/test_persistence.py +165 -0
  66. citations_collector-0.2.1/tests/test_unpaywall.py +99 -0
  67. citations_collector-0.2.1/tests/test_zotero_sync.py +396 -0
@@ -0,0 +1,4 @@
1
+ .npm/
2
+ __pycache__/
3
+ *.lock
4
+ src/citations_collector/_version.py
@@ -0,0 +1,354 @@
1
+ Metadata-Version: 2.4
2
+ Name: citations-collector
3
+ Version: 0.2.1
4
+ Summary: Discover and curate scholarly citations of datasets and software
5
+ Project-URL: Homepage, https://github.com/dandi/citations-collector
6
+ Project-URL: Repository, https://github.com/dandi/citations-collector
7
+ Project-URL: Issues, https://github.com/dandi/citations-collector/issues
8
+ Author-email: DANDI <team@dandiarchive.org>
9
+ License: MIT
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: click>=8.0
20
+ Requires-Dist: pydantic>=2.0
21
+ Requires-Dist: pyyaml>=6.0
22
+ Requires-Dist: pyzotero>=1.5
23
+ Requires-Dist: rapidfuzz>=3.0
24
+ Requires-Dist: requests>=2.28
25
+ Provides-Extra: ci
26
+ Requires-Dist: mypy>=1.0; extra == 'ci'
27
+ Requires-Dist: pre-commit>=3.0; extra == 'ci'
28
+ Requires-Dist: pytest-cov>=4.0; extra == 'ci'
29
+ Requires-Dist: pytest-timeout>=2.0; extra == 'ci'
30
+ Requires-Dist: pytest>=7.0; extra == 'ci'
31
+ Requires-Dist: responses>=0.23; extra == 'ci'
32
+ Requires-Dist: ruff>=0.1.0; extra == 'ci'
33
+ Requires-Dist: tox-gh-actions>=3.0; extra == 'ci'
34
+ Requires-Dist: tox-uv>=1.0; extra == 'ci'
35
+ Requires-Dist: tox>=4.0; extra == 'ci'
36
+ Requires-Dist: types-pyyaml; extra == 'ci'
37
+ Requires-Dist: types-requests; extra == 'ci'
38
+ Provides-Extra: dandi
39
+ Requires-Dist: dandi>=0.60.0; extra == 'dandi'
40
+ Provides-Extra: devel
41
+ Requires-Dist: mypy>=1.0; extra == 'devel'
42
+ Requires-Dist: pre-commit>=3.0; extra == 'devel'
43
+ Requires-Dist: pytest-cov>=4.0; extra == 'devel'
44
+ Requires-Dist: pytest-timeout>=2.0; extra == 'devel'
45
+ Requires-Dist: pytest>=7.0; extra == 'devel'
46
+ Requires-Dist: responses>=0.23; extra == 'devel'
47
+ Requires-Dist: ruff>=0.1.0; extra == 'devel'
48
+ Requires-Dist: tox-uv>=1.0; extra == 'devel'
49
+ Requires-Dist: tox>=4.0; extra == 'devel'
50
+ Requires-Dist: types-pyyaml; extra == 'devel'
51
+ Requires-Dist: types-requests; extra == 'devel'
52
+ Provides-Extra: linkml
53
+ Requires-Dist: linkml-runtime>=1.7.0; extra == 'linkml'
54
+ Requires-Dist: linkml>=1.7.0; extra == 'linkml'
55
+ Provides-Extra: test
56
+ Requires-Dist: pytest-cov>=4.0; extra == 'test'
57
+ Requires-Dist: pytest-timeout>=2.0; extra == 'test'
58
+ Requires-Dist: pytest>=7.0; extra == 'test'
59
+ Requires-Dist: responses>=0.23; extra == 'test'
60
+ Description-Content-Type: text/markdown
61
+
62
+ # citations-collector
63
+
64
+ Discover and curate scholarly citations of datasets and software.
65
+
66
+ ## Features
67
+
68
+ - **Citation Discovery**: Query CrossRef, OpenCitations, DataCite, and OpenAlex for citing papers
69
+ - **Hierarchical Collections**: Organize citations by project/version (e.g., DANDI dandisets)
70
+ - **Git-Friendly**: YAML collections + TSV citation records for version control
71
+ - **Curation Workflow**: Mark citations as ignored, merge preprints with published versions
72
+ - **PDF Acquisition**: Automatically download open-access PDFs via Unpaywall with optional git-annex tracking
73
+ - **Merge Detection**: Auto-detect preprints with published versions using CrossRef relationships
74
+ - **Zotero Integration**: Sync citations to hierarchical Zotero collections with automatic merged item relocation
75
+ - **Incremental Updates**: Efficiently discover only new citations since last run
76
+
77
+ ## Installation
78
+
79
+ ```bash
80
+ # Using uv (recommended)
81
+ uv venv
82
+ source .venv/bin/activate
83
+ uv pip install citations-collector
84
+
85
+ # Or using pip
86
+ pip install citations-collector
87
+ ```
88
+
89
+ ## Quick Start
90
+
91
+ ### 1. Create a Collection
92
+
93
+ Create `collection.yaml`:
94
+
95
+ ```yaml
96
+ name: My Research Tools
97
+ description: Software tools used in our lab
98
+ items:
99
+ - item_id: my-tool
100
+ name: "My Analysis Tool"
101
+ flavors:
102
+ - flavor_id: "1.0.0"
103
+ refs:
104
+ - ref_type: doi
105
+ ref_value: "10.5281/zenodo.1234567"
106
+ ```
107
+
108
+ ### 2. Discover Citations
109
+
110
+ ```bash
111
+ # Discover citations for all items in collection
112
+ citations-collector discover collection.yaml --output citations.tsv
113
+
114
+ # Use CrossRef polite pool (better rate limits)
115
+ citations-collector discover collection.yaml --email your@email.org
116
+ ```
117
+
118
+ ### 3. View Results
119
+
120
+ Citations are saved to `citations.tsv` - a tab-separated file you can open in Excel or edit manually for curation.
121
+
122
+ ## Advanced Workflows
123
+
124
+ ### PDF Acquisition
125
+
126
+ Automatically download open-access PDFs using Unpaywall:
127
+
128
+ ```bash
129
+ # Fetch PDFs for discovered citations
130
+ citations-collector fetch-pdfs --config collection.yaml
131
+
132
+ # Use git-annex for provenance tracking
133
+ citations-collector fetch-pdfs --config collection.yaml --git-annex
134
+
135
+ # Dry run to see what would be downloaded
136
+ citations-collector fetch-pdfs --config collection.yaml --dry-run
137
+ ```
138
+
139
+ PDFs are stored at `pdfs/{doi}/article.pdf` with accompanying `article.bib` BibTeX files.
140
+
141
+ ### Merge Detection
142
+
143
+ Detect preprints that have published versions:
144
+
145
+ ```bash
146
+ # Detect merges via CrossRef relationships
147
+ citations-collector detect-merges --config collection.yaml
148
+
149
+ # Also run fuzzy title matching (use with caution)
150
+ citations-collector detect-merges --config collection.yaml --fuzzy-match
151
+
152
+ # Preview without updating
153
+ citations-collector detect-merges --config collection.yaml --dry-run
154
+ ```
155
+
156
+ Detected preprints are marked with `citation_status=merged` and `citation_merged_into={published_doi}`.
157
+
158
+ ### Zotero Sync
159
+
160
+ Sync citations to Zotero for collaborative browsing:
161
+
162
+ ```bash
163
+ # Sync to Zotero (requires API key in config or env)
164
+ citations-collector sync-zotero --config collection.yaml
165
+
166
+ # Dry run to preview structure
167
+ citations-collector sync-zotero --config collection.yaml --dry-run
168
+ ```
169
+
170
+ Zotero hierarchy:
171
+ ```
172
+ Top Collection/
173
+ ├── {item_id}/
174
+ │ ├── {flavor}/
175
+ │ │ ├── <active citations>
176
+ │ │ └── Merged/
177
+ │ │ └── <preprints and old versions>
178
+ ```
179
+
180
+ ### Unified Configuration
181
+
182
+ Create a unified `collection.yaml` with all settings:
183
+
184
+ ```yaml
185
+ name: My Research Collection
186
+ description: Tools and datasets from our lab
187
+
188
+ # Source items to track
189
+ source:
190
+ items:
191
+ - item_id: dandi-000055
192
+ name: "AJILE12: Long-term naturalistic human intracranial neural recordings"
193
+ flavors:
194
+ - flavor_id: "0.220113.0400"
195
+ refs:
196
+ - ref_type: doi
197
+ ref_value: "10.48324/dandi.000055/0.220113.0400"
198
+
199
+ # Citation discovery settings
200
+ discover:
201
+ sources:
202
+ - crossref
203
+ - opencitations
204
+ email: your@email.org # For CrossRef polite pool
205
+ incremental: true
206
+
207
+ # PDF acquisition settings (optional)
208
+ pdfs:
209
+ output_dir: pdfs/
210
+ unpaywall_email: your@email.org
211
+ git_annex: false
212
+
213
+ # Zotero sync settings (optional)
214
+ zotero:
215
+ library_type: group
216
+ library_id: "12345"
217
+ api_key: "YOUR_API_KEY" # Or set ZOTERO_API_KEY env var
218
+ top_collection_key: "ABCD1234"
219
+ ```
220
+
221
+ Then run the full workflow:
222
+
223
+ ```bash
224
+ # 1. Discover citations
225
+ citations-collector discover collection.yaml
226
+
227
+ # 2. Fetch open-access PDFs
228
+ citations-collector fetch-pdfs --config collection.yaml
229
+
230
+ # 3. Detect merged preprints
231
+ citations-collector detect-merges --config collection.yaml
232
+
233
+ # 4. Sync to Zotero
234
+ citations-collector sync-zotero --config collection.yaml
235
+ ```
236
+
237
+ ## Library Usage
238
+
239
+ ```python
240
+ from citations_collector import CitationCollector
241
+
242
+ # Load collection
243
+ collector = CitationCollector.from_yaml("collection.yaml")
244
+
245
+ # Discover citations (incremental by default)
246
+ collector.discover_all(incremental=True, email="your@email.org")
247
+
248
+ # Save results
249
+ collector.save("collection.yaml", "citations.tsv")
250
+ ```
251
+
252
+ ## Examples
253
+
254
+ See the `examples/` directory for:
255
+ - **dandi-sample-collection.yaml**: DANDI Archive dandisets with versioned DOIs
256
+ - **repronim-tools.yaml**: ReproNim neuroimaging tools with RRIDs
257
+ - **simple-resources.yaml**: Basic collection without versioning
258
+ - **citations-example.tsv**: Example citation records with curation
259
+
260
+ ## Development
261
+
262
+ ### Setup
263
+
264
+ ```bash
265
+ # Clone repository
266
+ git clone https://github.com/dandi/citations-collector.git
267
+ cd citations-collector
268
+
269
+ # Setup development environment
270
+ uv venv
271
+ source .venv/bin/activate
272
+ uv pip install -e ".[devel]"
273
+ ```
274
+
275
+ ### Running Tests
276
+
277
+ ```bash
278
+ # Run all tests, linting, and type checking
279
+ tox
280
+
281
+ # Run specific environment
282
+ tox -e py312 # Tests on Python 3.12
283
+ tox -e lint # Ruff linting
284
+ tox -e type # Mypy type checking
285
+ tox -e cov # Coverage report
286
+ ```
287
+
288
+ ### Regenerating LinkML Models
289
+
290
+ When `schema/citations.yaml` changes:
291
+
292
+ ```bash
293
+ # Install linkml tools
294
+ uv pip install -e ".[linkml]"
295
+
296
+ # Regenerate Pydantic models
297
+ gen-pydantic schema/citations.yaml > src/citations_collector/models/generated.py
298
+
299
+ # Regenerate JSON Schema
300
+ gen-json-schema schema/citations.yaml > schema/citations.schema.json
301
+
302
+ # Commit generated files
303
+ git add src/citations_collector/models/generated.py schema/citations.schema.json
304
+ git commit -m "Regenerate LinkML models"
305
+ ```
306
+
307
+ ## Architecture
308
+
309
+ - **Library-First Design**: All functionality accessible programmatically
310
+ - **LinkML Schema**: Validated data models from `schema/citations.yaml`
311
+ - **Modular Structure**:
312
+ - `discovery/`: Citation API clients (CrossRef, OpenCitations, DataCite, OpenAlex)
313
+ - `persistence/`: YAML/TSV I/O
314
+ - `importers/`: DANDI API, Zenodo, GitHub, Zotero integrations
315
+ - `unpaywall.py`: Unpaywall API client for OA PDF URLs
316
+ - `pdf.py`: PDF acquisition with git-annex support
317
+ - `merge_detection.py`: Preprint/published version detection
318
+ - `zotero_sync.py`: Zotero hierarchical sync with merged item handling
319
+ - `core.py`: Main orchestration API
320
+ - `cli.py`: Click-based CLI (thin wrapper)
321
+
322
+ ## Citation Sources
323
+
324
+ - **CrossRef**: Most comprehensive, best for DOI citations
325
+ - **OpenCitations**: Open index, may lag behind CrossRef
326
+ - **DataCite**: Good for dataset citations
327
+ - **Europe PMC**: PubMed-indexed papers (future)
328
+ - **Semantic Scholar**: AI-powered citation discovery (future)
329
+
330
+ ## License
331
+
332
+ MIT License - see LICENSE file for details.
333
+
334
+ ## Contributing
335
+
336
+ See CONSTITUTION.md for:
337
+ - Code standards (Ruff, mypy, type hints)
338
+ - Testing requirements (pytest, 100 lines max, mock HTTP)
339
+ - Architecture principles (library-first, reliability, simplicity)
340
+
341
+ Pull requests welcome!
342
+
343
+ ## Citation
344
+
345
+ If you use citations-collector in your research, please cite:
346
+
347
+ ```bibtex
348
+ @software{citations_collector,
349
+ title = {citations-collector: Discover and curate scholarly citations},
350
+ author = {{DANDI Team}},
351
+ url = {https://github.com/dandi/citations-collector},
352
+ license = {MIT}
353
+ }
354
+ ```
@@ -0,0 +1,293 @@
1
+ # citations-collector
2
+
3
+ Discover and curate scholarly citations of datasets and software.
4
+
5
+ ## Features
6
+
7
+ - **Citation Discovery**: Query CrossRef, OpenCitations, DataCite, and OpenAlex for citing papers
8
+ - **Hierarchical Collections**: Organize citations by project/version (e.g., DANDI dandisets)
9
+ - **Git-Friendly**: YAML collections + TSV citation records for version control
10
+ - **Curation Workflow**: Mark citations as ignored, merge preprints with published versions
11
+ - **PDF Acquisition**: Automatically download open-access PDFs via Unpaywall with optional git-annex tracking
12
+ - **Merge Detection**: Auto-detect preprints with published versions using CrossRef relationships
13
+ - **Zotero Integration**: Sync citations to hierarchical Zotero collections with automatic merged item relocation
14
+ - **Incremental Updates**: Efficiently discover only new citations since last run
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ # Using uv (recommended)
20
+ uv venv
21
+ source .venv/bin/activate
22
+ uv pip install citations-collector
23
+
24
+ # Or using pip
25
+ pip install citations-collector
26
+ ```
27
+
28
+ ## Quick Start
29
+
30
+ ### 1. Create a Collection
31
+
32
+ Create `collection.yaml`:
33
+
34
+ ```yaml
35
+ name: My Research Tools
36
+ description: Software tools used in our lab
37
+ items:
38
+ - item_id: my-tool
39
+ name: "My Analysis Tool"
40
+ flavors:
41
+ - flavor_id: "1.0.0"
42
+ refs:
43
+ - ref_type: doi
44
+ ref_value: "10.5281/zenodo.1234567"
45
+ ```
46
+
47
+ ### 2. Discover Citations
48
+
49
+ ```bash
50
+ # Discover citations for all items in collection
51
+ citations-collector discover collection.yaml --output citations.tsv
52
+
53
+ # Use CrossRef polite pool (better rate limits)
54
+ citations-collector discover collection.yaml --email your@email.org
55
+ ```
56
+
57
+ ### 3. View Results
58
+
59
+ Citations are saved to `citations.tsv` - a tab-separated file you can open in Excel or edit manually for curation.
60
+
61
+ ## Advanced Workflows
62
+
63
+ ### PDF Acquisition
64
+
65
+ Automatically download open-access PDFs using Unpaywall:
66
+
67
+ ```bash
68
+ # Fetch PDFs for discovered citations
69
+ citations-collector fetch-pdfs --config collection.yaml
70
+
71
+ # Use git-annex for provenance tracking
72
+ citations-collector fetch-pdfs --config collection.yaml --git-annex
73
+
74
+ # Dry run to see what would be downloaded
75
+ citations-collector fetch-pdfs --config collection.yaml --dry-run
76
+ ```
77
+
78
+ PDFs are stored at `pdfs/{doi}/article.pdf` with accompanying `article.bib` BibTeX files.
79
+
80
+ ### Merge Detection
81
+
82
+ Detect preprints that have published versions:
83
+
84
+ ```bash
85
+ # Detect merges via CrossRef relationships
86
+ citations-collector detect-merges --config collection.yaml
87
+
88
+ # Also run fuzzy title matching (use with caution)
89
+ citations-collector detect-merges --config collection.yaml --fuzzy-match
90
+
91
+ # Preview without updating
92
+ citations-collector detect-merges --config collection.yaml --dry-run
93
+ ```
94
+
95
+ Detected preprints are marked with `citation_status=merged` and `citation_merged_into={published_doi}`.
96
+
97
+ ### Zotero Sync
98
+
99
+ Sync citations to Zotero for collaborative browsing:
100
+
101
+ ```bash
102
+ # Sync to Zotero (requires API key in config or env)
103
+ citations-collector sync-zotero --config collection.yaml
104
+
105
+ # Dry run to preview structure
106
+ citations-collector sync-zotero --config collection.yaml --dry-run
107
+ ```
108
+
109
+ Zotero hierarchy:
110
+ ```
111
+ Top Collection/
112
+ ├── {item_id}/
113
+ │ ├── {flavor}/
114
+ │ │ ├── <active citations>
115
+ │ │ └── Merged/
116
+ │ │ └── <preprints and old versions>
117
+ ```
118
+
119
+ ### Unified Configuration
120
+
121
+ Create a unified `collection.yaml` with all settings:
122
+
123
+ ```yaml
124
+ name: My Research Collection
125
+ description: Tools and datasets from our lab
126
+
127
+ # Source items to track
128
+ source:
129
+ items:
130
+ - item_id: dandi-000055
131
+ name: "AJILE12: Long-term naturalistic human intracranial neural recordings"
132
+ flavors:
133
+ - flavor_id: "0.220113.0400"
134
+ refs:
135
+ - ref_type: doi
136
+ ref_value: "10.48324/dandi.000055/0.220113.0400"
137
+
138
+ # Citation discovery settings
139
+ discover:
140
+ sources:
141
+ - crossref
142
+ - opencitations
143
+ email: your@email.org # For CrossRef polite pool
144
+ incremental: true
145
+
146
+ # PDF acquisition settings (optional)
147
+ pdfs:
148
+ output_dir: pdfs/
149
+ unpaywall_email: your@email.org
150
+ git_annex: false
151
+
152
+ # Zotero sync settings (optional)
153
+ zotero:
154
+ library_type: group
155
+ library_id: "12345"
156
+ api_key: "YOUR_API_KEY" # Or set ZOTERO_API_KEY env var
157
+ top_collection_key: "ABCD1234"
158
+ ```
159
+
160
+ Then run the full workflow:
161
+
162
+ ```bash
163
+ # 1. Discover citations
164
+ citations-collector discover collection.yaml
165
+
166
+ # 2. Fetch open-access PDFs
167
+ citations-collector fetch-pdfs --config collection.yaml
168
+
169
+ # 3. Detect merged preprints
170
+ citations-collector detect-merges --config collection.yaml
171
+
172
+ # 4. Sync to Zotero
173
+ citations-collector sync-zotero --config collection.yaml
174
+ ```
175
+
176
+ ## Library Usage
177
+
178
+ ```python
179
+ from citations_collector import CitationCollector
180
+
181
+ # Load collection
182
+ collector = CitationCollector.from_yaml("collection.yaml")
183
+
184
+ # Discover citations (incremental by default)
185
+ collector.discover_all(incremental=True, email="your@email.org")
186
+
187
+ # Save results
188
+ collector.save("collection.yaml", "citations.tsv")
189
+ ```
190
+
191
+ ## Examples
192
+
193
+ See the `examples/` directory for:
194
+ - **dandi-sample-collection.yaml**: DANDI Archive dandisets with versioned DOIs
195
+ - **repronim-tools.yaml**: ReproNim neuroimaging tools with RRIDs
196
+ - **simple-resources.yaml**: Basic collection without versioning
197
+ - **citations-example.tsv**: Example citation records with curation
198
+
199
+ ## Development
200
+
201
+ ### Setup
202
+
203
+ ```bash
204
+ # Clone repository
205
+ git clone https://github.com/dandi/citations-collector.git
206
+ cd citations-collector
207
+
208
+ # Setup development environment
209
+ uv venv
210
+ source .venv/bin/activate
211
+ uv pip install -e ".[devel]"
212
+ ```
213
+
214
+ ### Running Tests
215
+
216
+ ```bash
217
+ # Run all tests, linting, and type checking
218
+ tox
219
+
220
+ # Run specific environment
221
+ tox -e py312 # Tests on Python 3.12
222
+ tox -e lint # Ruff linting
223
+ tox -e type # Mypy type checking
224
+ tox -e cov # Coverage report
225
+ ```
226
+
227
+ ### Regenerating LinkML Models
228
+
229
+ When `schema/citations.yaml` changes:
230
+
231
+ ```bash
232
+ # Install linkml tools
233
+ uv pip install -e ".[linkml]"
234
+
235
+ # Regenerate Pydantic models
236
+ gen-pydantic schema/citations.yaml > src/citations_collector/models/generated.py
237
+
238
+ # Regenerate JSON Schema
239
+ gen-json-schema schema/citations.yaml > schema/citations.schema.json
240
+
241
+ # Commit generated files
242
+ git add src/citations_collector/models/generated.py schema/citations.schema.json
243
+ git commit -m "Regenerate LinkML models"
244
+ ```
245
+
246
+ ## Architecture
247
+
248
+ - **Library-First Design**: All functionality accessible programmatically
249
+ - **LinkML Schema**: Validated data models from `schema/citations.yaml`
250
+ - **Modular Structure**:
251
+ - `discovery/`: Citation API clients (CrossRef, OpenCitations, DataCite, OpenAlex)
252
+ - `persistence/`: YAML/TSV I/O
253
+ - `importers/`: DANDI API, Zenodo, GitHub, Zotero integrations
254
+ - `unpaywall.py`: Unpaywall API client for OA PDF URLs
255
+ - `pdf.py`: PDF acquisition with git-annex support
256
+ - `merge_detection.py`: Preprint/published version detection
257
+ - `zotero_sync.py`: Zotero hierarchical sync with merged item handling
258
+ - `core.py`: Main orchestration API
259
+ - `cli.py`: Click-based CLI (thin wrapper)
260
+
261
+ ## Citation Sources
262
+
263
+ - **CrossRef**: Most comprehensive, best for DOI citations
264
+ - **OpenCitations**: Open index, may lag behind CrossRef
265
+ - **DataCite**: Good for dataset citations
266
+ - **Europe PMC**: PubMed-indexed papers (future)
267
+ - **Semantic Scholar**: AI-powered citation discovery (future)
268
+
269
+ ## License
270
+
271
+ MIT License - see LICENSE file for details.
272
+
273
+ ## Contributing
274
+
275
+ See CONSTITUTION.md for:
276
+ - Code standards (Ruff, mypy, type hints)
277
+ - Testing requirements (pytest, 100 lines max, mock HTTP)
278
+ - Architecture principles (library-first, reliability, simplicity)
279
+
280
+ Pull requests welcome!
281
+
282
+ ## Citation
283
+
284
+ If you use citations-collector in your research, please cite:
285
+
286
+ ```bibtex
287
+ @software{citations_collector,
288
+ title = {citations-collector: Discover and curate scholarly citations},
289
+ author = {{DANDI Team}},
290
+ url = {https://github.com/dandi/citations-collector},
291
+ license = {MIT}
292
+ }
293
+ ```
@@ -0,0 +1,7 @@
1
+ item_id item_flavor item_ref_type item_ref_value item_name citation_doi citation_title citation_authors citation_year citation_journal citation_relationship citation_type citation_source discovered_date citation_status citation_merged_into citation_comment curated_by curated_date
2
+ dandi:000003 0.210812.1448 doi 10.48324/dandi.000003/0.210812.1448 Hippocampal Granule Cells 10.1016/j.neuron.2022.01.001 Analysis of hippocampal circuits Smith J; Jones A; Wang L 2022 Neuron Cites Publication crossref 2024-01-15 active
3
+ dandi:000003 0.210812.1448 doi 10.48324/dandi.000003/0.210812.1448 Hippocampal Granule Cells 10.1101/2021.12.15.472789 Preprint version of the analysis Smith J; Jones A 2021 bioRxiv Cites Preprint crossref 2024-01-15 merged 10.1016/j.neuron.2022.01.001 Merged with published version curator1 2024-01-20
4
+ dandi:000003 0.220126.1853 doi 10.48324/dandi.000003/0.220126.1853 Hippocampal Granule Cells 10.1523/JNEUROSCI.1234-22.2022 Follow-up analysis using newer version Brown K 2023 J Neuroscience Uses Publication opencitations 2024-02-01 active
5
+ fmriprep latest rrid SCR_016216 fMRIPrep 10.1038/s41596-022-00710-2 Standard preprocessing for fMRI Esteban O; et al 2022 Nature Protocols Describes Publication scicrunch 2024-01-10 active
6
+ fmriprep latest doi 10.5281/zenodo.852659 fMRIPrep 10.1016/j.neuroimage.2023.120123 Large-scale fMRI meta-analysis Chen M; et al 2023 NeuroImage Uses Publication datacite 2024-01-12 active
7
+ fmriprep 23.1.0 doi 10.5281/zenodo.8076123 fMRIPrep 10.1101/2024.01.05.574321 Methods comparison study Lee S 2024 bioRxiv Cites Preprint crossref 2024-02-15 pending Needs review - unclear citation