citations-collector 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- citations_collector/__init__.py +18 -0
- citations_collector/_version.py +34 -0
- citations_collector/cli.py +525 -0
- citations_collector/core.py +503 -0
- citations_collector/discovery/__init__.py +17 -0
- citations_collector/discovery/base.py +26 -0
- citations_collector/discovery/crossref.py +210 -0
- citations_collector/discovery/datacite.py +260 -0
- citations_collector/discovery/openalex.py +252 -0
- citations_collector/discovery/opencitations.py +168 -0
- citations_collector/discovery/utils.py +62 -0
- citations_collector/importers/__init__.py +17 -0
- citations_collector/importers/bibtex.py +178 -0
- citations_collector/importers/dandi.py +314 -0
- citations_collector/importers/github.py +147 -0
- citations_collector/importers/zenodo.py +110 -0
- citations_collector/importers/zotero.py +262 -0
- citations_collector/merge_detection.py +216 -0
- citations_collector/models/__init__.py +44 -0
- citations_collector/models/generated.py +525 -0
- citations_collector/pdf.py +260 -0
- citations_collector/persistence/__init__.py +7 -0
- citations_collector/persistence/tsv_io.py +121 -0
- citations_collector/persistence/yaml_io.py +50 -0
- citations_collector/py.typed +0 -0
- citations_collector/unpaywall.py +60 -0
- citations_collector/zotero_sync.py +591 -0
- citations_collector-0.2.3.dist-info/METADATA +456 -0
- citations_collector-0.2.3.dist-info/RECORD +31 -0
- citations_collector-0.2.3.dist-info/WHEEL +4 -0
- citations_collector-0.2.3.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,525 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import sys
|
|
5
|
+
from datetime import (
|
|
6
|
+
date,
|
|
7
|
+
datetime,
|
|
8
|
+
time
|
|
9
|
+
)
|
|
10
|
+
from decimal import Decimal
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from typing import (
|
|
13
|
+
Any,
|
|
14
|
+
ClassVar,
|
|
15
|
+
Literal,
|
|
16
|
+
Optional,
|
|
17
|
+
Union
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from pydantic import (
|
|
21
|
+
BaseModel,
|
|
22
|
+
ConfigDict,
|
|
23
|
+
Field,
|
|
24
|
+
RootModel,
|
|
25
|
+
field_validator,
|
|
26
|
+
model_validator
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Generator metadata constants.
# NOTE: metamodel_version is the literal string "None", not the None object.
metamodel_version = "None"
# NOTE(review): presumably the schema version rather than the package
# version -- confirm against the schema source.
version = "0.2.0"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ConfiguredBaseModel(BaseModel):
    # Base class carrying the pydantic configuration for models that derive
    # from it: assignments and defaults are validated, unknown fields are
    # rejected ("forbid"), arbitrary types are allowed, enum-typed fields
    # store the enum's value, and strict mode is disabled.
    # (Written as comments, not a docstring, so the class __doc__ is unchanged.)
    model_config = ConfigDict(
        validate_assignment=True,
        validate_default=True,
        extra="forbid",
        arbitrary_types_allowed=True,
        use_enum_values=True,
        strict=False,
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class LinkMLMeta(RootModel):
    # Frozen root model wrapping a plain ``dict[str, Any]``; the dunder
    # methods below forward to that dict.
    # (Written as comments, not a docstring, so the class __doc__ is unchanged.)
    root: dict[str, Any] = {}
    model_config = ConfigDict(frozen=True)

    def __getattr__(self, key: str):
        """Fallback attribute lookup: forward ``key`` to the wrapped dict."""
        return getattr(self.root, key)

    def __getitem__(self, key: str):
        """Return the value stored under ``key`` in the wrapped dict."""
        return self.root[key]

    def __setitem__(self, key: str, value):
        """Store ``value`` under ``key`` in the wrapped dict."""
        self.root[key] = value

    def __contains__(self, key: str) -> bool:
        """Report whether ``key`` is a key of the wrapped dict."""
        return key in self.root
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Module-level schema metadata (prefixes, identifiers, source file).
linkml_meta = LinkMLMeta(
    {
        'default_prefix': 'citations',
        'default_range': 'string',
        'description': (
            'Schema for tracking scholarly citations of digital products '
            '(datasets, software, tools) identified by DOIs, RRIDs, or '
            'other identifiers. Supports flexible hierarchical collections '
            'and curation workflows.'
        ),
        'id': 'https://w3id.org/dandi/citations-collector',
        'imports': ['linkml:types'],
        'license': 'MIT',
        'name': 'citations-collector',
        'prefixes': {
            'citations': {
                'prefix_prefix': 'citations',
                'prefix_reference': 'https://w3id.org/dandi/citations-collector/',
            },
            'datacite': {
                'prefix_prefix': 'datacite',
                'prefix_reference': 'https://purl.org/datacite/v4.4/',
            },
            'linkml': {
                'prefix_prefix': 'linkml',
                'prefix_reference': 'https://w3id.org/linkml/',
            },
            'schema': {
                'prefix_prefix': 'schema',
                'prefix_reference': 'http://schema.org/',
            },
        },
        'source_file': 'schema/citations.yaml',
        'title': 'Citations Collector Schema',
    }
)
|
|
85
|
+
|
|
86
|
+
class RefType(str, Enum):
    """
    Type of identifier reference.
    """

    # Digital Object Identifier (version-specific).
    doi = "doi"

    # Research Resource Identifier (SciCrunch).
    rrid = "rrid"

    # arXiv preprint identifier.
    arxiv = "arxiv"

    # PubMed identifier.
    pmid = "pmid"

    # PubMed Central identifier.
    pmcid = "pmcid"

    # Generic URL (fallback when no persistent ID available).
    url = "url"

    # Zenodo concept DOI or parent.id representing ALL versions.
    # Example: "10.5281/zenodo.1012598" or just "1012598". System will
    # auto-discover all version DOIs via Zenodo API
    # (query: parent.id:1012598&f=allversions:true).
    zenodo_concept = "zenodo_concept"

    # Zenodo version-specific record ID (resolves to DOI).
    zenodo_version = "zenodo_version"

    # GitHub repository (owner/repo format).
    github = "github"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class CitationRelationship(str, Enum):
    """
    The relationship between a citing work and the cited item.
    """

    # The work explicitly cites the item in its references.
    Cites = "Cites"

    # The item is documented by this work (e.g., a data descriptor).
    IsDocumentedBy = "IsDocumentedBy"

    # The work describes the item or its creation methodology.
    Describes = "Describes"

    # The item is supplemented by this work.
    IsSupplementedBy = "IsSupplementedBy"

    # The work references the item without formal citation.
    References = "References"

    # The work uses data/code from the item.
    Uses = "Uses"

    # The work is derived from the item.
    IsDerivedFrom = "IsDerivedFrom"
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class CitationType(str, Enum):
    """
    The type of citing work.
    """

    # Peer-reviewed journal article or conference paper.
    Publication = "Publication"

    # Non-peer-reviewed preprint (bioRxiv, arXiv, etc.).
    Preprint = "Preprint"

    # Published protocol (protocols.io, etc.).
    Protocol = "Protocol"

    # Doctoral or master's thesis.
    Thesis = "Thesis"

    # Book or book chapter.
    Book = "Book"

    # Software package or tool.
    Software = "Software"

    # Another dataset that cites this one.
    Dataset = "Dataset"

    # Other type of work.
    Other = "Other"
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class CitationSource(str, Enum):
    """
    The source from which the citation was discovered.
    """

    # Discovered via CrossRef cited-by API.
    crossref = "crossref"

    # Discovered via OpenCitations (OCI) API.
    opencitations = "opencitations"

    # Discovered via DataCite API.
    datacite = "datacite"

    # Discovered via OpenAlex API.
    openalex = "openalex"

    # Discovered via Europe PMC API.
    europepmc = "europepmc"

    # Discovered via Semantic Scholar API.
    semantic_scholar = "semantic_scholar"

    # Discovered via SciCrunch/RRID API.
    scicrunch = "scicrunch"

    # Manually added by curator.
    manual = "manual"
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class CitationStatus(str, Enum):
    """
    Curation status of the citation.
    """

    # Citation is valid and should be included.
    active = "active"

    # Citation is a false positive and should be excluded.
    ignored = "ignored"

    # Citation has been merged into another (e.g., preprint → published).
    merged = "merged"

    # Citation needs review by curator.
    pending = "pending"
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
class ItemRef(ConfiguredBaseModel):
    """
    A resolvable identifier for an item (DOI, RRID, URL, etc.). An item may have multiple refs (e.g., both RRID and Zenodo DOI).
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {'from_schema': 'https://w3id.org/dandi/citations-collector'}
    )

    # Required: which kind of identifier ref_value holds.
    ref_type: RefType = Field(
        default=...,
        description="""Type of identifier.""",
        json_schema_extra={"linkml_meta": {'alias': 'ref_type', 'domain_of': ['ItemRef', 'SourceConfig']}},
    )
    # Required: the identifier itself.
    ref_value: str = Field(
        default=...,
        description="""The identifier value. Format depends on ref_type: - doi: \"10.1234/example\" (without doi: prefix) - rrid: \"SCR_016216\" (without RRID: prefix) - arxiv: \"2301.12345\" - pmid: \"12345678\" - url: full URL - zenodo: record ID like \"852659\" - github: \"owner/repo\"""",
        json_schema_extra={"linkml_meta": {'alias': 'ref_value', 'domain_of': ['ItemRef']}},
    )
    # Optional resolved URL; defaults to None.
    ref_url: Optional[str] = Field(
        default=None,
        description="""Resolved URL for this reference (auto-populated).""",
        json_schema_extra={"linkml_meta": {'alias': 'ref_url', 'domain_of': ['ItemRef']}},
    )
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class ItemFlavor(ConfiguredBaseModel):
    """
    A specific version or variant of an item. For versioned resources (software releases, dataset versions), each version is a flavor. For unversioned resources, use a single flavor (e.g., \"latest\" or \"main\").
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {'from_schema': 'https://w3id.org/dandi/citations-collector'}
    )

    # Required fields: flavor_id and refs; everything else defaults to None.
    flavor_id: str = Field(
        default=...,
        description="""Identifier for this flavor (e.g., \"0.210812.1448\", \"23.1.0\", \"latest\"). Use \"main\" or omit for unversioned items.""",
        json_schema_extra={"linkml_meta": {'alias': 'flavor_id', 'domain_of': ['ItemFlavor']}},
    )
    name: Optional[str] = Field(
        default=None,
        description="""Human-readable name for this flavor.""",
        json_schema_extra={"linkml_meta": {'alias': 'name', 'domain_of': ['ItemFlavor', 'Item', 'Collection']}},
    )
    release_date: Optional[date] = Field(
        default=None,
        description="""When this flavor was released (ISO 8601).""",
        json_schema_extra={"linkml_meta": {'alias': 'release_date', 'domain_of': ['ItemFlavor']}},
    )
    refs: list[ItemRef] = Field(
        default=...,
        description="""Resolvable identifiers for this flavor. Multiple refs allowed (e.g., both DOI and RRID for the same version).""",
        json_schema_extra={"linkml_meta": {'alias': 'refs', 'domain_of': ['ItemFlavor']}},
    )
    # CitationRecord is a forward reference here (the annotation is lazy).
    citations: Optional[list[CitationRecord]] = Field(
        default=None,
        description="""Citations discovered for this flavor.""",
        json_schema_extra={"linkml_meta": {'alias': 'citations', 'domain_of': ['ItemFlavor']}},
    )
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
class Item(ConfiguredBaseModel):
    """
    A tracked resource with one or more flavors (versions). The item_id can encode hierarchy using \":\" separator (e.g., \"dandi:000003\", \"repronim:fmriprep\", or just \"my-tool\").
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {'from_schema': 'https://w3id.org/dandi/citations-collector'}
    )

    # Required fields: item_id and flavors; the rest default to None.
    item_id: str = Field(
        default=...,
        description="""Unique identifier for this item within the collection. May include namespace prefix with \":\" (e.g., \"dandi:000003\"). The part before \":\" indicates the source/project.""",
        json_schema_extra={"linkml_meta": {'alias': 'item_id', 'domain_of': ['Item', 'CitationRecord']}},
    )
    name: Optional[str] = Field(
        default=None,
        description="""Human-readable name.""",
        json_schema_extra={"linkml_meta": {'alias': 'name', 'domain_of': ['ItemFlavor', 'Item', 'Collection']}},
    )
    description: Optional[str] = Field(
        default=None,
        description="""Description of the item.""",
        json_schema_extra={"linkml_meta": {'alias': 'description', 'domain_of': ['Item', 'Collection']}},
    )
    homepage: Optional[str] = Field(
        default=None,
        description="""URL to the item's homepage or landing page.""",
        json_schema_extra={"linkml_meta": {'alias': 'homepage', 'domain_of': ['Item', 'Collection']}},
    )
    flavors: list[ItemFlavor] = Field(
        default=...,
        description="""Versions/variants of this item.""",
        json_schema_extra={"linkml_meta": {'alias': 'flavors', 'domain_of': ['Item']}},
    )
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
class CitationRecord(ConfiguredBaseModel):
    """
    A record representing a citation relationship between a citing work and a tracked item. Each row in the citations TSV.
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {
            'from_schema': 'https://w3id.org/dandi/citations-collector',
            'unique_keys': {
                'citation_item_key': {
                    'description': 'Unique key: each citing '
                                   'work (by DOI or URL) is '
                                   'unique per item+flavor.',
                    'unique_key_name': 'citation_item_key',
                    'unique_key_slots': ['item_id', 'item_flavor', 'citation_doi'],
                }
            },
        }
    )

    # --- The tracked item being cited -------------------------------------
    item_id: str = Field(
        default=...,
        description="""ID of the tracked item being cited.""",
        json_schema_extra={"linkml_meta": {'alias': 'item_id', 'domain_of': ['Item', 'CitationRecord']}},
    )
    item_flavor: str = Field(
        default=...,
        description="""Flavor (version) of the item being cited.""",
        json_schema_extra={"linkml_meta": {'alias': 'item_flavor', 'domain_of': ['CitationRecord']}},
    )
    item_ref_type: Optional[RefType] = Field(
        default=None,
        description="""Which ref type was matched for this citation.""",
        json_schema_extra={"linkml_meta": {'alias': 'item_ref_type', 'domain_of': ['CitationRecord']}},
    )
    item_ref_value: Optional[str] = Field(
        default=None,
        description="""Which ref value was matched for this citation.""",
        json_schema_extra={"linkml_meta": {'alias': 'item_ref_value', 'domain_of': ['CitationRecord']}},
    )
    item_name: Optional[str] = Field(
        default=None,
        description="""Human-readable name of the item (for display).""",
        json_schema_extra={"linkml_meta": {'alias': 'item_name', 'domain_of': ['CitationRecord']}},
    )

    # --- The citing work --------------------------------------------------
    citation_doi: Optional[str] = Field(
        default=None,
        description="""DOI of the citing work (primary identifier).""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_doi', 'domain_of': ['CitationRecord']}},
    )
    citation_pmid: Optional[str] = Field(
        default=None,
        description="""PubMed ID of the citing work.""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_pmid', 'domain_of': ['CitationRecord']}},
    )
    citation_arxiv: Optional[str] = Field(
        default=None,
        description="""arXiv ID of the citing work.""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_arxiv', 'domain_of': ['CitationRecord']}},
    )
    citation_url: Optional[str] = Field(
        default=None,
        description="""URL to the citing work (fallback if no DOI).""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_url', 'domain_of': ['CitationRecord']}},
    )
    citation_title: Optional[str] = Field(
        default=None,
        description="""Title of the citing work.""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_title', 'domain_of': ['CitationRecord']}},
    )
    citation_authors: Optional[str] = Field(
        default=None,
        description="""Authors of the citing work (semicolon-separated).""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_authors', 'domain_of': ['CitationRecord']}},
    )
    citation_year: Optional[int] = Field(
        default=None,
        description="""Publication year of the citing work.""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_year', 'domain_of': ['CitationRecord']}},
    )
    citation_journal: Optional[str] = Field(
        default=None,
        description="""Journal or venue of the citing work.""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_journal', 'domain_of': ['CitationRecord']}},
    )
    citation_relationship: CitationRelationship = Field(
        default=...,
        description="""How the citing work relates to the item.""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_relationship', 'domain_of': ['CitationRecord']}},
    )
    citation_type: Optional[CitationType] = Field(
        default=None,
        description="""Type of the citing work.""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_type', 'domain_of': ['CitationRecord']}},
    )

    # --- Discovery provenance ---------------------------------------------
    citation_source: CitationSource = Field(
        default=...,
        description="""DEPRECATED: Use citation_sources instead. Primary discovery source (kept for backward compatibility).""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_source', 'domain_of': ['CitationRecord']}},
    )
    discovered_date: Optional[date] = Field(
        default=None,
        description="""DEPRECATED: Use discovered_dates instead. When this citation was first discovered (ISO 8601).""",
        json_schema_extra={"linkml_meta": {'alias': 'discovered_date', 'domain_of': ['CitationRecord']}},
    )
    citation_sources: Optional[list[str]] = Field(
        default=None,
        description="""All discovery sources that found this citation. Must be coherent with discovered_dates keys. Example: [\"crossref\", \"openalex\", \"datacite\"]""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_sources', 'domain_of': ['CitationRecord']}},
    )
    discovered_dates: Optional[str] = Field(
        default=None,
        description="""Map of source name to discovery date (ISO 8601). Must be coherent with citation_sources list. Stored as JSON string in TSV. Example: {\"crossref\": \"2025-01-15\", \"openalex\": \"2025-01-20\"}""",
        json_schema_extra={"linkml_meta": {'alias': 'discovered_dates', 'domain_of': ['CitationRecord']}},
    )

    # --- Curation ---------------------------------------------------------
    citation_status: CitationStatus = Field(
        default=CitationStatus.active,
        description="""Curation status.""",
        json_schema_extra={
            "linkml_meta": {
                'alias': 'citation_status',
                'domain_of': ['CitationRecord'],
                'ifabsent': 'string(active)',
            }
        },
    )
    citation_merged_into: Optional[str] = Field(
        default=None,
        description="""If status is 'merged', the DOI of the canonical version (e.g., published paper DOI when this is a preprint).""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_merged_into', 'domain_of': ['CitationRecord']}},
    )
    citation_comment: Optional[str] = Field(
        default=None,
        description="""Curator notes about this citation.""",
        json_schema_extra={"linkml_meta": {'alias': 'citation_comment', 'domain_of': ['CitationRecord']}},
    )
    curated_by: Optional[str] = Field(
        default=None,
        description="""Who made the curation decision.""",
        json_schema_extra={"linkml_meta": {'alias': 'curated_by', 'domain_of': ['CitationRecord']}},
    )
    curated_date: Optional[date] = Field(
        default=None,
        description="""When the curation decision was made (ISO 8601).""",
        json_schema_extra={"linkml_meta": {'alias': 'curated_date', 'domain_of': ['CitationRecord']}},
    )

    # --- Open access / PDF ------------------------------------------------
    oa_status: Optional[str] = Field(
        default=None,
        description="""Open access status from Unpaywall: gold, green, bronze, hybrid, or closed.""",
        json_schema_extra={"linkml_meta": {'alias': 'oa_status', 'domain_of': ['CitationRecord']}},
    )
    pdf_url: Optional[str] = Field(
        default=None,
        description="""Best open access PDF URL from Unpaywall.""",
        json_schema_extra={"linkml_meta": {'alias': 'pdf_url', 'domain_of': ['CitationRecord']}},
    )
    pdf_path: Optional[str] = Field(
        default=None,
        description="""Relative path to locally stored PDF file.""",
        json_schema_extra={"linkml_meta": {'alias': 'pdf_path', 'domain_of': ['CitationRecord']}},
    )

    @field_validator('citation_doi')
    @classmethod
    def pattern_citation_doi(cls, v):
        """Check that ``citation_doi`` matches the pattern ``^10\\..+/.+$``.

        Strings (or string elements of a list) that do not match raise
        ``ValueError``; any other value, including ``None``, is returned
        unchanged.
        """
        pattern = re.compile(r"^10\..+/.+$")
        if isinstance(v, list):
            for element in v:
                if isinstance(element, str) and not pattern.match(element):
                    err_msg = f"Invalid citation_doi format: {element}"
                    raise ValueError(err_msg)
        elif isinstance(v, str) and not pattern.match(v):
            err_msg = f"Invalid citation_doi format: {v}"
            raise ValueError(err_msg)
        return v

    @field_validator('citation_merged_into')
    @classmethod
    def pattern_citation_merged_into(cls, v):
        """Check that ``citation_merged_into`` matches the pattern ``^10\\..+/.+$``.

        Strings (or string elements of a list) that do not match raise
        ``ValueError``; any other value, including ``None``, is returned
        unchanged.
        """
        pattern = re.compile(r"^10\..+/.+$")
        if isinstance(v, list):
            for element in v:
                if isinstance(element, str) and not pattern.match(element):
                    err_msg = f"Invalid citation_merged_into format: {element}"
                    raise ValueError(err_msg)
        elif isinstance(v, str) and not pattern.match(v):
            err_msg = f"Invalid citation_merged_into format: {v}"
            raise ValueError(err_msg)
        return v

    @model_validator(mode='after')
    def validate_citation_sources_dates_coherence(self):
        """
        Validate that citation_sources and discovered_dates are coherent.

        CUSTOM VALIDATOR: This is not auto-generated from LinkML schema.
        If regenerating models, this validator must be preserved.

        The check only runs when BOTH fields are non-empty; if either one
        is missing the record is accepted as-is.

        Raises:
            ValueError: if discovered_dates is not valid JSON, is not a JSON
                object, or its keys differ from the set of citation_sources.
        """
        import json

        # Nothing to cross-check unless both sides are populated.
        if not self.citation_sources or not self.discovered_dates:
            return self

        # Parse discovered_dates JSON; chain the decode error so the exact
        # parse position is not lost.
        try:
            dates_dict = json.loads(self.discovered_dates)
        except json.JSONDecodeError as err:
            raise ValueError("Invalid JSON in discovered_dates") from err

        if not isinstance(dates_dict, dict):
            raise ValueError("discovered_dates must be a JSON object (dict), not array or other type")

        # Check coherence: keys in dates_dict must match citation_sources
        # (compared as sets, so order and duplicates are ignored).
        sources_set = set(self.citation_sources)
        dates_set = set(dates_dict.keys())

        if sources_set != dates_set:
            raise ValueError(
                f"citation_sources and discovered_dates must be coherent: "
                f"sources={sorted(sources_set)}, dates_keys={sorted(dates_set)}"
            )

        return self
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
class SourceConfig(ConfiguredBaseModel):
    """
    Configuration for the item source (e.g., DANDI, Zenodo).
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {'from_schema': 'https://w3id.org/dandi/citations-collector'}
    )

    # Every field is optional; include_draft defaults to False, the rest to None.
    type: Optional[str] = Field(
        default=None,
        description="""Source type: \"dandi\", \"bibtex\", \"zenodo_org\", \"zenodo_collection\", \"github_org\", \"yaml\", etc.""",
        json_schema_extra={"linkml_meta": {'alias': 'type', 'domain_of': ['SourceConfig']}},
    )
    update_items: Optional[str] = Field(
        default=None,
        description="""How to handle items during import: \"add\" (only add new items) or \"sync\" (add new and update existing).""",
        json_schema_extra={"linkml_meta": {'alias': 'update_items', 'domain_of': ['SourceConfig']}},
    )
    include_draft: Optional[bool] = Field(
        default=False,
        description="""Include draft/unpublished items.""",
        json_schema_extra={"linkml_meta": {'alias': 'include_draft', 'domain_of': ['SourceConfig'], 'ifabsent': 'false'}},
    )
    group_id: Optional[int] = Field(
        default=None,
        description="""Numeric group/org ID (if applicable).""",
        json_schema_extra={"linkml_meta": {'alias': 'group_id', 'domain_of': ['SourceConfig', 'ZoteroConfig']}},
    )
    collection_key: Optional[str] = Field(
        default=None,
        description="""Collection key within the source (if applicable).""",
        json_schema_extra={"linkml_meta": {'alias': 'collection_key', 'domain_of': ['SourceConfig', 'ZoteroConfig']}},
    )
    dandiset_ids: Optional[list[str]] = Field(
        default=None,
        description="""List of specific DANDI dandiset identifiers to import (e.g., [\"000003\", \"000402\"]). If not specified, imports all dandisets.""",
        json_schema_extra={"linkml_meta": {'alias': 'dandiset_ids', 'domain_of': ['SourceConfig']}},
    )
    bibtex_file: Optional[str] = Field(
        default=None,
        description="""Path to BibTeX file (relative to collection YAML or absolute).""",
        json_schema_extra={"linkml_meta": {'alias': 'bibtex_file', 'domain_of': ['SourceConfig']}},
    )
    bib_field: Optional[str] = Field(
        default=None,
        description="""BibTeX field to extract reference from (e.g., 'doi', 'url').""",
        json_schema_extra={"linkml_meta": {'alias': 'bib_field', 'domain_of': ['SourceConfig']}},
    )
    ref_type: Optional[RefType] = Field(
        default=None,
        description="""Type of reference extracted from bib_field.""",
        json_schema_extra={"linkml_meta": {'alias': 'ref_type', 'domain_of': ['ItemRef', 'SourceConfig']}},
    )
    ref_regex: Optional[str] = Field(
        default=None,
        description="""Regex pattern with named groups (?P<item_id>...) and optionally (?P<flavor_id>...) to parse reference into components. If flavor_id not captured, uses 'main'.""",
        json_schema_extra={"linkml_meta": {'alias': 'ref_regex', 'domain_of': ['SourceConfig']}},
    )
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
class DiscoverConfig(ConfiguredBaseModel):
    """
    Configuration for citation discovery.
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {'from_schema': 'https://w3id.org/dandi/citations-collector'}
    )

    # Both fields are optional and default to None.
    sources: Optional[list[str]] = Field(
        default=None,
        description="""List of discovery source names to query (e.g., crossref, opencitations, datacite).""",
        json_schema_extra={"linkml_meta": {'alias': 'sources', 'domain_of': ['DiscoverConfig']}},
    )
    email: Optional[str] = Field(
        default=None,
        description="""Contact email for API polite pools.""",
        json_schema_extra={"linkml_meta": {'alias': 'email', 'domain_of': ['DiscoverConfig']}},
    )
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
class PdfsConfig(ConfiguredBaseModel):
    """
    Configuration for PDF retrieval.
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {'from_schema': 'https://w3id.org/dandi/citations-collector'}
    )

    # All three fields carry non-None defaults mirrored in their 'ifabsent' metadata.
    output_dir: Optional[str] = Field(
        default="pdfs/",
        description="""Directory to store downloaded PDFs.""",
        json_schema_extra={
            "linkml_meta": {
                'alias': 'output_dir',
                'domain_of': ['PdfsConfig'],
                'ifabsent': 'string(pdfs/)',
            }
        },
    )
    unpaywall_email: Optional[str] = Field(
        default="site-unpaywall@oneukrainian.com",
        description="""Email for Unpaywall API.""",
        json_schema_extra={
            "linkml_meta": {
                'alias': 'unpaywall_email',
                'domain_of': ['PdfsConfig'],
                'ifabsent': 'string(site-unpaywall@oneukrainian.com)',
            }
        },
    )
    git_annex: Optional[bool] = Field(
        default=False,
        description="""Store PDFs in git-annex instead of git.""",
        json_schema_extra={"linkml_meta": {'alias': 'git_annex', 'domain_of': ['PdfsConfig'], 'ifabsent': 'false'}},
    )
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
class ZoteroConfig(ConfiguredBaseModel):
    """
    Configuration for Zotero integration.
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {'from_schema': 'https://w3id.org/dandi/citations-collector'}
    )

    # Both fields are optional and default to None.
    group_id: Optional[int] = Field(
        default=None,
        description="""Zotero group library ID.""",
        json_schema_extra={"linkml_meta": {'alias': 'group_id', 'domain_of': ['SourceConfig', 'ZoteroConfig']}},
    )
    collection_key: Optional[str] = Field(
        default=None,
        description="""Zotero collection key to sync into.""",
        json_schema_extra={"linkml_meta": {'alias': 'collection_key', 'domain_of': ['SourceConfig', 'ZoteroConfig']}},
    )
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
class Collection(ConfiguredBaseModel):
    """
    A collection of tracked items. This is the root object that gets serialized to collection.yaml.
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {'from_schema': 'https://w3id.org/dandi/citations-collector', 'tree_root': True}
    )

    # Only `name` is required; every other field defaults to None.
    name: str = Field(
        default=...,
        description="""Name of the collection (e.g., \"DANDI\", \"ReproNim Tools\").""",
        json_schema_extra={"linkml_meta": {'alias': 'name', 'domain_of': ['ItemFlavor', 'Item', 'Collection']}},
    )
    description: Optional[str] = Field(
        default=None,
        description="""Description of the collection.""",
        json_schema_extra={"linkml_meta": {'alias': 'description', 'domain_of': ['Item', 'Collection']}},
    )
    homepage: Optional[str] = Field(
        default=None,
        description="""URL to the collection homepage.""",
        json_schema_extra={"linkml_meta": {'alias': 'homepage', 'domain_of': ['Item', 'Collection']}},
    )
    maintainers: Optional[list[str]] = Field(
        default=None,
        description="""List of maintainer names or emails.""",
        json_schema_extra={"linkml_meta": {'alias': 'maintainers', 'domain_of': ['Collection']}},
    )
    source_type: Optional[str] = Field(
        default=None,
        description="""DEPRECATED: Use source.type instead. Hint for auto-import: \"dandi\", \"zenodo_org\", \"zenodo_collection\", \"github_org\", \"yaml\", etc.""",
        json_schema_extra={"linkml_meta": {'alias': 'source_type', 'domain_of': ['Collection']}},
    )
    source_config: Optional[str] = Field(
        default=None,
        description="""DEPRECATED: Use source block instead. Configuration for auto-import (JSON string or nested object).""",
        json_schema_extra={"linkml_meta": {'alias': 'source_config', 'domain_of': ['Collection']}},
    )
    output_tsv: Optional[str] = Field(
        default=None,
        description="""Path to the output TSV file for citations.""",
        json_schema_extra={"linkml_meta": {'alias': 'output_tsv', 'domain_of': ['Collection']}},
    )

    # Nested configuration blocks.
    source: Optional[SourceConfig] = Field(
        default=None,
        description="""Source configuration block.""",
        json_schema_extra={"linkml_meta": {'alias': 'source', 'domain_of': ['Collection']}},
    )
    discover: Optional[DiscoverConfig] = Field(
        default=None,
        description="""Citation discovery configuration block.""",
        json_schema_extra={"linkml_meta": {'alias': 'discover', 'domain_of': ['Collection']}},
    )
    pdfs: Optional[PdfsConfig] = Field(
        default=None,
        description="""PDF retrieval configuration block.""",
        json_schema_extra={"linkml_meta": {'alias': 'pdfs', 'domain_of': ['Collection']}},
    )
    zotero: Optional[ZoteroConfig] = Field(
        default=None,
        description="""Zotero integration configuration block.""",
        json_schema_extra={"linkml_meta": {'alias': 'zotero', 'domain_of': ['Collection']}},
    )
    zotero_group_id: Optional[int] = Field(
        default=None,
        description="""DEPRECATED: Use zotero.group_id instead. Zotero group ID for syncing.""",
        json_schema_extra={"linkml_meta": {'alias': 'zotero_group_id', 'domain_of': ['Collection']}},
    )
    zotero_collection_key: Optional[str] = Field(
        default=None,
        description="""DEPRECATED: Use zotero.collection_key instead. Zotero parent collection key.""",
        json_schema_extra={"linkml_meta": {'alias': 'zotero_collection_key', 'domain_of': ['Collection']}},
    )
    items: Optional[list[Item]] = Field(
        default=None,
        description="""Items in this collection.""",
        json_schema_extra={"linkml_meta": {'alias': 'items', 'domain_of': ['Collection']}},
    )
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
class CurationRule(ConfiguredBaseModel):
    """
    A rule for automatic curation.
    """

    # Schema-level metadata attached to the class (not a model field).
    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "https://w3id.org/dandi/citations-collector"}
    )

    # ---- required slots (no default: must be supplied) ----

    rule_id: str = Field(
        ...,
        description="Unique identifier for this rule.",
        json_schema_extra={
            "linkml_meta": {"alias": "rule_id", "domain_of": ["CurationRule"]}
        },
    )
    # Free-form string; the accepted values are only listed in the
    # description and are not enforced by the type.
    rule_type: str = Field(
        ...,
        description=(
            'Type of rule: "ignore_doi_prefix", "ignore_doi", '
            '"merge_preprint", "auto_merge_preprint", "flag_for_review".'
        ),
        json_schema_extra={
            "linkml_meta": {"alias": "rule_type", "domain_of": ["CurationRule"]}
        },
    )
    pattern: str = Field(
        ...,
        description="Pattern to match (DOI prefix, regex, etc.).",
        json_schema_extra={
            "linkml_meta": {"alias": "pattern", "domain_of": ["CurationRule"]}
        },
    )
    action: str = Field(
        ...,
        description="Action to take (ignore, merge, flag).",
        json_schema_extra={
            "linkml_meta": {"alias": "action", "domain_of": ["CurationRule"]}
        },
    )

    # ---- optional slots (default to None) ----

    target: Optional[str] = Field(
        None,
        description="Target for merge actions.",
        json_schema_extra={
            "linkml_meta": {"alias": "target", "domain_of": ["CurationRule"]}
        },
    )
    comment: Optional[str] = Field(
        None,
        description="Explanation of why this rule exists.",
        json_schema_extra={
            "linkml_meta": {"alias": "comment", "domain_of": ["CurationRule"]}
        },
    )
    created_by: Optional[str] = Field(
        None,
        description="Who created this rule.",
        json_schema_extra={
            "linkml_meta": {"alias": "created_by", "domain_of": ["CurationRule"]}
        },
    )
    created_date: Optional[date] = Field(
        None,
        description="When this rule was created.",
        json_schema_extra={
            "linkml_meta": {"alias": "created_date", "domain_of": ["CurationRule"]}
        },
    )
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
class CurationConfig(ConfiguredBaseModel):
    """
    Configuration for automatic curation.
    """

    # Schema-level metadata attached to the class (not a model field).
    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "https://w3id.org/dandi/citations-collector"}
    )

    # Every slot below is optional and defaults to None.

    rules: Optional[list[CurationRule]] = Field(
        None,
        description="Curation rules to apply automatically.",
        json_schema_extra={
            "linkml_meta": {"alias": "rules", "domain_of": ["CurationConfig"]}
        },
    )
    # NOTE(review): the description names default prefixes, but the field
    # default here is None; presumably the consumer supplies them -- verify.
    preprint_doi_prefixes: Optional[list[str]] = Field(
        None,
        description=(
            "DOI prefixes that indicate preprints. Default: 10.1101 (bioRxiv), "
            "10.21203 (Research Square), 10.2139 (SSRN)."
        ),
        json_schema_extra={
            "linkml_meta": {
                "alias": "preprint_doi_prefixes",
                "domain_of": ["CurationConfig"],
            }
        },
    )
    ignored_doi_prefixes: Optional[list[str]] = Field(
        None,
        description="DOI prefixes to always ignore.",
        json_schema_extra={
            "linkml_meta": {
                "alias": "ignored_doi_prefixes",
                "domain_of": ["CurationConfig"],
            }
        },
    )
    auto_merge_preprints: Optional[bool] = Field(
        None,
        description=(
            "If true, automatically merge preprints when published version "
            "is found citing the same item."
        ),
        json_schema_extra={
            "linkml_meta": {
                "alias": "auto_merge_preprints",
                "domain_of": ["CurationConfig"],
            }
        },
    )
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
# Model rebuild
# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
#
# Each model is rebuilt once, in the order the classes are listed here.
for _model_cls in (
    ItemRef,
    ItemFlavor,
    Item,
    CitationRecord,
    SourceConfig,
    DiscoverConfig,
    PdfsConfig,
    ZoteroConfig,
    Collection,
    CurationRule,
    CurationConfig,
):
    _model_cls.model_rebuild()
# Drop the loop variable so the module namespace gains no extra name.
del _model_cls
|