commonmeta-py 0.100__py3-none-any.whl → 0.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. commonmeta/__init__.py +51 -50
  2. commonmeta/author_utils.py +7 -1
  3. commonmeta/base_utils.py +1 -0
  4. commonmeta/constants.py +35 -1
  5. commonmeta/crossref_utils.py +11 -8
  6. commonmeta/date_utils.py +1 -0
  7. commonmeta/doi_utils.py +42 -14
  8. commonmeta/metadata.py +209 -100
  9. commonmeta/readers/cff_reader.py +1 -0
  10. commonmeta/readers/codemeta_reader.py +1 -0
  11. commonmeta/readers/commonmeta_reader.py +1 -0
  12. commonmeta/readers/crossref_reader.py +19 -18
  13. commonmeta/readers/csl_reader.py +4 -1
  14. commonmeta/readers/inveniordm_reader.py +14 -9
  15. commonmeta/readers/json_feed_reader.py +9 -3
  16. commonmeta/readers/kbase_reader.py +1 -0
  17. commonmeta/readers/openalex_reader.py +380 -0
  18. commonmeta/readers/ris_reader.py +1 -0
  19. commonmeta/readers/schema_org_reader.py +2 -3
  20. commonmeta/schema_utils.py +1 -0
  21. commonmeta/utils.py +126 -63
  22. commonmeta/writers/bibtex_writer.py +1 -0
  23. commonmeta/writers/citation_writer.py +1 -0
  24. commonmeta/writers/crossref_xml_writer.py +1 -0
  25. commonmeta/writers/csl_writer.py +1 -0
  26. commonmeta/writers/datacite_writer.py +1 -0
  27. commonmeta/writers/ris_writer.py +1 -0
  28. commonmeta/writers/schema_org_writer.py +1 -0
  29. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/METADATA +5 -8
  30. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/RECORD +33 -32
  31. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/licenses/LICENSE +1 -1
  32. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/WHEEL +0 -0
  33. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/entry_points.txt +0 -0
commonmeta/metadata.py CHANGED
@@ -1,56 +1,61 @@
1
1
  """Metadata"""
2
2
 
3
3
  from os import path
4
- import orjson as json
5
4
  from typing import Optional, Union
5
+
6
+ import orjson as json
6
7
  import yaml
7
8
  from pydash import py_
8
9
 
10
+ from .base_utils import parse_xml, wrap
11
+ from .constants import CM_TO_CR_TRANSLATIONS
12
+ from .doi_utils import doi_from_url
13
+ from .readers.cff_reader import get_cff, read_cff
14
+ from .readers.codemeta_reader import (
15
+ get_codemeta,
16
+ read_codemeta,
17
+ )
18
+ from .readers.commonmeta_reader import read_commonmeta
9
19
  from .readers.crossref_reader import (
10
20
  get_crossref,
11
21
  read_crossref,
12
22
  )
13
- from .readers.datacite_reader import (
14
- get_datacite,
15
- read_datacite,
16
- )
17
- from .readers.datacite_xml_reader import read_datacite_xml
18
23
  from .readers.crossref_xml_reader import (
19
24
  get_crossref_xml,
20
25
  read_crossref_xml,
21
26
  )
22
- from .readers.schema_org_reader import (
23
- get_schema_org,
24
- read_schema_org,
25
- )
26
- from .readers.codemeta_reader import (
27
- get_codemeta,
28
- read_codemeta,
29
- )
30
27
  from .readers.csl_reader import read_csl
31
- from .readers.cff_reader import get_cff, read_cff
32
- from .readers.json_feed_reader import get_json_feed_item, read_json_feed_item
28
+ from .readers.datacite_reader import (
29
+ get_datacite,
30
+ read_datacite,
31
+ )
32
+ from .readers.datacite_xml_reader import read_datacite_xml
33
33
  from .readers.inveniordm_reader import (
34
34
  get_inveniordm,
35
35
  read_inveniordm,
36
36
  )
37
+ from .readers.json_feed_reader import get_json_feed_item, read_json_feed_item
37
38
  from .readers.kbase_reader import read_kbase
38
- from .readers.commonmeta_reader import read_commonmeta
39
+ from .readers.openalex_reader import (
40
+ get_openalex,
41
+ read_openalex,
42
+ )
39
43
  from .readers.ris_reader import read_ris
40
- from .writers.datacite_writer import write_datacite
44
+ from .readers.schema_org_reader import (
45
+ get_schema_org,
46
+ read_schema_org,
47
+ )
48
+ from .schema_utils import json_schema_errors
49
+ from .utils import find_from_format, normalize_id
41
50
  from .writers.bibtex_writer import write_bibtex, write_bibtex_list
42
51
  from .writers.citation_writer import write_citation, write_citation_list
52
+ from .writers.commonmeta_writer import write_commonmeta, write_commonmeta_list
43
53
  from .writers.crossref_xml_writer import write_crossref_xml, write_crossref_xml_list
44
54
  from .writers.csl_writer import write_csl, write_csl_list
55
+ from .writers.datacite_writer import write_datacite
56
+ from .writers.inveniordm_writer import write_inveniordm
45
57
  from .writers.ris_writer import write_ris, write_ris_list
46
58
  from .writers.schema_org_writer import write_schema_org
47
- from .writers.commonmeta_writer import write_commonmeta, write_commonmeta_list
48
- from .writers.inveniordm_writer import write_inveniordm
49
- from .utils import normalize_id, find_from_format
50
- from .base_utils import parse_xml, wrap
51
- from .doi_utils import doi_from_url
52
- from .schema_utils import json_schema_errors
53
- from .constants import CM_TO_CR_TRANSLATIONS
54
59
 
55
60
 
56
61
  # pylint: disable=R0902
@@ -122,35 +127,68 @@ class Metadata:
122
127
  )
123
128
 
124
129
  def get_metadata(self, pid, string) -> dict:
130
+ """Get metadata from various sources based on pid or string input."""
125
131
  via = self.via
132
+
133
+ # Handle pid-based metadata retrieval
126
134
  if pid is not None:
127
- if via == "schema_org":
128
- return get_schema_org(pid)
129
- elif via == "datacite":
130
- return get_datacite(pid)
131
- elif via in ["crossref", "op"]:
132
- return get_crossref(pid)
133
- elif via == "crossref_xml":
134
- return get_crossref_xml(pid)
135
- elif via == "codemeta":
136
- return get_codemeta(pid)
137
- elif via == "cff":
138
- return get_cff(pid)
139
- elif via == "json_feed_item":
140
- return get_json_feed_item(pid)
141
- elif via == "inveniordm":
142
- return get_inveniordm(pid)
135
+ return self._get_metadata_from_pid(pid, via)
136
+ # Handle string-based metadata parsing
143
137
  elif string is not None:
138
+ return self._get_metadata_from_string(string, via)
139
+
140
+ # Default fallback
141
+ raise ValueError("No metadata found")
142
+
143
+ def _get_metadata_from_pid(self, pid, via) -> dict:
144
+ """Helper method to get metadata from a PID."""
145
+ if via == "schema_org":
146
+ return get_schema_org(pid)
147
+ elif via == "datacite":
148
+ return get_datacite(pid)
149
+ elif via in ["crossref", "op"]:
150
+ return get_crossref(pid)
151
+ elif via == "crossref_xml":
152
+ return get_crossref_xml(pid)
153
+ elif via == "codemeta":
154
+ return get_codemeta(pid)
155
+ elif via == "cff":
156
+ return get_cff(pid)
157
+ elif via == "json_feed_item":
158
+ return get_json_feed_item(pid)
159
+ elif via == "inveniordm":
160
+ return get_inveniordm(pid)
161
+ elif via == "openalex":
162
+ return get_openalex(pid)
163
+ else:
164
+ return {"pid": pid}
165
+
166
+ def _get_metadata_from_string(self, string, via) -> dict:
167
+ """Helper method to get metadata from a string."""
168
+ try:
169
+ # XML formats
144
170
  if via == "datacite_xml":
145
- return parse_xml(string)
171
+ result = parse_xml(string)
172
+ if isinstance(result, (dict, list)):
173
+ return (
174
+ dict(result) if isinstance(result, dict) else {"items": result}
175
+ )
176
+ return {}
146
177
  elif via == "crossref_xml":
147
- return parse_xml(string, dialect="crossref")
178
+ result = parse_xml(string, dialect="crossref")
179
+ if isinstance(result, (dict, list)):
180
+ return (
181
+ dict(result) if isinstance(result, dict) else {"items": result}
182
+ )
183
+ return {}
184
+ # YAML and other plain text formats
148
185
  elif via == "cff":
149
- return yaml.safe_load(string)
186
+ return dict(yaml.safe_load(string) or {})
150
187
  elif via == "bibtex":
151
188
  raise ValueError("Bibtex not supported")
152
189
  elif via == "ris":
153
- return string
190
+ return {"data": string}
191
+ # JSON-based formats
154
192
  elif via in [
155
193
  "commonmeta",
156
194
  "crossref",
@@ -165,81 +203,152 @@ class Metadata:
165
203
  return json.loads(string)
166
204
  else:
167
205
  raise ValueError("No input format found")
168
- else:
169
- raise ValueError("No metadata found")
206
+ except (TypeError, json.JSONDecodeError) as error:
207
+ return {"error": str(error)}
170
208
 
171
209
  def read_metadata(self, data: dict, **kwargs) -> dict:
172
- """get_metadata"""
173
- via = isinstance(data, dict) and data.get("via", None) or self.via
210
+ """Read and parse metadata from various formats."""
211
+ via = (isinstance(data, dict) and data.get("via")) or self.via
212
+
213
+ # All these reader methods should return a dict,
214
+ # even though some may return Commonmeta objects that can be treated as dicts
174
215
  if via == "commonmeta":
175
- return read_commonmeta(data, **kwargs)
216
+ return dict(read_commonmeta(data, **kwargs))
176
217
  elif via == "schema_org":
177
- return read_schema_org(data)
218
+ return dict(read_schema_org(data))
178
219
  elif via == "datacite":
179
- return read_datacite(data)
220
+ return dict(read_datacite(data))
180
221
  elif via == "datacite_xml":
181
- return read_datacite_xml(data)
222
+ return dict(read_datacite_xml(data))
182
223
  elif via in ["crossref", "op"]:
183
- return read_crossref(data)
224
+ return dict(read_crossref(data))
184
225
  elif via == "crossref_xml":
185
- return read_crossref_xml(data)
226
+ return dict(read_crossref_xml(data))
186
227
  elif via == "csl":
187
- return read_csl(data, **kwargs)
228
+ return dict(read_csl(data, **kwargs))
188
229
  elif via == "codemeta":
189
- return read_codemeta(data)
230
+ return dict(read_codemeta(data))
190
231
  elif via == "cff":
191
- return read_cff(data)
232
+ return dict(read_cff(data))
192
233
  elif via == "json_feed_item":
193
- return read_json_feed_item(data, **kwargs)
234
+ return dict(read_json_feed_item(data, **kwargs))
194
235
  elif via == "inveniordm":
195
- return read_inveniordm(data)
236
+ return dict(read_inveniordm(data))
196
237
  elif via == "kbase":
197
- return read_kbase(data)
238
+ return dict(read_kbase(data))
239
+ elif via == "openalex":
240
+ return read_openalex(data)
198
241
  elif via == "ris":
199
- return read_ris(data)
242
+ return dict(read_ris(data["data"] if isinstance(data, dict) else data))
200
243
  else:
201
244
  raise ValueError("No input format found")
202
245
 
203
246
  def write(self, to: str = "commonmeta", **kwargs) -> str:
204
- """convert metadata into different formats"""
247
+ """Convert metadata into different formats."""
205
248
  try:
206
- if to == "commonmeta":
207
- return write_commonmeta(self)
208
- elif to == "bibtex":
209
- return write_bibtex(self)
210
- elif to == "csl":
211
- instance = py_.omit(json.loads(write_csl(self)), [])
212
- self.errors = json_schema_errors(instance, schema="csl")
213
- return write_csl(self)
214
- elif to == "citation":
215
- self.style = kwargs.get("style", "apa")
216
- self.locale = kwargs.get("locale", "en-US")
217
- return write_citation(self)
218
- elif to == "ris":
219
- return write_ris(self)
220
- elif to == "schema_org":
221
- return write_schema_org(self)
222
- elif to == "inveniordm":
223
- return write_inveniordm(self)
224
- elif to == "datacite":
225
- instance = json.loads(write_datacite(self))
226
- self.write_errors = json_schema_errors(instance, schema="datacite")
227
- print(self.write_errors)
228
- return write_datacite(self)
229
- elif to == "crossref_xml":
230
- doi = doi_from_url(self.id)
231
- _type = CM_TO_CR_TRANSLATIONS.get(self.type, None)
232
- url = self.url
233
- instance = {"doi": doi, "type": _type, "url": url}
234
- self.depositor = kwargs.get("depositor", None)
235
- self.email = kwargs.get("email", None)
236
- self.registrant = kwargs.get("registrant", None)
237
- self.write_errors = json_schema_errors(instance, schema="crossref")
238
- return write_crossref_xml(self)
239
- else:
240
- raise ValueError("No output format found")
241
- except json.JSONDecodeError:
242
- raise ValueError("Invalid JSON")
249
+ result = self._write_format(to, **kwargs)
250
+ if result is None or result == "":
251
+ return "{}"
252
+ return result
253
+ except json.JSONDecodeError as e:
254
+ # More specific error message including the original JSONDecodeError details
255
+ raise ValueError(f"Invalid JSON: {str(e)}")
256
+
257
+ def _write_format(self, to: str, **kwargs) -> str:
258
+ """Helper method to handle writing to different formats."""
259
+ # Split the format handling into multiple methods to reduce cyclomatic complexity
260
+ if to in ["commonmeta", "datacite", "inveniordm", "schema_org"]:
261
+ return self._write_json_format(to)
262
+ elif to in ["bibtex", "csl", "citation", "ris"]:
263
+ return self._write_text_format(to, **kwargs)
264
+ elif to in ["crossref_xml"]:
265
+ return self._write_xml_format(to, **kwargs)
266
+ else:
267
+ raise ValueError("No output format found")
268
+
269
+ def _write_json_format(self, to: str) -> str:
270
+ """Handle JSON-based output formats."""
271
+ if to == "commonmeta":
272
+ result = write_commonmeta(self)
273
+ elif to == "datacite":
274
+ result = write_datacite(self)
275
+ elif to == "inveniordm":
276
+ result = write_inveniordm(self)
277
+ elif to == "schema_org":
278
+ result = write_schema_org(self)
279
+ else:
280
+ return "{}"
281
+
282
+ if isinstance(result, str):
283
+ # Verify it's valid JSON
284
+ try:
285
+ json.loads(result)
286
+ return result
287
+ except json.JSONDecodeError:
288
+ return "{}"
289
+ elif result is not None:
290
+ try:
291
+ decoded = result.decode("utf-8")
292
+ # Verify it's valid JSON
293
+ json.loads(decoded)
294
+ return decoded
295
+ except (json.JSONDecodeError, UnicodeDecodeError):
296
+ return "{}"
297
+ return "{}"
298
+
299
+ def _write_text_format(self, to: str, **kwargs) -> str:
300
+ """Handle text-based output formats."""
301
+ if to == "bibtex":
302
+ return write_bibtex(self)
303
+ elif to == "csl":
304
+ return self._write_csl(**kwargs)
305
+ elif to == "citation":
306
+ self.style = kwargs.get("style", "apa")
307
+ self.locale = kwargs.get("locale", "en-US")
308
+ return write_citation(self)
309
+ elif to == "ris":
310
+ return write_ris(self)
311
+ return ""
312
+
313
+ def _write_xml_format(self, to: str, **kwargs) -> str:
314
+ """Handle XML-based output formats."""
315
+ if to == "crossref_xml":
316
+ return self._write_crossref_xml(**kwargs)
317
+ return ""
318
+
319
+ def _write_csl(self, **kwargs) -> str:
320
+ """Write in CSL format with error checking."""
321
+ csl_output = write_csl(self)
322
+ if csl_output:
323
+ instance = py_.omit(json.loads(csl_output), [])
324
+ self.errors = json_schema_errors(instance, schema="csl")
325
+ return csl_output
326
+ return ""
327
+
328
+ def _write_datacite(self) -> str:
329
+ """Write in DataCite format with error checking."""
330
+ datacite_output = write_datacite(self)
331
+ if not datacite_output:
332
+ return ""
333
+ try:
334
+ instance = json.loads(datacite_output)
335
+ self.write_errors = json_schema_errors(instance, schema="datacite")
336
+ return str(datacite_output)
337
+ except (json.JSONDecodeError, TypeError):
338
+ return "{}" if not datacite_output else str(datacite_output)
339
+
340
+ def _write_crossref_xml(self, **kwargs) -> str:
341
+ """Write in Crossref XML format with error checking."""
342
+ doi = doi_from_url(self.id)
343
+ _type = CM_TO_CR_TRANSLATIONS.get(str(self.type or ""), None)
344
+ url = self.url
345
+ instance = {"doi": doi, "type": _type, "url": url}
346
+ self.depositor = kwargs.get("depositor", None)
347
+ self.email = kwargs.get("email", None)
348
+ self.registrant = kwargs.get("registrant", None)
349
+ self.write_errors = json_schema_errors(instance, schema="crossref")
350
+ result = write_crossref_xml(self)
351
+ return result if result is not None else ""
243
352
 
244
353
 
245
354
  class MetadataList:
commonmeta/readers/cff_reader.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """cff reader for commonmeta-py"""
2
+
2
3
  from typing import Optional
3
4
  from urllib.parse import urlparse
4
5
  import httpx
commonmeta/readers/codemeta_reader.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """codemeta reader for commonmeta-py"""
2
+
2
3
  from typing import Optional
3
4
  from collections import defaultdict
4
5
  import httpx
commonmeta/readers/commonmeta_reader.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Commonmeta reader for commonmeta-py"""
2
+
2
3
  from ..constants import Commonmeta
3
4
 
4
5
 
commonmeta/readers/crossref_reader.py CHANGED
@@ -1,32 +1,33 @@
1
1
  """crossref reader for commonmeta-py"""
2
2
 
3
3
  from typing import Optional
4
+
4
5
  import httpx
5
6
  from pydash import py_
6
7
 
7
- from ..utils import (
8
- dict_to_spdx,
9
- normalize_cc_url,
10
- normalize_url,
11
- normalize_doi,
12
- normalize_issn,
13
- issn_as_url,
14
- )
15
- from ..base_utils import wrap, compact, presence, sanitize, parse_attributes
16
8
  from ..author_utils import get_authors
9
+ from ..base_utils import compact, parse_attributes, presence, sanitize, wrap
10
+ from ..constants import (
11
+ CR_TO_CM_CONTAINER_TRANSLATIONS,
12
+ CR_TO_CM_TRANSLATIONS,
13
+ CROSSREF_CONTAINER_TYPES,
14
+ Commonmeta,
15
+ )
17
16
  from ..date_utils import get_date_from_date_parts
18
17
  from ..doi_utils import (
19
- doi_as_url,
20
- doi_from_url,
21
- crossref_api_url,
22
18
  crossref_api_query_url,
23
19
  crossref_api_sample_url,
20
+ crossref_api_url,
21
+ doi_as_url,
22
+ validate_doi,
24
23
  )
25
- from ..constants import (
26
- CR_TO_CM_TRANSLATIONS,
27
- CR_TO_CM_CONTAINER_TRANSLATIONS,
28
- CROSSREF_CONTAINER_TYPES,
29
- Commonmeta,
24
+ from ..utils import (
25
+ dict_to_spdx,
26
+ issn_as_url,
27
+ normalize_cc_url,
28
+ normalize_doi,
29
+ normalize_issn,
30
+ normalize_url,
30
31
  )
31
32
 
32
33
 
@@ -41,7 +42,7 @@ def get_crossref_list(query: dict, **kwargs) -> list[dict]:
41
42
 
42
43
  def get_crossref(pid: str, **kwargs) -> dict:
43
44
  """get_crossref"""
44
- doi = doi_from_url(pid)
45
+ doi = validate_doi(pid)
45
46
  if doi is None:
46
47
  return {"state": "not_found"}
47
48
  url = crossref_api_url(doi)
commonmeta/readers/csl_reader.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """CSL-JSON reader for commonmeta-py"""
2
+
2
3
  from ..utils import dict_to_spdx, from_csl, normalize_id, name_to_fos, issn_as_url
3
4
  from ..base_utils import wrap, compact, sanitize, presence
4
5
  from ..author_utils import get_authors
@@ -18,7 +19,9 @@ def read_csl(data: dict, **kwargs) -> Commonmeta:
18
19
 
19
20
  read_options = kwargs or {}
20
21
 
21
- _id = normalize_id(meta.get("id", None) or meta.get("DOI", None)) or meta.get("id", None)
22
+ _id = normalize_id(meta.get("id", None) or meta.get("DOI", None)) or meta.get(
23
+ "id", None
24
+ )
22
25
  _type = CSL_TO_CM_TRANSLATIONS.get(meta.get("type", None), "Other")
23
26
 
24
27
  # optionally generate a DOI if missing but a DOI prefix is provided
commonmeta/readers/inveniordm_reader.py CHANGED
@@ -188,17 +188,22 @@ def get_funding_references(funding_references: list) -> list:
188
188
  def map_funding(funding: dict) -> dict:
189
189
  """map_funding"""
190
190
 
191
- return compact({
192
- "funderName": py_.get(funding, "funder.name"),
193
- "funderIdentifier": py_.get(funding, "funder.id"),
194
- "funderIdentifierType": "ROR" if validate_ror(py_.get(funding, "funder.id")) else None,
195
- "awardTitle": py_.get(funding, "award.title.en"),
196
- "awardNumber": py_.get(funding, "award.number"),
197
- "awardUri": py_.get(funding, "award.identifiers[0].identifier"),
198
- })
199
-
191
+ return compact(
192
+ {
193
+ "funderName": py_.get(funding, "funder.name"),
194
+ "funderIdentifier": py_.get(funding, "funder.id"),
195
+ "funderIdentifierType": "ROR"
196
+ if validate_ror(py_.get(funding, "funder.id"))
197
+ else None,
198
+ "awardTitle": py_.get(funding, "award.title.en"),
199
+ "awardNumber": py_.get(funding, "award.number"),
200
+ "awardUri": py_.get(funding, "award.identifiers[0].identifier"),
201
+ }
202
+ )
203
+
200
204
  return [map_funding(i) for i in funding_references]
201
205
 
206
+
202
207
  def get_file(file: dict) -> str:
203
208
  """get_file"""
204
209
  _type = file.get("type", None)
commonmeta/readers/json_feed_reader.py CHANGED
@@ -206,7 +206,7 @@ def get_funding_references(meta: Optional[dict]) -> Optional[list]:
206
206
 
207
207
  if meta is None or not isinstance(meta, dict):
208
208
  return None
209
-
209
+
210
210
  def format_funding(urls: list) -> list:
211
211
  """format funding. URLs can either be a list of grant IDs or a funder identifier
212
212
  (Open Funder Registry ID or ROR), followed by a grant URL"""
@@ -288,6 +288,7 @@ def get_funding_references(meta: Optional[dict]) -> Optional[list]:
288
288
  if i.get("type", None) == "HasAward"
289
289
  ]
290
290
  )
291
+
291
292
  def format_funding_reference(funding: dict) -> dict:
292
293
  """format funding reference. Make sure award URI is either a DOI or URL"""
293
294
 
@@ -311,10 +312,15 @@ def get_funding_references(meta: Optional[dict]) -> Optional[list]:
311
312
  "awardUri": award_uri,
312
313
  }
313
314
  )
315
+
314
316
  funding_references = py_.get(meta, "funding_references")
315
317
  if funding_references is not None:
316
- awards += [format_funding_reference(i) for i in funding_references if i.get("funderName", None)]
317
-
318
+ awards += [
319
+ format_funding_reference(i)
320
+ for i in funding_references
321
+ if i.get("funderName", None)
322
+ ]
323
+
318
324
  awards += wrap(py_.get(meta, "blog.funding"))
319
325
  return py_.uniq(awards)
320
326
 
commonmeta/readers/kbase_reader.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """kbase reader for Commonmeta"""
2
+
2
3
  from pydash import py_
3
4
 
4
5
  from ..utils import normalize_url, normalize_doi, from_curie, from_kbase