invenio-vocabularies 6.5.0 (py2.py3-none-any.whl) → 6.7.0 (py2.py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of invenio-vocabularies might be problematic. Further details are linked from the original diff page.

Files changed (46)
  1. invenio_vocabularies/__init__.py +1 -1
  2. invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/FundingModal.js +3 -27
  3. invenio_vocabularies/config.py +27 -1
  4. invenio_vocabularies/contrib/affiliations/affiliations.py +2 -1
  5. invenio_vocabularies/contrib/affiliations/config.py +21 -10
  6. invenio_vocabularies/contrib/affiliations/datastreams.py +103 -1
  7. invenio_vocabularies/contrib/awards/awards.py +2 -1
  8. invenio_vocabularies/contrib/awards/datastreams.py +7 -0
  9. invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +9 -0
  10. invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +22 -1
  11. invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +22 -1
  12. invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +22 -1
  13. invenio_vocabularies/contrib/awards/schema.py +9 -3
  14. invenio_vocabularies/contrib/funders/config.py +19 -12
  15. invenio_vocabularies/contrib/funders/funders.py +2 -1
  16. invenio_vocabularies/contrib/names/config.py +13 -10
  17. invenio_vocabularies/contrib/names/jsonschemas/names/name-v1.0.0.json +28 -5
  18. invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json +15 -0
  19. invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json +15 -0
  20. invenio_vocabularies/contrib/names/names.py +1 -1
  21. invenio_vocabularies/contrib/names/permissions.py +14 -4
  22. invenio_vocabularies/contrib/names/schema.py +11 -2
  23. invenio_vocabularies/contrib/names/services.py +23 -14
  24. invenio_vocabularies/contrib/subjects/config.py +14 -2
  25. invenio_vocabularies/contrib/subjects/datastreams.py +4 -0
  26. invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +22 -114
  27. invenio_vocabularies/contrib/subjects/gemet/__init__.py +9 -0
  28. invenio_vocabularies/contrib/subjects/gemet/datastreams.py +109 -0
  29. invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +13 -2
  30. invenio_vocabularies/contrib/subjects/mesh/datastreams.py +7 -2
  31. invenio_vocabularies/contrib/subjects/schema.py +18 -3
  32. invenio_vocabularies/datastreams/readers.py +99 -9
  33. invenio_vocabularies/datastreams/transformers.py +55 -0
  34. invenio_vocabularies/factories.py +15 -0
  35. invenio_vocabularies/jobs.py +15 -0
  36. invenio_vocabularies/records/jsonschemas/vocabularies/definitions-v1.0.0.json +9 -0
  37. invenio_vocabularies/services/config.py +1 -7
  38. invenio_vocabularies/services/generators.py +38 -0
  39. invenio_vocabularies/services/permissions.py +6 -1
  40. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/METADATA +32 -2
  41. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/RECORD +46 -43
  42. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/entry_points.txt +1 -0
  43. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/AUTHORS.rst +0 -0
  44. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/LICENSE +0 -0
  45. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/WHEEL +0 -0
  46. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,7 @@
11
11
 
12
12
  import csv
13
13
  import gzip
14
+ import io
14
15
  import json
15
16
  import re
16
17
  import tarfile
@@ -27,11 +28,25 @@ from lxml.html import parse as html_parse
27
28
  from .errors import ReaderError
28
29
  from .xml import etree_to_dict
29
30
 
31
+ # Extras dependencies
32
+ # "oaipmh"
30
33
  try:
31
34
  import oaipmh_scythe
32
35
  except ImportError:
33
36
  oaipmh_scythe = None
34
37
 
38
+ # "rdf"
39
+ try:
40
+ import rdflib
41
+ except ImportError:
42
+ rdflib = None
43
+
44
+ # "sparql"
45
+ try:
46
+ import SPARQLWrapper as sparql
47
+ except ImportError:
48
+ sparql = None
49
+
35
50
 
36
51
  class BaseReader(ABC):
37
52
  """Base reader."""
@@ -103,8 +118,7 @@ class SimpleHTTPReader(BaseReader):
103
118
 
104
119
  def __init__(self, origin, id=None, ids=None, content_type=None, *args, **kwargs):
105
120
  """Constructor."""
106
- assert id or ids
107
- self._ids = ids if ids else [id]
121
+ self._ids = ids if ids else ([id] if id else None)
108
122
  self.content_type = content_type
109
123
  super().__init__(origin, *args, **kwargs)
110
124
 
@@ -113,14 +127,22 @@ class SimpleHTTPReader(BaseReader):
113
127
  base_url = url
114
128
  headers = {"Accept": self.content_type}
115
129
 
116
- for id_ in self._ids:
117
- url = base_url.format(id=id_)
130
+ # If there are no IDs, query the base URL
131
+ if not self._ids:
118
132
  resp = requests.get(url, headers=headers)
119
- if resp.status_code != 200:
120
- # todo add logging/fail
121
- pass
133
+ if resp.status_code == 200:
134
+ yield resp.content
135
+ else:
136
+ print(f"Failed to fetch URL {url}: {resp.status_code}")
137
+ else:
138
+ for id_ in self._ids:
139
+ url = base_url.format(id=id_)
140
+ resp = requests.get(url, headers=headers)
141
+ if resp.status_code != 200:
142
+ # todo add logging/fail
143
+ pass
122
144
 
123
- yield resp.content
145
+ yield resp.content
124
146
 
125
147
  def read(self, item=None, *args, **kwargs):
126
148
  """Chooses between item and origin as url."""
@@ -197,6 +219,9 @@ class GzipReader(BaseReader):
197
219
  """Gzip reader."""
198
220
 
199
221
  def _iter(self, fp, *args, **kwargs):
222
+ if isinstance(fp, bytes):
223
+ fp = io.BytesIO(fp)
224
+
200
225
  with gzip.open(fp) as gp:
201
226
  yield gp
202
227
 
@@ -236,7 +261,7 @@ class XMLReader(BaseReader):
236
261
  try:
237
262
  xml_tree = fromstring(fp)
238
263
  xml_dict = etree_to_dict(xml_tree)
239
- except Exception as e:
264
+ except Exception:
240
265
  xml_tree = html_parse(fp).getroot()
241
266
  xml_dict = etree_to_dict(xml_tree)["html"]["body"]
242
267
 
@@ -346,3 +371,68 @@ def xml_to_dict(tree: etree._Element):
346
371
  dict_obj["record"] = etree.tostring(tree)
347
372
 
348
373
  return dict_obj
374
+
375
+
376
+ class RDFReader(BaseReader):
377
+ """Base Reader class to fetch and process RDF data."""
378
+
379
+ @property
380
+ def skos_core(self):
381
+ """Return the SKOS Core namespace."""
382
+ return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
383
+
384
+ def _iter(self, rdf_graph):
385
+ """Iterate over the RDF graph, yielding one subject at a time."""
386
+ for subject, _, _ in rdf_graph.triples(
387
+ (None, rdflib.RDF.type, self.skos_core.Concept)
388
+ ):
389
+ yield {"subject": subject, "rdf_graph": rdf_graph}
390
+
391
+ def read(self, item=None, *args, **kwargs):
392
+ """Fetch and process the RDF data, yielding it one subject at a time."""
393
+ if isinstance(item, gzip.GzipFile):
394
+ rdf_content = item.read().decode("utf-8")
395
+
396
+ elif isinstance(item, bytes):
397
+ rdf_content = item.decode("utf-8")
398
+ else:
399
+ raise ReaderError("Unsupported content type")
400
+
401
+ rdf_graph = rdflib.Graph()
402
+ rdf_graph.parse(io.StringIO(rdf_content), format="xml")
403
+
404
+ yield from self._iter(rdf_graph)
405
+
406
+
407
+ class SPARQLReader(BaseReader):
408
+ """Generic reader class to fetch and process RDF data from a SPARQL endpoint."""
409
+
410
+ def __init__(self, origin, query, mode="r", *args, **kwargs):
411
+ """Initialize the reader with the data source.
412
+
413
+ :param origin: The SPARQL endpoint from which to fetch the RDF data.
414
+ :param query: The SPARQL query to execute.
415
+ :param mode: Mode of operation (default is 'r' for reading).
416
+ """
417
+ self._origin = origin
418
+ self._query = query
419
+ super().__init__(origin=origin, mode=mode, *args, **kwargs)
420
+
421
+ def _iter(self, fp, *args, **kwargs):
422
+ raise NotImplementedError(
423
+ "SPARQLReader downloads one result set from SPARQL and therefore does not iterate through items"
424
+ )
425
+
426
+ def read(self, item=None, *args, **kwargs):
427
+ """Fetch and process RDF data, yielding results one at a time."""
428
+ if item:
429
+ raise NotImplementedError(
430
+ "SPARQLReader does not support being chained after another reader"
431
+ )
432
+
433
+ sparql_client = sparql.SPARQLWrapper(self._origin)
434
+ sparql_client.setQuery(self._query)
435
+ sparql_client.setReturnFormat(sparql.JSON)
436
+
437
+ results = sparql_client.query().convert()
438
+ yield from results["results"]["bindings"]
@@ -15,6 +15,11 @@ from lxml import etree
15
15
  from .errors import TransformerError
16
16
  from .xml import etree_to_dict
17
17
 
18
+ try:
19
+ import rdflib
20
+ except ImportError:
21
+ rdflib = None
22
+
18
23
 
19
24
  class BaseTransformer(ABC):
20
25
  """Base transformer."""
@@ -61,3 +66,53 @@ class XMLTransformer(BaseTransformer):
61
66
 
62
67
  stream_entry.entry = record
63
68
  return stream_entry
69
+
70
+
71
+ class RDFTransformer(BaseTransformer):
72
+ """Base Transformer class for RDF data to dictionary format."""
73
+
74
+ @property
75
+ def skos_core(self):
76
+ """Get the SKOS core namespace."""
77
+ return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
78
+
79
+ def _get_labels(self, subject, rdf_graph):
80
+ """Extract labels (prefLabel or altLabel) for a subject."""
81
+ labels = {
82
+ label.language: label.value.capitalize()
83
+ for _, _, label in rdf_graph.triples(
84
+ (subject, self.skos_core.prefLabel, None)
85
+ )
86
+ if label.language and "-" not in label.language
87
+ }
88
+
89
+ if "en" not in labels:
90
+ for _, _, label in rdf_graph.triples(
91
+ (subject, self.skos_core.altLabel, None)
92
+ ):
93
+ labels.setdefault(label.language, label.value.capitalize())
94
+
95
+ return labels
96
+
97
+ def _find_parents(self, subject, rdf_graph):
98
+ """Find parent notations."""
99
+ return [
100
+ self._get_parent_notation(broader, rdf_graph)
101
+ for broader in rdf_graph.transitive_objects(subject, self.skos_core.broader)
102
+ if broader != subject
103
+ ]
104
+
105
+ def _get_parent_notation(self, broader, rdf_graph):
106
+ """Extract notation for a parent."""
107
+ raise NotImplementedError("This method should be implemented in a subclass.")
108
+
109
+ def _transform_entry(self, subject, rdf_graph):
110
+ """Transform an RDF subject entry into the desired dictionary format."""
111
+ raise NotImplementedError("This method should be implemented in a subclass.")
112
+
113
+ def apply(self, stream_entry, *args, **kwargs):
114
+ """Apply transformation to a stream entry."""
115
+ stream_entry.entry = self._transform_entry(
116
+ stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
117
+ )
118
+ return stream_entry
@@ -16,6 +16,9 @@ from invenio_records_resources.proxies import current_service_registry
16
16
  from .contrib.affiliations.datastreams import (
17
17
  DATASTREAM_CONFIG as affiliations_ds_config,
18
18
  )
19
+ from .contrib.affiliations.datastreams import (
20
+ DATASTREAM_CONFIG_EDMO as affiliations_edmo_ds_config,
21
+ )
19
22
  from .contrib.affiliations.datastreams import (
20
23
  DATASTREAM_CONFIG_OPENAIRE as affiliations_openaire_ds_config,
21
24
  )
@@ -123,6 +126,17 @@ class AffiliationsOpenAIREVocabularyConfig(VocabularyConfig):
123
126
  raise NotImplementedError("Service not implemented for OpenAIRE Affiliations")
124
127
 
125
128
 
129
+ class AffiliationsEDMOVocabularyConfig(VocabularyConfig):
130
+ """European Directory of Marine Organisations (EDMO) Affiliations Vocabulary Config."""
131
+
132
+ config = affiliations_edmo_ds_config
133
+ vocabulary_name = "affiliations:edmo"
134
+
135
+ def get_service(self):
136
+ """Get the service for the vocabulary."""
137
+ raise NotImplementedError("Service not implemented for EDMO Affiliations")
138
+
139
+
126
140
  def get_vocabulary_config(vocabulary):
127
141
  """Factory function to get the appropriate Vocabulary Config."""
128
142
  vocab_config = {
@@ -132,6 +146,7 @@ def get_vocabulary_config(vocabulary):
132
146
  "awards:cordis": AwardsCordisVocabularyConfig,
133
147
  "affiliations": AffiliationsVocabularyConfig,
134
148
  "affiliations:openaire": AffiliationsOpenAIREVocabularyConfig,
149
+ "affiliations:edmo": AffiliationsEDMOVocabularyConfig,
135
150
  "subjects": SubjectsVocabularyConfig,
136
151
  }
137
152
  return vocab_config.get(vocabulary, VocabularyConfig)()
@@ -18,6 +18,8 @@ from marshmallow_utils.fields import TZDateTime
18
18
 
19
19
  from invenio_vocabularies.services.tasks import process_datastream
20
20
 
21
+ from .contrib.names.datastreams import ORCID_PRESET_DATASTREAM_CONFIG
22
+
21
23
 
22
24
  class ArgsSchema(Schema):
23
25
  """Schema of task input arguments."""
@@ -194,3 +196,16 @@ class UpdateAwardsCordisJob(ProcessDataStreamJob):
194
196
  ],
195
197
  }
196
198
  }
199
+
200
+
201
+ class ImportORCIDJob(ProcessDataStreamJob):
202
+ """Import ORCID data registered task."""
203
+
204
+ description = "Import ORCID data"
205
+ title = "Import ORCID data"
206
+ id = "import_orcid"
207
+
208
+ @classmethod
209
+ def default_args(cls, job_obj, **kwargs):
210
+ """Generate default job arguments."""
211
+ return {"config": {**ORCID_PRESET_DATASTREAM_CONFIG}}
@@ -8,6 +8,15 @@
8
8
  }
9
9
  }
10
10
  },
11
+ "description": {
12
+ "type": "object",
13
+ "description": "Description of vocabulary item. Keys are locale codes.",
14
+ "patternProperties": {
15
+ "^[a-z]{2}$": {
16
+ "type": "string"
17
+ }
18
+ }
19
+ },
11
20
  "icon": {
12
21
  "type": "string"
13
22
  },
@@ -14,17 +14,11 @@ from flask import current_app
14
14
  from invenio_i18n import lazy_gettext as _
15
15
  from invenio_records_resources.services import (
16
16
  Link,
17
- LinksTemplate,
18
- RecordService,
19
17
  RecordServiceConfig,
20
18
  SearchOptions,
21
19
  pagination_links,
22
20
  )
23
- from invenio_records_resources.services.base import (
24
- ConditionalLink,
25
- Service,
26
- ServiceListResult,
27
- )
21
+ from invenio_records_resources.services.base import ConditionalLink
28
22
  from invenio_records_resources.services.records.components import DataComponent
29
23
  from invenio_records_resources.services.records.params import (
30
24
  FilterParam,
@@ -0,0 +1,38 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+ #
9
+
10
+ """Vocabulary generators."""
11
+
12
+ from invenio_access import any_user, authenticated_user
13
+ from invenio_records_permissions.generators import ConditionalGenerator
14
+ from invenio_search.engine import dsl
15
+
16
+
17
+ class IfTags(ConditionalGenerator):
18
+ """Generator to filter based on tags.
19
+
20
+ This generator will filter out records based on the tags field.
21
+ """
22
+
23
+ def __init__(self, tags, then_, else_):
24
+ """Constructor."""
25
+ self.tags = tags or []
26
+ super().__init__(then_, else_)
27
+
28
+ def _condition(self, record=None, **kwargs):
29
+ """Check if the record has the tags."""
30
+ return any(tag in record.get("tags", []) for tag in self.tags)
31
+
32
+ def query_filter(self, **kwargs):
33
+ """Search based on configured tags."""
34
+ must_not_clauses = [dsl.Q("terms", tags=self.tags)]
35
+ return dsl.Q(
36
+ "bool",
37
+ must_not=must_not_clauses,
38
+ )
@@ -11,12 +11,17 @@
11
11
  from invenio_records_permissions import RecordPermissionPolicy
12
12
  from invenio_records_permissions.generators import AnyUser, SystemProcess
13
13
 
14
+ from invenio_vocabularies.services.generators import IfTags
15
+
14
16
 
15
17
  class PermissionPolicy(RecordPermissionPolicy):
16
18
  """Permission policy."""
17
19
 
18
20
  can_search = [SystemProcess(), AnyUser()]
19
- can_read = [SystemProcess(), AnyUser()]
21
+ can_read = [
22
+ SystemProcess(),
23
+ IfTags(["unlisted"], then_=[SystemProcess()], else_=[AnyUser()]),
24
+ ]
20
25
  can_create = [SystemProcess()]
21
26
  can_update = [SystemProcess()]
22
27
  can_delete = [SystemProcess()]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: invenio-vocabularies
3
- Version: 6.5.0
3
+ Version: 6.7.0
4
4
  Summary: Invenio module for managing vocabularies.
5
5
  Home-page: https://github.com/inveniosoftware/invenio-vocabularies
6
6
  Author: CERN
@@ -15,9 +15,10 @@ Requires-Dist: invenio-records-resources <7.0.0,>=6.0.0
15
15
  Requires-Dist: invenio-administration <3.0.0,>=2.0.0
16
16
  Requires-Dist: invenio-jobs <2.0.0,>=1.0.0
17
17
  Requires-Dist: lxml >=4.5.0
18
+ Requires-Dist: pycountry <23.0.0,>=22.3.5
18
19
  Requires-Dist: PyYAML >=5.4.1
19
20
  Requires-Dist: regex >=2024.7.24
20
- Requires-Dist: rdflib >=7.0.0
21
+ Requires-Dist: SPARQLWrapper >=2.0.0
21
22
  Provides-Extra: elasticsearch7
22
23
  Requires-Dist: invenio-search[elasticsearch7] <3.0.0,>=2.1.0 ; extra == 'elasticsearch7'
23
24
  Provides-Extra: mysql
@@ -28,8 +29,12 @@ Requires-Dist: invenio-search[opensearch1] <3.0.0,>=2.1.0 ; extra == 'opensearch
28
29
  Provides-Extra: opensearch2
29
30
  Requires-Dist: invenio-search[opensearch2] <3.0.0,>=2.1.0 ; extra == 'opensearch2'
30
31
  Provides-Extra: postgresql
32
+ Provides-Extra: rdf
33
+ Requires-Dist: rdflib >=7.0.0 ; extra == 'rdf'
31
34
  Provides-Extra: s3fs
32
35
  Requires-Dist: s3fs >=2024.6.1 ; extra == 's3fs'
36
+ Provides-Extra: sparql
37
+ Requires-Dist: SPARQLWrapper >=2.0.0 ; extra == 'sparql'
33
38
  Provides-Extra: sqlite
34
39
  Provides-Extra: tests
35
40
  Requires-Dist: pytest-black-ng >=0.4.0 ; extra == 'tests'
@@ -83,6 +88,31 @@ https://invenio-vocabularies.readthedocs.io/
83
88
  Changes
84
89
  =======
85
90
 
91
+ Version v6.7.0 (released 2024-11-27)
92
+
93
+ - contrib: improve search accuracy for names, funders, affiliations
94
+ - names: add affiliation acronym in mappings and schema
95
+ * Dereferences the affiliation `acronym` when indexing names and serving
96
+ REST API results. This is useful for disambiguating authors in search.
97
+ - affiliations: move RDF and SPARQL as extra dependencies
98
+ * Moves `rdflib` and `SPARQLWrapper` to extras.
99
+ - affiliation: refactored edmo datastreams
100
+ - subjects: added datastream for GEMET vocabulary
101
+ - awards/schema.py: read app config for alternate funding validation (#429)
102
+ - awards: fix description field and mappings
103
+ - awards: add fields start/end date and description
104
+
105
+ Version v6.6.0 (released 2024-11-15)
106
+
107
+ - mesh: add title en if not present
108
+ - subjects: add subject to search fields
109
+ - jobs: add ORCID job
110
+ - global: Add unlisted tag
111
+ * This adds a new tag to the vocabularies to allow for unlisted
112
+ vocabularies. This is useful for vocabularies that are not meant to be
113
+ displayed in the UI.
114
+ * This requires to update the names mapping to add the props.
115
+
86
116
  Version v6.5.0 (released 2024-10-31)
87
117
 
88
118
  - subjects: euroscivoc: change default to latest version-less URL