udata 10.9.1.dev37462__py2.py3-none-any.whl → 10.9.1.dev37604__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of udata might be problematic. Click here for more details.
- udata/api/__init__.py +0 -1
- udata/core/dataset/api.py +1 -1
- udata/core/dataset/search.py +5 -2
- udata/core/dataset/tasks.py +2 -5
- udata/core/reuse/tasks.py +3 -0
- udata/core/topic/__init__.py +1 -0
- udata/core/topic/api_fields.py +87 -0
- udata/core/topic/apiv2.py +116 -194
- udata/core/topic/factories.py +69 -8
- udata/core/topic/forms.py +58 -4
- udata/core/topic/models.py +65 -20
- udata/core/topic/parsers.py +40 -0
- udata/core/topic/tasks.py +11 -0
- udata/forms/fields.py +8 -1
- udata/harvest/backends/dcat.py +41 -20
- udata/harvest/tests/test_dcat_backend.py +89 -0
- udata/migrations/2025-05-26-migrate-topics-to-elements.py +59 -0
- udata/migrations/2025-06-02-delete-topic-name-index.py +19 -0
- udata/static/chunks/{11.51d706fb9521c16976bc.js → 11.822f6ccb39c92c796d13.js} +3 -3
- udata/static/chunks/{11.51d706fb9521c16976bc.js.map → 11.822f6ccb39c92c796d13.js.map} +1 -1
- udata/static/chunks/{13.f29411b06be1883356a3.js → 13.d9c1735d14038b94c17e.js} +2 -2
- udata/static/chunks/{13.f29411b06be1883356a3.js.map → 13.d9c1735d14038b94c17e.js.map} +1 -1
- udata/static/chunks/{17.3bd0340930d4a314ce9c.js → 17.81c57c0dedf812e43013.js} +2 -2
- udata/static/chunks/{17.3bd0340930d4a314ce9c.js.map → 17.81c57c0dedf812e43013.js.map} +1 -1
- udata/static/chunks/{8.b966402f5d680d4bdf4a.js → 8.0f42630e6d8ff782928e.js} +2 -2
- udata/static/chunks/{8.b966402f5d680d4bdf4a.js.map → 8.0f42630e6d8ff782928e.js.map} +1 -1
- udata/static/common.js +1 -1
- udata/static/common.js.map +1 -1
- udata/tasks.py +1 -0
- udata/tests/api/test_datasets_api.py +3 -2
- udata/tests/apiv2/test_me_api.py +2 -2
- udata/tests/apiv2/test_topics.py +457 -127
- udata/tests/dataset/test_dataset_tasks.py +7 -2
- udata/tests/reuse/test_reuse_task.py +9 -0
- udata/tests/search/test_adapter.py +43 -0
- udata/tests/test_topics.py +19 -8
- udata/tests/topic/test_topic_tasks.py +27 -0
- {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/METADATA +4 -2
- {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/RECORD +43 -40
- udata/core/topic/api.py +0 -145
- udata/tests/api/test_topics_api.py +0 -284
- {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/LICENSE +0 -0
- {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/WHEEL +0 -0
- {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/entry_points.txt +0 -0
- {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/top_level.txt +0 -0
udata/core/topic/forms.py
CHANGED
|
@@ -2,9 +2,30 @@ from udata.core.spatial.forms import SpatialCoverageField
|
|
|
2
2
|
from udata.forms import ModelForm, fields, validators
|
|
3
3
|
from udata.i18n import lazy_gettext as _
|
|
4
4
|
|
|
5
|
-
from .models import Topic
|
|
5
|
+
from .models import Topic, TopicElement
|
|
6
6
|
|
|
7
|
-
__all__ = ("TopicForm",)
|
|
7
|
+
__all__ = ("TopicForm", "TopicElementForm")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TopicElementForm(ModelForm):
    """Form describing a single ``TopicElement``.

    Besides the per-field validation inherited from ``ModelForm``, the form
    requires that at least one of ``title`` or ``element`` carries a value.
    """

    model_class = TopicElement

    title = fields.StringField(_("Title"))
    description = fields.StringField(_("Description"))
    tags = fields.TagField(_("Tags"))
    extras = fields.ExtrasField()
    element = fields.ModelField(_("Element"))

    def validate(self, extra_validators=None):
        """Run the inherited validation, then check for a title or an element.

        An element without a nested ``element`` is accepted as long as it has
        a title (the "placeholder" use case). When both are missing, an error
        is attached to the ``element`` field and ``False`` is returned,
        whatever the outcome of the inherited validation.
        """
        is_valid = super().validate(extra_validators)
        if self.element.data or self.title.data:
            return is_valid
        self.element.errors.append(_("A topic element must have a title or an element."))
        return False
|
|
8
29
|
|
|
9
30
|
|
|
10
31
|
class TopicForm(ModelForm):
|
|
@@ -16,8 +37,41 @@ class TopicForm(ModelForm):
|
|
|
16
37
|
name = fields.StringField(_("Name"), [validators.DataRequired()])
|
|
17
38
|
description = fields.MarkdownField(_("Description"), [validators.DataRequired()])
|
|
18
39
|
|
|
19
|
-
|
|
20
|
-
|
|
40
|
+
elements = fields.NestedModelList(TopicElementForm)
|
|
41
|
+
|
|
42
|
+
@property
def data(self):
    """Return the form data without the ``elements`` entry.

    ``elements`` is handled separately in ``save``, so it is removed from a
    copy of the mapping produced by the parent class.
    """
    model_data = dict(super().data)
    model_data.pop("elements", None)
    return model_data
|
|
49
|
+
|
|
50
|
+
def populate_obj(self, obj):
    """Copy every field onto ``obj`` except ``elements``.

    ``elements`` is deliberately left out here; ``save`` creates the
    corresponding ``TopicElement`` documents itself.
    """
    for field_name, form_field in self._fields.items():
        if field_name == "elements":
            continue
        form_field.populate_obj(obj, field_name)
|
|
56
|
+
|
|
57
|
+
def save(self, commit=True, **kwargs):
    """Save the topic, then build a ``TopicElement`` for each submitted element.

    The nested payloads are read before delegating to the parent ``save``.
    Each payload is re-validated through ``TopicElementForm``; payloads that
    fail validation are skipped without raising. Elements are persisted only
    when ``commit`` is true.

    Returns the topic produced by the parent ``save``.
    """
    submitted_elements = self.elements.data or []

    # The parent save ignores ``elements`` thanks to ``populate_obj``.
    topic = super().save(commit=commit, **kwargs)

    for payload in submitted_elements:
        nested_form = TopicElementForm(data=payload)
        if not nested_form.validate():
            continue
        element = nested_form.save(commit=False)
        element.topic = topic
        if commit:
            element.save()

    return topic
|
|
21
75
|
|
|
22
76
|
spatial = SpatialCoverageField(
|
|
23
77
|
_("Spatial coverage"), description=_("The geographical area covered by the data.")
|
udata/core/topic/models.py
CHANGED
|
@@ -1,13 +1,47 @@
|
|
|
1
1
|
from blinker import Signal
|
|
2
|
-
from mongoengine.signals import
|
|
2
|
+
from mongoengine.signals import post_delete, post_save
|
|
3
3
|
|
|
4
4
|
from udata.api_fields import field
|
|
5
5
|
from udata.core.activity.models import Auditable
|
|
6
|
+
from udata.core.dataset.models import Dataset
|
|
6
7
|
from udata.core.owned import Owned, OwnedQuerySet
|
|
8
|
+
from udata.core.reuse.models import Reuse
|
|
7
9
|
from udata.models import SpatialCoverage, db
|
|
8
10
|
from udata.search import reindex
|
|
11
|
+
from udata.tasks import as_task_param
|
|
9
12
|
|
|
10
|
-
__all__ = ("Topic",)
|
|
13
|
+
__all__ = ("Topic", "TopicElement")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TopicElement(db.Document):
    """Document attaching an optional Dataset or Reuse to a topic.

    Every field is optional at the model level. ``title`` and ``description``
    share a text index, and saving or deleting an element schedules a reindex
    of the object it references.
    """

    title = field(db.StringField(required=False))
    description = field(db.StringField(required=False))
    tags = field(db.ListField(db.StringField()))
    extras = field(db.ExtrasField())
    element = field(db.GenericReferenceField(choices=[Dataset, Reuse]))
    # Not required, so that forms can build an element with commit=False
    # before the owning topic is assigned.
    topic = field(db.ReferenceField("Topic", required=False))

    meta = {
        "indexes": [{"fields": ["$title", "$description"]}],
        "auto_create_index_on_save": True,
    }

    @classmethod
    def post_save(cls, sender, document, **kwargs):
        """Schedule a reindex of the referenced object after a save."""
        if not (document.topic and document.element and hasattr(document.element, "id")):
            return
        reindex.delay(*as_task_param(document.element))

    @classmethod
    def post_delete(cls, sender, document, **kwargs):
        """Schedule a reindex of the referenced object after a delete."""
        if not (document.topic and document.element and hasattr(document.element, "id")):
            return
        reindex.delay(*as_task_param(document.element))
|
|
11
45
|
|
|
12
46
|
|
|
13
47
|
class Topic(db.Datetimed, Auditable, db.Document, Owned):
|
|
@@ -20,9 +54,6 @@ class Topic(db.Datetimed, Auditable, db.Document, Owned):
|
|
|
20
54
|
tags = field(db.ListField(db.StringField()))
|
|
21
55
|
color = field(db.IntField())
|
|
22
56
|
|
|
23
|
-
datasets = field(db.ListField(db.LazyReferenceField("Dataset", reverse_delete_rule=db.PULL)))
|
|
24
|
-
reuses = field(db.ListField(db.LazyReferenceField("Reuse", reverse_delete_rule=db.PULL)))
|
|
25
|
-
|
|
26
57
|
featured = field(db.BooleanField(default=False), auditable=False)
|
|
27
58
|
private = field(db.BooleanField())
|
|
28
59
|
extras = field(db.ExtrasField(), auditable=False)
|
|
@@ -30,7 +61,14 @@ class Topic(db.Datetimed, Auditable, db.Document, Owned):
|
|
|
30
61
|
spatial = field(db.EmbeddedDocumentField(SpatialCoverage))
|
|
31
62
|
|
|
32
63
|
meta = {
|
|
33
|
-
"indexes": [
|
|
64
|
+
"indexes": [
|
|
65
|
+
{
|
|
66
|
+
"fields": ["$name", "$description"],
|
|
67
|
+
},
|
|
68
|
+
"created_at",
|
|
69
|
+
"slug",
|
|
70
|
+
]
|
|
71
|
+
+ Owned.meta["indexes"],
|
|
34
72
|
"ordering": ["-created_at"],
|
|
35
73
|
"auto_create_index_on_save": True,
|
|
36
74
|
"queryset_class": OwnedQuerySet,
|
|
@@ -43,27 +81,34 @@ class Topic(db.Datetimed, Auditable, db.Document, Owned):
|
|
|
43
81
|
def __str__(self):
|
|
44
82
|
return self.name
|
|
45
83
|
|
|
46
|
-
@classmethod
|
|
47
|
-
def pre_save(cls, sender, document, **kwargs):
|
|
48
|
-
# Try catch is to prevent the mechanism to crash at the
|
|
49
|
-
# creation of the Topic, where an original state does not exist.
|
|
50
|
-
try:
|
|
51
|
-
original_doc = sender.objects.get(id=document.id)
|
|
52
|
-
# Get the diff between the original and current datasets
|
|
53
|
-
datasets_list_dif = set(original_doc.datasets) ^ set(document.datasets)
|
|
54
|
-
except cls.DoesNotExist:
|
|
55
|
-
datasets_list_dif = document.datasets
|
|
56
|
-
for dataset in datasets_list_dif:
|
|
57
|
-
reindex.delay("Dataset", str(dataset.pk))
|
|
58
|
-
|
|
59
84
|
def count_discussions(self):
|
|
60
85
|
# There are no metrics on Topic to store discussions count
|
|
61
86
|
pass
|
|
62
87
|
|
|
88
|
+
@property
def elements(self):
    """Queryset of the ``TopicElement`` documents whose ``topic`` is this topic."""
    related_elements = TopicElement.objects(topic=self)
    return related_elements
|
|
92
|
+
|
|
93
|
+
def get_nested_elements_ids(self, cls: str) -> set[str]:
    """Collect the ids of the objects referenced by this topic's elements.

    Only elements whose generic reference has ``_cls`` equal to ``cls`` are
    considered. The query fetches just the ``element`` field as raw pymongo
    documents, without dereferencing.

    Returns an empty set when the topic has no id yet.
    """
    if not self.id:
        return set()

    raw_elements = (
        TopicElement.objects.filter(topic=self, __raw__={"element._cls": cls})
        .fields(element=1)
        .no_dereference()
        .as_pymongo()
    )
    return {str(raw["element"]["_ref"].id) for raw in raw_elements}
|
|
106
|
+
|
|
63
107
|
def self_web_url(self, **kwargs):
|
|
64
108
|
# Useful for Discussions to call self_web_url on their `subject`
|
|
65
109
|
return None
|
|
66
110
|
|
|
67
111
|
|
|
68
|
-
pre_save.connect(Topic.pre_save, sender=Topic)
|
|
69
112
|
post_save.connect(Topic.post_save, sender=Topic)
|
|
113
|
+
post_save.connect(TopicElement.post_save, sender=TopicElement)
|
|
114
|
+
post_delete.connect(TopicElement.post_delete, sender=TopicElement)
|
udata/core/topic/parsers.py
CHANGED
|
@@ -3,6 +3,46 @@ from flask_restx.inputs import boolean
|
|
|
3
3
|
|
|
4
4
|
from udata.api import api
|
|
5
5
|
from udata.api.parsers import ModelApiParser
|
|
6
|
+
from udata.core.topic import DEFAULT_PAGE_SIZE
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TopicElementsParser(ModelApiParser):
    """Request parser for listing the elements of a topic.

    Declares the ``page``, ``page_size``, ``class``, ``q`` and ``tag`` query
    string arguments and applies the matching filters to a queryset.
    """

    def __init__(self):
        super().__init__()
        self.parser.add_argument(
            "page", type=int, default=1, location="args", help="The page to fetch"
        )
        self.parser.add_argument(
            "page_size",
            type=int,
            default=DEFAULT_PAGE_SIZE,
            location="args",
            help="The page size to fetch",
        )
        self.parser.add_argument(
            "class",
            type=str,
            location="args",
            help="The class of elements to fetch (eg. Dataset or Reuse)",
        )
        self.parser.add_argument(
            "q", type=str, location="args", help="query string to search through elements"
        )
        self.parser.add_argument("tag", type=str, location="args", action="append")

    @staticmethod
    def parse_filters(elements, args):
        """Apply the ``q``, ``tag`` and ``class`` filters to ``elements``.

        :param elements: object exposing ``search_text`` and ``filter``
            (presumably a ``TopicElement`` queryset; confirm against callers).
        :param args: mapping of parsed request arguments.
        :returns: the filtered ``elements``.
        """
        if args.get("q"):
            # Split on any whitespace run: splitting on a single literal space
            # yields empty strings for repeated, leading or trailing spaces,
            # which ended up as empty quoted phrases ("") in the text query.
            terms = args["q"].split()
            if terms:
                # Every term is quoted so that it is searched as a phrase.
                phrase_query = " ".join(f'"{term}"' for term in terms)
                elements = elements.search_text(phrase_query)
        if args.get("tag"):
            elements = elements.filter(tags__all=args["tag"])
        if element_class := args.get("class"):
            if element_class == "None":
                # The literal string "None" selects elements without a linked object.
                elements = elements.filter(element=None)
            else:
                elements = elements.filter(__raw__={"element._cls": element_class})
        return elements
|
|
6
46
|
|
|
7
47
|
|
|
8
48
|
class TopicApiParser(ModelApiParser):
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from udata.core.topic.models import TopicElement
|
|
2
|
+
from udata.tasks import job
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@job("purge-topics-elements")
def purge_topics_elements(self):
    """
    Delete the topic elements that have neither a title nor an element.

    This should run *after* the purge-reuses and purge-datasets jobs.
    """
    empty_elements = TopicElement.objects().filter(element=None, title=None)
    empty_elements.delete()
|
udata/forms/fields.py
CHANGED
|
@@ -480,7 +480,14 @@ class ModelField(Field):
|
|
|
480
480
|
if not valuelist or len(valuelist) != 1 or not valuelist[0]:
|
|
481
481
|
return
|
|
482
482
|
specs = valuelist[0]
|
|
483
|
-
|
|
483
|
+
|
|
484
|
+
try:
|
|
485
|
+
model_field = getattr(self._form.model_class, self.name)
|
|
486
|
+
# Handle the case where the field is not fetchable in the model via self.name
|
|
487
|
+
# This can happen in nested forms like NestedModelList, where self.name is {parent}-{index}-{short_name}
|
|
488
|
+
except AttributeError:
|
|
489
|
+
model_field = getattr(self._form.model_class, self.short_name)
|
|
490
|
+
|
|
484
491
|
if isinstance(specs, str):
|
|
485
492
|
specs = {"id": specs}
|
|
486
493
|
elif not specs.get("id", None):
|
udata/harvest/backends/dcat.py
CHANGED
|
@@ -2,10 +2,10 @@ import logging
|
|
|
2
2
|
from datetime import date
|
|
3
3
|
from typing import ClassVar, Generator
|
|
4
4
|
|
|
5
|
-
import lxml.etree as ET
|
|
6
5
|
from flask import current_app
|
|
7
6
|
from rdflib import Graph
|
|
8
7
|
from rdflib.namespace import RDF
|
|
8
|
+
from saxonche import PySaxonProcessor, PyXdmNode
|
|
9
9
|
from typing_extensions import override
|
|
10
10
|
|
|
11
11
|
from udata.core.dataservices.rdf import dataservice_from_rdf
|
|
@@ -47,7 +47,6 @@ KNOWN_PAGINATION = (
|
|
|
47
47
|
)
|
|
48
48
|
|
|
49
49
|
CSW_NAMESPACE = "http://www.opengis.net/cat/csw/2.0.2"
|
|
50
|
-
OWS_NAMESPACE = "http://www.opengis.net/ows"
|
|
51
50
|
|
|
52
51
|
# Useful to patch essential failing URIs
|
|
53
52
|
URIS_TO_REPLACE = {
|
|
@@ -325,9 +324,23 @@ class CswDcatBackend(DcatBackend):
|
|
|
325
324
|
|
|
326
325
|
CSW_OUTPUT_SCHEMA = "http://www.w3.org/ns/dcat#"
|
|
327
326
|
|
|
327
|
+
SAXON_SECURITY_FEATURES = {
|
|
328
|
+
"http://saxon.sf.net/feature/allow-external-functions": "false",
|
|
329
|
+
"http://saxon.sf.net/feature/parserFeature?uri=http://apache.org/xml/features/nonvalidating/load-external-dtd": "false",
|
|
330
|
+
"http://saxon.sf.net/feature/parserFeature?uri=http://xml.org/sax/features/external-general-entities": "false",
|
|
331
|
+
"http://saxon.sf.net/feature/parserFeature?uri=http://xml.org/sax/features/external-parameter-entities": "false",
|
|
332
|
+
}
|
|
333
|
+
|
|
328
334
|
def __init__(self, *args, **kwargs):
|
|
329
335
|
super().__init__(*args, **kwargs)
|
|
330
|
-
self.
|
|
336
|
+
self.saxon_proc = PySaxonProcessor(license=False)
|
|
337
|
+
for feature, value in self.SAXON_SECURITY_FEATURES.items():
|
|
338
|
+
self.saxon_proc.set_configuration_property(feature, value)
|
|
339
|
+
self.saxon_proc.set_configuration_property(
|
|
340
|
+
"http://saxon.sf.net/feature/strip-whitespace", "all"
|
|
341
|
+
)
|
|
342
|
+
self.xpath_proc = self.saxon_proc.new_xpath_processor()
|
|
343
|
+
self.xpath_proc.declare_namespace("csw", CSW_NAMESPACE)
|
|
331
344
|
|
|
332
345
|
def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
|
|
333
346
|
"""
|
|
@@ -341,19 +354,23 @@ class CswDcatBackend(DcatBackend):
|
|
|
341
354
|
response = self.post(url, data=data, headers={"Content-Type": "application/xml"})
|
|
342
355
|
response.raise_for_status()
|
|
343
356
|
|
|
344
|
-
|
|
345
|
-
tree =
|
|
346
|
-
|
|
347
|
-
raise ValueError(f"Failed to query CSW:\n{content}")
|
|
357
|
+
text = response.text
|
|
358
|
+
tree = self.saxon_proc.parse_xml(xml_text=text)
|
|
359
|
+
self.xpath_proc.set_context(xdm_item=tree)
|
|
348
360
|
|
|
349
|
-
|
|
350
|
-
if
|
|
361
|
+
# Using * namespace so we don't have to enumerate ows versions
|
|
362
|
+
if self.xpath_proc.evaluate("/*:ExceptionReport"):
|
|
363
|
+
raise ValueError(f"Failed to query CSW:\n{text}")
|
|
364
|
+
|
|
365
|
+
if r := self.xpath_proc.evaluate("/csw:GetRecordsResponse/csw:SearchResults"):
|
|
366
|
+
search_results = r.head
|
|
367
|
+
else:
|
|
351
368
|
log.error(f"No search results found for {url} on page {page_number}")
|
|
352
369
|
return
|
|
353
370
|
|
|
354
|
-
for result in search_results:
|
|
371
|
+
for result in search_results.children:
|
|
355
372
|
subgraph = Graph(namespace_manager=namespace_manager)
|
|
356
|
-
doc =
|
|
373
|
+
doc = self.as_dcat(result).to_string("utf-8")
|
|
357
374
|
subgraph.parse(data=doc, format=fmt)
|
|
358
375
|
|
|
359
376
|
if not subgraph.subjects(
|
|
@@ -371,7 +388,7 @@ class CswDcatBackend(DcatBackend):
|
|
|
371
388
|
if not start:
|
|
372
389
|
return
|
|
373
390
|
|
|
374
|
-
def as_dcat(self, tree:
|
|
391
|
+
def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
|
|
375
392
|
"""
|
|
376
393
|
Return the input tree as a DCAT tree.
|
|
377
394
|
For CswDcatBackend, this method return the incoming tree as-is, since it's already DCAT.
|
|
@@ -379,10 +396,10 @@ class CswDcatBackend(DcatBackend):
|
|
|
379
396
|
"""
|
|
380
397
|
return tree
|
|
381
398
|
|
|
382
|
-
def next_position(self, start: int, search_results:
|
|
383
|
-
next_record = int(search_results.
|
|
384
|
-
matched_count = int(search_results.
|
|
385
|
-
returned_count = int(search_results.
|
|
399
|
+
def next_position(self, start: int, search_results: PyXdmNode) -> int | None:
|
|
400
|
+
next_record = int(search_results.get_attribute_value("nextRecord"))
|
|
401
|
+
matched_count = int(search_results.get_attribute_value("numberOfRecordsMatched"))
|
|
402
|
+
returned_count = int(search_results.get_attribute_value("numberOfRecordsReturned"))
|
|
386
403
|
|
|
387
404
|
# Break conditions copied gratefully from
|
|
388
405
|
# noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
|
|
@@ -423,9 +440,13 @@ class CswIso19139DcatBackend(CswDcatBackend):
|
|
|
423
440
|
def __init__(self, *args, **kwargs):
|
|
424
441
|
super().__init__(*args, **kwargs)
|
|
425
442
|
xslt_url = current_app.config["HARVEST_ISO19139_XSLT_URL"]
|
|
426
|
-
|
|
427
|
-
|
|
443
|
+
xslt_text = self.get(xslt_url).text
|
|
444
|
+
xslt_proc = self.saxon_proc.new_xslt30_processor()
|
|
445
|
+
self.xslt_exec = xslt_proc.compile_stylesheet(stylesheet_text=xslt_text)
|
|
446
|
+
self.xslt_exec.set_parameter(
|
|
447
|
+
"CoupledResourceLookUp", self.saxon_proc.make_string_value("disabled")
|
|
448
|
+
)
|
|
428
449
|
|
|
429
450
|
@override
|
|
430
|
-
def as_dcat(self, tree:
|
|
431
|
-
return self.
|
|
451
|
+
def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
|
|
452
|
+
return self.xslt_exec.transform_to_value(xdm_node=tree).head
|
|
@@ -881,6 +881,95 @@ class CswDcatBackendTest:
|
|
|
881
881
|
assert "User-Agent" in get_mock.last_request.headers
|
|
882
882
|
assert get_mock.last_request.headers["User-Agent"] == "uData/0.1 csw-dcat"
|
|
883
883
|
|
|
884
|
+
def test_csw_error(self, rmock):
|
|
885
|
+
exception_report = """<?xml version="1.0" encoding="UTF-8"?>
|
|
886
|
+
<ows:ExceptionReport xmlns:ows="http://www.opengis.net/ows/1.1"
|
|
887
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
888
|
+
xsi:schemaLocation="http://www.opengis.net/ows/1.1 http://schemas.opengis.net/ows/1.1.0/owsExceptionReport.xsd">
|
|
889
|
+
<ows:Exception exceptionCode="MissingParameterValue" locator="request">
|
|
890
|
+
<ows:ExceptionText>Mandatory parameter <request> was not specified</ows:ExceptionText>
|
|
891
|
+
</ows:Exception>
|
|
892
|
+
</ows:ExceptionReport>
|
|
893
|
+
"""
|
|
894
|
+
rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
|
|
895
|
+
rmock.post(rmock.ANY, text=exception_report)
|
|
896
|
+
source = HarvestSourceFactory(backend="csw-dcat")
|
|
897
|
+
|
|
898
|
+
actions.run(source)
|
|
899
|
+
|
|
900
|
+
source.reload()
|
|
901
|
+
job = source.get_last_job()
|
|
902
|
+
|
|
903
|
+
assert len(job.errors) == 1
|
|
904
|
+
assert "Failed to query CSW" in job.errors[0].message
|
|
905
|
+
assert job.status == "failed"
|
|
906
|
+
|
|
907
|
+
def test_disallow_external_entities(self, rmock):
|
|
908
|
+
xml = """<?xml version="1.0" encoding="UTF-8"?>
|
|
909
|
+
<!DOCTYPE root [
|
|
910
|
+
<!ENTITY entity SYSTEM "data:text/plain,EXTERNAL">
|
|
911
|
+
]>
|
|
912
|
+
<csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
|
|
913
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
914
|
+
xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
|
|
915
|
+
<csw:SearchStatus timestamp="2023-03-03T16:09:50.697645Z" />
|
|
916
|
+
<csw:SearchResults numberOfRecordsMatched="1" numberOfRecordsReturned="1" elementSet="full" nextRecord="0">
|
|
917
|
+
<rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
|
918
|
+
<rdf:Description rdf:about="https://example.com/test/">
|
|
919
|
+
<dct:identifier>https://example.com/test/</dct:identifier>
|
|
920
|
+
<rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
|
|
921
|
+
<dct:title>test&entity;</dct:title>
|
|
922
|
+
</rdf:Description>
|
|
923
|
+
</rdf:RDF>
|
|
924
|
+
</csw:SearchResults>
|
|
925
|
+
</csw:GetRecordsResponse>
|
|
926
|
+
"""
|
|
927
|
+
|
|
928
|
+
rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
|
|
929
|
+
rmock.post(rmock.ANY, text=xml)
|
|
930
|
+
source = HarvestSourceFactory(backend="csw-dcat")
|
|
931
|
+
|
|
932
|
+
actions.run(source)
|
|
933
|
+
|
|
934
|
+
source.reload()
|
|
935
|
+
job = source.get_last_job()
|
|
936
|
+
|
|
937
|
+
assert job.status == "done"
|
|
938
|
+
assert Dataset.objects.first().title == "test"
|
|
939
|
+
|
|
940
|
+
def test_disallow_external_dtd(self, rmock):
|
|
941
|
+
xml = """<?xml version="1.0" encoding="UTF-8"?>
|
|
942
|
+
<!DOCTYPE root SYSTEM "http://www.example.com/evil.dtd">
|
|
943
|
+
<csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
|
|
944
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
945
|
+
xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
|
|
946
|
+
<csw:SearchStatus timestamp="2023-03-03T16:09:50.697645Z" />
|
|
947
|
+
<csw:SearchResults numberOfRecordsMatched="1" numberOfRecordsReturned="1" elementSet="full" nextRecord="0">
|
|
948
|
+
<rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
|
949
|
+
<rdf:Description rdf:about="https://example.com/test/">
|
|
950
|
+
<dct:identifier>https://example.com/test/</dct:identifier>
|
|
951
|
+
<rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
|
|
952
|
+
<dct:title>test</dct:title>
|
|
953
|
+
</rdf:Description>
|
|
954
|
+
</rdf:RDF>
|
|
955
|
+
</csw:SearchResults>
|
|
956
|
+
</csw:GetRecordsResponse>
|
|
957
|
+
"""
|
|
958
|
+
|
|
959
|
+
rmock.get("http://www.example.com/evil.dtd", status_code=404)
|
|
960
|
+
rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
|
|
961
|
+
rmock.post(rmock.ANY, text=xml)
|
|
962
|
+
source = HarvestSourceFactory(backend="csw-dcat")
|
|
963
|
+
|
|
964
|
+
actions.run(source)
|
|
965
|
+
|
|
966
|
+
source.reload()
|
|
967
|
+
job = source.get_last_job()
|
|
968
|
+
|
|
969
|
+
assert not any(h.method == "GET" for h in rmock.request_history)
|
|
970
|
+
assert job.status == "done"
|
|
971
|
+
assert len(job.items) == 1
|
|
972
|
+
|
|
884
973
|
|
|
885
974
|
@pytest.mark.usefixtures("clean_db")
|
|
886
975
|
@pytest.mark.options(PLUGINS=["csw"])
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Migrate topic.datasets and topics.reuses to topic.elements with TopicElement.topic references"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from bson import DBRef, ObjectId
|
|
6
|
+
from mongoengine.connection import get_db
|
|
7
|
+
|
|
8
|
+
log = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def migrate(db):
    """Turn each topic's ``datasets`` and ``reuses`` lists into ``topic_element`` documents.

    For every topic, one ``topic_element`` document is inserted per listed
    dataset, then per listed reuse, each pointing back at the topic. The two
    legacy fields are then unset on the topic document.

    The ``db`` argument is not used; the connection comes from ``get_db()``.
    """
    log.info("Processing topics…")

    database = get_db()
    # (legacy list field on the topic, generic reference class, referenced collection)
    legacy_links = (
        ("datasets", "Dataset", "dataset"),
        ("reuses", "Reuse", "reuse"),
    )

    for topic in database.topic.find():
        log.info(f"Processing topic {topic['_id']}…")
        total_elements = 0

        for legacy_field, class_name, collection_name in legacy_links:
            for target_id in topic.get(legacy_field, []):
                database.topic_element.insert_one(
                    {
                        "_id": ObjectId(),
                        "tags": [],
                        "extras": {},
                        "element": {
                            "_cls": class_name,
                            "_ref": DBRef(collection_name, target_id),
                        },
                        "topic": topic["_id"],  # Reference to the topic
                    }
                )
                total_elements += 1

        log.info(f"Topic: {topic.get('name', 'Unnamed')} (ID: {topic['_id']})")
        log.info(f" - Converting {len(topic.get('datasets', []))} datasets")
        log.info(f" - Converting {len(topic.get('reuses', []))} reuses")
        log.info(f" - Total elements: {total_elements}")

        # The legacy list fields are dropped once their content has been converted.
        database.topic.update_one(
            {"_id": topic["_id"]},
            {"$unset": {"datasets": 1, "reuses": 1}},
        )
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Delete Topic index 'name_text'"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from mongoengine.connection import get_db
|
|
6
|
+
from pymongo.errors import OperationFailure
|
|
7
|
+
|
|
8
|
+
log = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def migrate(db):
    """Drop the ``name_text`` index from the ``topic`` collection.

    An ``OperationFailure`` raised by the drop is logged at info level with
    its traceback and otherwise ignored. The ``db`` argument is not used.
    """
    log.info("Deleting index…")

    topic_collection = get_db().topic
    try:
        topic_collection.drop_index("name_text")
    except OperationFailure:
        log.info("Index does not exist?", exc_info=True)
|