udata 10.9.1.dev37462__py2.py3-none-any.whl → 10.9.1.dev37604__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of udata might be problematic.

Files changed (45)
  1. udata/api/__init__.py +0 -1
  2. udata/core/dataset/api.py +1 -1
  3. udata/core/dataset/search.py +5 -2
  4. udata/core/dataset/tasks.py +2 -5
  5. udata/core/reuse/tasks.py +3 -0
  6. udata/core/topic/__init__.py +1 -0
  7. udata/core/topic/api_fields.py +87 -0
  8. udata/core/topic/apiv2.py +116 -194
  9. udata/core/topic/factories.py +69 -8
  10. udata/core/topic/forms.py +58 -4
  11. udata/core/topic/models.py +65 -20
  12. udata/core/topic/parsers.py +40 -0
  13. udata/core/topic/tasks.py +11 -0
  14. udata/forms/fields.py +8 -1
  15. udata/harvest/backends/dcat.py +41 -20
  16. udata/harvest/tests/test_dcat_backend.py +89 -0
  17. udata/migrations/2025-05-26-migrate-topics-to-elements.py +59 -0
  18. udata/migrations/2025-06-02-delete-topic-name-index.py +19 -0
  19. udata/static/chunks/{11.51d706fb9521c16976bc.js → 11.822f6ccb39c92c796d13.js} +3 -3
  20. udata/static/chunks/{11.51d706fb9521c16976bc.js.map → 11.822f6ccb39c92c796d13.js.map} +1 -1
  21. udata/static/chunks/{13.f29411b06be1883356a3.js → 13.d9c1735d14038b94c17e.js} +2 -2
  22. udata/static/chunks/{13.f29411b06be1883356a3.js.map → 13.d9c1735d14038b94c17e.js.map} +1 -1
  23. udata/static/chunks/{17.3bd0340930d4a314ce9c.js → 17.81c57c0dedf812e43013.js} +2 -2
  24. udata/static/chunks/{17.3bd0340930d4a314ce9c.js.map → 17.81c57c0dedf812e43013.js.map} +1 -1
  25. udata/static/chunks/{8.b966402f5d680d4bdf4a.js → 8.0f42630e6d8ff782928e.js} +2 -2
  26. udata/static/chunks/{8.b966402f5d680d4bdf4a.js.map → 8.0f42630e6d8ff782928e.js.map} +1 -1
  27. udata/static/common.js +1 -1
  28. udata/static/common.js.map +1 -1
  29. udata/tasks.py +1 -0
  30. udata/tests/api/test_datasets_api.py +3 -2
  31. udata/tests/apiv2/test_me_api.py +2 -2
  32. udata/tests/apiv2/test_topics.py +457 -127
  33. udata/tests/dataset/test_dataset_tasks.py +7 -2
  34. udata/tests/reuse/test_reuse_task.py +9 -0
  35. udata/tests/search/test_adapter.py +43 -0
  36. udata/tests/test_topics.py +19 -8
  37. udata/tests/topic/test_topic_tasks.py +27 -0
  38. {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/METADATA +4 -2
  39. {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/RECORD +43 -40
  40. udata/core/topic/api.py +0 -145
  41. udata/tests/api/test_topics_api.py +0 -284
  42. {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/LICENSE +0 -0
  43. {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/WHEEL +0 -0
  44. {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/entry_points.txt +0 -0
  45. {udata-10.9.1.dev37462.dist-info → udata-10.9.1.dev37604.dist-info}/top_level.txt +0 -0
udata/core/topic/forms.py CHANGED
@@ -2,9 +2,30 @@ from udata.core.spatial.forms import SpatialCoverageField
 from udata.forms import ModelForm, fields, validators
 from udata.i18n import lazy_gettext as _
 
-from .models import Topic
+from .models import Topic, TopicElement
 
-__all__ = ("TopicForm",)
+__all__ = ("TopicForm", "TopicElementForm")
+
+
+class TopicElementForm(ModelForm):
+    model_class = TopicElement
+
+    title = fields.StringField(_("Title"))
+    description = fields.StringField(_("Description"))
+    tags = fields.TagField(_("Tags"))
+    extras = fields.ExtrasField()
+    element = fields.ModelField(_("Element"))
+
+    def validate(self, extra_validators=None):
+        """
+        Make sure that either title or element is set.
+        (Empty nested element is a valid use case for "placeholder" elements)
+        """
+        validation = super().validate(extra_validators)
+        if not self.element.data and not self.title.data:
+            self.element.errors.append(_("A topic element must have a title or an element."))
+            return False
+        return validation
 
 
 class TopicForm(ModelForm):
@@ -16,8 +37,41 @@ class TopicForm(ModelForm):
     name = fields.StringField(_("Name"), [validators.DataRequired()])
     description = fields.MarkdownField(_("Description"), [validators.DataRequired()])
 
-    datasets = fields.DatasetListField(_("Associated datasets"))
-    reuses = fields.ReuseListField(_("Associated reuses"))
+    elements = fields.NestedModelList(TopicElementForm)
+
+    @property
+    def data(self):
+        """Override to exclude non-model fields from data"""
+        # Get the base data from WTForms
+        base_data = super().data
+        # Filter out non-model fields
+        return {name: value for name, value in base_data.items() if name != "elements"}
+
+    def populate_obj(self, obj):
+        """Override populate_obj to exclude non-model fields"""
+        # Only populate model fields, skip elements
+        for name, field in self._fields.items():
+            if name != "elements":
+                field.populate_obj(obj, name)
+
+    def save(self, commit=True, **kwargs):
+        """Custom save to handle TopicElement creation properly"""
+        # Store elements data before parent save
+        elements_data = self.elements.data
+
+        # Use parent save method (elements field is excluded via populate_obj)
+        topic = super().save(commit=commit, **kwargs)
+
+        # Create elements and associate them with the topic
+        for element_data in elements_data or []:
+            element_form = TopicElementForm(data=element_data)
+            if element_form.validate():
+                element = element_form.save(commit=False)
+                element.topic = topic
+                if commit:
+                    element.save()
+
+        return topic
 
     spatial = SpatialCoverageField(
         _("Spatial coverage"), description=_("The geographical area covered by the data.")
udata/core/topic/models.py CHANGED
@@ -1,13 +1,47 @@
 from blinker import Signal
-from mongoengine.signals import post_save, pre_save
+from mongoengine.signals import post_delete, post_save
 
 from udata.api_fields import field
 from udata.core.activity.models import Auditable
+from udata.core.dataset.models import Dataset
 from udata.core.owned import Owned, OwnedQuerySet
+from udata.core.reuse.models import Reuse
 from udata.models import SpatialCoverage, db
 from udata.search import reindex
+from udata.tasks import as_task_param
 
-__all__ = ("Topic",)
+__all__ = ("Topic", "TopicElement")
+
+
+class TopicElement(db.Document):
+    title = field(db.StringField(required=False))
+    description = field(db.StringField(required=False))
+    tags = field(db.ListField(db.StringField()))
+    extras = field(db.ExtrasField())
+    element = field(db.GenericReferenceField(choices=[Dataset, Reuse]))
+    # Made optional to allow proper form handling with commit=False
+    topic = field(db.ReferenceField("Topic", required=False))
+
+    meta = {
+        "indexes": [
+            {
+                "fields": ["$title", "$description"],
+            }
+        ],
+        "auto_create_index_on_save": True,
+    }
+
+    @classmethod
+    def post_save(cls, sender, document, **kwargs):
+        """Trigger reindex when element is saved"""
+        if document.topic and document.element and hasattr(document.element, "id"):
+            reindex.delay(*as_task_param(document.element))
+
+    @classmethod
+    def post_delete(cls, sender, document, **kwargs):
+        """Trigger reindex when element is deleted"""
+        if document.topic and document.element and hasattr(document.element, "id"):
+            reindex.delay(*as_task_param(document.element))
 
 
 class Topic(db.Datetimed, Auditable, db.Document, Owned):
@@ -20,9 +54,6 @@ class Topic(db.Datetimed, Auditable, db.Document, Owned):
     tags = field(db.ListField(db.StringField()))
     color = field(db.IntField())
 
-    datasets = field(db.ListField(db.LazyReferenceField("Dataset", reverse_delete_rule=db.PULL)))
-    reuses = field(db.ListField(db.LazyReferenceField("Reuse", reverse_delete_rule=db.PULL)))
-
     featured = field(db.BooleanField(default=False), auditable=False)
     private = field(db.BooleanField())
     extras = field(db.ExtrasField(), auditable=False)
@@ -30,7 +61,14 @@ class Topic(db.Datetimed, Auditable, db.Document, Owned):
     spatial = field(db.EmbeddedDocumentField(SpatialCoverage))
 
     meta = {
-        "indexes": ["$name", "created_at", "slug"] + Owned.meta["indexes"],
+        "indexes": [
+            {
+                "fields": ["$name", "$description"],
+            },
+            "created_at",
+            "slug",
+        ]
+        + Owned.meta["indexes"],
         "ordering": ["-created_at"],
         "auto_create_index_on_save": True,
         "queryset_class": OwnedQuerySet,
@@ -43,27 +81,34 @@ class Topic(db.Datetimed, Auditable, db.Document, Owned):
     def __str__(self):
         return self.name
 
-    @classmethod
-    def pre_save(cls, sender, document, **kwargs):
-        # Try catch is to prevent the mechanism to crash at the
-        # creation of the Topic, where an original state does not exist.
-        try:
-            original_doc = sender.objects.get(id=document.id)
-            # Get the diff between the original and current datasets
-            datasets_list_dif = set(original_doc.datasets) ^ set(document.datasets)
-        except cls.DoesNotExist:
-            datasets_list_dif = document.datasets
-        for dataset in datasets_list_dif:
-            reindex.delay("Dataset", str(dataset.pk))
-
     def count_discussions(self):
         # There are no metrics on Topic to store discussions count
         pass
 
+    @property
+    def elements(self):
+        """Get elements associated with this topic"""
+        return TopicElement.objects(topic=self)
+
+    def get_nested_elements_ids(self, cls: str) -> set[str]:
+        """Optimized query to get objects ids from related elements, filtered by class."""
+        # Return empty set if topic doesn't have an ID yet
+        if not self.id:
+            return set()
+
+        return set(
+            str(elem["element"]["_ref"].id)
+            for elem in TopicElement.objects.filter(topic=self, __raw__={"element._cls": cls})
+            .fields(element=1)
+            .no_dereference()
+            .as_pymongo()
+        )
+
     def self_web_url(self, **kwargs):
         # Useful for Discussions to call self_web_url on their `subject`
         return None
 
 
-pre_save.connect(Topic.pre_save, sender=Topic)
 post_save.connect(Topic.post_save, sender=Topic)
+post_save.connect(TopicElement.post_save, sender=TopicElement)
+post_delete.connect(TopicElement.post_delete, sender=TopicElement)
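
With the embedded datasets/reuses lists gone, associated content is reached through the reverse reference on TopicElement. A hedged sketch of the new accessors (assumes a configured MongoDB connection and existing documents):

    from udata.core.topic.models import Topic

    topic = Topic.objects.first()
    # `elements` is a queryset of TopicElement documents pointing at this topic
    print(topic.elements.count())
    # Class-filtered sets of string ObjectIds, fetched without dereferencing
    dataset_ids = topic.get_nested_elements_ids("Dataset")
    reuse_ids = topic.get_nested_elements_ids("Reuse")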
udata/core/topic/parsers.py CHANGED
@@ -3,6 +3,46 @@ from flask_restx.inputs import boolean
 
 from udata.api import api
 from udata.api.parsers import ModelApiParser
+from udata.core.topic import DEFAULT_PAGE_SIZE
+
+
+class TopicElementsParser(ModelApiParser):
+    def __init__(self):
+        super().__init__()
+        self.parser.add_argument(
+            "page", type=int, default=1, location="args", help="The page to fetch"
+        )
+        self.parser.add_argument(
+            "page_size",
+            type=int,
+            default=DEFAULT_PAGE_SIZE,
+            location="args",
+            help="The page size to fetch",
+        )
+        self.parser.add_argument(
+            "class",
+            type=str,
+            location="args",
+            help="The class of elements to fetch (eg. Dataset or Reuse)",
+        )
+        self.parser.add_argument(
+            "q", type=str, location="args", help="query string to search through elements"
+        )
+        self.parser.add_argument("tag", type=str, location="args", action="append")
+
+    @staticmethod
+    def parse_filters(elements, args):
+        if args.get("q"):
+            phrase_query = " ".join([f'"{elem}"' for elem in args["q"].split(" ")])
+            elements = elements.search_text(phrase_query)
+        if args.get("tag"):
+            elements = elements.filter(tags__all=args["tag"])
+        if element_class := args.get("class"):
+            if element_class == "None":
+                elements = elements.filter(element=None)
+            else:
+                elements = elements.filter(__raw__={"element._cls": element_class})
+        return elements
 
 
 class TopicApiParser(ModelApiParser):
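
How the new filters might compose on a TopicElement queryset, sketched by hand with the argument names the parser registers above ("None" as a class selects placeholder elements with no target):

    from udata.core.topic.models import Topic, TopicElement
    from udata.core.topic.parsers import TopicElementsParser

    topic = Topic.objects.first()
    elements = TopicElement.objects(topic=topic)

    # "q" is turned into a quoted phrase query, "tag" entries must all match,
    # "class" filters on the stored element._cls discriminator.
    args = {"q": "air quality", "tag": ["environment"], "class": "Dataset"}
    filtered = TopicElementsParser.parse_filters(elements, args)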
udata/core/topic/tasks.py CHANGED
@@ -0,0 +1,11 @@
+from udata.core.topic.models import TopicElement
+from udata.tasks import job
+
+
+@job("purge-topics-elements")
+def purge_topics_elements(self):
+    """
+    Purge topic elements that have neither title nor element
+    This should run *after* the purge-reuses and purge-datasets jobs
+    """
+    TopicElement.objects().filter(element=None, title=None).delete()
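
The job can presumably be triggered like any other udata job (command shape assumed from udata's usual CLI, not shown in this diff):

    # Run manually, after purge-reuses and purge-datasets as the docstring advises
    udata job run purge-topics-elements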
udata/forms/fields.py CHANGED
@@ -480,7 +480,14 @@ class ModelField(Field):
         if not valuelist or len(valuelist) != 1 or not valuelist[0]:
             return
         specs = valuelist[0]
-        model_field = getattr(self._form.model_class, self.name)
+
+        try:
+            model_field = getattr(self._form.model_class, self.name)
+        # Handle the case where the field is not fetchable in the model via self.name
+        # This can happen in nested forms like NestedModelList, where self.name is {parent}-{index}-{short_name}
+        except AttributeError:
+            model_field = getattr(self._form.model_class, self.short_name)
+
         if isinstance(specs, str):
             specs = {"id": specs}
         elif not specs.get("id", None):
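
The `{parent}-{index}-{short_name}` naming that the comment describes is standard WTForms behaviour; a standalone illustration (plain WTForms, not udata code):

    from wtforms import FieldList, Form, FormField, StringField

    class Inner(Form):
        element = StringField()

    class Outer(Form):
        elements = FieldList(FormField(Inner), min_entries=1)

    inner = Outer().elements[0].form.element
    print(inner.name)        # "elements-0-element" -> not a model attribute
    print(inner.short_name)  # "element"            -> the actual model field name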
udata/harvest/backends/dcat.py CHANGED
@@ -2,10 +2,10 @@ import logging
 from datetime import date
 from typing import ClassVar, Generator
 
-import lxml.etree as ET
 from flask import current_app
 from rdflib import Graph
 from rdflib.namespace import RDF
+from saxonche import PySaxonProcessor, PyXdmNode
 from typing_extensions import override
 
 from udata.core.dataservices.rdf import dataservice_from_rdf
@@ -47,7 +47,6 @@ KNOWN_PAGINATION = (
 )
 
 CSW_NAMESPACE = "http://www.opengis.net/cat/csw/2.0.2"
-OWS_NAMESPACE = "http://www.opengis.net/ows"
 
 # Useful to patch essential failing URIs
 URIS_TO_REPLACE = {
@@ -325,9 +324,23 @@ class CswDcatBackend(DcatBackend):
 
     CSW_OUTPUT_SCHEMA = "http://www.w3.org/ns/dcat#"
 
+    SAXON_SECURITY_FEATURES = {
+        "http://saxon.sf.net/feature/allow-external-functions": "false",
+        "http://saxon.sf.net/feature/parserFeature?uri=http://apache.org/xml/features/nonvalidating/load-external-dtd": "false",
+        "http://saxon.sf.net/feature/parserFeature?uri=http://xml.org/sax/features/external-general-entities": "false",
+        "http://saxon.sf.net/feature/parserFeature?uri=http://xml.org/sax/features/external-parameter-entities": "false",
+    }
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.xml_parser = ET.XMLParser(resolve_entities=False)
+        self.saxon_proc = PySaxonProcessor(license=False)
+        for feature, value in self.SAXON_SECURITY_FEATURES.items():
+            self.saxon_proc.set_configuration_property(feature, value)
+        self.saxon_proc.set_configuration_property(
+            "http://saxon.sf.net/feature/strip-whitespace", "all"
+        )
+        self.xpath_proc = self.saxon_proc.new_xpath_processor()
+        self.xpath_proc.declare_namespace("csw", CSW_NAMESPACE)
 
     def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
         """
@@ -341,19 +354,23 @@
         response = self.post(url, data=data, headers={"Content-Type": "application/xml"})
         response.raise_for_status()
 
-        content = response.content
-        tree = ET.fromstring(content, parser=self.xml_parser)
-        if tree.tag == "{" + OWS_NAMESPACE + "}ExceptionReport":
-            raise ValueError(f"Failed to query CSW:\n{content}")
+        text = response.text
+        tree = self.saxon_proc.parse_xml(xml_text=text)
+        self.xpath_proc.set_context(xdm_item=tree)
 
-        search_results = tree.find("csw:SearchResults", {"csw": CSW_NAMESPACE})
-        if not search_results:
+        # Using * namespace so we don't have to enumerate ows versions
+        if self.xpath_proc.evaluate("/*:ExceptionReport"):
+            raise ValueError(f"Failed to query CSW:\n{text}")
+
+        if r := self.xpath_proc.evaluate("/csw:GetRecordsResponse/csw:SearchResults"):
+            search_results = r.head
+        else:
            log.error(f"No search results found for {url} on page {page_number}")
            return
 
-        for result in search_results:
+        for result in search_results.children:
             subgraph = Graph(namespace_manager=namespace_manager)
-            doc = ET.tostring(self.as_dcat(result))
+            doc = self.as_dcat(result).to_string("utf-8")
             subgraph.parse(data=doc, format=fmt)
 
             if not subgraph.subjects(
@@ -371,7 +388,7 @@
         if not start:
             return
 
-    def as_dcat(self, tree: ET._Element) -> ET._Element:
+    def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
         """
         Return the input tree as a DCAT tree.
         For CswDcatBackend, this method return the incoming tree as-is, since it's already DCAT.
@@ -379,10 +396,10 @@
         """
         return tree
 
-    def next_position(self, start: int, search_results: ET._Element) -> int | None:
-        next_record = int(search_results.attrib["nextRecord"])
-        matched_count = int(search_results.attrib["numberOfRecordsMatched"])
-        returned_count = int(search_results.attrib["numberOfRecordsReturned"])
+    def next_position(self, start: int, search_results: PyXdmNode) -> int | None:
+        next_record = int(search_results.get_attribute_value("nextRecord"))
+        matched_count = int(search_results.get_attribute_value("numberOfRecordsMatched"))
+        returned_count = int(search_results.get_attribute_value("numberOfRecordsReturned"))
 
         # Break conditions copied gratefully from
         # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
@@ -423,9 +440,13 @@ class CswIso19139DcatBackend(CswDcatBackend):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         xslt_url = current_app.config["HARVEST_ISO19139_XSLT_URL"]
-        xslt = ET.fromstring(self.get(xslt_url).content, parser=self.xml_parser)
-        self.transform = ET.XSLT(xslt)
+        xslt_text = self.get(xslt_url).text
+        xslt_proc = self.saxon_proc.new_xslt30_processor()
+        self.xslt_exec = xslt_proc.compile_stylesheet(stylesheet_text=xslt_text)
+        self.xslt_exec.set_parameter(
+            "CoupledResourceLookUp", self.saxon_proc.make_string_value("disabled")
+        )
 
     @override
-    def as_dcat(self, tree: ET._Element) -> ET._Element:
-        return self.transform(tree, CoupledResourceLookUp="'disabled'")
+    def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
+        return self.xslt_exec.transform_to_value(xdm_node=tree).head
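
For reference, the saxonche calls the backend now relies on, in isolation (a sketch mirroring the pattern above; the XML is a stub):

    from saxonche import PySaxonProcessor

    CSW_NS = "http://www.opengis.net/cat/csw/2.0.2"
    xml = (
        f'<csw:GetRecordsResponse xmlns:csw="{CSW_NS}">'
        '<csw:SearchResults numberOfRecordsMatched="1" numberOfRecordsReturned="1" nextRecord="0"/>'
        "</csw:GetRecordsResponse>"
    )

    with PySaxonProcessor(license=False) as proc:
        xpath_proc = proc.new_xpath_processor()
        xpath_proc.declare_namespace("csw", CSW_NS)
        tree = proc.parse_xml(xml_text=xml)
        xpath_proc.set_context(xdm_item=tree)
        if r := xpath_proc.evaluate("/csw:GetRecordsResponse/csw:SearchResults"):
            results = r.head
            print(results.get_attribute_value("nextRecord"))  # "0"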
udata/harvest/tests/test_dcat_backend.py CHANGED
@@ -881,6 +881,95 @@ class CswDcatBackendTest:
         assert "User-Agent" in get_mock.last_request.headers
         assert get_mock.last_request.headers["User-Agent"] == "uData/0.1 csw-dcat"
 
+    def test_csw_error(self, rmock):
+        exception_report = """<?xml version="1.0" encoding="UTF-8"?>
+        <ows:ExceptionReport xmlns:ows="http://www.opengis.net/ows/1.1"
+            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+            xsi:schemaLocation="http://www.opengis.net/ows/1.1 http://schemas.opengis.net/ows/1.1.0/owsExceptionReport.xsd">
+            <ows:Exception exceptionCode="MissingParameterValue" locator="request">
+                <ows:ExceptionText>Mandatory parameter &lt;request&gt; was not specified</ows:ExceptionText>
+            </ows:Exception>
+        </ows:ExceptionReport>
+        """
+        rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
+        rmock.post(rmock.ANY, text=exception_report)
+        source = HarvestSourceFactory(backend="csw-dcat")
+
+        actions.run(source)
+
+        source.reload()
+        job = source.get_last_job()
+
+        assert len(job.errors) == 1
+        assert "Failed to query CSW" in job.errors[0].message
+        assert job.status == "failed"
+
+    def test_disallow_external_entities(self, rmock):
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+        <!DOCTYPE root [
+            <!ENTITY entity SYSTEM "data:text/plain,EXTERNAL">
+        ]>
+        <csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+            xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
+            <csw:SearchStatus timestamp="2023-03-03T16:09:50.697645Z" />
+            <csw:SearchResults numberOfRecordsMatched="1" numberOfRecordsReturned="1" elementSet="full" nextRecord="0">
+                <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+                    <rdf:Description rdf:about="https://example.com/test/">
+                        <dct:identifier>https://example.com/test/</dct:identifier>
+                        <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
+                        <dct:title>test&entity;</dct:title>
+                    </rdf:Description>
+                </rdf:RDF>
+            </csw:SearchResults>
+        </csw:GetRecordsResponse>
+        """
+
+        rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
+        rmock.post(rmock.ANY, text=xml)
+        source = HarvestSourceFactory(backend="csw-dcat")
+
+        actions.run(source)
+
+        source.reload()
+        job = source.get_last_job()
+
+        assert job.status == "done"
+        assert Dataset.objects.first().title == "test"
+
+    def test_disallow_external_dtd(self, rmock):
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+        <!DOCTYPE root SYSTEM "http://www.example.com/evil.dtd">
+        <csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+            xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
+            <csw:SearchStatus timestamp="2023-03-03T16:09:50.697645Z" />
+            <csw:SearchResults numberOfRecordsMatched="1" numberOfRecordsReturned="1" elementSet="full" nextRecord="0">
+                <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+                    <rdf:Description rdf:about="https://example.com/test/">
+                        <dct:identifier>https://example.com/test/</dct:identifier>
+                        <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
+                        <dct:title>test</dct:title>
+                    </rdf:Description>
+                </rdf:RDF>
+            </csw:SearchResults>
+        </csw:GetRecordsResponse>
+        """
+
+        rmock.get("http://www.example.com/evil.dtd", status_code=404)
+        rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
+        rmock.post(rmock.ANY, text=xml)
+        source = HarvestSourceFactory(backend="csw-dcat")
+
+        actions.run(source)
+
+        source.reload()
+        job = source.get_last_job()
+
+        assert not any(h.method == "GET" for h in rmock.request_history)
+        assert job.status == "done"
+        assert len(job.items) == 1
+
 
 @pytest.mark.usefixtures("clean_db")
 @pytest.mark.options(PLUGINS=["csw"])
udata/migrations/2025-05-26-migrate-topics-to-elements.py CHANGED
@@ -0,0 +1,59 @@
+"""Migrate topic.datasets and topics.reuses to topic.elements with TopicElement.topic references"""
+
+import logging
+
+from bson import DBRef, ObjectId
+from mongoengine.connection import get_db
+
+log = logging.getLogger(__name__)
+
+
+def migrate(db):
+    log.info("Processing topics…")
+
+    topics = get_db().topic.find()
+
+    for topic in topics:
+        log.info(f"Processing topic {topic['_id']}…")
+        total_elements = 0
+
+        # Convert datasets to TopicElement documents
+        for dataset_id in topic.get("datasets", []):
+            element_doc = {
+                "_id": ObjectId(),
+                "tags": [],
+                "extras": {},
+                "element": {"_cls": "Dataset", "_ref": DBRef("dataset", dataset_id)},
+                "topic": topic["_id"],  # Reference to the topic
+            }
+
+            # Insert TopicElement document
+            get_db().topic_element.insert_one(element_doc)
+            total_elements += 1
+
+        # Convert reuses to TopicElement documents
+        for reuse_id in topic.get("reuses", []):
+            element_doc = {
+                "_id": ObjectId(),
+                "tags": [],
+                "extras": {},
+                "element": {"_cls": "Reuse", "_ref": DBRef("reuse", reuse_id)},
+                "topic": topic["_id"],  # Reference to the topic
+            }
+
+            # Insert TopicElement document
+            get_db().topic_element.insert_one(element_doc)
+            total_elements += 1
+
+        log.info(f"Topic: {topic.get('name', 'Unnamed')} (ID: {topic['_id']})")
+        log.info(f" - Converting {len(topic.get('datasets', []))} datasets")
+        log.info(f" - Converting {len(topic.get('reuses', []))} reuses")
+        log.info(f" - Total elements: {total_elements}")
+
+        # Remove old fields from topic document
+        get_db().topic.update_one(
+            {"_id": topic["_id"]},
+            {
+                "$unset": {"datasets": 1, "reuses": 1},  # Remove old fields
+            },
+        )
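
A sanity check one might run after this migration (a sketch against the same collections the script touches):

    from mongoengine.connection import get_db

    db = get_db()
    # The embedded lists should be gone from every topic...
    assert db.topic.count_documents({"datasets": {"$exists": True}}) == 0
    assert db.topic.count_documents({"reuses": {"$exists": True}}) == 0
    # ...and each former entry should now be a topic_element document
    print(db.topic_element.count_documents({}), "topic elements")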
udata/migrations/2025-06-02-delete-topic-name-index.py CHANGED
@@ -0,0 +1,19 @@
+"""Delete Topic index 'name_text'"""
+
+import logging
+
+from mongoengine.connection import get_db
+from pymongo.errors import OperationFailure
+
+log = logging.getLogger(__name__)
+
+
+def migrate(db):
+    log.info("Deleting index…")
+
+    collection = get_db().topic
+
+    try:
+        collection.drop_index("name_text")
+    except OperationFailure:
+        log.info("Index does not exist?", exc_info=True)