udata 10.9.0__py2.py3-none-any.whl → 10.9.1.dev37499__py2.py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of udata might be problematic. Click here for more details.

Files changed (21) hide show
  1. udata/__init__.py +1 -1
  2. udata/harvest/backends/dcat.py +41 -20
  3. udata/harvest/tests/test_dcat_backend.py +89 -0
  4. udata/static/chunks/{11.b6f741fcc366abfad9c4.js → 11.0f04e49a40a0a381bcce.js} +3 -3
  5. udata/static/chunks/{11.b6f741fcc366abfad9c4.js.map → 11.0f04e49a40a0a381bcce.js.map} +1 -1
  6. udata/static/chunks/{13.2d06442dd9a05d9777b5.js → 13.d9c1735d14038b94c17e.js} +2 -2
  7. udata/static/chunks/{13.2d06442dd9a05d9777b5.js.map → 13.d9c1735d14038b94c17e.js.map} +1 -1
  8. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js → 17.81c57c0dedf812e43013.js} +2 -2
  9. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js.map → 17.81c57c0dedf812e43013.js.map} +1 -1
  10. udata/static/chunks/{19.f03a102365af4315f9db.js → 19.8da42e8359d72afc2618.js} +3 -3
  11. udata/static/chunks/{19.f03a102365af4315f9db.js.map → 19.8da42e8359d72afc2618.js.map} +1 -1
  12. udata/static/chunks/{8.778091d55cd8ea39af6b.js → 8.494b003a94383b142c18.js} +2 -2
  13. udata/static/chunks/{8.778091d55cd8ea39af6b.js.map → 8.494b003a94383b142c18.js.map} +1 -1
  14. udata/static/common.js +1 -1
  15. udata/static/common.js.map +1 -1
  16. {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/METADATA +6 -1
  17. {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/RECORD +21 -21
  18. {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/LICENSE +0 -0
  19. {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/WHEEL +0 -0
  20. {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/entry_points.txt +0 -0
  21. {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/top_level.txt +0 -0
udata/__init__.py CHANGED
@@ -4,5 +4,5 @@
4
4
  udata
5
5
  """
6
6
 
7
- __version__ = "10.9.0"
7
+ __version__ = "10.9.1.dev"
8
8
  __description__ = "Open data portal"
udata/harvest/backends/dcat.py CHANGED
@@ -2,10 +2,10 @@ import logging
2
2
  from datetime import date
3
3
  from typing import ClassVar, Generator
4
4
 
5
- import lxml.etree as ET
6
5
  from flask import current_app
7
6
  from rdflib import Graph
8
7
  from rdflib.namespace import RDF
8
+ from saxonche import PySaxonProcessor, PyXdmNode
9
9
  from typing_extensions import override
10
10
 
11
11
  from udata.core.dataservices.rdf import dataservice_from_rdf
@@ -47,7 +47,6 @@ KNOWN_PAGINATION = (
47
47
  )
48
48
 
49
49
  CSW_NAMESPACE = "http://www.opengis.net/cat/csw/2.0.2"
50
- OWS_NAMESPACE = "http://www.opengis.net/ows"
51
50
 
52
51
  # Useful to patch essential failing URIs
53
52
  URIS_TO_REPLACE = {
@@ -325,9 +324,23 @@ class CswDcatBackend(DcatBackend):
325
324
 
326
325
  CSW_OUTPUT_SCHEMA = "http://www.w3.org/ns/dcat#"
327
326
 
327
+ SAXON_SECURITY_FEATURES = {
328
+ "http://saxon.sf.net/feature/allow-external-functions": "false",
329
+ "http://saxon.sf.net/feature/parserFeature?uri=http://apache.org/xml/features/nonvalidating/load-external-dtd": "false",
330
+ "http://saxon.sf.net/feature/parserFeature?uri=http://xml.org/sax/features/external-general-entities": "false",
331
+ "http://saxon.sf.net/feature/parserFeature?uri=http://xml.org/sax/features/external-parameter-entities": "false",
332
+ }
333
+
328
334
  def __init__(self, *args, **kwargs):
329
335
  super().__init__(*args, **kwargs)
330
- self.xml_parser = ET.XMLParser(resolve_entities=False)
336
+ self.saxon_proc = PySaxonProcessor(license=False)
337
+ for feature, value in self.SAXON_SECURITY_FEATURES.items():
338
+ self.saxon_proc.set_configuration_property(feature, value)
339
+ self.saxon_proc.set_configuration_property(
340
+ "http://saxon.sf.net/feature/strip-whitespace", "all"
341
+ )
342
+ self.xpath_proc = self.saxon_proc.new_xpath_processor()
343
+ self.xpath_proc.declare_namespace("csw", CSW_NAMESPACE)
331
344
 
332
345
  def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
333
346
  """
@@ -341,19 +354,23 @@ class CswDcatBackend(DcatBackend):
341
354
  response = self.post(url, data=data, headers={"Content-Type": "application/xml"})
342
355
  response.raise_for_status()
343
356
 
344
- content = response.content
345
- tree = ET.fromstring(content, parser=self.xml_parser)
346
- if tree.tag == "{" + OWS_NAMESPACE + "}ExceptionReport":
347
- raise ValueError(f"Failed to query CSW:\n{content}")
357
+ text = response.text
358
+ tree = self.saxon_proc.parse_xml(xml_text=text)
359
+ self.xpath_proc.set_context(xdm_item=tree)
348
360
 
349
- search_results = tree.find("csw:SearchResults", {"csw": CSW_NAMESPACE})
350
- if not search_results:
361
+ # Using * namespace so we don't have to enumerate ows versions
362
+ if self.xpath_proc.evaluate("/*:ExceptionReport"):
363
+ raise ValueError(f"Failed to query CSW:\n{text}")
364
+
365
+ if r := self.xpath_proc.evaluate("/csw:GetRecordsResponse/csw:SearchResults"):
366
+ search_results = r.head
367
+ else:
351
368
  log.error(f"No search results found for {url} on page {page_number}")
352
369
  return
353
370
 
354
- for result in search_results:
371
+ for result in search_results.children:
355
372
  subgraph = Graph(namespace_manager=namespace_manager)
356
- doc = ET.tostring(self.as_dcat(result))
373
+ doc = self.as_dcat(result).to_string("utf-8")
357
374
  subgraph.parse(data=doc, format=fmt)
358
375
 
359
376
  if not subgraph.subjects(
@@ -371,7 +388,7 @@ class CswDcatBackend(DcatBackend):
371
388
  if not start:
372
389
  return
373
390
 
374
- def as_dcat(self, tree: ET._Element) -> ET._Element:
391
+ def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
375
392
  """
376
393
  Return the input tree as a DCAT tree.
377
394
  For CswDcatBackend, this method return the incoming tree as-is, since it's already DCAT.
@@ -379,10 +396,10 @@ class CswDcatBackend(DcatBackend):
379
396
  """
380
397
  return tree
381
398
 
382
- def next_position(self, start: int, search_results: ET._Element) -> int | None:
383
- next_record = int(search_results.attrib["nextRecord"])
384
- matched_count = int(search_results.attrib["numberOfRecordsMatched"])
385
- returned_count = int(search_results.attrib["numberOfRecordsReturned"])
399
+ def next_position(self, start: int, search_results: PyXdmNode) -> int | None:
400
+ next_record = int(search_results.get_attribute_value("nextRecord"))
401
+ matched_count = int(search_results.get_attribute_value("numberOfRecordsMatched"))
402
+ returned_count = int(search_results.get_attribute_value("numberOfRecordsReturned"))
386
403
 
387
404
  # Break conditions copied gratefully from
388
405
  # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
@@ -423,9 +440,13 @@ class CswIso19139DcatBackend(CswDcatBackend):
423
440
  def __init__(self, *args, **kwargs):
424
441
  super().__init__(*args, **kwargs)
425
442
  xslt_url = current_app.config["HARVEST_ISO19139_XSLT_URL"]
426
- xslt = ET.fromstring(self.get(xslt_url).content, parser=self.xml_parser)
427
- self.transform = ET.XSLT(xslt)
443
+ xslt_text = self.get(xslt_url).text
444
+ xslt_proc = self.saxon_proc.new_xslt30_processor()
445
+ self.xslt_exec = xslt_proc.compile_stylesheet(stylesheet_text=xslt_text)
446
+ self.xslt_exec.set_parameter(
447
+ "CoupledResourceLookUp", self.saxon_proc.make_string_value("disabled")
448
+ )
428
449
 
429
450
  @override
430
- def as_dcat(self, tree: ET._Element) -> ET._Element:
431
- return self.transform(tree, CoupledResourceLookUp="'disabled'")
451
+ def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
452
+ return self.xslt_exec.transform_to_value(xdm_node=tree).head
udata/harvest/tests/test_dcat_backend.py CHANGED
@@ -881,6 +881,95 @@ class CswDcatBackendTest:
881
881
  assert "User-Agent" in get_mock.last_request.headers
882
882
  assert get_mock.last_request.headers["User-Agent"] == "uData/0.1 csw-dcat"
883
883
 
884
+ def test_csw_error(self, rmock):
885
+ exception_report = """<?xml version="1.0" encoding="UTF-8"?>
886
+ <ows:ExceptionReport xmlns:ows="http://www.opengis.net/ows/1.1"
887
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
888
+ xsi:schemaLocation="http://www.opengis.net/ows/1.1 http://schemas.opengis.net/ows/1.1.0/owsExceptionReport.xsd">
889
+ <ows:Exception exceptionCode="MissingParameterValue" locator="request">
890
+ <ows:ExceptionText>Mandatory parameter &lt;request&gt; was not specified</ows:ExceptionText>
891
+ </ows:Exception>
892
+ </ows:ExceptionReport>
893
+ """
894
+ rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
895
+ rmock.post(rmock.ANY, text=exception_report)
896
+ source = HarvestSourceFactory(backend="csw-dcat")
897
+
898
+ actions.run(source)
899
+
900
+ source.reload()
901
+ job = source.get_last_job()
902
+
903
+ assert len(job.errors) == 1
904
+ assert "Failed to query CSW" in job.errors[0].message
905
+ assert job.status == "failed"
906
+
907
+ def test_disallow_external_entities(self, rmock):
908
+ xml = """<?xml version="1.0" encoding="UTF-8"?>
909
+ <!DOCTYPE root [
910
+ <!ENTITY entity SYSTEM "data:text/plain,EXTERNAL">
911
+ ]>
912
+ <csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
913
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
914
+ xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
915
+ <csw:SearchStatus timestamp="2023-03-03T16:09:50.697645Z" />
916
+ <csw:SearchResults numberOfRecordsMatched="1" numberOfRecordsReturned="1" elementSet="full" nextRecord="0">
917
+ <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
918
+ <rdf:Description rdf:about="https://example.com/test/">
919
+ <dct:identifier>https://example.com/test/</dct:identifier>
920
+ <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
921
+ <dct:title>test&entity;</dct:title>
922
+ </rdf:Description>
923
+ </rdf:RDF>
924
+ </csw:SearchResults>
925
+ </csw:GetRecordsResponse>
926
+ """
927
+
928
+ rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
929
+ rmock.post(rmock.ANY, text=xml)
930
+ source = HarvestSourceFactory(backend="csw-dcat")
931
+
932
+ actions.run(source)
933
+
934
+ source.reload()
935
+ job = source.get_last_job()
936
+
937
+ assert job.status == "done"
938
+ assert Dataset.objects.first().title == "test"
939
+
940
+ def test_disallow_external_dtd(self, rmock):
941
+ xml = """<?xml version="1.0" encoding="UTF-8"?>
942
+ <!DOCTYPE root SYSTEM "http://www.example.com/evil.dtd">
943
+ <csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
944
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
945
+ xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
946
+ <csw:SearchStatus timestamp="2023-03-03T16:09:50.697645Z" />
947
+ <csw:SearchResults numberOfRecordsMatched="1" numberOfRecordsReturned="1" elementSet="full" nextRecord="0">
948
+ <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
949
+ <rdf:Description rdf:about="https://example.com/test/">
950
+ <dct:identifier>https://example.com/test/</dct:identifier>
951
+ <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
952
+ <dct:title>test</dct:title>
953
+ </rdf:Description>
954
+ </rdf:RDF>
955
+ </csw:SearchResults>
956
+ </csw:GetRecordsResponse>
957
+ """
958
+
959
+ rmock.get("http://www.example.com/evil.dtd", status_code=404)
960
+ rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
961
+ rmock.post(rmock.ANY, text=xml)
962
+ source = HarvestSourceFactory(backend="csw-dcat")
963
+
964
+ actions.run(source)
965
+
966
+ source.reload()
967
+ job = source.get_last_job()
968
+
969
+ assert not any(h.method == "GET" for h in rmock.request_history)
970
+ assert job.status == "done"
971
+ assert len(job.items) == 1
972
+
884
973
 
885
974
  @pytest.mark.usefixtures("clean_db")
886
975
  @pytest.mark.options(PLUGINS=["csw"])