udata 10.9.0__py2.py3-none-any.whl → 10.9.1.dev37499__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of udata might be problematic. Click here for more details.
- udata/__init__.py +1 -1
- udata/harvest/backends/dcat.py +41 -20
- udata/harvest/tests/test_dcat_backend.py +89 -0
- udata/static/chunks/{11.b6f741fcc366abfad9c4.js → 11.0f04e49a40a0a381bcce.js} +3 -3
- udata/static/chunks/{11.b6f741fcc366abfad9c4.js.map → 11.0f04e49a40a0a381bcce.js.map} +1 -1
- udata/static/chunks/{13.2d06442dd9a05d9777b5.js → 13.d9c1735d14038b94c17e.js} +2 -2
- udata/static/chunks/{13.2d06442dd9a05d9777b5.js.map → 13.d9c1735d14038b94c17e.js.map} +1 -1
- udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js → 17.81c57c0dedf812e43013.js} +2 -2
- udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js.map → 17.81c57c0dedf812e43013.js.map} +1 -1
- udata/static/chunks/{19.f03a102365af4315f9db.js → 19.8da42e8359d72afc2618.js} +3 -3
- udata/static/chunks/{19.f03a102365af4315f9db.js.map → 19.8da42e8359d72afc2618.js.map} +1 -1
- udata/static/chunks/{8.778091d55cd8ea39af6b.js → 8.494b003a94383b142c18.js} +2 -2
- udata/static/chunks/{8.778091d55cd8ea39af6b.js.map → 8.494b003a94383b142c18.js.map} +1 -1
- udata/static/common.js +1 -1
- udata/static/common.js.map +1 -1
- {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/METADATA +6 -1
- {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/RECORD +21 -21
- {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/LICENSE +0 -0
- {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/WHEEL +0 -0
- {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/entry_points.txt +0 -0
- {udata-10.9.0.dist-info → udata-10.9.1.dev37499.dist-info}/top_level.txt +0 -0
udata/__init__.py
CHANGED
udata/harvest/backends/dcat.py
CHANGED
|
@@ -2,10 +2,10 @@ import logging
|
|
|
2
2
|
from datetime import date
|
|
3
3
|
from typing import ClassVar, Generator
|
|
4
4
|
|
|
5
|
-
import lxml.etree as ET
|
|
6
5
|
from flask import current_app
|
|
7
6
|
from rdflib import Graph
|
|
8
7
|
from rdflib.namespace import RDF
|
|
8
|
+
from saxonche import PySaxonProcessor, PyXdmNode
|
|
9
9
|
from typing_extensions import override
|
|
10
10
|
|
|
11
11
|
from udata.core.dataservices.rdf import dataservice_from_rdf
|
|
@@ -47,7 +47,6 @@ KNOWN_PAGINATION = (
|
|
|
47
47
|
)
|
|
48
48
|
|
|
49
49
|
CSW_NAMESPACE = "http://www.opengis.net/cat/csw/2.0.2"
|
|
50
|
-
OWS_NAMESPACE = "http://www.opengis.net/ows"
|
|
51
50
|
|
|
52
51
|
# Useful to patch essential failing URIs
|
|
53
52
|
URIS_TO_REPLACE = {
|
|
@@ -325,9 +324,23 @@ class CswDcatBackend(DcatBackend):
|
|
|
325
324
|
|
|
326
325
|
CSW_OUTPUT_SCHEMA = "http://www.w3.org/ns/dcat#"
|
|
327
326
|
|
|
327
|
+
SAXON_SECURITY_FEATURES = {
|
|
328
|
+
"http://saxon.sf.net/feature/allow-external-functions": "false",
|
|
329
|
+
"http://saxon.sf.net/feature/parserFeature?uri=http://apache.org/xml/features/nonvalidating/load-external-dtd": "false",
|
|
330
|
+
"http://saxon.sf.net/feature/parserFeature?uri=http://xml.org/sax/features/external-general-entities": "false",
|
|
331
|
+
"http://saxon.sf.net/feature/parserFeature?uri=http://xml.org/sax/features/external-parameter-entities": "false",
|
|
332
|
+
}
|
|
333
|
+
|
|
328
334
|
def __init__(self, *args, **kwargs):
|
|
329
335
|
super().__init__(*args, **kwargs)
|
|
330
|
-
self.
|
|
336
|
+
self.saxon_proc = PySaxonProcessor(license=False)
|
|
337
|
+
for feature, value in self.SAXON_SECURITY_FEATURES.items():
|
|
338
|
+
self.saxon_proc.set_configuration_property(feature, value)
|
|
339
|
+
self.saxon_proc.set_configuration_property(
|
|
340
|
+
"http://saxon.sf.net/feature/strip-whitespace", "all"
|
|
341
|
+
)
|
|
342
|
+
self.xpath_proc = self.saxon_proc.new_xpath_processor()
|
|
343
|
+
self.xpath_proc.declare_namespace("csw", CSW_NAMESPACE)
|
|
331
344
|
|
|
332
345
|
def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
|
|
333
346
|
"""
|
|
@@ -341,19 +354,23 @@ class CswDcatBackend(DcatBackend):
|
|
|
341
354
|
response = self.post(url, data=data, headers={"Content-Type": "application/xml"})
|
|
342
355
|
response.raise_for_status()
|
|
343
356
|
|
|
344
|
-
|
|
345
|
-
tree =
|
|
346
|
-
|
|
347
|
-
raise ValueError(f"Failed to query CSW:\n{content}")
|
|
357
|
+
text = response.text
|
|
358
|
+
tree = self.saxon_proc.parse_xml(xml_text=text)
|
|
359
|
+
self.xpath_proc.set_context(xdm_item=tree)
|
|
348
360
|
|
|
349
|
-
|
|
350
|
-
if
|
|
361
|
+
# Using * namespace so we don't have to enumerate ows versions
|
|
362
|
+
if self.xpath_proc.evaluate("/*:ExceptionReport"):
|
|
363
|
+
raise ValueError(f"Failed to query CSW:\n{text}")
|
|
364
|
+
|
|
365
|
+
if r := self.xpath_proc.evaluate("/csw:GetRecordsResponse/csw:SearchResults"):
|
|
366
|
+
search_results = r.head
|
|
367
|
+
else:
|
|
351
368
|
log.error(f"No search results found for {url} on page {page_number}")
|
|
352
369
|
return
|
|
353
370
|
|
|
354
|
-
for result in search_results:
|
|
371
|
+
for result in search_results.children:
|
|
355
372
|
subgraph = Graph(namespace_manager=namespace_manager)
|
|
356
|
-
doc =
|
|
373
|
+
doc = self.as_dcat(result).to_string("utf-8")
|
|
357
374
|
subgraph.parse(data=doc, format=fmt)
|
|
358
375
|
|
|
359
376
|
if not subgraph.subjects(
|
|
@@ -371,7 +388,7 @@ class CswDcatBackend(DcatBackend):
|
|
|
371
388
|
if not start:
|
|
372
389
|
return
|
|
373
390
|
|
|
374
|
-
def as_dcat(self, tree:
|
|
391
|
+
def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
|
|
375
392
|
"""
|
|
376
393
|
Return the input tree as a DCAT tree.
|
|
377
394
|
For CswDcatBackend, this method returns the incoming tree as-is, since it's already DCAT.
|
|
@@ -379,10 +396,10 @@ class CswDcatBackend(DcatBackend):
|
|
|
379
396
|
"""
|
|
380
397
|
return tree
|
|
381
398
|
|
|
382
|
-
def next_position(self, start: int, search_results:
|
|
383
|
-
next_record = int(search_results.
|
|
384
|
-
matched_count = int(search_results.
|
|
385
|
-
returned_count = int(search_results.
|
|
399
|
+
def next_position(self, start: int, search_results: PyXdmNode) -> int | None:
|
|
400
|
+
next_record = int(search_results.get_attribute_value("nextRecord"))
|
|
401
|
+
matched_count = int(search_results.get_attribute_value("numberOfRecordsMatched"))
|
|
402
|
+
returned_count = int(search_results.get_attribute_value("numberOfRecordsReturned"))
|
|
386
403
|
|
|
387
404
|
# Break conditions copied gratefully from
|
|
388
405
|
# noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
|
|
@@ -423,9 +440,13 @@ class CswIso19139DcatBackend(CswDcatBackend):
|
|
|
423
440
|
def __init__(self, *args, **kwargs):
|
|
424
441
|
super().__init__(*args, **kwargs)
|
|
425
442
|
xslt_url = current_app.config["HARVEST_ISO19139_XSLT_URL"]
|
|
426
|
-
|
|
427
|
-
|
|
443
|
+
xslt_text = self.get(xslt_url).text
|
|
444
|
+
xslt_proc = self.saxon_proc.new_xslt30_processor()
|
|
445
|
+
self.xslt_exec = xslt_proc.compile_stylesheet(stylesheet_text=xslt_text)
|
|
446
|
+
self.xslt_exec.set_parameter(
|
|
447
|
+
"CoupledResourceLookUp", self.saxon_proc.make_string_value("disabled")
|
|
448
|
+
)
|
|
428
449
|
|
|
429
450
|
@override
|
|
430
|
-
def as_dcat(self, tree:
|
|
431
|
-
return self.
|
|
451
|
+
def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
|
|
452
|
+
return self.xslt_exec.transform_to_value(xdm_node=tree).head
|
|
@@ -881,6 +881,95 @@ class CswDcatBackendTest:
|
|
|
881
881
|
assert "User-Agent" in get_mock.last_request.headers
|
|
882
882
|
assert get_mock.last_request.headers["User-Agent"] == "uData/0.1 csw-dcat"
|
|
883
883
|
|
|
884
|
+
def test_csw_error(self, rmock):
|
|
885
|
+
exception_report = """<?xml version="1.0" encoding="UTF-8"?>
|
|
886
|
+
<ows:ExceptionReport xmlns:ows="http://www.opengis.net/ows/1.1"
|
|
887
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
888
|
+
xsi:schemaLocation="http://www.opengis.net/ows/1.1 http://schemas.opengis.net/ows/1.1.0/owsExceptionReport.xsd">
|
|
889
|
+
<ows:Exception exceptionCode="MissingParameterValue" locator="request">
|
|
890
|
+
<ows:ExceptionText>Mandatory parameter &lt;request&gt; was not specified</ows:ExceptionText>
|
|
891
|
+
</ows:Exception>
|
|
892
|
+
</ows:ExceptionReport>
|
|
893
|
+
"""
|
|
894
|
+
rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
|
|
895
|
+
rmock.post(rmock.ANY, text=exception_report)
|
|
896
|
+
source = HarvestSourceFactory(backend="csw-dcat")
|
|
897
|
+
|
|
898
|
+
actions.run(source)
|
|
899
|
+
|
|
900
|
+
source.reload()
|
|
901
|
+
job = source.get_last_job()
|
|
902
|
+
|
|
903
|
+
assert len(job.errors) == 1
|
|
904
|
+
assert "Failed to query CSW" in job.errors[0].message
|
|
905
|
+
assert job.status == "failed"
|
|
906
|
+
|
|
907
|
+
def test_disallow_external_entities(self, rmock):
|
|
908
|
+
xml = """<?xml version="1.0" encoding="UTF-8"?>
|
|
909
|
+
<!DOCTYPE root [
|
|
910
|
+
<!ENTITY entity SYSTEM "data:text/plain,EXTERNAL">
|
|
911
|
+
]>
|
|
912
|
+
<csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
|
|
913
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
914
|
+
xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
|
|
915
|
+
<csw:SearchStatus timestamp="2023-03-03T16:09:50.697645Z" />
|
|
916
|
+
<csw:SearchResults numberOfRecordsMatched="1" numberOfRecordsReturned="1" elementSet="full" nextRecord="0">
|
|
917
|
+
<rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
|
918
|
+
<rdf:Description rdf:about="https://example.com/test/">
|
|
919
|
+
<dct:identifier>https://example.com/test/</dct:identifier>
|
|
920
|
+
<rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
|
|
921
|
+
<dct:title>test&entity;</dct:title>
|
|
922
|
+
</rdf:Description>
|
|
923
|
+
</rdf:RDF>
|
|
924
|
+
</csw:SearchResults>
|
|
925
|
+
</csw:GetRecordsResponse>
|
|
926
|
+
"""
|
|
927
|
+
|
|
928
|
+
rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
|
|
929
|
+
rmock.post(rmock.ANY, text=xml)
|
|
930
|
+
source = HarvestSourceFactory(backend="csw-dcat")
|
|
931
|
+
|
|
932
|
+
actions.run(source)
|
|
933
|
+
|
|
934
|
+
source.reload()
|
|
935
|
+
job = source.get_last_job()
|
|
936
|
+
|
|
937
|
+
assert job.status == "done"
|
|
938
|
+
assert Dataset.objects.first().title == "test"
|
|
939
|
+
|
|
940
|
+
def test_disallow_external_dtd(self, rmock):
|
|
941
|
+
xml = """<?xml version="1.0" encoding="UTF-8"?>
|
|
942
|
+
<!DOCTYPE root SYSTEM "http://www.example.com/evil.dtd">
|
|
943
|
+
<csw:GetRecordsResponse xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
|
|
944
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
945
|
+
xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
|
|
946
|
+
<csw:SearchStatus timestamp="2023-03-03T16:09:50.697645Z" />
|
|
947
|
+
<csw:SearchResults numberOfRecordsMatched="1" numberOfRecordsReturned="1" elementSet="full" nextRecord="0">
|
|
948
|
+
<rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
|
949
|
+
<rdf:Description rdf:about="https://example.com/test/">
|
|
950
|
+
<dct:identifier>https://example.com/test/</dct:identifier>
|
|
951
|
+
<rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
|
|
952
|
+
<dct:title>test</dct:title>
|
|
953
|
+
</rdf:Description>
|
|
954
|
+
</rdf:RDF>
|
|
955
|
+
</csw:SearchResults>
|
|
956
|
+
</csw:GetRecordsResponse>
|
|
957
|
+
"""
|
|
958
|
+
|
|
959
|
+
rmock.get("http://www.example.com/evil.dtd", status_code=404)
|
|
960
|
+
rmock.head(rmock.ANY, headers={"Content-Type": "application/xml"})
|
|
961
|
+
rmock.post(rmock.ANY, text=xml)
|
|
962
|
+
source = HarvestSourceFactory(backend="csw-dcat")
|
|
963
|
+
|
|
964
|
+
actions.run(source)
|
|
965
|
+
|
|
966
|
+
source.reload()
|
|
967
|
+
job = source.get_last_job()
|
|
968
|
+
|
|
969
|
+
assert not any(h.method == "GET" for h in rmock.request_history)
|
|
970
|
+
assert job.status == "done"
|
|
971
|
+
assert len(job.items) == 1
|
|
972
|
+
|
|
884
973
|
|
|
885
974
|
@pytest.mark.usefixtures("clean_db")
|
|
886
975
|
@pytest.mark.options(PLUGINS=["csw"])
|