udata 10.2.1.dev34683__py2.py3-none-any.whl → 10.2.1.dev34728__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of udata might be problematic. Click here for more details.
- udata/core/dataset/rdf.py +6 -0
- udata/harvest/backends/base.py +1 -1
- udata/harvest/backends/dcat.py +48 -6
- udata/harvest/tests/dcat/catalog.xml +15 -2
- udata/harvest/tests/factories.py +1 -1
- udata/harvest/tests/test_base_backend.py +2 -2
- udata/harvest/tests/test_dcat_backend.py +21 -2
- udata/static/chunks/{11.0f04e49a40a0a381bcce.js → 11.b6f741fcc366abfad9c4.js} +3 -3
- udata/static/chunks/{11.0f04e49a40a0a381bcce.js.map → 11.b6f741fcc366abfad9c4.js.map} +1 -1
- udata/static/chunks/{13.f29411b06be1883356a3.js → 13.2d06442dd9a05d9777b5.js} +2 -2
- udata/static/chunks/{13.f29411b06be1883356a3.js.map → 13.2d06442dd9a05d9777b5.js.map} +1 -1
- udata/static/chunks/{17.3bd0340930d4a314ce9c.js → 17.e8e4caaad5cb0cc0bacc.js} +2 -2
- udata/static/chunks/{17.3bd0340930d4a314ce9c.js.map → 17.e8e4caaad5cb0cc0bacc.js.map} +1 -1
- udata/static/chunks/{19.0586efa786ebf09fb288.js → 19.f03a102365af4315f9db.js} +3 -3
- udata/static/chunks/{19.0586efa786ebf09fb288.js.map → 19.f03a102365af4315f9db.js.map} +1 -1
- udata/static/chunks/{8.b966402f5d680d4bdf4a.js → 8.778091d55cd8ea39af6b.js} +2 -2
- udata/static/chunks/{8.b966402f5d680d4bdf4a.js.map → 8.778091d55cd8ea39af6b.js.map} +1 -1
- udata/static/common.js +1 -1
- udata/static/common.js.map +1 -1
- {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/METADATA +3 -2
- {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/RECORD +25 -25
- {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/LICENSE +0 -0
- {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/WHEEL +0 -0
- {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/entry_points.txt +0 -0
- {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/top_level.txt +0 -0
udata/core/dataset/rdf.py
CHANGED
|
@@ -722,6 +722,10 @@ def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False):
|
|
|
722
722
|
resource.checksum.type = algorithm
|
|
723
723
|
if is_additionnal:
|
|
724
724
|
resource.type = "other"
|
|
725
|
+
elif distrib.value(DCAT.accessService):
|
|
726
|
+
# The distribution has a DCAT.accessService property, we deduce
|
|
727
|
+
# that the distribution is of type API
|
|
728
|
+
resource.type = "api"
|
|
725
729
|
|
|
726
730
|
identifier = rdf_value(distrib, DCT.identifier)
|
|
727
731
|
uri = distrib.identifier.toPython() if isinstance(distrib.identifier, URIRef) else None
|
|
@@ -751,6 +755,8 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None, remote_url_prefix: s
|
|
|
751
755
|
|
|
752
756
|
dataset.title = rdf_value(d, DCT.title)
|
|
753
757
|
if not dataset.title:
|
|
758
|
+
# If the dataset is externally defined (so without title and just with a link to the dataset XML)
|
|
759
|
+
# we should have skipped it way before in :ExcludeExternalyDefinedDataset
|
|
754
760
|
raise HarvestSkipException("missing title on dataset")
|
|
755
761
|
|
|
756
762
|
# Support dct:abstract if dct:description is missing (sometimes used instead)
|
udata/harvest/backends/base.py
CHANGED
|
@@ -256,7 +256,7 @@ class BaseBackend(object):
|
|
|
256
256
|
]
|
|
257
257
|
self.save_job()
|
|
258
258
|
|
|
259
|
-
def
|
|
259
|
+
def has_reached_max_items(self) -> bool:
|
|
260
260
|
"""Should be called after process_dataset to know if we reach the max items"""
|
|
261
261
|
return self.max_items and len(self.job.items) >= self.max_items
|
|
262
262
|
|
udata/harvest/backends/dcat.py
CHANGED
|
@@ -9,7 +9,7 @@ from rdflib.namespace import RDF
|
|
|
9
9
|
|
|
10
10
|
from udata.core.dataservices.rdf import dataservice_from_rdf
|
|
11
11
|
from udata.core.dataset.rdf import dataset_from_rdf
|
|
12
|
-
from udata.harvest.models import HarvestItem
|
|
12
|
+
from udata.harvest.models import HarvestError, HarvestItem
|
|
13
13
|
from udata.i18n import gettext as _
|
|
14
14
|
from udata.rdf import (
|
|
15
15
|
DCAT,
|
|
@@ -18,6 +18,7 @@ from udata.rdf import (
|
|
|
18
18
|
SPDX,
|
|
19
19
|
guess_format,
|
|
20
20
|
namespace_manager,
|
|
21
|
+
rdf_value,
|
|
21
22
|
url_from_rdf,
|
|
22
23
|
)
|
|
23
24
|
from udata.storage.s3 import store_as_json
|
|
@@ -77,9 +78,19 @@ class DcatBackend(BaseBackend):
|
|
|
77
78
|
self.process_one_datasets_page(page_number, page)
|
|
78
79
|
serialized_graphs.append(page.serialize(format=fmt, indent=None))
|
|
79
80
|
|
|
81
|
+
# We do a second pass to have all datasets in memory and attach datasets
|
|
82
|
+
# to dataservices. It could be better to be one pass of graph walking and
|
|
83
|
+
# then one pass of attaching datasets to dataservices.
|
|
80
84
|
for page_number, page in self.walk_graph(self.source.url, fmt):
|
|
81
85
|
self.process_one_dataservices_page(page_number, page)
|
|
82
86
|
|
|
87
|
+
if not self.dryrun and self.has_reached_max_items():
|
|
88
|
+
# We have reached the max_items limit. Warn the user that all the datasets may not be present.
|
|
89
|
+
error = HarvestError(
|
|
90
|
+
message=f"{self.max_items} max items reached, not all datasets/dataservices were retrieved"
|
|
91
|
+
)
|
|
92
|
+
self.job.errors.append(error)
|
|
93
|
+
|
|
83
94
|
# The official MongoDB document size limit is 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000).
|
|
84
95
|
max_harvest_graph_size_in_mongo = current_app.config.get(
|
|
85
96
|
"HARVEST_MAX_CATALOG_SIZE_IN_MONGO"
|
|
@@ -146,7 +157,7 @@ class DcatBackend(BaseBackend):
|
|
|
146
157
|
break
|
|
147
158
|
|
|
148
159
|
yield page_number, subgraph
|
|
149
|
-
if self.
|
|
160
|
+
if self.has_reached_max_items():
|
|
150
161
|
return
|
|
151
162
|
|
|
152
163
|
page_number += 1
|
|
@@ -154,17 +165,48 @@ class DcatBackend(BaseBackend):
|
|
|
154
165
|
def process_one_datasets_page(self, page_number: int, page: Graph):
|
|
155
166
|
for node in page.subjects(RDF.type, DCAT.Dataset):
|
|
156
167
|
remote_id = page.value(node, DCT.identifier)
|
|
168
|
+
if self.is_dataset_external_to_this_page(page, node):
|
|
169
|
+
continue
|
|
170
|
+
|
|
157
171
|
self.process_dataset(remote_id, page_number=page_number, page=page, node=node)
|
|
158
172
|
|
|
159
|
-
if self.
|
|
173
|
+
if self.has_reached_max_items():
|
|
160
174
|
return
|
|
161
175
|
|
|
176
|
+
def is_dataset_external_to_this_page(self, page: Graph, node) -> bool:
|
|
177
|
+
# In dataservice nodes we have `servesDataset` or `hasPart` that can contain nodes
|
|
178
|
+
# with type=dataset. We don't want to process them because these nodes are empty (they
|
|
179
|
+
# only contain a link to the dataset definition).
|
|
180
|
+
# These datasets are either present in the catalog in previous or next pages or
|
|
181
|
+
# external from the catalog we are currently harvesting (so we don't want to harvest them).
|
|
182
|
+
# First we thought of skipping them inside `dataset_from_rdf` (see :ExcludeExternalyDefinedDataset)
|
|
183
|
+
# but it creates a lot of "fake" items in the job and raises problems (reaching the max harvest item for
|
|
184
|
+
# example and not getting to the "real" datasets/dataservices in subsequent pages)
|
|
185
|
+
# So to prevent creating a lot of useless items in the job we first thought about checking to see if there is no title and
|
|
186
|
+
# if `isPrimaryTopicOf` is present. But it may be better to check if the only link of the node with the current page is a
|
|
187
|
+
# `servesDataset` or `hasPart`. If it's the case, the node is only present in a dataservice. (maybe we could also check that
|
|
188
|
+
# the `_other_node` is a dataservice?)
|
|
189
|
+
# `isPrimaryTopicOf` is the tag present in the first harvester raising the problem, there may exist other
|
|
190
|
+
# values of the same sort we need to check here.
|
|
191
|
+
|
|
192
|
+
# This is not dangerous because we check for missing title in `dataset_from_rdf` later so we would have skipped
|
|
193
|
+
# this dataset anyway.
|
|
194
|
+
resource = page.resource(node)
|
|
195
|
+
title = rdf_value(resource, DCT.title)
|
|
196
|
+
if title:
|
|
197
|
+
return False
|
|
198
|
+
|
|
199
|
+
predicates = [link_type for (_other_node, link_type) in page.subject_predicates(node)]
|
|
200
|
+
return len(predicates) == 1 and (
|
|
201
|
+
predicates[0] == DCAT.servesDataset or predicates[0] == DCT.hasPart
|
|
202
|
+
)
|
|
203
|
+
|
|
162
204
|
def process_one_dataservices_page(self, page_number: int, page: Graph):
|
|
163
205
|
for node in page.subjects(RDF.type, DCAT.DataService):
|
|
164
206
|
remote_id = page.value(node, DCT.identifier)
|
|
165
207
|
self.process_dataservice(remote_id, page_number=page_number, page=page, node=node)
|
|
166
208
|
|
|
167
|
-
if self.
|
|
209
|
+
if self.has_reached_max_items():
|
|
168
210
|
return
|
|
169
211
|
|
|
170
212
|
def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
|
|
@@ -266,7 +308,7 @@ class CswDcatBackend(DcatBackend):
|
|
|
266
308
|
subgraph.parse(data=ET.tostring(child), format=fmt)
|
|
267
309
|
|
|
268
310
|
yield page_number, subgraph
|
|
269
|
-
if self.
|
|
311
|
+
if self.has_reached_max_items():
|
|
270
312
|
return
|
|
271
313
|
|
|
272
314
|
next_record = self.next_record_if_should_continue(start, search_results)
|
|
@@ -375,7 +417,7 @@ class CswIso19139DcatBackend(DcatBackend):
|
|
|
375
417
|
raise ValueError("Failed to fetch CSW content")
|
|
376
418
|
|
|
377
419
|
yield page_number, subgraph
|
|
378
|
-
if self.
|
|
420
|
+
if self.has_reached_max_items():
|
|
379
421
|
return
|
|
380
422
|
|
|
381
423
|
next_record = self.next_record_if_should_continue(start, search_results)
|
|
@@ -61,16 +61,17 @@
|
|
|
61
61
|
<dcat:theme>Theme 1</dcat:theme>
|
|
62
62
|
<dcterms:publisher rdf:resource="http://data.test.org/organizations/1"/>
|
|
63
63
|
<owl:versionInfo>1.0</owl:versionInfo>
|
|
64
|
-
<dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/2"/>
|
|
65
64
|
<dcat:keyword>Tag 4</dcat:keyword>
|
|
66
65
|
<dcterms:spatial rdf:resource="http://wuEurope.com/"/>
|
|
67
66
|
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T19:01:24.184120</dcterms:modified>
|
|
68
67
|
<dcat:keyword>Tag 2</dcat:keyword>
|
|
69
68
|
<dcat:keyword>Tag 1</dcat:keyword>
|
|
70
|
-
<dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/1"/>
|
|
71
69
|
<dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T18:59:02.737480</dcterms:issued>
|
|
72
70
|
<dcterms:identifier>1</dcterms:identifier>
|
|
71
|
+
<dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/1"/>
|
|
72
|
+
<dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/2"/>
|
|
73
73
|
<dcterms:hasPart rdf:resource="http://data.test.org/datasets/1/resources/3"/>
|
|
74
|
+
<dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/4"/>
|
|
74
75
|
</dcat:Dataset>
|
|
75
76
|
</dcat:dataset>
|
|
76
77
|
<dcat:dataset>
|
|
@@ -152,6 +153,18 @@
|
|
|
152
153
|
<dcterms:title>Resource 1-3</dcterms:title>
|
|
153
154
|
<dcterms:format>JSON</dcterms:format>
|
|
154
155
|
</foaf:Document>
|
|
156
|
+
<dcat:Distribution rdf:about="http://data.test.org/datasets/1/resources/4">
|
|
157
|
+
<dcterms:description>A resource pointing towards a Geo Service</dcterms:description>
|
|
158
|
+
<dcterms:title>Resource 1-4</dcterms:title>
|
|
159
|
+
<dcat:accessURL>http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&REQUEST=GetCapabilities&VERSION=1.3.0</dcat:accessURL>
|
|
160
|
+
<dcat:accessService>
|
|
161
|
+
<dcat:DataService>
|
|
162
|
+
<dcterms:title xml:lang="fr">Geo Service</dcterms:title>
|
|
163
|
+
<dcat:endpointURL rdf:resource="http://data.test.org/datasets/1/resources/4/services"/>
|
|
164
|
+
<dcat:endpointDescription rdf:resource="http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&REQUEST=GetCapabilities&VERSION=1.3.0"/>
|
|
165
|
+
</dcat:DataService>
|
|
166
|
+
</dcat:accessService>
|
|
167
|
+
</dcat:Distribution>
|
|
155
168
|
<!-- resources for dataset 2 -->
|
|
156
169
|
<dcat:Distribution rdf:about="http://data.test.org/datasets/2/resources/1">
|
|
157
170
|
<dcat:accessURL>http://data.test.org/datasets/2/resources/1/file.json</dcat:accessURL>
|
udata/harvest/tests/factories.py
CHANGED
|
@@ -61,7 +61,7 @@ class FactoryBackend(backends.BaseBackend):
|
|
|
61
61
|
mock_initialize.send(self)
|
|
62
62
|
for i in range(self.config.get("count", DEFAULT_COUNT)):
|
|
63
63
|
self.process_dataset(str(i))
|
|
64
|
-
if self.
|
|
64
|
+
if self.has_reached_max_items():
|
|
65
65
|
return
|
|
66
66
|
|
|
67
67
|
def inner_process_dataset(self, item: HarvestItem):
|
|
@@ -44,12 +44,12 @@ class FakeBackend(BaseBackend):
|
|
|
44
44
|
def inner_harvest(self):
|
|
45
45
|
for remote_id in self.source.config.get("dataset_remote_ids", []):
|
|
46
46
|
self.process_dataset(remote_id)
|
|
47
|
-
if self.
|
|
47
|
+
if self.has_reached_max_items():
|
|
48
48
|
return
|
|
49
49
|
|
|
50
50
|
for remote_id in self.source.config.get("dataservice_remote_ids", []):
|
|
51
51
|
self.process_dataservice(remote_id)
|
|
52
|
-
if self.
|
|
52
|
+
if self.has_reached_max_items():
|
|
53
53
|
return
|
|
54
54
|
|
|
55
55
|
def inner_process_dataset(self, item: HarvestItem):
|
|
@@ -137,6 +137,7 @@ class DcatBackendTest:
|
|
|
137
137
|
assert datasets["1"].resources[0].description == "A JSON resource"
|
|
138
138
|
assert datasets["1"].resources[0].format == "json"
|
|
139
139
|
assert datasets["1"].resources[0].mime == "application/json"
|
|
140
|
+
assert datasets["1"].resources[0].type == "main"
|
|
140
141
|
|
|
141
142
|
@pytest.mark.options(
|
|
142
143
|
SCHEMA_CATALOG_URL="https://example.com/schemas",
|
|
@@ -403,6 +404,7 @@ class DcatBackendTest:
|
|
|
403
404
|
assert len(dataset.resources) == 1
|
|
404
405
|
|
|
405
406
|
resource = dataset.resources[0]
|
|
407
|
+
assert resource.type == "main"
|
|
406
408
|
assert resource.checksum is not None
|
|
407
409
|
assert resource.checksum.type == "sha1"
|
|
408
410
|
assert resource.checksum.value == "fb4106aa286a53be44ec99515f0f0421d4d7ad7d"
|
|
@@ -476,7 +478,12 @@ class DcatBackendTest:
|
|
|
476
478
|
|
|
477
479
|
assert job.status == "done"
|
|
478
480
|
assert job.errors == []
|
|
479
|
-
assert len(job.items) ==
|
|
481
|
+
assert len(job.items) == 5
|
|
482
|
+
# 4 datasets and one Dataservice mentioned but not described
|
|
483
|
+
# because it appears in a distribution as DCAT.accessService
|
|
484
|
+
# but is missing a proper DCT.identifier
|
|
485
|
+
assert len([item for item in job.items if item.status == "done"]) == 4
|
|
486
|
+
assert len([item for item in job.items if item.status == "skipped"]) == 1
|
|
480
487
|
|
|
481
488
|
def test_xml_catalog(self, rmock):
|
|
482
489
|
LicenseFactory(id="lov2", title="Licence Ouverte Version 2.0")
|
|
@@ -524,7 +531,7 @@ class DcatBackendTest:
|
|
|
524
531
|
# test dct:license nested in distribution
|
|
525
532
|
assert dataset.license.id == "lov1"
|
|
526
533
|
|
|
527
|
-
assert len(dataset.resources) ==
|
|
534
|
+
assert len(dataset.resources) == 4
|
|
528
535
|
|
|
529
536
|
resource_1 = next(res for res in dataset.resources if res.title == "Resource 1-1")
|
|
530
537
|
assert resource_1.filetype == "remote"
|
|
@@ -549,6 +556,16 @@ class DcatBackendTest:
|
|
|
549
556
|
assert resource_3.url == "http://data.test.org/datasets/1/resources/3"
|
|
550
557
|
assert resource_3.type == "other"
|
|
551
558
|
|
|
559
|
+
# Make sure a resource with an accessService is of type api
|
|
560
|
+
resource_4 = next(res for res in dataset.resources if res.title == "Resource 1-4")
|
|
561
|
+
assert resource_4.format is None
|
|
562
|
+
assert resource_4.description == "A resource pointing towards a Geo Service"
|
|
563
|
+
assert (
|
|
564
|
+
resource_4.url
|
|
565
|
+
== "http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&REQUEST=GetCapabilities&VERSION=1.3.0"
|
|
566
|
+
)
|
|
567
|
+
assert resource_4.type == "api"
|
|
568
|
+
|
|
552
569
|
# test dct:rights -> license support from dataset
|
|
553
570
|
dataset = Dataset.objects.get(harvest__dct_identifier="2")
|
|
554
571
|
assert dataset.license.id == "lov2"
|
|
@@ -838,6 +855,7 @@ class CswDcatBackendTest:
|
|
|
838
855
|
assert resource.title == "accidento_hdf_L93"
|
|
839
856
|
assert resource.url == "https://www.geo2france.fr/geoserver/cr_hdf/ows"
|
|
840
857
|
assert resource.format == "ogc:wms"
|
|
858
|
+
assert resource.type == "main"
|
|
841
859
|
|
|
842
860
|
def test_user_agent_post(self, rmock):
|
|
843
861
|
url = mock_csw_pagination(rmock, "geonetwork/srv/eng/csw.rdf", "geonetworkv4-page-{}.xml")
|
|
@@ -949,6 +967,7 @@ class CswIso19139DcatBackendTest:
|
|
|
949
967
|
resource.url
|
|
950
968
|
== "http://atom.geo-ide.developpement-durable.gouv.fr/atomArchive/GetResource?id=fr-120066022-ldd-cab63273-b3ae-4e8a-ae1c-6192e45faa94&datasetAggregate=true"
|
|
951
969
|
)
|
|
970
|
+
assert resource.type == "main"
|
|
952
971
|
|
|
953
972
|
# Sadly resource format is parsed as a blank node. Format parsing should be improved.
|
|
954
973
|
assert re.match(r"n[0-9a-f]{32}", resource.format)
|