udata 10.8.2.dev37001__py2.py3-none-any.whl → 10.8.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of udata might be problematic.
- udata/__init__.py +1 -1
- udata/core/activity/models.py +23 -1
- udata/core/dataset/api_fields.py +2 -0
- udata/core/dataset/apiv2.py +4 -0
- udata/core/dataset/constants.py +1 -0
- udata/core/dataset/csv.py +1 -0
- udata/core/dataset/forms.py +6 -0
- udata/core/dataset/metrics.py +34 -0
- udata/core/dataset/models.py +15 -3
- udata/core/dataset/tasks.py +0 -11
- udata/core/metrics/__init__.py +1 -0
- udata/core/metrics/commands.py +3 -0
- udata/core/organization/csv.py +9 -26
- udata/core/organization/metrics.py +2 -0
- udata/core/organization/models.py +14 -9
- udata/core/user/metrics.py +2 -0
- udata/harvest/backends/dcat.py +161 -165
- udata/harvest/tests/ckan/test_ckan_backend.py +1 -1
- udata/harvest/tests/dcat/catalog.xml +1 -0
- udata/harvest/tests/test_dcat_backend.py +19 -6
- udata/migrations/2025-07-30-purge-old-harvest-dynamic-fields.py +29 -0
- udata/settings.py +1 -1
- udata/static/chunks/{13.2d06442dd9a05d9777b5.js → 13.d9c1735d14038b94c17e.js} +2 -2
- udata/static/chunks/{13.2d06442dd9a05d9777b5.js.map → 13.d9c1735d14038b94c17e.js.map} +1 -1
- udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js → 17.81c57c0dedf812e43013.js} +2 -2
- udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js.map → 17.81c57c0dedf812e43013.js.map} +1 -1
- udata/static/chunks/{19.f03a102365af4315f9db.js → 19.8d03c06efcac6884bebe.js} +3 -3
- udata/static/chunks/{19.f03a102365af4315f9db.js.map → 19.8d03c06efcac6884bebe.js.map} +1 -1
- udata/static/chunks/{5.0fa1408dae4e76b87b2e.js → 5.343ca020a2d38cec1a14.js} +3 -3
- udata/static/chunks/{5.0fa1408dae4e76b87b2e.js.map → 5.343ca020a2d38cec1a14.js.map} +1 -1
- udata/static/chunks/{6.d663709d877baa44a71e.js → 6.a3b07de9dd2ca2d24e85.js} +3 -3
- udata/static/chunks/{6.d663709d877baa44a71e.js.map → 6.a3b07de9dd2ca2d24e85.js.map} +1 -1
- udata/static/chunks/{8.778091d55cd8ea39af6b.js → 8.b966402f5d680d4bdf4a.js} +2 -2
- udata/static/chunks/{8.778091d55cd8ea39af6b.js.map → 8.b966402f5d680d4bdf4a.js.map} +1 -1
- udata/static/common.js +1 -1
- udata/static/common.js.map +1 -1
- udata/tests/api/test_datasets_api.py +0 -46
- udata/tests/dataset/test_dataset_model.py +63 -17
- udata/tests/organization/test_csv_adapter.py +3 -15
- udata/tests/reuse/test_reuse_model.py +6 -4
- udata/translations/ar/LC_MESSAGES/udata.mo +0 -0
- udata/translations/ar/LC_MESSAGES/udata.po +62 -54
- udata/translations/de/LC_MESSAGES/udata.mo +0 -0
- udata/translations/de/LC_MESSAGES/udata.po +62 -54
- udata/translations/es/LC_MESSAGES/udata.mo +0 -0
- udata/translations/es/LC_MESSAGES/udata.po +62 -54
- udata/translations/fr/LC_MESSAGES/udata.mo +0 -0
- udata/translations/fr/LC_MESSAGES/udata.po +62 -54
- udata/translations/it/LC_MESSAGES/udata.mo +0 -0
- udata/translations/it/LC_MESSAGES/udata.po +62 -54
- udata/translations/pt/LC_MESSAGES/udata.mo +0 -0
- udata/translations/pt/LC_MESSAGES/udata.po +62 -54
- udata/translations/sr/LC_MESSAGES/udata.mo +0 -0
- udata/translations/sr/LC_MESSAGES/udata.po +62 -54
- udata/translations/udata.pot +63 -56
- udata/utils.py +16 -0
- {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/METADATA +16 -3
- {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/RECORD +62 -61
- udata/harvest/backends/ckan/models.py +0 -10
- {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/LICENSE +0 -0
- {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/WHEEL +0 -0
- {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/entry_points.txt +0 -0
- {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/top_level.txt +0 -0
udata/harvest/backends/dcat.py
CHANGED

@@ -1,11 +1,12 @@
 import logging
 from datetime import date
-from typing import Generator
+from typing import ClassVar, Generator

 import lxml.etree as ET
 from flask import current_app
 from rdflib import Graph
 from rdflib.namespace import RDF
+from typing_extensions import override

 from udata.core.dataservices.rdf import dataservice_from_rdf
 from udata.core.dataset.rdf import dataset_from_rdf
@@ -55,9 +56,6 @@ URIS_TO_REPLACE = {
 }


-SAFE_PARSER = ET.XMLParser(resolve_entities=False)
-
-
 def extract_graph(source, target, node, specs):
     for p, o in source.predicate_objects(node):
         target.add((node, p, o))
@@ -68,20 +66,28 @@ def extract_graph(source, target, node, specs):
 class DcatBackend(BaseBackend):
     display_name = "DCAT"

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.organizations_to_update = set()
+
     def inner_harvest(self):
         fmt = self.get_format()
         self.job.data = {"format": fmt}

-
+        pages = []

         for page_number, page in self.walk_graph(self.source.url, fmt):
             self.process_one_datasets_page(page_number, page)
-
+            pages.append((page_number, page))
+
+        for org in self.organizations_to_update:
+            org.compute_aggregate_metrics = True
+            org.count_datasets()

         # We do a second pass to have all datasets in memory and attach datasets
         # to dataservices. It could be better to be one pass of graph walking and
         # then one pass of attaching datasets to dataservices.
-        for page_number, page in self.walk_graph(self.source.url, fmt):
+        for page_number, page in pages:
             self.process_one_dataservices_page(page_number, page)

         if not self.dryrun and self.has_reached_max_items():
@@ -100,6 +106,8 @@ class DcatBackend(BaseBackend):

         bucket = current_app.config.get("HARVEST_GRAPHS_S3_BUCKET")

+        serialized_graphs = [p.serialize(format=fmt, indent=None) for _, p in pages]
+
         if (
             bucket is not None
             and sum([len(g.encode("utf-8")) for g in serialized_graphs])
@@ -202,7 +210,10 @@ class DcatBackend(BaseBackend):
         )

     def process_one_dataservices_page(self, page_number: int, page: Graph):
+        access_services = {o for _, _, o in page.triples((None, DCAT.accessService, None))}
         for node in page.subjects(RDF.type, DCAT.DataService):
+            if node in access_services:
+                continue
             remote_id = page.value(node, DCT.identifier)
             self.process_dataservice(remote_id, page_number=page_number, page=page, node=node)

@@ -214,7 +225,11 @@ class DcatBackend(BaseBackend):

         dataset = self.get_dataset(item.remote_id)
         remote_url_prefix = self.get_extra_config_value("remote_url_prefix")
-        return dataset_from_rdf(page, dataset, node=node, remote_url_prefix=remote_url_prefix)
+        dataset = dataset_from_rdf(page, dataset, node=node, remote_url_prefix=remote_url_prefix)
+        if dataset.organization:
+            dataset.organization.compute_aggregate_metrics = False
+            self.organizations_to_update.add(dataset.organization)
+        return dataset

     def inner_process_dataservice(self, item: HarvestItem, page_number: int, page: Graph, node):
         item.kwargs["page_number"] = page_number
@@ -235,104 +250,165 @@ class DcatBackend(BaseBackend):
             return node
         raise ValueError(f"Unable to find dataset with DCT.identifier:{item.remote_id}")

-    def next_record_if_should_continue(self, start, search_results):
-        next_record = int(search_results.attrib["nextRecord"])
-        matched_count = int(search_results.attrib["numberOfRecordsMatched"])
-        returned_count = int(search_results.attrib["numberOfRecordsReturned"])

-        # Break conditions copied gratefully from
-        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
-        break_conditions = (
-            # A value of 0 means all records have been returned (standard CSW)
-            next_record == 0,
-            # Misbehaving CSW server returning a next record > matched count
-            next_record > matched_count,
-            # No results returned already
-            returned_count == 0,
-            # Current next record is lower than previous one
-            next_record < start,
-            # Enough items have been harvested already
-            self.max_items and len(self.job.items) >= self.max_items,
-        )
+class CswDcatBackend(DcatBackend):
+    """
+    CSW harvester fetching records as DCAT.
+    The parsing of items is then the same as for the DcatBackend.
+    """

-        if any(break_conditions):
-            return None
-        else:
-            return next_record
+    display_name = "CSW-DCAT"

+    # CSW_REQUEST is based on:
+    # - Request syntax from spec [1] and example requests [1] [2].
+    # - Sort settings to ensure stable paging [3].
+    # - Filter settings to only retrieve record types currently mapped in udata.
+    #
+    # If you modify the request, make sure:
+    # - `typeNames` and `outputSchema` are consistent. You'll likely want to keep "gmd:MD_Metadata",
+    #   since "csw:Record" contains less information.
+    # - `typeNames` and namespaces in `csw:Query` (`Filter`, `SortBy`, ...) are consistent, although
+    #   they are ignored on some servers [4] [5].
+    # - It works on real catalogs! Not many servers implement the whole spec.
+    #
+    # References:
+    # [1] OpenGIS Catalogue Services Specification 2.0.2 – ISO Metadata Application Profile: Corrigendum
+    #     https://portal.ogc.org/files/80534
+    # [2] GeoNetwork - CSW test requests
+    #     https://github.com/geonetwork/core-geonetwork/tree/3.10.4/web/src/main/webapp/xml/csw/test
+    # [3] Udata - Support csw dcat harvest
+    #     https://github.com/opendatateam/udata/pull/2800#discussion_r1129053500
+    # [4] GeoNetwork - GetRecords ignores namespaces for Filter/SortBy fields
+    #     https://github.com/geonetwork/core-geonetwork/blob/3.10.4/csw-server/src/main/java/org/fao/geonet/kernel/csw/services/getrecords/FieldMapper.java#L92
+    # [5] GeoNetwork - GetRecords ignores `typeNames`
+    #     https://github.com/geonetwork/core-geonetwork/blob/3.10.4/csw-server/src/main/java/org/fao/geonet/kernel/csw/services/getrecords/CatalogSearcher.java#L194
+    CSW_REQUEST: ClassVar[str] = """
+        <csw:GetRecords xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0"
+                        xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+                        xmlns:ogc="http://www.opengis.net/ogc"
+                        service="CSW" version="2.0.2" outputFormat="application/xml"
+                        resultType="results" startPosition="{start}" maxRecords="25"
+                        outputSchema="{output_schema}">
+            <csw:Query typeNames="gmd:MD_Metadata">
+                <csw:ElementSetName>full</csw:ElementSetName>
+                <csw:Constraint version="1.1.0">
+                    <ogc:Filter>
+                        <ogc:Or>
+                            <ogc:PropertyIsEqualTo>
+                                <ogc:PropertyName>apiso:type</ogc:PropertyName>
+                                <ogc:Literal>dataset</ogc:Literal>
+                            </ogc:PropertyIsEqualTo>
+                            <ogc:PropertyIsEqualTo>
+                                <ogc:PropertyName>apiso:type</ogc:PropertyName>
+                                <ogc:Literal>nonGeographicDataset</ogc:Literal>
+                            </ogc:PropertyIsEqualTo>
+                            <ogc:PropertyIsEqualTo>
+                                <ogc:PropertyName>apiso:type</ogc:PropertyName>
+                                <ogc:Literal>series</ogc:Literal>
+                            </ogc:PropertyIsEqualTo>
+                            <ogc:PropertyIsEqualTo>
+                                <ogc:PropertyName>apiso:type</ogc:PropertyName>
+                                <ogc:Literal>service</ogc:Literal>
+                            </ogc:PropertyIsEqualTo>
+                        </ogc:Or>
+                    </ogc:Filter>
+                </csw:Constraint>
+                <ogc:SortBy>
+                    <ogc:SortProperty>
+                        <ogc:PropertyName>apiso:identifier</ogc:PropertyName>
+                        <ogc:SortOrder>ASC</ogc:SortOrder>
+                    </ogc:SortProperty>
+                </ogc:SortBy>
+            </csw:Query>
+        </csw:GetRecords>
+    """

-
-    display_name = "CSW-DCAT"
+    CSW_OUTPUT_SCHEMA = "http://www.w3.org/ns/dcat#"

-    DCAT_SCHEMA = "http://www.w3.org/ns/dcat#"
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.xml_parser = ET.XMLParser(resolve_entities=False)

     def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
         """
         Yield all RDF pages as `Graph` from the source
         """
-        body = """<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
-                                  xmlns:gmd="http://www.isotc211.org/2005/gmd"
-                                  service="CSW" version="2.0.2" resultType="results"
-                                  startPosition="{start}" maxPosition="200"
-                                  outputSchema="{schema}">
-            <csw:Query typeNames="gmd:MD_Metadata">
-                <csw:ElementSetName>full</csw:ElementSetName>
-                <ogc:SortBy xmlns:ogc="http://www.opengis.net/ogc">
-                    <ogc:SortProperty>
-                        <ogc:PropertyName>identifier</ogc:PropertyName>
-                        <ogc:SortOrder>ASC</ogc:SortOrder>
-                    </ogc:SortProperty>
-                </ogc:SortBy>
-            </csw:Query>
-        </csw:GetRecords>"""
-        headers = {"Content-Type": "application/xml"}
-
         page_number = 0
         start = 1

-        response = self.post(
-            url, data=body.format(start=start, schema=self.DCAT_SCHEMA), headers=headers
-        )
-        response.raise_for_status()
-
-        content = response.content
-        tree = ET.fromstring(content, parser=SAFE_PARSER)
-
-        while tree is not None:
+        while True:
+            data = self.CSW_REQUEST.format(output_schema=self.CSW_OUTPUT_SCHEMA, start=start)
+            response = self.post(url, data=data, headers={"Content-Type": "application/xml"})
+            response.raise_for_status()
+
+            content = response.content
+            tree = ET.fromstring(content, parser=self.xml_parser)
+            if tree.tag == "{" + OWS_NAMESPACE + "}ExceptionReport":
+                raise ValueError(f"Failed to query CSW:\n{content}")
+
             search_results = tree.find("csw:SearchResults", {"csw": CSW_NAMESPACE})
-            if search_results is None:
+            if not search_results:
                 log.error(f"No search results found for {url} on page {page_number}")
-                break
-
+                return
+
+            for result in search_results:
                 subgraph = Graph(namespace_manager=namespace_manager)
-            subgraph.parse(data=ET.tostring(tree), format=fmt)
+                doc = ET.tostring(self.as_dcat(result))
+                subgraph.parse(data=doc, format=fmt)
+
+                if not subgraph.subjects(
+                    RDF.type, [DCAT.Dataset, DCAT.DatasetSeries, DCAT.DataService]
+                ):
+                    raise ValueError("Failed to fetch CSW content")

                 yield page_number, subgraph
+
             if self.has_reached_max_items():
                 return

-            next_record = self.next_record_if_should_continue(start, search_results)
-            if not next_record:
-                break
-
-            start = next_record
             page_number += 1
+            start = self.next_position(start, search_results)
+            if not start:
+                return

-            response = self.post(
-                url, data=body.format(start=start, schema=self.DCAT_SCHEMA), headers=headers
-            )
-            response.raise_for_status()
-
-            tree = ET.fromstring(response.content, parser=SAFE_PARSER)
+    def as_dcat(self, tree: ET._Element) -> ET._Element:
+        """
+        Return the input tree as a DCAT tree.
+        For CswDcatBackend, this method return the incoming tree as-is, since it's already DCAT.
+        For subclasses of CswDcatBackend, this method should convert the incoming tree to DCAT.
+        """
+        return tree

+    def next_position(self, start: int, search_results: ET._Element) -> int | None:
+        next_record = int(search_results.attrib["nextRecord"])
+        matched_count = int(search_results.attrib["numberOfRecordsMatched"])
+        returned_count = int(search_results.attrib["numberOfRecordsReturned"])

+        # Break conditions copied gratefully from
+        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+        should_break = (
+            # A value of 0 means all records have been returned (standard CSW)
+            (next_record == 0)
+            # Misbehaving CSW server returning a next record > matched count
+            or (next_record > matched_count)
+            # No results returned already
+            or (returned_count == 0)
+            # Current next record is lower than previous one
+            or (next_record < start)
+            # Enough items have been harvested already
+            or self.has_reached_max_items()
+        )
+        return None if should_break else next_record
+
+
-class CswIso19139DcatBackend(DcatBackend):
+class CswIso19139DcatBackend(CswDcatBackend):
     """
-    …
+    CSW harvester fetching records as ISO-19139 and using XSLT to convert them to DCAT.
     The parsing of items is then the same as for the DcatBackend.
     """

     display_name = "CSW-ISO-19139"
+
     extra_configs = (
         HarvestExtraConfig(
             _("Remote URL prefix"),
@@ -342,94 +418,14 @@ class CswIso19139DcatBackend(DcatBackend):
         ),
     )

-    ISO_SCHEMA = "http://www.isotc211.org/2005/gmd"
-
-    def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
-        """
-        Yield all RDF pages as `Graph` from the source
-
-        Parse CSW graph querying ISO schema.
-        Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
-        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
-        """
-        # Load XSLT
-        xsl_url = current_app.config["HARVEST_ISO19139_XSL_URL"]
-        xsl = ET.fromstring(self.get(xsl_url).content, parser=SAFE_PARSER)
-        transform = ET.XSLT(xsl)
-
-        # Start querying and parsing graph
-        # Filter on dataset or serie records
-        body = """<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
-                                  xmlns:gmd="http://www.isotc211.org/2005/gmd"
-                                  service="CSW" version="2.0.2" resultType="results"
-                                  startPosition="{start}" maxPosition="10"
-                                  outputSchema="{schema}">
-            <csw:Query typeNames="csw:Record">
-                <csw:ElementSetName>full</csw:ElementSetName>
-                <csw:Constraint version="1.1.0">
-                    <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
-                        <ogc:Or xmlns:ogc="http://www.opengis.net/ogc">
-                            <ogc:PropertyIsEqualTo>
-                                <ogc:PropertyName>dc:type</ogc:PropertyName>
-                                <ogc:Literal>dataset</ogc:Literal>
-                            </ogc:PropertyIsEqualTo>
-                            <ogc:PropertyIsEqualTo>
-                                <ogc:PropertyName>dc:type</ogc:PropertyName>
-                                <ogc:Literal>service</ogc:Literal>
-                            </ogc:PropertyIsEqualTo>
-                            <ogc:PropertyIsEqualTo>
-                                <ogc:PropertyName>dc:type</ogc:PropertyName>
-                                <ogc:Literal>series</ogc:Literal>
-                            </ogc:PropertyIsEqualTo>
-                        </ogc:Or>
-                    </ogc:Filter>
-                </csw:Constraint>
-            </csw:Query>
-        </csw:GetRecords>"""
-        headers = {"Content-Type": "application/xml"}
-
-        page_number = 0
-        start = 1
-
-        response = self.post(
-            url, data=body.format(start=start, schema=self.ISO_SCHEMA), headers=headers
-        )
-        response.raise_for_status()
-
-        tree_before_transform = ET.fromstring(response.content, parser=SAFE_PARSER)
-        # Disabling CoupledResourceLookUp to prevent failure on xlink:href
-        # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
-        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
-
-        while tree:
-            # We query the tree before the transformation because the XSLT remove the search results
-            # infos (useful for pagination)
-            search_results = tree_before_transform.find("csw:SearchResults", {"csw": CSW_NAMESPACE})
-            if search_results is None:
-                log.error(f"No search results found for {url} on page {page_number}")
-                break
-
-            subgraph = Graph(namespace_manager=namespace_manager)
-            subgraph.parse(ET.tostring(tree), format=fmt)
-
-            if not subgraph.subjects(RDF.type, DCAT.Dataset):
-                raise ValueError("Failed to fetch CSW content")
+    CSW_OUTPUT_SCHEMA = "http://www.isotc211.org/2005/gmd"

-            yield page_number, subgraph
-
-            if self.has_reached_max_items():
-                return
-
-            next_record = self.next_record_if_should_continue(start, search_results)
-            if not next_record:
-                break
-
-            start = next_record
-            page_number += 1
-
-            response = self.post(
-                url, data=body.format(start=start, schema=self.ISO_SCHEMA), headers=headers
-            )
-            response.raise_for_status()
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        xslt_url = current_app.config["HARVEST_ISO19139_XSLT_URL"]
+        xslt = ET.fromstring(self.get(xslt_url).content, parser=self.xml_parser)
+        self.transform = ET.XSLT(xslt)

-            tree_before_transform = ET.fromstring(response.content, parser=SAFE_PARSER)
-            tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
+    @override
+    def as_dcat(self, tree: ET._Element) -> ET._Element:
+        return self.transform(tree, CoupledResourceLookUp="'disabled'")
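Taken together, the dcat.py changes reduce a CSW flavor to two extension points: the CSW_OUTPUT_SCHEMA class attribute (which schema the server is asked for) and the as_dcat hook (how one harvested record becomes DCAT), while walk_graph owns the request, parsing, and paging loop. A minimal sketch of what a hypothetical additional flavor could look like under that contract; the subclass name, schema URL, and identity XSLT below are illustrative and not part of this release:

    import lxml.etree as ET
    from typing_extensions import override

    from udata.harvest.backends.dcat import CswDcatBackend

    # Hypothetical converter: an identity XSLT standing in for a real mapping.
    MY_FLAVOR_XSLT = ET.XSLT(ET.fromstring(
        '<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">'
        '<xsl:template match="@*|node()">'
        '<xsl:copy><xsl:apply-templates select="@*|node()"/></xsl:copy>'
        '</xsl:template>'
        '</xsl:stylesheet>'
    ))


    class MyFlavorCswBackend(CswDcatBackend):  # hypothetical subclass
        display_name = "CSW-MY-FLAVOR"
        CSW_OUTPUT_SCHEMA = "http://example.org/ns/my-flavor"  # illustrative schema URL

        @override
        def as_dcat(self, tree: ET._Element) -> ET._Element:
            # Convert one harvested record to DCAT; walk_graph then parses the
            # result into a per-record subgraph and validates its RDF types.
            return MY_FLAVOR_XSLT(tree)

The CswIso19139DcatBackend in the diff follows exactly this shape, swapping the identity transform for the SEMIC GeoDCAT-AP stylesheet loaded in its __init__.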
udata/harvest/tests/ckan/test_ckan_backend.py
CHANGED

@@ -460,7 +460,7 @@ def test_skip_no_resources(source, result):
 def test_ckan_url_is_url(data, result):
     dataset = dataset_for(result)
     assert dataset.harvest.remote_url == data["url"]
-    assert …
+    assert dataset.harvest.ckan_source is None


 @pytest.mark.ckan_data("ckan_url_is_a_string")
udata/harvest/tests/dcat/catalog.xml
CHANGED

@@ -159,6 +159,7 @@
                 <dcat:accessURL>http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&REQUEST=GetCapabilities&VERSION=1.3.0</dcat:accessURL>
                 <dcat:accessService>
                     <dcat:DataService>
+                        <rdf:type rdf:resource="http://www.w3.org/ns/dcat#DataService"/>
                         <dcterms:title xml:lang="fr">Geo Service</dcterms:title>
                         <dcat:endpointURL rdf:resource="http://data.test.org/datasets/1/resources/4/services"/>
                         <dcat:endpointDescription rdf:resource="http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&REQUEST=GetCapabilities&VERSION=1.3.0"/>
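The added rdf:type makes the inline service an explicit dcat:DataService once the fixture is parsed as RDF, which is the type the new skip logic in process_one_dataservices_page keys on. An illustrative rdflib sketch of that interaction, where CATALOG_XML is a placeholder for the fixture content:

    from rdflib import Graph, Namespace
    from rdflib.namespace import RDF

    DCAT = Namespace("http://www.w3.org/ns/dcat#")

    page = Graph()
    page.parse(data=CATALOG_XML, format="xml")  # CATALOG_XML: placeholder for the fixture

    # Services referenced from a distribution via dcat:accessService...
    access_services = {o for _, _, o in page.triples((None, DCAT.accessService, None))}
    # ...are excluded from the standalone dataservice pass, mirroring the backend.
    standalone = [
        node
        for node in page.subjects(RDF.type, DCAT.DataService)
        if node not in access_services
    ]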
udata/harvest/tests/test_dcat_backend.py
CHANGED

@@ -187,6 +187,23 @@ class DcatBackendTest:
             == "https://data.paris2024.org/api/explore/v2.1/console"
         )

+    def test_harvest_dataservices_ignore_accessservices(self, rmock):
+        rmock.get("https://example.com/schemas", json=ResourceSchemaMockData.get_mock_data())
+
+        url = mock_dcat(rmock, "catalog.xml")
+        org = OrganizationFactory()
+        source = HarvestSourceFactory(backend="dcat", url=url, organization=org)
+
+        actions.run(source)
+
+        source.reload()
+
+        job = source.get_last_job()
+        assert len(job.items) == 4
+
+        dataservices = Dataservice.objects
+        assert len(dataservices) == 0
+
     def test_harvest_literal_spatial(self, rmock):
         url = mock_dcat(rmock, "evian.json")
         org = OrganizationFactory()
@@ -478,12 +495,8 @@ class DcatBackendTest:

         assert job.status == "done"
         assert job.errors == []
-        assert len(job.items) == 5
-        # 4 datasets and one Dataservice mentionned but not described
-        # because it appears in a distribution as DCAT.accessService
-        # but is missing a proper DCT.identifier
+        assert len(job.items) == 4
         assert len([item for item in job.items if item.status == "done"]) == 4
-        assert len([item for item in job.items if item.status == "skipped"]) == 1

     def test_xml_catalog(self, rmock):
         LicenseFactory(id="lov2", title="Licence Ouverte Version 2.0")
@@ -886,7 +899,7 @@ class CswIso19139DcatBackendTest:
         with open(os.path.join(CSW_DCAT_FILES_DIR, "XSLT.xml"), "r") as f:
             xslt = f.read()
         url = mock_csw_pagination(rmock, "geonetwork/srv/eng/csw.rdf", "geonetwork-iso-page-{}.xml")
-        rmock.get(current_app.config.get("HARVEST_ISO19139_XSL_URL"), text=xslt)
+        rmock.get(current_app.config.get("HARVEST_ISO19139_XSLT_URL"), text=xslt)
         org = OrganizationFactory()
         source = HarvestSourceFactory(
             backend="csw-iso-19139",
udata/migrations/2025-07-30-purge-old-harvest-dynamic-fields.py
ADDED

@@ -0,0 +1,29 @@
+"""
+This migration removes legacy harvest dynamic fields
+"""
+
+import logging
+
+from mongoengine.connection import get_db
+
+log = logging.getLogger(__name__)
+
+
+def migrate(db):
+    # Remove legacy fields (`ods_has_records`, `ods_url`, ...) from old harvested datasets and resources
+    dataset_legacy_fields = ["ods_has_records", "ods_url", "ods_geo"]
+    for field in dataset_legacy_fields:
+        result = get_db().dataset.update_many({}, {"$unset": {f"harvest.{field}": 1}})
+        log.info(
+            f"Harvest Dataset dynamic legacy fields ({field}) removed from {result.modified_count} objects"
+        )
+
+    resource_legacy_fields = ["ods_type"]
+    for field in resource_legacy_fields:
+        result = get_db().dataset.update_many(
+            {"resources": {"$exists": True, "$type": "array"}},
+            {"$unset": {f"resources.$[].harvest.{field}": 1}},
+        )
+        log.info(
+            f"Harvest Resource dynamic legacy fields ({field}) removed from {result.modified_count} objects"
+        )
udata/settings.py
CHANGED
@@ -283,7 +283,7 @@ class Defaults(object):
     HARVEST_GRAPHS_S3_BUCKET = None  # If the catalog is bigger than `HARVEST_MAX_CATALOG_SIZE_IN_MONGO` store the graph inside S3 instead of MongoDB
     HARVEST_GRAPHS_S3_FILENAME_PREFIX = ""  # Useful to store the graphs inside a subfolder of the bucket. For example by setting `HARVEST_GRAPHS_S3_FILENAME_PREFIX = 'graphs/'`

-    HARVEST_ISO19139_XSL_URL = …
+    HARVEST_ISO19139_XSLT_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/refs/heads/geodcat-ap-2.0.0/iso-19139-to-dcat-ap.xsl"

     # S3 connection details
     S3_URL = None
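The new HARVEST_ISO19139_XSLT_URL default points at a branch head (refs/heads/geodcat-ap-2.0.0), so the fetched stylesheet can change without a udata release. A deployment that wants reproducible harvests could pin the setting in its local configuration; a hypothetical override where the commit SHA is a placeholder:

    # udata.cfg (local settings) -- hypothetical override pinning the XSLT to an
    # immutable ref instead of the default branch head.
    HARVEST_ISO19139_XSLT_URL = (
        "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/"
        "<commit-sha>/iso-19139-to-dcat-ap.xsl"  # replace with a real commit SHA
    )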
|