udata 7.0.6.dev28263__py2.py3-none-any.whl → 7.0.6.dev28300__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of udata might be problematic.
- udata/commands/dcat.py +7 -4
- udata/harvest/backends/dcat.py +128 -24
- udata/harvest/tests/csw_dcat/XSLT.xml +4298 -0
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-1.xml +1291 -0
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-3.xml +1139 -0
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-5.xml +1266 -0
- udata/harvest/tests/test_dcat_backend.py +63 -5
- udata/rdf.py +1 -0
- {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28300.dist-info}/METADATA +2 -1
- {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28300.dist-info}/RECORD +14 -10
- {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28300.dist-info}/entry_points.txt +1 -0
- {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28300.dist-info}/LICENSE +0 -0
- {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28300.dist-info}/WHEEL +0 -0
- {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28300.dist-info}/top_level.txt +0 -0
udata/commands/dcat.py
CHANGED
@@ -8,7 +8,7 @@ from rdflib import Graph
 from udata.commands import cli, green, yellow, cyan, echo, magenta
 from udata.core.dataset.factories import DatasetFactory
 from udata.core.dataset.rdf import dataset_from_rdf
-from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
+from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
 from udata.rdf import namespace_manager

 log = logging.getLogger(__name__)
@@ -23,9 +23,10 @@ def grp():
 @grp.command()
 @click.argument('url')
 @click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
-@click.option('-
-@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
-def parse_url(url, csw, quiet=False, rid=''):
+@click.option('-r', '--rid', help='Inspect specific remote id (contains)')
+@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
+@click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
+def parse_url(url, csw, iso, quiet=False, rid=''):
     '''Parse the datasets in a DCAT format located at URL (debug)'''
     if quiet:
         verbose_loggers = ['rdflib', 'udata.core.dataset']
@@ -49,6 +50,8 @@ def parse_url(url, csw, quiet=False, rid=''):
     source.url = url
     if csw:
         backend = CswDcatBackend(source, dryrun=True)
+    elif iso:
+        backend = CswIso19139DcatBackend(source, dryrun=True)
     else:
         backend = DcatBackend(source, dryrun=True)
     backend.job = MockJob()
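
Note: the change above only adds routing for the new flag. As a hedged illustration (not code from the package), the sketch below reproduces the flag-to-backend dispatch with click, using stand-in classes in place of udata's real backends.

import click

# Stand-ins for udata.harvest.backends.dcat.{DcatBackend, CswDcatBackend, CswIso19139DcatBackend}
class DcatBackend: ...
class CswDcatBackend: ...
class CswIso19139DcatBackend: ...

@click.command()
@click.argument('url')
@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
@click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
def parse_url(url, csw, iso):
    # Same dispatch order as the diff: --csw wins, then --iso, plain DCAT otherwise.
    if csw:
        backend_cls = CswDcatBackend
    elif iso:
        backend_cls = CswIso19139DcatBackend
    else:
        backend_cls = DcatBackend
    click.echo(f'Would parse {url} with {backend_cls.__name__}')

if __name__ == '__main__':
    parse_url()
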
udata/harvest/backends/dcat.py
CHANGED
@@ -2,7 +2,7 @@ import logging

 from rdflib import Graph, URIRef
 from rdflib.namespace import RDF
-import
+import lxml.etree as ET
 import boto3
 from flask import current_app
 from datetime import date
@@ -173,7 +173,36 @@ class DcatBackend(BaseBackend):
             dataset = self.get_dataset(item.remote_id)
             dataset = dataset_from_rdf(graph, dataset, node=node)
             return dataset
+

+    def next_record_if_should_continue(self, start, search_results):
+        next_record = int(search_results.attrib['nextRecord'])
+        matched_count = int(search_results.attrib['numberOfRecordsMatched'])
+        returned_count = int(search_results.attrib['numberOfRecordsReturned'])
+
+        # Break conditions copied gratefully from
+        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+        break_conditions = (
+            # standard CSW: A value of 0 means all records have been returned.
+            next_record == 0,
+
+            # Misbehaving CSW server returning a next record > matched count
+            next_record > matched_count,
+
+            # No results returned already
+            returned_count == 0,
+
+            # Current next record is lower than previous one
+            next_record < start,
+
+            # Enough items have been harvested already
+            self.max_items and len(self.job.items) >= self.max_items
+        )
+
+        if any(break_conditions):
+            return None
+        else:
+            return next_record

 class CswDcatBackend(DcatBackend):
     display_name = 'CSW-DCAT'
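
The next_record_if_should_continue() helper added above drives CSW pagination from the attributes of the csw:SearchResults element. Below is a minimal standalone sketch (illustrative only, not package code) of the same checks against a hand-written element; the attribute names match the diff, the sample values are made up.

import lxml.etree as ET

# A fake csw:SearchResults element with the three attributes the helper reads.
search_results = ET.fromstring(
    '<SearchResults nextRecord="0" numberOfRecordsMatched="42" numberOfRecordsReturned="10"/>'
)

next_record = int(search_results.attrib['nextRecord'])
matched_count = int(search_results.attrib['numberOfRecordsMatched'])
returned_count = int(search_results.attrib['numberOfRecordsReturned'])

start = 1  # position requested for the current page
should_stop = any((
    next_record == 0,             # standard CSW: all records have been returned
    next_record > matched_count,  # misbehaving server paging past the match count
    returned_count == 0,          # empty page
    next_record < start,          # pagination going backwards
))
print(should_stop)  # True, because nextRecord="0" ends the loop
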
@@ -201,17 +230,18 @@ class CswDcatBackend(DcatBackend):
         graphs = []
         page = 0
         start = 1
+
         response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
                              headers=headers)
         response.raise_for_status()
-        content = response.
+        content = response.content
         tree = ET.fromstring(content)
         if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
             raise ValueError(f'Failed to query CSW:\n{content}')
         while tree:
             graph = Graph(namespace_manager=namespace_manager)
             search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
-            if
+            if search_results is None:
                 log.error(f'No search results found for {url} on page {page}')
                 break
             for child in search_results:
@@ -225,37 +255,111 @@ class CswDcatBackend(DcatBackend):
                     kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
                     self.add_item(id, **kwargs)
             graphs.append(graph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
+            start = next_record
             page += 1

-
-
-
+            tree = ET.fromstring(
+                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
+                          headers=headers).content)
+
+        return graphs
+
+
+
+class CswIso19139DcatBackend(DcatBackend):
+    '''
+    An harvester that takes CSW ISO 19139 as input and transforms it to DCAT using SEMIC GeoDCAT-AP XSLT.
+    The parsing of items is then the same as for the DcatBackend.
+    '''
+
+    display_name = 'CSW-ISO-19139'
+
+    ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'

-
-            # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
-            break_conditions = (
-                # standard CSW: A value of 0 means all records have been returned.
-                next_record == 0,
+    XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"

-
-
+    def parse_graph(self, url: str, fmt: str) -> List[Graph]:
+        '''
+        Parse CSW graph querying ISO schema.
+        Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
+        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
+        '''
+
+        # Load XSLT
+        xsl = ET.fromstring(self.get(self.XSL_URL).content)
+        transform = ET.XSLT(xsl)
+
+        # Start querying and parsing graph
+        body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+                                  xmlns:gmd="http://www.isotc211.org/2005/gmd"
+                                  service="CSW" version="2.0.2" resultType="results"
+                                  startPosition="{start}" maxPosition="10"
+                                  outputSchema="{schema}">
+            <csw:Query typeNames="csw:Record">
+                <csw:ElementSetName>full</csw:ElementSetName>
+                <csw:Constraint version="1.1.0">
+                    <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
+                        <ogc:PropertyIsEqualTo>
+                            <ogc:PropertyName>dc:type</ogc:PropertyName>
+                            <ogc:Literal>dataset</ogc:Literal>
+                        </ogc:PropertyIsEqualTo>
+                    </ogc:Filter>
+                </csw:Constraint>
+            </csw:Query>
+        </csw:GetRecords>'''
+        headers = {'Content-Type': 'application/xml'}

-
-
+        graphs = []
+        page = 0
+        start = 1

-
-
+        response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                             headers=headers)
+        response.raise_for_status()

-
-
-
+        tree_before_transform = ET.fromstring(response.content)
+        # Disabling CoupledResourceLookUp to prevent failure on xlink:href
+        # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
+        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

-
+        while tree:
+            # We query the tree before the transformation because the XSLT remove the search results
+            # infos (useful for pagination)
+            search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
+            if search_results is None:
+                log.error(f'No search results found for {url} on page {page}')
                 break

+            subgraph = Graph(namespace_manager=namespace_manager)
+            subgraph.parse(ET.tostring(tree), format=fmt)
+
+            if not subgraph.subjects(RDF.type, DCAT.Dataset):
+                raise ValueError("Failed to fetch CSW content")
+
+            for node in subgraph.subjects(RDF.type, DCAT.Dataset):
+                id = subgraph.value(node, DCT.identifier)
+                kwargs = {'nid': str(node), 'page': page}
+                kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
+                self.add_item(id, **kwargs)
+            graphs.append(subgraph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
             start = next_record
-
-
-
+            page += 1
+
+            response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                                 headers=headers)
+            response.raise_for_status()
+
+            tree_before_transform = ET.fromstring(response.content)
+            tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

         return graphs
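
One detail of the new CswIso19139DcatBackend worth calling out is the lxml XSLT invocation: string parameters are passed as XPath expressions, which is why the diff quotes the value twice in CoupledResourceLookUp="'disabled'". Below is a small offline sketch of that call pattern with a toy stylesheet, standing in for the SEMIC iso-19139-to-dcat-ap XSLT that the backend would normally fetch from XSL_URL.

import lxml.etree as ET

# Toy stylesheet: echoes the CoupledResourceLookUp parameter into the output.
xsl = ET.fromstring('''<xsl:stylesheet version="1.0"
        xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:param name="CoupledResourceLookUp" select="'enabled'"/>
    <xsl:template match="/">
        <result lookup="{$CoupledResourceLookUp}"/>
    </xsl:template>
</xsl:stylesheet>''')
transform = ET.XSLT(xsl)

tree_before_transform = ET.fromstring('<record/>')
# The inner quotes make the value a literal XPath string, as in the diff above.
tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
print(ET.tostring(tree))  # b'<result lookup="disabled"/>'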