udata 7.0.6.dev28263-py2.py3-none-any.whl → 7.0.6.dev28300-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

udata/commands/dcat.py CHANGED
@@ -8,7 +8,7 @@ from rdflib import Graph
 from udata.commands import cli, green, yellow, cyan, echo, magenta
 from udata.core.dataset.factories import DatasetFactory
 from udata.core.dataset.rdf import dataset_from_rdf
-from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
+from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
 from udata.rdf import namespace_manager

 log = logging.getLogger(__name__)
@@ -23,9 +23,10 @@ def grp():
 @grp.command()
 @click.argument('url')
 @click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
-@click.option('-i', '--rid', help='Inspect specific remote id (contains)')
-@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
-def parse_url(url, csw, quiet=False, rid=''):
+@click.option('-r', '--rid', help='Inspect specific remote id (contains)')
+@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
+@click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
+def parse_url(url, csw, iso, quiet=False, rid=''):
     '''Parse the datasets in a DCAT format located at URL (debug)'''
     if quiet:
         verbose_loggers = ['rdflib', 'udata.core.dataset']
@@ -49,6 +50,8 @@ def parse_url(url, csw, quiet=False, rid=''):
     source.url = url
     if csw:
         backend = CswDcatBackend(source, dryrun=True)
+    elif iso:
+        backend = CswIso19139DcatBackend(source, dryrun=True)
     else:
         backend = DcatBackend(source, dryrun=True)
     backend.job = MockJob()
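
Note: with the new --iso flag, the parse_url debug command can inspect CSW endpoints that only serve ISO 19139 records. A minimal invocation sketch using click's test runner (the endpoint URL is hypothetical and a configured udata application context is assumed):

# Sketch only: exercising the new --iso flag through click's CliRunner.
from click.testing import CliRunner
from udata.commands.dcat import parse_url

runner = CliRunner()
result = runner.invoke(parse_url, ['https://example.org/csw', '--iso', '--quiet'])
print(result.output)

The remaining hunks below apply to the harvest backend module itself (udata.harvest.backends.dcat, per the import above).
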
@@ -2,7 +2,7 @@ import logging

 from rdflib import Graph, URIRef
 from rdflib.namespace import RDF
-import xml.etree.ElementTree as ET
+import lxml.etree as ET
 import boto3
 from flask import current_app
 from datetime import date
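
Note: the switch from xml.etree.ElementTree to lxml.etree is what enables the ISO backend further down, since the standard-library parser has no XSLT support while lxml exposes it as ET.XSLT. A minimal sketch of that capability with a toy stylesheet (not the SEMIC one):

# lxml keeps the familiar fromstring() entry point but adds XSLT support.
import lxml.etree as ET

doc = ET.fromstring(b'<records><record id="1"/><record id="2"/></records>')
xsl = ET.fromstring(b'''<xsl:stylesheet version="1.0"
        xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:template match="/records">
        <count><xsl:value-of select="count(record)"/></count>
    </xsl:template>
</xsl:stylesheet>''')

transform = ET.XSLT(xsl)
print(str(transform(doc)))  # serialized result, e.g. <count>2</count>
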
@@ -173,7 +173,36 @@ class DcatBackend(BaseBackend):
         dataset = self.get_dataset(item.remote_id)
         dataset = dataset_from_rdf(graph, dataset, node=node)
         return dataset
+

+    def next_record_if_should_continue(self, start, search_results):
+        next_record = int(search_results.attrib['nextRecord'])
+        matched_count = int(search_results.attrib['numberOfRecordsMatched'])
+        returned_count = int(search_results.attrib['numberOfRecordsReturned'])
+
+        # Break conditions copied gratefully from
+        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+        break_conditions = (
+            # standard CSW: A value of 0 means all records have been returned.
+            next_record == 0,
+
+            # Misbehaving CSW server returning a next record > matched count
+            next_record > matched_count,
+
+            # No results returned already
+            returned_count == 0,
+
+            # Current next record is lower than previous one
+            next_record < start,
+
+            # Enough items have been harvested already
+            self.max_items and len(self.job.items) >= self.max_items
+        )
+
+        if any(break_conditions):
+            return None
+        else:
+            return next_record

 class CswDcatBackend(DcatBackend):
     display_name = 'CSW-DCAT'
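
Note: the break conditions mirror GeoNetwork's own CSW harvester (see the link in the comment above). A standalone sketch of the same decision logic, for illustration only (the function below is hypothetical and not part of udata):

# Hypothetical standalone version of the pagination decision, for illustration.
import lxml.etree as ET

def next_record_or_none(start, search_results, harvested, max_items=None):
    next_record = int(search_results.attrib['nextRecord'])
    matched = int(search_results.attrib['numberOfRecordsMatched'])
    returned = int(search_results.attrib['numberOfRecordsReturned'])
    stop = (
        next_record == 0            # server says every record was returned
        or next_record > matched    # misbehaving server
        or returned == 0            # empty page
        or next_record < start      # pagination went backwards
        or (max_items and harvested >= max_items)  # harvest quota reached
    )
    return None if stop else next_record

results = ET.fromstring(
    b'<csw:SearchResults xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" '
    b'nextRecord="11" numberOfRecordsMatched="42" numberOfRecordsReturned="10"/>')
assert next_record_or_none(1, results, harvested=10) == 11     # keep paginating
results.attrib['nextRecord'] = '0'
assert next_record_or_none(11, results, harvested=20) is None  # all records returned
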
@@ -201,17 +230,18 @@ class CswDcatBackend(DcatBackend):
         graphs = []
         page = 0
         start = 1
+
         response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
                              headers=headers)
         response.raise_for_status()
-        content = response.text
+        content = response.content
         tree = ET.fromstring(content)
         if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
             raise ValueError(f'Failed to query CSW:\n{content}')
         while tree:
             graph = Graph(namespace_manager=namespace_manager)
             search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
-            if not search_results:
+            if search_results is None:
                 log.error(f'No search results found for {url} on page {page}')
                 break
             for child in search_results:
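
Note: two small but load-bearing fixes in this hunk. Passing response.content (bytes) instead of response.text avoids lxml's "Unicode strings with encoding declaration are not supported" error when the server sends an XML declaration, and the explicit None check is needed because an element with no children evaluates as false. A quick sketch of the latter:

# Why `is None` replaces the truthiness test.
import lxml.etree as ET

results = ET.fromstring(b'<SearchResults/>')  # element present, just empty
assert results is not None   # it exists
assert len(results) == 0     # ...but has no child records, so it is 'falsy'
# `if not search_results:` would bail out on a legitimately empty page, whereas
# `if search_results is None:` only triggers when <SearchResults> is missing.
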
@@ -225,37 +255,111 @@ class CswDcatBackend(DcatBackend):
                 kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
                 self.add_item(id, **kwargs)
             graphs.append(graph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
+            start = next_record
             page += 1

-            next_record = int(search_results.attrib['nextRecord'])
-            matched_count = int(search_results.attrib['numberOfRecordsMatched'])
-            returned_count = int(search_results.attrib['numberOfRecordsReturned'])
+            tree = ET.fromstring(
+                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
+                          headers=headers).content)
+
+        return graphs
+
+
+
+class CswIso19139DcatBackend(DcatBackend):
+    '''
+    An harvester that takes CSW ISO 19139 as input and transforms it to DCAT using SEMIC GeoDCAT-AP XSLT.
+    The parsing of items is then the same as for the DcatBackend.
+    '''
+
+    display_name = 'CSW-ISO-19139'
+
+    ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'

-            # Break conditions copied gratefully from
-            # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
-            break_conditions = (
-                # standard CSW: A value of 0 means all records have been returned.
-                next_record == 0,
+    XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"

-                # Misbehaving CSW server returning a next record > matched count
-                next_record > matched_count,
+    def parse_graph(self, url: str, fmt: str) -> List[Graph]:
+        '''
+        Parse CSW graph querying ISO schema.
+        Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
+        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
+        '''
+
+        # Load XSLT
+        xsl = ET.fromstring(self.get(self.XSL_URL).content)
+        transform = ET.XSLT(xsl)
+
+        # Start querying and parsing graph
+        body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+                                  xmlns:gmd="http://www.isotc211.org/2005/gmd"
+                                  service="CSW" version="2.0.2" resultType="results"
+                                  startPosition="{start}" maxPosition="10"
+                                  outputSchema="{schema}">
+                      <csw:Query typeNames="csw:Record">
+                          <csw:ElementSetName>full</csw:ElementSetName>
+                          <csw:Constraint version="1.1.0">
+                              <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
+                                  <ogc:PropertyIsEqualTo>
+                                      <ogc:PropertyName>dc:type</ogc:PropertyName>
+                                      <ogc:Literal>dataset</ogc:Literal>
+                                  </ogc:PropertyIsEqualTo>
+                              </ogc:Filter>
+                          </csw:Constraint>
+                      </csw:Query>
+                  </csw:GetRecords>'''
+        headers = {'Content-Type': 'application/xml'}

-                # No results returned already
-                returned_count == 0,
+        graphs = []
+        page = 0
+        start = 1

-                # Current next record is lower than previous one
-                next_record < start,
+        response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                             headers=headers)
+        response.raise_for_status()

-                # Enough items have been harvested already
-                self.max_items and len(self.job.items) >= self.max_items
-            )
+        tree_before_transform = ET.fromstring(response.content)
+        # Disabling CoupledResourceLookUp to prevent failure on xlink:href
+        # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
+        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

-            if any(break_conditions):
+        while tree:
+            # We query the tree before the transformation because the XSLT remove the search results
+            # infos (useful for pagination)
+            search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
+            if search_results is None:
+                log.error(f'No search results found for {url} on page {page}')
                 break

+            subgraph = Graph(namespace_manager=namespace_manager)
+            subgraph.parse(ET.tostring(tree), format=fmt)
+
+            if not subgraph.subjects(RDF.type, DCAT.Dataset):
+                raise ValueError("Failed to fetch CSW content")
+
+            for node in subgraph.subjects(RDF.type, DCAT.Dataset):
+                id = subgraph.value(node, DCT.identifier)
+                kwargs = {'nid': str(node), 'page': page}
+                kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
+                self.add_item(id, **kwargs)
+            graphs.append(subgraph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
             start = next_record
-            tree = ET.fromstring(
-                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
-                          headers=headers).text)
+            page += 1
+
+            response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                                 headers=headers)
+            response.raise_for_status()
+
+            tree_before_transform = ET.fromstring(response.content)
+            tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

         return graphs
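
Note: stripped of harvesting plumbing, the pipeline introduced by CswIso19139DcatBackend is: fetch the SEMIC stylesheet, apply it with CoupledResourceLookUp disabled, and hand the resulting RDF/XML to rdflib. A simplified sketch under those assumptions (the local file name is hypothetical and stands in for a CSW GetRecords response in the ISO output schema):

# Simplified sketch of the ISO 19139 -> DCAT-AP mapping used by the new backend.
import requests
import lxml.etree as ET
from rdflib import Graph

XSL_URL = ('https://raw.githubusercontent.com/SEMICeu/'
           'iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl')

transform = ET.XSLT(ET.fromstring(requests.get(XSL_URL).content))

with open('getrecords-iso.xml', 'rb') as f:        # hypothetical saved CSW response
    iso_tree = ET.fromstring(f.read())

# Disable CoupledResourceLookUp so the stylesheet does not dereference xlink:href URLs.
dcat_tree = transform(iso_tree, CoupledResourceLookUp="'disabled'")

graph = Graph()
graph.parse(data=ET.tostring(dcat_tree), format='xml')
print(len(graph), 'triples produced by the mapping')
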