udata 7.0.6.dev28263__py2.py3-none-any.whl → 7.0.6.dev28345__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)
  1. udata/commands/dcat.py +7 -4
  2. udata/core/dataset/factories.py +2 -4
  3. udata/core/dataset/models.py +2 -5
  4. udata/core/dataset/search.py +0 -1
  5. udata/core/spatial/tests/test_api.py +7 -10
  6. udata/core/topic/factories.py +2 -2
  7. udata/harvest/backends/dcat.py +128 -24
  8. udata/harvest/tests/csw_dcat/XSLT.xml +4298 -0
  9. udata/harvest/tests/csw_dcat/geonetwork-iso-page-1.xml +1291 -0
  10. udata/harvest/tests/csw_dcat/geonetwork-iso-page-3.xml +1139 -0
  11. udata/harvest/tests/csw_dcat/geonetwork-iso-page-5.xml +1266 -0
  12. udata/harvest/tests/test_dcat_backend.py +63 -5
  13. udata/rdf.py +1 -0
  14. udata/search/__init__.py +2 -2
  15. udata/tests/api/test_datasets_api.py +43 -44
  16. udata/tests/api/test_me_api.py +13 -14
  17. udata/tests/dataset/test_dataset_actions.py +2 -2
  18. udata/tests/dataset/test_dataset_commands.py +3 -3
  19. udata/tests/organization/test_organization_model.py +3 -3
  20. udata/tests/organization/test_organization_rdf.py +3 -3
  21. udata/tests/reuse/test_reuse_model.py +2 -2
  22. udata/tests/search/test_adapter.py +12 -12
  23. udata/tests/search/test_results.py +4 -4
  24. udata/tests/site/test_site_api.py +3 -3
  25. udata/tests/site/test_site_metrics.py +3 -3
  26. udata/tests/site/test_site_rdf.py +6 -6
  27. udata/tests/test_transfer.py +18 -17
  28. {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28345.dist-info}/METADATA +3 -1
  29. {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28345.dist-info}/RECORD +33 -29
  30. {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28345.dist-info}/entry_points.txt +1 -0
  31. {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28345.dist-info}/LICENSE +0 -0
  32. {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28345.dist-info}/WHEEL +0 -0
  33. {udata-7.0.6.dev28263.dist-info → udata-7.0.6.dev28345.dist-info}/top_level.txt +0 -0
udata/commands/dcat.py CHANGED
@@ -8,7 +8,7 @@ from rdflib import Graph
 from udata.commands import cli, green, yellow, cyan, echo, magenta
 from udata.core.dataset.factories import DatasetFactory
 from udata.core.dataset.rdf import dataset_from_rdf
-from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
+from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
 from udata.rdf import namespace_manager
 
 log = logging.getLogger(__name__)
@@ -23,9 +23,10 @@ def grp():
 @grp.command()
 @click.argument('url')
 @click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
-@click.option('-i', '--rid', help='Inspect specific remote id (contains)')
-@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
-def parse_url(url, csw, quiet=False, rid=''):
+@click.option('-r', '--rid', help='Inspect specific remote id (contains)')
+@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
+@click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
+def parse_url(url, csw, iso, quiet=False, rid=''):
     '''Parse the datasets in a DCAT format located at URL (debug)'''
     if quiet:
         verbose_loggers = ['rdflib', 'udata.core.dataset']
@@ -49,6 +50,8 @@ def parse_url(url, csw, quiet=False, rid=''):
     source.url = url
     if csw:
        backend = CswDcatBackend(source, dryrun=True)
+    elif iso:
+        backend = CswIso19139DcatBackend(source, dryrun=True)
     else:
         backend = DcatBackend(source, dryrun=True)
     backend.job = MockJob()
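The new --iso flag selects the CswIso19139DcatBackend; note that the short flag -i moved from --rid (now -r) to --iso. A minimal sketch of exercising the command through Click's test runner; the endpoint URL is illustrative, not a real server:

    # Hedged sketch: driving the updated parse_url command with CliRunner.
    from click.testing import CliRunner
    from udata.commands.dcat import parse_url

    runner = CliRunner()
    # Dry-run parse of a CSW endpoint whose output is ISO 19139
    result = runner.invoke(parse_url, ['--iso', 'https://example.org/csw'])
    print(result.output)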
udata/core/dataset/factories.py CHANGED
@@ -34,10 +34,8 @@ class DatasetFactory(ModelFactory):
     nb_resources = 0
 
 
-class VisibleDatasetFactory(DatasetFactory):
-    @factory.lazy_attribute
-    def resources(self):
-        return [ResourceFactory()]
+class HiddenDatasetFactory(DatasetFactory):
+    private = True
 
 
 class ChecksumFactory(ModelFactory):
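With resources no longer part of the visibility rules (see the models.py change below), the dedicated VisibleDatasetFactory becomes redundant and the factory flips to cover the opposite case. A hedged before/after sketch for test code touched by this rename:

    from udata.core.dataset.factories import DatasetFactory, HiddenDatasetFactory

    visible = DatasetFactory()        # visible by default, no resources needed
    hidden = HiddenDatasetFactory()   # hidden via private=True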
udata/core/dataset/models.py CHANGED
@@ -342,12 +342,10 @@ class License(db.Document):
 
 class DatasetQuerySet(db.OwnedQuerySet):
     def visible(self):
-        return self(private__ne=True, resources__0__exists=True,
-                    deleted=None, archived=None)
+        return self(private__ne=True, deleted=None, archived=None)
 
     def hidden(self):
         return self(db.Q(private=True) |
-                    db.Q(resources__0__exists=False) |
                     db.Q(deleted__ne=None) |
                     db.Q(archived__ne=None))
 
@@ -677,8 +675,7 @@ class Dataset(WithMetrics, BadgeMixin, db.Owned, db.Document):
 
     @property
     def is_hidden(self):
-        return (len(self.resources) == 0 or self.private or self.deleted
-                or self.archived)
+        return self.private or self.deleted or self.archived
 
     @property
     def full_title(self):
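The net effect: a dataset with zero resources is no longer hidden; only private, deleted, or archived datasets are. A hedged illustration of the relaxed semantics (assumes a live test database, as in udata's own suite; the asserts express intent, not a real test):

    from udata.core.dataset.factories import DatasetFactory
    from udata.models import Dataset

    empty = DatasetFactory(nb_resources=0)      # dataset without resources
    assert not empty.is_hidden                  # hidden before this change
    assert empty in Dataset.objects.visible()   # now matched by visible()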
udata/core/dataset/search.py CHANGED
@@ -47,7 +47,6 @@ class DatasetSearch(ModelSearchAdapter):
     @classmethod
     def is_indexable(cls, dataset):
         return (dataset.deleted is None and dataset.archived is None and
-                len(dataset.resources) > 0 and
                 not dataset.private)
 
     @classmethod
udata/core/spatial/tests/test_api.py CHANGED
@@ -1,15 +1,12 @@
 from flask import url_for
 
-from udata.utils import get_by
-
 from udata.utils import faker
 from udata.tests.api import APITestCase
 from udata.tests.features.territories import (
     create_geozones_fixtures, TerritoriesSettings
 )
-from udata.tests.helpers import assert_json_equal
 from udata.core.organization.factories import OrganizationFactory
-from udata.core.dataset.factories import VisibleDatasetFactory
+from udata.core.dataset.factories import DatasetFactory
 from udata.core.spatial.factories import (
     SpatialCoverageFactory, GeoZoneFactory, GeoLevelFactory
 )
@@ -173,7 +170,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -185,7 +182,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -198,7 +195,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -212,7 +209,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -241,7 +238,7 @@ class SpatialTerritoriesApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -255,7 +252,7 @@ class SpatialTerritoriesApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
 
udata/core/topic/factories.py CHANGED
@@ -1,7 +1,7 @@
 import factory
 
 from udata import utils
-from udata.core.dataset.factories import VisibleDatasetFactory
+from udata.core.dataset.factories import DatasetFactory
 from udata.core.reuse.factories import VisibleReuseFactory
 from udata.factories import ModelFactory
 
@@ -19,7 +19,7 @@ class TopicFactory(ModelFactory):
 
     @factory.lazy_attribute
     def datasets(self):
-        return VisibleDatasetFactory.create_batch(3)
+        return DatasetFactory.create_batch(3)
 
     @factory.lazy_attribute
     def reuses(self):
udata/harvest/backends/dcat.py CHANGED
@@ -2,7 +2,7 @@ import logging
 
 from rdflib import Graph, URIRef
 from rdflib.namespace import RDF
-import xml.etree.ElementTree as ET
+import lxml.etree as ET
 import boto3
 from flask import current_app
 from datetime import date
@@ -173,7 +173,36 @@ class DcatBackend(BaseBackend):
         dataset = self.get_dataset(item.remote_id)
         dataset = dataset_from_rdf(graph, dataset, node=node)
         return dataset
+
+    def next_record_if_should_continue(self, start, search_results):
+        next_record = int(search_results.attrib['nextRecord'])
+        matched_count = int(search_results.attrib['numberOfRecordsMatched'])
+        returned_count = int(search_results.attrib['numberOfRecordsReturned'])
+
+        # Break conditions copied gratefully from
+        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+        break_conditions = (
+            # standard CSW: A value of 0 means all records have been returned.
+            next_record == 0,
+
+            # Misbehaving CSW server returning a next record > matched count
+            next_record > matched_count,
+
+            # No results returned already
+            returned_count == 0,
+
+            # Current next record is lower than previous one
+            next_record < start,
+
+            # Enough items have been harvested already
+            self.max_items and len(self.job.items) >= self.max_items
+        )
+
+        if any(break_conditions):
+            return None
+        else:
+            return next_record
 
 
 class CswDcatBackend(DcatBackend):
     display_name = 'CSW-DCAT'
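The helper centralises pagination stop logic that was previously inlined in CswDcatBackend.parse_graph, so both CSW backends can share it. A hedged illustration of its contract against a fabricated csw:SearchResults element (attribute values are invented for the example):

    import lxml.etree as ET

    xml = (b'<csw:SearchResults xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" '
           b'nextRecord="11" numberOfRecordsMatched="25" numberOfRecordsReturned="10"/>')
    search_results = ET.fromstring(xml)
    # With start=1: nextRecord is 11, within the matched count, records were
    # returned, and 11 is not lower than the previous start, so the helper
    # returns 11 and harvesting continues with the next page.
    # With nextRecord="0" (all records served) it returns None and the caller breaks.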
@@ -201,17 +230,18 @@ class CswDcatBackend(DcatBackend):
         graphs = []
         page = 0
         start = 1
+
         response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
                              headers=headers)
         response.raise_for_status()
-        content = response.text
+        content = response.content
         tree = ET.fromstring(content)
         if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
             raise ValueError(f'Failed to query CSW:\n{content}')
         while tree:
             graph = Graph(namespace_manager=namespace_manager)
             search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
-            if not search_results:
+            if search_results is None:
                 log.error(f'No search results found for {url} on page {page}')
                 break
             for child in search_results:
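Two of these fixes follow directly from the switch to lxml: lxml refuses str input that carries an XML encoding declaration (hence response.content, which is bytes), and an lxml element with no children is falsy (hence the explicit is None check). A small demonstration of both pitfalls:

    import lxml.etree as ET

    ET.fromstring(b'<?xml version="1.0" encoding="UTF-8"?><root/>')  # bytes: fine
    # The same call with a str raises ValueError: "Unicode strings with
    # encoding declaration are not supported".

    empty = ET.fromstring(b'<root/>')
    # Childless elements are falsy, so `if not element` would wrongly treat
    # an empty <csw:SearchResults> as missing; test `is None` instead.
    assert not empty and empty is not None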
@@ -225,37 +255,111 @@ class CswDcatBackend(DcatBackend):
                 kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
                 self.add_item(id, **kwargs)
             graphs.append(graph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
+            start = next_record
             page += 1
 
-            next_record = int(search_results.attrib['nextRecord'])
-            matched_count = int(search_results.attrib['numberOfRecordsMatched'])
-            returned_count = int(search_results.attrib['numberOfRecordsReturned'])
+            tree = ET.fromstring(
+                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
+                          headers=headers).content)
+
+        return graphs
+
+
+
+class CswIso19139DcatBackend(DcatBackend):
+    '''
+    An harvester that takes CSW ISO 19139 as input and transforms it to DCAT using SEMIC GeoDCAT-AP XSLT.
+    The parsing of items is then the same as for the DcatBackend.
+    '''
+
+    display_name = 'CSW-ISO-19139'
+
+    ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'
 
-            # Break conditions copied gratefully from
-            # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
-            break_conditions = (
-                # standard CSW: A value of 0 means all records have been returned.
-                next_record == 0,
+    XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"
 
-                # Misbehaving CSW server returning a next record > matched count
-                next_record > matched_count,
+    def parse_graph(self, url: str, fmt: str) -> List[Graph]:
+        '''
+        Parse CSW graph querying ISO schema.
+        Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
+        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
+        '''
+
+        # Load XSLT
+        xsl = ET.fromstring(self.get(self.XSL_URL).content)
+        transform = ET.XSLT(xsl)
+
+        # Start querying and parsing graph
+        body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+                  xmlns:gmd="http://www.isotc211.org/2005/gmd"
+                  service="CSW" version="2.0.2" resultType="results"
+                  startPosition="{start}" maxPosition="10"
+                  outputSchema="{schema}">
+            <csw:Query typeNames="csw:Record">
+                <csw:ElementSetName>full</csw:ElementSetName>
+                <csw:Constraint version="1.1.0">
+                    <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
+                        <ogc:PropertyIsEqualTo>
+                            <ogc:PropertyName>dc:type</ogc:PropertyName>
+                            <ogc:Literal>dataset</ogc:Literal>
+                        </ogc:PropertyIsEqualTo>
+                    </ogc:Filter>
+                </csw:Constraint>
+            </csw:Query>
+        </csw:GetRecords>'''
+        headers = {'Content-Type': 'application/xml'}
 
-            # No results returned already
-            returned_count == 0,
+        graphs = []
+        page = 0
+        start = 1
 
-            # Current next record is lower than previous one
-            next_record < start,
+        response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                             headers=headers)
+        response.raise_for_status()
 
-            # Enough items have been harvested already
-            self.max_items and len(self.job.items) >= self.max_items
-        )
+        tree_before_transform = ET.fromstring(response.content)
+        # Disabling CoupledResourceLookUp to prevent failure on xlink:href
+        # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
+        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
 
-        if any(break_conditions):
+        while tree:
+            # We query the tree before the transformation because the XSLT remove the search results
+            # infos (useful for pagination)
+            search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
+            if search_results is None:
+                log.error(f'No search results found for {url} on page {page}')
                 break
 
+            subgraph = Graph(namespace_manager=namespace_manager)
+            subgraph.parse(ET.tostring(tree), format=fmt)
+
+            if not subgraph.subjects(RDF.type, DCAT.Dataset):
+                raise ValueError("Failed to fetch CSW content")
+
+            for node in subgraph.subjects(RDF.type, DCAT.Dataset):
+                id = subgraph.value(node, DCT.identifier)
+                kwargs = {'nid': str(node), 'page': page}
+                kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
+                self.add_item(id, **kwargs)
+            graphs.append(subgraph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
             start = next_record
-            tree = ET.fromstring(
-                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
-                          headers=headers).text)
+            page += 1
+
+            response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                                 headers=headers)
+            response.raise_for_status()
+
+            tree_before_transform = ET.fromstring(response.content)
+            tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
 
         return graphs
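For readers new to this pipeline, the backend boils down to: fetch ISO 19139 records over CSW, run the SEMIC XSLT, then hand the resulting DCAT RDF/XML to rdflib. A condensed, hedged sketch of that flow outside the harvester (csw_url and body are placeholders standing in for the GetRecords request above):

    import requests
    import lxml.etree as ET
    from rdflib import Graph

    XSL_URL = ('https://raw.githubusercontent.com/SEMICeu/'
               'iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl')
    transform = ET.XSLT(ET.fromstring(requests.get(XSL_URL).content))

    csw_url = 'https://example.org/csw'  # placeholder endpoint
    body = '...'                         # a csw:GetRecords request as above
    iso_tree = ET.fromstring(requests.post(csw_url, data=body).content)
    # Disable remote xlink:href lookups, as the backend does
    dcat_tree = transform(iso_tree, CoupledResourceLookUp="'disabled'")
    graph = Graph().parse(data=ET.tostring(dcat_tree), format='xml')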