udata 7.0.5.dev28140__py2.py3-none-any.whl → 7.0.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of udata was flagged by the registry.

Files changed (74)
  1. udata/__init__.py +1 -1
  2. udata/commands/__init__.py +0 -3
  3. udata/commands/dcat.py +7 -4
  4. udata/core/activity/api.py +1 -1
  5. udata/core/activity/models.py +4 -3
  6. udata/core/activity/tasks.py +6 -5
  7. udata/core/dataset/factories.py +2 -4
  8. udata/core/dataset/models.py +2 -5
  9. udata/core/dataset/rdf.py +65 -25
  10. udata/core/dataset/search.py +0 -1
  11. udata/core/spatial/tests/test_api.py +7 -10
  12. udata/core/topic/factories.py +2 -2
  13. udata/harvest/backends/dcat.py +128 -24
  14. udata/harvest/tests/csw_dcat/XSLT.xml +4298 -0
  15. udata/harvest/tests/csw_dcat/geonetwork-iso-page-1.xml +1291 -0
  16. udata/harvest/tests/csw_dcat/geonetwork-iso-page-3.xml +1139 -0
  17. udata/harvest/tests/csw_dcat/geonetwork-iso-page-5.xml +1266 -0
  18. udata/harvest/tests/dcat/bnodes.xml +7 -1
  19. udata/harvest/tests/dcat/evian.json +464 -0
  20. udata/harvest/tests/test_dcat_backend.py +82 -9
  21. udata/migrations/2024-03-22-migrate-activity-kwargs-to-extras.py +16 -0
  22. udata/rdf.py +22 -1
  23. udata/search/__init__.py +2 -2
  24. udata/static/chunks/{11.c0ccea08914b6b41568e.js → 11.a23c110811a9ac943478.js} +3 -3
  25. udata/static/chunks/{11.c0ccea08914b6b41568e.js.map → 11.a23c110811a9ac943478.js.map} +1 -1
  26. udata/static/chunks/{13.526a25163ababaa44409.js → 13.0889e093f8664e38568c.js} +2 -2
  27. udata/static/chunks/{13.526a25163ababaa44409.js.map → 13.0889e093f8664e38568c.js.map} +1 -1
  28. udata/static/chunks/{16.7901839b4227881947f6.js → 16.f41599478d3e97ad9a30.js} +2 -2
  29. udata/static/chunks/{16.7901839b4227881947f6.js.map → 16.f41599478d3e97ad9a30.js.map} +1 -1
  30. udata/static/chunks/{19.471d5a2a08eef6e5338a.js → 19.2b534a26af8b17e9170b.js} +3 -3
  31. udata/static/chunks/{19.471d5a2a08eef6e5338a.js.map → 19.2b534a26af8b17e9170b.js.map} +1 -1
  32. udata/static/chunks/{5.534e0531d0e2b150146f.js → 5.7115454a1183e5c12eef.js} +3 -3
  33. udata/static/chunks/{5.534e0531d0e2b150146f.js.map → 5.7115454a1183e5c12eef.js.map} +1 -1
  34. udata/static/chunks/{6.e56975229e6065f68d2a.js → 6.16bb24fb8240f2746488.js} +3 -3
  35. udata/static/chunks/{6.e56975229e6065f68d2a.js.map → 6.16bb24fb8240f2746488.js.map} +1 -1
  36. udata/static/chunks/{9.534426728626f11f4571.js → 9.3e752966ff14e47e11f2.js} +2 -2
  37. udata/static/chunks/{9.534426728626f11f4571.js.map → 9.3e752966ff14e47e11f2.js.map} +1 -1
  38. udata/static/common.js +1 -1
  39. udata/static/common.js.map +1 -1
  40. udata/tests/api/test_datasets_api.py +45 -45
  41. udata/tests/api/test_me_api.py +13 -14
  42. udata/tests/dataset/test_dataset_actions.py +2 -2
  43. udata/tests/dataset/test_dataset_commands.py +3 -3
  44. udata/tests/dataset/test_dataset_model.py +2 -1
  45. udata/tests/organization/test_organization_model.py +3 -3
  46. udata/tests/organization/test_organization_rdf.py +3 -3
  47. udata/tests/reuse/test_reuse_model.py +2 -2
  48. udata/tests/search/test_adapter.py +12 -12
  49. udata/tests/search/test_results.py +4 -4
  50. udata/tests/site/test_site_api.py +3 -3
  51. udata/tests/site/test_site_metrics.py +3 -3
  52. udata/tests/site/test_site_rdf.py +6 -6
  53. udata/tests/test_activity.py +12 -0
  54. udata/tests/test_transfer.py +18 -17
  55. udata/translations/ar/LC_MESSAGES/udata.mo +0 -0
  56. udata/translations/ar/LC_MESSAGES/udata.po +90 -44
  57. udata/translations/de/LC_MESSAGES/udata.mo +0 -0
  58. udata/translations/de/LC_MESSAGES/udata.po +91 -45
  59. udata/translations/es/LC_MESSAGES/udata.mo +0 -0
  60. udata/translations/es/LC_MESSAGES/udata.po +90 -44
  61. udata/translations/fr/LC_MESSAGES/udata.mo +0 -0
  62. udata/translations/fr/LC_MESSAGES/udata.po +91 -45
  63. udata/translations/it/LC_MESSAGES/udata.mo +0 -0
  64. udata/translations/it/LC_MESSAGES/udata.po +90 -44
  65. udata/translations/pt/LC_MESSAGES/udata.mo +0 -0
  66. udata/translations/pt/LC_MESSAGES/udata.po +91 -45
  67. udata/translations/sr/LC_MESSAGES/udata.mo +0 -0
  68. udata/translations/sr/LC_MESSAGES/udata.po +91 -45
  69. {udata-7.0.5.dev28140.dist-info → udata-7.0.6.dist-info}/METADATA +13 -3
  70. {udata-7.0.5.dev28140.dist-info → udata-7.0.6.dist-info}/RECORD +74 -68
  71. {udata-7.0.5.dev28140.dist-info → udata-7.0.6.dist-info}/entry_points.txt +1 -0
  72. {udata-7.0.5.dev28140.dist-info → udata-7.0.6.dist-info}/LICENSE +0 -0
  73. {udata-7.0.5.dev28140.dist-info → udata-7.0.6.dist-info}/WHEEL +0 -0
  74. {udata-7.0.5.dev28140.dist-info → udata-7.0.6.dist-info}/top_level.txt +0 -0
udata/__init__.py CHANGED
@@ -4,5 +4,5 @@
  udata
  '''

- __version__ = '7.0.5.dev'
+ __version__ = '7.0.6'
  __description__ = 'Open data portal'
udata/commands/__init__.py CHANGED
@@ -145,9 +145,6 @@ def init_logging(app):
  handler.setFormatter(CliFormatter())
  handler.setLevel(log_level)

- logger = logging.getLogger()
- logger.addHandler(handler)
-
  logger = logging.getLogger('__main__')
  logger.setLevel(log_level)
  logger.handlers = []
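The dropped lines registered the CLI handler on the root logger in addition to the named loggers configured just below, so every record that propagated to root was emitted twice. A minimal standalone sketch of that duplication (plain stdlib logging, not udata code):

    import logging
    import sys

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter('%(name)s: %(message)s'))

    # Registering the same handler on the root logger AND on a child
    # logger means each record is handled twice: once by the child's
    # handler, once again after propagating up to root.
    logging.getLogger().addHandler(handler)
    child = logging.getLogger('app')
    child.addHandler(handler)

    child.warning('hello')  # prints 'app: hello' twice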
udata/commands/dcat.py CHANGED
@@ -8,7 +8,7 @@ from rdflib import Graph
  from udata.commands import cli, green, yellow, cyan, echo, magenta
  from udata.core.dataset.factories import DatasetFactory
  from udata.core.dataset.rdf import dataset_from_rdf
- from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
+ from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
  from udata.rdf import namespace_manager

  log = logging.getLogger(__name__)
@@ -23,9 +23,10 @@ def grp():
  @grp.command()
  @click.argument('url')
  @click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
- @click.option('-i', '--rid', help='Inspect specific remote id (contains)')
- @click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
- def parse_url(url, csw, quiet=False, rid=''):
+ @click.option('-r', '--rid', help='Inspect specific remote id (contains)')
+ @click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
+ @click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
+ def parse_url(url, csw, iso, quiet=False, rid=''):
  '''Parse the datasets in a DCAT format located at URL (debug)'''
  if quiet:
  verbose_loggers = ['rdflib', 'udata.core.dataset']
@@ -49,6 +50,8 @@ def parse_url(url, csw, quiet=False, rid=''):
  source.url = url
  if csw:
  backend = CswDcatBackend(source, dryrun=True)
+ elif iso:
+ backend = CswIso19139DcatBackend(source, dryrun=True)
  else:
  backend = DcatBackend(source, dryrun=True)
  backend.job = MockJob()
udata/core/activity/api.py CHANGED
@@ -37,7 +37,7 @@ activity_fields = api.model('Activity', {
  description='The key of the activity', required=True),
  'icon': fields.String(
  description='The icon of the activity', required=True),
- 'kwargs': fields.Raw(description='Some action specific context'),
+ 'extras': fields.Raw(description='Extras attributes as key-value pairs'),
  })

  activity_page_fields = api.model('ActivityPage', fields.pager(activity_fields))
udata/core/activity/models.py CHANGED
@@ -37,7 +37,7 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
  related_to = db.ReferenceField(db.DomainModel, required=True)
  created_at = db.DateTimeField(default=datetime.utcnow, required=True)

- kwargs = db.DictField()
+ extras = db.ExtrasField()

  on_new = Signal()

@@ -65,8 +65,9 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
  return cls.on_new.connect(func, sender=cls)

  @classmethod
- def emit(cls, related_to, organization=None, **kwargs):
+ def emit(cls, related_to, organization=None, extras=None):
  new_activity.send(cls,
  related_to=related_to,
  actor=current_user._get_current_object(),
- organization=organization)
+ organization=organization,
+ extras=extras)
udata/core/activity/tasks.py CHANGED
@@ -9,22 +9,23 @@ log = logging.getLogger(__name__)


  @new_activity.connect
- def delay_activity(cls, related_to, actor, organization=None):
+ def delay_activity(cls, related_to, actor, organization=None, extras=None):
  emit_activity.delay(
  cls.__name__,
  str(actor.id),
  related_to_cls=related_to.__class__.__name__,
  related_to_id=str(related_to.id),
  organization_id=str(organization.id) if organization else None,
+ extras=extras
  )


  @task
  def emit_activity(classname, actor_id, related_to_cls, related_to_id,
- organization_id=None):
- log.debug('Emit new activity: %s %s %s %s %s',
+ organization_id=None, extras=None):
+ log.debug('Emit new activity: %s %s %s %s %s %s',
  classname, actor_id, related_to_cls,
- related_to_id, organization_id)
+ related_to_id, organization_id, extras)
  cls = db.resolve_model(classname)
  actor = User.objects.get(pk=actor_id)
  related_to = db.resolve_model(related_to_cls).objects.get(pk=related_to_id)
@@ -33,4 +34,4 @@ def emit_activity(classname, actor_id, related_to_cls, related_to_id,
  else:
  organization = None
  cls.objects.create(actor=actor, related_to=related_to,
- organization=organization)
+ organization=organization, extras=extras)
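Together these hunks rename the activity `kwargs` field to a proper `extras` dict and thread it from `Activity.emit()` through the Blinker signal and the Celery task down to the stored document; the `2024-03-22-migrate-activity-kwargs-to-extras.py` migration listed above moves existing data over. A hedged sketch of the new call path (`UserCreatedDataset` is an invented subclass name and `dataset` an existing document, for illustration only):

    from udata.core.activity.models import Activity

    # Hypothetical subclass; any Activity subclass registered via
    # EmitNewActivityMetaClass would behave the same.
    class UserCreatedDataset(Activity):
        key = 'dataset:created'

    # Free-form **kwargs are gone; context now travels as one explicit
    # extras dict, persisted on the document's ExtrasField.
    UserCreatedDataset.emit(
        related_to=dataset,
        organization=dataset.organization,
        extras={'source': 'api'},
    )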
udata/core/dataset/factories.py CHANGED
@@ -34,10 +34,8 @@ class DatasetFactory(ModelFactory):
  nb_resources = 0


- class VisibleDatasetFactory(DatasetFactory):
- @factory.lazy_attribute
- def resources(self):
- return [ResourceFactory()]
+ class HiddenDatasetFactory(DatasetFactory):
+ private = True


  class ChecksumFactory(ModelFactory):
udata/core/dataset/models.py CHANGED
@@ -342,12 +342,10 @@ class License(db.Document):

  class DatasetQuerySet(db.OwnedQuerySet):
  def visible(self):
- return self(private__ne=True, resources__0__exists=True,
- deleted=None, archived=None)
+ return self(private__ne=True, deleted=None, archived=None)

  def hidden(self):
  return self(db.Q(private=True) |
- db.Q(resources__0__exists=False) |
  db.Q(deleted__ne=None) |
  db.Q(archived__ne=None))

@@ -677,8 +675,7 @@ class Dataset(WithMetrics, BadgeMixin, db.Owned, db.Document):

  @property
  def is_hidden(self):
- return (len(self.resources) == 0 or self.private or self.deleted
- or self.archived)
+ return self.private or self.deleted or self.archived

  @property
  def full_title(self):
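These two hunks change visibility semantics: a dataset with zero resources is no longer hidden; only `private`, `deleted`, and `archived` matter (the search adapter below and the factories above follow suit). A short sketch of the new behavior, assuming a test context where the factories can run:

    from udata.core.dataset.models import Dataset
    from udata.core.dataset.factories import DatasetFactory

    dataset = DatasetFactory(resources=[])

    # Before 7.0.6 a resource-less dataset was hidden and unindexable;
    # now it stays visible as long as it is not private, deleted,
    # or archived.
    assert not dataset.is_hidden
    assert Dataset.objects.visible().filter(id=dataset.id).count() == 1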
udata/core/dataset/rdf.py CHANGED
@@ -7,6 +7,7 @@ import logging

  from datetime import date
  from html.parser import HTMLParser
+ from typing import Optional
  from dateutil.parser import parse as parse_dt
  from flask import current_app
  from geomet import wkt
@@ -339,36 +340,51 @@ def contact_point_from_rdf(rdf, dataset):


  def spatial_from_rdf(graph):
+ geojsons = []
  for term in graph.objects(DCT.spatial):
- for object in term.objects():
- if isinstance(object, Literal):
- if object.datatype.__str__() == 'https://www.iana.org/assignments/media-types/application/vnd.geo+json':
- try:
- geojson = json.loads(object.toPython())
- except ValueError as e:
- log.warning(f"Invalid JSON in spatial GeoJSON {object.toPython()} {e}")
- continue
- elif object.datatype.__str__() == 'http://www.opengis.net/rdf#wktLiteral':
- try:
- # .upper() si here because geomet doesn't support Polygon but only POLYGON
- geojson = wkt.loads(object.toPython().strip().upper())
- except ValueError as e:
- log.warning(f"Invalid JSON in spatial WKT {object.toPython()} {e}")
+ try:
+ # This may not be official in the norm but some ArcGis return
+ # bbox as literal directly in DCT.spatial.
+ if isinstance(term, Literal):
+ geojson = bbox_to_geojson_multipolygon(term.toPython())
+ if geojson is not None:
+ geojsons.append(geojson)
+
+ continue
+
+ for object in term.objects():
+ if isinstance(object, Literal):
+ if object.datatype.__str__() == 'https://www.iana.org/assignments/media-types/application/vnd.geo+json':
+ try:
+ geojson = json.loads(object.toPython())
+ except ValueError as e:
+ log.warning(f"Invalid JSON in spatial GeoJSON {object.toPython()} {e}")
+ continue
+ elif object.datatype.__str__() == 'http://www.opengis.net/rdf#wktLiteral':
+ try:
+ # .upper() si here because geomet doesn't support Polygon but only POLYGON
+ geojson = wkt.loads(object.toPython().strip().upper())
+ except ValueError as e:
+ log.warning(f"Invalid JSON in spatial WKT {object.toPython()} {e}")
+ continue
+ else:
  continue
- else:
- continue

- if geojson['type'] == 'Polygon':
- geojson['type'] = 'MultiPolygon'
- geojson['coordinates'] = [geojson['coordinates']]
+ if geojson['type'] == 'Polygon':
+ geojson['type'] = 'MultiPolygon'
+ geojson['coordinates'] = [geojson['coordinates']]

- spatial_coverage = SpatialCoverage(geom=geojson)
+ geojsons.append(geojson)
+ except Exception as e:
+ log.exception(f"Exception during `spatial_from_rdf` for term {term}: {e}", stack_info=True)

- try:
- spatial_coverage.clean()
- return spatial_coverage
- except ValidationError:
- continue
+ for geojson in geojsons:
+ spatial_coverage = SpatialCoverage(geom=geojson)
+ try:
+ spatial_coverage.clean()
+ return spatial_coverage
+ except ValidationError:
+ continue

  return None

@@ -609,3 +625,27 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None):
  dataset.harvest.modified_at = modified_at

  return dataset
+
+ def bbox_to_geojson_multipolygon(bbox_as_str: str) -> Optional[dict] :
+ bbox = bbox_as_str.strip().split(',')
+ if len(bbox) != 4:
+ return None
+
+ west = float(bbox[0])
+ south = float(bbox[1])
+ east = float(bbox[2])
+ north = float(bbox[3])
+
+ low_left = [west, south]
+ top_left = [west, north]
+ top_right = [east, north]
+ low_right = [east, south]
+
+ return {
+ 'type': 'MultiPolygon',
+ 'coordinates': [
+ [
+ [low_left, low_right, top_right, top_left, low_left],
+ ],
+ ],
+ }
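For reference, the new helper accepts a comma-separated `west,south,east,north` string and wraps the box as a single closed ring inside a GeoJSON MultiPolygon. A quick usage sketch with made-up coordinates:

    from udata.core.dataset.rdf import bbox_to_geojson_multipolygon

    geom = bbox_to_geojson_multipolygon('1.0,43.0,2.0,44.0')
    # {'type': 'MultiPolygon',
    #  'coordinates': [[[[1.0, 43.0], [2.0, 43.0], [2.0, 44.0],
    #                    [1.0, 44.0], [1.0, 43.0]]]]}

    # Anything that does not split into exactly four fields is rejected.
    assert bbox_to_geojson_multipolygon('not-a-bbox') is None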
udata/core/dataset/search.py CHANGED
@@ -47,7 +47,6 @@ class DatasetSearch(ModelSearchAdapter):
  @classmethod
  def is_indexable(cls, dataset):
  return (dataset.deleted is None and dataset.archived is None and
- len(dataset.resources) > 0 and
  not dataset.private)

  @classmethod
udata/core/spatial/tests/test_api.py CHANGED
@@ -1,15 +1,12 @@
  from flask import url_for

- from udata.utils import get_by
-
  from udata.utils import faker
  from udata.tests.api import APITestCase
  from udata.tests.features.territories import (
  create_geozones_fixtures, TerritoriesSettings
  )
- from udata.tests.helpers import assert_json_equal
  from udata.core.organization.factories import OrganizationFactory
- from udata.core.dataset.factories import VisibleDatasetFactory
+ from udata.core.dataset.factories import DatasetFactory
  from udata.core.spatial.factories import (
  SpatialCoverageFactory, GeoZoneFactory, GeoLevelFactory
  )
@@ -173,7 +170,7 @@
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- VisibleDatasetFactory(
+ DatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -185,7 +182,7 @@
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- VisibleDatasetFactory(
+ DatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -198,7 +195,7 @@
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- VisibleDatasetFactory(
+ DatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -212,7 +209,7 @@
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- VisibleDatasetFactory(
+ DatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -241,7 +238,7 @@ class SpatialTerritoriesApiTest(APITestCase):
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- VisibleDatasetFactory(
+ DatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -255,7 +252,7 @@
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- VisibleDatasetFactory(
+ DatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))

udata/core/topic/factories.py CHANGED
@@ -1,7 +1,7 @@
  import factory

  from udata import utils
- from udata.core.dataset.factories import VisibleDatasetFactory
+ from udata.core.dataset.factories import DatasetFactory
  from udata.core.reuse.factories import VisibleReuseFactory
  from udata.factories import ModelFactory

@@ -19,7 +19,7 @@ class TopicFactory(ModelFactory):

  @factory.lazy_attribute
  def datasets(self):
- return VisibleDatasetFactory.create_batch(3)
+ return DatasetFactory.create_batch(3)

  @factory.lazy_attribute
  def reuses(self):
udata/harvest/backends/dcat.py CHANGED
@@ -2,7 +2,7 @@ import logging

  from rdflib import Graph, URIRef
  from rdflib.namespace import RDF
- import xml.etree.ElementTree as ET
+ import lxml.etree as ET
  import boto3
  from flask import current_app
  from datetime import date
@@ -173,7 +173,36 @@ class DcatBackend(BaseBackend):
  dataset = self.get_dataset(item.remote_id)
  dataset = dataset_from_rdf(graph, dataset, node=node)
  return dataset
+

+ def next_record_if_should_continue(self, start, search_results):
+ next_record = int(search_results.attrib['nextRecord'])
+ matched_count = int(search_results.attrib['numberOfRecordsMatched'])
+ returned_count = int(search_results.attrib['numberOfRecordsReturned'])
+
+ # Break conditions copied gratefully from
+ # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+ break_conditions = (
+ # standard CSW: A value of 0 means all records have been returned.
+ next_record == 0,
+
+ # Misbehaving CSW server returning a next record > matched count
+ next_record > matched_count,
+
+ # No results returned already
+ returned_count == 0,
+
+ # Current next record is lower than previous one
+ next_record < start,
+
+ # Enough items have been harvested already
+ self.max_items and len(self.job.items) >= self.max_items
+ )
+
+ if any(break_conditions):
+ return None
+ else:
+ return next_record

  class CswDcatBackend(DcatBackend):
  display_name = 'CSW-DCAT'
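The pagination break conditions are now factored into one helper shared by both CSW backends. A standalone sketch of the loop shape it supports (names invented for illustration; `fetch_page(start)` is assumed to return `(records, next_record, matched_count)`):

    def harvest_all(fetch_page, max_items=None):
        items, start = [], 1
        while True:
            records, next_record, matched_count = fetch_page(start)
            items.extend(records)
            if (next_record == 0                     # server is done
                    or next_record > matched_count   # misbehaving server
                    or not records                   # empty page
                    or next_record < start           # going backwards
                    or (max_items and len(items) >= max_items)):
                break
            start = next_record
        return items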
@@ -201,17 +230,18 @@
  graphs = []
  page = 0
  start = 1
+
  response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
  headers=headers)
  response.raise_for_status()
- content = response.text
+ content = response.content
  tree = ET.fromstring(content)
  if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
  raise ValueError(f'Failed to query CSW:\n{content}')
  while tree:
  graph = Graph(namespace_manager=namespace_manager)
  search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
- if not search_results:
+ if search_results is None:
  log.error(f'No search results found for {url} on page {page}')
  break
  for child in search_results:
@@ -225,37 +255,111 @@
  kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
  self.add_item(id, **kwargs)
  graphs.append(graph)
+
+ next_record = self.next_record_if_should_continue(start, search_results)
+ if not next_record:
+ break
+
+ start = next_record
  page += 1

- next_record = int(search_results.attrib['nextRecord'])
- matched_count = int(search_results.attrib['numberOfRecordsMatched'])
- returned_count = int(search_results.attrib['numberOfRecordsReturned'])
+ tree = ET.fromstring(
+ self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
+ headers=headers).content)
+
+ return graphs
+
+
+
+ class CswIso19139DcatBackend(DcatBackend):
+ '''
+ An harvester that takes CSW ISO 19139 as input and transforms it to DCAT using SEMIC GeoDCAT-AP XSLT.
+ The parsing of items is then the same as for the DcatBackend.
+ '''
+
+ display_name = 'CSW-ISO-19139'
+
+ ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'

- # Break conditions copied gratefully from
- # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
- break_conditions = (
- # standard CSW: A value of 0 means all records have been returned.
- next_record == 0,
+ XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"

- # Misbehaving CSW server returning a next record > matched count
- next_record > matched_count,
+ def parse_graph(self, url: str, fmt: str) -> List[Graph]:
+ '''
+ Parse CSW graph querying ISO schema.
+ Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
+ See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
+ '''
+
+ # Load XSLT
+ xsl = ET.fromstring(self.get(self.XSL_URL).content)
+ transform = ET.XSLT(xsl)
+
+ # Start querying and parsing graph
+ body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+ xmlns:gmd="http://www.isotc211.org/2005/gmd"
+ service="CSW" version="2.0.2" resultType="results"
+ startPosition="{start}" maxPosition="10"
+ outputSchema="{schema}">
+ <csw:Query typeNames="csw:Record">
+ <csw:ElementSetName>full</csw:ElementSetName>
+ <csw:Constraint version="1.1.0">
+ <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
+ <ogc:PropertyIsEqualTo>
+ <ogc:PropertyName>dc:type</ogc:PropertyName>
+ <ogc:Literal>dataset</ogc:Literal>
+ </ogc:PropertyIsEqualTo>
+ </ogc:Filter>
+ </csw:Constraint>
+ </csw:Query>
+ </csw:GetRecords>'''
+ headers = {'Content-Type': 'application/xml'}

- # No results returned already
- returned_count == 0,
+ graphs = []
+ page = 0
+ start = 1

- # Current next record is lower than previous one
- next_record < start,
+ response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+ headers=headers)
+ response.raise_for_status()

- # Enough items have been harvested already
- self.max_items and len(self.job.items) >= self.max_items
- )
+ tree_before_transform = ET.fromstring(response.content)
+ # Disabling CoupledResourceLookUp to prevent failure on xlink:href
+ # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
+ tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

- if any(break_conditions):
+ while tree:
+ # We query the tree before the transformation because the XSLT remove the search results
+ # infos (useful for pagination)
+ search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
+ if search_results is None:
+ log.error(f'No search results found for {url} on page {page}')
  break

+ subgraph = Graph(namespace_manager=namespace_manager)
+ subgraph.parse(ET.tostring(tree), format=fmt)
+
+ if not subgraph.subjects(RDF.type, DCAT.Dataset):
+ raise ValueError("Failed to fetch CSW content")
+
+ for node in subgraph.subjects(RDF.type, DCAT.Dataset):
+ id = subgraph.value(node, DCT.identifier)
+ kwargs = {'nid': str(node), 'page': page}
+ kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
+ self.add_item(id, **kwargs)
+ graphs.append(subgraph)
+
+ next_record = self.next_record_if_should_continue(start, search_results)
+ if not next_record:
+ break
+
  start = next_record
- tree = ET.fromstring(
- self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
- headers=headers).text)
+ page += 1
+
+ response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+ headers=headers)
+ response.raise_for_status()
+
+ tree_before_transform = ET.fromstring(response.content)
+ tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

  return graphs
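The double quoting in `CoupledResourceLookUp="'disabled'"` is how lxml passes XSLT parameters: keyword values are evaluated as XPath expressions, so a string literal needs its own inner quotes, or `etree.XSLT.strparam` to escape it. A self-contained sketch of the same pattern, with a toy stylesheet standing in for iso-19139-to-dcat-ap.xsl:

    from lxml import etree

    # Toy stylesheet: copies the 'mode' parameter into the output.
    xsl = etree.XML(b'''
    <xsl:stylesheet version="1.0"
        xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
      <xsl:param name="mode"/>
      <xsl:template match="/"><out mode="{$mode}"/></xsl:template>
    </xsl:stylesheet>''')
    transform = etree.XSLT(xsl)

    doc = etree.XML(b'<in/>')

    # Quote the string literal manually, as the diff above does...
    result = transform(doc, mode="'disabled'")
    # ...or let lxml escape it for you.
    result = transform(doc, mode=etree.XSLT.strparam('disabled'))

    print(etree.tostring(result))  # b'<out mode="disabled"/>'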