udata 7.0.6__py2.py3-none-any.whl → 7.0.6.dev28209__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of udata might be problematic.
Files changed (43)
  1. udata/__init__.py +1 -1
  2. udata/commands/__init__.py +3 -0
  3. udata/commands/dcat.py +4 -7
  4. udata/core/activity/api.py +1 -1
  5. udata/core/activity/models.py +3 -4
  6. udata/core/activity/tasks.py +5 -6
  7. udata/core/dataset/factories.py +4 -2
  8. udata/core/dataset/models.py +5 -2
  9. udata/core/dataset/rdf.py +25 -65
  10. udata/core/dataset/search.py +1 -0
  11. udata/core/spatial/tests/test_api.py +10 -7
  12. udata/core/topic/factories.py +2 -2
  13. udata/harvest/backends/dcat.py +24 -128
  14. udata/harvest/tests/test_dcat_backend.py +5 -78
  15. udata/rdf.py +0 -1
  16. udata/search/__init__.py +2 -2
  17. udata/tests/api/test_datasets_api.py +45 -45
  18. udata/tests/api/test_me_api.py +14 -13
  19. udata/tests/dataset/test_dataset_actions.py +2 -2
  20. udata/tests/dataset/test_dataset_commands.py +3 -3
  21. udata/tests/dataset/test_dataset_model.py +1 -2
  22. udata/tests/organization/test_organization_model.py +3 -3
  23. udata/tests/organization/test_organization_rdf.py +3 -3
  24. udata/tests/reuse/test_reuse_model.py +2 -2
  25. udata/tests/search/test_adapter.py +12 -12
  26. udata/tests/search/test_results.py +4 -4
  27. udata/tests/site/test_site_api.py +3 -3
  28. udata/tests/site/test_site_metrics.py +3 -3
  29. udata/tests/site/test_site_rdf.py +6 -6
  30. udata/tests/test_activity.py +0 -12
  31. udata/tests/test_transfer.py +17 -18
  32. {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/METADATA +4 -9
  33. {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/RECORD +37 -43
  34. {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/entry_points.txt +0 -1
  35. udata/harvest/tests/csw_dcat/XSLT.xml +0 -4298
  36. udata/harvest/tests/csw_dcat/geonetwork-iso-page-1.xml +0 -1291
  37. udata/harvest/tests/csw_dcat/geonetwork-iso-page-3.xml +0 -1139
  38. udata/harvest/tests/csw_dcat/geonetwork-iso-page-5.xml +0 -1266
  39. udata/harvest/tests/dcat/evian.json +0 -464
  40. udata/migrations/2024-03-22-migrate-activity-kwargs-to-extras.py +0 -16
  41. {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/LICENSE +0 -0
  42. {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/WHEEL +0 -0
  43. {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/top_level.txt +0 -0
udata/__init__.py CHANGED
@@ -4,5 +4,5 @@
  udata
  '''
 
- __version__ = '7.0.6'
+ __version__ = '7.0.6.dev'
  __description__ = 'Open data portal'
udata/commands/__init__.py CHANGED
@@ -145,6 +145,9 @@ def init_logging(app):
  handler.setFormatter(CliFormatter())
  handler.setLevel(log_level)
 
+ logger = logging.getLogger()
+ logger.addHandler(handler)
+
  logger = logging.getLogger('__main__')
  logger.setLevel(log_level)
  logger.handlers = []
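
Net effect of the three added lines: the CLI handler is now attached to the root logger, so records from any module propagate to it, not only those from the loggers configured below. A minimal sketch of the behavior (logger name hypothetical):

    import logging

    handler = logging.StreamHandler()        # stands in for the CLI handler
    logging.getLogger().addHandler(handler)  # root logger catches everything

    # A logger anywhere in the tree now reaches the handler via propagation:
    logging.getLogger('udata.some.module').warning('visible on the CLI')
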
udata/commands/dcat.py CHANGED
@@ -8,7 +8,7 @@ from rdflib import Graph
  from udata.commands import cli, green, yellow, cyan, echo, magenta
  from udata.core.dataset.factories import DatasetFactory
  from udata.core.dataset.rdf import dataset_from_rdf
- from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
+ from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
  from udata.rdf import namespace_manager
 
  log = logging.getLogger(__name__)
@@ -23,10 +23,9 @@ def grp():
  @grp.command()
  @click.argument('url')
  @click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
- @click.option('-r', '--rid', help='Inspect specific remote id (contains)')
- @click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
- @click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
- def parse_url(url, csw, iso, quiet=False, rid=''):
+ @click.option('-i', '--rid', help='Inspect specific remote id (contains)')
+ @click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
+ def parse_url(url, csw, quiet=False, rid=''):
  '''Parse the datasets in a DCAT format located at URL (debug)'''
  if quiet:
  verbose_loggers = ['rdflib', 'udata.core.dataset']
@@ -50,8 +49,6 @@ def parse_url(url, csw, iso, quiet=False, rid=''):
  source.url = url
  if csw:
  backend = CswDcatBackend(source, dryrun=True)
- elif iso:
- backend = CswIso19139DcatBackend(source, dryrun=True)
  else:
  backend = DcatBackend(source, dryrun=True)
  backend.job = MockJob()
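
With the ISO flag gone, `-c/--csw` alone selects the CSW path and `-i` now aliases `--rid`. A hedged invocation example, assuming the group is registered as `dcat` and click's usual underscore-to-dash renaming:

    udata dcat parse-url --csw --rid some-remote-id https://example.com/csw
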
udata/core/activity/api.py CHANGED
@@ -37,7 +37,7 @@ activity_fields = api.model('Activity', {
  description='The key of the activity', required=True),
  'icon': fields.String(
  description='The icon of the activity', required=True),
- 'extras': fields.Raw(description='Extras attributes as key-value pairs'),
+ 'kwargs': fields.Raw(description='Some action specific context'),
  })
 
  activity_page_fields = api.model('ActivityPage', fields.pager(activity_fields))
udata/core/activity/models.py CHANGED
@@ -37,7 +37,7 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
  related_to = db.ReferenceField(db.DomainModel, required=True)
  created_at = db.DateTimeField(default=datetime.utcnow, required=True)
 
- extras = db.ExtrasField()
+ kwargs = db.DictField()
 
  on_new = Signal()
 
@@ -65,9 +65,8 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
  return cls.on_new.connect(func, sender=cls)
 
  @classmethod
- def emit(cls, related_to, organization=None, extras=None):
+ def emit(cls, related_to, organization=None, **kwargs):
  new_activity.send(cls,
  related_to=related_to,
  actor=current_user._get_current_object(),
- organization=organization,
- extras=extras)
+ organization=organization)
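
On the right-hand side, `emit` accepts free-form keyword arguments again (matching the `kwargs` DictField above), though as the body shows they are not forwarded through the signal. A minimal sketch, with a hypothetical concrete subclass:

    class UserCreatedDataset(Activity):     # hypothetical subclass
        key = 'dataset:created'

    # left side:  UserCreatedDataset.emit(dataset, org, extras={'changes': [...]})
    # right side: UserCreatedDataset.emit(dataset, org, changes=['title'])
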
udata/core/activity/tasks.py CHANGED
@@ -9,23 +9,22 @@ log = logging.getLogger(__name__)
 
 
  @new_activity.connect
- def delay_activity(cls, related_to, actor, organization=None, extras=None):
+ def delay_activity(cls, related_to, actor, organization=None):
  emit_activity.delay(
  cls.__name__,
  str(actor.id),
  related_to_cls=related_to.__class__.__name__,
  related_to_id=str(related_to.id),
  organization_id=str(organization.id) if organization else None,
- extras=extras
  )
 
 
  @task
  def emit_activity(classname, actor_id, related_to_cls, related_to_id,
- organization_id=None, extras=None):
- log.debug('Emit new activity: %s %s %s %s %s %s',
+ organization_id=None):
+ log.debug('Emit new activity: %s %s %s %s %s',
  classname, actor_id, related_to_cls,
- related_to_id, organization_id, extras)
+ related_to_id, organization_id)
  cls = db.resolve_model(classname)
  actor = User.objects.get(pk=actor_id)
  related_to = db.resolve_model(related_to_cls).objects.get(pk=related_to_id)
@@ -34,4 +33,4 @@ def emit_activity(classname, actor_id, related_to_cls, related_to_id,
  else:
  organization = None
  cls.objects.create(actor=actor, related_to=related_to,
- organization=organization, extras=extras)
+ organization=organization)
udata/core/dataset/factories.py CHANGED
@@ -34,8 +34,10 @@ class DatasetFactory(ModelFactory):
  nb_resources = 0
 
 
- class HiddenDatasetFactory(DatasetFactory):
- private = True
+ class VisibleDatasetFactory(DatasetFactory):
+ @factory.lazy_attribute
+ def resources(self):
+ return [ResourceFactory()]
 
 
  class ChecksumFactory(ModelFactory):
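
`VisibleDatasetFactory` replaces `HiddenDatasetFactory`: instead of flagging datasets private, tests now build visibility by attaching a resource, since (per the models.py hunks below) a dataset with no resources counts as hidden. A short usage sketch:

    dataset = VisibleDatasetFactory()   # one resource via ResourceFactory
    assert len(dataset.resources) == 1
    assert not dataset.is_hidden

    bare = DatasetFactory()             # nb_resources = 0 on this side
    assert bare.is_hidden
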
udata/core/dataset/models.py CHANGED
@@ -342,10 +342,12 @@ class License(db.Document):
 
  class DatasetQuerySet(db.OwnedQuerySet):
  def visible(self):
- return self(private__ne=True, deleted=None, archived=None)
+ return self(private__ne=True, resources__0__exists=True,
+ deleted=None, archived=None)
 
  def hidden(self):
  return self(db.Q(private=True) |
+ db.Q(resources__0__exists=False) |
  db.Q(deleted__ne=None) |
  db.Q(archived__ne=None))
 
@@ -675,7 +677,8 @@ class Dataset(WithMetrics, BadgeMixin, db.Owned, db.Document):
 
  @property
  def is_hidden(self):
- return self.private or self.deleted or self.archived
+ return (len(self.resources) == 0 or self.private or self.deleted
+ or self.archived)
 
  @property
  def full_title(self):
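
`resources__0__exists` is the MongoEngine spelling of an `$exists` test on the first array element; roughly the raw filter `visible()` now produces (shape assumed, not taken from the source):

    {'private': {'$ne': True},
     'resources.0': {'$exists': True},   # at least one resource
     'deleted': None,
     'archived': None}
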
udata/core/dataset/rdf.py CHANGED
@@ -7,7 +7,6 @@ import logging
 
  from datetime import date
  from html.parser import HTMLParser
- from typing import Optional
  from dateutil.parser import parse as parse_dt
  from flask import current_app
  from geomet import wkt
@@ -340,51 +339,36 @@ def contact_point_from_rdf(rdf, dataset):
 
 
  def spatial_from_rdf(graph):
- geojsons = []
  for term in graph.objects(DCT.spatial):
- try:
- # This may not be official in the norm but some ArcGis return
- # bbox as literal directly in DCT.spatial.
- if isinstance(term, Literal):
- geojson = bbox_to_geojson_multipolygon(term.toPython())
- if geojson is not None:
- geojsons.append(geojson)
-
- continue
-
- for object in term.objects():
- if isinstance(object, Literal):
- if object.datatype.__str__() == 'https://www.iana.org/assignments/media-types/application/vnd.geo+json':
- try:
- geojson = json.loads(object.toPython())
- except ValueError as e:
- log.warning(f"Invalid JSON in spatial GeoJSON {object.toPython()} {e}")
- continue
- elif object.datatype.__str__() == 'http://www.opengis.net/rdf#wktLiteral':
- try:
- # .upper() si here because geomet doesn't support Polygon but only POLYGON
- geojson = wkt.loads(object.toPython().strip().upper())
- except ValueError as e:
- log.warning(f"Invalid JSON in spatial WKT {object.toPython()} {e}")
- continue
- else:
+ for object in term.objects():
+ if isinstance(object, Literal):
+ if object.datatype.__str__() == 'https://www.iana.org/assignments/media-types/application/vnd.geo+json':
+ try:
+ geojson = json.loads(object.toPython())
+ except ValueError as e:
+ log.warning(f"Invalid JSON in spatial GeoJSON {object.toPython()} {e}")
+ continue
+ elif object.datatype.__str__() == 'http://www.opengis.net/rdf#wktLiteral':
+ try:
+ # .upper() si here because geomet doesn't support Polygon but only POLYGON
+ geojson = wkt.loads(object.toPython().strip().upper())
+ except ValueError as e:
+ log.warning(f"Invalid JSON in spatial WKT {object.toPython()} {e}")
  continue
+ else:
+ continue
 
- if geojson['type'] == 'Polygon':
- geojson['type'] = 'MultiPolygon'
- geojson['coordinates'] = [geojson['coordinates']]
+ if geojson['type'] == 'Polygon':
+ geojson['type'] = 'MultiPolygon'
+ geojson['coordinates'] = [geojson['coordinates']]
 
- geojsons.append(geojson)
- except Exception as e:
- log.exception(f"Exception during `spatial_from_rdf` for term {term}: {e}", stack_info=True)
+ spatial_coverage = SpatialCoverage(geom=geojson)
 
- for geojson in geojsons:
- spatial_coverage = SpatialCoverage(geom=geojson)
- try:
- spatial_coverage.clean()
- return spatial_coverage
- except ValidationError:
- continue
+ try:
+ spatial_coverage.clean()
+ return spatial_coverage
+ except ValidationError:
+ continue
 
  return None
 
@@ -625,27 +609,3 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None):
  dataset.harvest.modified_at = modified_at
 
  return dataset
-
- def bbox_to_geojson_multipolygon(bbox_as_str: str) -> Optional[dict] :
- bbox = bbox_as_str.strip().split(',')
- if len(bbox) != 4:
- return None
-
- west = float(bbox[0])
- south = float(bbox[1])
- east = float(bbox[2])
- north = float(bbox[3])
-
- low_left = [west, south]
- top_left = [west, north]
- top_right = [east, north]
- low_right = [east, south]
-
- return {
- 'type': 'MultiPolygon',
- 'coordinates': [
- [
- [low_left, low_right, top_right, top_left, low_left],
- ],
- ],
- }
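
The removed helper handled ArcGIS-style `west,south,east,north` bbox literals; on this side only typed GeoJSON and WKT literals survive. For reference, the Polygon-to-MultiPolygon promotion kept above works like this (coordinates illustrative):

    from geomet import wkt

    # geomet only understands upper-case keywords, hence .upper() in the code
    geojson = wkt.loads('POLYGON ((2.1 45.0, 2.9 45.0, 2.9 45.8, 2.1 45.0))')
    if geojson['type'] == 'Polygon':
        geojson['type'] = 'MultiPolygon'
        geojson['coordinates'] = [geojson['coordinates']]
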
udata/core/dataset/search.py CHANGED
@@ -47,6 +47,7 @@ class DatasetSearch(ModelSearchAdapter):
  @classmethod
  def is_indexable(cls, dataset):
  return (dataset.deleted is None and dataset.archived is None and
+ len(dataset.resources) > 0 and
  not dataset.private)
 
  @classmethod
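
Together with the factory change above, datasets without resources are now skipped at indexing time; a quick check, reusing the factories from this diff:

    assert not DatasetSearch.is_indexable(DatasetFactory())      # 0 resources
    assert DatasetSearch.is_indexable(VisibleDatasetFactory())   # 1 resource
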
udata/core/spatial/tests/test_api.py CHANGED
@@ -1,12 +1,15 @@
  from flask import url_for
 
+ from udata.utils import get_by
+
  from udata.utils import faker
  from udata.tests.api import APITestCase
  from udata.tests.features.territories import (
  create_geozones_fixtures, TerritoriesSettings
  )
+ from udata.tests.helpers import assert_json_equal
  from udata.core.organization.factories import OrganizationFactory
- from udata.core.dataset.factories import DatasetFactory
+ from udata.core.dataset.factories import VisibleDatasetFactory
  from udata.core.spatial.factories import (
  SpatialCoverageFactory, GeoZoneFactory, GeoLevelFactory
  )
@@ -170,7 +173,7 @@ class SpatialApiTest(APITestCase):
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- DatasetFactory(
+ VisibleDatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -182,7 +185,7 @@ class SpatialApiTest(APITestCase):
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- DatasetFactory(
+ VisibleDatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -195,7 +198,7 @@ class SpatialApiTest(APITestCase):
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- DatasetFactory(
+ VisibleDatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -209,7 +212,7 @@ class SpatialApiTest(APITestCase):
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- DatasetFactory(
+ VisibleDatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -238,7 +241,7 @@ class SpatialTerritoriesApiTest(APITestCase):
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- DatasetFactory(
+ VisibleDatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))
 
@@ -252,7 +255,7 @@ class SpatialTerritoriesApiTest(APITestCase):
  paca, bdr, arles = create_geozones_fixtures()
  organization = OrganizationFactory()
  for _ in range(3):
- DatasetFactory(
+ VisibleDatasetFactory(
  organization=organization,
  spatial=SpatialCoverageFactory(zones=[paca.id]))
 
udata/core/topic/factories.py CHANGED
@@ -1,7 +1,7 @@
  import factory
 
  from udata import utils
- from udata.core.dataset.factories import DatasetFactory
+ from udata.core.dataset.factories import VisibleDatasetFactory
  from udata.core.reuse.factories import VisibleReuseFactory
  from udata.factories import ModelFactory
 
@@ -19,7 +19,7 @@ class TopicFactory(ModelFactory):
 
  @factory.lazy_attribute
  def datasets(self):
- return DatasetFactory.create_batch(3)
+ return VisibleDatasetFactory.create_batch(3)
 
  @factory.lazy_attribute
  def reuses(self):
udata/harvest/backends/dcat.py CHANGED
@@ -2,7 +2,7 @@ import logging
 
  from rdflib import Graph, URIRef
  from rdflib.namespace import RDF
- import lxml.etree as ET
+ import xml.etree.ElementTree as ET
  import boto3
  from flask import current_app
  from datetime import date
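
Dropping lxml is what forces the ISO backend removal below (stdlib ElementTree has no XSLT support), but the `fromstring`/`find` calls used by `CswDcatBackend` are API-compatible, including the namespaces mapping. A minimal sketch (namespace value assumed):

    import xml.etree.ElementTree as ET

    CSW_NAMESPACE = 'http://www.opengis.net/cat/csw/2.0.2'  # assumed value

    tree = ET.fromstring(
        '<csw:GetRecordsResponse xmlns:csw="%s">'
        '<csw:SearchResults nextRecord="0"/>'
        '</csw:GetRecordsResponse>' % CSW_NAMESPACE)
    results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
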
@@ -173,36 +173,7 @@ class DcatBackend(BaseBackend):
  dataset = self.get_dataset(item.remote_id)
  dataset = dataset_from_rdf(graph, dataset, node=node)
  return dataset
-
 
- def next_record_if_should_continue(self, start, search_results):
- next_record = int(search_results.attrib['nextRecord'])
- matched_count = int(search_results.attrib['numberOfRecordsMatched'])
- returned_count = int(search_results.attrib['numberOfRecordsReturned'])
-
- # Break conditions copied gratefully from
- # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
- break_conditions = (
- # standard CSW: A value of 0 means all records have been returned.
- next_record == 0,
-
- # Misbehaving CSW server returning a next record > matched count
- next_record > matched_count,
-
- # No results returned already
- returned_count == 0,
-
- # Current next record is lower than previous one
- next_record < start,
-
- # Enough items have been harvested already
- self.max_items and len(self.job.items) >= self.max_items
- )
-
- if any(break_conditions):
- return None
- else:
- return next_record
 
 
  class CswDcatBackend(DcatBackend):
  display_name = 'CSW-DCAT'
@@ -230,18 +201,17 @@ class CswDcatBackend(DcatBackend):
  graphs = []
  page = 0
  start = 1
-
  response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
  headers=headers)
  response.raise_for_status()
- content = response.content
+ content = response.text
  tree = ET.fromstring(content)
  if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
  raise ValueError(f'Failed to query CSW:\n{content}')
  while tree:
  graph = Graph(namespace_manager=namespace_manager)
  search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
- if search_results is None:
+ if not search_results:
  log.error(f'No search results found for {url} on page {page}')
  break
  for child in search_results:
@@ -255,111 +225,37 @@ class CswDcatBackend(DcatBackend):
  kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
  self.add_item(id, **kwargs)
  graphs.append(graph)
-
- next_record = self.next_record_if_should_continue(start, search_results)
- if not next_record:
- break
-
- start = next_record
  page += 1
 
- tree = ET.fromstring(
- self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
- headers=headers).content)
-
- return graphs
-
-
-
- class CswIso19139DcatBackend(DcatBackend):
- '''
- An harvester that takes CSW ISO 19139 as input and transforms it to DCAT using SEMIC GeoDCAT-AP XSLT.
- The parsing of items is then the same as for the DcatBackend.
- '''
-
- display_name = 'CSW-ISO-19139'
-
- ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'
+ next_record = int(search_results.attrib['nextRecord'])
+ matched_count = int(search_results.attrib['numberOfRecordsMatched'])
+ returned_count = int(search_results.attrib['numberOfRecordsReturned'])
 
- XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"
+ # Break conditions copied gratefully from
+ # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+ break_conditions = (
+ # standard CSW: A value of 0 means all records have been returned.
+ next_record == 0,
 
- def parse_graph(self, url: str, fmt: str) -> List[Graph]:
- '''
- Parse CSW graph querying ISO schema.
- Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
- See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
- '''
-
- # Load XSLT
- xsl = ET.fromstring(self.get(self.XSL_URL).content)
- transform = ET.XSLT(xsl)
-
- # Start querying and parsing graph
- body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
- xmlns:gmd="http://www.isotc211.org/2005/gmd"
- service="CSW" version="2.0.2" resultType="results"
- startPosition="{start}" maxPosition="10"
- outputSchema="{schema}">
- <csw:Query typeNames="csw:Record">
- <csw:ElementSetName>full</csw:ElementSetName>
- <csw:Constraint version="1.1.0">
- <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
- <ogc:PropertyIsEqualTo>
- <ogc:PropertyName>dc:type</ogc:PropertyName>
- <ogc:Literal>dataset</ogc:Literal>
- </ogc:PropertyIsEqualTo>
- </ogc:Filter>
- </csw:Constraint>
- </csw:Query>
- </csw:GetRecords>'''
- headers = {'Content-Type': 'application/xml'}
+ # Misbehaving CSW server returning a next record > matched count
+ next_record > matched_count,
 
- graphs = []
- page = 0
- start = 1
+ # No results returned already
+ returned_count == 0,
 
- response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
- headers=headers)
- response.raise_for_status()
+ # Current next record is lower than previous one
+ next_record < start,
 
- tree_before_transform = ET.fromstring(response.content)
- # Disabling CoupledResourceLookUp to prevent failure on xlink:href
- # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
- tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
+ # Enough items have been harvested already
+ self.max_items and len(self.job.items) >= self.max_items
+ )
 
- while tree:
- # We query the tree before the transformation because the XSLT remove the search results
- # infos (useful for pagination)
- search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
- if search_results is None:
- log.error(f'No search results found for {url} on page {page}')
+ if any(break_conditions):
  break
 
- subgraph = Graph(namespace_manager=namespace_manager)
- subgraph.parse(ET.tostring(tree), format=fmt)
-
- if not subgraph.subjects(RDF.type, DCAT.Dataset):
- raise ValueError("Failed to fetch CSW content")
-
- for node in subgraph.subjects(RDF.type, DCAT.Dataset):
- id = subgraph.value(node, DCT.identifier)
- kwargs = {'nid': str(node), 'page': page}
- kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
- self.add_item(id, **kwargs)
- graphs.append(subgraph)
-
- next_record = self.next_record_if_should_continue(start, search_results)
- if not next_record:
- break
-
  start = next_record
- page += 1
-
- response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
- headers=headers)
- response.raise_for_status()
-
- tree_before_transform = ET.fromstring(response.content)
- tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
+ tree = ET.fromstring(
+ self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
+ headers=headers).text)
 
  return graphs
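
The five stop conditions previously factored into `next_record_if_should_continue` are now inlined in the loop; condensed, the rule is (a sketch, same logic as above):

    def should_stop(start, next_record, matched, returned, harvested, max_items):
        # Same five GeoNetwork-derived break conditions, as one predicate
        return (next_record == 0              # server says done
                or next_record > matched      # misbehaving server
                or returned == 0              # empty page
                or next_record < start        # cursor went backwards
                or bool(max_items and harvested >= max_items))
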