udata 7.0.5.dev28172__py2.py3-none-any.whl → 7.0.6__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of udata might be problematic.
- udata/__init__.py +1 -1
- udata/commands/__init__.py +0 -3
- udata/commands/dcat.py +7 -4
- udata/core/activity/api.py +1 -1
- udata/core/activity/models.py +4 -3
- udata/core/activity/tasks.py +6 -5
- udata/core/dataset/factories.py +2 -4
- udata/core/dataset/models.py +2 -5
- udata/core/dataset/rdf.py +65 -25
- udata/core/dataset/search.py +0 -1
- udata/core/spatial/tests/test_api.py +7 -10
- udata/core/topic/factories.py +2 -2
- udata/harvest/backends/dcat.py +128 -24
- udata/harvest/tests/csw_dcat/XSLT.xml +4298 -0
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-1.xml +1291 -0
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-3.xml +1139 -0
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-5.xml +1266 -0
- udata/harvest/tests/dcat/bnodes.xml +7 -1
- udata/harvest/tests/dcat/evian.json +464 -0
- udata/harvest/tests/test_dcat_backend.py +82 -9
- udata/migrations/2024-03-22-migrate-activity-kwargs-to-extras.py +16 -0
- udata/rdf.py +22 -1
- udata/search/__init__.py +2 -2
- udata/static/chunks/{11.c0ccea08914b6b41568e.js → 11.a23c110811a9ac943478.js} +3 -3
- udata/static/chunks/{11.c0ccea08914b6b41568e.js.map → 11.a23c110811a9ac943478.js.map} +1 -1
- udata/static/chunks/{13.526a25163ababaa44409.js → 13.0889e093f8664e38568c.js} +2 -2
- udata/static/chunks/{13.526a25163ababaa44409.js.map → 13.0889e093f8664e38568c.js.map} +1 -1
- udata/static/chunks/{16.7901839b4227881947f6.js → 16.f41599478d3e97ad9a30.js} +2 -2
- udata/static/chunks/{16.7901839b4227881947f6.js.map → 16.f41599478d3e97ad9a30.js.map} +1 -1
- udata/static/chunks/{19.471d5a2a08eef6e5338a.js → 19.2b534a26af8b17e9170b.js} +3 -3
- udata/static/chunks/{19.471d5a2a08eef6e5338a.js.map → 19.2b534a26af8b17e9170b.js.map} +1 -1
- udata/static/chunks/{5.534e0531d0e2b150146f.js → 5.7115454a1183e5c12eef.js} +3 -3
- udata/static/chunks/{5.534e0531d0e2b150146f.js.map → 5.7115454a1183e5c12eef.js.map} +1 -1
- udata/static/chunks/{6.e56975229e6065f68d2a.js → 6.16bb24fb8240f2746488.js} +3 -3
- udata/static/chunks/{6.e56975229e6065f68d2a.js.map → 6.16bb24fb8240f2746488.js.map} +1 -1
- udata/static/chunks/{9.534426728626f11f4571.js → 9.3e752966ff14e47e11f2.js} +2 -2
- udata/static/chunks/{9.534426728626f11f4571.js.map → 9.3e752966ff14e47e11f2.js.map} +1 -1
- udata/static/common.js +1 -1
- udata/static/common.js.map +1 -1
- udata/tests/api/test_datasets_api.py +45 -45
- udata/tests/api/test_me_api.py +13 -14
- udata/tests/dataset/test_dataset_actions.py +2 -2
- udata/tests/dataset/test_dataset_commands.py +3 -3
- udata/tests/dataset/test_dataset_model.py +2 -1
- udata/tests/organization/test_organization_model.py +3 -3
- udata/tests/organization/test_organization_rdf.py +3 -3
- udata/tests/reuse/test_reuse_model.py +2 -2
- udata/tests/search/test_adapter.py +12 -12
- udata/tests/search/test_results.py +4 -4
- udata/tests/site/test_site_api.py +3 -3
- udata/tests/site/test_site_metrics.py +3 -3
- udata/tests/site/test_site_rdf.py +6 -6
- udata/tests/test_activity.py +12 -0
- udata/tests/test_transfer.py +18 -17
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/METADATA +13 -3
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/RECORD +60 -54
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/entry_points.txt +1 -0
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/LICENSE +0 -0
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/WHEEL +0 -0
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/top_level.txt +0 -0
udata/__init__.py
CHANGED
udata/commands/__init__.py
CHANGED
@@ -145,9 +145,6 @@ def init_logging(app):
     handler.setFormatter(CliFormatter())
     handler.setLevel(log_level)

-    logger = logging.getLogger()
-    logger.addHandler(handler)
-
     logger = logging.getLogger('__main__')
     logger.setLevel(log_level)
     logger.handlers = []
udata/commands/dcat.py
CHANGED
@@ -8,7 +8,7 @@ from rdflib import Graph
 from udata.commands import cli, green, yellow, cyan, echo, magenta
 from udata.core.dataset.factories import DatasetFactory
 from udata.core.dataset.rdf import dataset_from_rdf
-from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
+from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
 from udata.rdf import namespace_manager

 log = logging.getLogger(__name__)
@@ -23,9 +23,10 @@ def grp():
 @grp.command()
 @click.argument('url')
 @click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
-@click.option('-r', '--rid', help='Inspect specific remote id (contains)')
-@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
-def parse_url(url, csw, quiet=False, rid=''):
+@click.option('-r', '--rid', help='Inspect specific remote id (contains)')
+@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
+@click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
+def parse_url(url, csw, iso, quiet=False, rid=''):
     '''Parse the datasets in a DCAT format located at URL (debug)'''
     if quiet:
         verbose_loggers = ['rdflib', 'udata.core.dataset']
@@ -49,6 +50,8 @@ def parse_url(url, csw, quiet=False, rid=''):
     source.url = url
     if csw:
         backend = CswDcatBackend(source, dryrun=True)
+    elif iso:
+        backend = CswIso19139DcatBackend(source, dryrun=True)
     else:
         backend = DcatBackend(source, dryrun=True)
     backend.job = MockJob()
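For local debugging, the new flag can be exercised through click's standard test runner. A minimal sketch; udata commands may additionally require a configured application context, and the endpoint URL below is a placeholder:

    from click.testing import CliRunner

    from udata.commands.dcat import parse_url

    runner = CliRunner()
    # --iso selects the new CswIso19139DcatBackend, just as --csw selects
    # a CSW endpoint with DCAT output
    result = runner.invoke(parse_url, ['--iso', 'https://example.org/csw'])
    print(result.output)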
udata/core/activity/api.py
CHANGED
@@ -37,7 +37,7 @@ activity_fields = api.model('Activity', {
         description='The key of the activity', required=True),
     'icon': fields.String(
         description='The icon of the activity', required=True),
-    'kwargs': …,
+    'extras': fields.Raw(description='Extras attributes as key-value pairs'),
 })

 activity_page_fields = api.model('ActivityPage', fields.pager(activity_fields))
udata/core/activity/models.py
CHANGED
@@ -37,7 +37,7 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
     related_to = db.ReferenceField(db.DomainModel, required=True)
     created_at = db.DateTimeField(default=datetime.utcnow, required=True)

-    kwargs = …
+    extras = db.ExtrasField()

     on_new = Signal()

@@ -65,8 +65,9 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
         return cls.on_new.connect(func, sender=cls)

     @classmethod
-    def emit(cls, related_to, organization=None):
+    def emit(cls, related_to, organization=None, extras=None):
         new_activity.send(cls,
                           related_to=related_to,
                           actor=current_user._get_current_object(),
-                          organization=organization)
+                          organization=organization,
+                          extras=extras)
udata/core/activity/tasks.py
CHANGED
@@ -9,22 +9,23 @@ log = logging.getLogger(__name__)


 @new_activity.connect
-def delay_activity(cls, related_to, actor, organization=None):
+def delay_activity(cls, related_to, actor, organization=None, extras=None):
     emit_activity.delay(
         cls.__name__,
         str(actor.id),
         related_to_cls=related_to.__class__.__name__,
         related_to_id=str(related_to.id),
         organization_id=str(organization.id) if organization else None,
+        extras=extras
     )


 @task
 def emit_activity(classname, actor_id, related_to_cls, related_to_id,
-                  organization_id=None):
-    log.debug('Emit new activity: %s %s %s %s %s',
+                  organization_id=None, extras=None):
+    log.debug('Emit new activity: %s %s %s %s %s %s',
               classname, actor_id, related_to_cls,
-              related_to_id, organization_id)
+              related_to_id, organization_id, extras)
     cls = db.resolve_model(classname)
     actor = User.objects.get(pk=actor_id)
     related_to = db.resolve_model(related_to_cls).objects.get(pk=related_to_id)
@@ -33,4 +34,4 @@ def emit_activity(classname, actor_id, related_to_cls, related_to_id,
     else:
         organization = None
     cls.objects.create(actor=actor, related_to=related_to,
-                       organization=organization)
+                       organization=organization, extras=extras)
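Taken together, these changes let arbitrary key-value context travel with an activity: Activity.emit() passes extras through the new_activity signal, delay_activity forwards it to the Celery task as plain kwargs, and emit_activity persists it on the document. A minimal sketch, assuming saved `dataset` and `org` documents; the concrete subclass name and key below are hypothetical:

    from udata.core.activity.models import Activity

    class UserUpdatedDataset(Activity):
        # hypothetical concrete activity, for illustration only
        key = 'dataset:updated'
        icon = 'fa fa-pencil'

    # extras rides the signal, crosses the Celery boundary,
    # and ends up stored in the document's ExtrasField:
    UserUpdatedDataset.emit(related_to=dataset, organization=org,
                            extras={'changed_fields': ['title']})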
udata/core/dataset/factories.py
CHANGED
@@ -34,10 +34,8 @@ class DatasetFactory(ModelFactory):
     nb_resources = 0


-class VisibleDatasetFactory(DatasetFactory):
-    @factory.lazy_attribute
-    def resources(self):
-        return [ResourceFactory()]
+class HiddenDatasetFactory(DatasetFactory):
+    private = True


 class ChecksumFactory(ModelFactory):
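With visibility no longer tied to having resources, VisibleDatasetFactory loses its purpose and HiddenDatasetFactory covers the opposite case; a quick sketch of the intended usage:

    from udata.core.dataset.factories import DatasetFactory, HiddenDatasetFactory

    visible_dataset = DatasetFactory()       # public by default, even with zero resources
    hidden_dataset = HiddenDatasetFactory()  # private=True, so excluded from visible()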
udata/core/dataset/models.py
CHANGED
@@ -342,12 +342,10 @@ class License(db.Document):

 class DatasetQuerySet(db.OwnedQuerySet):
     def visible(self):
-        return self(private__ne=True, resources__0__exists=True,
-                    deleted=None, archived=None)
+        return self(private__ne=True, deleted=None, archived=None)

     def hidden(self):
         return self(db.Q(private=True) |
-                    db.Q(resources__0__exists=False) |
                     db.Q(deleted__ne=None) |
                     db.Q(archived__ne=None))

@@ -677,8 +675,7 @@ class Dataset(WithMetrics, BadgeMixin, db.Owned, db.Document):

     @property
     def is_hidden(self):
-        return (len(self.resources) == 0 or self.private or self.deleted
-                or self.archived)
+        return self.private or self.deleted or self.archived

     @property
     def full_title(self):
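The net effect: a dataset without any resources is no longer hidden. A minimal sketch of the new semantics (assumes a configured app and test database):

    from udata.core.dataset.factories import DatasetFactory
    from udata.core.dataset.models import Dataset

    dataset = DatasetFactory()                   # factory default: no resources
    assert not dataset.is_hidden                 # was hidden before this release
    assert dataset in Dataset.objects.visible()  # and now shows up in listings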
udata/core/dataset/rdf.py
CHANGED
@@ -7,6 +7,7 @@ import logging

 from datetime import date
 from html.parser import HTMLParser
+from typing import Optional
 from dateutil.parser import parse as parse_dt
 from flask import current_app
 from geomet import wkt
@@ -339,36 +340,51 @@ def contact_point_from_rdf(rdf, dataset):


 def spatial_from_rdf(graph):
+    geojsons = []
     for term in graph.objects(DCT.spatial):
-        …
+        try:
+            # This may not be official in the norm but some ArcGis return
+            # bbox as literal directly in DCT.spatial.
+            if isinstance(term, Literal):
+                geojson = bbox_to_geojson_multipolygon(term.toPython())
+                if geojson is not None:
+                    geojsons.append(geojson)
+
+                continue
+
+            for object in term.objects():
+                if isinstance(object, Literal):
+                    if object.datatype.__str__() == 'https://www.iana.org/assignments/media-types/application/vnd.geo+json':
+                        try:
+                            geojson = json.loads(object.toPython())
+                        except ValueError as e:
+                            log.warning(f"Invalid JSON in spatial GeoJSON {object.toPython()} {e}")
+                            continue
+                    elif object.datatype.__str__() == 'http://www.opengis.net/rdf#wktLiteral':
+                        try:
+                            # .upper() is here because geomet doesn't support Polygon but only POLYGON
+                            geojson = wkt.loads(object.toPython().strip().upper())
+                        except ValueError as e:
+                            log.warning(f"Invalid JSON in spatial WKT {object.toPython()} {e}")
+                            continue
+                    else:
+                        continue
+
+            if geojson['type'] == 'Polygon':
+                geojson['type'] = 'MultiPolygon'
+                geojson['coordinates'] = [geojson['coordinates']]
+
+            geojsons.append(geojson)
+        except Exception as e:
+            log.exception(f"Exception during `spatial_from_rdf` for term {term}: {e}", stack_info=True)

+    for geojson in geojsons:
+        spatial_coverage = SpatialCoverage(geom=geojson)
+        try:
+            spatial_coverage.clean()
+            return spatial_coverage
+        except ValidationError:
+            continue

     return None
@@ -609,3 +625,27 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None):
     dataset.harvest.modified_at = modified_at

     return dataset
+
+def bbox_to_geojson_multipolygon(bbox_as_str: str) -> Optional[dict]:
+    bbox = bbox_as_str.strip().split(',')
+    if len(bbox) != 4:
+        return None
+
+    west = float(bbox[0])
+    south = float(bbox[1])
+    east = float(bbox[2])
+    north = float(bbox[3])
+
+    low_left = [west, south]
+    top_left = [west, north]
+    top_right = [east, north]
+    low_right = [east, south]
+
+    return {
+        'type': 'MultiPolygon',
+        'coordinates': [
+            [
+                [low_left, low_right, top_right, top_left, low_left],
+            ],
+        ],
+    }
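The helper reads the literal as 'west,south,east,north' and builds a single counter-clockwise ring; a quick worked example derived from the code above:

    from udata.core.dataset.rdf import bbox_to_geojson_multipolygon

    geom = bbox_to_geojson_multipolygon('2.22,48.81,2.47,48.90')
    assert geom == {
        'type': 'MultiPolygon',
        'coordinates': [[[
            [2.22, 48.81], [2.47, 48.81], [2.47, 48.90], [2.22, 48.90], [2.22, 48.81],
        ]]],
    }
    # anything that does not split into exactly four parts yields None
    assert bbox_to_geojson_multipolygon('1,2,3') is None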
udata/core/dataset/search.py
CHANGED
udata/core/spatial/tests/test_api.py
CHANGED
@@ -1,15 +1,12 @@
 from flask import url_for

-from udata.utils import get_by
-
 from udata.utils import faker
 from udata.tests.api import APITestCase
 from udata.tests.features.territories import (
     create_geozones_fixtures, TerritoriesSettings
 )
-from udata.tests.helpers import assert_json_equal
 from udata.core.organization.factories import OrganizationFactory
-from udata.core.dataset.factories import VisibleDatasetFactory
+from udata.core.dataset.factories import DatasetFactory
 from udata.core.spatial.factories import (
     SpatialCoverageFactory, GeoZoneFactory, GeoLevelFactory
 )
@@ -173,7 +170,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -185,7 +182,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -198,7 +195,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -212,7 +209,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -241,7 +238,7 @@ class SpatialTerritoriesApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -255,7 +252,7 @@ class SpatialTerritoriesApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
udata/core/topic/factories.py
CHANGED
@@ -1,7 +1,7 @@
 import factory

 from udata import utils
-from udata.core.dataset.factories import VisibleDatasetFactory
+from udata.core.dataset.factories import DatasetFactory
 from udata.core.reuse.factories import VisibleReuseFactory
 from udata.factories import ModelFactory

@@ -19,7 +19,7 @@ class TopicFactory(ModelFactory):

     @factory.lazy_attribute
     def datasets(self):
-        return VisibleDatasetFactory.create_batch(3)
+        return DatasetFactory.create_batch(3)

     @factory.lazy_attribute
     def reuses(self):
udata/harvest/backends/dcat.py
CHANGED
@@ -2,7 +2,7 @@ import logging

 from rdflib import Graph, URIRef
 from rdflib.namespace import RDF
-import xml.etree.ElementTree as ET
+import lxml.etree as ET
 import boto3
 from flask import current_app
 from datetime import date
@@ -173,7 +173,36 @@ class DcatBackend(BaseBackend):
         dataset = self.get_dataset(item.remote_id)
         dataset = dataset_from_rdf(graph, dataset, node=node)
         return dataset
+

+    def next_record_if_should_continue(self, start, search_results):
+        next_record = int(search_results.attrib['nextRecord'])
+        matched_count = int(search_results.attrib['numberOfRecordsMatched'])
+        returned_count = int(search_results.attrib['numberOfRecordsReturned'])
+
+        # Break conditions copied gratefully from
+        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+        break_conditions = (
+            # standard CSW: A value of 0 means all records have been returned.
+            next_record == 0,
+
+            # Misbehaving CSW server returning a next record > matched count
+            next_record > matched_count,
+
+            # No results returned already
+            returned_count == 0,
+
+            # Current next record is lower than previous one
+            next_record < start,
+
+            # Enough items have been harvested already
+            self.max_items and len(self.job.items) >= self.max_items
+        )
+
+        if any(break_conditions):
+            return None
+        else:
+            return next_record

 class CswDcatBackend(DcatBackend):
     display_name = 'CSW-DCAT'
@@ -201,17 +230,18 @@ class CswDcatBackend(DcatBackend):
         graphs = []
         page = 0
         start = 1
+
         response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
                              headers=headers)
         response.raise_for_status()
-        content = response.text
+        content = response.content
         tree = ET.fromstring(content)
         if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
             raise ValueError(f'Failed to query CSW:\n{content}')
         while tree:
             graph = Graph(namespace_manager=namespace_manager)
             search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
-            if not search_results:
+            if search_results is None:
                 log.error(f'No search results found for {url} on page {page}')
                 break
             for child in search_results:
@@ -225,37 +255,111 @@ class CswDcatBackend(DcatBackend):
             kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
             self.add_item(id, **kwargs)
             graphs.append(graph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
+            start = next_record
             page += 1

-            next_record = int(search_results.attrib['nextRecord'])
-            matched_count = int(search_results.attrib['numberOfRecordsMatched'])
-            returned_count = int(search_results.attrib['numberOfRecordsReturned'])
-
-            # Break conditions copied gratefully from
-            # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
-            break_conditions = (
-                # standard CSW: A value of 0 means all records have been returned.
-                next_record == 0,
-
-                # Misbehaving CSW server returning a next record > matched count
-                next_record > matched_count,
-
-                # No results returned already
-                returned_count == 0,
-
-                # Current next record is lower than previous one
-                next_record < start,
-
-                # Enough items have been harvested already
-                self.max_items and len(self.job.items) >= self.max_items
-            )
-
-            if any(break_conditions):
-                break
-
-            start = next_record
-            tree = ET.fromstring(
-                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
-                          headers=headers).text)
+            tree = ET.fromstring(
+                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
+                          headers=headers).content)
+
+        return graphs
+
+
+class CswIso19139DcatBackend(DcatBackend):
+    '''
+    A harvester that takes CSW ISO 19139 as input and transforms it to DCAT
+    using the SEMIC GeoDCAT-AP XSLT.
+    The parsing of items is then the same as for the DcatBackend.
+    '''
+
+    display_name = 'CSW-ISO-19139'
+
+    ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'
+
+    XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"
+
+    def parse_graph(self, url: str, fmt: str) -> List[Graph]:
+        '''
+        Parse CSW graph querying ISO schema.
+        Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
+        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
+        '''
+        # Load XSLT
+        xsl = ET.fromstring(self.get(self.XSL_URL).content)
+        transform = ET.XSLT(xsl)
+
+        # Start querying and parsing graph
+        body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+                                  xmlns:gmd="http://www.isotc211.org/2005/gmd"
+                                  service="CSW" version="2.0.2" resultType="results"
+                                  startPosition="{start}" maxPosition="10"
+                                  outputSchema="{schema}">
+                      <csw:Query typeNames="csw:Record">
+                          <csw:ElementSetName>full</csw:ElementSetName>
+                          <csw:Constraint version="1.1.0">
+                              <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
+                                  <ogc:PropertyIsEqualTo>
+                                      <ogc:PropertyName>dc:type</ogc:PropertyName>
+                                      <ogc:Literal>dataset</ogc:Literal>
+                                  </ogc:PropertyIsEqualTo>
+                              </ogc:Filter>
+                          </csw:Constraint>
+                      </csw:Query>
+                  </csw:GetRecords>'''
+        headers = {'Content-Type': 'application/xml'}
+
+        graphs = []
+        page = 0
+        start = 1
+
+        response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                             headers=headers)
+        response.raise_for_status()
+
+        tree_before_transform = ET.fromstring(response.content)
+        # Disabling CoupledResourceLookUp to prevent failure on xlink:href
+        # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
+        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
+
+        while tree:
+            # We query the tree before the transformation because the XSLT removes
+            # the search results infos (useful for pagination)
+            search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
+            if search_results is None:
+                log.error(f'No search results found for {url} on page {page}')
                 break

+            subgraph = Graph(namespace_manager=namespace_manager)
+            subgraph.parse(ET.tostring(tree), format=fmt)
+
+            if not subgraph.subjects(RDF.type, DCAT.Dataset):
+                raise ValueError("Failed to fetch CSW content")
+
+            for node in subgraph.subjects(RDF.type, DCAT.Dataset):
+                id = subgraph.value(node, DCT.identifier)
+                kwargs = {'nid': str(node), 'page': page}
+                kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
+                self.add_item(id, **kwargs)
+            graphs.append(subgraph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
+            start = next_record
+            page += 1
+
+            response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                                 headers=headers)
+            response.raise_for_status()
+
+            tree_before_transform = ET.fromstring(response.content)
+            tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

         return graphs
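For reference, the pagination helper consumes its attributes straight off csw:SearchResults; a standalone sketch with a stubbed response, no live harvest job involved:

    import lxml.etree as ET

    stub = ET.fromstring(
        '<csw:SearchResults xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" '
        'nextRecord="11" numberOfRecordsMatched="25" numberOfRecordsReturned="10"/>'
    )
    next_record = int(stub.attrib['nextRecord'])            # 11: another page follows
    matched = int(stub.attrib['numberOfRecordsMatched'])    # 25
    returned = int(stub.attrib['numberOfRecordsReturned'])  # 10
    # next_record_if_should_continue returns None (stop) when nextRecord is 0,
    # exceeds the matched count, nothing was returned, nextRecord went backwards,
    # or the max_items budget is spent; otherwise it returns nextRecord as the
    # next start position.
    assert 0 < next_record <= matched and returned > 0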