udata 7.0.6__py2.py3-none-any.whl → 7.0.6.dev28209__py2.py3-none-any.whl
This diff compares two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Potentially problematic release.
This version of udata might be problematic.
- udata/__init__.py +1 -1
- udata/commands/__init__.py +3 -0
- udata/commands/dcat.py +4 -7
- udata/core/activity/api.py +1 -1
- udata/core/activity/models.py +3 -4
- udata/core/activity/tasks.py +5 -6
- udata/core/dataset/factories.py +4 -2
- udata/core/dataset/models.py +5 -2
- udata/core/dataset/rdf.py +25 -65
- udata/core/dataset/search.py +1 -0
- udata/core/spatial/tests/test_api.py +10 -7
- udata/core/topic/factories.py +2 -2
- udata/harvest/backends/dcat.py +24 -128
- udata/harvest/tests/test_dcat_backend.py +5 -78
- udata/rdf.py +0 -1
- udata/search/__init__.py +2 -2
- udata/tests/api/test_datasets_api.py +45 -45
- udata/tests/api/test_me_api.py +14 -13
- udata/tests/dataset/test_dataset_actions.py +2 -2
- udata/tests/dataset/test_dataset_commands.py +3 -3
- udata/tests/dataset/test_dataset_model.py +1 -2
- udata/tests/organization/test_organization_model.py +3 -3
- udata/tests/organization/test_organization_rdf.py +3 -3
- udata/tests/reuse/test_reuse_model.py +2 -2
- udata/tests/search/test_adapter.py +12 -12
- udata/tests/search/test_results.py +4 -4
- udata/tests/site/test_site_api.py +3 -3
- udata/tests/site/test_site_metrics.py +3 -3
- udata/tests/site/test_site_rdf.py +6 -6
- udata/tests/test_activity.py +0 -12
- udata/tests/test_transfer.py +17 -18
- {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/METADATA +4 -9
- {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/RECORD +37 -43
- {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/entry_points.txt +0 -1
- udata/harvest/tests/csw_dcat/XSLT.xml +0 -4298
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-1.xml +0 -1291
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-3.xml +0 -1139
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-5.xml +0 -1266
- udata/harvest/tests/dcat/evian.json +0 -464
- udata/migrations/2024-03-22-migrate-activity-kwargs-to-extras.py +0 -16
- {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/LICENSE +0 -0
- {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/WHEEL +0 -0
- {udata-7.0.6.dist-info → udata-7.0.6.dev28209.dist-info}/top_level.txt +0 -0
udata/__init__.py
CHANGED
udata/commands/__init__.py
CHANGED
@@ -145,6 +145,9 @@ def init_logging(app):
     handler.setFormatter(CliFormatter())
     handler.setLevel(log_level)

+    logger = logging.getLogger()
+    logger.addHandler(handler)
+
     logger = logging.getLogger('__main__')
     logger.setLevel(log_level)
     logger.handlers = []
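The fix above attaches the CLI handler to the root logger, so log records from any udata module propagate to it instead of only those emitted under the '__main__' logger. A standalone sketch of the behaviour (plain stdlib logging, not udata code):

    import logging

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(name)s: %(message)s'))

    # With the handler on the root logger, records from any named logger
    # propagate up to it and get formatted.
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(handler)

    logging.getLogger('udata.harvest').info('now reaches the root handler')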
udata/commands/dcat.py
CHANGED
@@ -8,7 +8,7 @@ from rdflib import Graph
 from udata.commands import cli, green, yellow, cyan, echo, magenta
 from udata.core.dataset.factories import DatasetFactory
 from udata.core.dataset.rdf import dataset_from_rdf
-from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
+from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
 from udata.rdf import namespace_manager

 log = logging.getLogger(__name__)

@@ -23,10 +23,9 @@ def grp():
 @grp.command()
 @click.argument('url')
 @click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
-@click.option('-…
-@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint…
-…
-def parse_url(url, csw, iso, quiet=False, rid=''):
+@click.option('-i', '--rid', help='Inspect specific remote id (contains)')
+@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
+def parse_url(url, csw, quiet=False, rid=''):
     '''Parse the datasets in a DCAT format located at URL (debug)'''
     if quiet:
         verbose_loggers = ['rdflib', 'udata.core.dataset']

@@ -50,8 +49,6 @@ def parse_url(url, csw, iso, quiet=False, rid=''):
     source.url = url
     if csw:
         backend = CswDcatBackend(source, dryrun=True)
-    elif iso:
-        backend = CswIso19139DcatBackend(source, dryrun=True)
     else:
         backend = DcatBackend(source, dryrun=True)
     backend.job = MockJob()
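For context, here is how the slimmed-down command could be exercised with click's test runner. This is a sketch: it assumes the command is importable as udata.commands.dcat.parse_url, and a real run would happen inside udata's Flask CLI context.

    from click.testing import CliRunner

    from udata.commands.dcat import parse_url

    runner = CliRunner()
    # The --iso flag is gone in this version; a CSW endpoint is just --csw.
    result = runner.invoke(parse_url, ['https://example.org/csw', '--csw'])
    print(result.output)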
udata/core/activity/api.py
CHANGED
@@ -37,7 +37,7 @@ activity_fields = api.model('Activity', {
         description='The key of the activity', required=True),
     'icon': fields.String(
         description='The icon of the activity', required=True),
-    'extras': fields.Raw(description='Some action specific context'),
+    'kwargs': fields.Raw(description='Some action specific context'),
 })

 activity_page_fields = api.model('ActivityPage', fields.pager(activity_fields))
udata/core/activity/models.py
CHANGED
@@ -37,7 +37,7 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
     related_to = db.ReferenceField(db.DomainModel, required=True)
     created_at = db.DateTimeField(default=datetime.utcnow, required=True)

-    extras = db.DictField()
+    kwargs = db.DictField()

     on_new = Signal()

@@ -65,9 +65,8 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
         return cls.on_new.connect(func, sender=cls)

     @classmethod
-    def emit(cls, related_to, organization=None, extras=None):
+    def emit(cls, related_to, organization=None, **kwargs):
         new_activity.send(cls,
                           related_to=related_to,
                           actor=current_user._get_current_object(),
-                          organization=organization,
-                          extras=extras)
+                          organization=organization)
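The emit/on_new plumbing is built on blinker signals, with extra context back to plain keyword arguments. A self-contained sketch of the same pattern (blinker is a udata dependency; the names here are illustrative, not udata's):

    from blinker import Signal

    new_activity = Signal()

    def receiver(sender, related_to, organization=None, **kwargs):
        print(f'queue activity: {sender} about {related_to} (org={organization})')

    new_activity.connect(receiver)
    new_activity.send('UserCreatedDataset', related_to='dataset-42', organization=None)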
udata/core/activity/tasks.py
CHANGED
@@ -9,23 +9,22 @@ log = logging.getLogger(__name__)


 @new_activity.connect
-def delay_activity(cls, related_to, actor, organization=None, extras=None):
+def delay_activity(cls, related_to, actor, organization=None):
     emit_activity.delay(
         cls.__name__,
         str(actor.id),
         related_to_cls=related_to.__class__.__name__,
         related_to_id=str(related_to.id),
         organization_id=str(organization.id) if organization else None,
-        extras=extras
     )


 @task
 def emit_activity(classname, actor_id, related_to_cls, related_to_id,
-                  organization_id=None, extras=None):
-    log.debug('Emit new activity: %s %s %s %s %s %s',
+                  organization_id=None):
+    log.debug('Emit new activity: %s %s %s %s %s',
               classname, actor_id, related_to_cls,
-              related_to_id, organization_id, extras)
+              related_to_id, organization_id)
     cls = db.resolve_model(classname)
     actor = User.objects.get(pk=actor_id)
     related_to = db.resolve_model(related_to_cls).objects.get(pk=related_to_id)

@@ -34,4 +33,4 @@ def emit_activity(classname, actor_id, related_to_cls, related_to_id,
     else:
         organization = None
     cls.objects.create(actor=actor, related_to=related_to,
-                       organization=organization, extras=extras)
+                       organization=organization)
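Note that the signal receiver ships only primitives to Celery (class names and string ids); the task re-hydrates them into documents on the worker. A hedged sketch of the enqueued call, with ids elided and a hypothetical activity class name:

    from udata.core.activity.tasks import emit_activity

    emit_activity.delay(
        'UserCreatedDataset',      # activity class name (hypothetical)
        '…',                       # actor User pk (elided)
        related_to_cls='Dataset',
        related_to_id='…',
        organization_id=None,
    )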
udata/core/dataset/factories.py
CHANGED
@@ -34,8 +34,10 @@ class DatasetFactory(ModelFactory):
     nb_resources = 0


-class …
-…
+class VisibleDatasetFactory(DatasetFactory):
+    @factory.lazy_attribute
+    def resources(self):
+        return [ResourceFactory()]


 class ChecksumFactory(ModelFactory):
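A usage sketch (inside the app/test context udata factories normally run in): the restored factory produces datasets that satisfy the visibility rules reinstated in models.py below.

    dataset = VisibleDatasetFactory()
    assert len(dataset.resources) == 1   # one ResourceFactory-built resource
    assert not dataset.is_hidden         # has a resource and is not private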
udata/core/dataset/models.py
CHANGED
@@ -342,10 +342,12 @@ class License(db.Document):

 class DatasetQuerySet(db.OwnedQuerySet):
     def visible(self):
-        return self(private__ne=True, deleted=None, archived=None)
+        return self(private__ne=True, resources__0__exists=True,
+                    deleted=None, archived=None)

     def hidden(self):
         return self(db.Q(private=True) |
+                    db.Q(resources__0__exists=False) |
                     db.Q(deleted__ne=None) |
                     db.Q(archived__ne=None))

@@ -675,7 +677,8 @@ class Dataset(WithMetrics, BadgeMixin, db.Owned, db.Document):

     @property
     def is_hidden(self):
-        return self.private or self.deleted or self.archived
+        return (len(self.resources) == 0 or self.private or self.deleted
+                or self.archived)

     @property
     def full_title(self):
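Both layers now agree that a dataset without resources is hidden: the queryset filters on resources__0__exists and the property checks len(self.resources). A sketch of the semantics (assumes an app context; field defaults as in udata):

    Dataset.objects.visible()   # not private, at least one resource, not deleted/archived
    Dataset.objects.hidden()    # the complement, via the OR-ed Q clauses

    d = Dataset(title='empty', resources=[])
    assert d.is_hidden          # no resources, so hidden even if public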
udata/core/dataset/rdf.py
CHANGED
@@ -7,7 +7,6 @@ import logging

 from datetime import date
 from html.parser import HTMLParser
-from typing import Optional
 from dateutil.parser import parse as parse_dt
 from flask import current_app
 from geomet import wkt

@@ -340,51 +339,36 @@ def contact_point_from_rdf(rdf, dataset):


 def spatial_from_rdf(graph):
-    geojsons = []
     for term in graph.objects(DCT.spatial):
-        …
-                        geojson = json.loads(object.toPython())
-                    except ValueError as e:
-                        log.warning(f"Invalid JSON in spatial GeoJSON {object.toPython()} {e}")
-                        continue
-                elif object.datatype.__str__() == 'http://www.opengis.net/rdf#wktLiteral':
-                    try:
-                        # .upper() si here because geomet doesn't support Polygon but only POLYGON
-                        geojson = wkt.loads(object.toPython().strip().upper())
-                    except ValueError as e:
-                        log.warning(f"Invalid JSON in spatial WKT {object.toPython()} {e}")
-                        continue
-                else:
+        for object in term.objects():
+            if isinstance(object, Literal):
+                if object.datatype.__str__() == 'https://www.iana.org/assignments/media-types/application/vnd.geo+json':
+                    try:
+                        geojson = json.loads(object.toPython())
+                    except ValueError as e:
+                        log.warning(f"Invalid JSON in spatial GeoJSON {object.toPython()} {e}")
+                        continue
+                elif object.datatype.__str__() == 'http://www.opengis.net/rdf#wktLiteral':
+                    try:
+                        # .upper() si here because geomet doesn't support Polygon but only POLYGON
+                        geojson = wkt.loads(object.toPython().strip().upper())
+                    except ValueError as e:
+                        log.warning(f"Invalid JSON in spatial WKT {object.toPython()} {e}")
                         continue
+                else:
+                    continue

-        …
+                if geojson['type'] == 'Polygon':
+                    geojson['type'] = 'MultiPolygon'
+                    geojson['coordinates'] = [geojson['coordinates']]

-        …
-            except Exception as e:
-                log.exception(f"Exception during `spatial_from_rdf` for term {term}: {e}", stack_info=True)
+                spatial_coverage = SpatialCoverage(geom=geojson)

-        …
-            except ValidationError:
-                continue
+                try:
+                    spatial_coverage.clean()
+                    return spatial_coverage
+                except ValidationError:
+                    continue

     return None

@@ -625,27 +609,3 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None):
     dataset.harvest.modified_at = modified_at

     return dataset
-
-def bbox_to_geojson_multipolygon(bbox_as_str: str) -> Optional[dict]:
-    bbox = bbox_as_str.strip().split(',')
-    if len(bbox) != 4:
-        return None
-
-    west = float(bbox[0])
-    south = float(bbox[1])
-    east = float(bbox[2])
-    north = float(bbox[3])
-
-    low_left = [west, south]
-    top_left = [west, north]
-    top_right = [east, north]
-    low_right = [east, south]
-
-    return {
-        'type': 'MultiPolygon',
-        'coordinates': [
-            [
-                [low_left, low_right, top_right, top_left, low_left],
-            ],
-        ],
-    }
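For intuition about the WKT branch above: geomet turns WKT text into a GeoJSON dict, and the code then promotes a bare Polygon to the MultiPolygon udata stores. A standalone sketch:

    from geomet import wkt

    # geomet only understands upper-case keywords ('POLYGON'), hence the .upper() above.
    geojson = wkt.loads('Polygon ((0 0, 4 0, 4 4, 0 4, 0 0))'.upper())

    if geojson['type'] == 'Polygon':
        geojson['type'] = 'MultiPolygon'
        geojson['coordinates'] = [geojson['coordinates']]

    print(geojson['type'], len(geojson['coordinates']))  # MultiPolygon 1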
udata/core/dataset/search.py
CHANGED
udata/core/spatial/tests/test_api.py
CHANGED
@@ -1,12 +1,15 @@
 from flask import url_for

+from udata.utils import get_by
+
 from udata.utils import faker
 from udata.tests.api import APITestCase
 from udata.tests.features.territories import (
     create_geozones_fixtures, TerritoriesSettings
 )
+from udata.tests.helpers import assert_json_equal
 from udata.core.organization.factories import OrganizationFactory
-from udata.core.dataset.factories import …
+from udata.core.dataset.factories import VisibleDatasetFactory
 from udata.core.spatial.factories import (
     SpatialCoverageFactory, GeoZoneFactory, GeoLevelFactory
 )

@@ -170,7 +173,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            …
+            VisibleDatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -182,7 +185,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            …
+            VisibleDatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -195,7 +198,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            …
+            VisibleDatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -209,7 +212,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            …
+            VisibleDatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -238,7 +241,7 @@ class SpatialTerritoriesApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            …
+            VisibleDatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))

@@ -252,7 +255,7 @@ class SpatialTerritoriesApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            …
+            VisibleDatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
udata/core/topic/factories.py
CHANGED
@@ -1,7 +1,7 @@
 import factory

 from udata import utils
-from udata.core.dataset.factories import …
+from udata.core.dataset.factories import VisibleDatasetFactory
 from udata.core.reuse.factories import VisibleReuseFactory
 from udata.factories import ModelFactory

@@ -19,7 +19,7 @@ class TopicFactory(ModelFactory):

     @factory.lazy_attribute
     def datasets(self):
-        return …
+        return VisibleDatasetFactory.create_batch(3)

     @factory.lazy_attribute
     def reuses(self):
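Usage sketch, under the same assumptions as the dataset factories above: every generated topic now references three visible datasets.

    topic = TopicFactory()
    assert len(topic.datasets) == 3
    assert all(d.resources for d in topic.datasets)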
udata/harvest/backends/dcat.py
CHANGED
@@ -2,7 +2,7 @@ import logging

 from rdflib import Graph, URIRef
 from rdflib.namespace import RDF
-import …
+import xml.etree.ElementTree as ET
 import boto3
 from flask import current_app
 from datetime import date

@@ -173,36 +173,7 @@ class DcatBackend(BaseBackend):
         dataset = self.get_dataset(item.remote_id)
         dataset = dataset_from_rdf(graph, dataset, node=node)
         return dataset
-

-    def next_record_if_should_continue(self, start, search_results):
-        next_record = int(search_results.attrib['nextRecord'])
-        matched_count = int(search_results.attrib['numberOfRecordsMatched'])
-        returned_count = int(search_results.attrib['numberOfRecordsReturned'])
-
-        # Break conditions copied gratefully from
-        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
-        break_conditions = (
-            # standard CSW: A value of 0 means all records have been returned.
-            next_record == 0,
-
-            # Misbehaving CSW server returning a next record > matched count
-            next_record > matched_count,
-
-            # No results returned already
-            returned_count == 0,
-
-            # Current next record is lower than previous one
-            next_record < start,
-
-            # Enough items have been harvested already
-            self.max_items and len(self.job.items) >= self.max_items
-        )
-
-        if any(break_conditions):
-            return None
-        else:
-            return next_record

 class CswDcatBackend(DcatBackend):
     display_name = 'CSW-DCAT'

@@ -230,18 +201,17 @@ class CswDcatBackend(DcatBackend):
         graphs = []
         page = 0
         start = 1
-
         response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
                              headers=headers)
         response.raise_for_status()
-        content = response.content
+        content = response.text
         tree = ET.fromstring(content)
         if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
             raise ValueError(f'Failed to query CSW:\n{content}')
         while tree:
             graph = Graph(namespace_manager=namespace_manager)
             search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
-            if search_results is None:
+            if not search_results:
                 log.error(f'No search results found for {url} on page {page}')
                 break
             for child in search_results:

@@ -255,111 +225,37 @@ class CswDcatBackend(DcatBackend):
             kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
             self.add_item(id, **kwargs)
             graphs.append(graph)
-
-            next_record = self.next_record_if_should_continue(start, search_results)
-            if not next_record:
-                break
-
-            start = next_record
             page += 1

-…
-        return graphs
-…
-class CswIso19139DcatBackend(DcatBackend):
-    '''
-    An harvester that takes CSW ISO 19139 as input and transforms it to DCAT using SEMIC GeoDCAT-AP XSLT.
-    The parsing of items is then the same as for the DcatBackend.
-    '''
-
-    display_name = 'CSW-ISO-19139'
-
-    ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'
+            next_record = int(search_results.attrib['nextRecord'])
+            matched_count = int(search_results.attrib['numberOfRecordsMatched'])
+            returned_count = int(search_results.attrib['numberOfRecordsReturned'])

-…
+            # Break conditions copied gratefully from
+            # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+            break_conditions = (
+                # standard CSW: A value of 0 means all records have been returned.
+                next_record == 0,

-…
-        Parse CSW graph querying ISO schema.
-        Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
-        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
-        '''
-
-        # Load XSLT
-        xsl = ET.fromstring(self.get(self.XSL_URL).content)
-        transform = ET.XSLT(xsl)
-
-        # Start querying and parsing graph
-        body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
-                                  xmlns:gmd="http://www.isotc211.org/2005/gmd"
-                                  service="CSW" version="2.0.2" resultType="results"
-                                  startPosition="{start}" maxPosition="10"
-                                  outputSchema="{schema}">
-            <csw:Query typeNames="csw:Record">
-                <csw:ElementSetName>full</csw:ElementSetName>
-                <csw:Constraint version="1.1.0">
-                    <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
-                        <ogc:PropertyIsEqualTo>
-                            <ogc:PropertyName>dc:type</ogc:PropertyName>
-                            <ogc:Literal>dataset</ogc:Literal>
-                        </ogc:PropertyIsEqualTo>
-                    </ogc:Filter>
-                </csw:Constraint>
-            </csw:Query>
-        </csw:GetRecords>'''
-        headers = {'Content-Type': 'application/xml'}
+                # Misbehaving CSW server returning a next record > matched count
+                next_record > matched_count,

-…
-        start = 1
+                # No results returned already
+                returned_count == 0,

-…
-        response.raise_for_status()
+                # Current next record is lower than previous one
+                next_record < start,

-…
-        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
+                # Enough items have been harvested already
+                self.max_items and len(self.job.items) >= self.max_items
+            )

-…
-            # We query the tree before the transformation because the XSLT remove the search results
-            # infos (useful for pagination)
-            search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
-            if search_results is None:
-                log.error(f'No search results found for {url} on page {page}')
+            if any(break_conditions):
                 break

-            subgraph = Graph(namespace_manager=namespace_manager)
-            subgraph.parse(ET.tostring(tree), format=fmt)
-
-            if not subgraph.subjects(RDF.type, DCAT.Dataset):
-                raise ValueError("Failed to fetch CSW content")
-
-            for node in subgraph.subjects(RDF.type, DCAT.Dataset):
-                id = subgraph.value(node, DCT.identifier)
-                kwargs = {'nid': str(node), 'page': page}
-                kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
-                self.add_item(id, **kwargs)
-            graphs.append(subgraph)
-
-            next_record = self.next_record_if_should_continue(start, search_results)
-            if not next_record:
-                break
-
             start = next_record
-…
-                headers=headers)
-            response.raise_for_status()
-
-            tree_before_transform = ET.fromstring(response.content)
-            tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
+            tree = ET.fromstring(
                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
                          headers=headers).text)

         return graphs
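To make the inlined paging logic concrete, here is a standalone sketch of how a CSW GetRecords response advertises pagination; the attribute values are hypothetical, and the stop test mirrors the break conditions above.

    import xml.etree.ElementTree as ET

    sample = ('<csw:SearchResults xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" '
              'numberOfRecordsMatched="25" numberOfRecordsReturned="10" nextRecord="11"/>')

    results = ET.fromstring(sample)
    next_record = int(results.attrib['nextRecord'])
    matched = int(results.attrib['numberOfRecordsMatched'])
    returned = int(results.attrib['numberOfRecordsReturned'])

    start = 1
    stop = (next_record == 0 or next_record > matched
            or returned == 0 or next_record < start)
    print(stop, next_record)  # False 11 -> request the next page with start=11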