udata 8.0.2.dev29304__py2.py3-none-any.whl → 9.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of udata might be problematic. Click here for more details.
- udata/__init__.py +1 -1
- udata/api/__init__.py +2 -0
- udata/api/commands.py +0 -2
- udata/api_fields.py +41 -3
- udata/commands/db.py +88 -48
- udata/core/dataservices/factories.py +33 -0
- udata/core/dataservices/models.py +42 -4
- udata/core/dataservices/rdf.py +106 -0
- udata/core/dataset/csv.py +8 -1
- udata/core/dataset/models.py +1 -2
- udata/core/dataset/rdf.py +37 -128
- udata/core/discussions/models.py +20 -0
- udata/core/organization/csv.py +5 -3
- udata/core/reports/__init__.py +0 -0
- udata/core/reports/api.py +44 -0
- udata/core/reports/constants.py +30 -0
- udata/core/reports/models.py +58 -0
- udata/core/reuse/csv.py +3 -0
- udata/core/site/api.py +33 -2
- udata/core/site/rdf.py +6 -1
- udata/core/spam/models.py +6 -0
- udata/core/topic/models.py +3 -2
- udata/core/topic/parsers.py +3 -2
- udata/core/user/apiv2.py +28 -0
- udata/db/__init__.py +0 -0
- udata/db/tasks.py +6 -0
- udata/features/notifications/__init__.py +0 -1
- udata/forms/fields.py +2 -2
- udata/harvest/api.py +19 -1
- udata/harvest/backends/base.py +118 -10
- udata/harvest/backends/dcat.py +28 -7
- udata/harvest/models.py +6 -0
- udata/harvest/tests/dcat/bnodes.xml +13 -2
- udata/harvest/tests/test_dcat_backend.py +21 -0
- udata/migrations/2024-06-11-fix-reuse-datasets-references.py +35 -0
- udata/models/__init__.py +1 -0
- udata/rdf.py +113 -2
- udata/routing.py +1 -1
- udata/settings.py +3 -1
- udata/static/admin.js +17 -17
- udata/static/admin.js.map +1 -1
- udata/static/chunks/{18.ad41fb75ac4226e1f3ce.js → 18.1922fd0b2b7fad122991.js} +3 -3
- udata/static/chunks/18.1922fd0b2b7fad122991.js.map +1 -0
- udata/static/chunks/{7.11ac4de064ae59691d49.js → 7.e2106342e94ee09393b1.js} +2 -2
- udata/static/chunks/7.e2106342e94ee09393b1.js.map +1 -0
- udata/static/common.js +1 -1
- udata/static/common.js.map +1 -1
- udata/storage/s3.py +3 -3
- udata/tasks.py +1 -0
- udata/tests/api/test_dataservices_api.py +26 -2
- udata/tests/api/test_datasets_api.py +1 -1
- udata/tests/api/test_reports_api.py +87 -0
- udata/tests/apiv2/test_me_api.py +40 -0
- udata/tests/dataset/test_dataset_rdf.py +19 -1
- udata/tests/frontend/test_auth.py +1 -4
- udata/tests/organization/test_csv_adapter.py +0 -1
- udata/tests/plugin.py +2 -0
- udata/tests/site/test_site_api.py +0 -1
- udata/tests/site/test_site_rdf.py +66 -0
- udata/tests/test_discussions.py +24 -34
- udata/tests/test_model.py +3 -2
- udata/tests/test_utils.py +1 -1
- udata/translations/ar/LC_MESSAGES/udata.mo +0 -0
- udata/translations/ar/LC_MESSAGES/udata.po +128 -64
- udata/translations/de/LC_MESSAGES/udata.mo +0 -0
- udata/translations/de/LC_MESSAGES/udata.po +128 -64
- udata/translations/es/LC_MESSAGES/udata.mo +0 -0
- udata/translations/es/LC_MESSAGES/udata.po +128 -64
- udata/translations/fr/LC_MESSAGES/udata.mo +0 -0
- udata/translations/fr/LC_MESSAGES/udata.po +128 -64
- udata/translations/it/LC_MESSAGES/udata.mo +0 -0
- udata/translations/it/LC_MESSAGES/udata.po +128 -64
- udata/translations/pt/LC_MESSAGES/udata.mo +0 -0
- udata/translations/pt/LC_MESSAGES/udata.po +128 -64
- udata/translations/sr/LC_MESSAGES/udata.mo +0 -0
- udata/translations/sr/LC_MESSAGES/udata.po +128 -64
- udata/translations/udata.pot +129 -65
- udata/uris.py +14 -13
- {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/METADATA +26 -7
- {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/RECORD +84 -72
- udata/static/chunks/18.ad41fb75ac4226e1f3ce.js.map +0 -1
- udata/static/chunks/7.11ac4de064ae59691d49.js.map +0 -1
- {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/LICENSE +0 -0
- {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/WHEEL +0 -0
- {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/entry_points.txt +0 -0
- {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/top_level.txt +0 -0
udata/harvest/api.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
|
+
from bson import ObjectId
|
|
1
2
|
from werkzeug.exceptions import BadRequest
|
|
2
3
|
from flask import request
|
|
3
4
|
|
|
4
5
|
from udata.api import api, API, fields
|
|
5
6
|
from udata.auth import admin_permission
|
|
6
7
|
|
|
8
|
+
from udata.core.dataservices.models import Dataservice
|
|
7
9
|
from udata.core.dataset.api_fields import dataset_ref_fields, dataset_fields
|
|
8
10
|
from udata.core.organization.api_fields import org_ref_fields
|
|
9
11
|
from udata.core.organization.permissions import EditOrganizationPermission
|
|
@@ -31,12 +33,22 @@ error_fields = api.model('HarvestError', {
|
|
|
31
33
|
'details': fields.String(description='Optional details (ie. stacktrace)'),
|
|
32
34
|
})
|
|
33
35
|
|
|
36
|
+
|
|
37
|
+
# Serialization model for one log record attached to a harvest item.
# It is registered under its own name: 'HarvestError' is already the name of
# the model built for `error_fields`, and registering two different models
# under the same name makes one definition replace the other.
log_fields = api.model('HarvestLog', {
    'level': fields.String(required=True),
    'message': fields.String(required=True),
})
|
|
41
|
+
|
|
42
|
+
|
|
34
43
|
item_fields = api.model('HarvestItem', {
|
|
35
44
|
'remote_id': fields.String(description='The item remote ID to process',
|
|
36
45
|
required=True),
|
|
37
46
|
'dataset': fields.Nested(dataset_ref_fields,
|
|
38
47
|
description='The processed dataset',
|
|
39
48
|
allow_null=True),
|
|
49
|
+
'dataservice': fields.Nested(Dataservice.__read_fields__,
|
|
50
|
+
description='The processed dataservice',
|
|
51
|
+
allow_null=True),
|
|
40
52
|
'status': fields.String(description='The item status',
|
|
41
53
|
required=True,
|
|
42
54
|
enum=list(HARVEST_ITEM_STATUS)),
|
|
@@ -46,6 +58,8 @@ item_fields = api.model('HarvestItem', {
|
|
|
46
58
|
'ended': fields.ISODateTime(description='The item end date'),
|
|
47
59
|
'errors': fields.List(fields.Nested(error_fields),
|
|
48
60
|
description='The item errors'),
|
|
61
|
+
'logs': fields.List(fields.Nested(log_fields),
|
|
62
|
+
description='The item logs'),
|
|
49
63
|
'args': fields.List(fields.String,
|
|
50
64
|
description='The item positional arguments',
|
|
51
65
|
default=[]),
|
|
@@ -181,6 +195,10 @@ class SourcesAPI(API):
|
|
|
181
195
|
def get(self):
|
|
182
196
|
'''List all harvest sources'''
|
|
183
197
|
args = source_parser.parse_args()
|
|
198
|
+
|
|
199
|
+
if args.get('owner') and not ObjectId.is_valid(args.get('owner')):
|
|
200
|
+
api.abort(400, '`owner` arg must be an identifier')
|
|
201
|
+
|
|
184
202
|
return actions.paginate_sources(args.get('owner'),
|
|
185
203
|
page=args['page'],
|
|
186
204
|
page_size=args['page_size'],
|
|
@@ -256,7 +274,7 @@ class ScheduleSourceAPI(API):
|
|
|
256
274
|
# Handle both syntaxes: quoted and unquoted
|
|
257
275
|
try:
|
|
258
276
|
data = request.json
|
|
259
|
-
except BadRequest
|
|
277
|
+
except BadRequest:
|
|
260
278
|
data = request.data.decode('utf-8')
|
|
261
279
|
return actions.schedule(ident, data)
|
|
262
280
|
|
udata/harvest/backends/base.py
CHANGED
|
@@ -2,20 +2,21 @@ import logging
|
|
|
2
2
|
import traceback
|
|
3
3
|
|
|
4
4
|
from datetime import datetime, date, timedelta
|
|
5
|
-
from typing import Optional
|
|
6
5
|
from uuid import UUID
|
|
7
6
|
|
|
8
7
|
import requests
|
|
9
8
|
|
|
10
9
|
from flask import current_app
|
|
10
|
+
from udata.core.dataservices.models import Dataservice
|
|
11
11
|
from voluptuous import MultipleInvalid, RequiredFieldInvalid
|
|
12
12
|
|
|
13
13
|
from udata.core.dataset.models import HarvestDatasetMetadata
|
|
14
|
+
from udata.core.dataservices.models import HarvestMetadata as HarvestDataserviceMetadata
|
|
14
15
|
from udata.models import Dataset
|
|
15
16
|
from udata.utils import safe_unicode
|
|
16
17
|
|
|
17
18
|
from ..exceptions import HarvestException, HarvestSkipException, HarvestValidationError
|
|
18
|
-
from ..models import HarvestItem, HarvestJob, HarvestError, archive_harvested_dataset
|
|
19
|
+
from ..models import HarvestItem, HarvestJob, HarvestError, HarvestLog, archive_harvested_dataset
|
|
19
20
|
from ..signals import before_harvest_job, after_harvest_job
|
|
20
21
|
|
|
21
22
|
log = logging.getLogger(__name__)
|
|
@@ -72,8 +73,6 @@ class BaseBackend(object):
|
|
|
72
73
|
"""
|
|
73
74
|
Base class that wraps child methods to add error management and debug logs.
|
|
74
75
|
Also provides a few helpers needed on all or some backends.
|
|
75
|
-
|
|
76
|
-
|
|
77
76
|
"""
|
|
78
77
|
|
|
79
78
|
name = None
|
|
@@ -139,6 +138,9 @@ class BaseBackend(object):
|
|
|
139
138
|
def inner_process_dataset(self, item: HarvestItem) -> Dataset:
|
|
140
139
|
raise NotImplementedError
|
|
141
140
|
|
|
141
|
+
def inner_process_dataservice(self, item: HarvestItem) -> Dataservice:
    '''
    Backend-specific processing of a single dataservice item.

    This base implementation only raises `NotImplementedError`.
    '''
    raise NotImplementedError()
|
|
143
|
+
|
|
142
144
|
def harvest(self):
|
|
143
145
|
log.debug(f'Starting harvesting {self.source.name} ({self.source.url})…')
|
|
144
146
|
factory = HarvestJob if self.dryrun else HarvestJob.objects.create
|
|
@@ -185,14 +187,17 @@ class BaseBackend(object):
|
|
|
185
187
|
self.job.items.append(item)
|
|
186
188
|
self.save_job()
|
|
187
189
|
|
|
190
|
+
log_catcher = LogCatcher()
|
|
191
|
+
|
|
188
192
|
try:
|
|
189
193
|
if not remote_id:
|
|
190
194
|
raise HarvestSkipException("missing identifier")
|
|
191
195
|
|
|
196
|
+
current_app.logger.addHandler(log_catcher)
|
|
192
197
|
dataset = self.inner_process_dataset(item, **kwargs)
|
|
193
198
|
|
|
194
199
|
# Use `item.remote_id` because `inner_process_dataset` could have modified it.
|
|
195
|
-
dataset.harvest = self.
|
|
200
|
+
dataset.harvest = self.update_dataset_harvest_info(dataset.harvest, item.remote_id)
|
|
196
201
|
dataset.archived = None
|
|
197
202
|
|
|
198
203
|
# TODO: Apply editable mappings
|
|
@@ -220,24 +225,94 @@ class BaseBackend(object):
|
|
|
220
225
|
error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
|
|
221
226
|
item.errors.append(error)
|
|
222
227
|
finally:
|
|
228
|
+
current_app.logger.removeHandler(log_catcher)
|
|
223
229
|
item.ended = datetime.utcnow()
|
|
230
|
+
item.logs = [HarvestLog(level=record.levelname, message=record.getMessage()) for record in log_catcher.records]
|
|
224
231
|
self.save_job()
|
|
225
232
|
|
|
226
233
|
def is_done(self) -> bool:
    '''
    Tell whether the job has reached its maximum number of items.

    Should be called after `process_dataset` (or `process_dataservice`).
    Always returns a real `bool`: `False` when `max_items` is unset or zero,
    otherwise whether the job holds at least `max_items` items.
    '''
    # `bool(...)` keeps the declared return type: a bare `self.max_items and ...`
    # would leak `None` or `0` to the caller when no limit is configured.
    return bool(self.max_items) and len(self.job.items) >= self.max_items
|
|
229
236
|
|
|
230
|
-
def
|
|
237
|
+
def process_dataservice(self, remote_id: str, **kwargs) -> bool:
    '''
    Process one remote dataservice and record the outcome on the job.

    A `HarvestItem` is appended to the job before processing starts; its
    status ends up as `done`, `skipped` or `failed`. Records sent to the
    application logger during processing are stored in `item.logs`, the same
    way `process_dataset` does.

    Return `True` if the parent should stop iterating because we exceed the
    number of items to process.
    '''
    log.debug(f'Processing dataservice {remote_id}…')

    # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice`
    item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id)
    self.job.items.append(item)
    self.save_job()

    log_catcher = LogCatcher()

    try:
        if not remote_id:
            raise HarvestSkipException("missing identifier")

        current_app.logger.addHandler(log_catcher)
        dataservice = self.inner_process_dataservice(item, **kwargs)

        dataservice.harvest = self.update_dataservice_harvest_info(dataservice.harvest, remote_id)
        dataservice.archived_at = None

        # TODO: Apply editable mappings

        if self.dryrun:
            # Dry run: validate only, nothing is written.
            dataservice.validate()
        else:
            dataservice.save()
        item.dataservice = dataservice
        item.status = 'done'
    except HarvestSkipException as e:
        item.status = 'skipped'

        log.info(f'Skipped item {item.remote_id} : {safe_unicode(e)}')
        item.errors.append(HarvestError(message=safe_unicode(e)))
    except HarvestValidationError as e:
        item.status = 'failed'

        log.info(f'Error validating item {item.remote_id} : {safe_unicode(e)}')
        item.errors.append(HarvestError(message=safe_unicode(e)))
    except Exception as e:
        item.status = 'failed'
        log.exception(f'Error while processing {item.remote_id} : {safe_unicode(e)}')

        error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
        item.errors.append(error)
    finally:
        # Removing a handler that was never added (skip before `addHandler`) is harmless.
        current_app.logger.removeHandler(log_catcher)
        item.ended = datetime.utcnow()
        item.logs = [
            HarvestLog(level=record.levelname, message=record.getMessage())
            for record in log_catcher.records
        ]
        self.save_job()

    # Honour the documented contract; callers that ignore the result are unaffected.
    return bool(self.is_done())
|
|
285
|
+
|
|
286
|
+
def update_dataset_harvest_info(self, harvest: HarvestDatasetMetadata | None, remote_id: str) -> HarvestDatasetMetadata:
    '''
    Fill the harvest metadata of a dataset from the current source.

    :param harvest: existing metadata to update; a new `HarvestDatasetMetadata`
        is created when it is missing.
    :param remote_id: identifier of the item on the remote source (a string,
        as stored on `HarvestItem.remote_id`).
    :returns: the updated (or newly created) metadata object.
    '''
    if not harvest:
        harvest = HarvestDatasetMetadata()

    harvest.backend = self.display_name
    harvest.source_id = str(self.source.id)
    harvest.remote_id = remote_id
    harvest.domain = self.source.domain
    harvest.last_update = datetime.utcnow()
    # A dataset that is harvested again is no longer archived.
    harvest.archived_at = None
    harvest.archived = None

    # created_at, modified_at, remote_url, uri, dct_identifier are set in `dataset_from_rdf`

    return harvest
|
|
301
|
+
|
|
302
|
+
def update_dataservice_harvest_info(self, harvest: HarvestDataserviceMetadata | None, remote_id: str) -> HarvestDataserviceMetadata:
    '''
    Fill the harvest metadata of a dataservice from the current source.

    :param harvest: existing metadata to update; a new
        `HarvestDataserviceMetadata` is created when it is missing.
    :param remote_id: identifier of the item on the remote source (a string,
        as passed to `process_dataservice`).
    :returns: the updated (or newly created) metadata object.
    '''
    if not harvest:
        harvest = HarvestDataserviceMetadata()

    harvest.backend = self.display_name
    harvest.domain = self.source.domain

    harvest.source_id = str(self.source.id)
    harvest.source_url = str(self.source.url)

    harvest.remote_id = remote_id
    harvest.last_update = datetime.utcnow()

    # A dataservice that is harvested again is no longer archived.
    harvest.archived_at = None

    return harvest
|
|
243
318
|
|
|
@@ -302,6 +377,28 @@ class BaseBackend(object):
|
|
|
302
377
|
return Dataset(owner=self.source.owner)
|
|
303
378
|
|
|
304
379
|
return Dataset()
|
|
380
|
+
|
|
381
|
+
def get_dataservice(self, remote_id):
    '''
    Return the dataservice matching `remote_id` for this source, or a new one.

    An existing dataservice matches when its harvest `remote_id` is the given
    one and either its harvest `domain` or its harvest `source_id` is the one
    of the current source. When nothing matches, an unsaved `Dataservice` is
    returned, attached to the source organization if there is one, else to the
    source owner if there is one.
    '''
    source = self.source
    same_source = [
        {'harvest.domain': source.domain},
        {'harvest.source_id': str(source.id)},
    ]
    existing = Dataservice.objects(__raw__={
        'harvest.remote_id': remote_id,
        '$or': same_source,
    }).first()
    if existing:
        return existing

    if source.organization:
        return Dataservice(organization=source.organization)
    if source.owner:
        return Dataservice(owner=source.owner)
    return Dataservice()
|
|
305
402
|
|
|
306
403
|
def validate(self, data, schema):
|
|
307
404
|
'''Perform a data validation against a given schema.
|
|
@@ -342,4 +439,15 @@ class BaseBackend(object):
|
|
|
342
439
|
msg = str(error)
|
|
343
440
|
errors.append(msg)
|
|
344
441
|
msg = '\n- '.join(['Validation error:'] + errors)
|
|
345
|
-
raise HarvestValidationError(msg)
|
|
442
|
+
raise HarvestValidationError(msg)
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
class LogCatcher(logging.Handler):
    '''Logging handler that keeps every record it receives in `records`.'''

    # Records in the order they were emitted.
    records: list[logging.LogRecord]

    def __init__(self):
        super().__init__()
        self.records = []

    def emit(self, record):
        '''Store the record instead of writing it anywhere.'''
        captured = self.records
        captured.append(record)
|
udata/harvest/backends/dcat.py
CHANGED
|
@@ -1,20 +1,18 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
|
|
3
|
-
from rdflib import Graph
|
|
3
|
+
from rdflib import Graph
|
|
4
4
|
from rdflib.namespace import RDF
|
|
5
5
|
import lxml.etree as ET
|
|
6
|
-
import boto3
|
|
7
6
|
from flask import current_app
|
|
8
7
|
from datetime import date
|
|
9
|
-
import
|
|
10
|
-
from typing import Generator, List
|
|
8
|
+
from typing import Generator
|
|
11
9
|
|
|
12
|
-
from udata.core.dataset.models import Dataset
|
|
13
10
|
from udata.rdf import (
|
|
14
11
|
DCAT, DCT, HYDRA, SPDX, namespace_manager, guess_format, url_from_rdf
|
|
15
12
|
)
|
|
16
13
|
from udata.core.dataset.rdf import dataset_from_rdf
|
|
17
|
-
from udata.
|
|
14
|
+
from udata.core.dataservices.rdf import dataservice_from_rdf
|
|
15
|
+
from udata.storage.s3 import store_as_json
|
|
18
16
|
from udata.harvest.models import HarvestItem
|
|
19
17
|
|
|
20
18
|
from .base import BaseBackend
|
|
@@ -71,7 +69,8 @@ class DcatBackend(BaseBackend):
|
|
|
71
69
|
self.process_one_datasets_page(page_number, page)
|
|
72
70
|
serialized_graphs.append(page.serialize(format=fmt, indent=None))
|
|
73
71
|
|
|
74
|
-
|
|
72
|
+
for page_number, page in self.walk_graph(self.source.url, fmt):
|
|
73
|
+
self.process_one_dataservices_page(page_number, page)
|
|
75
74
|
|
|
76
75
|
# The official MongoDB maximum document size is 16MB. The default value here is 15MB to account for other fields in the document (and for the difference between * 1024 vs * 1000).
|
|
77
76
|
max_harvest_graph_size_in_mongo = current_app.config.get('HARVEST_MAX_CATALOG_SIZE_IN_MONGO')
|
|
@@ -145,6 +144,14 @@ class DcatBackend(BaseBackend):
|
|
|
145
144
|
|
|
146
145
|
if self.is_done():
|
|
147
146
|
return
|
|
147
|
+
|
|
148
|
+
def process_one_dataservices_page(self, page_number: int, page: Graph):
    '''
    Process every `dcat:DataService` node found in one page of the catalog.

    Each node is handed to `process_dataservice` with its `dct:identifier` as
    remote ID; iteration stops as soon as `is_done()` reports true.
    '''
    dataservice_nodes = page.subjects(RDF.type, DCAT.DataService)
    for node in dataservice_nodes:
        identifier = page.value(node, DCT.identifier)
        self.process_dataservice(identifier, page_number=page_number, page=page, node=node)

        if self.is_done():
            break
|
|
148
155
|
|
|
149
156
|
def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
|
|
150
157
|
item.kwargs['page_number'] = page_number
|
|
@@ -152,6 +159,12 @@ class DcatBackend(BaseBackend):
|
|
|
152
159
|
dataset = self.get_dataset(item.remote_id)
|
|
153
160
|
return dataset_from_rdf(page, dataset, node=node)
|
|
154
161
|
|
|
162
|
+
def inner_process_dataservice(self, item: HarvestItem, page_number: int, page: Graph, node):
    '''
    Build the dataservice described by `node` in `page`.

    The page number is recorded in `item.kwargs`. The `dataset` attribute of
    every item currently on the job is passed to `dataservice_from_rdf`.
    '''
    item.kwargs['page_number'] = page_number

    target = self.get_dataservice(item.remote_id)
    job_datasets = [job_item.dataset for job_item in self.job.items]
    return dataservice_from_rdf(page, target, node, job_datasets)
|
|
167
|
+
|
|
155
168
|
def get_node_from_item(self, graph, item):
|
|
156
169
|
for node in graph.subjects(RDF.type, DCAT.Dataset):
|
|
157
170
|
if str(graph.value(node, DCT.identifier)) == item.remote_id:
|
|
@@ -263,6 +276,10 @@ class CswIso19139DcatBackend(DcatBackend):
|
|
|
263
276
|
def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
|
|
264
277
|
"""
|
|
265
278
|
Yield all RDF pages as `Graph` from the source
|
|
279
|
+
|
|
280
|
+
Parse CSW graph querying ISO schema.
|
|
281
|
+
Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
|
|
282
|
+
See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
|
|
266
283
|
"""
|
|
267
284
|
# Load XSLT
|
|
268
285
|
xsl = ET.fromstring(self.get(self.XSL_URL).content)
|
|
@@ -284,6 +301,10 @@ class CswIso19139DcatBackend(DcatBackend):
|
|
|
284
301
|
<ogc:PropertyName>dc:type</ogc:PropertyName>
|
|
285
302
|
<ogc:Literal>dataset</ogc:Literal>
|
|
286
303
|
</ogc:PropertyIsEqualTo>
|
|
304
|
+
<ogc:PropertyIsEqualTo>
|
|
305
|
+
<ogc:PropertyName>dc:type</ogc:PropertyName>
|
|
306
|
+
<ogc:Literal>service</ogc:Literal>
|
|
307
|
+
</ogc:PropertyIsEqualTo>
|
|
287
308
|
<ogc:PropertyIsEqualTo>
|
|
288
309
|
<ogc:PropertyName>dc:type</ogc:PropertyName>
|
|
289
310
|
<ogc:Literal>series</ogc:Literal>
|
udata/harvest/models.py
CHANGED
|
@@ -3,6 +3,7 @@ from datetime import datetime
|
|
|
3
3
|
import logging
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
5
|
|
|
6
|
+
from udata.core.dataservices.models import Dataservice
|
|
6
7
|
from werkzeug.utils import cached_property
|
|
7
8
|
|
|
8
9
|
from udata.core.dataset.models import HarvestDatasetMetadata
|
|
@@ -49,16 +50,21 @@ class HarvestError(db.EmbeddedDocument):
|
|
|
49
50
|
message = db.StringField()
|
|
50
51
|
details = db.StringField()
|
|
51
52
|
|
|
53
|
+
class HarvestLog(db.EmbeddedDocument):
    '''A log entry (level and message) embedded in a harvest item.'''
    # Level of the log record, stored as a string.
    level = db.StringField()
    # Message of the log record.
    message = db.StringField()
|
|
52
56
|
|
|
53
57
|
class HarvestItem(db.EmbeddedDocument):
|
|
54
58
|
remote_id = db.StringField()
|
|
55
59
|
dataset = db.ReferenceField(Dataset)
|
|
60
|
+
dataservice = db.ReferenceField(Dataservice)
|
|
56
61
|
status = db.StringField(choices=list(HARVEST_ITEM_STATUS),
|
|
57
62
|
default=DEFAULT_HARVEST_ITEM_STATUS, required=True)
|
|
58
63
|
created = db.DateTimeField(default=datetime.utcnow, required=True)
|
|
59
64
|
started = db.DateTimeField()
|
|
60
65
|
ended = db.DateTimeField()
|
|
61
66
|
errors = db.ListField(db.EmbeddedDocumentField(HarvestError))
|
|
67
|
+
logs = db.ListField(db.EmbeddedDocumentField(HarvestLog), default=[])
|
|
62
68
|
args = db.ListField(db.StringField())
|
|
63
69
|
kwargs = db.DictField()
|
|
64
70
|
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
>
|
|
15
15
|
<dcat:Catalog rdf:about="http://data.test.org/">
|
|
16
16
|
<dcat:dataset>
|
|
17
|
-
<dcat:Dataset>
|
|
17
|
+
<dcat:Dataset rdf:about="dataset-3">
|
|
18
18
|
<dcterms:title>Dataset 3</dcterms:title>
|
|
19
19
|
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T19:01:24.184120</dcterms:modified>
|
|
20
20
|
<owl:versionInfo>1.0</owl:versionInfo>
|
|
@@ -73,7 +73,7 @@
|
|
|
73
73
|
</dcat:Dataset>
|
|
74
74
|
</dcat:dataset>
|
|
75
75
|
<dcat:dataset>
|
|
76
|
-
<dcat:Dataset>
|
|
76
|
+
<dcat:Dataset rdf:about="dataset-2">
|
|
77
77
|
<dcat:keyword>Tag 1</dcat:keyword>
|
|
78
78
|
<dcat:distribution rdf:resource="http://data.test.org/datasets/2/resources/1"/>
|
|
79
79
|
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T19:01:24.184120</dcterms:modified>
|
|
@@ -107,6 +107,17 @@
|
|
|
107
107
|
<dct:conformsTo rdf:nodeID="Ne0189e93917c4f67a412fc44883322e7"/>
|
|
108
108
|
</dcat:Dataset>
|
|
109
109
|
</dcat:dataset>
|
|
110
|
+
<dcat:service>
|
|
111
|
+
<dcat:DataService rdf:about="https://data.paris2024.org/api/explore/v2.1/">
|
|
112
|
+
<dcterms:title xml:lang="en"><![CDATA[Explore API v2]]></dcterms:title>
|
|
113
|
+
<dcterms:identifier>https://data.paris2024.org/api/explore/v2.1/</dcterms:identifier>
|
|
114
|
+
<dcat:endpointURL rdf:resource="https://data.paris2024.org/api/explore/v2.1/" />
|
|
115
|
+
<dcat:endpointDescription rdf:resource="https://data.paris2024.org/api/explore/v2.1/swagger.json" />
|
|
116
|
+
<dcat:landingPage rdf:resource="https://data.paris2024.org/api/explore/v2.1/console" />
|
|
117
|
+
<dcat:servesDataset rdf:resource="dataset-2" />
|
|
118
|
+
<dcat:servesDataset rdf:resource="dataset-3" />
|
|
119
|
+
</dcat:DataService>
|
|
120
|
+
</dcat:service>
|
|
110
121
|
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-15T09:19:51.723691</dcterms:modified>
|
|
111
122
|
<foaf:homepage>http://data.test.org</foaf:homepage>
|
|
112
123
|
<dcterms:language>en</dcterms:language>
|
|
@@ -9,6 +9,7 @@ import boto3
|
|
|
9
9
|
from flask import current_app
|
|
10
10
|
import xml.etree.ElementTree as ET
|
|
11
11
|
|
|
12
|
+
from udata.core.dataservices.models import Dataservice
|
|
12
13
|
from udata.harvest.models import HarvestJob
|
|
13
14
|
from udata.models import Dataset
|
|
14
15
|
from udata.core.organization.factories import OrganizationFactory
|
|
@@ -161,6 +162,26 @@ class DcatBackendTest:
|
|
|
161
162
|
assert len(datasets['1'].resources) == 2
|
|
162
163
|
assert len(datasets['2'].resources) == 2
|
|
163
164
|
|
|
165
|
+
def test_harvest_dataservices(self, rmock):
    '''Harvesting `bnodes.xml` creates exactly one dataservice with the expected fields.'''
    rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data())

    url = mock_dcat(rmock, 'bnodes.xml')
    source = HarvestSourceFactory(backend='dcat', url=url, organization=OrganizationFactory())

    actions.run(source.slug)

    dataservices = Dataservice.objects
    assert len(dataservices) == 1

    api_root = "https://data.paris2024.org/api/explore/v2.1/"
    assert dataservices[0].title == "Explore API v2"
    assert dataservices[0].base_api_url == api_root
    assert dataservices[0].endpoint_description_url == api_root + "swagger.json"
    assert dataservices[0].harvest.remote_url == api_root + "console"
|
|
184
|
+
|
|
164
185
|
def test_harvest_literal_spatial(self, rmock):
|
|
165
186
|
url = mock_dcat(rmock, 'evian.json')
|
|
166
187
|
org = OrganizationFactory()
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Remove unresolved dataset references from all reuses in db
|
|
3
|
+
'''
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
from bson import DBRef
|
|
7
|
+
import mongoengine
|
|
8
|
+
|
|
9
|
+
from udata.models import Reuse
|
|
10
|
+
|
|
11
|
+
log = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def migrate(db):
    '''
    Drop from every reuse the dataset entries that are still bare `DBRef`s.

    A reuse is saved only when at least one entry was dropped. The number of
    modified reuses and of removed entries is logged at the end.
    '''
    log.info('Processing Reuse.')

    modified = 0
    removed = 0

    for reuse in Reuse.objects().no_cache().timeout(False):
        kept_ids = [entry.id for entry in reuse.datasets if not isinstance(entry, DBRef)]
        dropped = len(reuse.datasets) - len(kept_ids)
        removed += dropped

        if dropped:
            reuse.datasets = kept_ids
            reuse.save()
            modified += 1

    log.info(f'Modified {modified} Reuses objects (removed {removed} datasets)')
    log.info('Done')
|
udata/models/__init__.py
CHANGED
|
@@ -21,6 +21,7 @@ from udata.core.post.models import * # noqa
|
|
|
21
21
|
from udata.core.jobs.models import * # noqa
|
|
22
22
|
from udata.core.tags.models import * # noqa
|
|
23
23
|
from udata.core.spam.models import * # noqa
|
|
24
|
+
from udata.core.reports.models import * # noqa
|
|
24
25
|
|
|
25
26
|
from udata.features.transfer.models import * # noqa
|
|
26
27
|
from udata.features.territories.models import * # noqa
|
udata/rdf.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
'''
|
|
2
2
|
This module centralizes udata-wide RDF helpers and configuration
|
|
3
3
|
'''
|
|
4
|
+
from html.parser import HTMLParser
|
|
4
5
|
import logging
|
|
5
6
|
import re
|
|
6
7
|
|
|
7
|
-
from flask import request, url_for, abort
|
|
8
|
+
from flask import request, url_for, abort, current_app
|
|
8
9
|
|
|
9
10
|
from rdflib import Graph, Literal, URIRef
|
|
10
11
|
from rdflib.resource import Resource as RdfResource
|
|
@@ -13,14 +14,18 @@ from rdflib.namespace import (
|
|
|
13
14
|
)
|
|
14
15
|
from rdflib.util import SUFFIX_FORMAT_MAP, guess_format as raw_guess_format
|
|
15
16
|
from udata import uris
|
|
17
|
+
from udata.core.contact_point.models import ContactPoint
|
|
16
18
|
from udata.models import Schema
|
|
17
19
|
from udata.mongo.errors import FieldValidationError
|
|
20
|
+
from udata.frontend.markdown import parse_html
|
|
21
|
+
from udata.tags import slug as slugify_tag
|
|
18
22
|
|
|
19
23
|
log = logging.getLogger(__name__)
|
|
20
24
|
|
|
21
25
|
# Extra Namespaces
|
|
22
26
|
ADMS = Namespace('http://www.w3.org/ns/adms#')
|
|
23
27
|
DCAT = Namespace('http://www.w3.org/ns/dcat#')
|
|
28
|
+
DCATAP = Namespace('http://data.europa.eu/r5r/')
|
|
24
29
|
HYDRA = Namespace('http://www.w3.org/ns/hydra/core#')
|
|
25
30
|
SCHEMA = Namespace('http://schema.org/')
|
|
26
31
|
SCV = Namespace('http://purl.org/NET/scovo#')
|
|
@@ -35,6 +40,7 @@ VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')
|
|
|
35
40
|
|
|
36
41
|
namespace_manager = NamespaceManager(Graph())
|
|
37
42
|
namespace_manager.bind('dcat', DCAT)
|
|
43
|
+
namespace_manager.bind('dcatap', DCATAP)
|
|
38
44
|
namespace_manager.bind('dct', DCT)
|
|
39
45
|
namespace_manager.bind('foaf', FOAF)
|
|
40
46
|
namespace_manager.bind('foaf', FOAF)
|
|
@@ -98,6 +104,17 @@ RDF_EXTENSIONS = {
|
|
|
98
104
|
# Includes control characters, unicode surrogate characters and unicode end-of-plane non-characters
|
|
99
105
|
ILLEGAL_XML_CHARS = '[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]'
|
|
100
106
|
|
|
107
|
+
# High Value Datasets: category URI -> keyword label
EU_HVD_CATEGORIES = {
    "http://data.europa.eu/bna/c_164e0bf5": "Météorologiques",
    "http://data.europa.eu/bna/c_a9135398": "Entreprises et propriété d'entreprises",
    "http://data.europa.eu/bna/c_ac64a52d": "Géospatiales",
    "http://data.europa.eu/bna/c_b79e35eb": "Mobilité",
    "http://data.europa.eu/bna/c_dd313021": "Observation de la terre et environnement",
    "http://data.europa.eu/bna/c_e1da4e07": "Statistiques",
}
HVD_LEGISLATION = 'http://data.europa.eu/eli/reg_impl/2023/138/oj'
# Reverse lookup: slugified category label -> category URI
TAG_TO_EU_HVD_CATEGORIES = {
    slugify_tag(label): uri for uri, label in EU_HVD_CATEGORIES.items()
}
|
|
101
118
|
|
|
102
119
|
def guess_format(string):
|
|
103
120
|
'''Guess format given an extension or a mime-type'''
|
|
@@ -212,6 +229,42 @@ CONTEXT = {
|
|
|
212
229
|
'totalItems': 'hydra:totalItems',
|
|
213
230
|
}
|
|
214
231
|
|
|
232
|
+
def serialize_value(value):
    '''
    Convert an RDF value to its Python counterpart.

    `URIRef` and `Literal` values are converted with `toPython()`; for an RDF
    resource the identifier is converted. Anything else gives `None`.
    '''
    if isinstance(value, (URIRef, Literal)):
        return value.toPython()
    if isinstance(value, RdfResource):
        node_id = value.identifier
        return node_id.toPython()
    return None
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def rdf_value(obj, predicate, default=None):
    '''
    Return the serialized value of `predicate` on `obj`.

    `default` is returned when `obj.value(predicate)` is falsy.
    '''
    found = obj.value(predicate)
    if not found:
        return default
    return serialize_value(found)
|
|
242
|
+
|
|
243
|
+
class HTMLDetector(HTMLParser):
    '''HTML parser that collects the name of every tag it meets in `elements`.'''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Names of all opening and closing tags seen so far.
        self.elements = set()

    def handle_starttag(self, tag, attrs):
        '''Record an opening tag; its attributes are ignored.'''
        self.elements |= {tag}

    def handle_endtag(self, tag):
        '''Record a closing tag.'''
        self.elements |= {tag}
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def is_html(text):
    '''Return `True` when `HTMLDetector` finds at least one tag in `text`.'''
    detector = HTMLDetector()
    detector.feed(text)
    return len(detector.elements) > 0
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def sanitize_html(text):
    '''
    Return a cleaned-up version of an RDF literal.

    Anything that is not a `Literal` is treated as an empty string. Content in
    which tags are detected goes through `parse_html`; other content is only
    stripped of surrounding whitespace.
    '''
    raw = ''
    if isinstance(text, Literal):
        raw = text.toPython()
    return parse_html(raw) if is_html(raw) else raw.strip()
|
|
267
|
+
|
|
215
268
|
|
|
216
269
|
def url_from_rdf(rdf, prop):
|
|
217
270
|
'''
|
|
@@ -224,6 +277,65 @@ def url_from_rdf(rdf, prop):
|
|
|
224
277
|
elif isinstance(value, RdfResource):
|
|
225
278
|
return value.identifier.toPython()
|
|
226
279
|
|
|
280
|
+
def theme_labels_from_rdf(rdf):
    '''
    Yield the labels of the `dcat:theme` values, to be used as keywords.

    When the `HVD_SUPPORT` setting is on and a theme URI is a known High Value
    Dataset category, the category label is used and an extra `hvd` keyword is
    yielded before it. Other resource themes use their `skos:prefLabel`;
    themes that are not resources use their Python value. Empty labels are
    not yielded.
    '''
    for theme in rdf.objects(DCAT.theme):
        if not isinstance(theme, RdfResource):
            label = theme.toPython()
        else:
            uri = theme.identifier.toPython()
            if current_app.config['HVD_SUPPORT'] and uri in EU_HVD_CATEGORIES:
                # Additionally yield the hvd keyword
                yield 'hvd'
                label = EU_HVD_CATEGORIES[uri]
            else:
                label = rdf_value(theme, SKOS.prefLabel)

        if label:
            yield label
|
|
298
|
+
|
|
299
|
+
def themes_from_rdf(rdf):
    '''Return the deduplicated list of `dcat:keyword` values and theme labels.'''
    keyword_tags = [keyword.toPython() for keyword in rdf.objects(DCAT.keyword)]
    theme_tags = list(theme_labels_from_rdf(rdf))
    return list(set(keyword_tags + theme_tags))
|
|
303
|
+
|
|
304
|
+
def contact_point_from_rdf(rdf, dataset):
    '''
    Find or create the `ContactPoint` described by `dcat:contactPoint`.

    The email is read from `vcard:hasEmail`, `vcard:email` or `dcat:email`, in
    that order, with any `mailto:` removed. The contact point is looked up
    (and saved if missing) for the dataset organization, or else for the
    dataset owner. Returns `None` when there is no contact point node, no
    email, or neither organization nor owner.
    '''
    node = rdf.value(DCAT.contactPoint)
    if not node:
        return None

    name = rdf_value(node, VCARD.fn) or ''

    email = None
    for predicate in (VCARD.hasEmail, VCARD.email, DCAT.email):
        email = rdf_value(node, predicate)
        if email:
            break
    if not email:
        return None
    email = email.replace('mailto:', '').strip()

    if dataset.organization:
        ownership = {'organization': dataset.organization}
    elif dataset.owner:
        ownership = {'owner': dataset.owner}
    else:
        return None

    existing = ContactPoint.objects(name=name, email=email, **ownership).first()
    if existing:
        return existing
    return ContactPoint(name=name, email=email, **ownership).save()
|
|
324
|
+
|
|
325
|
+
def remote_url_from_rdf(rdf):
    '''
    Return the first valid URL among `dcat:landingPage` and the RDF identifier.

    Candidates that are empty or rejected by `uris.validate` are skipped;
    `None` is returned when none is left.
    '''
    candidates = (url_from_rdf(rdf, DCAT.landingPage), rdf.identifier.toPython())
    for candidate in candidates:
        if not candidate:
            continue
        try:
            uris.validate(candidate)
        except uris.ValidationError:
            continue
        return candidate
    return None
|
|
227
339
|
|
|
228
340
|
def schema_from_rdf(rdf):
|
|
229
341
|
'''
|
|
@@ -252,7 +364,6 @@ def schema_from_rdf(rdf):
|
|
|
252
364
|
url = uris.validate(type.identifier.toPython())
|
|
253
365
|
except uris.ValidationError:
|
|
254
366
|
pass
|
|
255
|
-
pass
|
|
256
367
|
|
|
257
368
|
if url is None:
|
|
258
369
|
return None
|
udata/routing.py
CHANGED
|
@@ -217,7 +217,7 @@ def lazy_raise_or_redirect():
|
|
|
217
217
|
new_args = request.view_args
|
|
218
218
|
new_args[name] = value.arg
|
|
219
219
|
new_url = url_for(request.endpoint, **new_args)
|
|
220
|
-
return redirect(new_url, code=308)
|
|
220
|
+
return redirect(new_url, code=204 if request.method == 'OPTIONS' else 308)
|
|
221
221
|
|
|
222
222
|
|
|
223
223
|
def init_app(app):
|