udata 7.0.8.dev28841__py2.py3-none-any.whl → 9.0.1.dev29390__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of udata might be problematic.
- udata/__init__.py +1 -1
- udata/api/__init__.py +6 -4
- udata/api/oauth2.py +2 -1
- udata/api_fields.py +254 -0
- udata/commands/purge.py +8 -2
- udata/core/badges/models.py +2 -1
- udata/core/dataservices/__init__.py +0 -0
- udata/core/dataservices/api.py +92 -0
- udata/core/dataservices/models.py +142 -0
- udata/core/dataservices/permissions.py +7 -0
- udata/core/dataservices/tasks.py +25 -0
- udata/core/dataset/apiv2.py +2 -0
- udata/core/dataset/csv.py +8 -1
- udata/core/dataset/models.py +1 -0
- udata/core/dataset/rdf.py +77 -15
- udata/core/metrics/commands.py +18 -3
- udata/core/metrics/models.py +2 -3
- udata/core/organization/api_fields.py +28 -3
- udata/core/organization/csv.py +5 -3
- udata/core/organization/models.py +3 -1
- udata/core/owned.py +39 -2
- udata/core/reuse/csv.py +3 -0
- udata/core/site/api.py +4 -1
- udata/core/spatial/api.py +5 -10
- udata/core/spatial/models.py +7 -2
- udata/core/spatial/tasks.py +7 -0
- udata/core/spatial/tests/test_api.py +26 -0
- udata/core/user/api.py +11 -7
- udata/core/user/models.py +13 -2
- udata/harvest/backends/base.py +93 -103
- udata/harvest/backends/dcat.py +65 -90
- udata/harvest/tasks.py +3 -13
- udata/harvest/tests/dcat/bnodes.xml +10 -1
- udata/harvest/tests/dcat/catalog.xml +1 -0
- udata/harvest/tests/factories.py +13 -6
- udata/harvest/tests/test_actions.py +2 -2
- udata/harvest/tests/test_base_backend.py +9 -5
- udata/harvest/tests/test_dcat_backend.py +17 -1
- udata/rdf.py +4 -0
- udata/routing.py +6 -0
- udata/settings.py +4 -1
- udata/static/admin.css +2 -2
- udata/static/admin.css.map +1 -1
- udata/static/chunks/{0.6f1698738c9b0618b673.js → 0.93c3ae13b5b94753ee80.js} +3 -3
- udata/static/chunks/0.93c3ae13b5b94753ee80.js.map +1 -0
- udata/static/chunks/{14.f4037a917d5364cb564b.js → 14.e64890872b31c55fcdf7.js} +2 -2
- udata/static/chunks/14.e64890872b31c55fcdf7.js.map +1 -0
- udata/static/chunks/{2.7c89fae92899be371ed3.js → 2.614b3e73b072982fd9b1.js} +2 -2
- udata/static/chunks/2.614b3e73b072982fd9b1.js.map +1 -0
- udata/static/chunks/{5.3dc97ea195d251881552.js → 5.48417db6b33328fa9d6a.js} +2 -2
- udata/static/chunks/5.48417db6b33328fa9d6a.js.map +1 -0
- udata/static/common.js +1 -1
- udata/static/common.js.map +1 -1
- udata/tasks.py +1 -0
- udata/tests/api/__init__.py +3 -0
- udata/tests/api/test_dataservices_api.py +236 -0
- udata/tests/api/test_organizations_api.py +78 -5
- udata/tests/api/test_user_api.py +47 -13
- udata/tests/dataservice/test_dataservice_tasks.py +46 -0
- udata/tests/dataset/test_dataset_rdf.py +17 -2
- udata/tests/plugin.py +5 -0
- udata/tests/site/test_site_rdf.py +16 -0
- {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/METADATA +27 -1
- {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/RECORD +68 -60
- udata/core/metrics/api.py +0 -10
- udata/static/chunks/0.6f1698738c9b0618b673.js.map +0 -1
- udata/static/chunks/14.f4037a917d5364cb564b.js.map +0 -1
- udata/static/chunks/2.7c89fae92899be371ed3.js.map +0 -1
- udata/static/chunks/5.3dc97ea195d251881552.js.map +0 -1
- {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/LICENSE +0 -0
- {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/WHEEL +0 -0
- {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/entry_points.txt +0 -0
- {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/top_level.txt +0 -0
udata/core/user/models.py
CHANGED

@@ -13,6 +13,7 @@ from mongoengine.signals import pre_save, post_save
 from werkzeug.utils import cached_property
 
 from udata import mail
+from udata.core import storages
 from udata.uris import endpoint_for
 from udata.frontend.markdown import mdstrip
 from udata.i18n import lazy_gettext as _
@@ -233,7 +234,15 @@ class User(WithMetrics, UserMixin, db.Document):
         raise NotImplementedError('''This method should not be using directly.
         Use `mark_as_deleted` (or `_delete` if you know what you're doing)''')
 
-    def mark_as_deleted(self):
+    def mark_as_deleted(self, notify: bool = True):
+        if self.avatar.filename is not None:
+            storage = storages.avatars
+            storage.delete(self.avatar.filename)
+            storage.delete(self.avatar.original)
+            for key, value in self.avatar.thumbnails.items():
+                storage.delete(value)
+
+
         copied_user = copy(self)
         self.email = '{}@deleted'.format(self.id)
         self.slug = 'deleted'
@@ -270,7 +279,9 @@ class User(WithMetrics, UserMixin, db.Document):
         from udata.models import ContactPoint
         ContactPoint.objects(owner=self).delete()
 
-        mail.send(_('Account deletion'), copied_user, 'account_deleted')
+
+        if notify:
+            mail.send(_('Account deletion'), copied_user, 'account_deleted')
 
     def count_datasets(self):
         from udata.models import Dataset
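The `notify` flag added to `User.mark_as_deleted` lets callers skip the "Account deletion" email, and avatar files are now cleaned out of storage on deletion. A minimal usage sketch, assuming a hypothetical bulk-purge helper around the udata `User` model:

from udata.models import User


def purge_flagged_accounts(user_ids):
    # Hypothetical maintenance helper: delete accounts flagged by an admin
    # without emailing each of them; mark_as_deleted() now also removes the
    # avatar files from the avatars storage.
    for user in User.objects(id__in=user_ids):
        user.mark_as_deleted(notify=False)
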
udata/harvest/backends/base.py
CHANGED

@@ -2,6 +2,7 @@ import logging
 import traceback
 
 from datetime import datetime, date, timedelta
+from typing import Optional
 from uuid import UUID
 
 import requests
@@ -68,7 +69,12 @@ class HarvestFeature(object)
 
 
 class BaseBackend(object):
-
+    """
+    Base class that wrap children methods to add error management and debug logs.
+    Also provides a few helpers needed on all or some backends.
+
+
+    """
 
     name = None
     display_name = None
@@ -127,92 +133,69 @@ class BaseBackend(object):
     def get_filters(self):
         return self.config.get('filters', [])
 
-    def
-
-
-
-
-        return self.job
+    def inner_harvest(self):
+        raise NotImplementedError
+
+    def inner_process_dataset(self, item: HarvestItem) -> Dataset:
+        raise NotImplementedError
 
-    def
-        '
-        log.debug('Initializing backend')
+    def harvest(self):
+        log.debug(f'Starting harvesting {self.source.name} ({self.source.url})…')
         factory = HarvestJob if self.dryrun else HarvestJob.objects.create
-        self.job = factory(status='
+        self.job = factory(status='initialized',
                            started=datetime.utcnow(),
                            source=self.source)
 
         before_harvest_job.send(self)
 
         try:
-            self.
-
-            if
-                self.
+            self.inner_harvest()
+
+            if self.source.autoarchive:
+                self.autoarchive()
+
+            self.job.status = 'done'
+
+            if any(i.status == 'failed' for i in self.job.items):
+                self.job.status += '-errors'
         except HarvestValidationError as e:
-            log.
-
-            error = HarvestError(message=safe_unicode(e))
-            self.job.errors.append(error)
-            self.job.status = 'failed'
-            self.end()
-            return None
-        except Exception as e:
+            log.exception(f'Harvesting validation failed for "{safe_unicode(self.source.name)}" ({self.source.backend})')
+
             self.job.status = 'failed'
+
             error = HarvestError(message=safe_unicode(e))
             self.job.errors.append(error)
-
-
-            log.exception(msg.format(self.source))
-            return None
+        except Exception as e:
+            log.exception(f'Harvesting failed for "{safe_unicode(self.source.name)}" ({self.source.backend})')
 
-
-        self.job.items = self.job.items[:self.max_items]
+            self.job.status = 'failed'
 
-
-
+            error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
+            self.job.errors.append(error)
+        finally:
+            self.end_job()
+
+        return self.job
 
-
+    def process_dataset(self, remote_id: str, **kwargs):
+        log.debug(f'Processing dataset {remote_id}…')
 
-
-
+        # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice`
+        item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id)
+        self.job.items.append(item)
+        self.save_job()
 
-
-
-
-            self.process_item(item)
+        try:
+            if not remote_id:
+                raise HarvestSkipException("missing identifier")
 
-
-            log.debug('Processing: %s', item.remote_id)
-            item.status = 'started'
-            item.started = datetime.utcnow()
-            if not self.dryrun:
-                self.job.save()
+            dataset = self.inner_process_dataset(item, **kwargs)
 
-
-            dataset = self.
-            if not dataset.harvest:
-                dataset.harvest = HarvestDatasetMetadata()
-            dataset.harvest.domain = self.source.domain
-            dataset.harvest.remote_id = item.remote_id
-            dataset.harvest.source_id = str(self.source.id)
-            dataset.harvest.last_update = datetime.utcnow()
-            dataset.harvest.backend = self.display_name
-
-            # unset archived status if needed
-            if dataset.harvest:
-                dataset.harvest.archived_at = None
-                dataset.harvest.archived = None
+            # Use `item.remote_id` because `inner_process_dataset` could have modified it.
+            dataset.harvest = self.update_harvest_info(dataset.harvest, item.remote_id)
             dataset.archived = None
 
-            # TODO
-            if not dataset.organization and not dataset.owner:
-                if self.source.organization:
-                    dataset.organization = self.source.organization
-                elif self.source.owner:
-                    dataset.owner = self.source.owner
-
-            # TODO: Apply editble mappings
+            # TODO: Apply editable mappings
 
             if self.dryrun:
                 dataset.validate()
@@ -221,26 +204,54 @@ class BaseBackend(object):
             item.dataset = dataset
             item.status = 'done'
         except HarvestSkipException as e:
-            log.info('Skipped item %s : %s', item.remote_id, safe_unicode(e))
             item.status = 'skipped'
+
+            log.info(f'Skipped item {item.remote_id} : {safe_unicode(e)}')
             item.errors.append(HarvestError(message=safe_unicode(e)))
         except HarvestValidationError as e:
-            log.info('Error validating item %s : %s', item.remote_id, safe_unicode(e))
             item.status = 'failed'
+
+            log.info(f'Error validating item {item.remote_id} : {safe_unicode(e)}')
             item.errors.append(HarvestError(message=safe_unicode(e)))
         except Exception as e:
-            log.exception('Error while processing %s : %s',
-                          item.remote_id,
-                          safe_unicode(e))
-            error = HarvestError(message=safe_unicode(e),
-                                 details=traceback.format_exc())
-            item.errors.append(error)
             item.status = 'failed'
+            log.exception(f'Error while processing {item.remote_id} : {safe_unicode(e)}')
+
+            error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
+            item.errors.append(error)
+        finally:
+            item.ended = datetime.utcnow()
+            self.save_job()
+
+    def is_done(self) -> bool:
+        '''Should be called after process_dataset to know if we reach the max items'''
+        return self.max_items and len(self.job.items) >= self.max_items
+
+    def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int):
+        if not harvest:
+            harvest = HarvestDatasetMetadata()
+        harvest.domain = self.source.domain
+        harvest.remote_id = remote_id
+        harvest.source_id = str(self.source.id)
+        harvest.last_update = datetime.utcnow()
+        harvest.backend = self.display_name
 
-
+        harvest.archived_at = None
+        harvest.archived = None
+
+        return harvest
+
+    def save_job(self):
+        if not self.dryrun:
+            self.job.save()
+
+    def end_job(self):
+        self.job.ended = datetime.utcnow()
         if not self.dryrun:
             self.job.save()
 
+        after_harvest_job.send(self)
+
     def autoarchive(self):
         '''
         Archive items that exist on the local instance but not on remote platform
@@ -262,34 +273,13 @@
             archive_harvested_dataset(dataset, reason='not-on-remote', dryrun=self.dryrun)
             # add a HarvestItem to the job list (useful for report)
             # even when archiving has already been done (useful for debug)
-
-
-
-
-
-        self.job.save()
+            self.job.items.append(HarvestItem(
+                remote_id=str(dataset.harvest.remote_id),
+                dataset=dataset,
+                status='archived'
+            ))
 
-
-        raise NotImplementedError
-
-    def add_item(self, identifier, *args, **kwargs):
-        item = HarvestItem(remote_id=str(identifier), args=args, kwargs=kwargs)
-        self.job.items.append(item)
-        return item
-
-    def finalize(self):
-        if self.source.autoarchive:
-            self.autoarchive()
-        self.job.status = 'done'
-        if any(i.status == 'failed' for i in self.job.items):
-            self.job.status += '-errors'
-        self.end()
-
-    def end(self):
-        self.job.ended = datetime.utcnow()
-        if not self.dryrun:
-            self.job.save()
-        after_harvest_job.send(self)
+        self.save_job()
 
     def get_dataset(self, remote_id):
         '''Get or create a dataset given its remote ID (and its source)
@@ -352,4 +342,4 @@
         msg = str(error)
         errors.append(msg)
         msg = '\n- '.join(['Validation error:'] + errors)
-        raise HarvestValidationError(msg)
+        raise HarvestValidationError(msg)
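With this rework, a concrete backend only implements `inner_harvest()` and `inner_process_dataset()`; `harvest()` and `process_dataset()` in `BaseBackend` now own job creation, `HarvestItem` bookkeeping, error handling and the `save_job()`/`end_job()` lifecycle. A hedged sketch of a custom backend written against this contract (the JSON catalog and its field names are invented for illustration; `process_dataset`, `is_done`, `get_dataset` and the backend's `get()` HTTP helper are used as in the diff above):

from udata.core.dataset.models import Dataset
from udata.harvest.backends.base import BaseBackend
from udata.harvest.models import HarvestItem


class ExampleJsonBackend(BaseBackend):
    '''Hypothetical backend for a JSON catalog, shown only to illustrate the new hooks.'''
    display_name = 'Example JSON'

    def inner_harvest(self):
        # Enumerate remote records; process_dataset() creates the HarvestItem,
        # wraps errors and applies update_harvest_info() on the returned dataset.
        for record in self.get(self.source.url).json():
            self.process_dataset(record['id'], record=record)
            if self.is_done():  # stop once max_items is reached
                return

    def inner_process_dataset(self, item: HarvestItem, record: dict) -> Dataset:
        dataset = self.get_dataset(item.remote_id)
        dataset.title = record['title']
        dataset.description = record.get('description', '')
        return dataset
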
udata/harvest/backends/dcat.py
CHANGED

@@ -7,13 +7,15 @@ import boto3
 from flask import current_app
 from datetime import date
 import json
-from typing import List
+from typing import Generator, List
 
+from udata.core.dataset.models import Dataset
 from udata.rdf import (
     DCAT, DCT, HYDRA, SPDX, namespace_manager, guess_format, url_from_rdf
 )
 from udata.core.dataset.rdf import dataset_from_rdf
 from udata.storage.s3 import store_as_json, get_from_json
+from udata.harvest.models import HarvestItem
 
 from .base import BaseBackend
 
@@ -59,14 +61,17 @@ def extract_graph(source, target, node, specs):
 class DcatBackend(BaseBackend):
     display_name = 'DCAT'
 
-    def
-        '''List all datasets for a given ...'''
+    def inner_harvest(self):
         fmt = self.get_format()
-        graphs = self.parse_graph(self.source.url, fmt)
-
         self.job.data = { 'format': fmt }
 
-        serialized_graphs = [
+        serialized_graphs = []
+
+        for page_number, page in self.walk_graph(self.source.url, fmt):
+            self.process_one_datasets_page(page_number, page)
+            serialized_graphs.append(page.serialize(format=fmt, indent=None))
+
+        # TODO call `walk_graph` with `process_dataservices`
 
         # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000).
         max_harvest_graph_size_in_mongo = current_app.config.get('HARVEST_MAX_CATALOG_SIZE_IN_MONGO')
@@ -105,13 +110,11 @@
             raise ValueError(msg)
         return fmt
 
-    def
+    def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
         """
-
-        The index in the list is the page number
+        Yield all RDF pages as `Graph` from the source
         """
-
-        page = 0
+        page_number = 0
         while url:
             subgraph = Graph(namespace_manager=namespace_manager)
             response = self.get(url)
@@ -128,19 +131,26 @@
                 pagination = subgraph.resource(pagination)
                 url = url_from_rdf(pagination, prop)
                 break
-            graphs.append(subgraph)
 
-
-
-
-            self.add_item(id, **kwargs)
-            if self.max_items and len(self.job.items) >= self.max_items:
-                # this will stop iterating on pagination
-                url = None
+            yield page_number, subgraph
+            if self.is_done():
+                return
 
-
+            page_number += 1
+
+    def process_one_datasets_page(self, page_number: int, page: Graph):
+        for node in page.subjects(RDF.type, DCAT.Dataset):
+            remote_id = page.value(node, DCT.identifier)
+            self.process_dataset(remote_id, page_number=page_number, page=page, node=node)
 
-
+            if self.is_done():
+                return
+
+    def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
+        item.kwargs['page_number'] = page_number
+
+        dataset = self.get_dataset(item.remote_id)
+        return dataset_from_rdf(page, dataset, node=node)
 
     def get_node_from_item(self, graph, item):
         for node in graph.subjects(RDF.type, DCAT.Dataset):
@@ -148,33 +158,6 @@
             return node
         raise ValueError(f'Unable to find dataset with DCT.identifier:{item.remote_id}')
 
-    def process(self, item):
-        if item.remote_id == 'None':
-            raise ValueError('The DCT.identifier is missing on this DCAT.Dataset record')
-        graph = Graph(namespace_manager=namespace_manager)
-
-        if self.job.data.get('graphs') is not None:
-            graphs = self.job.data['graphs']
-        else:
-            bucket = current_app.config.get('HARVEST_GRAPHS_S3_BUCKET')
-            if bucket is None:
-                raise ValueError(f"No bucket configured but the harvest job item {item.id} on job {self.job.id} doesn't have a graph in MongoDB.")
-
-            graphs = get_from_json(bucket, self.job.data['filename'])
-            if graphs is None:
-                raise ValueError(f"The file '{self.job.data['filename']}' is missing in S3 bucket '{bucket}'")
-
-        data = graphs[item.kwargs['page']]
-        format = self.job.data['format']
-
-        graph.parse(data=bytes(data, encoding='utf8'), format=format)
-        node = self.get_node_from_item(graph, item)
-
-        dataset = self.get_dataset(item.remote_id)
-        dataset = dataset_from_rdf(graph, dataset, node=node)
-        return dataset
-
-
     def next_record_if_should_continue(self, start, search_results):
         next_record = int(search_results.attrib['nextRecord'])
         matched_count = int(search_results.attrib['numberOfRecordsMatched'])
@@ -209,7 +192,10 @@ class CswDcatBackend(DcatBackend):
 
     DCAT_SCHEMA = 'http://www.w3.org/ns/dcat#'
 
-    def
+    def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
+        """
+        Yield all RDF pages as `Graph` from the source
+        """
         body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
                                 xmlns:gmd="http://www.isotc211.org/2005/gmd"
                                 service="CSW" version="2.0.2" resultType="results"
@@ -227,8 +213,7 @@
             </csw:GetRecords>'''
         headers = {'Content-Type': 'application/xml'}
 
-
-        page = 0
+        page_number = 0
         start = 1
 
         response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
@@ -239,37 +224,29 @@
         if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
             raise ValueError(f'Failed to query CSW:\n{content}')
         while tree:
-            graph = Graph(namespace_manager=namespace_manager)
             search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
             if search_results is None:
-                log.error(f'No search results found for {url} on page {
+                log.error(f'No search results found for {url} on page {page_number}')
                 break
             for child in search_results:
                 subgraph = Graph(namespace_manager=namespace_manager)
                 subgraph.parse(data=ET.tostring(child), format=fmt)
-                graph += subgraph
 
-
-
-
-                kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
-                self.add_item(id, **kwargs)
-            graphs.append(graph)
+                yield page_number, subgraph
+                if self.is_done():
+                    return
 
             next_record = self.next_record_if_should_continue(start, search_results)
             if not next_record:
                 break
 
             start = next_record
-
+            page_number += 1
 
             tree = ET.fromstring(
                 self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
                           headers=headers).content)
 
-        return graphs
-
-
 
 class CswIso19139DcatBackend(DcatBackend):
     '''
@@ -283,18 +260,16 @@ class CswIso19139DcatBackend(DcatBackend):
 
     XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"
 
-    def
-
-
-
-        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
-        '''
-
+    def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
+        """
+        Yield all RDF pages as `Graph` from the source
+        """
         # Load XSLT
         xsl = ET.fromstring(self.get(self.XSL_URL).content)
        transform = ET.XSLT(xsl)
 
         # Start querying and parsing graph
+        # Filter on dataset or serie records
         body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
                                 xmlns:gmd="http://www.isotc211.org/2005/gmd"
                                 service="CSW" version="2.0.2" resultType="results"
@@ -304,22 +279,27 @@
             <csw:ElementSetName>full</csw:ElementSetName>
             <csw:Constraint version="1.1.0">
                 <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
-                    <ogc:
-                    <ogc:
-
-
+                    <ogc:Or xmlns:ogc="http://www.opengis.net/ogc">
+                        <ogc:PropertyIsEqualTo>
+                            <ogc:PropertyName>dc:type</ogc:PropertyName>
+                            <ogc:Literal>dataset</ogc:Literal>
+                        </ogc:PropertyIsEqualTo>
+                        <ogc:PropertyIsEqualTo>
+                            <ogc:PropertyName>dc:type</ogc:PropertyName>
+                            <ogc:Literal>series</ogc:Literal>
+                        </ogc:PropertyIsEqualTo>
+                    </ogc:Or>
                 </ogc:Filter>
             </csw:Constraint>
         </csw:Query>
     </csw:GetRecords>'''
         headers = {'Content-Type': 'application/xml'}
 
-
-        page = 0
+        page_number = 0
         start = 1
 
         response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
-
+                             headers=headers)
         response.raise_for_status()
 
         tree_before_transform = ET.fromstring(response.content)
@@ -332,7 +312,7 @@
             # infos (useful for pagination)
             search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
             if search_results is None:
-                log.error(f'No search results found for {url} on page {
+                log.error(f'No search results found for {url} on page {page_number}')
                 break
 
             subgraph = Graph(namespace_manager=namespace_manager)
@@ -341,25 +321,20 @@
             if not subgraph.subjects(RDF.type, DCAT.Dataset):
                 raise ValueError("Failed to fetch CSW content")
 
-
-
-
-            kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
-            self.add_item(id, **kwargs)
-            graphs.append(subgraph)
+            yield page_number, subgraph
+            if self.is_done():
+                return
 
             next_record = self.next_record_if_should_continue(start, search_results)
             if not next_record:
                 break
-
+
             start = next_record
-
+            page_number += 1
 
             response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
-
+                                 headers=headers)
             response.raise_for_status()
 
             tree_before_transform = ET.fromstring(response.content)
             tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
-
-        return graphs
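`walk_graph()` replaces the old list-building parsing step: each backend now yields `(page_number, Graph)` pairs so `inner_harvest()` can process a page as soon as it is fetched and stop early once `is_done()` reports that `max_items` has been reached. For a catalog exposed as a single, non-paginated RDF document, an override can stay very small; a sketch under that assumption (the `SingleFileDcatBackend` subclass is invented for illustration, the other names come from the diff above):

from typing import Generator

from rdflib import Graph

from udata.harvest.backends.dcat import DcatBackend
from udata.rdf import namespace_manager


class SingleFileDcatBackend(DcatBackend):
    '''Hypothetical DCAT variant for catalogs served as one non-paginated file.'''
    display_name = 'DCAT (single file)'

    def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
        # Fetch and parse the whole catalog once, then yield it as page 0;
        # inner_harvest() still serializes it and runs process_one_datasets_page().
        graph = Graph(namespace_manager=namespace_manager)
        graph.parse(data=self.get(url).text, format=fmt)
        yield 0, graph
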
udata/harvest/tasks.py
CHANGED

@@ -18,20 +18,10 @@ def harvest(self, ident)
         return  # Ignore deleted sources
     Backend = backends.get(current_app, source.backend)
     backend = Backend(source)
-    items = backend.perform_initialization()
-    if items is None:
-        pass
-    elif items == 0:
-        backend.finalize()
-    else:
-        finalize = harvest_job_finalize.s(backend.job.id)
-        items = [
-            harvest_job_item.s(backend.job.id, item.remote_id)
-            for item in backend.job.items
-        ]
-        chord(items)(finalize)
-
 
+    backend.harvest()
+
+
 
 @task(ignore_result=False, route='low.harvest')
 def harvest_job_item(job_id, item_id):
udata/harvest/tests/dcat/bnodes.xml
CHANGED

@@ -7,6 +7,7 @@
   xmlns:dct="http://purl.org/dc/terms/"
   xmlns:ogc="http://www.opengis.net/ogc"
   xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
+  xmlns:locn="http://www.w3.org/ns/locn#"
   xmlns:dcterms="http://purl.org/dc/terms/"
   xmlns:vcard="http://www.w3.org/2006/vcard/ns#"
   xmlns:schema="http://schema.org/"
@@ -89,8 +90,16 @@
     <dcterms:title>Dataset 2</dcterms:title>
     <dct:spatial>
       <ogc:Polygon>
+        <locn:geometry rdf:datatype="https://www.iana.org/assignments/media-types/application/vnd.geo+json"><![CDATA[{"type":"Polygon","coordinates":[[[-6,51],[10,51],[10,40],[-6,40],[-6,51]]]}]]></locn:geometry>
         <geo:asWKT rdf:datatype="http://www.opengis.net/rdf#wktLiteral">
-          Polygon((
+          Polygon((159 -25, 159 -11, 212 -11, 212 -25, 159 -25))
+        </geo:asWKT>
+        <geo:asWKT rdf:datatype="http://www.opengis.net/rdf#wktLiteral">
+          Polygon((4 45, 4 46, 4 46, 4 45, 4 45))
+        </geo:asWKT>
+        <locn:geometry rdf:datatype="https://www.iana.org/assignments/media-types/application/vnd.geo+json"><![CDATA[{"type":"Polygon","coordinates":[[[4, 45], [4, 46], [4, 46], [4, 45], [4, 45]]]}]]></locn:geometry>
+        <geo:asWKT rdf:datatype="http://www.opengis.net/rdf#wktLiteral">
+          Polygon((159 -25, 159 -11, 212 -11, 212 -25, 159 -25))
         </geo:asWKT>
       </ogc:Polygon>
     </dct:spatial>

udata/harvest/tests/dcat/catalog.xml
CHANGED

@@ -23,6 +23,7 @@
     <dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T18:59:02.737480</dcterms:issued>
     <dcterms:description>Dataset 3 description</dcterms:description>
     <dcat:keyword>Tag 1</dcat:keyword>
+    <dcat:theme rdf:resource="http://data.europa.eu/bna/c_dd313021"/>
     <dcat:distribution rdf:resource="datasets/3/resources/1"/>
     <dct:license>Licence Ouverte Version 2.0</dct:license>
     <dct:accessRights rdf:resource="http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/INSPIRE_Directive_Article13_1e"/>