udata 7.0.8.dev28841__py2.py3-none-any.whl → 9.0.1.dev29390__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of udata might be problematic.

Files changed (73)
  1. udata/__init__.py +1 -1
  2. udata/api/__init__.py +6 -4
  3. udata/api/oauth2.py +2 -1
  4. udata/api_fields.py +254 -0
  5. udata/commands/purge.py +8 -2
  6. udata/core/badges/models.py +2 -1
  7. udata/core/dataservices/__init__.py +0 -0
  8. udata/core/dataservices/api.py +92 -0
  9. udata/core/dataservices/models.py +142 -0
  10. udata/core/dataservices/permissions.py +7 -0
  11. udata/core/dataservices/tasks.py +25 -0
  12. udata/core/dataset/apiv2.py +2 -0
  13. udata/core/dataset/csv.py +8 -1
  14. udata/core/dataset/models.py +1 -0
  15. udata/core/dataset/rdf.py +77 -15
  16. udata/core/metrics/commands.py +18 -3
  17. udata/core/metrics/models.py +2 -3
  18. udata/core/organization/api_fields.py +28 -3
  19. udata/core/organization/csv.py +5 -3
  20. udata/core/organization/models.py +3 -1
  21. udata/core/owned.py +39 -2
  22. udata/core/reuse/csv.py +3 -0
  23. udata/core/site/api.py +4 -1
  24. udata/core/spatial/api.py +5 -10
  25. udata/core/spatial/models.py +7 -2
  26. udata/core/spatial/tasks.py +7 -0
  27. udata/core/spatial/tests/test_api.py +26 -0
  28. udata/core/user/api.py +11 -7
  29. udata/core/user/models.py +13 -2
  30. udata/harvest/backends/base.py +93 -103
  31. udata/harvest/backends/dcat.py +65 -90
  32. udata/harvest/tasks.py +3 -13
  33. udata/harvest/tests/dcat/bnodes.xml +10 -1
  34. udata/harvest/tests/dcat/catalog.xml +1 -0
  35. udata/harvest/tests/factories.py +13 -6
  36. udata/harvest/tests/test_actions.py +2 -2
  37. udata/harvest/tests/test_base_backend.py +9 -5
  38. udata/harvest/tests/test_dcat_backend.py +17 -1
  39. udata/rdf.py +4 -0
  40. udata/routing.py +6 -0
  41. udata/settings.py +4 -1
  42. udata/static/admin.css +2 -2
  43. udata/static/admin.css.map +1 -1
  44. udata/static/chunks/{0.6f1698738c9b0618b673.js → 0.93c3ae13b5b94753ee80.js} +3 -3
  45. udata/static/chunks/0.93c3ae13b5b94753ee80.js.map +1 -0
  46. udata/static/chunks/{14.f4037a917d5364cb564b.js → 14.e64890872b31c55fcdf7.js} +2 -2
  47. udata/static/chunks/14.e64890872b31c55fcdf7.js.map +1 -0
  48. udata/static/chunks/{2.7c89fae92899be371ed3.js → 2.614b3e73b072982fd9b1.js} +2 -2
  49. udata/static/chunks/2.614b3e73b072982fd9b1.js.map +1 -0
  50. udata/static/chunks/{5.3dc97ea195d251881552.js → 5.48417db6b33328fa9d6a.js} +2 -2
  51. udata/static/chunks/5.48417db6b33328fa9d6a.js.map +1 -0
  52. udata/static/common.js +1 -1
  53. udata/static/common.js.map +1 -1
  54. udata/tasks.py +1 -0
  55. udata/tests/api/__init__.py +3 -0
  56. udata/tests/api/test_dataservices_api.py +236 -0
  57. udata/tests/api/test_organizations_api.py +78 -5
  58. udata/tests/api/test_user_api.py +47 -13
  59. udata/tests/dataservice/test_dataservice_tasks.py +46 -0
  60. udata/tests/dataset/test_dataset_rdf.py +17 -2
  61. udata/tests/plugin.py +5 -0
  62. udata/tests/site/test_site_rdf.py +16 -0
  63. {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/METADATA +27 -1
  64. {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/RECORD +68 -60
  65. udata/core/metrics/api.py +0 -10
  66. udata/static/chunks/0.6f1698738c9b0618b673.js.map +0 -1
  67. udata/static/chunks/14.f4037a917d5364cb564b.js.map +0 -1
  68. udata/static/chunks/2.7c89fae92899be371ed3.js.map +0 -1
  69. udata/static/chunks/5.3dc97ea195d251881552.js.map +0 -1
  70. {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/LICENSE +0 -0
  71. {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/WHEEL +0 -0
  72. {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/entry_points.txt +0 -0
  73. {udata-7.0.8.dev28841.dist-info → udata-9.0.1.dev29390.dist-info}/top_level.txt +0 -0
udata/core/user/models.py CHANGED
@@ -13,6 +13,7 @@ from mongoengine.signals import pre_save, post_save
  from werkzeug.utils import cached_property
 
  from udata import mail
+ from udata.core import storages
  from udata.uris import endpoint_for
  from udata.frontend.markdown import mdstrip
  from udata.i18n import lazy_gettext as _
@@ -233,7 +234,15 @@ class User(WithMetrics, UserMixin, db.Document):
  raise NotImplementedError('''This method should not be using directly.
  Use `mark_as_deleted` (or `_delete` if you know what you're doing)''')
 
- def mark_as_deleted(self):
+ def mark_as_deleted(self, notify: bool = True):
+ if self.avatar.filename is not None:
+ storage = storages.avatars
+ storage.delete(self.avatar.filename)
+ storage.delete(self.avatar.original)
+ for key, value in self.avatar.thumbnails.items():
+ storage.delete(value)
+
+
  copied_user = copy(self)
  self.email = '{}@deleted'.format(self.id)
  self.slug = 'deleted'
@@ -270,7 +279,9 @@ class User(WithMetrics, UserMixin, db.Document):
  from udata.models import ContactPoint
  ContactPoint.objects(owner=self).delete()
 
- mail.send(_('Account deletion'), copied_user, 'account_deleted')
+
+ if notify:
+ mail.send(_('Account deletion'), copied_user, 'account_deleted')
 
  def count_datasets(self):
  from udata.models import Dataset
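
Illustrative only: a minimal sketch (not from this diff) of how the new `mark_as_deleted` signature above might be called. The bulk query and e-mail filter are invented; it assumes `User` is importable from `udata.models` like the other models referenced in this diff.

    from udata.models import User  # assumed facade import, as for Dataset/ContactPoint above

    # Hypothetical bulk purge script (not part of this release):
    for user in User.objects(email__endswith='@spam.example'):
        # Avatar files (original + thumbnails) are now deleted in all cases;
        # notify=False only skips the 'Account deletion' e-mail.
        user.mark_as_deleted(notify=False)
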
udata/harvest/backends/base.py CHANGED
@@ -2,6 +2,7 @@ import logging
  import traceback
 
  from datetime import datetime, date, timedelta
+ from typing import Optional
  from uuid import UUID
 
  import requests
@@ -68,7 +69,12 @@ class HarvestFeature(object):
 
 
  class BaseBackend(object):
- '''Base class for Harvester implementations'''
+ """
+ Base class that wrap children methods to add error management and debug logs.
+ Also provides a few helpers needed on all or some backends.
+
+
+ """
 
  name = None
  display_name = None
@@ -127,92 +133,69 @@ class BaseBackend(object):
  def get_filters(self):
  return self.config.get('filters', [])
 
- def harvest(self):
- '''Start the harvesting process'''
- if self.perform_initialization() is not None:
- self.process_items()
- self.finalize()
- return self.job
+ def inner_harvest(self):
+ raise NotImplementedError
+
+ def inner_process_dataset(self, item: HarvestItem) -> Dataset:
+ raise NotImplementedError
 
- def perform_initialization(self):
- '''Initialize the harvesting for a given job'''
- log.debug('Initializing backend')
+ def harvest(self):
+ log.debug(f'Starting harvesting {self.source.name} ({self.source.url})…')
  factory = HarvestJob if self.dryrun else HarvestJob.objects.create
- self.job = factory(status='initializing',
+ self.job = factory(status='initialized',
  started=datetime.utcnow(),
  source=self.source)
 
  before_harvest_job.send(self)
 
  try:
- self.initialize()
- self.job.status = 'initialized'
- if not self.dryrun:
- self.job.save()
+ self.inner_harvest()
+
+ if self.source.autoarchive:
+ self.autoarchive()
+
+ self.job.status = 'done'
+
+ if any(i.status == 'failed' for i in self.job.items):
+ self.job.status += '-errors'
  except HarvestValidationError as e:
- log.info('Initialization failed for "%s" (%s)',
- safe_unicode(self.source.name), self.source.backend)
- error = HarvestError(message=safe_unicode(e))
- self.job.errors.append(error)
- self.job.status = 'failed'
- self.end()
- return None
- except Exception as e:
+ log.exception(f'Harvesting validation failed for "{safe_unicode(self.source.name)}" ({self.source.backend})')
+
  self.job.status = 'failed'
+
  error = HarvestError(message=safe_unicode(e))
  self.job.errors.append(error)
- self.end()
- msg = 'Initialization failed for "{0.name}" ({0.backend})'
- log.exception(msg.format(self.source))
- return None
+ except Exception as e:
+ log.exception(f'Harvesting failed for "{safe_unicode(self.source.name)}" ({self.source.backend})')
 
- if self.max_items:
- self.job.items = self.job.items[:self.max_items]
+ self.job.status = 'failed'
 
- if self.job.items:
- log.debug('Queued %s items', len(self.job.items))
+ error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
+ self.job.errors.append(error)
+ finally:
+ self.end_job()
+
+ return self.job
 
- return len(self.job.items)
+ def process_dataset(self, remote_id: str, **kwargs):
+ log.debug(f'Processing dataset {remote_id}…')
 
- def initialize(self):
- raise NotImplementedError
+ # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice`
+ item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id)
+ self.job.items.append(item)
+ self.save_job()
 
- def process_items(self):
- '''Process the data identified in the initialize stage'''
- for item in self.job.items:
- self.process_item(item)
+ try:
+ if not remote_id:
+ raise HarvestSkipException("missing identifier")
 
- def process_item(self, item):
- log.debug('Processing: %s', item.remote_id)
- item.status = 'started'
- item.started = datetime.utcnow()
- if not self.dryrun:
- self.job.save()
+ dataset = self.inner_process_dataset(item, **kwargs)
 
- try:
- dataset = self.process(item)
- if not dataset.harvest:
- dataset.harvest = HarvestDatasetMetadata()
- dataset.harvest.domain = self.source.domain
- dataset.harvest.remote_id = item.remote_id
- dataset.harvest.source_id = str(self.source.id)
- dataset.harvest.last_update = datetime.utcnow()
- dataset.harvest.backend = self.display_name
-
- # unset archived status if needed
- if dataset.harvest:
- dataset.harvest.archived_at = None
- dataset.harvest.archived = None
+ # Use `item.remote_id` because `inner_process_dataset` could have modified it.
+ dataset.harvest = self.update_harvest_info(dataset.harvest, item.remote_id)
  dataset.archived = None
 
- # TODO permissions checking
- if not dataset.organization and not dataset.owner:
- if self.source.organization:
- dataset.organization = self.source.organization
- elif self.source.owner:
- dataset.owner = self.source.owner
-
- # TODO: Apply editble mappings
+ # TODO: Apply editable mappings
 
  if self.dryrun:
  dataset.validate()
@@ -221,26 +204,54 @@ class BaseBackend(object):
  item.dataset = dataset
  item.status = 'done'
  except HarvestSkipException as e:
- log.info('Skipped item %s : %s', item.remote_id, safe_unicode(e))
  item.status = 'skipped'
+
+ log.info(f'Skipped item {item.remote_id} : {safe_unicode(e)}')
  item.errors.append(HarvestError(message=safe_unicode(e)))
  except HarvestValidationError as e:
- log.info('Error validating item %s : %s', item.remote_id, safe_unicode(e))
  item.status = 'failed'
+
+ log.info(f'Error validating item {item.remote_id} : {safe_unicode(e)}')
  item.errors.append(HarvestError(message=safe_unicode(e)))
  except Exception as e:
- log.exception('Error while processing %s : %s',
- item.remote_id,
- safe_unicode(e))
- error = HarvestError(message=safe_unicode(e),
- details=traceback.format_exc())
- item.errors.append(error)
  item.status = 'failed'
+ log.exception(f'Error while processing {item.remote_id} : {safe_unicode(e)}')
+
+ error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
+ item.errors.append(error)
+ finally:
+ item.ended = datetime.utcnow()
+ self.save_job()
+
+ def is_done(self) -> bool:
+ '''Should be called after process_dataset to know if we reach the max items'''
+ return self.max_items and len(self.job.items) >= self.max_items
+
+ def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int):
+ if not harvest:
+ harvest = HarvestDatasetMetadata()
+ harvest.domain = self.source.domain
+ harvest.remote_id = remote_id
+ harvest.source_id = str(self.source.id)
+ harvest.last_update = datetime.utcnow()
+ harvest.backend = self.display_name
 
- item.ended = datetime.utcnow()
+ harvest.archived_at = None
+ harvest.archived = None
+
+ return harvest
+
+ def save_job(self):
+ if not self.dryrun:
+ self.job.save()
+
+ def end_job(self):
+ self.job.ended = datetime.utcnow()
  if not self.dryrun:
  self.job.save()
 
+ after_harvest_job.send(self)
+
  def autoarchive(self):
  '''
  Archive items that exist on the local instance but not on remote platform
@@ -262,34 +273,13 @@ class BaseBackend(object):
  archive_harvested_dataset(dataset, reason='not-on-remote', dryrun=self.dryrun)
  # add a HarvestItem to the job list (useful for report)
  # even when archiving has already been done (useful for debug)
- item = self.add_item(dataset.harvest.remote_id)
- item.dataset = dataset
- item.status = 'archived'
-
- if not self.dryrun:
- self.job.save()
+ self.job.items.append(HarvestItem(
+ remote_id=str(dataset.harvest.remote_id),
+ dataset=dataset,
+ status='archived'
+ ))
 
- def process(self, item):
- raise NotImplementedError
-
- def add_item(self, identifier, *args, **kwargs):
- item = HarvestItem(remote_id=str(identifier), args=args, kwargs=kwargs)
- self.job.items.append(item)
- return item
-
- def finalize(self):
- if self.source.autoarchive:
- self.autoarchive()
- self.job.status = 'done'
- if any(i.status == 'failed' for i in self.job.items):
- self.job.status += '-errors'
- self.end()
-
- def end(self):
- self.job.ended = datetime.utcnow()
- if not self.dryrun:
- self.job.save()
- after_harvest_job.send(self)
+ self.save_job()
 
  def get_dataset(self, remote_id):
  '''Get or create a dataset given its remote ID (and its source)
@@ -352,4 +342,4 @@ class BaseBackend(object):
  msg = str(error)
  errors.append(msg)
  msg = '\n- '.join(['Validation error:'] + errors)
- raise HarvestValidationError(msg)
+ raise HarvestValidationError(msg)
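
For orientation, a hedged sketch of what a custom backend could look like against the refactored hooks above (`inner_harvest`, `inner_process_dataset`, plus the `process_dataset`, `is_done` and `get_dataset` helpers). The JSON endpoint and its `id`/`title` fields are invented for illustration; only the hook and helper names come from this diff.

    from udata.core.dataset.models import Dataset
    from udata.harvest.backends.base import BaseBackend
    from udata.harvest.models import HarvestItem


    class ExampleJsonBackend(BaseBackend):
        '''Hypothetical backend: job state, error handling and saving stay in the base class.'''
        name = 'example-json'
        display_name = 'Example JSON'

        def inner_harvest(self):
            # Enumerate remote records; `process_dataset` creates the HarvestItem
            # and wraps `inner_process_dataset` with error management.
            for record in self.get(self.source.url).json():  # invented payload shape
                self.process_dataset(record['id'], record=record)
                if self.is_done():  # stop once `max_items` is reached
                    return

        def inner_process_dataset(self, item: HarvestItem, record: dict) -> Dataset:
            dataset = self.get_dataset(item.remote_id)
            dataset.title = record['title']
            # Harvest metadata (domain, remote_id, source_id…) is filled in
            # afterwards by `update_harvest_info` via `process_dataset`.
            return dataset
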
udata/harvest/backends/dcat.py CHANGED
@@ -7,13 +7,15 @@ import boto3
  from flask import current_app
  from datetime import date
  import json
- from typing import List
+ from typing import Generator, List
 
+ from udata.core.dataset.models import Dataset
  from udata.rdf import (
  DCAT, DCT, HYDRA, SPDX, namespace_manager, guess_format, url_from_rdf
  )
  from udata.core.dataset.rdf import dataset_from_rdf
  from udata.storage.s3 import store_as_json, get_from_json
+ from udata.harvest.models import HarvestItem
 
  from .base import BaseBackend
 
@@ -59,14 +61,17 @@ def extract_graph(source, target, node, specs):
  class DcatBackend(BaseBackend):
  display_name = 'DCAT'
 
- def initialize(self):
- '''List all datasets for a given ...'''
+ def inner_harvest(self):
  fmt = self.get_format()
- graphs = self.parse_graph(self.source.url, fmt)
-
  self.job.data = { 'format': fmt }
 
- serialized_graphs = [graph.serialize(format=fmt, indent=None) for graph in graphs]
+ serialized_graphs = []
+
+ for page_number, page in self.walk_graph(self.source.url, fmt):
+ self.process_one_datasets_page(page_number, page)
+ serialized_graphs.append(page.serialize(format=fmt, indent=None))
+
+ # TODO call `walk_graph` with `process_dataservices`
 
  # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000).
  max_harvest_graph_size_in_mongo = current_app.config.get('HARVEST_MAX_CATALOG_SIZE_IN_MONGO')
@@ -105,13 +110,11 @@ class DcatBackend(BaseBackend):
  raise ValueError(msg)
  return fmt
 
- def parse_graph(self, url, fmt) -> List[Graph]:
+ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
  """
- Returns an instance of rdflib.Graph for each detected page
- The index in the list is the page number
+ Yield all RDF pages as `Graph` from the source
  """
- graphs = []
- page = 0
+ page_number = 0
  while url:
  subgraph = Graph(namespace_manager=namespace_manager)
  response = self.get(url)
@@ -128,19 +131,26 @@ class DcatBackend(BaseBackend):
  pagination = subgraph.resource(pagination)
  url = url_from_rdf(pagination, prop)
  break
- graphs.append(subgraph)
 
- for node in subgraph.subjects(RDF.type, DCAT.Dataset):
- id = subgraph.value(node, DCT.identifier)
- kwargs = {'page': page}
- self.add_item(id, **kwargs)
- if self.max_items and len(self.job.items) >= self.max_items:
- # this will stop iterating on pagination
- url = None
+ yield page_number, subgraph
+ if self.is_done():
+ return
 
- page += 1
+ page_number += 1
+
+ def process_one_datasets_page(self, page_number: int, page: Graph):
+ for node in page.subjects(RDF.type, DCAT.Dataset):
+ remote_id = page.value(node, DCT.identifier)
+ self.process_dataset(remote_id, page_number=page_number, page=page, node=node)
 
- return graphs
+ if self.is_done():
+ return
+
+ def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
+ item.kwargs['page_number'] = page_number
+
+ dataset = self.get_dataset(item.remote_id)
+ return dataset_from_rdf(page, dataset, node=node)
 
  def get_node_from_item(self, graph, item):
  for node in graph.subjects(RDF.type, DCAT.Dataset):
@@ -148,33 +158,6 @@ class DcatBackend(BaseBackend):
  return node
  raise ValueError(f'Unable to find dataset with DCT.identifier:{item.remote_id}')
 
- def process(self, item):
- if item.remote_id == 'None':
- raise ValueError('The DCT.identifier is missing on this DCAT.Dataset record')
- graph = Graph(namespace_manager=namespace_manager)
-
- if self.job.data.get('graphs') is not None:
- graphs = self.job.data['graphs']
- else:
- bucket = current_app.config.get('HARVEST_GRAPHS_S3_BUCKET')
- if bucket is None:
- raise ValueError(f"No bucket configured but the harvest job item {item.id} on job {self.job.id} doesn't have a graph in MongoDB.")
-
- graphs = get_from_json(bucket, self.job.data['filename'])
- if graphs is None:
- raise ValueError(f"The file '{self.job.data['filename']}' is missing in S3 bucket '{bucket}'")
-
- data = graphs[item.kwargs['page']]
- format = self.job.data['format']
-
- graph.parse(data=bytes(data, encoding='utf8'), format=format)
- node = self.get_node_from_item(graph, item)
-
- dataset = self.get_dataset(item.remote_id)
- dataset = dataset_from_rdf(graph, dataset, node=node)
- return dataset
-
-
  def next_record_if_should_continue(self, start, search_results):
  next_record = int(search_results.attrib['nextRecord'])
  matched_count = int(search_results.attrib['numberOfRecordsMatched'])
@@ -209,7 +192,10 @@ class CswDcatBackend(DcatBackend):
 
  DCAT_SCHEMA = 'http://www.w3.org/ns/dcat#'
 
- def parse_graph(self, url: str, fmt: str) -> List[Graph]:
+ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
+ """
+ Yield all RDF pages as `Graph` from the source
+ """
  body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
  xmlns:gmd="http://www.isotc211.org/2005/gmd"
  service="CSW" version="2.0.2" resultType="results"
@@ -227,8 +213,7 @@ class CswDcatBackend(DcatBackend):
  </csw:GetRecords>'''
  headers = {'Content-Type': 'application/xml'}
 
- graphs = []
- page = 0
+ page_number = 0
  start = 1
 
  response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
@@ -239,37 +224,29 @@ class CswDcatBackend(DcatBackend):
  if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
  raise ValueError(f'Failed to query CSW:\n{content}')
  while tree:
- graph = Graph(namespace_manager=namespace_manager)
  search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
  if search_results is None:
- log.error(f'No search results found for {url} on page {page}')
+ log.error(f'No search results found for {url} on page {page_number}')
  break
  for child in search_results:
  subgraph = Graph(namespace_manager=namespace_manager)
  subgraph.parse(data=ET.tostring(child), format=fmt)
- graph += subgraph
 
- for node in subgraph.subjects(RDF.type, DCAT.Dataset):
- id = subgraph.value(node, DCT.identifier)
- kwargs = {'nid': str(node), 'page': page}
- kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
- self.add_item(id, **kwargs)
- graphs.append(graph)
+ yield page_number, subgraph
+ if self.is_done():
+ return
 
  next_record = self.next_record_if_should_continue(start, search_results)
  if not next_record:
  break
 
  start = next_record
- page += 1
+ page_number += 1
 
  tree = ET.fromstring(
  self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
  headers=headers).content)
 
- return graphs
-
-
 
  class CswIso19139DcatBackend(DcatBackend):
  '''
@@ -283,18 +260,16 @@ class CswIso19139DcatBackend(DcatBackend):
 
  XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"
 
- def parse_graph(self, url: str, fmt: str) -> List[Graph]:
- '''
- Parse CSW graph querying ISO schema.
- Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
- See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
- '''
-
+ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
+ """
+ Yield all RDF pages as `Graph` from the source
+ """
  # Load XSLT
  xsl = ET.fromstring(self.get(self.XSL_URL).content)
  transform = ET.XSLT(xsl)
 
  # Start querying and parsing graph
+ # Filter on dataset or serie records
  body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
  xmlns:gmd="http://www.isotc211.org/2005/gmd"
  service="CSW" version="2.0.2" resultType="results"
@@ -304,22 +279,27 @@ class CswIso19139DcatBackend(DcatBackend):
  <csw:ElementSetName>full</csw:ElementSetName>
  <csw:Constraint version="1.1.0">
  <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
- <ogc:PropertyIsEqualTo>
- <ogc:PropertyName>dc:type</ogc:PropertyName>
- <ogc:Literal>dataset</ogc:Literal>
- </ogc:PropertyIsEqualTo>
+ <ogc:Or xmlns:ogc="http://www.opengis.net/ogc">
+ <ogc:PropertyIsEqualTo>
+ <ogc:PropertyName>dc:type</ogc:PropertyName>
+ <ogc:Literal>dataset</ogc:Literal>
+ </ogc:PropertyIsEqualTo>
+ <ogc:PropertyIsEqualTo>
+ <ogc:PropertyName>dc:type</ogc:PropertyName>
+ <ogc:Literal>series</ogc:Literal>
+ </ogc:PropertyIsEqualTo>
+ </ogc:Or>
  </ogc:Filter>
  </csw:Constraint>
  </csw:Query>
  </csw:GetRecords>'''
  headers = {'Content-Type': 'application/xml'}
 
- graphs = []
- page = 0
+ page_number = 0
  start = 1
 
  response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
- headers=headers)
+ headers=headers)
  response.raise_for_status()
 
  tree_before_transform = ET.fromstring(response.content)
@@ -332,7 +312,7 @@ class CswIso19139DcatBackend(DcatBackend):
  # infos (useful for pagination)
  search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
  if search_results is None:
- log.error(f'No search results found for {url} on page {page}')
+ log.error(f'No search results found for {url} on page {page_number}')
  break
 
  subgraph = Graph(namespace_manager=namespace_manager)
@@ -341,25 +321,20 @@ class CswIso19139DcatBackend(DcatBackend):
  if not subgraph.subjects(RDF.type, DCAT.Dataset):
  raise ValueError("Failed to fetch CSW content")
 
- for node in subgraph.subjects(RDF.type, DCAT.Dataset):
- id = subgraph.value(node, DCT.identifier)
- kwargs = {'nid': str(node), 'page': page}
- kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
- self.add_item(id, **kwargs)
- graphs.append(subgraph)
+ yield page_number, subgraph
+ if self.is_done():
+ return
 
  next_record = self.next_record_if_should_continue(start, search_results)
  if not next_record:
  break
-
+
  start = next_record
- page += 1
+ page_number += 1
 
  response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
- headers=headers)
+ headers=headers)
  response.raise_for_status()
 
  tree_before_transform = ET.fromstring(response.content)
  tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
-
- return graphs
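
All three `walk_graph` implementations above follow the same contract: yield `(page_number, graph)` pairs and stop as soon as `is_done()` reports that `max_items` has been reached. As a point of reference, a non-paginated source could satisfy that contract roughly as follows; this is a sketch only, and the bare `Graph()` (instead of the shared `namespace_manager`) is a simplification.

    from typing import Generator

    from rdflib import Graph

    from udata.harvest.backends.dcat import DcatBackend


    class SinglePageDcatBackend(DcatBackend):
        '''Hypothetical subclass for a source exposing a single RDF document.'''
        display_name = 'DCAT (single page)'

        def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
            # One page only: parse the document and yield it as page 0;
            # the inherited `inner_harvest` then feeds it to `process_one_datasets_page`.
            graph = Graph()
            graph.parse(data=self.get(url).text, format=fmt)
            yield 0, graph
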
udata/harvest/tasks.py CHANGED
@@ -18,20 +18,10 @@ def harvest(self, ident):
  return # Ignore deleted sources
  Backend = backends.get(current_app, source.backend)
  backend = Backend(source)
- items = backend.perform_initialization()
- if items is None:
- pass
- elif items == 0:
- backend.finalize()
- else:
- finalize = harvest_job_finalize.s(backend.job.id)
- items = [
- harvest_job_item.s(backend.job.id, item.remote_id)
- for item in backend.job.items
- ]
- chord(items)(finalize)
-
 
+ backend.harvest()
+
+
 
  @task(ignore_result=False, route='low.harvest')
  def harvest_job_item(job_id, item_id):
udata/harvest/tests/dcat/bnodes.xml CHANGED
@@ -7,6 +7,7 @@
  xmlns:dct="http://purl.org/dc/terms/"
  xmlns:ogc="http://www.opengis.net/ogc"
  xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
+ xmlns:locn="http://www.w3.org/ns/locn#"
  xmlns:dcterms="http://purl.org/dc/terms/"
  xmlns:vcard="http://www.w3.org/2006/vcard/ns#"
  xmlns:schema="http://schema.org/"
@@ -89,8 +90,16 @@
  <dcterms:title>Dataset 2</dcterms:title>
  <dct:spatial>
  <ogc:Polygon>
+ <locn:geometry rdf:datatype="https://www.iana.org/assignments/media-types/application/vnd.geo+json"><![CDATA[{"type":"Polygon","coordinates":[[[-6,51],[10,51],[10,40],[-6,40],[-6,51]]]}]]></locn:geometry>
  <geo:asWKT rdf:datatype="http://www.opengis.net/rdf#wktLiteral">
- Polygon((4.44641288 45.54214467, 4.44641288 46.01316963, 4.75655252 46.01316963, 4.75655252 45.54214467, 4.44641288 45.54214467))
+ Polygon((159 -25, 159 -11, 212 -11, 212 -25, 159 -25))
+ </geo:asWKT>
+ <geo:asWKT rdf:datatype="http://www.opengis.net/rdf#wktLiteral">
+ Polygon((4 45, 4 46, 4 46, 4 45, 4 45))
+ </geo:asWKT>
+ <locn:geometry rdf:datatype="https://www.iana.org/assignments/media-types/application/vnd.geo+json"><![CDATA[{"type":"Polygon","coordinates":[[[4, 45], [4, 46], [4, 46], [4, 45], [4, 45]]]}]]></locn:geometry>
+ <geo:asWKT rdf:datatype="http://www.opengis.net/rdf#wktLiteral">
+ Polygon((159 -25, 159 -11, 212 -11, 212 -25, 159 -25))
  </geo:asWKT>
  </ogc:Polygon>
  </dct:spatial>
udata/harvest/tests/dcat/catalog.xml CHANGED
@@ -23,6 +23,7 @@
  <dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T18:59:02.737480</dcterms:issued>
  <dcterms:description>Dataset 3 description</dcterms:description>
  <dcat:keyword>Tag 1</dcat:keyword>
+ <dcat:theme rdf:resource="http://data.europa.eu/bna/c_dd313021"/>
  <dcat:distribution rdf:resource="datasets/3/resources/1"/>
  <dct:license>Licence Ouverte Version 2.0</dct:license>
  <dct:accessRights rdf:resource="http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/INSPIRE_Directive_Article13_1e"/>