udata 8.0.2.dev29304__py2.py3-none-any.whl → 9.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of udata might be problematic. Click here for more details.

Files changed (86)
  1. udata/__init__.py +1 -1
  2. udata/api/__init__.py +2 -0
  3. udata/api/commands.py +0 -2
  4. udata/api_fields.py +41 -3
  5. udata/commands/db.py +88 -48
  6. udata/core/dataservices/factories.py +33 -0
  7. udata/core/dataservices/models.py +42 -4
  8. udata/core/dataservices/rdf.py +106 -0
  9. udata/core/dataset/csv.py +8 -1
  10. udata/core/dataset/models.py +1 -2
  11. udata/core/dataset/rdf.py +37 -128
  12. udata/core/discussions/models.py +20 -0
  13. udata/core/organization/csv.py +5 -3
  14. udata/core/reports/__init__.py +0 -0
  15. udata/core/reports/api.py +44 -0
  16. udata/core/reports/constants.py +30 -0
  17. udata/core/reports/models.py +58 -0
  18. udata/core/reuse/csv.py +3 -0
  19. udata/core/site/api.py +33 -2
  20. udata/core/site/rdf.py +6 -1
  21. udata/core/spam/models.py +6 -0
  22. udata/core/topic/models.py +3 -2
  23. udata/core/topic/parsers.py +3 -2
  24. udata/core/user/apiv2.py +28 -0
  25. udata/db/__init__.py +0 -0
  26. udata/db/tasks.py +6 -0
  27. udata/features/notifications/__init__.py +0 -1
  28. udata/forms/fields.py +2 -2
  29. udata/harvest/api.py +19 -1
  30. udata/harvest/backends/base.py +118 -10
  31. udata/harvest/backends/dcat.py +28 -7
  32. udata/harvest/models.py +6 -0
  33. udata/harvest/tests/dcat/bnodes.xml +13 -2
  34. udata/harvest/tests/test_dcat_backend.py +21 -0
  35. udata/migrations/2024-06-11-fix-reuse-datasets-references.py +35 -0
  36. udata/models/__init__.py +1 -0
  37. udata/rdf.py +113 -2
  38. udata/routing.py +1 -1
  39. udata/settings.py +3 -1
  40. udata/static/admin.js +17 -17
  41. udata/static/admin.js.map +1 -1
  42. udata/static/chunks/{18.ad41fb75ac4226e1f3ce.js → 18.1922fd0b2b7fad122991.js} +3 -3
  43. udata/static/chunks/18.1922fd0b2b7fad122991.js.map +1 -0
  44. udata/static/chunks/{7.11ac4de064ae59691d49.js → 7.e2106342e94ee09393b1.js} +2 -2
  45. udata/static/chunks/7.e2106342e94ee09393b1.js.map +1 -0
  46. udata/static/common.js +1 -1
  47. udata/static/common.js.map +1 -1
  48. udata/storage/s3.py +3 -3
  49. udata/tasks.py +1 -0
  50. udata/tests/api/test_dataservices_api.py +26 -2
  51. udata/tests/api/test_datasets_api.py +1 -1
  52. udata/tests/api/test_reports_api.py +87 -0
  53. udata/tests/apiv2/test_me_api.py +40 -0
  54. udata/tests/dataset/test_dataset_rdf.py +19 -1
  55. udata/tests/frontend/test_auth.py +1 -4
  56. udata/tests/organization/test_csv_adapter.py +0 -1
  57. udata/tests/plugin.py +2 -0
  58. udata/tests/site/test_site_api.py +0 -1
  59. udata/tests/site/test_site_rdf.py +66 -0
  60. udata/tests/test_discussions.py +24 -34
  61. udata/tests/test_model.py +3 -2
  62. udata/tests/test_utils.py +1 -1
  63. udata/translations/ar/LC_MESSAGES/udata.mo +0 -0
  64. udata/translations/ar/LC_MESSAGES/udata.po +128 -64
  65. udata/translations/de/LC_MESSAGES/udata.mo +0 -0
  66. udata/translations/de/LC_MESSAGES/udata.po +128 -64
  67. udata/translations/es/LC_MESSAGES/udata.mo +0 -0
  68. udata/translations/es/LC_MESSAGES/udata.po +128 -64
  69. udata/translations/fr/LC_MESSAGES/udata.mo +0 -0
  70. udata/translations/fr/LC_MESSAGES/udata.po +128 -64
  71. udata/translations/it/LC_MESSAGES/udata.mo +0 -0
  72. udata/translations/it/LC_MESSAGES/udata.po +128 -64
  73. udata/translations/pt/LC_MESSAGES/udata.mo +0 -0
  74. udata/translations/pt/LC_MESSAGES/udata.po +128 -64
  75. udata/translations/sr/LC_MESSAGES/udata.mo +0 -0
  76. udata/translations/sr/LC_MESSAGES/udata.po +128 -64
  77. udata/translations/udata.pot +129 -65
  78. udata/uris.py +14 -13
  79. {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/METADATA +26 -7
  80. {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/RECORD +84 -72
  81. udata/static/chunks/18.ad41fb75ac4226e1f3ce.js.map +0 -1
  82. udata/static/chunks/7.11ac4de064ae59691d49.js.map +0 -1
  83. {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/LICENSE +0 -0
  84. {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/WHEEL +0 -0
  85. {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/entry_points.txt +0 -0
  86. {udata-8.0.2.dev29304.dist-info → udata-9.1.0.dist-info}/top_level.txt +0 -0
udata/harvest/api.py CHANGED
@@ -1,9 +1,11 @@
1
+ from bson import ObjectId
1
2
  from werkzeug.exceptions import BadRequest
2
3
  from flask import request
3
4
 
4
5
  from udata.api import api, API, fields
5
6
  from udata.auth import admin_permission
6
7
 
8
+ from udata.core.dataservices.models import Dataservice
7
9
  from udata.core.dataset.api_fields import dataset_ref_fields, dataset_fields
8
10
  from udata.core.organization.api_fields import org_ref_fields
9
11
  from udata.core.organization.permissions import EditOrganizationPermission
@@ -31,12 +33,22 @@ error_fields = api.model('HarvestError', {
31
33
  'details': fields.String(description='Optional details (ie. stacktrace)'),
32
34
  })
33
35
 
36
+
37
+ log_fields = api.model('HarvestError', {
38
+ 'level': fields.String(required=True),
39
+ 'message': fields.String(required=True),
40
+ })
41
+
42
+
34
43
  item_fields = api.model('HarvestItem', {
35
44
  'remote_id': fields.String(description='The item remote ID to process',
36
45
  required=True),
37
46
  'dataset': fields.Nested(dataset_ref_fields,
38
47
  description='The processed dataset',
39
48
  allow_null=True),
49
+ 'dataservice': fields.Nested(Dataservice.__read_fields__,
50
+ description='The processed dataservice',
51
+ allow_null=True),
40
52
  'status': fields.String(description='The item status',
41
53
  required=True,
42
54
  enum=list(HARVEST_ITEM_STATUS)),
@@ -46,6 +58,8 @@ item_fields = api.model('HarvestItem', {
46
58
  'ended': fields.ISODateTime(description='The item end date'),
47
59
  'errors': fields.List(fields.Nested(error_fields),
48
60
  description='The item errors'),
61
+ 'logs': fields.List(fields.Nested(log_fields),
62
+ description='The item logs'),
49
63
  'args': fields.List(fields.String,
50
64
  description='The item positional arguments',
51
65
  default=[]),
@@ -181,6 +195,10 @@ class SourcesAPI(API):
181
195
  def get(self):
182
196
  '''List all harvest sources'''
183
197
  args = source_parser.parse_args()
198
+
199
+ if args.get('owner') and not ObjectId.is_valid(args.get('owner')):
200
+ api.abort(400, '`owner` arg must be an identifier')
201
+
184
202
  return actions.paginate_sources(args.get('owner'),
185
203
  page=args['page'],
186
204
  page_size=args['page_size'],
@@ -256,7 +274,7 @@ class ScheduleSourceAPI(API):
256
274
  # Handle both syntax: quoted and unquoted
257
275
  try:
258
276
  data = request.json
259
- except BadRequest as e:
277
+ except BadRequest:
260
278
  data = request.data.decode('utf-8')
261
279
  return actions.schedule(ident, data)
262
280
 
udata/harvest/backends/base.py CHANGED
@@ -2,20 +2,21 @@ import logging
2
2
  import traceback
3
3
 
4
4
  from datetime import datetime, date, timedelta
5
- from typing import Optional
6
5
  from uuid import UUID
7
6
 
8
7
  import requests
9
8
 
10
9
  from flask import current_app
10
+ from udata.core.dataservices.models import Dataservice
11
11
  from voluptuous import MultipleInvalid, RequiredFieldInvalid
12
12
 
13
13
  from udata.core.dataset.models import HarvestDatasetMetadata
14
+ from udata.core.dataservices.models import HarvestMetadata as HarvestDataserviceMetadata
14
15
  from udata.models import Dataset
15
16
  from udata.utils import safe_unicode
16
17
 
17
18
  from ..exceptions import HarvestException, HarvestSkipException, HarvestValidationError
18
- from ..models import HarvestItem, HarvestJob, HarvestError, archive_harvested_dataset
19
+ from ..models import HarvestItem, HarvestJob, HarvestError, HarvestLog, archive_harvested_dataset
19
20
  from ..signals import before_harvest_job, after_harvest_job
20
21
 
21
22
  log = logging.getLogger(__name__)
@@ -72,8 +73,6 @@ class BaseBackend(object):
72
73
  """
73
74
  Base class that wrap children methods to add error management and debug logs.
74
75
  Also provides a few helpers needed on all or some backends.
75
-
76
-
77
76
  """
78
77
 
79
78
  name = None
@@ -139,6 +138,9 @@ class BaseBackend(object):
139
138
  def inner_process_dataset(self, item: HarvestItem) -> Dataset:
140
139
  raise NotImplementedError
141
140
 
141
+ def inner_process_dataservice(self, item: HarvestItem) -> Dataservice:
142
+ raise NotImplementedError
143
+
142
144
  def harvest(self):
143
145
  log.debug(f'Starting harvesting {self.source.name} ({self.source.url})…')
144
146
  factory = HarvestJob if self.dryrun else HarvestJob.objects.create
@@ -185,14 +187,17 @@ class BaseBackend(object):
185
187
  self.job.items.append(item)
186
188
  self.save_job()
187
189
 
190
+ log_catcher = LogCatcher()
191
+
188
192
  try:
189
193
  if not remote_id:
190
194
  raise HarvestSkipException("missing identifier")
191
195
 
196
+ current_app.logger.addHandler(log_catcher)
192
197
  dataset = self.inner_process_dataset(item, **kwargs)
193
198
 
194
199
  # Use `item.remote_id` because `inner_process_dataset` could have modified it.
195
- dataset.harvest = self.update_harvest_info(dataset.harvest, item.remote_id)
200
+ dataset.harvest = self.update_dataset_harvest_info(dataset.harvest, item.remote_id)
196
201
  dataset.archived = None
197
202
 
198
203
  # TODO: Apply editable mappings
@@ -220,24 +225,94 @@ class BaseBackend(object):
220
225
  error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
221
226
  item.errors.append(error)
222
227
  finally:
228
+ current_app.logger.removeHandler(log_catcher)
223
229
  item.ended = datetime.utcnow()
230
+ item.logs = [HarvestLog(level=record.levelname, message=record.getMessage()) for record in log_catcher.records]
224
231
  self.save_job()
225
232
 
226
233
  def is_done(self) -> bool:
227
234
  '''Should be called after process_dataset to know if we reach the max items'''
228
235
  return self.max_items and len(self.job.items) >= self.max_items
229
236
 
230
- def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int):
237
+ def process_dataservice(self, remote_id: str, **kwargs) -> bool :
238
+ '''
239
+ Return `True` if the parent should stop iterating because we exceed the number
240
+ of items to process.
241
+ '''
242
+ log.debug(f'Processing dataservice {remote_id}…')
243
+
244
+ # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice`
245
+ item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id)
246
+ self.job.items.append(item)
247
+ self.save_job()
248
+
249
+ try:
250
+ if not remote_id:
251
+ raise HarvestSkipException("missing identifier")
252
+
253
+ dataservice = self.inner_process_dataservice(item, **kwargs)
254
+
255
+ dataservice.harvest = self.update_dataservice_harvest_info(dataservice.harvest, remote_id)
256
+ dataservice.archived_at = None
257
+
258
+ # TODO: Apply editable mappings
259
+
260
+ if self.dryrun:
261
+ dataservice.validate()
262
+ else:
263
+ dataservice.save()
264
+ item.dataservice = dataservice
265
+ item.status = 'done'
266
+ except HarvestSkipException as e:
267
+ item.status = 'skipped'
268
+
269
+ log.info(f'Skipped item {item.remote_id} : {safe_unicode(e)}')
270
+ item.errors.append(HarvestError(message=safe_unicode(e)))
271
+ except HarvestValidationError as e:
272
+ item.status = 'failed'
273
+
274
+ log.info(f'Error validating item {item.remote_id} : {safe_unicode(e)}')
275
+ item.errors.append(HarvestError(message=safe_unicode(e)))
276
+ except Exception as e:
277
+ item.status = 'failed'
278
+ log.exception(f'Error while processing {item.remote_id} : {safe_unicode(e)}')
279
+
280
+ error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
281
+ item.errors.append(error)
282
+ finally:
283
+ item.ended = datetime.utcnow()
284
+ self.save_job()
285
+
286
+ def update_dataset_harvest_info(self, harvest: HarvestDatasetMetadata | None, remote_id: int):
231
287
  if not harvest:
232
288
  harvest = HarvestDatasetMetadata()
233
- harvest.domain = self.source.domain
234
- harvest.remote_id = remote_id
289
+
290
+ harvest.backend = self.display_name
235
291
  harvest.source_id = str(self.source.id)
292
+ harvest.remote_id = remote_id
293
+ harvest.domain = self.source.domain
236
294
  harvest.last_update = datetime.utcnow()
295
+ harvest.archived_at = None
296
+ harvest.archived = None
297
+
298
+ # created_at, modified_at, remote_url, uri, dct_identifier are set in `dataset_from_rdf`
299
+
300
+ return harvest
301
+
302
+ def update_dataservice_harvest_info(self, harvest: HarvestDataserviceMetadata | None, remote_id: int):
303
+ if not harvest:
304
+ harvest = HarvestDataserviceMetadata()
305
+
237
306
  harvest.backend = self.display_name
307
+ harvest.domain = self.source.domain
308
+
309
+ harvest.source_id = str(self.source.id)
310
+ harvest.source_url = str(self.source.url)
311
+
312
+ harvest.remote_id = remote_id
313
+ harvest.last_update = datetime.utcnow()
238
314
 
239
315
  harvest.archived_at = None
240
- harvest.archived = None
241
316
 
242
317
  return harvest
243
318
 
@@ -302,6 +377,28 @@ class BaseBackend(object):
302
377
  return Dataset(owner=self.source.owner)
303
378
 
304
379
  return Dataset()
380
+
381
+ def get_dataservice(self, remote_id):
382
+ '''Get or create a dataservice given its remote ID (and its source)
383
+ We first try to match `source_id` to be source domain independent
384
+ '''
385
+ dataservice = Dataservice.objects(__raw__={
386
+ 'harvest.remote_id': remote_id,
387
+ '$or': [
388
+ {'harvest.domain': self.source.domain},
389
+ {'harvest.source_id': str(self.source.id)},
390
+ ],
391
+ }).first()
392
+
393
+ if dataservice:
394
+ return dataservice
395
+
396
+ if self.source.organization:
397
+ return Dataservice(organization=self.source.organization)
398
+ elif self.source.owner:
399
+ return Dataservice(owner=self.source.owner)
400
+
401
+ return Dataservice()
305
402
 
306
403
  def validate(self, data, schema):
307
404
  '''Perform a data validation against a given schema.
@@ -342,4 +439,15 @@ class BaseBackend(object):
342
439
  msg = str(error)
343
440
  errors.append(msg)
344
441
  msg = '\n- '.join(['Validation error:'] + errors)
345
- raise HarvestValidationError(msg)
442
+ raise HarvestValidationError(msg)
443
+
444
+
445
+ class LogCatcher(logging.Handler):
446
+ records: list[logging.LogRecord]
447
+
448
+ def __init__(self):
449
+ self.records = []
450
+ super().__init__()
451
+
452
+ def emit(self, record):
453
+ self.records.append(record)
udata/harvest/backends/dcat.py CHANGED
@@ -1,20 +1,18 @@
1
1
  import logging
2
2
 
3
- from rdflib import Graph, URIRef
3
+ from rdflib import Graph
4
4
  from rdflib.namespace import RDF
5
5
  import lxml.etree as ET
6
- import boto3
7
6
  from flask import current_app
8
7
  from datetime import date
9
- import json
10
- from typing import Generator, List
8
+ from typing import Generator
11
9
 
12
- from udata.core.dataset.models import Dataset
13
10
  from udata.rdf import (
14
11
  DCAT, DCT, HYDRA, SPDX, namespace_manager, guess_format, url_from_rdf
15
12
  )
16
13
  from udata.core.dataset.rdf import dataset_from_rdf
17
- from udata.storage.s3 import store_as_json, get_from_json
14
+ from udata.core.dataservices.rdf import dataservice_from_rdf
15
+ from udata.storage.s3 import store_as_json
18
16
  from udata.harvest.models import HarvestItem
19
17
 
20
18
  from .base import BaseBackend
@@ -71,7 +69,8 @@ class DcatBackend(BaseBackend):
71
69
  self.process_one_datasets_page(page_number, page)
72
70
  serialized_graphs.append(page.serialize(format=fmt, indent=None))
73
71
 
74
- # TODO call `walk_graph` with `process_dataservices`
72
+ for page_number, page in self.walk_graph(self.source.url, fmt):
73
+ self.process_one_dataservices_page(page_number, page)
75
74
 
76
75
  # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000).
77
76
  max_harvest_graph_size_in_mongo = current_app.config.get('HARVEST_MAX_CATALOG_SIZE_IN_MONGO')
@@ -145,6 +144,14 @@ class DcatBackend(BaseBackend):
145
144
 
146
145
  if self.is_done():
147
146
  return
147
+
148
+ def process_one_dataservices_page(self, page_number: int, page: Graph):
149
+ for node in page.subjects(RDF.type, DCAT.DataService):
150
+ remote_id = page.value(node, DCT.identifier)
151
+ self.process_dataservice(remote_id, page_number=page_number, page=page, node=node)
152
+
153
+ if self.is_done():
154
+ return
148
155
 
149
156
  def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
150
157
  item.kwargs['page_number'] = page_number
@@ -152,6 +159,12 @@ class DcatBackend(BaseBackend):
152
159
  dataset = self.get_dataset(item.remote_id)
153
160
  return dataset_from_rdf(page, dataset, node=node)
154
161
 
162
+ def inner_process_dataservice(self, item: HarvestItem, page_number: int, page: Graph, node):
163
+ item.kwargs['page_number'] = page_number
164
+
165
+ dataservice = self.get_dataservice(item.remote_id)
166
+ return dataservice_from_rdf(page, dataservice, node, [item.dataset for item in self.job.items])
167
+
155
168
  def get_node_from_item(self, graph, item):
156
169
  for node in graph.subjects(RDF.type, DCAT.Dataset):
157
170
  if str(graph.value(node, DCT.identifier)) == item.remote_id:
@@ -263,6 +276,10 @@ class CswIso19139DcatBackend(DcatBackend):
263
276
  def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
264
277
  """
265
278
  Yield all RDF pages as `Graph` from the source
279
+
280
+ Parse CSW graph querying ISO schema.
281
+ Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
282
+ See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
266
283
  """
267
284
  # Load XSLT
268
285
  xsl = ET.fromstring(self.get(self.XSL_URL).content)
@@ -284,6 +301,10 @@ class CswIso19139DcatBackend(DcatBackend):
284
301
  <ogc:PropertyName>dc:type</ogc:PropertyName>
285
302
  <ogc:Literal>dataset</ogc:Literal>
286
303
  </ogc:PropertyIsEqualTo>
304
+ <ogc:PropertyIsEqualTo>
305
+ <ogc:PropertyName>dc:type</ogc:PropertyName>
306
+ <ogc:Literal>service</ogc:Literal>
307
+ </ogc:PropertyIsEqualTo>
287
308
  <ogc:PropertyIsEqualTo>
288
309
  <ogc:PropertyName>dc:type</ogc:PropertyName>
289
310
  <ogc:Literal>series</ogc:Literal>
udata/harvest/models.py CHANGED
@@ -3,6 +3,7 @@ from datetime import datetime
3
3
  import logging
4
4
  from urllib.parse import urlparse
5
5
 
6
+ from udata.core.dataservices.models import Dataservice
6
7
  from werkzeug.utils import cached_property
7
8
 
8
9
  from udata.core.dataset.models import HarvestDatasetMetadata
@@ -49,16 +50,21 @@ class HarvestError(db.EmbeddedDocument):
49
50
  message = db.StringField()
50
51
  details = db.StringField()
51
52
 
53
+ class HarvestLog(db.EmbeddedDocument):
54
+ level = db.StringField()
55
+ message = db.StringField()
52
56
 
53
57
  class HarvestItem(db.EmbeddedDocument):
54
58
  remote_id = db.StringField()
55
59
  dataset = db.ReferenceField(Dataset)
60
+ dataservice = db.ReferenceField(Dataservice)
56
61
  status = db.StringField(choices=list(HARVEST_ITEM_STATUS),
57
62
  default=DEFAULT_HARVEST_ITEM_STATUS, required=True)
58
63
  created = db.DateTimeField(default=datetime.utcnow, required=True)
59
64
  started = db.DateTimeField()
60
65
  ended = db.DateTimeField()
61
66
  errors = db.ListField(db.EmbeddedDocumentField(HarvestError))
67
+ logs = db.ListField(db.EmbeddedDocumentField(HarvestLog), default=[])
62
68
  args = db.ListField(db.StringField())
63
69
  kwargs = db.DictField()
64
70
 
udata/harvest/tests/dcat/bnodes.xml CHANGED
@@ -14,7 +14,7 @@
14
14
  >
15
15
  <dcat:Catalog rdf:about="http://data.test.org/">
16
16
  <dcat:dataset>
17
- <dcat:Dataset>
17
+ <dcat:Dataset rdf:about="dataset-3">
18
18
  <dcterms:title>Dataset 3</dcterms:title>
19
19
  <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T19:01:24.184120</dcterms:modified>
20
20
  <owl:versionInfo>1.0</owl:versionInfo>
@@ -73,7 +73,7 @@
73
73
  </dcat:Dataset>
74
74
  </dcat:dataset>
75
75
  <dcat:dataset>
76
- <dcat:Dataset>
76
+ <dcat:Dataset rdf:about="dataset-2">
77
77
  <dcat:keyword>Tag 1</dcat:keyword>
78
78
  <dcat:distribution rdf:resource="http://data.test.org/datasets/2/resources/1"/>
79
79
  <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T19:01:24.184120</dcterms:modified>
@@ -107,6 +107,17 @@
107
107
  <dct:conformsTo rdf:nodeID="Ne0189e93917c4f67a412fc44883322e7"/>
108
108
  </dcat:Dataset>
109
109
  </dcat:dataset>
110
+ <dcat:service>
111
+ <dcat:DataService rdf:about="https://data.paris2024.org/api/explore/v2.1/">
112
+ <dcterms:title xml:lang="en"><![CDATA[Explore API v2]]></dcterms:title>
113
+ <dcterms:identifier>https://data.paris2024.org/api/explore/v2.1/</dcterms:identifier>
114
+ <dcat:endpointURL rdf:resource="https://data.paris2024.org/api/explore/v2.1/" />
115
+ <dcat:endpointDescription rdf:resource="https://data.paris2024.org/api/explore/v2.1/swagger.json" />
116
+ <dcat:landingPage rdf:resource="https://data.paris2024.org/api/explore/v2.1/console" />
117
+ <dcat:servesDataset rdf:resource="dataset-2" />
118
+ <dcat:servesDataset rdf:resource="dataset-3" />
119
+ </dcat:DataService>
120
+ </dcat:service>
110
121
  <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-15T09:19:51.723691</dcterms:modified>
111
122
  <foaf:homepage>http://data.test.org</foaf:homepage>
112
123
  <dcterms:language>en</dcterms:language>
udata/harvest/tests/test_dcat_backend.py CHANGED
@@ -9,6 +9,7 @@ import boto3
9
9
  from flask import current_app
10
10
  import xml.etree.ElementTree as ET
11
11
 
12
+ from udata.core.dataservices.models import Dataservice
12
13
  from udata.harvest.models import HarvestJob
13
14
  from udata.models import Dataset
14
15
  from udata.core.organization.factories import OrganizationFactory
@@ -161,6 +162,26 @@ class DcatBackendTest:
161
162
  assert len(datasets['1'].resources) == 2
162
163
  assert len(datasets['2'].resources) == 2
163
164
 
165
+ def test_harvest_dataservices(self, rmock):
166
+ rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data())
167
+
168
+ filename = 'bnodes.xml'
169
+ url = mock_dcat(rmock, filename)
170
+ org = OrganizationFactory()
171
+ source = HarvestSourceFactory(backend='dcat',
172
+ url=url,
173
+ organization=org)
174
+
175
+ actions.run(source.slug)
176
+
177
+ dataservices = Dataservice.objects
178
+
179
+ assert len(dataservices) == 1
180
+ assert dataservices[0].title == "Explore API v2"
181
+ assert dataservices[0].base_api_url == "https://data.paris2024.org/api/explore/v2.1/"
182
+ assert dataservices[0].endpoint_description_url == "https://data.paris2024.org/api/explore/v2.1/swagger.json"
183
+ assert dataservices[0].harvest.remote_url == "https://data.paris2024.org/api/explore/v2.1/console"
184
+
164
185
  def test_harvest_literal_spatial(self, rmock):
165
186
  url = mock_dcat(rmock, 'evian.json')
166
187
  org = OrganizationFactory()
udata/migrations/2024-06-11-fix-reuse-datasets-references.py ADDED
@@ -0,0 +1,35 @@
1
+ '''
2
+ Add a default topic to all reuses in db
3
+ '''
4
+ import logging
5
+
6
+ from bson import DBRef
7
+ import mongoengine
8
+
9
+ from udata.models import Reuse
10
+
11
+ log = logging.getLogger(__name__)
12
+
13
+
14
+ def migrate(db):
15
+ log.info('Processing Reuse.')
16
+
17
+ reuses = Reuse.objects().no_cache().timeout(False)
18
+ count = 0
19
+ errors = 0
20
+
21
+ for reuse in reuses:
22
+ datasets_ids = []
23
+ for dataset in reuse.datasets:
24
+ if not isinstance(dataset, DBRef):
25
+ datasets_ids.append(dataset.id)
26
+ else:
27
+ errors += 1
28
+
29
+ if len(datasets_ids) != len(reuse.datasets):
30
+ reuse.datasets = datasets_ids
31
+ reuse.save()
32
+ count += 1
33
+
34
+ log.info(f'Modified {count} Reuses objects (removed {errors} datasets)')
35
+ log.info('Done')
udata/models/__init__.py CHANGED
@@ -21,6 +21,7 @@ from udata.core.post.models import * # noqa
21
21
  from udata.core.jobs.models import * # noqa
22
22
  from udata.core.tags.models import * # noqa
23
23
  from udata.core.spam.models import * # noqa
24
+ from udata.core.reports.models import * # noqa
24
25
 
25
26
  from udata.features.transfer.models import * # noqa
26
27
  from udata.features.territories.models import * # noqa
udata/rdf.py CHANGED
@@ -1,10 +1,11 @@
1
1
  '''
2
2
  This module centralize udata-wide RDF helpers and configuration
3
3
  '''
4
+ from html.parser import HTMLParser
4
5
  import logging
5
6
  import re
6
7
 
7
- from flask import request, url_for, abort
8
+ from flask import request, url_for, abort, current_app
8
9
 
9
10
  from rdflib import Graph, Literal, URIRef
10
11
  from rdflib.resource import Resource as RdfResource
@@ -13,14 +14,18 @@ from rdflib.namespace import (
13
14
  )
14
15
  from rdflib.util import SUFFIX_FORMAT_MAP, guess_format as raw_guess_format
15
16
  from udata import uris
17
+ from udata.core.contact_point.models import ContactPoint
16
18
  from udata.models import Schema
17
19
  from udata.mongo.errors import FieldValidationError
20
+ from udata.frontend.markdown import parse_html
21
+ from udata.tags import slug as slugify_tag
18
22
 
19
23
  log = logging.getLogger(__name__)
20
24
 
21
25
  # Extra Namespaces
22
26
  ADMS = Namespace('http://www.w3.org/ns/adms#')
23
27
  DCAT = Namespace('http://www.w3.org/ns/dcat#')
28
+ DCATAP = Namespace('http://data.europa.eu/r5r/')
24
29
  HYDRA = Namespace('http://www.w3.org/ns/hydra/core#')
25
30
  SCHEMA = Namespace('http://schema.org/')
26
31
  SCV = Namespace('http://purl.org/NET/scovo#')
@@ -35,6 +40,7 @@ VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')
35
40
 
36
41
  namespace_manager = NamespaceManager(Graph())
37
42
  namespace_manager.bind('dcat', DCAT)
43
+ namespace_manager.bind('dcatap', DCATAP)
38
44
  namespace_manager.bind('dct', DCT)
39
45
  namespace_manager.bind('foaf', FOAF)
40
46
  namespace_manager.bind('foaf', FOAF)
@@ -98,6 +104,17 @@ RDF_EXTENSIONS = {
98
104
  # Includes control characters, unicode surrogate characters and unicode end-of-plane non-characters
99
105
  ILLEGAL_XML_CHARS = '[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]'
100
106
 
107
+ # Map High Value Datasets URIs to keyword categories
108
+ EU_HVD_CATEGORIES = {
109
+ "http://data.europa.eu/bna/c_164e0bf5": "Météorologiques",
110
+ "http://data.europa.eu/bna/c_a9135398": "Entreprises et propriété d'entreprises",
111
+ "http://data.europa.eu/bna/c_ac64a52d": "Géospatiales",
112
+ "http://data.europa.eu/bna/c_b79e35eb": "Mobilité",
113
+ "http://data.europa.eu/bna/c_dd313021": "Observation de la terre et environnement",
114
+ "http://data.europa.eu/bna/c_e1da4e07": "Statistiques"
115
+ }
116
+ HVD_LEGISLATION = 'http://data.europa.eu/eli/reg_impl/2023/138/oj'
117
+ TAG_TO_EU_HVD_CATEGORIES = {slugify_tag(EU_HVD_CATEGORIES[uri]): uri for uri in EU_HVD_CATEGORIES}
101
118
 
102
119
  def guess_format(string):
103
120
  '''Guess format given an extension or a mime-type'''
@@ -212,6 +229,42 @@ CONTEXT = {
212
229
  'totalItems': 'hydra:totalItems',
213
230
  }
214
231
 
232
+ def serialize_value(value):
233
+ if isinstance(value, (URIRef, Literal)):
234
+ return value.toPython()
235
+ elif isinstance(value, RdfResource):
236
+ return value.identifier.toPython()
237
+
238
+
239
+ def rdf_value(obj, predicate, default=None):
240
+ value = obj.value(predicate)
241
+ return serialize_value(value) if value else default
242
+
243
+ class HTMLDetector(HTMLParser):
244
+ def __init__(self, *args, **kwargs):
245
+ HTMLParser.__init__(self, *args, **kwargs)
246
+ self.elements = set()
247
+
248
+ def handle_starttag(self, tag, attrs):
249
+ self.elements.add(tag)
250
+
251
+ def handle_endtag(self, tag):
252
+ self.elements.add(tag)
253
+
254
+
255
+ def is_html(text):
256
+ parser = HTMLDetector()
257
+ parser.feed(text)
258
+ return bool(parser.elements)
259
+
260
+
261
+ def sanitize_html(text):
262
+ text = text.toPython() if isinstance(text, Literal) else ''
263
+ if is_html(text):
264
+ return parse_html(text)
265
+ else:
266
+ return text.strip()
267
+
215
268
 
216
269
  def url_from_rdf(rdf, prop):
217
270
  '''
@@ -224,6 +277,65 @@ def url_from_rdf(rdf, prop):
224
277
  elif isinstance(value, RdfResource):
225
278
  return value.identifier.toPython()
226
279
 
280
+ def theme_labels_from_rdf(rdf):
281
+ '''
282
+ Get theme labels to use as keywords.
283
+ Map HVD keywords from known URIs resources if HVD support is activated.
284
+ '''
285
+ for theme in rdf.objects(DCAT.theme):
286
+ if isinstance(theme, RdfResource):
287
+ uri = theme.identifier.toPython()
288
+ if current_app.config['HVD_SUPPORT'] and uri in EU_HVD_CATEGORIES:
289
+ label = EU_HVD_CATEGORIES[uri]
290
+ # Additionnally yield hvd keyword
291
+ yield 'hvd'
292
+ else:
293
+ label = rdf_value(theme, SKOS.prefLabel)
294
+ else:
295
+ label = theme.toPython()
296
+ if label:
297
+ yield label
298
+
299
+ def themes_from_rdf(rdf):
300
+ tags = [tag.toPython() for tag in rdf.objects(DCAT.keyword)]
301
+ tags += theme_labels_from_rdf(rdf)
302
+ return list(set(tags))
303
+
304
+ def contact_point_from_rdf(rdf, dataset):
305
+ contact_point = rdf.value(DCAT.contactPoint)
306
+ if contact_point:
307
+ name = rdf_value(contact_point, VCARD.fn) or ''
308
+ email = (rdf_value(contact_point, VCARD.hasEmail)
309
+ or rdf_value(contact_point, VCARD.email)
310
+ or rdf_value(contact_point, DCAT.email))
311
+ if not email:
312
+ return
313
+ email = email.replace('mailto:', '').strip()
314
+ if dataset.organization:
315
+ contact_point = ContactPoint.objects(
316
+ name=name, email=email, organization=dataset.organization).first()
317
+ return (contact_point or
318
+ ContactPoint(name=name, email=email, organization=dataset.organization).save())
319
+ elif dataset.owner:
320
+ contact_point = ContactPoint.objects(
321
+ name=name, email=email, owner=dataset.owner).first()
322
+ return (contact_point or
323
+ ContactPoint(name=name, email=email, owner=dataset.owner).save())
324
+
325
+ def remote_url_from_rdf(rdf):
326
+ '''
327
+ Return DCAT.landingPage if found and uri validation succeeds.
328
+ Use RDF identifier as fallback if uri validation succeeds.
329
+ '''
330
+ landing_page = url_from_rdf(rdf, DCAT.landingPage)
331
+ uri = rdf.identifier.toPython()
332
+ for candidate in [landing_page, uri]:
333
+ if candidate:
334
+ try:
335
+ uris.validate(candidate)
336
+ return candidate
337
+ except uris.ValidationError:
338
+ pass
227
339
 
228
340
  def schema_from_rdf(rdf):
229
341
  '''
@@ -252,7 +364,6 @@ def schema_from_rdf(rdf):
252
364
  url = uris.validate(type.identifier.toPython())
253
365
  except uris.ValidationError:
254
366
  pass
255
- pass
256
367
 
257
368
  if url is None:
258
369
  return None
udata/routing.py CHANGED
@@ -217,7 +217,7 @@ def lazy_raise_or_redirect():
217
217
  new_args = request.view_args
218
218
  new_args[name] = value.arg
219
219
  new_url = url_for(request.endpoint, **new_args)
220
- return redirect(new_url, code=308)
220
+ return redirect(new_url, code=204 if request.method == 'OPTIONS' else 308)
221
221
 
222
222
 
223
223
  def init_app(app):