udata 9.0.1.dev29504__py2.py3-none-any.whl → 9.0.1.dev29536__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of udata might be problematic. Click here for more details.

udata/api_fields.py CHANGED
@@ -70,10 +70,14 @@ def convert_db_to_field(key, field, info = {}):
70
70
  constructor_write = restx_fields.String
71
71
  elif isinstance(field, mongo_fields.EmbeddedDocumentField):
72
72
  nested_fields = info.get('nested_fields')
73
- if nested_fields is None:
74
- raise ValueError(f"EmbeddedDocumentField `{key}` requires a `nested_fields` param to serialize/deserialize.")
73
+ if nested_fields is not None:
74
+ constructor = lambda **kwargs: restx_fields.Nested(nested_fields, **kwargs)
75
+ elif hasattr(field.document_type_obj, '__read_fields__'):
76
+ constructor_read = lambda **kwargs: restx_fields.Nested(field.document_type_obj.__read_fields__, **kwargs)
77
+ constructor_write = lambda **kwargs: restx_fields.Nested(field.document_type_obj.__write_fields__, **kwargs)
78
+ else:
79
+ raise ValueError(f"EmbeddedDocumentField `{key}` requires a `nested_fields` param to serialize/deserialize or a `@generate_fields()` definition.")
75
80
 
76
- constructor = lambda **kwargs: restx_fields.Nested(nested_fields, **kwargs)
77
81
  else:
78
82
  raise ValueError(f"Unsupported MongoEngine field type {field.__class__.__name__}")
79
83
 
@@ -31,6 +31,35 @@ class DataserviceQuerySet(OwnedQuerySet):
31
31
  db.Q(deleted_at__ne=None) |
32
32
  db.Q(archived_at__ne=None))
33
33
 
34
+ @generate_fields()
35
+ class HarvestMetadata(db.EmbeddedDocument):
36
+ backend = field(db.StringField())
37
+ domain = field(db.StringField())
38
+
39
+ source_id = field(db.StringField())
40
+ source_url = field(db.URLField())
41
+
42
+ remote_id = field(db.StringField())
43
+ remote_url = field(db.URLField())
44
+
45
+ # If the node ID is a `URIRef` it means it links to something external, if it's not an `URIRef` it's often an
46
+ # auto-generated ID just to link multiple RDF nodes together. When exporting as RDF to other catalogs, we
47
+ # want to re-use this node ID (only if it's not auto-generated) to improve compatibility.
48
+ uri = field(
49
+ db.URLField(),
50
+ description="RDF node ID if it's an `URIRef`. `None` if it's not present or if it's a random auto-generated ID inside the graph.",
51
+ )
52
+
53
+ created_at = field(
54
+ db.DateTimeField(),
55
+ description="Date of the creation as provided by the harvested catalog"
56
+ )
57
+ last_update = field(
58
+ db.DateTimeField(),
59
+ description="Date of the last harvesting"
60
+ )
61
+ archived_at = field(db.DateTimeField())
62
+
34
63
  @generate_fields()
35
64
  class Dataservice(WithMetrics, Owned, db.Document):
36
65
  meta = {
@@ -119,12 +148,18 @@ class Dataservice(WithMetrics, Owned, db.Document):
119
148
  },
120
149
  )
121
150
 
151
+ harvest = field(
152
+ db.EmbeddedDocumentField(HarvestMetadata),
153
+ readonly=True,
154
+ )
155
+
122
156
  @function_field(description="Link to the API endpoint for this dataservice")
123
157
  def self_api_url(self):
124
158
  return endpoint_for('api.dataservice', dataservice=self, _external=True)
125
159
 
126
- def self_web_url():
127
- pass
160
+ @function_field(description="Link to the udata web page for this dataservice")
161
+ def self_web_url(self):
162
+ return endpoint_for('dataservices.show', dataservice=self, _external=True)
128
163
 
129
164
  # TODO
130
165
  # frequency = db.StringField(choices=list(UPDATE_FREQUENCIES.keys()))
@@ -0,0 +1,58 @@
1
+
2
+ from datetime import datetime
3
+ from typing import List, Optional
4
+ from rdflib import RDF, Graph, URIRef
5
+
6
+ from udata.core.dataservices.models import Dataservice, HarvestMetadata as HarvestDataserviceMetadata
7
+ from udata.core.dataset.models import Dataset, License
8
+ from udata.core.dataset.rdf import sanitize_html
9
+ from udata.harvest.models import HarvestSource
10
+ from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, remote_url_from_rdf, theme_labels_from_rdf, themes_from_rdf, url_from_rdf
11
+
12
+ def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datasets: List[Dataset]) -> Dataservice :
13
+ '''
14
+ Create or update a dataservice from an RDF/DCAT graph
15
+ '''
16
+ if node is None: # Assume first match is the only match
17
+ node = graph.value(predicate=RDF.type, object=DCAT.DataService)
18
+
19
+ d = graph.resource(node)
20
+
21
+ dataservice.title = rdf_value(d, DCT.title)
22
+ dataservice.description = sanitize_html(d.value(DCT.description) or d.value(DCT.abstract))
23
+
24
+ dataservice.base_api_url = url_from_rdf(d, DCAT.endpointURL)
25
+ dataservice.endpoint_description_url = url_from_rdf(d, DCAT.endpointDescription)
26
+
27
+ dataservice.contact_point = contact_point_from_rdf(d, dataservice) or dataservice.contact_point
28
+
29
+ datasets = []
30
+ for dataset_node in d.objects(DCAT.servesDataset):
31
+ id = dataset_node.value(DCT.identifier)
32
+ dataset = next((d for d in all_datasets if d is not None and d.harvest.remote_id == id), None)
33
+
34
+ if dataset is None:
35
+ # We try with `endswith` because the Europe XSLT has problems with IDs. Sometimes they are prefixed with the domain of the catalog, sometimes not.
36
+ dataset = next((d for d in all_datasets if d is not None and d.harvest.remote_id.endswith(id)), None)
37
+
38
+ if dataset is not None:
39
+ datasets.append(dataset.id)
40
+
41
+ if datasets:
42
+ dataservice.datasets = datasets
43
+
44
+ license = rdf_value(d, DCT.license)
45
+ if license is not None:
46
+ dataservice.license = License.guess(license)
47
+
48
+ if not dataservice.harvest:
49
+ dataservice.harvest = HarvestDataserviceMetadata()
50
+
51
+ dataservice.harvest.uri = d.identifier.toPython() if isinstance(d.identifier, URIRef) else None
52
+ dataservice.harvest.remote_url = remote_url_from_rdf(d)
53
+ dataservice.harvest.created_at = rdf_value(d, DCT.issued)
54
+ dataservice.metadata_modified_at = rdf_value(d, DCT.modified)
55
+
56
+ dataservice.tags = themes_from_rdf(d)
57
+
58
+ return dataservice
udata/core/dataset/rdf.py CHANGED
@@ -6,7 +6,6 @@ import json
6
6
  import logging
7
7
 
8
8
  from datetime import date
9
- from html.parser import HTMLParser
10
9
  from typing import Optional
11
10
  from dateutil.parser import parse as parse_dt
12
11
  from flask import current_app
@@ -18,14 +17,14 @@ from mongoengine.errors import ValidationError
18
17
 
19
18
  from udata import i18n, uris
20
19
  from udata.core.spatial.models import SpatialCoverage
21
- from udata.frontend.markdown import parse_html
22
20
  from udata.core.dataset.models import HarvestDatasetMetadata, HarvestResourceMetadata
23
- from udata.models import db, ContactPoint
21
+ from udata.harvest.exceptions import HarvestSkipException
22
+ from udata.models import db
24
23
  from udata.rdf import (
25
- DCAT, DCATAP, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS,
26
- HVD_LEGISLATION, namespace_manager, schema_from_rdf, url_from_rdf
24
+ DCAT, DCATAP, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, TAG_TO_EU_HVD_CATEGORIES, RDFS,
25
+ namespace_manager, rdf_value, remote_url_from_rdf, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf, HVD_LEGISLATION,
26
+ contact_point_from_rdf,
27
27
  )
28
- from udata.tags import slug as slugify_tag
29
28
  from udata.utils import get_by, safe_unicode
30
29
  from udata.uris import endpoint_for
31
30
 
@@ -77,44 +76,6 @@ EU_RDF_REQUENCIES = {
77
76
  EUFREQ.NEVER: 'punctual',
78
77
  }
79
78
 
80
- # Map High Value Datasets URIs to keyword categories
81
- EU_HVD_CATEGORIES = {
82
- "http://data.europa.eu/bna/c_164e0bf5": "Météorologiques",
83
- "http://data.europa.eu/bna/c_a9135398": "Entreprises et propriété d'entreprises",
84
- "http://data.europa.eu/bna/c_ac64a52d": "Géospatiales",
85
- "http://data.europa.eu/bna/c_b79e35eb": "Mobilité",
86
- "http://data.europa.eu/bna/c_dd313021": "Observation de la terre et environnement",
87
- "http://data.europa.eu/bna/c_e1da4e07": "Statistiques"
88
- }
89
- TAG_TO_EU_HVD_CATEGORIES = {slugify_tag(EU_HVD_CATEGORIES[uri]): uri for uri in EU_HVD_CATEGORIES}
90
-
91
-
92
- class HTMLDetector(HTMLParser):
93
- def __init__(self, *args, **kwargs):
94
- HTMLParser.__init__(self, *args, **kwargs)
95
- self.elements = set()
96
-
97
- def handle_starttag(self, tag, attrs):
98
- self.elements.add(tag)
99
-
100
- def handle_endtag(self, tag):
101
- self.elements.add(tag)
102
-
103
-
104
- def is_html(text):
105
- parser = HTMLDetector()
106
- parser.feed(text)
107
- return bool(parser.elements)
108
-
109
-
110
- def sanitize_html(text):
111
- text = text.toPython() if isinstance(text, Literal) else ''
112
- if is_html(text):
113
- return parse_html(text)
114
- else:
115
- return text.strip()
116
-
117
-
118
79
  def temporal_to_rdf(daterange, graph=None):
119
80
  if not daterange:
120
81
  return
@@ -255,18 +216,6 @@ CHECKSUM_ALGORITHMS = {
255
216
  }
256
217
 
257
218
 
258
- def serialize_value(value):
259
- if isinstance(value, (URIRef, Literal)):
260
- return value.toPython()
261
- elif isinstance(value, RdfResource):
262
- return value.identifier.toPython()
263
-
264
-
265
- def rdf_value(obj, predicate, default=None):
266
- value = obj.value(predicate)
267
- return serialize_value(value) if value else default
268
-
269
-
270
219
  def temporal_from_literal(text):
271
220
  '''
272
221
  Parse a temporal coverage from a literal ie. either:
@@ -341,29 +290,6 @@ def temporal_from_rdf(period_of_time):
341
290
  # so we log the error for future investigation and improvement
342
291
  log.warning('Unable to parse temporal coverage', exc_info=True)
343
292
 
344
-
345
- def contact_point_from_rdf(rdf, dataset):
346
- contact_point = rdf.value(DCAT.contactPoint)
347
- if contact_point:
348
- name = rdf_value(contact_point, VCARD.fn) or ''
349
- email = (rdf_value(contact_point, VCARD.hasEmail)
350
- or rdf_value(contact_point, VCARD.email)
351
- or rdf_value(contact_point, DCAT.email))
352
- if not email:
353
- return
354
- email = email.replace('mailto:', '').strip()
355
- if dataset.organization:
356
- contact_point = ContactPoint.objects(
357
- name=name, email=email, organization=dataset.organization).first()
358
- return (contact_point or
359
- ContactPoint(name=name, email=email, organization=dataset.organization).save())
360
- elif dataset.owner:
361
- contact_point = ContactPoint.objects(
362
- name=name, email=email, owner=dataset.owner).first()
363
- return (contact_point or
364
- ContactPoint(name=name, email=email, owner=dataset.owner).save())
365
-
366
-
367
293
  def spatial_from_rdf(graph):
368
294
  geojsons = []
369
295
  for term in graph.objects(DCT.spatial):
@@ -503,43 +429,6 @@ def title_from_rdf(rdf, url):
503
429
  else:
504
430
  return i18n._('Nameless resource')
505
431
 
506
-
507
- def remote_url_from_rdf(rdf):
508
- '''
509
- Return DCAT.landingPage if found and uri validation succeeds.
510
- Use RDF identifier as fallback if uri validation succeeds.
511
- '''
512
- landing_page = url_from_rdf(rdf, DCAT.landingPage)
513
- uri = rdf.identifier.toPython()
514
- for candidate in [landing_page, uri]:
515
- if candidate:
516
- try:
517
- uris.validate(candidate)
518
- return candidate
519
- except uris.ValidationError:
520
- pass
521
-
522
-
523
- def theme_labels_from_rdf(rdf):
524
- '''
525
- Get theme labels to use as keywords.
526
- Map HVD keywords from known URIs resources if HVD support is activated.
527
- '''
528
- for theme in rdf.objects(DCAT.theme):
529
- if isinstance(theme, RdfResource):
530
- uri = theme.identifier.toPython()
531
- if current_app.config['HVD_SUPPORT'] and uri in EU_HVD_CATEGORIES:
532
- label = EU_HVD_CATEGORIES[uri]
533
- # Additionnally yield hvd keyword
534
- yield 'hvd'
535
- else:
536
- label = rdf_value(theme, SKOS.prefLabel)
537
- else:
538
- label = theme.toPython()
539
- if label:
540
- yield label
541
-
542
-
543
432
  def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False):
544
433
  '''
545
434
  Map a Resource domain model to a DCAT/RDF graph
@@ -617,6 +506,9 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None):
617
506
  d = graph.resource(node)
618
507
 
619
508
  dataset.title = rdf_value(d, DCT.title)
509
+ if not dataset.title:
510
+ raise HarvestSkipException("missing title on dataset")
511
+
620
512
  # Support dct:abstract if dct:description is missing (sometimes used instead)
621
513
  description = d.value(DCT.description) or d.value(DCT.abstract)
622
514
  dataset.description = sanitize_html(description)
@@ -634,9 +526,7 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None):
634
526
  if acronym:
635
527
  dataset.acronym = acronym
636
528
 
637
- tags = [tag.toPython() for tag in d.objects(DCAT.keyword)]
638
- tags += theme_labels_from_rdf(d)
639
- dataset.tags = list(set(tags))
529
+ dataset.tags = themes_from_rdf(d)
640
530
 
641
531
  temporal_coverage = temporal_from_rdf(d.value(DCT.temporal))
642
532
  if temporal_coverage:
udata/harvest/api.py CHANGED
@@ -5,6 +5,7 @@ from flask import request
5
5
  from udata.api import api, API, fields
6
6
  from udata.auth import admin_permission
7
7
 
8
+ from udata.core.dataservices.models import Dataservice
8
9
  from udata.core.dataset.api_fields import dataset_ref_fields, dataset_fields
9
10
  from udata.core.organization.api_fields import org_ref_fields
10
11
  from udata.core.organization.permissions import EditOrganizationPermission
@@ -45,6 +46,9 @@ item_fields = api.model('HarvestItem', {
45
46
  'dataset': fields.Nested(dataset_ref_fields,
46
47
  description='The processed dataset',
47
48
  allow_null=True),
49
+ 'dataservice': fields.Nested(Dataservice.__read_fields__,
50
+ description='The processed dataservice',
51
+ allow_null=True),
48
52
  'status': fields.String(description='The item status',
49
53
  required=True,
50
54
  enum=list(HARVEST_ITEM_STATUS)),
@@ -8,9 +8,11 @@ from uuid import UUID
8
8
  import requests
9
9
 
10
10
  from flask import current_app
11
+ from udata.core.dataservices.models import Dataservice
11
12
  from voluptuous import MultipleInvalid, RequiredFieldInvalid
12
13
 
13
14
  from udata.core.dataset.models import HarvestDatasetMetadata
15
+ from udata.core.dataservices.models import HarvestMetadata as HarvestDataserviceMetadata
14
16
  from udata.models import Dataset
15
17
  from udata.utils import safe_unicode
16
18
 
@@ -72,8 +74,6 @@ class BaseBackend(object):
72
74
  """
73
75
  Base class that wrap children methods to add error management and debug logs.
74
76
  Also provides a few helpers needed on all or some backends.
75
-
76
-
77
77
  """
78
78
 
79
79
  name = None
@@ -139,6 +139,9 @@ class BaseBackend(object):
139
139
  def inner_process_dataset(self, item: HarvestItem) -> Dataset:
140
140
  raise NotImplementedError
141
141
 
142
+ def inner_process_dataservice(self, item: HarvestItem) -> Dataservice:
143
+ raise NotImplementedError
144
+
142
145
  def harvest(self):
143
146
  log.debug(f'Starting harvesting {self.source.name} ({self.source.url})…')
144
147
  factory = HarvestJob if self.dryrun else HarvestJob.objects.create
@@ -195,7 +198,7 @@ class BaseBackend(object):
195
198
  dataset = self.inner_process_dataset(item, **kwargs)
196
199
 
197
200
  # Use `item.remote_id` because `inner_process_dataset` could have modified it.
198
- dataset.harvest = self.update_harvest_info(dataset.harvest, item.remote_id)
201
+ dataset.harvest = self.update_dataset_harvest_info(dataset.harvest, item.remote_id)
199
202
  dataset.archived = None
200
203
 
201
204
  # TODO: Apply editable mappings
@@ -232,17 +235,85 @@ class BaseBackend(object):
232
235
  '''Should be called after process_dataset to know if we reach the max items'''
233
236
  return self.max_items and len(self.job.items) >= self.max_items
234
237
 
235
- def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int):
238
+ def process_dataservice(self, remote_id: str, **kwargs) -> bool :
239
+ '''
240
+ Return `True` if the parent should stop iterating because we exceed the number
241
+ of items to process.
242
+ '''
243
+ log.debug(f'Processing dataservice {remote_id}…')
244
+
245
+ # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice`
246
+ item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id)
247
+ self.job.items.append(item)
248
+ self.save_job()
249
+
250
+ try:
251
+ if not remote_id:
252
+ raise HarvestSkipException("missing identifier")
253
+
254
+ dataservice = self.inner_process_dataservice(item, **kwargs)
255
+
256
+ dataservice.harvest = self.update_dataservice_harvest_info(dataservice.harvest, remote_id)
257
+ dataservice.archived_at = None
258
+
259
+ # TODO: Apply editable mappings
260
+
261
+ if self.dryrun:
262
+ dataservice.validate()
263
+ else:
264
+ dataservice.save()
265
+ item.dataservice = dataservice
266
+ item.status = 'done'
267
+ except HarvestSkipException as e:
268
+ item.status = 'skipped'
269
+
270
+ log.info(f'Skipped item {item.remote_id} : {safe_unicode(e)}')
271
+ item.errors.append(HarvestError(message=safe_unicode(e)))
272
+ except HarvestValidationError as e:
273
+ item.status = 'failed'
274
+
275
+ log.info(f'Error validating item {item.remote_id} : {safe_unicode(e)}')
276
+ item.errors.append(HarvestError(message=safe_unicode(e)))
277
+ except Exception as e:
278
+ item.status = 'failed'
279
+ log.exception(f'Error while processing {item.remote_id} : {safe_unicode(e)}')
280
+
281
+ error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
282
+ item.errors.append(error)
283
+ finally:
284
+ item.ended = datetime.utcnow()
285
+ self.save_job()
286
+
287
+ def update_dataset_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int):
236
288
  if not harvest:
237
289
  harvest = HarvestDatasetMetadata()
238
- harvest.domain = self.source.domain
239
- harvest.remote_id = remote_id
290
+
291
+ harvest.backend = self.display_name
240
292
  harvest.source_id = str(self.source.id)
293
+ harvest.remote_id = remote_id
294
+ harvest.domain = self.source.domain
241
295
  harvest.last_update = datetime.utcnow()
296
+ harvest.archived_at = None
297
+ harvest.archived = None
298
+
299
+ # created_at, modified_at, remote_url, uri, dct_identifier are set in `dataset_from_rdf`
300
+
301
+ return harvest
302
+
303
+ def update_dataservice_harvest_info(self, harvest: Optional[HarvestDataserviceMetadata], remote_id: int):
304
+ if not harvest:
305
+ harvest = HarvestDataserviceMetadata()
306
+
242
307
  harvest.backend = self.display_name
308
+ harvest.domain = self.source.domain
309
+
310
+ harvest.source_id = str(self.source.id)
311
+ harvest.source_url = str(self.source.url)
312
+
313
+ harvest.remote_id = remote_id
314
+ harvest.last_update = datetime.utcnow()
243
315
 
244
316
  harvest.archived_at = None
245
- harvest.archived = None
246
317
 
247
318
  return harvest
248
319
 
@@ -307,6 +378,28 @@ class BaseBackend(object):
307
378
  return Dataset(owner=self.source.owner)
308
379
 
309
380
  return Dataset()
381
+
382
+ def get_dataservice(self, remote_id):
383
+ '''Get or create a dataservice given its remote ID (and its source)
384
+ We first try to match on `source_id` so the lookup is independent of the source domain
385
+ '''
386
+ dataservice = Dataservice.objects(__raw__={
387
+ 'harvest.remote_id': remote_id,
388
+ '$or': [
389
+ {'harvest.domain': self.source.domain},
390
+ {'harvest.source_id': str(self.source.id)},
391
+ ],
392
+ }).first()
393
+
394
+ if dataservice:
395
+ return dataservice
396
+
397
+ if self.source.organization:
398
+ return Dataservice(organization=self.source.organization)
399
+ elif self.source.owner:
400
+ return Dataservice(owner=self.source.owner)
401
+
402
+ return Dataservice()
310
403
 
311
404
  def validate(self, data, schema):
312
405
  '''Perform a data validation against a given schema.
@@ -1,20 +1,19 @@
1
1
  import logging
2
2
 
3
- from rdflib import Graph, URIRef
3
+ from rdflib import Graph
4
4
  from rdflib.namespace import RDF
5
5
  import lxml.etree as ET
6
- import boto3
7
6
  from flask import current_app
8
7
  from datetime import date
9
- import json
10
- from typing import Generator, List
8
+ from typing import Generator
9
+ from typing import Generator
11
10
 
12
- from udata.core.dataset.models import Dataset
13
11
  from udata.rdf import (
14
12
  DCAT, DCT, HYDRA, SPDX, namespace_manager, guess_format, url_from_rdf
15
13
  )
16
14
  from udata.core.dataset.rdf import dataset_from_rdf
17
- from udata.storage.s3 import store_as_json, get_from_json
15
+ from udata.core.dataservices.rdf import dataservice_from_rdf
16
+ from udata.storage.s3 import store_as_json
18
17
  from udata.harvest.models import HarvestItem
19
18
 
20
19
  from .base import BaseBackend
@@ -71,7 +70,8 @@ class DcatBackend(BaseBackend):
71
70
  self.process_one_datasets_page(page_number, page)
72
71
  serialized_graphs.append(page.serialize(format=fmt, indent=None))
73
72
 
74
- # TODO call `walk_graph` with `process_dataservices`
73
+ for page_number, page in self.walk_graph(self.source.url, fmt):
74
+ self.process_one_dataservices_page(page_number, page)
75
75
 
76
76
  # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000).
77
77
  max_harvest_graph_size_in_mongo = current_app.config.get('HARVEST_MAX_CATALOG_SIZE_IN_MONGO')
@@ -145,6 +145,14 @@ class DcatBackend(BaseBackend):
145
145
 
146
146
  if self.is_done():
147
147
  return
148
+
149
+ def process_one_dataservices_page(self, page_number: int, page: Graph):
150
+ for node in page.subjects(RDF.type, DCAT.DataService):
151
+ remote_id = page.value(node, DCT.identifier)
152
+ self.process_dataservice(remote_id, page_number=page_number, page=page, node=node)
153
+
154
+ if self.is_done():
155
+ return
148
156
 
149
157
  def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
150
158
  item.kwargs['page_number'] = page_number
@@ -152,6 +160,12 @@ class DcatBackend(BaseBackend):
152
160
  dataset = self.get_dataset(item.remote_id)
153
161
  return dataset_from_rdf(page, dataset, node=node)
154
162
 
163
+ def inner_process_dataservice(self, item: HarvestItem, page_number: int, page: Graph, node):
164
+ item.kwargs['page_number'] = page_number
165
+
166
+ dataservice = self.get_dataservice(item.remote_id)
167
+ return dataservice_from_rdf(page, dataservice, node, [item.dataset for item in self.job.items])
168
+
155
169
  def get_node_from_item(self, graph, item):
156
170
  for node in graph.subjects(RDF.type, DCAT.Dataset):
157
171
  if str(graph.value(node, DCT.identifier)) == item.remote_id:
@@ -263,6 +277,10 @@ class CswIso19139DcatBackend(DcatBackend):
263
277
  def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
264
278
  """
265
279
  Yield all RDF pages as `Graph` from the source
280
+
281
+ Parse CSW graph querying ISO schema.
282
+ Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
283
+ See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
266
284
  """
267
285
  # Load XSLT
268
286
  xsl = ET.fromstring(self.get(self.XSL_URL).content)
@@ -284,6 +302,10 @@ class CswIso19139DcatBackend(DcatBackend):
284
302
  <ogc:PropertyName>dc:type</ogc:PropertyName>
285
303
  <ogc:Literal>dataset</ogc:Literal>
286
304
  </ogc:PropertyIsEqualTo>
305
+ <ogc:PropertyIsEqualTo>
306
+ <ogc:PropertyName>dc:type</ogc:PropertyName>
307
+ <ogc:Literal>service</ogc:Literal>
308
+ </ogc:PropertyIsEqualTo>
287
309
  <ogc:PropertyIsEqualTo>
288
310
  <ogc:PropertyName>dc:type</ogc:PropertyName>
289
311
  <ogc:Literal>series</ogc:Literal>
udata/harvest/models.py CHANGED
@@ -3,6 +3,7 @@ from datetime import datetime
3
3
  import logging
4
4
  from urllib.parse import urlparse
5
5
 
6
+ from udata.core.dataservices.models import Dataservice
6
7
  from werkzeug.utils import cached_property
7
8
 
8
9
  from udata.core.dataset.models import HarvestDatasetMetadata
@@ -56,6 +57,7 @@ class HarvestLog(db.EmbeddedDocument):
56
57
  class HarvestItem(db.EmbeddedDocument):
57
58
  remote_id = db.StringField()
58
59
  dataset = db.ReferenceField(Dataset)
60
+ dataservice = db.ReferenceField(Dataservice)
59
61
  status = db.StringField(choices=list(HARVEST_ITEM_STATUS),
60
62
  default=DEFAULT_HARVEST_ITEM_STATUS, required=True)
61
63
  created = db.DateTimeField(default=datetime.utcnow, required=True)
@@ -14,7 +14,7 @@
14
14
  >
15
15
  <dcat:Catalog rdf:about="http://data.test.org/">
16
16
  <dcat:dataset>
17
- <dcat:Dataset>
17
+ <dcat:Dataset rdf:about="dataset-3">
18
18
  <dcterms:title>Dataset 3</dcterms:title>
19
19
  <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T19:01:24.184120</dcterms:modified>
20
20
  <owl:versionInfo>1.0</owl:versionInfo>
@@ -73,7 +73,7 @@
73
73
  </dcat:Dataset>
74
74
  </dcat:dataset>
75
75
  <dcat:dataset>
76
- <dcat:Dataset>
76
+ <dcat:Dataset rdf:about="dataset-2">
77
77
  <dcat:keyword>Tag 1</dcat:keyword>
78
78
  <dcat:distribution rdf:resource="http://data.test.org/datasets/2/resources/1"/>
79
79
  <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T19:01:24.184120</dcterms:modified>
@@ -107,6 +107,17 @@
107
107
  <dct:conformsTo rdf:nodeID="Ne0189e93917c4f67a412fc44883322e7"/>
108
108
  </dcat:Dataset>
109
109
  </dcat:dataset>
110
+ <dcat:service>
111
+ <dcat:DataService rdf:about="https://data.paris2024.org/api/explore/v2.1/">
112
+ <dcterms:title xml:lang="en"><![CDATA[Explore API v2]]></dcterms:title>
113
+ <dcterms:identifier>https://data.paris2024.org/api/explore/v2.1/</dcterms:identifier>
114
+ <dcat:endpointURL rdf:resource="https://data.paris2024.org/api/explore/v2.1/" />
115
+ <dcat:endpointDescription rdf:resource="https://data.paris2024.org/api/explore/v2.1/swagger.json" />
116
+ <dcat:landingPage rdf:resource="https://data.paris2024.org/api/explore/v2.1/console" />
117
+ <dcat:servesDataset rdf:resource="dataset-2" />
118
+ <dcat:servesDataset rdf:resource="dataset-3" />
119
+ </dcat:DataService>
120
+ </dcat:service>
110
121
  <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-15T09:19:51.723691</dcterms:modified>
111
122
  <foaf:homepage>http://data.test.org</foaf:homepage>
112
123
  <dcterms:language>en</dcterms:language>
@@ -9,6 +9,7 @@ import boto3
9
9
  from flask import current_app
10
10
  import xml.etree.ElementTree as ET
11
11
 
12
+ from udata.core.dataservices.models import Dataservice
12
13
  from udata.harvest.models import HarvestJob
13
14
  from udata.models import Dataset
14
15
  from udata.core.organization.factories import OrganizationFactory
@@ -161,6 +162,26 @@ class DcatBackendTest:
161
162
  assert len(datasets['1'].resources) == 2
162
163
  assert len(datasets['2'].resources) == 2
163
164
 
165
+ def test_harvest_dataservices(self, rmock):
166
+ rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data())
167
+
168
+ filename = 'bnodes.xml'
169
+ url = mock_dcat(rmock, filename)
170
+ org = OrganizationFactory()
171
+ source = HarvestSourceFactory(backend='dcat',
172
+ url=url,
173
+ organization=org)
174
+
175
+ actions.run(source.slug)
176
+
177
+ dataservices = Dataservice.objects
178
+
179
+ assert len(dataservices) == 1
180
+ assert dataservices[0].title == "Explore API v2"
181
+ assert dataservices[0].base_api_url == "https://data.paris2024.org/api/explore/v2.1/"
182
+ assert dataservices[0].endpoint_description_url == "https://data.paris2024.org/api/explore/v2.1/swagger.json"
183
+ assert dataservices[0].harvest.remote_url == "https://data.paris2024.org/api/explore/v2.1/console"
184
+
164
185
  def test_harvest_literal_spatial(self, rmock):
165
186
  url = mock_dcat(rmock, 'evian.json')
166
187
  org = OrganizationFactory()