udata 10.2.1.dev34683__py2.py3-none-any.whl → 10.2.1.dev34728__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of udata might be problematic.

Files changed (25)
  1. udata/core/dataset/rdf.py +6 -0
  2. udata/harvest/backends/base.py +1 -1
  3. udata/harvest/backends/dcat.py +48 -6
  4. udata/harvest/tests/dcat/catalog.xml +15 -2
  5. udata/harvest/tests/factories.py +1 -1
  6. udata/harvest/tests/test_base_backend.py +2 -2
  7. udata/harvest/tests/test_dcat_backend.py +21 -2
  8. udata/static/chunks/{11.0f04e49a40a0a381bcce.js → 11.b6f741fcc366abfad9c4.js} +3 -3
  9. udata/static/chunks/{11.0f04e49a40a0a381bcce.js.map → 11.b6f741fcc366abfad9c4.js.map} +1 -1
  10. udata/static/chunks/{13.f29411b06be1883356a3.js → 13.2d06442dd9a05d9777b5.js} +2 -2
  11. udata/static/chunks/{13.f29411b06be1883356a3.js.map → 13.2d06442dd9a05d9777b5.js.map} +1 -1
  12. udata/static/chunks/{17.3bd0340930d4a314ce9c.js → 17.e8e4caaad5cb0cc0bacc.js} +2 -2
  13. udata/static/chunks/{17.3bd0340930d4a314ce9c.js.map → 17.e8e4caaad5cb0cc0bacc.js.map} +1 -1
  14. udata/static/chunks/{19.0586efa786ebf09fb288.js → 19.f03a102365af4315f9db.js} +3 -3
  15. udata/static/chunks/{19.0586efa786ebf09fb288.js.map → 19.f03a102365af4315f9db.js.map} +1 -1
  16. udata/static/chunks/{8.b966402f5d680d4bdf4a.js → 8.778091d55cd8ea39af6b.js} +2 -2
  17. udata/static/chunks/{8.b966402f5d680d4bdf4a.js.map → 8.778091d55cd8ea39af6b.js.map} +1 -1
  18. udata/static/common.js +1 -1
  19. udata/static/common.js.map +1 -1
  20. {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/METADATA +3 -2
  21. {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/RECORD +25 -25
  22. {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/LICENSE +0 -0
  23. {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/WHEEL +0 -0
  24. {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/entry_points.txt +0 -0
  25. {udata-10.2.1.dev34683.dist-info → udata-10.2.1.dev34728.dist-info}/top_level.txt +0 -0
udata/core/dataset/rdf.py CHANGED
@@ -722,6 +722,10 @@ def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False):
         resource.checksum.type = algorithm
     if is_additionnal:
         resource.type = "other"
+    elif distrib.value(DCAT.accessService):
+        # The distribution has a DCAT.accessService property, we deduce
+        # that the distribution is of type API
+        resource.type = "api"
 
     identifier = rdf_value(distrib, DCT.identifier)
     uri = distrib.identifier.toPython() if isinstance(distrib.identifier, URIRef) else None
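Note on the hunk above: the API-type deduction leans on rdflib's Resource.value(), which returns the first object for a predicate, or None when the property is absent. A minimal standalone sketch of that check (the graph and URIs are illustrative, not from the package; udata itself imports DCAT from udata.rdf):

from rdflib import Graph, Namespace, URIRef

# Plain rdflib namespace, used here only to keep the example self-contained
DCAT = Namespace("http://www.w3.org/ns/dcat#")

g = Graph()
dist = URIRef("http://example.org/datasets/1/resources/4")  # hypothetical distribution node
g.add((dist, DCAT.accessService, URIRef("http://example.org/services/wms")))

distrib = g.resource(dist)  # the Resource wrapper resource_from_rdf works with
# Resource.value() is None when the predicate is absent, so the elif above
# only fires when an access service is actually declared on the distribution
resource_type = "api" if distrib.value(DCAT.accessService) else "main"
print(resource_type)  # -> api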
@@ -751,6 +755,8 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None, remote_url_prefix: s
 
     dataset.title = rdf_value(d, DCT.title)
     if not dataset.title:
+        # If the dataset is externaly defined (so without title and just with a link to the dataset XML)
+        # we should have skipped it way before in :ExcludeExternalyDefinedDataset
         raise HarvestSkipException("missing title on dataset")
 
     # Support dct:abstract if dct:description is missing (sometimes used instead)
udata/harvest/backends/base.py CHANGED
@@ -256,7 +256,7 @@ class BaseBackend(object):
         ]
         self.save_job()
 
-    def is_done(self) -> bool:
+    def has_reached_max_items(self) -> bool:
         """Should be called after process_dataset to know if we reach the max items"""
         return self.max_items and len(self.job.items) >= self.max_items
 
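Aside on the renamed method: despite the -> bool annotation, the expression short-circuits, so it returns None (not False) when max_items is unset; both are falsy, so guard-style callers behave identically. A quick illustration of that truthiness:

max_items = None          # no limit configured
items = ["a", "b", "c"]
print(max_items and len(items) >= max_items)  # -> None (falsy: never "reached")

max_items = 3
print(max_items and len(items) >= max_items)  # -> True (limit reached)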
udata/harvest/backends/dcat.py CHANGED
@@ -9,7 +9,7 @@ from rdflib.namespace import RDF
 
 from udata.core.dataservices.rdf import dataservice_from_rdf
 from udata.core.dataset.rdf import dataset_from_rdf
-from udata.harvest.models import HarvestItem
+from udata.harvest.models import HarvestError, HarvestItem
 from udata.i18n import gettext as _
 from udata.rdf import (
     DCAT,
@@ -18,6 +18,7 @@ from udata.rdf import (
     SPDX,
     guess_format,
     namespace_manager,
+    rdf_value,
     url_from_rdf,
 )
 from udata.storage.s3 import store_as_json
@@ -77,9 +78,19 @@ class DcatBackend(BaseBackend):
             self.process_one_datasets_page(page_number, page)
             serialized_graphs.append(page.serialize(format=fmt, indent=None))
 
+        # We do a second pass to have all datasets in memory and attach datasets
+        # to dataservices. It could be better to be one pass of graph walking and
+        # then one pass of attaching datasets to dataservices.
         for page_number, page in self.walk_graph(self.source.url, fmt):
             self.process_one_dataservices_page(page_number, page)
 
+        if not self.dryrun and self.has_reached_max_items():
+            # We have reached the max_items limit. Warn the user that all the datasets may not be present.
+            error = HarvestError(
+                message=f"{self.max_items} max items reached, not all datasets/dataservices were retrieved"
+            )
+            self.job.errors.append(error)
+
         # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000).
         max_harvest_graph_size_in_mongo = current_app.config.get(
             "HARVEST_MAX_CATALOG_SIZE_IN_MONGO"
@@ -146,7 +157,7 @@ class DcatBackend(BaseBackend):
                 break
 
             yield page_number, subgraph
-            if self.is_done():
+            if self.has_reached_max_items():
                 return
 
             page_number += 1
@@ -154,17 +165,48 @@ class DcatBackend(BaseBackend):
     def process_one_datasets_page(self, page_number: int, page: Graph):
         for node in page.subjects(RDF.type, DCAT.Dataset):
             remote_id = page.value(node, DCT.identifier)
+            if self.is_dataset_external_to_this_page(page, node):
+                continue
+
             self.process_dataset(remote_id, page_number=page_number, page=page, node=node)
 
-            if self.is_done():
+            if self.has_reached_max_items():
                 return
 
+    def is_dataset_external_to_this_page(self, page: Graph, node) -> bool:
+        # In dataservice nodes we have `servesDataset` or `hasPart` that can contains nodes
+        # with type=dataset. We don't want to process them because these nodes are empty (they
+        # only contains a link to the dataset definition).
+        # These datasets are either present in the catalog in previous or next pages or
+        # external from the catalog we are currently harvesting (so we don't want to harvest them).
+        # First we thought of skipping them inside `dataset_from_rdf` (see :ExcludeExternalyDefinedDataset)
+        # but it creates a lot of "fake" items in the job and raising problems (reaching the max harvest item for
+        # example and not getting to the "real" datasets/dataservices in subsequent pages)
+        # So to prevent creating a lot of useless items in the job we first thought about checking to see if there is no title and
+        # if `isPrimaryTopicOf` is present. But it may be better to check if the only link of the node with the current page is a
+        # `servesDataset` or `hasPart`. If it's the case, the node is only present in a dataservice. (maybe we could also check that
+        # the `_other_node` is a dataservice?)
+        # `isPrimaryTopicOf` is the tag present in the first harvester raising the problem, it may exists other
+        # values of the same sort we need to check here.
+
+        # This is not dangerous because we check for missing title in `dataset_from_rdf` later so we would have skipped
+        # this dataset anyway.
+        resource = page.resource(node)
+        title = rdf_value(resource, DCT.title)
+        if title:
+            return False
+
+        predicates = [link_type for (_other_node, link_type) in page.subject_predicates(node)]
+        return len(predicates) == 1 and (
+            predicates[0] == DCAT.servesDataset or predicates[0] == DCT.hasPart
+        )
+
     def process_one_dataservices_page(self, page_number: int, page: Graph):
         for node in page.subjects(RDF.type, DCAT.DataService):
             remote_id = page.value(node, DCT.identifier)
             self.process_dataservice(remote_id, page_number=page_number, page=page, node=node)
 
-            if self.is_done():
+            if self.has_reached_max_items():
                 return
 
     def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
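Note on the predicate check above: Graph.subject_predicates(node) yields a (subject, predicate) pair for every triple in which the node appears as the object. A minimal sketch under that assumption (all nodes and URIs illustrative, not from the package): a stub dataset that only hangs off a dataservice via dcat:servesDataset has exactly one incoming predicate on the page, so the method reports it as external.

from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import RDF

DCAT = Namespace("http://www.w3.org/ns/dcat#")

g = Graph()
service = URIRef("http://example.org/dataservice/1")      # hypothetical dataservice
external = URIRef("http://example.org/dataset/external")  # empty stub, no dct:title
g.add((service, RDF.type, DCAT.DataService))
g.add((external, RDF.type, DCAT.Dataset))
g.add((service, DCAT.servesDataset, external))

# Only triples with `external` as *object* count; its own rdf:type triple does not appear
incoming = [pred for (_subj, pred) in g.subject_predicates(external)]
print(incoming)  # -> [rdflib.term.URIRef('http://www.w3.org/ns/dcat#servesDataset')]
print(len(incoming) == 1 and incoming[0] == DCAT.servesDataset)  # -> True: skip this node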
@@ -266,7 +308,7 @@ class CswDcatBackend(DcatBackend):
                 subgraph.parse(data=ET.tostring(child), format=fmt)
 
             yield page_number, subgraph
-            if self.is_done():
+            if self.has_reached_max_items():
                 return
 
             next_record = self.next_record_if_should_continue(start, search_results)
@@ -375,7 +417,7 @@ class CswIso19139DcatBackend(DcatBackend):
                 raise ValueError("Failed to fetch CSW content")
 
             yield page_number, subgraph
-            if self.is_done():
+            if self.has_reached_max_items():
                 return
 
             next_record = self.next_record_if_should_continue(start, search_results)
udata/harvest/tests/dcat/catalog.xml CHANGED
@@ -61,16 +61,17 @@
         <dcat:theme>Theme 1</dcat:theme>
         <dcterms:publisher rdf:resource="http://data.test.org/organizations/1"/>
         <owl:versionInfo>1.0</owl:versionInfo>
-        <dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/2"/>
         <dcat:keyword>Tag 4</dcat:keyword>
         <dcterms:spatial rdf:resource="http://wuEurope.com/"/>
         <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T19:01:24.184120</dcterms:modified>
         <dcat:keyword>Tag 2</dcat:keyword>
         <dcat:keyword>Tag 1</dcat:keyword>
-        <dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/1"/>
         <dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T18:59:02.737480</dcterms:issued>
         <dcterms:identifier>1</dcterms:identifier>
+        <dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/1"/>
+        <dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/2"/>
         <dcterms:hasPart rdf:resource="http://data.test.org/datasets/1/resources/3"/>
+        <dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/4"/>
     </dcat:Dataset>
 </dcat:dataset>
 <dcat:dataset>
@@ -152,6 +153,18 @@
         <dcterms:title>Resource 1-3</dcterms:title>
         <dcterms:format>JSON</dcterms:format>
     </foaf:Document>
+    <dcat:Distribution rdf:about="http://data.test.org/datasets/1/resources/4">
+        <dcterms:description>A resource pointing towards a Geo Service</dcterms:description>
+        <dcterms:title>Resource 1-4</dcterms:title>
+        <dcat:accessURL>http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&amp;REQUEST=GetCapabilities&amp;VERSION=1.3.0</dcat:accessURL>
+        <dcat:accessService>
+            <dcat:DataService>
+                <dcterms:title xml:lang="fr">Geo Service</dcterms:title>
+                <dcat:endpointURL rdf:resource="http://data.test.org/datasets/1/resources/4/services"/>
+                <dcat:endpointDescription rdf:resource="http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&amp;REQUEST=GetCapabilities&amp;VERSION=1.3.0"/>
+            </dcat:DataService>
+        </dcat:accessService>
+    </dcat:Distribution>
     <!-- resources for dataset 2 -->
     <dcat:Distribution rdf:about="http://data.test.org/datasets/2/resources/1">
         <dcat:accessURL>http://data.test.org/datasets/2/resources/1/file.json</dcat:accessURL>
udata/harvest/tests/factories.py CHANGED
@@ -61,7 +61,7 @@ class FactoryBackend(backends.BaseBackend):
         mock_initialize.send(self)
         for i in range(self.config.get("count", DEFAULT_COUNT)):
             self.process_dataset(str(i))
-            if self.is_done():
+            if self.has_reached_max_items():
                 return
 
     def inner_process_dataset(self, item: HarvestItem):
udata/harvest/tests/test_base_backend.py CHANGED
@@ -44,12 +44,12 @@ class FakeBackend(BaseBackend):
     def inner_harvest(self):
         for remote_id in self.source.config.get("dataset_remote_ids", []):
             self.process_dataset(remote_id)
-            if self.is_done():
+            if self.has_reached_max_items():
                 return
 
         for remote_id in self.source.config.get("dataservice_remote_ids", []):
             self.process_dataservice(remote_id)
-            if self.is_done():
+            if self.has_reached_max_items():
                 return
 
     def inner_process_dataset(self, item: HarvestItem):
udata/harvest/tests/test_dcat_backend.py CHANGED
@@ -137,6 +137,7 @@ class DcatBackendTest:
         assert datasets["1"].resources[0].description == "A JSON resource"
         assert datasets["1"].resources[0].format == "json"
         assert datasets["1"].resources[0].mime == "application/json"
+        assert datasets["1"].resources[0].type == "main"
 
     @pytest.mark.options(
         SCHEMA_CATALOG_URL="https://example.com/schemas",
@@ -403,6 +404,7 @@ class DcatBackendTest:
         assert len(dataset.resources) == 1
 
         resource = dataset.resources[0]
+        assert resource.type == "main"
         assert resource.checksum is not None
         assert resource.checksum.type == "sha1"
         assert resource.checksum.value == "fb4106aa286a53be44ec99515f0f0421d4d7ad7d"
@@ -476,7 +478,12 @@ class DcatBackendTest:
 
         assert job.status == "done"
         assert job.errors == []
-        assert len(job.items) == 4
+        assert len(job.items) == 5
+        # 4 datasets and one Dataservice mentionned but not described
+        # because it appears in a distribution as DCAT.accessService
+        # but is missing a proper DCT.identifier
+        assert len([item for item in job.items if item.status == "done"]) == 4
+        assert len([item for item in job.items if item.status == "skipped"]) == 1
 
     def test_xml_catalog(self, rmock):
         LicenseFactory(id="lov2", title="Licence Ouverte Version 2.0")
@@ -524,7 +531,7 @@ class DcatBackendTest:
         # test dct:license nested in distribution
         assert dataset.license.id == "lov1"
 
-        assert len(dataset.resources) == 3
+        assert len(dataset.resources) == 4
 
         resource_1 = next(res for res in dataset.resources if res.title == "Resource 1-1")
         assert resource_1.filetype == "remote"
@@ -549,6 +556,16 @@ class DcatBackendTest:
         assert resource_3.url == "http://data.test.org/datasets/1/resources/3"
         assert resource_3.type == "other"
 
+        # Make sure a resource with an accessService is of type api
+        resource_4 = next(res for res in dataset.resources if res.title == "Resource 1-4")
+        assert resource_4.format is None
+        assert resource_4.description == "A resource pointing towards a Geo Service"
+        assert (
+            resource_4.url
+            == "http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&REQUEST=GetCapabilities&VERSION=1.3.0"
+        )
+        assert resource_4.type == "api"
+
         # test dct:rights -> license support from dataset
         dataset = Dataset.objects.get(harvest__dct_identifier="2")
         assert dataset.license.id == "lov2"
@@ -838,6 +855,7 @@ class CswDcatBackendTest:
         assert resource.title == "accidento_hdf_L93"
         assert resource.url == "https://www.geo2france.fr/geoserver/cr_hdf/ows"
         assert resource.format == "ogc:wms"
+        assert resource.type == "main"
 
     def test_user_agent_post(self, rmock):
         url = mock_csw_pagination(rmock, "geonetwork/srv/eng/csw.rdf", "geonetworkv4-page-{}.xml")
@@ -949,6 +967,7 @@ class CswIso19139DcatBackendTest:
             resource.url
             == "http://atom.geo-ide.developpement-durable.gouv.fr/atomArchive/GetResource?id=fr-120066022-ldd-cab63273-b3ae-4e8a-ae1c-6192e45faa94&datasetAggregate=true"
         )
+        assert resource.type == "main"
 
         # Sadly resource format is parsed as a blank node. Format parsing should be improved.
         assert re.match(r"n[0-9a-f]{32}", resource.format)