udata 10.2.1.dev34693__py2.py3-none-any.whl → 10.2.1.dev34761__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (32)
  1. udata/core/dataset/models.py +49 -23
  2. udata/core/dataset/rdf.py +19 -4
  3. udata/harvest/backends/base.py +1 -1
  4. udata/harvest/backends/dcat.py +48 -6
  5. udata/harvest/tests/factories.py +1 -1
  6. udata/harvest/tests/test_base_backend.py +2 -2
  7. udata/migrations/2025-03-20-save-quality-for-datasets.py +25 -0
  8. udata/static/chunks/{10.471164b2a9fe15614797.js → 10.8ca60413647062717b1e.js} +3 -3
  9. udata/static/chunks/{10.471164b2a9fe15614797.js.map → 10.8ca60413647062717b1e.js.map} +1 -1
  10. udata/static/chunks/{11.51d706fb9521c16976bc.js → 11.b6f741fcc366abfad9c4.js} +3 -3
  11. udata/static/chunks/{11.51d706fb9521c16976bc.js.map → 11.b6f741fcc366abfad9c4.js.map} +1 -1
  12. udata/static/chunks/{13.f29411b06be1883356a3.js → 13.2d06442dd9a05d9777b5.js} +2 -2
  13. udata/static/chunks/{13.f29411b06be1883356a3.js.map → 13.2d06442dd9a05d9777b5.js.map} +1 -1
  14. udata/static/chunks/{17.3bd0340930d4a314ce9c.js → 17.e8e4caaad5cb0cc0bacc.js} +2 -2
  15. udata/static/chunks/{17.3bd0340930d4a314ce9c.js.map → 17.e8e4caaad5cb0cc0bacc.js.map} +1 -1
  16. udata/static/chunks/{19.8da42e8359d72afc2618.js → 19.f03a102365af4315f9db.js} +3 -3
  17. udata/static/chunks/{19.8da42e8359d72afc2618.js.map → 19.f03a102365af4315f9db.js.map} +1 -1
  18. udata/static/chunks/{8.54e44b102164ae5e7a67.js → 8.778091d55cd8ea39af6b.js} +2 -2
  19. udata/static/chunks/{8.54e44b102164ae5e7a67.js.map → 8.778091d55cd8ea39af6b.js.map} +1 -1
  20. udata/static/chunks/{9.07515e5187f475bce828.js → 9.033d7e190ca9e226a5d0.js} +3 -3
  21. udata/static/chunks/{9.07515e5187f475bce828.js.map → 9.033d7e190ca9e226a5d0.js.map} +1 -1
  22. udata/static/common.js +1 -1
  23. udata/static/common.js.map +1 -1
  24. udata/tests/apiv2/test_datasets.py +7 -1
  25. udata/tests/dataset/test_dataset_model.py +0 -10
  26. udata/tests/dataset/test_dataset_rdf.py +18 -0
  27. {udata-10.2.1.dev34693.dist-info → udata-10.2.1.dev34761.dist-info}/METADATA +3 -1
  28. {udata-10.2.1.dev34693.dist-info → udata-10.2.1.dev34761.dist-info}/RECORD +32 -31
  29. {udata-10.2.1.dev34693.dist-info → udata-10.2.1.dev34761.dist-info}/LICENSE +0 -0
  30. {udata-10.2.1.dev34693.dist-info → udata-10.2.1.dev34761.dist-info}/WHEEL +0 -0
  31. {udata-10.2.1.dev34693.dist-info → udata-10.2.1.dev34761.dist-info}/entry_points.txt +0 -0
  32. {udata-10.2.1.dev34693.dist-info → udata-10.2.1.dev34761.dist-info}/top_level.txt +0 -0

udata/core/dataset/models.py CHANGED
@@ -384,7 +384,7 @@ class ResourceMixin(object):
             return to_naive_datetime(self.harvest.modified_at)
         if self.filetype == "remote" and self.extras.get("analysis:last-modified-at"):
             return to_naive_datetime(self.extras.get("analysis:last-modified-at"))
-        return self.last_modified_internal
+        return to_naive_datetime(self.last_modified_internal)

     def clean(self):
         super(ResourceMixin, self).clean()

@@ -565,6 +565,8 @@ class Dataset(WithMetrics, DatasetBadgeMixin, Owned, db.Document):
     extras = db.ExtrasField()
     harvest = db.EmbeddedDocumentField(HarvestDatasetMetadata)

+    quality_cached = db.DictField()
+
     featured = db.BooleanField(required=True, default=False)

     contact_points = db.ListField(db.ReferenceField("ContactPoint", reverse_delete_rule=db.PULL))

@@ -672,6 +674,8 @@ class Dataset(WithMetrics, DatasetBadgeMixin, Owned, db.Document):
         if len(set(res.id for res in self.resources)) != len(self.resources):
             raise MongoEngineValidationError(f"Duplicate resource ID in dataset #{self.id}.")

+        self.quality_cached = self.compute_quality()
+
         for key, value in self.extras.items():
             if not key.startswith("custom:"):
                 continue

@@ -763,13 +767,9 @@ class Dataset(WithMetrics, DatasetBadgeMixin, Owned, db.Document):

     @property
     def last_modified(self):
-        if (
-            self.harvest
-            and self.harvest.modified_at
-            and to_naive_datetime(self.harvest.modified_at) < datetime.utcnow()
-        ):
+        if self.harvest and self.harvest.modified_at:
             return to_naive_datetime(self.harvest.modified_at)
-        return self.last_modified_internal
+        return to_naive_datetime(self.last_modified_internal)

     @property
     def last_update(self):

@@ -824,8 +824,34 @@ class Dataset(WithMetrics, DatasetBadgeMixin, Owned, db.Document):
         else:
             return self.last_update + delta

-    @cached_property
+    @property
     def quality(self):
+        # `quality_cached` should always be set, except during the migration
+        # creating this property. We could remove `or self.compute_quality()`
+        # after the migration, but since we need to keep the computed property
+        # for `update_fulfilled_in_time`, we leave it here just in case.
+        quality = self.quality_cached or self.compute_quality()
+
+        # :UpdateFulfilledInTime
+        # `next_update_for_update_fulfilled_in_time` is only useful to compute the
+        # real `update_fulfilled_in_time` check, so we pop it to not pollute the
+        # `quality` object for users.
+        next_update = quality.pop("next_update_for_update_fulfilled_in_time", None)
+        if next_update:
+            # Allow for being one day late on update.
+            # We may have up to one day of delay due to harvesting, for example.
+            quality["update_fulfilled_in_time"] = (next_update - datetime.utcnow()).days >= -1
+        elif self.frequency in ["continuous", "irregular", "punctual"]:
+            # For these frequencies, we don't expect regular updates or can't quantify them.
+            # Thus we consider the update_fulfilled_in_time quality criterion to be true.
+            quality["update_fulfilled_in_time"] = True
+
+        # Since `update_fulfilled_in_time` cannot be precomputed, `score` cannot be either.
+        quality["score"] = self.compute_quality_score(quality)
+
+        return quality
+
+    def compute_quality(self):
         """Return a dict filled with metrics related to the inner

         quality of the dataset:

@@ -835,25 +861,18 @@ class Dataset(WithMetrics, DatasetBadgeMixin, Owned, db.Document):
         * and so on
         """
         result = {}
-        if not self.id:
-            # Quality is only relevant on saved Datasets
-            return result

         result["license"] = True if self.license else False
         result["temporal_coverage"] = True if self.temporal_coverage else False
         result["spatial"] = True if self.spatial else False

         result["update_frequency"] = self.frequency and self.frequency != "unknown"
-        if self.next_update:
-            # Allow for being one day late on update.
-            # We may have up to one day delay due to harvesting for example
-            result["update_fulfilled_in_time"] = (
-                True if (self.next_update - datetime.utcnow()).days >= -1 else False
-            )
-        elif self.frequency in ["continuous", "irregular", "punctual"]:
-            # For these frequencies, we don't expect regular updates or can't quantify them.
-            # Thus we consider the update_fulfilled_in_time quality criterion to be true.
-            result["update_fulfilled_in_time"] = True
+
+        # We only save the next_update here because it is based on resources.
+        # We cannot save `update_fulfilled_in_time` because it is time sensitive
+        # (so setting it on save is not really useful…)
+        # See :UpdateFulfilledInTime
+        result["next_update_for_update_fulfilled_in_time"] = self.next_update

         result["dataset_description_quality"] = (
             True

@@ -876,7 +895,6 @@ class Dataset(WithMetrics, DatasetBadgeMixin, Owned, db.Document):
                 resource_desc = True
         result["resources_documentation"] = resource_doc or resource_desc

-        result["score"] = self.compute_quality_score(result)
         return result

     @property

@@ -934,8 +952,16 @@ class Dataset(WithMetrics, DatasetBadgeMixin, Owned, db.Document):
         if resource.id in [r.id for r in self.resources]:
             raise MongoEngineValidationError("Cannot add resource with already existing ID")

+        self.resources.insert(0, resource)
         self.update(
-            __raw__={"$push": {"resources": {"$each": [resource.to_mongo()], "$position": 0}}}
+            __raw__={
+                "$set": {
+                    "quality_cached": self.compute_quality(),
+                },
+                "$push": {
+                    "resources": {"$each": [resource.to_mongo()], "$position": 0},
+                },
+            }
         )
         self.reload()
         self.on_resource_added.send(self.__class__, document=self, resource_id=resource.id)
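
The models.py changes above split quality computation in two: time-insensitive criteria are computed by `compute_quality()` and stored in `quality_cached` at save time, while the time-sensitive `update_fulfilled_in_time` check and the score are resolved whenever the `quality` property is read. Below is a minimal standalone sketch of that pattern, with simplified names and a placeholder score formula instead of `compute_quality_score`; it does not use udata's actual models.

```python
from datetime import datetime, timedelta


def compute_quality(next_update, has_license, has_description):
    """Precomputable part: what would be stored in `quality_cached` on save."""
    return {
        "license": has_license,
        "dataset_description_quality": has_description,
        # Only the date is stored; the boolean check is time sensitive.
        "next_update_for_update_fulfilled_in_time": next_update,
    }


def quality(cached, frequency):
    """Read-time part: resolves the time-sensitive criterion and the score."""
    result = dict(cached)
    next_update = result.pop("next_update_for_update_fulfilled_in_time", None)
    if next_update:
        # One day of tolerance, e.g. to absorb harvesting delays.
        result["update_fulfilled_in_time"] = (next_update - datetime.utcnow()).days >= -1
    elif frequency in ("continuous", "irregular", "punctual"):
        result["update_fulfilled_in_time"] = True
    # Placeholder score: share of criteria that are met.
    result["score"] = sum(1 for v in result.values() if v is True) / len(result)
    return result


cached = compute_quality(datetime.utcnow() + timedelta(days=3), True, False)
print(quality(cached, frequency="daily"))  # update_fulfilled_in_time: True
```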
udata/core/dataset/rdf.py CHANGED
@@ -5,7 +5,7 @@ This module centralize dataset helpers for RDF/DCAT serialization and parsing
 import calendar
 import json
 import logging
-from datetime import date
+from datetime import date, datetime
 from typing import Optional

 from dateutil.parser import parse as parse_dt

@@ -50,7 +50,7 @@ from udata.rdf import (
     url_from_rdf,
 )
 from udata.uris import endpoint_for
-from udata.utils import get_by, safe_unicode
+from udata.utils import get_by, safe_unicode, to_naive_datetime

 from .constants import OGC_SERVICE_FORMATS, UPDATE_FREQUENCIES
 from .models import Checksum, Dataset, License, Resource

@@ -735,7 +735,14 @@ def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False):
     if not resource.harvest:
         resource.harvest = HarvestResourceMetadata()
     resource.harvest.created_at = created_at
-    resource.harvest.modified_at = modified_at
+
+    # In the past, we've encountered future `modified_at` dates during harvesting;
+    # do not save them. :FutureHarvestModifiedAt
+    if modified_at and to_naive_datetime(modified_at) > datetime.utcnow():
+        log.warning(f"Future `DCT.modified` date '{modified_at}' in resource")
+    else:
+        resource.harvest.modified_at = modified_at
+
     resource.harvest.dct_identifier = identifier
     resource.harvest.uri = uri

@@ -755,6 +762,8 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None, remote_url_prefix: s

     dataset.title = rdf_value(d, DCT.title)
     if not dataset.title:
+        # If the dataset is externally defined (without a title, just with a link to the dataset XML),
+        # we should have skipped it way before in :ExcludeExternalyDefinedDataset
         raise HarvestSkipException("missing title on dataset")

     # Support dct:abstract if dct:description is missing (sometimes used instead)

@@ -834,7 +843,13 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None, remote_url_prefix: s
     dataset.harvest.uri = uri
     dataset.harvest.remote_url = remote_url
     dataset.harvest.created_at = created_at
-    dataset.harvest.modified_at = modified_at
+
+    # In the past, we've encountered future `modified_at` dates during harvesting;
+    # do not save them. :FutureHarvestModifiedAt
+    if modified_at and to_naive_datetime(modified_at) > datetime.utcnow():
+        log.warning(f"Future `DCT.modified` date '{modified_at}' in dataset")
+    else:
+        dataset.harvest.modified_at = modified_at

     return dataset

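The rdf.py changes above refuse to store a `DCT.modified` date that lies in the future (:FutureHarvestModifiedAt). Here is a small sketch of the same guard, assuming a simplified `to_naive_datetime` that converts aware datetimes to UTC and drops the timezone; udata's actual helper may behave differently.

```python
from datetime import datetime, timezone
from typing import Optional

from dateutil.parser import parse as parse_dt


def to_naive_utc(value: datetime) -> datetime:
    """Simplified stand-in for udata's `to_naive_datetime` (assumption)."""
    if value.tzinfo is not None:
        value = value.astimezone(timezone.utc).replace(tzinfo=None)
    return value


def safe_modified_at(raw: str) -> Optional[datetime]:
    """Return the parsed modification date, or None when it lies in the future."""
    modified_at = parse_dt(raw)
    if to_naive_utc(modified_at) > datetime.utcnow():
        # The harvester logs a warning and leaves `harvest.modified_at` unset.
        return None
    return modified_at


print(safe_modified_at("2020-01-01T00:00:00+02:00"))  # kept
print(safe_modified_at("2999-01-01"))                 # dropped: future date
```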
 
udata/harvest/backends/base.py CHANGED
@@ -256,7 +256,7 @@ class BaseBackend(object):
         ]
         self.save_job()

-    def is_done(self) -> bool:
+    def has_reached_max_items(self) -> bool:
         """Should be called after process_dataset to know if we reach the max items"""
         return self.max_items and len(self.job.items) >= self.max_items

udata/harvest/backends/dcat.py CHANGED
@@ -9,7 +9,7 @@ from rdflib.namespace import RDF

 from udata.core.dataservices.rdf import dataservice_from_rdf
 from udata.core.dataset.rdf import dataset_from_rdf
-from udata.harvest.models import HarvestItem
+from udata.harvest.models import HarvestError, HarvestItem
 from udata.i18n import gettext as _
 from udata.rdf import (
     DCAT,

@@ -18,6 +18,7 @@ from udata.rdf import (
     SPDX,
     guess_format,
     namespace_manager,
+    rdf_value,
     url_from_rdf,
 )
 from udata.storage.s3 import store_as_json

@@ -77,9 +78,19 @@ class DcatBackend(BaseBackend):
             self.process_one_datasets_page(page_number, page)
             serialized_graphs.append(page.serialize(format=fmt, indent=None))

+        # We do a second pass so that all datasets are in memory when we attach
+        # datasets to dataservices. It could be better to do one pass of graph
+        # walking and then one pass of attaching datasets to dataservices.
         for page_number, page in self.walk_graph(self.source.url, fmt):
             self.process_one_dataservices_page(page_number, page)

+        if not self.dryrun and self.has_reached_max_items():
+            # We have reached the max_items limit. Warn the user that not all datasets may be present.
+            error = HarvestError(
+                message=f"{self.max_items} max items reached, not all datasets/dataservices were retrieved"
+            )
+            self.job.errors.append(error)
+
         # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000).
         max_harvest_graph_size_in_mongo = current_app.config.get(
             "HARVEST_MAX_CATALOG_SIZE_IN_MONGO"

@@ -146,7 +157,7 @@ class DcatBackend(BaseBackend):
                 break

             yield page_number, subgraph
-            if self.is_done():
+            if self.has_reached_max_items():
                 return

             page_number += 1

@@ -154,17 +165,48 @@
     def process_one_datasets_page(self, page_number: int, page: Graph):
         for node in page.subjects(RDF.type, DCAT.Dataset):
             remote_id = page.value(node, DCT.identifier)
+            if self.is_dataset_external_to_this_page(page, node):
+                continue
+
             self.process_dataset(remote_id, page_number=page_number, page=page, node=node)

-            if self.is_done():
+            if self.has_reached_max_items():
                 return

+    def is_dataset_external_to_this_page(self, page: Graph, node) -> bool:
+        # Dataservice nodes can have `servesDataset` or `hasPart` properties containing nodes
+        # with type=dataset. We don't want to process those nodes because they are empty (they
+        # only contain a link to the dataset definition).
+        # These datasets are either present in the catalog on previous or next pages, or
+        # external to the catalog we are currently harvesting (so we don't want to harvest them).
+        # At first we thought of skipping them inside `dataset_from_rdf` (see :ExcludeExternalyDefinedDataset),
+        # but that creates a lot of "fake" items in the job and raises problems (for example reaching the
+        # max harvest items and never getting to the "real" datasets/dataservices in subsequent pages).
+        # So, to avoid creating a lot of useless items in the job, we first thought about checking whether
+        # the title is missing and `isPrimaryTopicOf` is present. But it may be better to check whether the
+        # only link between the node and the current page is a `servesDataset` or `hasPart`. If that is the
+        # case, the node is only referenced from a dataservice. (Maybe we could also check that the
+        # `_other_node` is a dataservice?)
+        # `isPrimaryTopicOf` is the tag present in the first harvester that raised the problem; other
+        # values of the same sort may exist that we would need to check here.
+
+        # This is not dangerous because we check for a missing title in `dataset_from_rdf` later, so we
+        # would have skipped this dataset anyway.
+        resource = page.resource(node)
+        title = rdf_value(resource, DCT.title)
+        if title:
+            return False
+
+        predicates = [link_type for (_other_node, link_type) in page.subject_predicates(node)]
+        return len(predicates) == 1 and (
+            predicates[0] == DCAT.servesDataset or predicates[0] == DCT.hasPart
+        )
+
     def process_one_dataservices_page(self, page_number: int, page: Graph):
         for node in page.subjects(RDF.type, DCAT.DataService):
             remote_id = page.value(node, DCT.identifier)
             self.process_dataservice(remote_id, page_number=page_number, page=page, node=node)

-            if self.is_done():
+            if self.has_reached_max_items():
                 return

     def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
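
The `is_dataset_external_to_this_page` check above skips dataset nodes whose only link to the current page is a `dcat:servesDataset` or `dct:hasPart` coming from a dataservice, since those nodes carry no real content. The sketch below exercises the same logic with plain rdflib; `rdf_value` is replaced by `Graph.value`, and the URIs are hypothetical.

```python
from rdflib import BNode, Graph, Literal, Namespace, URIRef
from rdflib.namespace import DCTERMS, RDF

DCAT = Namespace("http://www.w3.org/ns/dcat#")

page = Graph()
service = URIRef("https://example.org/dataservice/1")  # hypothetical URI
external = BNode()  # empty dataset node, only referenced by the dataservice
titled = URIRef("https://example.org/dataset/42")       # hypothetical URI

page.add((service, RDF.type, DCAT.DataService))
page.add((service, DCAT.servesDataset, external))
page.add((external, RDF.type, DCAT.Dataset))
page.add((service, DCAT.servesDataset, titled))
page.add((titled, RDF.type, DCAT.Dataset))
page.add((titled, DCTERMS.title, Literal("A real catalog dataset")))


def is_dataset_external_to_this_page(page: Graph, node) -> bool:
    # A titled dataset is a real entry of the page being harvested.
    if page.value(node, DCTERMS.title):
        return False
    # Otherwise, skip nodes whose only incoming link is servesDataset/hasPart.
    predicates = [p for (_subject, p) in page.subject_predicates(node)]
    return len(predicates) == 1 and predicates[0] in (DCAT.servesDataset, DCTERMS.hasPart)


print(is_dataset_external_to_this_page(page, external))  # True: skipped
print(is_dataset_external_to_this_page(page, titled))    # False: harvested
```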

@@ -266,7 +308,7 @@ class CswDcatBackend(DcatBackend):
             subgraph.parse(data=ET.tostring(child), format=fmt)

             yield page_number, subgraph
-            if self.is_done():
+            if self.has_reached_max_items():
                 return

             next_record = self.next_record_if_should_continue(start, search_results)

@@ -375,7 +417,7 @@ class CswIso19139DcatBackend(DcatBackend):
                 raise ValueError("Failed to fetch CSW content")

             yield page_number, subgraph
-            if self.is_done():
+            if self.has_reached_max_items():
                 return

             next_record = self.next_record_if_should_continue(start, search_results)

udata/harvest/tests/factories.py CHANGED
@@ -61,7 +61,7 @@ class FactoryBackend(backends.BaseBackend):
         mock_initialize.send(self)
         for i in range(self.config.get("count", DEFAULT_COUNT)):
             self.process_dataset(str(i))
-            if self.is_done():
+            if self.has_reached_max_items():
                 return

     def inner_process_dataset(self, item: HarvestItem):

udata/harvest/tests/test_base_backend.py CHANGED
@@ -44,12 +44,12 @@ class FakeBackend(BaseBackend):
     def inner_harvest(self):
         for remote_id in self.source.config.get("dataset_remote_ids", []):
             self.process_dataset(remote_id)
-            if self.is_done():
+            if self.has_reached_max_items():
                 return

         for remote_id in self.source.config.get("dataservice_remote_ids", []):
             self.process_dataservice(remote_id)
-            if self.is_done():
+            if self.has_reached_max_items():
                 return

     def inner_process_dataset(self, item: HarvestItem):

udata/migrations/2025-03-20-save-quality-for-datasets.py ADDED
@@ -0,0 +1,25 @@
+"""
+This migration re-saves every dataset so that the new `quality_cached` field is populated.
+"""
+
+import logging
+
+import click
+
+from udata.core.dataset.models import Dataset
+
+log = logging.getLogger(__name__)
+
+
+def migrate(db):
+    log.info("Saving all datasets")
+
+    count = Dataset.objects().count()
+    with click.progressbar(Dataset.objects(), length=count) as datasets:
+        for dataset in datasets:
+            try:
+                dataset.save()
+            except Exception as err:
+                log.error(f"Cannot save dataset {dataset.id} {err}")
+
+    log.info("Done")