udata-10.0.7.dev33415-py2.py3-none-any.whl → udata-10.0.7.dev33440-py2.py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release: this version of udata might be problematic.

udata/harvest/backends/base.py

@@ -7,6 +7,7 @@ import requests
 from flask import current_app
 from voluptuous import MultipleInvalid, RequiredFieldInvalid
 
+import udata.uris as uris
 from udata.core.dataservices.models import Dataservice
 from udata.core.dataservices.models import HarvestMetadata as HarvestDataserviceMetadata
 from udata.core.dataset.models import HarvestDatasetMetadata
@@ -388,15 +389,19 @@ class BaseBackend(object):
         """Get or create a dataset given its remote ID (and its source)
         We first try to match `source_id` to be source domain independent
         """
-        dataset = Dataset.objects(
-            __raw__={
-                "harvest.remote_id": remote_id,
-                "$or": [
-                    {"harvest.domain": self.source.domain},
-                    {"harvest.source_id": str(self.source.id)},
-                ],
-            }
-        ).first()
+        try:
+            uris.validate(remote_id)
+            dataset = Dataset.objects(harvest__remote_id=remote_id).first()
+        except uris.ValidationError:
+            dataset = Dataset.objects(
+                __raw__={
+                    "harvest.remote_id": remote_id,
+                    "$or": [
+                        {"harvest.domain": self.source.domain},
+                        {"harvest.source_id": str(self.source.id)},
+                    ],
+                }
+            ).first()
 
         if dataset:
             return dataset
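
In short: a remote_id that validates as a URI is now treated as globally unique, so the lookup matches on harvest.remote_id alone; any other remote_id falls back to the old query scoped to the source's domain or id. A minimal standalone sketch of the rule, using urllib.parse in place of udata.uris (the is_uri and lookup_filter helpers are hypothetical, for illustration only):

from urllib.parse import urlparse

def is_uri(remote_id: str) -> bool:
    # Rough stand-in for udata.uris.validate, which raises
    # uris.ValidationError instead of returning False.
    parts = urlparse(remote_id)
    return bool(parts.scheme and parts.netloc)

def lookup_filter(remote_id: str, domain: str, source_id: str) -> dict:
    if is_uri(remote_id):
        # URIs are assumed globally unique: match across all sources.
        return {"harvest.remote_id": remote_id}
    # Opaque ids only match within the same domain or source.
    return {
        "harvest.remote_id": remote_id,
        "$or": [{"harvest.domain": domain}, {"harvest.source_id": source_id}],
    }

assert is_uri("http://example.com/duplicated_remote_id_uri")
assert not is_uri("fake-0")
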
udata/harvest/tests/test_base_backend.py

@@ -20,6 +20,11 @@ class Unknown:
     pass
 
 
+def gen_remote_IDs(num: int) -> list[str]:
+    """Generate remote IDs."""
+    return [f"fake-{i}" for i in range(num)]
+
+
 class FakeBackend(BaseBackend):
     filters = (
         HarvestFilter("First filter", "first", str),
@@ -35,8 +40,7 @@ class FakeBackend(BaseBackend):
     )
 
     def inner_harvest(self):
-        for i in range(self.source.config.get("nb_datasets", 3)):
-            remote_id = f"fake-{i}"
+        for remote_id in self.source.config.get("dataset_remote_ids", []):
             self.process_dataset(remote_id)
             if self.is_done():
                 return
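
Tests now pass the exact ids to harvest instead of a count, so a test can mix opaque ids with URIs. A hypothetical usage sketch based on the factories already used in this file (the example URL is made up):

# Drive the fake backend with explicit remote ids, mixing opaque
# "fake-N" ids with a URI to exercise both lookup paths.
remote_ids = gen_remote_IDs(3) + ["http://example.com/some-dataset"]
source = HarvestSourceFactory(config={"dataset_remote_ids": remote_ids})
backend = FakeBackend(source)
job = backend.harvest()  # processes exactly the four ids above
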
@@ -45,7 +49,8 @@
         dataset = self.get_dataset(item.remote_id)
 
         for key, value in DatasetFactory.as_dict(visible=True).items():
-            setattr(dataset, key, value)
+            if getattr(dataset, key) is None:
+                setattr(dataset, key, value)
         if self.source.config.get("last_modified"):
             dataset.last_modified_internal = self.source.config["last_modified"]
         return dataset
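
Filling only attributes that are still None keeps whatever a reused dataset already carries, which is what the deduplication test below relies on. The pattern in isolation (a self-contained toy example, not udata code):

class Toy:
    title = None
    description = "kept"

toy = Toy()
for key, value in {"title": "filled", "description": "ignored"}.items():
    if getattr(toy, key) is None:  # only fill attributes nobody set yet
        setattr(toy, key, value)

assert toy.title == "filled"      # was None, so the default applies
assert toy.description == "kept"  # pre-existing value wins
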
@@ -76,7 +81,7 @@ class BaseBackendTest:
     def test_simple_harvest(self):
         now = datetime.utcnow()
         nb_datasets = 3
-        source = HarvestSourceFactory(config={"nb_datasets": nb_datasets})
+        source = HarvestSourceFactory(config={"dataset_remote_ids": gen_remote_IDs(nb_datasets)})
         backend = FakeBackend(source)
 
         job = backend.harvest()
@@ -155,7 +160,7 @@ class BaseBackendTest:
 
     def test_harvest_source_id(self):
         nb_datasets = 3
-        source = HarvestSourceFactory(config={"nb_datasets": nb_datasets})
+        source = HarvestSourceFactory(config={"dataset_remote_ids": gen_remote_IDs(nb_datasets)})
         backend = FakeBackend(source)
 
         job = backend.harvest()
@@ -176,7 +181,9 @@
 
     def test_dont_overwrite_last_modified(self, mocker):
         last_modified = faker.date_time_between(start_date="-30y", end_date="-1y")
-        source = HarvestSourceFactory(config={"nb_datasets": 1, "last_modified": last_modified})
+        source = HarvestSourceFactory(
+            config={"dataset_remote_ids": gen_remote_IDs(1), "last_modified": last_modified}
+        )
         backend = FakeBackend(source)
 
         backend.harvest()
@@ -188,7 +195,9 @@
 
     def test_dont_overwrite_last_modified_even_if_set_to_same(self, mocker):
         last_modified = faker.date_time_between(start_date="-30y", end_date="-1y")
-        source = HarvestSourceFactory(config={"nb_datasets": 1, "last_modified": last_modified})
+        source = HarvestSourceFactory(
+            config={"dataset_remote_ids": gen_remote_IDs(1), "last_modified": last_modified}
+        )
         backend = FakeBackend(source)
 
         backend.harvest()
@@ -201,7 +210,7 @@
 
     def test_autoarchive(self, app):
         nb_datasets = 3
-        source = HarvestSourceFactory(config={"nb_datasets": nb_datasets})
+        source = HarvestSourceFactory(config={"dataset_remote_ids": gen_remote_IDs(nb_datasets)})
         backend = FakeBackend(source)
 
         # create a dangling dataset to be archived
@@ -263,7 +272,7 @@
 
     def test_harvest_datasets_get_deleted(self):
         nb_datasets = 3
-        source = HarvestSourceFactory(config={"nb_datasets": nb_datasets})
+        source = HarvestSourceFactory(config={"dataset_remote_ids": gen_remote_IDs(nb_datasets)})
         backend = FakeBackend(source)
 
         job = backend.harvest()
@@ -281,6 +290,69 @@
         for item in job.items:
             assert item.dataset is None
 
+    def test_no_datasets_duplication(self, app):
+        duplicated_remote_id_uri = "http://example.com/duplicated_remote_id_uri"
+        nb_datasets = 3
+        remote_ids = gen_remote_IDs(nb_datasets)
+        remote_ids.append(duplicated_remote_id_uri)
+        source = HarvestSourceFactory(config={"dataset_remote_ids": remote_ids})
+        backend = FakeBackend(source)
+
+        # Create a dataset that should be reused by the harvest, which will update it
+        # instead of creating a new one, as it has the same remote_id, domain and source_id.
+        dataset_reused = DatasetFactory(
+            title="Reused Dataset",
+            harvest={
+                "domain": source.domain,
+                "remote_id": "fake-0",  # the FakeBackend harvest should reuse this dataset
+                "source_id": str(source.id),
+            },
+        )
+
+        # Create a dataset that should be reused even though it has a different `source_id`
+        # and `domain`, because the remote_id is the same and is a URI.
+        dataset_reused_uri = DatasetFactory(
+            title="Reused Dataset with URI",
+            harvest={
+                "domain": "some-other-domain",
+                # the FakeBackend harvest above should reuse this dataset with the same remote_id URI
+                "remote_id": duplicated_remote_id_uri,
+                "source_id": "some-other-source-id",
+            },
+        )
+
+        # Create a dataset that should not be reused even though it has the same `remote_id`,
+        # as it's not a URI, and it has a different domain and source id.
+        dataset_not_reused = DatasetFactory(
+            title="Duplicated Dataset",
+            harvest={
+                "domain": "some-other-domain",
+                "remote_id": "fake-0",  # the "source" harvest above should create another dataset with the same remote_id
+                "source_id": "some-other-source-id",
+            },
+        )
+
+        job = backend.harvest()
+
+        # 3 (nb_datasets) + 1 (duplicated_remote_id_uri) harvested from the source config
+        assert len(job.items) == nb_datasets + 1
+        # all datasets: 4 harvested (3 nb_datasets + 1 duplicated_remote_id_uri) + 3 created with DatasetFactory - 2 reused
+        assert Dataset.objects.count() == nb_datasets + 1 + 3 - 2
+        assert (
+            # and not 3: dataset_reused was not duplicated
+            Dataset.objects(harvest__remote_id="fake-0").count() == 2
+        )
+        # The dataset not reused wasn't overwritten nor updated by the harvest.
+        dataset_not_reused.reload()
+        assert dataset_not_reused.harvest.domain == "some-other-domain"
+        assert dataset_not_reused.harvest.source_id == "some-other-source-id"
+        # The "reused dataset" was overwritten and updated by the harvest.
+        dataset_reused.reload()
+        # The "reused dataset with URI" was overwritten and updated by the harvest.
+        dataset_reused_uri.reload()
+        assert dataset_reused_uri.harvest.domain == source.domain
+        assert dataset_reused_uri.harvest.source_id == str(source.id)
+
 
 @pytest.mark.usefixtures("clean_db")
 class BaseBackendValidateTest:
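
The expected counts in the asserts above work out as follows (a worked check, numbers taken from the test itself):

# Worked check of the count asserts in test_no_datasets_duplication:
harvested = 3 + 1  # three "fake-N" ids plus the duplicated URI in the config
pre_existing = 3   # dataset_reused, dataset_reused_uri, dataset_not_reused
reused = 2         # dataset_reused (scoped match) and dataset_reused_uri (URI match)
assert harvested + pre_existing - reused == 5  # == Dataset.objects.count()
# "fake-0" appears twice: dataset_reused (updated in place) and dataset_not_reused.
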
udata-10.0.7.dev33440.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: udata
-Version: 10.0.7.dev33415
+Version: 10.0.7.dev33440
 Summary: Open data portal
 Home-page: https://github.com/opendatateam/udata
 Author: Opendata Team
@@ -146,6 +146,7 @@ It is collectively taken care of by members of the
 - fix `function_field` wrong name [#3236](https://github.com/opendatateam/udata/pull/3236)
 - Allow dataservice archive [#3238](https://github.com/opendatateam/udata/pull/3238)
 - Improve OGC service detection to expose in RDF [#3241](https://github.com/opendatateam/udata/pull/3241)
+- Don't duplicate datasets with a harvest.remote_id URI when harvesting [#3219](https://github.com/opendatateam/udata/pull/3219)
 
 ## 10.0.6 (2024-12-19)
 
@@ -157,6 +158,7 @@ It is collectively taken care of by members of the
 - Fix failing dataset save in update reuses metrics [#3230](https://github.com/opendatateam/udata/pull/3230)
 - Fix catalog RDF by preventing memory increase on getting dataservice hvd tags [#3231](https://github.com/opendatateam/udata/pull/3231)
 - Update purge tasks [#3167](https://github.com/opendatateam/udata/pull/3167)
+- Trigger GitLab infra deployment through simple-scaffolding script [#3232](https://github.com/opendatateam/udata/pull/3232)
 
 ## 10.0.5 (2024-12-09)
 
udata-10.0.7.dev33440.dist-info/RECORD

@@ -291,13 +291,13 @@ udata/harvest/notifications.py,sha256=8WkHtD68v6rfZC6jCmAuuuRp4NN6q71ZkksZU5m9oJ
 udata/harvest/signals.py,sha256=3AhFHMPIFH5vz01NX5ycR_RWH14MXFWnCT6__LSa-QI,1338
 udata/harvest/tasks.py,sha256=ddJtvE0s-kAYt27-rKH6n8U8vKD8qKczlJtdBJzMUns,1718
 udata/harvest/backends/__init__.py,sha256=QjoFfBJfpw_xgk5YYWI1SgKJOMEmTMlxSfW79GNkSTI,459
-udata/harvest/backends/base.py,sha256=gq39-0SN3DSXfXIv4qjhIkEmPkV-2g6z53V0FikooSs,16413
+udata/harvest/backends/base.py,sha256=K-pfjYc7GiAQFprO-2Wknq3cmlEPImCIHmWdFK2yGVM,16638
 udata/harvest/backends/dcat.py,sha256=wbznPWCS9UJzpCWPAnmeUhUCREJg-yoQAHtEfr88RqU,15963
 udata/harvest/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 udata/harvest/tests/factories.py,sha256=Jb_RwsYogWhquG8aj2bzT3JLySq3530OmZV9cofeWlE,2269
 udata/harvest/tests/test_actions.py,sha256=s5yK4YQXVvrbLgKMx8_-ifFBC5QmUYBJWvkfNAZuy_Y,26504
 udata/harvest/tests/test_api.py,sha256=pCIoBCWZYr1IKgGARMe5fZArTsTnFQXUy5ur11vs3ho,20363
-udata/harvest/tests/test_base_backend.py,sha256=Rkr6P-rHJ2OWuyEkFPeEXwnYvI5C0B5M6SkUEUss2N8,13056
+udata/harvest/tests/test_base_backend.py,sha256=DrBwIr6uMweQ6S9oCpBS9tS54VlIcHMdPswGXAIDz9c,16446
 udata/harvest/tests/test_dcat_backend.py,sha256=IauCoeSylEH-Wyk3oXdC7sIk5bEEdnqBBxOiMJyIrSc,39877
 udata/harvest/tests/test_filters.py,sha256=2nXGnLTlM0iSOZBDBxmJp1KO3_59RXLS0vmXvEs2-KE,2468
 udata/harvest/tests/test_models.py,sha256=f9NRR2_S4oZFgF8qOumg0vv-lpnEBJbI5vNtcwFdSqM,831
@@ -710,9 +710,9 @@ udata/translations/pt/LC_MESSAGES/udata.mo,sha256=18Y5YtzVKInDejw-R-45HNzsB3OVwJ
 udata/translations/pt/LC_MESSAGES/udata.po,sha256=6IQvFk0NTDV5Jq-kLkkzpioWfrMaLDa1oQSevKFbxKQ,44943
 udata/translations/sr/LC_MESSAGES/udata.mo,sha256=O4zKHNkiX-2GUfLLa0kwbxIA5M1jxiqkHzaMh1t2wKs,29169
 udata/translations/sr/LC_MESSAGES/udata.po,sha256=W9C447pW0O-Q28ji8wGLgPNrnqlPYXaMD0AOWJPcpZc,51918
-udata-10.0.7.dev33415.dist-info/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
-udata-10.0.7.dev33415.dist-info/METADATA,sha256=Cu_0UIVNWzR3xiGS41CezB2EBc9dKYkbBkd0Ikd6704,138651
-udata-10.0.7.dev33415.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
-udata-10.0.7.dev33415.dist-info/entry_points.txt,sha256=3SKiqVy4HUqxf6iWspgMqH8d88Htk6KoLbG1BU-UddQ,451
-udata-10.0.7.dev33415.dist-info/top_level.txt,sha256=39OCg-VWFWOq4gCKnjKNu-s3OwFlZIu_dVH8Gl6ndHw,12
-udata-10.0.7.dev33415.dist-info/RECORD,,
+udata-10.0.7.dev33440.dist-info/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
+udata-10.0.7.dev33440.dist-info/METADATA,sha256=2fFeFMw--Coc91igNCZY4oslE5pF3ND5JTlLHLIMkHg,138907
+udata-10.0.7.dev33440.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
+udata-10.0.7.dev33440.dist-info/entry_points.txt,sha256=3SKiqVy4HUqxf6iWspgMqH8d88Htk6KoLbG1BU-UddQ,451
+udata-10.0.7.dev33440.dist-info/top_level.txt,sha256=39OCg-VWFWOq4gCKnjKNu-s3OwFlZIu_dVH8Gl6ndHw,12
+udata-10.0.7.dev33440.dist-info/RECORD,,