udata 10.0.7.dev33415__py2.py3-none-any.whl → 10.0.7.dev33440__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- udata/harvest/backends/base.py +14 -9
- udata/harvest/tests/test_base_backend.py +81 -9
- {udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/METADATA +3 -1
- {udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/RECORD +8 -8
- {udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/LICENSE +0 -0
- {udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/WHEEL +0 -0
- {udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/entry_points.txt +0 -0
- {udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/top_level.txt +0 -0
udata/harvest/backends/base.py
CHANGED
@@ -7,6 +7,7 @@ import requests
 from flask import current_app
 from voluptuous import MultipleInvalid, RequiredFieldInvalid
 
+import udata.uris as uris
 from udata.core.dataservices.models import Dataservice
 from udata.core.dataservices.models import HarvestMetadata as HarvestDataserviceMetadata
 from udata.core.dataset.models import HarvestDatasetMetadata
@@ -388,15 +389,19 @@ class BaseBackend(object):
         """Get or create a dataset given its remote ID (and its source)
         We first try to match `source_id` to be source domain independent
         """
-        dataset = Dataset.objects(
-            __raw__={
-                "harvest.remote_id": remote_id,
-                "$or": [
-                    {"harvest.domain": self.source.domain},
-                    {"harvest.source_id": str(self.source.id)},
-                ],
-            }
-        ).first()
+        try:
+            uris.validate(remote_id)
+            dataset = Dataset.objects(harvest__remote_id=remote_id).first()
+        except uris.ValidationError:
+            dataset = Dataset.objects(
+                __raw__={
+                    "harvest.remote_id": remote_id,
+                    "$or": [
+                        {"harvest.domain": self.source.domain},
+                        {"harvest.source_id": str(self.source.id)},
+                    ],
+                }
+            ).first()
 
         if dataset:
             return dataset
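In short: a remote ID that validates as a URI is now treated as globally unique and matched across all harvest sources, while any other remote ID keeps the legacy lookup scoped to the source's domain or id. A minimal sketch of that dispatch, assuming an installed and configured udata app; only `uris.validate` and `uris.ValidationError` come from the change itself, and `lookup_scope` is a hypothetical helper name used here for illustration:

import udata.uris as uris

def lookup_scope(remote_id: str) -> str:
    """Which match strategy the new get_dataset() picks for a remote ID."""
    try:
        uris.validate(remote_id)  # raises uris.ValidationError on non-URIs
        return "global"   # URI: match harvest.remote_id across every source
    except uris.ValidationError:
        return "scoped"   # plain ID: also require the same domain or source_id

assert lookup_scope("http://example.com/dataset/42") == "global"
assert lookup_scope("fake-0") == "scoped"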
udata/harvest/tests/test_base_backend.py
CHANGED
@@ -20,6 +20,11 @@ class Unknown:
     pass
 
 
+def gen_remote_IDs(num: int) -> list[str]:
+    """Generate remote IDs."""
+    return [f"fake-{i}" for i in range(num)]
+
+
 class FakeBackend(BaseBackend):
     filters = (
         HarvestFilter("First filter", "first", str),
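For reference, the helper's output is fully predictable, which is what lets the tests below pin exact counts; e.g.:

>>> gen_remote_IDs(3)
['fake-0', 'fake-1', 'fake-2']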
@@ -35,8 +40,7 @@ class FakeBackend(BaseBackend):
     )
 
     def inner_harvest(self):
-        for
-            remote_id = f"fake-{i}"
+        for remote_id in self.source.config.get("dataset_remote_ids", []):
             self.process_dataset(remote_id)
             if self.is_done():
                 return
@@ -45,7 +49,8 @@
         dataset = self.get_dataset(item.remote_id)
 
         for key, value in DatasetFactory.as_dict(visible=True).items():
-            setattr(dataset, key, value)
+            if getattr(dataset, key) is None:
+                setattr(dataset, key, value)
         if self.source.config.get("last_modified"):
             dataset.last_modified_internal = self.source.config["last_modified"]
         return dataset
@@ -76,7 +81,7 @@ class BaseBackendTest:
     def test_simple_harvest(self):
         now = datetime.utcnow()
         nb_datasets = 3
-        source = HarvestSourceFactory(config={"
+        source = HarvestSourceFactory(config={"dataset_remote_ids": gen_remote_IDs(nb_datasets)})
         backend = FakeBackend(source)
 
         job = backend.harvest()
@@ -155,7 +160,7 @@ class BaseBackendTest:
 
     def test_harvest_source_id(self):
         nb_datasets = 3
-        source = HarvestSourceFactory(config={"
+        source = HarvestSourceFactory(config={"dataset_remote_ids": gen_remote_IDs(nb_datasets)})
         backend = FakeBackend(source)
 
         job = backend.harvest()
@@ -176,7 +181,9 @@ class BaseBackendTest:
 
     def test_dont_overwrite_last_modified(self, mocker):
         last_modified = faker.date_time_between(start_date="-30y", end_date="-1y")
-        source = HarvestSourceFactory(
+        source = HarvestSourceFactory(
+            config={"dataset_remote_ids": gen_remote_IDs(1), "last_modified": last_modified}
+        )
         backend = FakeBackend(source)
 
         backend.harvest()
@@ -188,7 +195,9 @@ class BaseBackendTest:
 
     def test_dont_overwrite_last_modified_even_if_set_to_same(self, mocker):
         last_modified = faker.date_time_between(start_date="-30y", end_date="-1y")
-        source = HarvestSourceFactory(
+        source = HarvestSourceFactory(
+            config={"dataset_remote_ids": gen_remote_IDs(1), "last_modified": last_modified}
+        )
         backend = FakeBackend(source)
 
         backend.harvest()
@@ -201,7 +210,7 @@ class BaseBackendTest:
 
     def test_autoarchive(self, app):
         nb_datasets = 3
-        source = HarvestSourceFactory(config={"
+        source = HarvestSourceFactory(config={"dataset_remote_ids": gen_remote_IDs(nb_datasets)})
         backend = FakeBackend(source)
 
         # create a dangling dataset to be archived
@@ -263,7 +272,7 @@ class BaseBackendTest:
 
     def test_harvest_datasets_get_deleted(self):
        nb_datasets = 3
-        source = HarvestSourceFactory(config={"
+        source = HarvestSourceFactory(config={"dataset_remote_ids": gen_remote_IDs(nb_datasets)})
         backend = FakeBackend(source)
 
         job = backend.harvest()
@@ -281,6 +290,69 @@ class BaseBackendTest:
         for item in job.items:
             assert item.dataset is None
 
+    def test_no_datasets_duplication(self, app):
+        duplicated_remote_id_uri = "http://example.com/duplicated_remote_id_uri"
+        nb_datasets = 3
+        remote_ids = gen_remote_IDs(nb_datasets)
+        remote_ids.append(duplicated_remote_id_uri)
+        source = HarvestSourceFactory(config={"dataset_remote_ids": remote_ids})
+        backend = FakeBackend(source)
+
+        # Create a dataset that should be reused by the harvest, which will update it
+        # instead of creating a new one, as it has the same remote_id, domain and source_id.
+        dataset_reused = DatasetFactory(
+            title="Reused Dataset",
+            harvest={
+                "domain": source.domain,
+                "remote_id": "fake-0",  # the FakeBackend harvest should reuse this dataset
+                "source_id": str(source.id),
+            },
+        )
+
+        # Create a dataset that should be reused even though it has a different `source_id`
+        # and `domain`, because the remote_id is the same and is a URI.
+        dataset_reused_uri = DatasetFactory(
+            title="Reused Dataset with URI",
+            harvest={
+                "domain": "some-other-domain",
+                # the FakeBackend harvest above should reuse this dataset with the same remote_id URI
+                "remote_id": duplicated_remote_id_uri,
+                "source_id": "some-other-source-id",
+            },
+        )
+
+        # Create a dataset that should not be reused even though it has the same `remote_id`,
+        # as it's not a URI, and it has a different domain and source id.
+        dataset_not_reused = DatasetFactory(
+            title="Duplicated Dataset",
+            harvest={
+                "domain": "some-other-domain",
+                "remote_id": "fake-0",  # the "source" harvest above should create another dataset with the same remote_id
+                "source_id": "some-other-source-id",
+            },
+        )
+
+        job = backend.harvest()
+
+        # 3 (nb_datasets) + 1 (duplicated_remote_id_uri) harvested from the source config
+        assert len(job.items) == nb_datasets + 1
+        # all datasets: 4 harvested (3 nb_datasets + 1 URI) + 3 created with DatasetFactory - 2 reused
+        assert Dataset.objects.count() == nb_datasets + 1 + 3 - 2
+        assert (
+            # and not 3, dataset_reused was not duplicated
+            Dataset.objects(harvest__remote_id="fake-0").count() == 2
+        )
+        # The dataset not reused wasn't overwritten nor updated by the harvest.
+        dataset_not_reused.reload()
+        assert dataset_not_reused.harvest.domain == "some-other-domain"
+        assert dataset_not_reused.harvest.source_id == "some-other-source-id"
+        # The "reused dataset" was overwritten and updated by the harvest.
+        dataset_reused.reload()
+        # The "reused dataset with uri" was overwritten and updated by the harvest.
+        dataset_reused_uri.reload()
+        assert dataset_reused_uri.harvest.domain == source.domain
+        assert dataset_reused_uri.harvest.source_id == str(source.id)
+
 
 @pytest.mark.usefixtures("clean_db")
 class BaseBackendValidateTest:
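The scoped fallback exercised by this test goes through a raw MongoDB filter. A hedged sketch of the document it builds; the field names come from the diff above, while the concrete values are invented for illustration:

# Illustrative values only; Dataset.objects(__raw__=scoped_filter).first()
# then returns the first match for this remote_id within the same source.
scoped_filter = {
    "harvest.remote_id": "fake-0",
    "$or": [
        {"harvest.domain": "data.example.com"},             # same source domain
        {"harvest.source_id": "5f0c0a7b9d2e4c0001abcd12"},  # or same source id
    ],
}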
{udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: udata
-Version: 10.0.7.
+Version: 10.0.7.dev33440
 Summary: Open data portal
 Home-page: https://github.com/opendatateam/udata
 Author: Opendata Team
@@ -146,6 +146,7 @@ It is collectively taken care of by members of the
 - fix `function_field` wrong name [#3236](https://github.com/opendatateam/udata/pull/3236)
 - Allow dataservice archive [#3238](https://github.com/opendatateam/udata/pull/3238)
 - Improve OGC service detection to expose in RDF [#3241](https://github.com/opendatateam/udata/pull/3241)
+- Don't duplicate datasets with a harvest.remote_id URI when harvesting [#3219](https://github.com/opendatateam/udata/pull/3219)
 
 ## 10.0.6 (2024-12-19)
 
@@ -157,6 +158,7 @@ It is collectively taken care of by members of the
 - Fix failing dataset save in update reuses metrics [#3230](https://github.com/opendatateam/udata/pull/3230)
 - Fix catalog RDF by preventing memory increase on getting dataservice hvd tags [#3231](https://github.com/opendatateam/udata/pull/3231)
 - Update purge tasks [#3167](https://github.com/opendatateam/udata/pull/3167)
+- Trigger GitLab infra deployment through simple-scaffolding script [#3232](https://github.com/opendatateam/udata/pull/3232)
 
 ## 10.0.5 (2024-12-09)
 
{udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/RECORD
CHANGED
@@ -291,13 +291,13 @@ udata/harvest/notifications.py,sha256=8WkHtD68v6rfZC6jCmAuuuRp4NN6q71ZkksZU5m9oJ
 udata/harvest/signals.py,sha256=3AhFHMPIFH5vz01NX5ycR_RWH14MXFWnCT6__LSa-QI,1338
 udata/harvest/tasks.py,sha256=ddJtvE0s-kAYt27-rKH6n8U8vKD8qKczlJtdBJzMUns,1718
 udata/harvest/backends/__init__.py,sha256=QjoFfBJfpw_xgk5YYWI1SgKJOMEmTMlxSfW79GNkSTI,459
-udata/harvest/backends/base.py,sha256=
+udata/harvest/backends/base.py,sha256=K-pfjYc7GiAQFprO-2Wknq3cmlEPImCIHmWdFK2yGVM,16638
 udata/harvest/backends/dcat.py,sha256=wbznPWCS9UJzpCWPAnmeUhUCREJg-yoQAHtEfr88RqU,15963
 udata/harvest/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 udata/harvest/tests/factories.py,sha256=Jb_RwsYogWhquG8aj2bzT3JLySq3530OmZV9cofeWlE,2269
 udata/harvest/tests/test_actions.py,sha256=s5yK4YQXVvrbLgKMx8_-ifFBC5QmUYBJWvkfNAZuy_Y,26504
 udata/harvest/tests/test_api.py,sha256=pCIoBCWZYr1IKgGARMe5fZArTsTnFQXUy5ur11vs3ho,20363
-udata/harvest/tests/test_base_backend.py,sha256=
+udata/harvest/tests/test_base_backend.py,sha256=DrBwIr6uMweQ6S9oCpBS9tS54VlIcHMdPswGXAIDz9c,16446
 udata/harvest/tests/test_dcat_backend.py,sha256=IauCoeSylEH-Wyk3oXdC7sIk5bEEdnqBBxOiMJyIrSc,39877
 udata/harvest/tests/test_filters.py,sha256=2nXGnLTlM0iSOZBDBxmJp1KO3_59RXLS0vmXvEs2-KE,2468
 udata/harvest/tests/test_models.py,sha256=f9NRR2_S4oZFgF8qOumg0vv-lpnEBJbI5vNtcwFdSqM,831
@@ -710,9 +710,9 @@ udata/translations/pt/LC_MESSAGES/udata.mo,sha256=18Y5YtzVKInDejw-R-45HNzsB3OVwJ
 udata/translations/pt/LC_MESSAGES/udata.po,sha256=6IQvFk0NTDV5Jq-kLkkzpioWfrMaLDa1oQSevKFbxKQ,44943
 udata/translations/sr/LC_MESSAGES/udata.mo,sha256=O4zKHNkiX-2GUfLLa0kwbxIA5M1jxiqkHzaMh1t2wKs,29169
 udata/translations/sr/LC_MESSAGES/udata.po,sha256=W9C447pW0O-Q28ji8wGLgPNrnqlPYXaMD0AOWJPcpZc,51918
-udata-10.0.7.
-udata-10.0.7.
-udata-10.0.7.
-udata-10.0.7.
-udata-10.0.7.
-udata-10.0.7.
+udata-10.0.7.dev33440.dist-info/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
+udata-10.0.7.dev33440.dist-info/METADATA,sha256=2fFeFMw--Coc91igNCZY4oslE5pF3ND5JTlLHLIMkHg,138907
+udata-10.0.7.dev33440.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
+udata-10.0.7.dev33440.dist-info/entry_points.txt,sha256=3SKiqVy4HUqxf6iWspgMqH8d88Htk6KoLbG1BU-UddQ,451
+udata-10.0.7.dev33440.dist-info/top_level.txt,sha256=39OCg-VWFWOq4gCKnjKNu-s3OwFlZIu_dVH8Gl6ndHw,12
+udata-10.0.7.dev33440.dist-info/RECORD,,
{udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/LICENSE
File without changes
{udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/WHEEL
File without changes
{udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/entry_points.txt
File without changes
{udata-10.0.7.dev33415.dist-info → udata-10.0.7.dev33440.dist-info}/top_level.txt
File without changes