udata 10.8.2.dev36743__py2.py3-none-any.whl → 10.8.2.dev36809__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of udata might be problematic. Click here for more details.

Files changed (35) hide show
  1. udata/harvest/backends/ckan/__init__.py +3 -0
  2. udata/harvest/backends/ckan/harvesters.py +274 -0
  3. udata/harvest/backends/ckan/models.py +10 -0
  4. udata/harvest/backends/ckan/schemas/__init__.py +0 -0
  5. udata/harvest/backends/ckan/schemas/ckan.py +86 -0
  6. udata/harvest/backends/ckan/schemas/dkan.py +98 -0
  7. udata/harvest/tests/ckan/conftest.py +67 -0
  8. udata/harvest/tests/ckan/data/dkan-french-w-license.json +226 -0
  9. udata/harvest/tests/ckan/test_ckan_backend.py +697 -0
  10. udata/harvest/tests/ckan/test_ckan_backend_errors.py +140 -0
  11. udata/harvest/tests/ckan/test_ckan_backend_filters.py +130 -0
  12. udata/harvest/tests/ckan/test_dkan_backend.py +68 -0
  13. udata/static/chunks/{10.8ca60413647062717b1e.js → 10.471164b2a9fe15614797.js} +3 -3
  14. udata/static/chunks/{10.8ca60413647062717b1e.js.map → 10.471164b2a9fe15614797.js.map} +1 -1
  15. udata/static/chunks/{11.b6f741fcc366abfad9c4.js → 11.51d706fb9521c16976bc.js} +3 -3
  16. udata/static/chunks/{11.b6f741fcc366abfad9c4.js.map → 11.51d706fb9521c16976bc.js.map} +1 -1
  17. udata/static/chunks/{13.2d06442dd9a05d9777b5.js → 13.f29411b06be1883356a3.js} +2 -2
  18. udata/static/chunks/{13.2d06442dd9a05d9777b5.js.map → 13.f29411b06be1883356a3.js.map} +1 -1
  19. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js → 17.3bd0340930d4a314ce9c.js} +2 -2
  20. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js.map → 17.3bd0340930d4a314ce9c.js.map} +1 -1
  21. udata/static/chunks/{19.f03a102365af4315f9db.js → 19.8da42e8359d72afc2618.js} +3 -3
  22. udata/static/chunks/{19.f03a102365af4315f9db.js.map → 19.8da42e8359d72afc2618.js.map} +1 -1
  23. udata/static/chunks/{8.778091d55cd8ea39af6b.js → 8.54e44b102164ae5e7a67.js} +2 -2
  24. udata/static/chunks/{8.778091d55cd8ea39af6b.js.map → 8.54e44b102164ae5e7a67.js.map} +1 -1
  25. udata/static/chunks/{9.033d7e190ca9e226a5d0.js → 9.07515e5187f475bce828.js} +3 -3
  26. udata/static/chunks/{9.033d7e190ca9e226a5d0.js.map → 9.07515e5187f475bce828.js.map} +1 -1
  27. udata/static/common.js +1 -1
  28. udata/static/common.js.map +1 -1
  29. udata/translations/udata.pot +74 -70
  30. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/METADATA +3 -1
  31. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/RECORD +35 -23
  32. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/entry_points.txt +2 -0
  33. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/LICENSE +0 -0
  34. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/WHEEL +0 -0
  35. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,697 @@
1
+ import json
2
+ import random
3
+ from datetime import date
4
+
5
+ import pytest
6
+
7
+ from udata.app import create_app
8
+ from udata.core.organization.factories import OrganizationFactory
9
+ from udata.core.spatial.factories import GeoZoneFactory
10
+ from udata.harvest import actions
11
+ from udata.harvest.backends.ckan.harvesters import ALLOWED_RESOURCE_TYPES
12
+ from udata.harvest.backends.ckan.schemas.ckan import RESOURCE_TYPES
13
+ from udata.harvest.tests.factories import HarvestSourceFactory
14
+ from udata.models import Dataset
15
+ from udata.settings import Defaults, Testing
16
+ from udata.tests.plugin import drop_db
17
+ from udata.utils import faker
18
+
19
+
20
class CkanSettings(Testing):
    """Testing settings that list the ``ckan`` plugin in ``PLUGINS``."""

    # Passed as the ``override`` settings to ``create_app`` by the ``app`` fixture.
    PLUGINS = ["ckan"]
22
+
23
+
24
@pytest.fixture
def app(request):
    """Yield a udata application built from ``Defaults`` overridden by ``CkanSettings``.

    The database is dropped once before the application is handed to the
    test and once more after the test has finished.
    """
    application = create_app(Defaults, override=CkanSettings)

    def _reset_database():
        with application.app_context():
            drop_db(application)

    _reset_database()
    yield application
    _reset_database()
33
+
34
+
35
@pytest.fixture
def source(app, ckan):
    """Return a ``ckan`` harvest source owned by a newly created organization.

    The source URL is ``ckan.BASE_URL``; both objects are created inside
    the application context.
    """
    with app.app_context():
        owner = OrganizationFactory()
        harvest_source = HarvestSourceFactory(
            backend="ckan", url=ckan.BASE_URL, organization=owner
        )
    return harvest_source
43
+
44
+
45
def ckan_package(data):
    """Wrap ``data`` into a successful CKAN API response envelope.

    Generated default package attributes are overridden by whatever ``data``
    provides. Each resource listed under ``"resources"`` is modified in
    place: its ``id`` is always regenerated, while ``resource_type``,
    ``created`` and ``last_modified`` are only filled in when missing.

    Returns:
        ``{"success": True, "result": <merged package dict>}``.
    """
    package = {
        "id": faker.uuid4(),
        "metadata_modified": faker.date(),
        "tags": [],
        "license_id": None,
        "metadata_created": faker.date(),
        "type": "dataset",
        "author": None,
        "maintainer_email": None,
        "state": None,
        "author_email": None,
        "license_title": None,
        "organization": None,
        "private": False,
        "maintainer": None,
    }
    package.update(data)

    for resource in package.get("resources", []):
        # The id is overwritten unconditionally, even if the caller supplied one.
        resource["id"] = faker.uuid4()
        resource.setdefault("resource_type", "file")
        resource.setdefault("created", faker.date())
        resource.setdefault("last_modified", faker.date())

    return {"success": True, "result": package}
71
+
72
+
73
@pytest.fixture
def harvest_ckan(request, source, ckan, app, rmock):
    """Run a harvest for the data fixture named by the test's ``ckan_data`` marker.

    The marker's first argument is the name of a data fixture returning
    either a payload dict or a ``(payload, kwargs)`` tuple. The payload is
    wrapped with ``ckan_package`` and served by the mocked ``package_show``
    and ``package_list`` endpoints before the harvest is run.

    Returns:
        A ``(data, result, kwargs)`` tuple: the raw payload, the mocked
        ``package_show`` response and the extra values for the test.
    """
    marker = request.node.get_closest_marker("ckan_data")
    if marker is None or not marker.args:
        # Fail with an actionable message instead of an AttributeError/IndexError.
        pytest.fail(
            "The `harvest_ckan` fixture requires a `ckan_data` marker "
            "naming the data fixture to harvest"
        )
    values = request.getfixturevalue(marker.args[0])
    data, kwargs = values if isinstance(values, tuple) else (values, {})
    result = ckan_package(data)

    rmock.get(
        ckan.PACKAGE_SHOW_URL,
        json=result,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    rmock.get(
        ckan.PACKAGE_LIST_URL,
        json={"success": True, "result": [result["result"]["id"]]},
        status_code=200,
        headers={"Content-Type": "application/json"},
    )

    with app.app_context():
        actions.run(source.slug)
        source.reload()
        job = source.get_last_job()
        # Exactly one package is listed, so exactly one item must be processed.
        assert len(job.items) == 1

    return data, result, kwargs
105
+
106
+
107
+ ##############################################################################
108
+ # Method fixtures and helpers #
109
+ ##############################################################################
110
+
111
+
112
@pytest.fixture
def data(harvest_ckan):
    """Return the first element of the ``harvest_ckan`` tuple (the raw payload)."""
    payload, _result, _kwargs = harvest_ckan
    return payload
115
+
116
+
117
@pytest.fixture
def result(harvest_ckan):
    """Return the second element of the ``harvest_ckan`` tuple (the mocked response)."""
    _payload, response, _kwargs = harvest_ckan
    return response
120
+
121
+
122
@pytest.fixture
def kwargs(harvest_ckan):
    """Return the third element of the ``harvest_ckan`` tuple (extra test values)."""
    _payload, _result, extra = harvest_ckan
    return extra
125
+
126
+
127
def job_item_for(job, result):
    """Return the first item of ``job`` whose ``remote_id`` is the result's id.

    Raises:
        IndexError: if no item of the job matches.
    """
    wanted_id = result["result"]["id"]
    matching = [item for item in job.items if item.remote_id == wanted_id]
    return matching[0]
131
+
132
+
133
def dataset_for(result):
    """Return the first dataset whose harvest remote id is the result's id."""
    remote_id = result["result"]["id"]
    matches = Dataset.objects(harvest__remote_id=remote_id)
    return matches.first()
136
+
137
+
138
+ ##############################################################################
139
+ # Data Fixtures #
140
+ # These are functions () -> (dict, Any) #
141
+ # The 1st dict is the ckan package_create API payload #
142
+ # The 2nd argument can be whatever needs to be given to the test function #
143
+ ##############################################################################
144
+
145
+
146
@pytest.fixture
def resource_data():
    """Return a randomly generated CKAN resource payload.

    ``resource_type`` is picked at random among the types present in both
    ``RESOURCE_TYPES`` and ``ALLOWED_RESOURCE_TYPES``.
    """
    resource = {
        "id": faker.uuid4(),
        "position": faker.random_digit(),
        "name": faker.word(),
        "description": faker.sentence(),
        "format": faker.file_extension(),
        "mimetype": faker.mime_type(),
        "size": faker.random_digit(),
        "hash": faker.md5(),
        "created": faker.date(),
        "last_modified": faker.date(),
        "url": faker.unique_url(),
    }
    usable_types = set(RESOURCE_TYPES) & set(ALLOWED_RESOURCE_TYPES)
    resource["resource_type"] = random.choice(list(usable_types))
    return resource
162
+
163
+
164
@pytest.fixture
def minimal(resource_data):
    """Return a package with name, title, notes and one resource.

    The second tuple element exposes the resource URL to the test.
    """
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    extra = {"resource_url": resource_data["url"]}
    return payload, extra
173
+
174
+
175
@pytest.fixture
def all_metadata(resource_data):
    """Return a package with tags and a resource carrying fixed dates.

    ``resource_data`` is updated in place; its ``created`` and
    ``last_modified`` values are pinned so the test can compare exact dates.
    """
    overrides = {
        "name": faker.sentence(),
        "description": faker.paragraph(),
        "url": faker.unique_url(),
        "mimetype": faker.mime_type(),
        "format": faker.file_extension(),
        "last_modified": "2022-09-30",
        "created": "2022-09-29",
    }
    resource_data.update(overrides)
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "tags": [
            {"name": faker.unique_string(), "id": faker.unique_string()}
            for _ in range(3)
        ],
        "resources": [resource_data],
    }
    return payload, {"resource_data": resource_data}
196
+
197
+
198
@pytest.fixture
def spatial_geom_polygon(resource_data):
    """Return a package whose ``spatial`` extra is a JSON-encoded polygon."""
    polygon = faker.polygon()
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "spatial", "value": json.dumps(polygon)}]
    return payload, {"polygon": polygon}
209
+
210
+
211
@pytest.fixture
def spatial_geom_multipolygon(resource_data):
    """Return a package whose ``spatial`` extra is a JSON-encoded multipolygon."""
    multipolygon = faker.multipolygon()
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "spatial", "value": json.dumps(multipolygon)}]
    return payload, {"multipolygon": multipolygon}
222
+
223
+
224
@pytest.fixture
def known_spatial_text_name(app, resource_data):
    """Return a package whose ``spatial-text`` extra is the name of an existing zone."""
    with app.app_context():
        zone = GeoZoneFactory()
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "spatial-text", "value": zone.name}]
    return payload, {"zone": zone}
236
+
237
+
238
@pytest.fixture
def known_spatial_text_slug(app, resource_data):
    """Return a package whose ``spatial-text`` extra is the slug of an existing zone."""
    with app.app_context():
        zone = GeoZoneFactory()
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "spatial-text", "value": zone.slug}]
    return payload, {"zone": zone}
250
+
251
+
252
@pytest.fixture
def multiple_known_spatial_text(app, resource_data):
    """Return a package whose ``spatial-text`` extra is a name shared by two zones."""
    shared_name = faker.word()
    with app.app_context():
        GeoZoneFactory.create_batch(2, name=shared_name)
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "spatial-text", "value": shared_name}]
    return payload, {"name": shared_name}
265
+
266
+
267
@pytest.fixture
def unknown_spatial_text(resource_data):
    """Return a package whose ``spatial-text`` extra is the literal ``"Somewhere"``."""
    place = "Somewhere"
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "spatial-text", "value": place}]
    return payload, {"spatial": place}
278
+
279
+
280
@pytest.fixture
def spatial_uri(resource_data):
    """Return a package whose ``spatial-uri`` extra is a geonames URL."""
    uri = "http://www.geonames.org/2111964"
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "spatial-uri", "value": uri}]
    return payload, {"spatial": uri}
291
+
292
+
293
@pytest.fixture
def skipped_no_resources():
    """Return a package with three tags and an empty ``resources`` list."""
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "tags": [
            {"name": faker.unique_string(), "id": faker.unique_string()}
            for _ in range(3)
        ],
        "resources": [],
    }
    return payload
302
+
303
+
304
@pytest.fixture
def ckan_url_is_url(resource_data):
    """Return a package whose ``url`` attribute is a generated URL."""
    package_url = faker.unique_url()
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["url"] = package_url
    return payload, {"url": package_url}
315
+
316
+
317
@pytest.fixture
def ckan_url_is_a_string(resource_data):
    """Return a package whose ``url`` attribute is a generated sentence, not a URL."""
    free_text = faker.sentence()
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["url"] = free_text
    return payload, {"url": free_text}
328
+
329
+
330
@pytest.fixture
def frequency_as_rdf_uri(resource_data):
    """Return a package whose ``frequency`` extra is a ``purl.org`` frequency URI.

    The test is told to expect ``"daily"``.
    """
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "frequency", "value": "http://purl.org/cld/freq/daily"}]
    return payload, {"expected": "daily"}
340
+
341
+
342
@pytest.fixture
def frequency_as_exact_match(resource_data):
    """Return a package whose ``frequency`` extra is the plain value ``"daily"``."""
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "frequency", "value": "daily"}]
    return payload, {"expected": "daily"}
352
+
353
+
354
@pytest.fixture
def frequency_as_unknown_value(resource_data):
    """Return a package whose ``frequency`` extra is an arbitrary string.

    The same string is handed to the test as the expected value.
    """
    raw_frequency = "unkowwn-value"
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
    }
    payload["extras"] = [{"key": "frequency", "value": raw_frequency}]
    return payload, {"expected": raw_frequency}
365
+
366
+
367
@pytest.fixture
def empty_extras(resource_data):
    """Return a package whose extras hold ``None``, an empty string and a single space."""
    blank_extras = [
        {"key": "none", "value": None},
        {"key": "blank", "value": ""},
        {"key": "spaces", "value": " "},
    ]
    payload = {
        "name": faker.unique_string(),
        "title": faker.sentence(),
        "notes": faker.paragraph(),
        "resources": [resource_data],
        "extras": blank_extras,
    }
    return payload
380
+
381
+
382
+ ##############################################################################
383
+ # Tests #
384
+ # #
385
+ # They are using the `ckan_data` marker to specify the data fixture #
386
+ # they rely on. This allows `data`, `result` and `kwargs` fixtures to be #
387
+ # populated with the associated harvest data #
388
+ ##############################################################################
389
+
390
+
391
@pytest.mark.ckan_data("minimal")
def test_minimal_metadata(data, result, kwargs):
    """A minimal package yields a dataset with its title, notes and single resource."""
    harvested = dataset_for(result)
    harvest_info = harvested.harvest

    assert harvested.title == data["title"]
    assert harvested.description == data["notes"]
    assert harvest_info.remote_id == result["result"]["id"]
    assert harvest_info.domain == "localhost"
    assert harvest_info.ckan_name == data["name"]
    assert len(harvested.resources) == 1

    only_resource = harvested.resources[0]
    assert only_resource.url == kwargs["resource_url"]
405
+
406
+
407
@pytest.mark.ckan_data("all_metadata")
def test_all_metadata(data, result):
    """Dataset, tag and resource attributes are all mapped from the package."""
    sent_resource = data["resources"][0]
    served_resource = result["result"]["resources"][0]
    expected_tags = {tag["name"] for tag in data["tags"]}

    harvested = dataset_for(result)
    harvest_info = harvested.harvest
    assert harvested.title == data["title"]
    assert harvested.description == data["notes"]
    assert set(harvested.tags) == expected_tags
    assert harvest_info.remote_id == result["result"]["id"]
    assert harvest_info.domain == "localhost"
    assert harvest_info.ckan_name == data["name"]
    assert len(harvested.resources) == 1

    resource = harvested.resources[0]
    assert resource.title == sent_resource["name"]
    assert resource.description == sent_resource["description"]
    assert resource.url == sent_resource["url"]
    # Use result because format is normalized by CKAN
    assert resource.format == served_resource["format"].lower()
    assert resource.mime == sent_resource["mimetype"].lower()
    # These dates are the ones pinned by the ``all_metadata`` fixture.
    assert resource.harvest.created_at.date() == date(2022, 9, 29)
    assert resource.harvest.modified_at.date() == date(2022, 9, 30)
430
+
431
+
432
@pytest.mark.ckan_data("spatial_geom_polygon")
def test_geospatial_geom_polygon(result, kwargs):
    """A polygon geometry is stored as a one-polygon ``MultiPolygon``."""
    source_polygon = kwargs["polygon"]
    expected_geom = {
        "type": "MultiPolygon",
        "coordinates": [source_polygon["coordinates"]],
    }

    harvested = dataset_for(result)
    assert harvested.spatial.geom == expected_geom
441
+
442
+
443
@pytest.mark.ckan_data("spatial_geom_multipolygon")
def test_geospatial_geom_multipolygon(result, kwargs):
    """A multipolygon geometry is stored unchanged."""
    expected_geom = kwargs["multipolygon"]
    harvested = dataset_for(result)
    assert harvested.spatial.geom == expected_geom
449
+
450
+
451
@pytest.mark.ckan_data("skipped_no_resources")
def test_skip_no_resources(source, result):
    """A package without resources is marked skipped and creates no dataset."""
    last_job = source.get_last_job()
    job_item = job_item_for(last_job, result)
    assert job_item.status == "skipped"
    assert dataset_for(result) is None
457
+
458
+
459
@pytest.mark.ckan_data("ckan_url_is_url")
def test_ckan_url_is_url(data, result):
    """A package ``url`` that is a URL becomes the remote URL; no ``ckan_source`` is set."""
    harvest_info = dataset_for(result).harvest
    assert harvest_info.remote_url == data["url"]
    assert not hasattr(harvest_info, "ckan_source")
464
+
465
+
466
@pytest.mark.ckan_data("ckan_url_is_a_string")
def test_ckan_url_is_string(ckan, data, result):
    """A non-URL package ``url`` is kept as ``ckan_source``.

    The remote URL then points at the package page under ``ckan.BASE_URL``.
    """
    expected_url = "{0}/dataset/{1}".format(ckan.BASE_URL, data["name"])
    harvest_info = dataset_for(result).harvest
    assert harvest_info.remote_url == expected_url
    assert harvest_info.ckan_source == data["url"]
472
+
473
+
474
@pytest.mark.ckan_data("frequency_as_rdf_uri")
def test_can_parse_frequency_as_uri(result, kwargs):
    """A frequency given as a URI is set on the dataset and not kept as an extra."""
    harvested = dataset_for(result)
    expected_frequency = kwargs["expected"]
    assert harvested.frequency == expected_frequency
    assert "ckan:frequency" not in harvested.extras
479
+
480
+
481
@pytest.mark.ckan_data("frequency_as_exact_match")
def test_can_parse_frequency_as_exact_match(result, kwargs):
    """A frequency given as a plain value is set on the dataset and not kept as an extra."""
    harvested = dataset_for(result)
    expected_frequency = kwargs["expected"]
    assert harvested.frequency == expected_frequency
    assert "ckan:frequency" not in harvested.extras
486
+
487
+
488
@pytest.mark.ckan_data("frequency_as_unknown_value")
def test_can_parse_frequency_as_unkown_value(result, kwargs):
    """An unrecognized frequency is kept as an extra and the dataset frequency stays unset."""
    harvested = dataset_for(result)
    raw_frequency = kwargs["expected"]
    assert harvested.extras["ckan:frequency"] == raw_frequency
    assert harvested.frequency is None
493
+
494
+
495
@pytest.mark.ckan_data("empty_extras")
def test_skip_empty_extras(result):
    """Extras whose value is ``None``, empty or a single space are not stored."""
    stored_extras = dataset_for(result).extras
    assert "none" not in stored_extras
    assert "blank" not in stored_extras
    assert "spaces" not in stored_extras
501
+
502
+
503
@pytest.mark.ckan_data("known_spatial_text_name")
def test_known_spatial_text_name(result, kwargs):
    """A ``spatial-text`` matching a zone name attaches that zone and drops the extra."""
    expected_zone = kwargs["zone"]
    harvested = dataset_for(result)
    assert expected_zone in harvested.spatial.zones
    assert "ckan:spatial-text" not in harvested.extras
509
+
510
+
511
@pytest.mark.ckan_data("known_spatial_text_slug")
def test_known_spatial_text_slug(result, kwargs):
    """A ``spatial-text`` matching a zone slug attaches that zone and drops the extra."""
    expected_zone = kwargs["zone"]
    harvested = dataset_for(result)
    assert expected_zone in harvested.spatial.zones
    assert "ckan:spatial-text" not in harvested.extras
517
+
518
+
519
@pytest.mark.ckan_data("multiple_known_spatial_text")
def test_store_unsure_spatial_text_as_extra(result, kwargs):
    """A ``spatial-text`` shared by several zones is kept as an extra; no spatial is set."""
    harvested = dataset_for(result)
    ambiguous_name = kwargs["name"]
    assert harvested.extras["ckan:spatial-text"] == ambiguous_name
    assert harvested.spatial is None
524
+
525
+
526
@pytest.mark.ckan_data("unknown_spatial_text")
def test_keep_unknown_spatial_text_as_extra(result, kwargs):
    """A ``spatial-text`` matching no zone is kept as an extra; no spatial is set."""
    harvested = dataset_for(result)
    unknown_place = kwargs["spatial"]
    assert harvested.extras["ckan:spatial-text"] == unknown_place
    assert harvested.spatial is None
531
+
532
+
533
@pytest.mark.ckan_data("spatial_uri")
def test_keep_unknown_spatial_uri_as_extra(result, kwargs):
    """A ``spatial-uri`` value is kept as an extra; no spatial is set."""
    harvested = dataset_for(result)
    spatial_reference = kwargs["spatial"]
    assert harvested.extras["ckan:spatial-uri"] == spatial_reference
    assert harvested.spatial is None
538
+
539
+
540
+ ##############################################################################
541
+ # Edge cases manually written #
542
+ ##############################################################################
543
def test_minimal_ckan_response(app, rmock):
    """CKAN Harvester should accept the minimum dataset payload"""
    CKAN_URL = "https://harvest.me/"
    API_URL = f"{CKAN_URL}api/3/action/"
    PACKAGE_LIST_URL = f"{API_URL}package_list"
    PACKAGE_SHOW_URL = f"{API_URL}package_show"

    name = faker.unique_string()
    # Named `payload` rather than `json` so the imported `json` module is not shadowed.
    payload = {
        "success": True,
        "result": minimal_data(name=name),
    }
    source = HarvestSourceFactory(backend="ckan", url=CKAN_URL)
    rmock.get(
        PACKAGE_LIST_URL,
        json={"success": True, "result": [name]},
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    rmock.get(
        PACKAGE_SHOW_URL,
        json=payload,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    actions.run(source.slug)
    source.reload()
    assert source.get_last_job().status == "done"
571
+
572
+
573
def test_flawed_ckan_response(app, rmock):
    """CKAN Harvester should report item error with id == remote_id in item"""
    CKAN_URL = "https://harvest.me/"
    API_URL = f"{CKAN_URL}api/3/action/"
    PACKAGE_LIST_URL = f"{API_URL}package_list"
    PACKAGE_SHOW_URL = f"{API_URL}package_show"

    name = faker.unique_string()
    _id = faker.uuid4()
    # flawed response, missing way too much required attrs
    # (named `payload` rather than `json` so the `json` module is not shadowed)
    payload = {
        "success": True,
        "result": {
            "id": _id,
            "name": name,
        },
    }
    source = HarvestSourceFactory(backend="ckan", url=CKAN_URL)
    rmock.get(
        PACKAGE_LIST_URL,
        json={"success": True, "result": [name]},
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    rmock.get(
        PACKAGE_SHOW_URL,
        json=payload,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    actions.run(source.slug)
    source.reload()
    assert source.get_last_job().status == "done-errors"
    assert source.get_last_job().items[0].remote_id == _id

    # flawed response, without an id, we should fallback on the name
    payload = {
        "success": True,
        "result": {
            "name": name,
        },
    }
    # Registered again for the same URL so the second run is served the id-less payload.
    rmock.get(
        PACKAGE_SHOW_URL,
        json=payload,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    actions.run(source.slug)
    source.reload()
    assert source.get_last_job().status == "done-errors"
    assert source.get_last_job().items[0].remote_id == name
624
+
625
+
626
@pytest.mark.options(HARVEST_MAX_ITEMS=1)
def test_max_items(app, rmock):
    """CKAN Harvester should process only the first listed package when HARVEST_MAX_ITEMS=1"""
    CKAN_URL = "https://harvest.me/"
    API_URL = f"{CKAN_URL}api/3/action/"
    PACKAGE_LIST_URL = f"{API_URL}package_list"
    PACKAGE_SHOW_URL = f"{API_URL}package_show"

    name_a = faker.unique_string()
    name_b = faker.unique_string()
    id_a = faker.uuid4()
    json_a = {
        "success": True,
        "result": minimal_data(id=id_a, name=name_a),
    }
    id_b = faker.uuid4()
    json_b = {
        "success": True,
        "result": minimal_data(id=id_b, name=name_b),
    }
    source = HarvestSourceFactory(backend="ckan", url=CKAN_URL)
    # Two packages are listed although the limit is one.
    rmock.get(
        PACKAGE_LIST_URL,
        json={"success": True, "result": [name_a, name_b]},
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    rmock.get(
        f"{PACKAGE_SHOW_URL}?id={name_a}",
        json=json_a,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    rmock.get(
        f"{PACKAGE_SHOW_URL}?id={name_b}",
        json=json_b,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    actions.run(source.slug)
    source.reload()
    assert source.get_last_job().status == "done"
    assert len(source.get_last_job().items) == 1
    assert source.get_last_job().items[0].remote_id == id_a
670
+
671
+
672
def minimal_data(**kwargs):
    """Return a minimal CKAN package payload, with ``kwargs`` overriding the defaults."""
    # extras and revision_id are not always present so we exclude them
    # from the minimal payload
    package = {
        "id": faker.uuid4(),
        "name": faker.uuid4(),
        "title": faker.sentence(),
        "maintainer": faker.name(),
        "tags": [],
        "private": False,
        "maintainer_email": faker.email(),
        "license_id": None,
        "metadata_created": faker.iso8601(),
        "organization": None,
        "metadata_modified": faker.iso8601(),
        "author": None,
        "author_email": None,
        "notes": faker.paragraph(),
        "license_title": None,
        "state": None,
        "type": "dataset",
        "resources": [],
    }
    package.update(kwargs)
    return package