udata 10.8.1.dev36703__py2.py3-none-any.whl → 10.8.2__py2.py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of udata might be problematic.
- udata/__init__.py +1 -1
- udata/app.py +0 -2
- udata/commands/db.py +22 -9
- udata/core/dataset/models.py +5 -3
- udata/core/discussions/api.py +2 -2
- udata/core/jobs/api.py +3 -3
- udata/core/metrics/helpers.py +10 -0
- udata/core/metrics/tasks.py +144 -1
- udata/core/organization/api.py +2 -2
- udata/core/post/api.py +1 -1
- udata/core/user/api.py +1 -1
- udata/features/identicon/api.py +1 -1
- udata/harvest/actions.py +24 -28
- udata/harvest/api.py +28 -36
- udata/harvest/backends/ckan/__init__.py +3 -0
- udata/harvest/backends/ckan/harvesters.py +274 -0
- udata/harvest/backends/ckan/schemas/__init__.py +0 -0
- udata/harvest/backends/ckan/schemas/ckan.py +86 -0
- udata/harvest/backends/ckan/schemas/dkan.py +98 -0
- udata/harvest/commands.py +7 -7
- udata/harvest/tasks.py +1 -1
- udata/harvest/tests/ckan/conftest.py +67 -0
- udata/harvest/tests/ckan/data/dkan-french-w-license.json +226 -0
- udata/harvest/tests/ckan/test_ckan_backend.py +697 -0
- udata/harvest/tests/ckan/test_ckan_backend_errors.py +140 -0
- udata/harvest/tests/ckan/test_ckan_backend_filters.py +130 -0
- udata/harvest/tests/ckan/test_dkan_backend.py +68 -0
- udata/harvest/tests/test_actions.py +27 -32
- udata/harvest/tests/test_api.py +23 -18
- udata/harvest/tests/test_dcat_backend.py +29 -29
- udata/migrations/2025-07-30-purge-old-harvest-dynamic-fields.py +29 -0
- udata/mongo/slug_fields.py +1 -1
- udata/routing.py +6 -0
- udata/static/chunks/{11.b6f741fcc366abfad9c4.js → 11.51d706fb9521c16976bc.js} +3 -3
- udata/static/chunks/{11.b6f741fcc366abfad9c4.js.map → 11.51d706fb9521c16976bc.js.map} +1 -1
- udata/static/chunks/{13.2d06442dd9a05d9777b5.js → 13.39e106d56f794ebd06a0.js} +2 -2
- udata/static/chunks/{13.2d06442dd9a05d9777b5.js.map → 13.39e106d56f794ebd06a0.js.map} +1 -1
- udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js → 17.70cbb4a91b002338007e.js} +2 -2
- udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js.map → 17.70cbb4a91b002338007e.js.map} +1 -1
- udata/static/chunks/{19.f03a102365af4315f9db.js → 19.a348a5fff8fe2801e52a.js} +3 -3
- udata/static/chunks/{19.f03a102365af4315f9db.js.map → 19.a348a5fff8fe2801e52a.js.map} +1 -1
- udata/static/chunks/{5.0fa1408dae4e76b87b2e.js → 5.343ca020a2d38cec1a14.js} +3 -3
- udata/static/chunks/{5.0fa1408dae4e76b87b2e.js.map → 5.343ca020a2d38cec1a14.js.map} +1 -1
- udata/static/chunks/{6.d663709d877baa44a71e.js → 6.a3b07de9dd2ca2d24e85.js} +3 -3
- udata/static/chunks/{6.d663709d877baa44a71e.js.map → 6.a3b07de9dd2ca2d24e85.js.map} +1 -1
- udata/static/chunks/{8.778091d55cd8ea39af6b.js → 8.462bb3029de008497675.js} +2 -2
- udata/static/chunks/{8.778091d55cd8ea39af6b.js.map → 8.462bb3029de008497675.js.map} +1 -1
- udata/static/common.js +1 -1
- udata/static/common.js.map +1 -1
- udata/tests/api/test_datasets_api.py +0 -46
- udata/tests/api/test_organizations_api.py +5 -0
- udata/tests/cli/test_db_cli.py +12 -0
- udata/tests/dataset/test_dataset_model.py +0 -16
- udata/tests/metrics/__init__.py +0 -0
- udata/tests/metrics/conftest.py +15 -0
- udata/tests/metrics/helpers.py +58 -0
- udata/tests/metrics/test_metrics.py +67 -0
- udata/tests/metrics/test_tasks.py +171 -0
- udata/translations/ar/LC_MESSAGES/udata.mo +0 -0
- udata/translations/ar/LC_MESSAGES/udata.po +72 -65
- udata/translations/de/LC_MESSAGES/udata.mo +0 -0
- udata/translations/de/LC_MESSAGES/udata.po +72 -65
- udata/translations/es/LC_MESSAGES/udata.mo +0 -0
- udata/translations/es/LC_MESSAGES/udata.po +72 -65
- udata/translations/fr/LC_MESSAGES/udata.mo +0 -0
- udata/translations/fr/LC_MESSAGES/udata.po +72 -65
- udata/translations/it/LC_MESSAGES/udata.mo +0 -0
- udata/translations/it/LC_MESSAGES/udata.po +72 -65
- udata/translations/pt/LC_MESSAGES/udata.mo +0 -0
- udata/translations/pt/LC_MESSAGES/udata.po +72 -65
- udata/translations/sr/LC_MESSAGES/udata.mo +0 -0
- udata/translations/sr/LC_MESSAGES/udata.po +72 -65
- udata/translations/udata.pot +74 -70
- {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/METADATA +15 -2
- {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/RECORD +79 -62
- {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/entry_points.txt +2 -0
- {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/LICENSE +0 -0
- {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/WHEEL +0 -0
- {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/top_level.txt +0 -0
udata/harvest/api.py
CHANGED
```diff
@@ -300,59 +300,54 @@ class SourcesAPI(API):
         return source, 201


-@ns.route("/source/<
-@api.param("ident", "A source ID or slug")
+@ns.route("/source/<harvest_source:source>/", endpoint="harvest_source")
 class SourceAPI(API):
     @api.doc("get_harvest_source")
     @api.marshal_with(source_fields)
-    def get(self,
+    def get(self, source: HarvestSource):
         """Get a single source given an ID or a slug"""
-        return
+        return source

     @api.secure
     @api.doc("update_harvest_source")
     @api.expect(source_fields)
     @api.marshal_with(source_fields)
-    def put(self,
+    def put(self, source: HarvestSource):
         """Update a harvest source"""
-        source = actions.get_source(ident)
         OwnablePermission(source).test()
         form = api.validate(HarvestSourceForm, source)
-        source = actions.update_source(
+        source = actions.update_source(source, form.data)
         return source

     @api.secure
     @api.doc("delete_harvest_source")
     @api.marshal_with(source_fields)
-    def delete(self,
-        source: HarvestSource = actions.get_source(ident)
+    def delete(self, source: HarvestSource):
         OwnablePermission(source).test()
-        return actions.delete_source(
+        return actions.delete_source(source), 204


-@ns.route("/source/<
-@api.param("ident", "A source ID or slug")
+@ns.route("/source/<harvest_source:source>/validate/", endpoint="validate_harvest_source")
 class ValidateSourceAPI(API):
     @api.doc("validate_harvest_source")
     @api.secure(admin_permission)
     @api.expect(validation_fields)
     @api.marshal_with(source_fields)
-    def post(self,
+    def post(self, source: HarvestSource):
         """Validate or reject an harvest source"""
         form = api.validate(HarvestSourceValidationForm)
         if form.state.data == VALIDATION_ACCEPTED:
-            return actions.validate_source(
+            return actions.validate_source(source, form.comment.data)
         else:
-            return actions.reject_source(
+            return actions.reject_source(source, form.comment.data)


-@ns.route("/source/<
-@api.param("ident", "A source ID or slug")
+@ns.route("/source/<harvest_source:source>/run/", endpoint="run_harvest_source")
 class RunSourceAPI(API):
     @api.doc("run_harvest_source")
     @api.secure
     @api.marshal_with(source_fields)
-    def post(self,
+    def post(self, source: HarvestSource):
         enabled = current_app.config.get("HARVEST_ENABLE_MANUAL_RUN")
         if not enabled and not current_user.sysadmin:
             api.abort(
@@ -360,42 +355,40 @@ class RunSourceAPI(API):
                 "Cannot run source manually. Please contact the platform if you need to reschedule the harvester.",
             )

-        source: HarvestSource = actions.get_source(ident)
         OwnablePermission(source).test()

         if source.validation.state != VALIDATION_ACCEPTED:
             api.abort(400, "Source is not validated. Please validate the source before running.")

-        actions.launch(
+        actions.launch(source)

         return source


-@ns.route("/source/<
-@api.param("ident", "A source ID or slug")
+@ns.route("/source/<harvest_source:source>/schedule/", endpoint="schedule_harvest_source")
 class ScheduleSourceAPI(API):
     @api.doc("schedule_harvest_source")
     @api.secure(admin_permission)
     @api.expect((str, "A cron expression"))
     @api.marshal_with(source_fields)
-    def post(self,
+    def post(self, source: HarvestSource):
         """Schedule an harvest source"""
         # Handle both syntax: quoted and unquoted
         try:
             data = request.json
         except BadRequest:
             data = request.data.decode("utf-8")
-        return actions.schedule(
+        return actions.schedule(source, data)

     @api.doc("unschedule_harvest_source")
     @api.secure(admin_permission)
     @api.marshal_with(source_fields)
-    def delete(self,
+    def delete(self, source: HarvestSource):
         """Unschedule an harvest source"""
-        return actions.unschedule(
+        return actions.unschedule(source), 204


-@ns.route("/source/preview", endpoint="preview_harvest_source_config")
+@ns.route("/source/preview/", endpoint="preview_harvest_source_config")
 class PreviewSourceConfigAPI(API):
     @api.secure
     @api.expect(source_fields)
@@ -409,15 +402,14 @@ class PreviewSourceConfigAPI(API):
         return actions.preview_from_config(**form.data)


-@ns.route("/source/<
-@api.param("ident", "A source ID or slug")
+@ns.route("/source/<harvest_source:source>/preview/", endpoint="preview_harvest_source")
 class PreviewSourceAPI(API):
     @api.secure
     @api.doc("preview_harvest_source")
     @api.marshal_with(preview_job_fields)
-    def get(self,
+    def get(self, source: HarvestSource):
         """Preview a single harvest source given an ID or a slug"""
-        return actions.preview(
+        return actions.preview(source)


 parser = api.parser()
@@ -427,15 +419,15 @@ parser.add_argument(
 )


-@ns.route("/source/<
+@ns.route("/source/<harvest_source:source>/jobs/", endpoint="harvest_jobs")
 class JobsAPI(API):
     @api.doc("list_harvest_jobs")
     @api.expect(parser)
     @api.marshal_with(job_page_fields)
-    def get(self,
+    def get(self, source: HarvestSource):
         """List all jobs for a given source"""
         args = parser.parse_args()
-        qs = HarvestJob.objects(source=
+        qs = HarvestJob.objects(source=source)
         qs = qs.order_by("-created")
         return qs.paginate(args["page"], args["page_size"])

@@ -450,7 +442,7 @@ class JobAPI(API):
         return actions.get_job(ident)


-@ns.route("/backends", endpoint="harvest_backends")
+@ns.route("/backends/", endpoint="harvest_backends")
 class ListBackendsAPI(API):
     @api.doc("harvest_backends")
     @api.marshal_with(backend_fields)
@@ -471,7 +463,7 @@ class ListBackendsAPI(API):
         )


-@ns.route("/job_status", endpoint="havest_job_status")
+@ns.route("/job_status/", endpoint="havest_job_status")
 class ListHarvesterAPI(API):
     @api.doc(model=[str])
     def get(self):
```
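The change running through all of these hunks is the same: each endpoint drops its `ident` string parameter and the explicit `actions.get_source(ident)` lookup in favour of a `harvest_source` URL converter, so Flask hands the resolved `HarvestSource` directly to the view (the matching converter registration is the `udata/routing.py +6` entry in the file list above). The converter itself is not part of this diff; the sketch below is only an illustration, under the assumption that a source can be addressed by its ObjectId or its slug, of how such a Werkzeug converter could be wired up.

```python
# Hypothetical illustration only: the actual converter added in udata/routing.py
# is not shown in this diff. It sketches how a `harvest_source` URL converter
# could resolve a HarvestSource from an ID or a slug before the view runs.
from bson import ObjectId
from flask import Flask, abort
from werkzeug.routing import BaseConverter

from udata.harvest.models import HarvestSource  # existing udata model


class HarvestSourceConverter(BaseConverter):
    def to_python(self, value):
        # Assumption: a source is addressed either by its ObjectId or by its slug.
        if ObjectId.is_valid(value):
            source = HarvestSource.objects(id=value).first()
        else:
            source = HarvestSource.objects(slug=value).first()
        if source is None:
            abort(404, "Unknown harvest source")
        return source

    def to_url(self, value):
        # Build URLs from either a HarvestSource instance or a raw identifier.
        return str(value.id) if isinstance(value, HarvestSource) else str(value)


app = Flask(__name__)
app.url_map.converters["harvest_source"] = HarvestSourceConverter
```

With a converter registered under that name, a rule such as `@ns.route("/source/<harvest_source:source>/")` passes the resolved document as the `source` argument, which is what the annotated views above rely on.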
udata/harvest/backends/ckan/harvesters.py
ADDED

```diff
@@ -0,0 +1,274 @@
+import json
+import logging
+from urllib.parse import urljoin
+from uuid import UUID
+
+from udata import uris
+from udata.harvest.models import HarvestItem
+from udata.i18n import lazy_gettext as _
+
+try:
+    from udata.core.dataset.constants import UPDATE_FREQUENCIES
+except ImportError:
+    # legacy import of constants in udata
+    from udata.models import UPDATE_FREQUENCIES
+from udata.core.dataset.models import HarvestDatasetMetadata, HarvestResourceMetadata
+from udata.core.dataset.rdf import frequency_from_rdf
+from udata.frontend.markdown import parse_html
+from udata.harvest.backends.base import BaseBackend, HarvestFilter
+from udata.harvest.exceptions import HarvestException, HarvestSkipException
+from udata.models import GeoZone, License, Resource, SpatialCoverage, db
+from udata.utils import daterange_end, daterange_start, get_by
+
+from .schemas.ckan import schema as ckan_schema
+from .schemas.dkan import schema as dkan_schema
+
+log = logging.getLogger(__name__)
+
+# dkan is a dummy value for dkan that does not provide resource_type
+ALLOWED_RESOURCE_TYPES = ("dkan", "file", "file.upload", "api", "metadata")
+
+
+class CkanBackend(BaseBackend):
+    display_name = "CKAN"
+    filters = (
+        HarvestFilter(_("Organization"), "organization", str, _("A CKAN Organization name")),
+        HarvestFilter(_("Tag"), "tags", str, _("A CKAN tag name")),
+    )
+    schema = ckan_schema
+
+    def get_headers(self):
+        headers = super(CkanBackend, self).get_headers()
+        headers["content-type"] = "application/json"
+        if self.config.get("apikey"):
+            headers["Authorization"] = self.config["apikey"]
+        return headers
+
+    def action_url(self, endpoint):
+        path = "/".join(["api/3/action", endpoint])
+        return urljoin(self.source.url, path)
+
+    def dataset_url(self, name):
+        path = "/".join(["dataset", name])
+        return urljoin(self.source.url, path)
+
+    def get_action(self, endpoint, fix=False, **kwargs):
+        url = self.action_url(endpoint)
+        if fix:
+            response = self.post(url, "{}", params=kwargs)
+        else:
+            response = self.get(url, params=kwargs)
+
+        response.raise_for_status()
+        content_type = response.headers.get("Content-Type", "")
+        mime_type = content_type.split(";", 1)[0]
+
+        if mime_type == "application/json":  # Standard API JSON response
+            data = response.json()
+            # CKAN API can returns 200 even on errors
+            # Only the `success` property allows to detect errors
+            if data.get("success", False):
+                return data
+            else:
+                error = data.get("error")
+                if isinstance(error, dict):
+                    # Error object with message
+                    msg = error.get("message", "Unknown error")
+                    if "__type" in error:
+                        # Typed error
+                        msg = ": ".join((error["__type"], msg))
+                else:
+                    # Error only contains a message
+                    msg = error
+                raise HarvestException(msg)
+
+        elif mime_type == "text/html":  # Standard html error page
+            raise HarvestException("Unknown Error: {} returned HTML".format(url))
+        else:
+            # If it's not HTML, CKAN respond with raw quoted text
+            msg = response.text.strip('"')
+            raise HarvestException(msg)
+
+    def get_status(self):
+        url = urljoin(self.source.url, "/api/util/status")
+        response = self.get(url)
+        return response.json()
+
+    def inner_harvest(self):
+        """List all datasets for a given ..."""
+        fix = False  # Fix should be True for CKAN < '1.8'
+
+        filters = self.config.get("filters", [])
+        if len(filters) > 0:
+            # Build a q search query based on filters
+            # use package_search because package_list doesn't allow filtering
+            # use q parameters because fq is broken with multiple filters
+            params = []
+            for f in filters:
+                param = "{key}:{value}".format(**f)
+                if f.get("type") == "exclude":
+                    param = "-" + param
+                params.append(param)
+            q = " AND ".join(params)
+            # max out rows count to 1000 as per
+            # https://docs.ckan.org/en/latest/api/#ckan.logic.action.get.package_search
+            response = self.get_action("package_search", fix=fix, q=q, rows=1000)
+            names = [r["name"] for r in response["result"]["results"]]
+        else:
+            response = self.get_action("package_list", fix=fix)
+            names = response["result"]
+
+        for name in names:
+            # We use `name` as `remote_id` for now, we'll be replace at the beginning of the process
+            self.process_dataset(name)
+            if self.has_reached_max_items():
+                return
+
+    def inner_process_dataset(self, item: HarvestItem):
+        response = self.get_action("package_show", id=item.remote_id)
+
+        result = response["result"]
+        # DKAN returns a list where CKAN returns an object
+        # we "unlist" here instead of after schema validation in order to get the id easily
+        if type(result) is list:
+            result = result[0]
+
+        # Replace the `remote_id` from `name` to `id`.
+        if result.get("id"):
+            item.remote_id = result["id"]
+
+        data = self.validate(result, self.schema)
+
+        # Skip if no resource
+        if not len(data.get("resources", [])):
+            raise HarvestSkipException(f"Dataset {data['name']} has no record")
+
+        dataset = self.get_dataset(item.remote_id)
+
+        if not dataset.harvest:
+            dataset.harvest = HarvestDatasetMetadata()
+
+        # Core attributes
+        if not dataset.slug:
+            dataset.slug = data["name"]
+        dataset.title = data["title"]
+        dataset.description = parse_html(data["notes"])
+
+        # Detect license
+        default_license = dataset.license or License.default()
+        dataset.license = License.guess(
+            data["license_id"], data["license_title"], default=default_license
+        )
+
+        dataset.tags = [t["name"] for t in data["tags"] if t["name"]]
+
+        dataset.harvest.created_at = data["metadata_created"]
+        dataset.harvest.modified_at = data["metadata_modified"]
+
+        dataset.harvest.ckan_name = data["name"]
+
+        temporal_start, temporal_end = None, None
+        spatial_geom, spatial_zone = None, None
+
+        for extra in data["extras"]:
+            key = extra["key"]
+            value = extra["value"]
+            if value is None or (isinstance(value, str) and not value.strip()):
+                # Skip empty extras
+                continue
+            elif key == "spatial":
+                # GeoJSON representation (Polygon or Point)
+                spatial_geom = json.loads(value)
+            elif key == "spatial-text":
+                # Textual representation of the extent / location
+                qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
+                if qs.count() == 1:
+                    spatial_zone = qs.first()
+                else:
+                    dataset.extras["ckan:spatial-text"] = value
+                    log.debug("spatial-text value not handled: %s", value)
+            elif key == "spatial-uri":
+                # Linked Data URI representing the place name
+                dataset.extras["ckan:spatial-uri"] = value
+                log.debug("spatial-uri value not handled: %s", value)
+            elif key == "frequency":
+                # Update frequency
+                freq = frequency_from_rdf(value)
+                if freq:
+                    dataset.frequency = freq
+                elif value in UPDATE_FREQUENCIES:
+                    dataset.frequency = value
+                else:
+                    dataset.extras["ckan:frequency"] = value
+                    log.debug("frequency value not handled: %s", value)
+            # Temporal coverage start
+            elif key == "temporal_start":
+                temporal_start = daterange_start(value)
+            # Temporal coverage end
+            elif key == "temporal_end":
+                temporal_end = daterange_end(value)
+            else:
+                dataset.extras[extra["key"]] = value
+
+        if spatial_geom or spatial_zone:
+            dataset.spatial = SpatialCoverage()
+
+            if spatial_zone:
+                dataset.spatial.zones = [spatial_zone]
+
+            if spatial_geom:
+                if spatial_geom["type"] == "Polygon":
+                    coordinates = [spatial_geom["coordinates"]]
+                elif spatial_geom["type"] == "MultiPolygon":
+                    coordinates = spatial_geom["coordinates"]
+                else:
+                    raise HarvestException("Unsupported spatial geometry")
+                dataset.spatial.geom = {"type": "MultiPolygon", "coordinates": coordinates}
+
+        if temporal_start and temporal_end:
+            dataset.temporal_coverage = db.DateRange(
+                start=temporal_start,
+                end=temporal_end,
+            )
+
+        # Remote URL
+        dataset.harvest.remote_url = self.dataset_url(data["name"])
+        if data.get("url"):
+            try:
+                url = uris.validate(data["url"])
+            except uris.ValidationError:
+                dataset.harvest.ckan_source = data["url"]
+            else:
+                # use declared `url` as `remote_url` if any
+                dataset.harvest.remote_url = url
+
+        # Resources
+        for res in data["resources"]:
+            if res["resource_type"] not in ALLOWED_RESOURCE_TYPES:
+                continue
+            try:
+                resource = get_by(dataset.resources, "id", UUID(res["id"]))
+            except Exception:
+                log.error("Unable to parse resource ID %s", res["id"])
+                continue
+            if not resource:
+                resource = Resource(id=res["id"])
+                dataset.resources.append(resource)
+            if not resource.harvest:
+                resource.harvest = HarvestResourceMetadata()
+            resource.title = res.get("name", "") or ""
+            resource.description = parse_html(res.get("description"))
+            resource.url = res["url"]
+            resource.filetype = "remote"
+            resource.format = res.get("format")
+            resource.mime = res.get("mimetype")
+            resource.hash = res.get("hash")
+            resource.harvest.created_at = res["created"]
+            resource.harvest.modified_at = res["last_modified"]
+
+        return dataset
+
+
+class DkanBackend(CkanBackend):
+    schema = dkan_schema
+    filters = []
```
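`CkanBackend.inner_harvest` above only calls `package_search` when filters are configured, building a Solr-style `q` string from them; otherwise it falls back to `package_list`. The standalone sketch below reproduces that query-building step so the resulting string is easy to see (the sample filter values are illustrative, not taken from a real harvest source configuration).

```python
# Standalone sketch of the query building done in CkanBackend.inner_harvest:
# each configured filter becomes a "key:value" term, "exclude" filters are
# negated with a leading "-", and all terms are AND-ed for package_search.
def build_ckan_query(filters):
    params = []
    for f in filters:
        param = "{key}:{value}".format(**f)
        if f.get("type") == "exclude":
            param = "-" + param
        params.append(param)
    return " AND ".join(params)


if __name__ == "__main__":
    # Illustrative filter configuration (not from a real harvest source)
    filters = [
        {"key": "organization", "value": "some-org"},
        {"key": "tags", "value": "transport", "type": "exclude"},
    ]
    print(build_ckan_query(filters))  # organization:some-org AND -tags:transport
```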
udata/harvest/backends/ckan/schemas/__init__.py
File without changes (new empty file)
udata/harvest/backends/ckan/schemas/ckan.py
ADDED

```diff
@@ -0,0 +1,86 @@
+from voluptuous import All, Any, Coerce, DefaultTo, Lower, Optional, Schema
+
+from udata.harvest.filters import (
+    boolean,
+    email,
+    empty_none,
+    hash,
+    is_url,
+    normalize_string,
+    normalize_tag,
+    slug,
+    to_date,
+)
+
+RESOURCE_TYPES = ("file", "file.upload", "api", "documentation", "image", "visualization")
+
+
+resource = {
+    "id": str,
+    "position": int,
+    "name": All(DefaultTo(""), str),
+    "description": Any(All(str, normalize_string), None),
+    "format": All(str, Lower),
+    "mimetype": Any(All(str, Lower), None),
+    "size": Any(Coerce(int), None),
+    "hash": Any(All(str, hash), None),
+    "created": All(str, to_date),
+    "last_modified": Any(All(str, to_date), None),
+    "url": All(str, is_url()),
+    "resource_type": All(empty_none, DefaultTo("file"), str, Any(*RESOURCE_TYPES)),
+}
+
+tag = {
+    "id": str,
+    Optional("vocabulary_id"): Any(str, None),
+    Optional("display_name"): str,
+    "name": All(str, normalize_tag),
+    Optional("state"): str,
+}
+
+organization = {
+    "id": str,
+    "description": str,
+    "created": All(str, to_date),
+    "title": str,
+    "name": All(str, slug),
+    "revision_timestamp": All(str, to_date),
+    "is_organization": boolean,
+    "state": str,
+    "image_url": str,
+    "revision_id": str,
+    "type": "organization",
+    "approval_status": "approved",
+}
+
+schema = Schema(
+    {
+        "id": str,
+        "name": str,
+        "title": str,
+        "notes": Any(All(str, normalize_string), None),
+        "license_id": All(DefaultTo("not-specified"), str),
+        "license_title": Any(str, None),
+        "tags": [tag],
+        "metadata_created": All(str, to_date),
+        "metadata_modified": All(str, to_date),
+        "organization": Any(organization, None),
+        "resources": [resource],
+        Optional("revision_id"): str,
+        Optional("extras", default=list): [
+            {
+                "key": str,
+                "value": Any(str, int, float, boolean, dict, list),
+            }
+        ],
+        "private": boolean,
+        "type": "dataset",
+        "author": Any(str, None),
+        "author_email": All(empty_none, Any(All(str, email), None)),
+        "maintainer": Any(str, None),
+        "maintainer_email": All(empty_none, Any(All(str, email), None)),
+        "state": Any(str, None),
+    },
+    required=True,
+    extra=True,
+)
```
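The CKAN payload schema above chains voluptuous validators with `All`/`Any` and is built with `required=True, extra=True`, so every listed key must be present while unknown keys are passed through. The fragment below is a minimal illustration of how those idioms behave on an invented sample dict; it is not part of udata and only mirrors a few of the validators used above.

```python
# Minimal illustration (invented sample data) of the voluptuous idioms used in
# the CKAN schema: DefaultTo replaces None, Lower normalizes case, Coerce casts,
# and extra keys are tolerated (ALLOW_EXTRA is the constant behind extra=True).
from voluptuous import ALLOW_EXTRA, All, Any, Coerce, DefaultTo, Lower, Schema

mini_resource = Schema(
    {
        "name": All(DefaultTo(""), str),
        "format": All(str, Lower),
        "size": Any(Coerce(int), None),
    },
    required=True,
    extra=ALLOW_EXTRA,
)

validated = mini_resource({"name": None, "format": "CSV", "size": "42", "unknown": "kept"})
# -> {"name": "", "format": "csv", "size": 42, "unknown": "kept"}
print(validated)
```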
udata/harvest/backends/ckan/schemas/dkan.py
ADDED

```diff
@@ -0,0 +1,98 @@
+import dateutil.parser
+from humanfriendly import parse_size
+from voluptuous import All, Any, DefaultTo, Lower, Optional, Schema
+
+from udata.harvest.filters import boolean, email, empty_none, hash, is_url, normalize_string, slug
+
+from .ckan import tag
+
+
+class FrenchParserInfo(dateutil.parser.parserinfo):
+    WEEKDAYS = [
+        ("Lun", "Lundi"),
+        ("Mar", "Mardi"),
+        ("Mer", "Mercredi"),
+        ("Jeu", "Jeudi"),
+        ("Ven", "Vendredi"),
+        ("Sam", "Samedi"),
+        ("Dim", "Dimanche"),
+    ]
+
+
+def parse_date(value, **kwargs):
+    return dateutil.parser.parse(value, **kwargs).date()
+
+
+def to_date(value):
+    """
+    Try w/ french weekdays then dateutil's default
+    `fuzzy` is used when 'Date changed' is in the value
+    """
+    try:
+        return parse_date(value, fuzzy=True, parserinfo=FrenchParserInfo(), dayfirst=True)
+    except ValueError:
+        return parse_date(value, fuzzy=True)
+
+
+def dkan_parse_size(value):
+    if value:
+        # not strictly true but should be enough
+        value = value.replace("octets", "bytes")
+        return parse_size(value)
+
+
+resource = {
+    "id": str,
+    "name": All(DefaultTo(""), str),
+    "description": All(str, normalize_string),
+    "format": All(str, Lower),
+    "mimetype": Any(All(str, Lower), None),
+    "size": All(str, dkan_parse_size),
+    Optional("hash"): Any(All(str, hash), None),
+    "created": All(str, to_date),
+    "last_modified": Any(All(str, to_date), None),
+    "url": All(str, is_url()),
+    Optional("resource_type", default="dkan"): All(
+        empty_none,
+        str,
+    ),
+}
+
+group = {
+    "id": str,
+    "description": str,
+    "image_display_url": str,
+    "title": str,
+    "name": All(str, slug),
+}
+
+schema = Schema(
+    {
+        "id": str,
+        "name": str,
+        "title": str,
+        "notes": Any(All(str, normalize_string), None),
+        Optional("license_id", default=None): All(DefaultTo("not-specified"), str),
+        Optional("license_title", default=None): Any(str, None),
+        Optional("tags", default=list): [tag],
+        "metadata_created": All(str, to_date),
+        "metadata_modified": All(str, to_date),
+        Optional("groups"): [Any(group, None)],
+        "resources": [resource],
+        Optional("extras", default=list): [
+            {
+                "key": str,
+                "value": Any(str, int, float, boolean, dict, list),
+            }
+        ],
+        "private": boolean,
+        "type": "Dataset",
+        Optional("author"): Any(str, None),
+        Optional("author_email"): All(empty_none, Any(All(str, email), None)),
+        "maintainer": Any(str, None),
+        "maintainer_email": All(empty_none, Any(All(str, email), None)),
+        "state": Any(str, None),
+    },
+    required=True,
+    extra=True,
+)
```
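The DKAN schema differs from the CKAN one mainly in its looser field handling and in two helpers: `to_date`, which first tries dateutil with a French `parserinfo` (DKAN portals often localise dates) before falling back to the default parser, and `dkan_parse_size`, which maps the French unit "octets" to "bytes" so `humanfriendly.parse_size` can read it. The snippet below exercises those two helpers with invented sample strings.

```python
# Illustration only: these helpers are copied from the DKAN schema above and run
# on invented sample strings; none of this data comes from a real DKAN portal.
import dateutil.parser
from humanfriendly import parse_size


class FrenchParserInfo(dateutil.parser.parserinfo):
    # French weekday names so values like "Mer, 21/06/2017" are understood
    WEEKDAYS = [
        ("Lun", "Lundi"),
        ("Mar", "Mardi"),
        ("Mer", "Mercredi"),
        ("Jeu", "Jeudi"),
        ("Ven", "Vendredi"),
        ("Sam", "Samedi"),
        ("Dim", "Dimanche"),
    ]


def to_date(value):
    # Try French weekdays with day-first dates, then fall back to dateutil's default
    try:
        return dateutil.parser.parse(
            value, fuzzy=True, parserinfo=FrenchParserInfo(), dayfirst=True
        ).date()
    except ValueError:
        return dateutil.parser.parse(value, fuzzy=True).date()


def dkan_parse_size(value):
    if value:
        # humanfriendly knows "bytes" but not the French "octets"
        return parse_size(value.replace("octets", "bytes"))


print(to_date("Mer, 21/06/2017"))     # 2017-06-21
print(dkan_parse_size("150 octets"))  # 150
```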