udata 10.8.1.dev36703__py2.py3-none-any.whl → 10.8.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of udata might be problematic. Click here for more details.

Files changed (79) hide show
  1. udata/__init__.py +1 -1
  2. udata/app.py +0 -2
  3. udata/commands/db.py +22 -9
  4. udata/core/dataset/models.py +5 -3
  5. udata/core/discussions/api.py +2 -2
  6. udata/core/jobs/api.py +3 -3
  7. udata/core/metrics/helpers.py +10 -0
  8. udata/core/metrics/tasks.py +144 -1
  9. udata/core/organization/api.py +2 -2
  10. udata/core/post/api.py +1 -1
  11. udata/core/user/api.py +1 -1
  12. udata/features/identicon/api.py +1 -1
  13. udata/harvest/actions.py +24 -28
  14. udata/harvest/api.py +28 -36
  15. udata/harvest/backends/ckan/__init__.py +3 -0
  16. udata/harvest/backends/ckan/harvesters.py +274 -0
  17. udata/harvest/backends/ckan/schemas/__init__.py +0 -0
  18. udata/harvest/backends/ckan/schemas/ckan.py +86 -0
  19. udata/harvest/backends/ckan/schemas/dkan.py +98 -0
  20. udata/harvest/commands.py +7 -7
  21. udata/harvest/tasks.py +1 -1
  22. udata/harvest/tests/ckan/conftest.py +67 -0
  23. udata/harvest/tests/ckan/data/dkan-french-w-license.json +226 -0
  24. udata/harvest/tests/ckan/test_ckan_backend.py +697 -0
  25. udata/harvest/tests/ckan/test_ckan_backend_errors.py +140 -0
  26. udata/harvest/tests/ckan/test_ckan_backend_filters.py +130 -0
  27. udata/harvest/tests/ckan/test_dkan_backend.py +68 -0
  28. udata/harvest/tests/test_actions.py +27 -32
  29. udata/harvest/tests/test_api.py +23 -18
  30. udata/harvest/tests/test_dcat_backend.py +29 -29
  31. udata/migrations/2025-07-30-purge-old-harvest-dynamic-fields.py +29 -0
  32. udata/mongo/slug_fields.py +1 -1
  33. udata/routing.py +6 -0
  34. udata/static/chunks/{11.b6f741fcc366abfad9c4.js → 11.51d706fb9521c16976bc.js} +3 -3
  35. udata/static/chunks/{11.b6f741fcc366abfad9c4.js.map → 11.51d706fb9521c16976bc.js.map} +1 -1
  36. udata/static/chunks/{13.2d06442dd9a05d9777b5.js → 13.39e106d56f794ebd06a0.js} +2 -2
  37. udata/static/chunks/{13.2d06442dd9a05d9777b5.js.map → 13.39e106d56f794ebd06a0.js.map} +1 -1
  38. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js → 17.70cbb4a91b002338007e.js} +2 -2
  39. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js.map → 17.70cbb4a91b002338007e.js.map} +1 -1
  40. udata/static/chunks/{19.f03a102365af4315f9db.js → 19.a348a5fff8fe2801e52a.js} +3 -3
  41. udata/static/chunks/{19.f03a102365af4315f9db.js.map → 19.a348a5fff8fe2801e52a.js.map} +1 -1
  42. udata/static/chunks/{5.0fa1408dae4e76b87b2e.js → 5.343ca020a2d38cec1a14.js} +3 -3
  43. udata/static/chunks/{5.0fa1408dae4e76b87b2e.js.map → 5.343ca020a2d38cec1a14.js.map} +1 -1
  44. udata/static/chunks/{6.d663709d877baa44a71e.js → 6.a3b07de9dd2ca2d24e85.js} +3 -3
  45. udata/static/chunks/{6.d663709d877baa44a71e.js.map → 6.a3b07de9dd2ca2d24e85.js.map} +1 -1
  46. udata/static/chunks/{8.778091d55cd8ea39af6b.js → 8.462bb3029de008497675.js} +2 -2
  47. udata/static/chunks/{8.778091d55cd8ea39af6b.js.map → 8.462bb3029de008497675.js.map} +1 -1
  48. udata/static/common.js +1 -1
  49. udata/static/common.js.map +1 -1
  50. udata/tests/api/test_datasets_api.py +0 -46
  51. udata/tests/api/test_organizations_api.py +5 -0
  52. udata/tests/cli/test_db_cli.py +12 -0
  53. udata/tests/dataset/test_dataset_model.py +0 -16
  54. udata/tests/metrics/__init__.py +0 -0
  55. udata/tests/metrics/conftest.py +15 -0
  56. udata/tests/metrics/helpers.py +58 -0
  57. udata/tests/metrics/test_metrics.py +67 -0
  58. udata/tests/metrics/test_tasks.py +171 -0
  59. udata/translations/ar/LC_MESSAGES/udata.mo +0 -0
  60. udata/translations/ar/LC_MESSAGES/udata.po +72 -65
  61. udata/translations/de/LC_MESSAGES/udata.mo +0 -0
  62. udata/translations/de/LC_MESSAGES/udata.po +72 -65
  63. udata/translations/es/LC_MESSAGES/udata.mo +0 -0
  64. udata/translations/es/LC_MESSAGES/udata.po +72 -65
  65. udata/translations/fr/LC_MESSAGES/udata.mo +0 -0
  66. udata/translations/fr/LC_MESSAGES/udata.po +72 -65
  67. udata/translations/it/LC_MESSAGES/udata.mo +0 -0
  68. udata/translations/it/LC_MESSAGES/udata.po +72 -65
  69. udata/translations/pt/LC_MESSAGES/udata.mo +0 -0
  70. udata/translations/pt/LC_MESSAGES/udata.po +72 -65
  71. udata/translations/sr/LC_MESSAGES/udata.mo +0 -0
  72. udata/translations/sr/LC_MESSAGES/udata.po +72 -65
  73. udata/translations/udata.pot +74 -70
  74. {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/METADATA +15 -2
  75. {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/RECORD +79 -62
  76. {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/entry_points.txt +2 -0
  77. {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/LICENSE +0 -0
  78. {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/WHEEL +0 -0
  79. {udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/top_level.txt +0 -0
udata/harvest/api.py CHANGED
@@ -300,59 +300,54 @@ class SourcesAPI(API):
300
300
  return source, 201
301
301
 
302
302
 
303
- @ns.route("/source/<string:ident>", endpoint="harvest_source")
304
- @api.param("ident", "A source ID or slug")
303
+ @ns.route("/source/<harvest_source:source>/", endpoint="harvest_source")
305
304
  class SourceAPI(API):
306
305
  @api.doc("get_harvest_source")
307
306
  @api.marshal_with(source_fields)
308
- def get(self, ident):
307
+ def get(self, source: HarvestSource):
309
308
  """Get a single source given an ID or a slug"""
310
- return actions.get_source(ident)
309
+ return source
311
310
 
312
311
  @api.secure
313
312
  @api.doc("update_harvest_source")
314
313
  @api.expect(source_fields)
315
314
  @api.marshal_with(source_fields)
316
- def put(self, ident):
315
+ def put(self, source: HarvestSource):
317
316
  """Update a harvest source"""
318
- source = actions.get_source(ident)
319
317
  OwnablePermission(source).test()
320
318
  form = api.validate(HarvestSourceForm, source)
321
- source = actions.update_source(ident, form.data)
319
+ source = actions.update_source(source, form.data)
322
320
  return source
323
321
 
324
322
  @api.secure
325
323
  @api.doc("delete_harvest_source")
326
324
  @api.marshal_with(source_fields)
327
- def delete(self, ident):
328
- source: HarvestSource = actions.get_source(ident)
325
+ def delete(self, source: HarvestSource):
329
326
  OwnablePermission(source).test()
330
- return actions.delete_source(ident), 204
327
+ return actions.delete_source(source), 204
331
328
 
332
329
 
333
- @ns.route("/source/<string:ident>/validate", endpoint="validate_harvest_source")
334
- @api.param("ident", "A source ID or slug")
330
+ @ns.route("/source/<harvest_source:source>/validate/", endpoint="validate_harvest_source")
335
331
  class ValidateSourceAPI(API):
336
332
  @api.doc("validate_harvest_source")
337
333
  @api.secure(admin_permission)
338
334
  @api.expect(validation_fields)
339
335
  @api.marshal_with(source_fields)
340
- def post(self, ident):
336
+ def post(self, source: HarvestSource):
341
337
  """Validate or reject an harvest source"""
342
338
  form = api.validate(HarvestSourceValidationForm)
343
339
  if form.state.data == VALIDATION_ACCEPTED:
344
- return actions.validate_source(ident, form.comment.data)
340
+ return actions.validate_source(source, form.comment.data)
345
341
  else:
346
- return actions.reject_source(ident, form.comment.data)
342
+ return actions.reject_source(source, form.comment.data)
347
343
 
348
344
 
349
- @ns.route("/source/<string:ident>/run", endpoint="run_harvest_source")
350
- @api.param("ident", "A source ID or slug")
345
+ @ns.route("/source/<harvest_source:source>/run/", endpoint="run_harvest_source")
351
346
  class RunSourceAPI(API):
352
347
  @api.doc("run_harvest_source")
353
348
  @api.secure
354
349
  @api.marshal_with(source_fields)
355
- def post(self, ident):
350
+ def post(self, source: HarvestSource):
356
351
  enabled = current_app.config.get("HARVEST_ENABLE_MANUAL_RUN")
357
352
  if not enabled and not current_user.sysadmin:
358
353
  api.abort(
@@ -360,42 +355,40 @@ class RunSourceAPI(API):
360
355
  "Cannot run source manually. Please contact the platform if you need to reschedule the harvester.",
361
356
  )
362
357
 
363
- source: HarvestSource = actions.get_source(ident)
364
358
  OwnablePermission(source).test()
365
359
 
366
360
  if source.validation.state != VALIDATION_ACCEPTED:
367
361
  api.abort(400, "Source is not validated. Please validate the source before running.")
368
362
 
369
- actions.launch(ident)
363
+ actions.launch(source)
370
364
 
371
365
  return source
372
366
 
373
367
 
374
- @ns.route("/source/<string:ident>/schedule", endpoint="schedule_harvest_source")
375
- @api.param("ident", "A source ID or slug")
368
+ @ns.route("/source/<harvest_source:source>/schedule/", endpoint="schedule_harvest_source")
376
369
  class ScheduleSourceAPI(API):
377
370
  @api.doc("schedule_harvest_source")
378
371
  @api.secure(admin_permission)
379
372
  @api.expect((str, "A cron expression"))
380
373
  @api.marshal_with(source_fields)
381
- def post(self, ident):
374
+ def post(self, source: HarvestSource):
382
375
  """Schedule an harvest source"""
383
376
  # Handle both syntax: quoted and unquoted
384
377
  try:
385
378
  data = request.json
386
379
  except BadRequest:
387
380
  data = request.data.decode("utf-8")
388
- return actions.schedule(ident, data)
381
+ return actions.schedule(source, data)
389
382
 
390
383
  @api.doc("unschedule_harvest_source")
391
384
  @api.secure(admin_permission)
392
385
  @api.marshal_with(source_fields)
393
- def delete(self, ident):
386
+ def delete(self, source: HarvestSource):
394
387
  """Unschedule an harvest source"""
395
- return actions.unschedule(ident), 204
388
+ return actions.unschedule(source), 204
396
389
 
397
390
 
398
- @ns.route("/source/preview", endpoint="preview_harvest_source_config")
391
+ @ns.route("/source/preview/", endpoint="preview_harvest_source_config")
399
392
  class PreviewSourceConfigAPI(API):
400
393
  @api.secure
401
394
  @api.expect(source_fields)
@@ -409,15 +402,14 @@ class PreviewSourceConfigAPI(API):
409
402
  return actions.preview_from_config(**form.data)
410
403
 
411
404
 
412
- @ns.route("/source/<string:ident>/preview", endpoint="preview_harvest_source")
413
- @api.param("ident", "A source ID or slug")
405
+ @ns.route("/source/<harvest_source:source>/preview/", endpoint="preview_harvest_source")
414
406
  class PreviewSourceAPI(API):
415
407
  @api.secure
416
408
  @api.doc("preview_harvest_source")
417
409
  @api.marshal_with(preview_job_fields)
418
- def get(self, ident):
410
+ def get(self, source: HarvestSource):
419
411
  """Preview a single harvest source given an ID or a slug"""
420
- return actions.preview(ident)
412
+ return actions.preview(source)
421
413
 
422
414
 
423
415
  parser = api.parser()
@@ -427,15 +419,15 @@ parser.add_argument(
427
419
  )
428
420
 
429
421
 
430
- @ns.route("/source/<string:ident>/jobs/", endpoint="harvest_jobs")
422
+ @ns.route("/source/<harvest_source:source>/jobs/", endpoint="harvest_jobs")
431
423
  class JobsAPI(API):
432
424
  @api.doc("list_harvest_jobs")
433
425
  @api.expect(parser)
434
426
  @api.marshal_with(job_page_fields)
435
- def get(self, ident):
427
+ def get(self, source: HarvestSource):
436
428
  """List all jobs for a given source"""
437
429
  args = parser.parse_args()
438
- qs = HarvestJob.objects(source=ident)
430
+ qs = HarvestJob.objects(source=source)
439
431
  qs = qs.order_by("-created")
440
432
  return qs.paginate(args["page"], args["page_size"])
441
433
 
@@ -450,7 +442,7 @@ class JobAPI(API):
450
442
  return actions.get_job(ident)
451
443
 
452
444
 
453
- @ns.route("/backends", endpoint="harvest_backends")
445
+ @ns.route("/backends/", endpoint="harvest_backends")
454
446
  class ListBackendsAPI(API):
455
447
  @api.doc("harvest_backends")
456
448
  @api.marshal_with(backend_fields)
@@ -471,7 +463,7 @@ class ListBackendsAPI(API):
471
463
  )
472
464
 
473
465
 
474
- @ns.route("/job_status", endpoint="havest_job_status")
466
+ @ns.route("/job_status/", endpoint="havest_job_status")
475
467
  class ListHarvesterAPI(API):
476
468
  @api.doc(model=[str])
477
469
  def get(self):
@@ -0,0 +1,3 @@
1
+ """
2
+ CKAN integration for udata
3
+ """
@@ -0,0 +1,274 @@
1
+ import json
2
+ import logging
3
+ from urllib.parse import urljoin
4
+ from uuid import UUID
5
+
6
+ from udata import uris
7
+ from udata.harvest.models import HarvestItem
8
+ from udata.i18n import lazy_gettext as _
9
+
10
+ try:
11
+ from udata.core.dataset.constants import UPDATE_FREQUENCIES
12
+ except ImportError:
13
+ # legacy import of constants in udata
14
+ from udata.models import UPDATE_FREQUENCIES
15
+ from udata.core.dataset.models import HarvestDatasetMetadata, HarvestResourceMetadata
16
+ from udata.core.dataset.rdf import frequency_from_rdf
17
+ from udata.frontend.markdown import parse_html
18
+ from udata.harvest.backends.base import BaseBackend, HarvestFilter
19
+ from udata.harvest.exceptions import HarvestException, HarvestSkipException
20
+ from udata.models import GeoZone, License, Resource, SpatialCoverage, db
21
+ from udata.utils import daterange_end, daterange_start, get_by
22
+
23
+ from .schemas.ckan import schema as ckan_schema
24
+ from .schemas.dkan import schema as dkan_schema
25
+
26
# Module-level logger shared by the CKAN and DKAN harvest backends.
log = logging.getLogger(__name__)

# Resource types that get imported; a resource whose `resource_type` is not
# listed here is skipped during dataset processing.
# "dkan" is not a real CKAN resource type: it is the dummy value used for DKAN,
# which does not provide `resource_type`.
ALLOWED_RESOURCE_TYPES = ("dkan", "file", "file.upload", "api", "metadata")
30
+
31
+
32
class CkanBackend(BaseBackend):
    """Harvest backend reading datasets from a CKAN portal through its action API.

    Dataset names are listed with `package_list` (or `package_search` when
    filters are configured), then each dataset is fetched with `package_show`,
    validated against `schema` and mapped onto a udata dataset.
    """

    display_name = "CKAN"
    filters = (
        HarvestFilter(_("Organization"), "organization", str, _("A CKAN Organization name")),
        HarvestFilter(_("Tag"), "tags", str, _("A CKAN tag name")),
    )
    schema = ckan_schema

    def get_headers(self):
        """Return the base backend headers plus JSON content type and optional API key."""
        headers = super().get_headers()
        headers["content-type"] = "application/json"
        if self.config.get("apikey"):
            headers["Authorization"] = self.config["apikey"]
        return headers

    def action_url(self, endpoint):
        """Build the URL of the `api/3/action/<endpoint>` action, relative to the source URL."""
        path = "/".join(["api/3/action", endpoint])
        return urljoin(self.source.url, path)

    def dataset_url(self, name):
        """Build the URL of the `dataset/<name>` page, relative to the source URL."""
        path = "/".join(["dataset", name])
        return urljoin(self.source.url, path)

    def get_action(self, endpoint, fix=False, **kwargs):
        """Call a CKAN action and return its decoded JSON payload.

        :param endpoint: action name, appended to `api/3/action/`.
        :param fix: when true, the action is called with a POST and a `"{}"`
            body instead of a GET.
        :param kwargs: sent as query-string parameters.
        :returns: the decoded JSON object when its `success` property is truthy.
        :raises HarvestException: when the payload reports a failure, or when
            the response is not JSON (HTML error page or raw quoted text).

        HTTP error statuses are raised by `response.raise_for_status()`.
        """
        url = self.action_url(endpoint)
        if fix:
            response = self.post(url, "{}", params=kwargs)
        else:
            response = self.get(url, params=kwargs)

        response.raise_for_status()
        content_type = response.headers.get("Content-Type", "")
        # Drop parameters such as `; charset=utf-8` and normalize case/whitespace
        mime_type = content_type.split(";", 1)[0].strip().lower()

        if mime_type == "application/json":  # Standard API JSON response
            data = response.json()
            # The CKAN API can return 200 even on errors:
            # only the `success` property allows detecting them
            if data.get("success", False):
                return data

            error = data.get("error")
            if isinstance(error, dict):
                # Error object with message
                msg = error.get("message", "Unknown error")
                if "__type" in error:
                    # Typed error
                    msg = ": ".join((error["__type"], msg))
            else:
                # Error only contains a message (or nothing at all)
                msg = error or "Unknown error"
            raise HarvestException(msg)

        if mime_type == "text/html":  # Standard HTML error page
            raise HarvestException(f"Unknown Error: {url} returned HTML")

        # If it's not HTML, CKAN responds with raw quoted text
        raise HarvestException(response.text.strip('"'))

    def get_status(self):
        """Fetch `/api/util/status` from the source and return its decoded JSON."""
        url = urljoin(self.source.url, "/api/util/status")
        response = self.get(url)
        return response.json()

    def inner_harvest(self):
        """List the remote dataset names and process each of them.

        Processing stops as soon as `has_reached_max_items()` is true.
        """
        fix = False  # Fix should be True for CKAN < '1.8'

        filters = self.config.get("filters", [])
        if filters:
            # Build a `q` search query based on filters:
            # use package_search because package_list doesn't allow filtering,
            # use `q` because `fq` is broken with multiple filters
            terms = []
            for f in filters:
                term = "{key}:{value}".format(**f)
                if f.get("type") == "exclude":
                    term = "-" + term
                terms.append(term)
            q = " AND ".join(terms)
            # max out rows count to 1000 as per
            # https://docs.ckan.org/en/latest/api/#ckan.logic.action.get.package_search
            # NOTE(review): only this single page is read; results past the
            # first 1000 rows are not fetched.
            response = self.get_action("package_search", fix=fix, q=q, rows=1000)
            names = [r["name"] for r in response["result"]["results"]]
        else:
            response = self.get_action("package_list", fix=fix)
            names = response["result"]

        for name in names:
            # `name` is used as `remote_id` for now; it is replaced by the
            # real `id` at the beginning of `inner_process_dataset`
            self.process_dataset(name)
            if self.has_reached_max_items():
                return

    def inner_process_dataset(self, item: HarvestItem):
        """Fetch one dataset with `package_show` and map it onto a udata dataset.

        :param item: harvest item whose `remote_id` initially holds the dataset
            name; it is replaced by the remote `id` when the payload has one.
        :returns: the updated dataset.
        :raises HarvestSkipException: when the dataset has no resource.
        :raises HarvestException: on an empty DKAN result list or an
            unsupported spatial geometry type.
        """
        response = self.get_action("package_show", id=item.remote_id)

        result = response["result"]
        # DKAN returns a list where CKAN returns an object;
        # we "unlist" here instead of after schema validation in order to get the id easily
        if isinstance(result, list):
            if not result:
                raise HarvestException(f"Empty result for dataset {item.remote_id}")
            result = result[0]

        # Replace the `remote_id` from `name` to `id`.
        if result.get("id"):
            item.remote_id = result["id"]

        data = self.validate(result, self.schema)

        # Skip if no resource
        if not data.get("resources"):
            raise HarvestSkipException(f"Dataset {data['name']} has no record")

        dataset = self.get_dataset(item.remote_id)

        if not dataset.harvest:
            dataset.harvest = HarvestDatasetMetadata()

        # Core attributes
        if not dataset.slug:
            dataset.slug = data["name"]
        dataset.title = data["title"]
        dataset.description = parse_html(data["notes"])

        # Detect license, keeping the current one (or the default) as fallback
        default_license = dataset.license or License.default()
        dataset.license = License.guess(
            data["license_id"], data["license_title"], default=default_license
        )

        dataset.tags = [t["name"] for t in data["tags"] if t["name"]]

        dataset.harvest.created_at = data["metadata_created"]
        dataset.harvest.modified_at = data["metadata_modified"]

        dataset.harvest.ckan_name = data["name"]

        temporal_start, temporal_end, spatial_geom, spatial_zone = self._apply_ckan_extras(
            dataset, data["extras"]
        )
        self._apply_ckan_spatial(dataset, spatial_geom, spatial_zone)

        # A temporal coverage is only set when both bounds are known
        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        dataset.harvest.remote_url = self.dataset_url(data["name"])
        if data.get("url"):
            try:
                url = uris.validate(data["url"])
            except uris.ValidationError:
                # Not a valid URL: keep the raw value as the CKAN source
                dataset.harvest.ckan_source = data["url"]
            else:
                # use declared `url` as `remote_url` if any
                dataset.harvest.remote_url = url

        self._apply_ckan_resources(dataset, data["resources"])

        return dataset

    def _apply_ckan_extras(self, dataset, extras):
        """Map CKAN extras onto `dataset`; return (temporal_start, temporal_end, geom, zone)."""
        temporal_start, temporal_end = None, None
        spatial_geom, spatial_zone = None, None

        for extra in extras:
            key = extra["key"]
            value = extra["value"]
            if value is None or (isinstance(value, str) and not value.strip()):
                # Skip empty extras
                continue
            elif key == "spatial":
                # GeoJSON representation (Polygon or MultiPolygon).
                # The schema also accepts already-decoded dict values, which
                # `json.loads` would reject with a TypeError.
                spatial_geom = value if isinstance(value, dict) else json.loads(value)
            elif key == "spatial-text":
                # Textual representation of the extent / location
                qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
                if qs.count() == 1:
                    spatial_zone = qs.first()
                else:
                    dataset.extras["ckan:spatial-text"] = value
                    log.debug("spatial-text value not handled: %s", value)
            elif key == "spatial-uri":
                # Linked Data URI representing the place name
                dataset.extras["ckan:spatial-uri"] = value
                log.debug("spatial-uri value not handled: %s", value)
            elif key == "frequency":
                # Update frequency
                freq = frequency_from_rdf(value)
                if freq:
                    dataset.frequency = freq
                elif value in UPDATE_FREQUENCIES:
                    dataset.frequency = value
                else:
                    dataset.extras["ckan:frequency"] = value
                    log.debug("frequency value not handled: %s", value)
            elif key == "temporal_start":
                # Temporal coverage start
                temporal_start = daterange_start(value)
            elif key == "temporal_end":
                # Temporal coverage end
                temporal_end = daterange_end(value)
            else:
                # Unknown extras are stored as-is under their own key
                dataset.extras[key] = value

        return temporal_start, temporal_end, spatial_geom, spatial_zone

    def _apply_ckan_spatial(self, dataset, spatial_geom, spatial_zone):
        """Set `dataset.spatial` from the detected geometry and/or zone, if any."""
        if spatial_geom or spatial_zone:
            dataset.spatial = SpatialCoverage()

        if spatial_zone:
            dataset.spatial.zones = [spatial_zone]

        if spatial_geom:
            # The geometry is always stored as a MultiPolygon
            if spatial_geom["type"] == "Polygon":
                coordinates = [spatial_geom["coordinates"]]
            elif spatial_geom["type"] == "MultiPolygon":
                coordinates = spatial_geom["coordinates"]
            else:
                raise HarvestException("Unsupported spatial geometry")
            dataset.spatial.geom = {"type": "MultiPolygon", "coordinates": coordinates}

    def _apply_ckan_resources(self, dataset, resources):
        """Create or update the dataset resources from the validated CKAN resources."""
        for res in resources:
            if res["resource_type"] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, "id", UUID(res["id"]))
            except Exception:
                # Deliberate best effort: a resource with an unusable ID is
                # logged and skipped instead of failing the whole dataset
                log.error("Unable to parse resource ID %s", res["id"])
                continue
            if not resource:
                resource = Resource(id=res["id"])
                dataset.resources.append(resource)
            if not resource.harvest:
                resource.harvest = HarvestResourceMetadata()
            resource.title = res.get("name", "") or ""
            resource.description = parse_html(res.get("description"))
            resource.url = res["url"]
            resource.filetype = "remote"
            resource.format = res.get("format")
            resource.mime = res.get("mimetype")
            resource.hash = res.get("hash")
            resource.harvest.created_at = res["created"]
            resource.harvest.modified_at = res["last_modified"]
270
+
271
+
272
class DkanBackend(CkanBackend):
    """CKAN backend variant that validates payloads with the DKAN schema and declares no filters."""

    schema = dkan_schema
    filters = []
udata/harvest/backends/ckan/schemas/__init__.py: File without changes
@@ -0,0 +1,86 @@
1
+ from voluptuous import All, Any, Coerce, DefaultTo, Lower, Optional, Schema
2
+
3
+ from udata.harvest.filters import (
4
+ boolean,
5
+ email,
6
+ empty_none,
7
+ hash,
8
+ is_url,
9
+ normalize_string,
10
+ normalize_tag,
11
+ slug,
12
+ to_date,
13
+ )
14
+
15
# Values accepted for a resource `resource_type` by the validator below.
# NOTE(review): the harvester's ALLOWED_RESOURCE_TYPES also lists "metadata",
# which is not accepted here, while "documentation", "image" and
# "visualization" are accepted here but not imported by the harvester — confirm
# this is intended.
RESOURCE_TYPES = ("file", "file.upload", "api", "documentation", "image", "visualization")


# Validator for one entry of a dataset `resources` list.
resource = {
    "id": str,
    "position": int,
    "name": All(DefaultTo(""), str),
    "description": Any(All(str, normalize_string), None),
    "format": All(str, Lower),
    "mimetype": Any(All(str, Lower), None),
    "size": Any(Coerce(int), None),
    "hash": Any(All(str, hash), None),
    "created": All(str, to_date),
    "last_modified": Any(All(str, to_date), None),
    "url": All(str, is_url()),
    # A missing type falls back to "file" before being checked against RESOURCE_TYPES
    "resource_type": All(empty_none, DefaultTo("file"), str, Any(*RESOURCE_TYPES)),
}

# Validator for one entry of a dataset `tags` list (also imported by the DKAN schema).
tag = {
    "id": str,
    Optional("vocabulary_id"): Any(str, None),
    Optional("display_name"): str,
    "name": All(str, normalize_tag),
    Optional("state"): str,
}

# Validator for the dataset `organization` object.
organization = {
    "id": str,
    "description": str,
    "created": All(str, to_date),
    "title": str,
    "name": All(str, slug),
    "revision_timestamp": All(str, to_date),
    "is_organization": boolean,
    "state": str,
    "image_url": str,
    "revision_id": str,
    # Literal values: only this exact string validates
    "type": "organization",
    "approval_status": "approved",
}

# Validator for a CKAN `package_show` result.
# `required=True` makes every key mandatory unless wrapped in `Optional`;
# `extra=True` lets keys not declared here pass validation.
schema = Schema(
    {
        "id": str,
        "name": str,
        "title": str,
        "notes": Any(All(str, normalize_string), None),
        "license_id": All(DefaultTo("not-specified"), str),
        "license_title": Any(str, None),
        "tags": [tag],
        "metadata_created": All(str, to_date),
        "metadata_modified": All(str, to_date),
        "organization": Any(organization, None),
        "resources": [resource],
        Optional("revision_id"): str,
        # Defaults to an empty list when the payload has no extras
        Optional("extras", default=list): [
            {
                "key": str,
                "value": Any(str, int, float, boolean, dict, list),
            }
        ],
        "private": boolean,
        "type": "dataset",
        "author": Any(str, None),
        "author_email": All(empty_none, Any(All(str, email), None)),
        "maintainer": Any(str, None),
        "maintainer_email": All(empty_none, Any(All(str, email), None)),
        "state": Any(str, None),
    },
    required=True,
    extra=True,
)
@@ -0,0 +1,98 @@
1
+ import dateutil.parser
2
+ from humanfriendly import parse_size
3
+ from voluptuous import All, Any, DefaultTo, Lower, Optional, Schema
4
+
5
+ from udata.harvest.filters import boolean, email, empty_none, hash, is_url, normalize_string, slug
6
+
7
+ from .ckan import tag
8
+
9
+
10
class FrenchParserInfo(dateutil.parser.parserinfo):
    """dateutil parser info that recognizes French weekday names."""

    # (abbreviation, full name) pairs, Monday first.
    # Only weekdays are overridden; month names are left to the parent class.
    WEEKDAYS = [
        ("Lun", "Lundi"),
        ("Mar", "Mardi"),
        ("Mer", "Mercredi"),
        ("Jeu", "Jeudi"),
        ("Ven", "Vendredi"),
        ("Sam", "Samedi"),
        ("Dim", "Dimanche"),
    ]
20
+
21
+
22
def parse_date(value, **kwargs):
    """Parse `value` with dateutil (forwarding `kwargs`) and keep only the date part."""
    parsed = dateutil.parser.parse(value, **kwargs)
    return parsed.date()
24
+
25
+
26
def to_date(value):
    """
    Convert a DKAN date string into a date.

    French weekday names with a day-first layout are tried first; when that
    raises a `ValueError`, dateutil's default settings are used instead.
    Both attempts are `fuzzy` because the value may contain 'Date changed'.
    """
    try:
        parsed = parse_date(
            value,
            fuzzy=True,
            parserinfo=FrenchParserInfo(),
            dayfirst=True,
        )
    except ValueError:
        parsed = parse_date(value, fuzzy=True)
    return parsed
35
+
36
+
37
def dkan_parse_size(value):
    """Parse a DKAN human-readable size; return None for an empty value."""
    if not value:
        return None
    # "octets" is rewritten to "bytes" before parsing:
    # not strictly true but should be enough
    normalized = value.replace("octets", "bytes")
    return parse_size(normalized)
42
+
43
+
44
# Validator for one entry of a DKAN dataset `resources` list.
resource = {
    "id": str,
    "name": All(DefaultTo(""), str),
    "description": All(str, normalize_string),
    "format": All(str, Lower),
    "mimetype": Any(All(str, Lower), None),
    # DKAN sizes are human-readable strings, converted by dkan_parse_size
    "size": All(str, dkan_parse_size),
    Optional("hash"): Any(All(str, hash), None),
    "created": All(str, to_date),
    "last_modified": Any(All(str, to_date), None),
    "url": All(str, is_url()),
    # "dkan" is the placeholder used when the payload has no `resource_type`
    Optional("resource_type", default="dkan"): All(
        empty_none,
        str,
    ),
}

# Validator for one entry of the optional dataset `groups` list.
group = {
    "id": str,
    "description": str,
    "image_display_url": str,
    "title": str,
    "name": All(str, slug),
}

# Validator for a DKAN `package_show` result.
# `required=True` makes every key mandatory unless wrapped in `Optional`;
# `extra=True` lets keys not declared here pass validation.
schema = Schema(
    {
        "id": str,
        "name": str,
        "title": str,
        "notes": Any(All(str, normalize_string), None),
        # A missing license id defaults to None, then falls back to "not-specified"
        Optional("license_id", default=None): All(DefaultTo("not-specified"), str),
        Optional("license_title", default=None): Any(str, None),
        Optional("tags", default=list): [tag],
        "metadata_created": All(str, to_date),
        "metadata_modified": All(str, to_date),
        Optional("groups"): [Any(group, None)],
        "resources": [resource],
        Optional("extras", default=list): [
            {
                "key": str,
                "value": Any(str, int, float, boolean, dict, list),
            }
        ],
        "private": boolean,
        # Literal value: note the capital "D", unlike the CKAN schema
        "type": "Dataset",
        Optional("author"): Any(str, None),
        Optional("author_email"): All(empty_none, Any(All(str, email), None)),
        "maintainer": Any(str, None),
        "maintainer_email": All(empty_none, Any(All(str, email), None)),
        "state": Any(str, None),
    },
    required=True,
    extra=True,
)