udata 10.8.2.dev36743__py2.py3-none-any.whl → 10.8.2.dev36809__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of udata might be problematic. Click here for more details.

Files changed (35) hide show
  1. udata/harvest/backends/ckan/__init__.py +3 -0
  2. udata/harvest/backends/ckan/harvesters.py +274 -0
  3. udata/harvest/backends/ckan/models.py +10 -0
  4. udata/harvest/backends/ckan/schemas/__init__.py +0 -0
  5. udata/harvest/backends/ckan/schemas/ckan.py +86 -0
  6. udata/harvest/backends/ckan/schemas/dkan.py +98 -0
  7. udata/harvest/tests/ckan/conftest.py +67 -0
  8. udata/harvest/tests/ckan/data/dkan-french-w-license.json +226 -0
  9. udata/harvest/tests/ckan/test_ckan_backend.py +697 -0
  10. udata/harvest/tests/ckan/test_ckan_backend_errors.py +140 -0
  11. udata/harvest/tests/ckan/test_ckan_backend_filters.py +130 -0
  12. udata/harvest/tests/ckan/test_dkan_backend.py +68 -0
  13. udata/static/chunks/{10.8ca60413647062717b1e.js → 10.471164b2a9fe15614797.js} +3 -3
  14. udata/static/chunks/{10.8ca60413647062717b1e.js.map → 10.471164b2a9fe15614797.js.map} +1 -1
  15. udata/static/chunks/{11.b6f741fcc366abfad9c4.js → 11.51d706fb9521c16976bc.js} +3 -3
  16. udata/static/chunks/{11.b6f741fcc366abfad9c4.js.map → 11.51d706fb9521c16976bc.js.map} +1 -1
  17. udata/static/chunks/{13.2d06442dd9a05d9777b5.js → 13.f29411b06be1883356a3.js} +2 -2
  18. udata/static/chunks/{13.2d06442dd9a05d9777b5.js.map → 13.f29411b06be1883356a3.js.map} +1 -1
  19. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js → 17.3bd0340930d4a314ce9c.js} +2 -2
  20. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js.map → 17.3bd0340930d4a314ce9c.js.map} +1 -1
  21. udata/static/chunks/{19.f03a102365af4315f9db.js → 19.8da42e8359d72afc2618.js} +3 -3
  22. udata/static/chunks/{19.f03a102365af4315f9db.js.map → 19.8da42e8359d72afc2618.js.map} +1 -1
  23. udata/static/chunks/{8.778091d55cd8ea39af6b.js → 8.54e44b102164ae5e7a67.js} +2 -2
  24. udata/static/chunks/{8.778091d55cd8ea39af6b.js.map → 8.54e44b102164ae5e7a67.js.map} +1 -1
  25. udata/static/chunks/{9.033d7e190ca9e226a5d0.js → 9.07515e5187f475bce828.js} +3 -3
  26. udata/static/chunks/{9.033d7e190ca9e226a5d0.js.map → 9.07515e5187f475bce828.js.map} +1 -1
  27. udata/static/common.js +1 -1
  28. udata/static/common.js.map +1 -1
  29. udata/translations/udata.pot +74 -70
  30. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/METADATA +3 -1
  31. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/RECORD +35 -23
  32. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/entry_points.txt +2 -0
  33. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/LICENSE +0 -0
  34. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/WHEEL +0 -0
  35. {udata-10.8.2.dev36743.dist-info → udata-10.8.2.dev36809.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,3 @@
1
+ """
2
+ CKAN integration for udata
3
+ """
@@ -0,0 +1,274 @@
1
+ import json
2
+ import logging
3
+ from urllib.parse import urljoin
4
+ from uuid import UUID
5
+
6
+ from udata import uris
7
+ from udata.harvest.models import HarvestItem
8
+ from udata.i18n import lazy_gettext as _
9
+
10
+ try:
11
+ from udata.core.dataset.constants import UPDATE_FREQUENCIES
12
+ except ImportError:
13
+ # legacy import of constants in udata
14
+ from udata.models import UPDATE_FREQUENCIES
15
+ from udata.core.dataset.models import HarvestDatasetMetadata, HarvestResourceMetadata
16
+ from udata.core.dataset.rdf import frequency_from_rdf
17
+ from udata.frontend.markdown import parse_html
18
+ from udata.harvest.backends.base import BaseBackend, HarvestFilter
19
+ from udata.harvest.exceptions import HarvestException, HarvestSkipException
20
+ from udata.models import GeoZone, License, Resource, SpatialCoverage, db
21
+ from udata.utils import daterange_end, daterange_start, get_by
22
+
23
+ from .schemas.ckan import schema as ckan_schema
24
+ from .schemas.dkan import schema as dkan_schema
25
+
26
+ log = logging.getLogger(__name__)
27
+
28
+ # "dkan" is a placeholder resource type assigned to DKAN resources, which do not provide `resource_type`
29
+ ALLOWED_RESOURCE_TYPES = ("dkan", "file", "file.upload", "api", "metadata")
30
+
31
+
32
class CkanBackend(BaseBackend):
    """Harvest backend importing datasets from a CKAN portal through its action API."""

    display_name = "CKAN"
    filters = (
        HarvestFilter(_("Organization"), "organization", str, _("A CKAN Organization name")),
        HarvestFilter(_("Tag"), "tags", str, _("A CKAN tag name")),
    )
    schema = ckan_schema

    def get_headers(self):
        """Return the base headers plus JSON content type and the optional API key."""
        headers = super().get_headers()
        headers["content-type"] = "application/json"
        if self.config.get("apikey"):
            headers["Authorization"] = self.config["apikey"]
        return headers

    def action_url(self, endpoint):
        """Build the URL of the `api/3/action/<endpoint>` action relative to the source URL."""
        path = "/".join(["api/3/action", endpoint])
        return urljoin(self.source.url, path)

    def dataset_url(self, name):
        """Build the URL of the `dataset/<name>` page relative to the source URL."""
        path = "/".join(["dataset", name])
        return urljoin(self.source.url, path)

    def get_action(self, endpoint, fix=False, **kwargs):
        """
        Call an action endpoint and return its decoded JSON payload.

        :param endpoint: action name appended to `api/3/action`
        :param fix: when true, POST an empty JSON object instead of issuing a GET
        :param kwargs: sent as query string parameters
        :raises HarvestException: when the payload is not flagged as successful,
            or when the response is not JSON (HTML error page or raw quoted text)
        """
        url = self.action_url(endpoint)
        if fix:
            response = self.post(url, "{}", params=kwargs)
        else:
            response = self.get(url, params=kwargs)

        response.raise_for_status()
        content_type = response.headers.get("Content-Type", "")
        # Drop parameters such as `; charset=utf-8` and normalize spacing/case
        # so that e.g. `Application/JSON ; charset=utf-8` is still recognized.
        mime_type = content_type.split(";", 1)[0].strip().lower()

        if mime_type == "application/json":  # Standard API JSON response
            data = response.json()
            # The CKAN API can return 200 even on errors:
            # only the `success` property allows detecting them
            if data.get("success", False):
                return data

            error = data.get("error")
            if isinstance(error, dict):
                # Error object with message
                msg = error.get("message", "Unknown error")
                if "__type" in error:
                    # Typed error
                    msg = ": ".join((error["__type"], msg))
            elif error is None:
                # No error detail at all: avoid raising with a `None` message
                msg = "Unknown error"
            else:
                # Error only contains a message
                msg = error
            raise HarvestException(msg)

        if mime_type == "text/html":  # Standard html error page
            raise HarvestException("Unknown Error: {} returned HTML".format(url))

        # If it's not HTML, CKAN responds with raw quoted text
        raise HarvestException(response.text.strip('"'))

    def get_status(self):
        """Fetch `/api/util/status` on the source and return the decoded JSON body."""
        url = urljoin(self.source.url, "/api/util/status")
        response = self.get(url)
        return response.json()

    def inner_harvest(self):
        """
        List the remote dataset names and process each of them.

        With configured filters, names come from a single `package_search` call
        (capped at 1000 rows, no pagination); otherwise from `package_list`.
        Stops early once `has_reached_max_items()` is true.
        """
        fix = False  # Fix should be True for CKAN < '1.8'

        # `or []` also covers an explicit `None` in the configuration
        filters = self.config.get("filters") or []
        if filters:
            # Build a q search query based on filters
            # use package_search because package_list doesn't allow filtering
            # use q parameters because fq is broken with multiple filters
            params = []
            for f in filters:
                param = "{key}:{value}".format(**f)
                if f.get("type") == "exclude":
                    param = "-" + param
                params.append(param)
            q = " AND ".join(params)
            # max out rows count to 1000 as per
            # https://docs.ckan.org/en/latest/api/#ckan.logic.action.get.package_search
            response = self.get_action("package_search", fix=fix, q=q, rows=1000)
            names = [r["name"] for r in response["result"]["results"]]
        else:
            response = self.get_action("package_list", fix=fix)
            names = response["result"]

        for name in names:
            # `name` is used as `remote_id` for now; it is replaced by the real
            # `id` at the beginning of `inner_process_dataset`
            self.process_dataset(name)
            if self.has_reached_max_items():
                return

    def inner_process_dataset(self, item: HarvestItem):
        """
        Fetch one package with `package_show`, validate it and map it onto a dataset.

        :param item: harvest item whose `remote_id` identifies the package
        :return: the dataset returned by `get_dataset`, updated from the payload
        :raises HarvestSkipException: when the package has no resource
        :raises HarvestException: on an empty result list or an unsupported geometry
        """
        response = self.get_action("package_show", id=item.remote_id)

        result = response["result"]
        # DKAN returns a list where CKAN returns an object
        # we "unlist" here instead of after schema validation in order to get the id easily
        if isinstance(result, list):
            if not result:
                raise HarvestException(f"Empty package_show result for {item.remote_id}")
            result = result[0]

        # Replace the `remote_id` from `name` to `id`.
        if result.get("id"):
            item.remote_id = result["id"]

        data = self.validate(result, self.schema)

        # Skip if no resource
        if not data.get("resources"):
            raise HarvestSkipException(f"Dataset {data['name']} has no record")

        dataset = self.get_dataset(item.remote_id)

        if not dataset.harvest:
            dataset.harvest = HarvestDatasetMetadata()

        # Core attributes
        if not dataset.slug:
            dataset.slug = data["name"]
        dataset.title = data["title"]
        dataset.description = parse_html(data["notes"])

        # Detect license
        default_license = dataset.license or License.default()
        dataset.license = License.guess(
            data["license_id"], data["license_title"], default=default_license
        )

        dataset.tags = [t["name"] for t in data["tags"] if t["name"]]

        dataset.harvest.created_at = data["metadata_created"]
        dataset.harvest.modified_at = data["metadata_modified"]

        dataset.harvest.ckan_name = data["name"]

        spatial_geom, spatial_zone, temporal_start, temporal_end = self._apply_extras(
            dataset, data["extras"]
        )
        self._apply_spatial(dataset, spatial_geom, spatial_zone)

        # A temporal coverage is only stored when both bounds are known
        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        dataset.harvest.remote_url = self.dataset_url(data["name"])
        if data.get("url"):
            try:
                url = uris.validate(data["url"])
            except uris.ValidationError:
                dataset.harvest.ckan_source = data["url"]
            else:
                # use declared `url` as `remote_url` if any
                dataset.harvest.remote_url = url

        self._apply_resources(dataset, data["resources"])

        return dataset

    def _apply_extras(self, dataset, extras):
        """Map known extras onto the dataset; return (geom, zone, temporal start, temporal end)."""
        temporal_start, temporal_end = None, None
        spatial_geom, spatial_zone = None, None

        for extra in extras:
            key = extra["key"]
            value = extra["value"]
            if value is None or (isinstance(value, str) and not value.strip()):
                # Skip empty extras
                continue
            elif key == "spatial":
                # GeoJSON representation; the schema also accepts an already
                # decoded object, which must not go through `json.loads`
                spatial_geom = value if isinstance(value, dict) else json.loads(value)
            elif key == "spatial-text":
                # Textual representation of the extent / location
                qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
                if qs.count() == 1:
                    spatial_zone = qs.first()
                else:
                    dataset.extras["ckan:spatial-text"] = value
                    log.debug("spatial-text value not handled: %s", value)
            elif key == "spatial-uri":
                # Linked Data URI representing the place name
                dataset.extras["ckan:spatial-uri"] = value
                log.debug("spatial-uri value not handled: %s", value)
            elif key == "frequency":
                # Update frequency
                freq = frequency_from_rdf(value)
                if freq:
                    dataset.frequency = freq
                elif value in UPDATE_FREQUENCIES:
                    dataset.frequency = value
                else:
                    dataset.extras["ckan:frequency"] = value
                    log.debug("frequency value not handled: %s", value)
            elif key == "temporal_start":
                # Temporal coverage start
                temporal_start = daterange_start(value)
            elif key == "temporal_end":
                # Temporal coverage end
                temporal_end = daterange_end(value)
            else:
                dataset.extras[key] = value

        return spatial_geom, spatial_zone, temporal_start, temporal_end

    def _apply_spatial(self, dataset, spatial_geom, spatial_zone):
        """Create the spatial coverage from a matched zone and/or a (Multi)Polygon geometry."""
        if spatial_geom or spatial_zone:
            dataset.spatial = SpatialCoverage()

        if spatial_zone:
            dataset.spatial.zones = [spatial_zone]

        if spatial_geom:
            # Only Polygon and MultiPolygon are supported; always stored as MultiPolygon
            if spatial_geom["type"] == "Polygon":
                coordinates = [spatial_geom["coordinates"]]
            elif spatial_geom["type"] == "MultiPolygon":
                coordinates = spatial_geom["coordinates"]
            else:
                raise HarvestException("Unsupported spatial geometry")
            dataset.spatial.geom = {"type": "MultiPolygon", "coordinates": coordinates}

    def _apply_resources(self, dataset, resources):
        """Create or update the dataset resources whose type is in ALLOWED_RESOURCE_TYPES."""
        for res in resources:
            if res["resource_type"] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, "id", UUID(res["id"]))
            except Exception:
                # Best effort: a resource with an unusable ID is logged and ignored
                log.error("Unable to parse resource ID %s", res["id"])
                continue
            if not resource:
                resource = Resource(id=res["id"])
                dataset.resources.append(resource)
            if not resource.harvest:
                resource.harvest = HarvestResourceMetadata()
            resource.title = res.get("name", "") or ""
            resource.description = parse_html(res.get("description"))
            resource.url = res["url"]
            resource.filetype = "remote"
            resource.format = res.get("format")
            resource.mime = res.get("mimetype")
            resource.hash = res.get("hash")
            resource.harvest.created_at = res["created"]
            resource.harvest.modified_at = res["last_modified"]
270
+
271
+
272
class DkanBackend(CkanBackend):
    """CKAN backend variant validating payloads with the DKAN schema and exposing no filter."""

    schema = dkan_schema
    # Immutable empty sequence, consistent with the tuple used by the parent
    # class, so the class-level value cannot be mutated by accident.
    filters = ()
@@ -0,0 +1,10 @@
1
+ from udata.api import fields
2
+ from udata.core.dataset.api_fields import dataset_harvest_fields
3
+
4
+ # Register additional harvest fields to serve by api
5
# Register additional harvest fields to be served by the API
_CKAN_HARVEST_FIELDS = (
    ("ckan_name", "The ckan name property for ckan harvested dataset"),
    ("ckan_source", "The ckan source property for ckan harvested dataset"),
)

for _field_name, _field_description in _CKAN_HARVEST_FIELDS:
    dataset_harvest_fields[_field_name] = fields.String(
        description=_field_description, allow_null=True
    )
File without changes
@@ -0,0 +1,86 @@
1
+ from voluptuous import All, Any, Coerce, DefaultTo, Lower, Optional, Schema
2
+
3
+ from udata.harvest.filters import (
4
+ boolean,
5
+ email,
6
+ empty_none,
7
+ hash,
8
+ is_url,
9
+ normalize_string,
10
+ normalize_tag,
11
+ slug,
12
+ to_date,
13
+ )
14
+
15
RESOURCE_TYPES = ("file", "file.upload", "api", "documentation", "image", "visualization")


def _nullable(validator):
    """Accept either a value matching `validator` or None."""
    return Any(validator, None)


def _date_string():
    """Accept a string and convert it with `to_date`."""
    return All(str, to_date)


def _optional_email():
    """Run `empty_none` first, then accept an email string or None."""
    return All(empty_none, _nullable(All(str, email)))


# Shape of a single entry of the package `resources` list
resource = {
    "id": str,
    "position": int,
    "name": All(DefaultTo(""), str),
    "description": _nullable(All(str, normalize_string)),
    "format": All(str, Lower),
    "mimetype": _nullable(All(str, Lower)),
    "size": _nullable(Coerce(int)),
    "hash": _nullable(All(str, hash)),
    "created": _date_string(),
    "last_modified": _nullable(_date_string()),
    "url": All(str, is_url()),
    # A missing/empty type defaults to "file"; the result must be a known type
    "resource_type": All(empty_none, DefaultTo("file"), str, Any(*RESOURCE_TYPES)),
}

# Shape of a single entry of the package `tags` list
tag = {
    "id": str,
    Optional("vocabulary_id"): _nullable(str),
    Optional("display_name"): str,
    "name": All(str, normalize_tag),
    Optional("state"): str,
}

# Shape of the package `organization` object
organization = {
    "id": str,
    "description": str,
    "created": _date_string(),
    "title": str,
    "name": All(str, slug),
    "revision_timestamp": _date_string(),
    "is_organization": boolean,
    "state": str,
    "image_url": str,
    "revision_id": str,
    "type": "organization",
    "approval_status": "approved",
}

# Shape of one `extras` entry: a string key and a loosely typed value
_extra = {
    "key": str,
    "value": Any(str, int, float, boolean, dict, list),
}

_package = {
    "id": str,
    "name": str,
    "title": str,
    "notes": _nullable(All(str, normalize_string)),
    "license_id": All(DefaultTo("not-specified"), str),
    "license_title": _nullable(str),
    "tags": [tag],
    "metadata_created": _date_string(),
    "metadata_modified": _date_string(),
    "organization": _nullable(organization),
    "resources": [resource],
    Optional("revision_id"): str,
    Optional("extras", default=list): [_extra],
    "private": boolean,
    "type": "dataset",
    "author": _nullable(str),
    "author_email": _optional_email(),
    "maintainer": _nullable(str),
    "maintainer_email": _optional_email(),
    "state": _nullable(str),
}

# Every listed key is required unless wrapped in Optional; unknown keys are allowed
schema = Schema(_package, required=True, extra=True)
@@ -0,0 +1,98 @@
1
+ import dateutil.parser
2
+ from humanfriendly import parse_size
3
+ from voluptuous import All, Any, DefaultTo, Lower, Optional, Schema
4
+
5
+ from udata.harvest.filters import boolean, email, empty_none, hash, is_url, normalize_string, slug
6
+
7
+ from .ckan import tag
8
+
9
+
10
class FrenchParserInfo(dateutil.parser.parserinfo):
    """Parser info overriding only WEEKDAYS with French abbreviated and full day names."""

    # Pairs of (abbreviation, full name), Monday first
    WEEKDAYS = list(
        zip(
            ("Lun", "Mar", "Mer", "Jeu", "Ven", "Sam", "Dim"),
            ("Lundi", "Mardi", "Mercredi", "Jeudi", "Vendredi", "Samedi", "Dimanche"),
        )
    )
20
+
21
+
22
def parse_date(value, **kwargs):
    """Parse `value` with dateutil (forwarding `kwargs`) and return only its date part."""
    parsed = dateutil.parser.parse(value, **kwargs)
    return parsed.date()
24
+
25
+
26
def to_date(value):
    """
    Convert a date string into a date.

    French weekday names are tried first (day-first order); on a ValueError
    the dateutil defaults are used instead.
    `fuzzy` is used when 'Date changed' is in the value.
    """
    try:
        french_info = FrenchParserInfo()
        parsed = parse_date(value, fuzzy=True, parserinfo=french_info, dayfirst=True)
    except ValueError:
        parsed = parse_date(value, fuzzy=True)
    return parsed
35
+
36
+
37
def dkan_parse_size(value):
    """
    Convert a human readable size into the value returned by `parse_size`.

    The French unit "octets" is rewritten to "bytes" before parsing.

    :param value: the size string; may be empty or None
    :return: the result of `parse_size`, or None when `value` is empty
    """
    if not value:
        # Nothing to parse: make the "no size" case explicit
        return None
    # not strictly true but should be enough
    return parse_size(value.replace("octets", "bytes"))
42
+
43
+
44
def _nullable(validator):
    """Accept either a value matching `validator` or None."""
    return Any(validator, None)


def _date_string():
    """Accept a string and convert it with `to_date`."""
    return All(str, to_date)


def _optional_email():
    """Run `empty_none` first, then accept an email string or None."""
    return All(empty_none, _nullable(All(str, email)))


# Shape of a single entry of the package `resources` list
resource = {
    "id": str,
    "name": All(DefaultTo(""), str),
    "description": All(str, normalize_string),
    "format": All(str, Lower),
    "mimetype": _nullable(All(str, Lower)),
    "size": All(str, dkan_parse_size),
    Optional("hash"): _nullable(All(str, hash)),
    "created": _date_string(),
    "last_modified": _nullable(_date_string()),
    "url": All(str, is_url()),
    # A missing `resource_type` defaults to the "dkan" placeholder
    Optional("resource_type", default="dkan"): All(empty_none, str),
}

# Shape of a single entry of the optional package `groups` list
group = {
    "id": str,
    "description": str,
    "image_display_url": str,
    "title": str,
    "name": All(str, slug),
}

# Shape of one `extras` entry: a string key and a loosely typed value
_extra = {
    "key": str,
    "value": Any(str, int, float, boolean, dict, list),
}

_package = {
    "id": str,
    "name": str,
    "title": str,
    "notes": _nullable(All(str, normalize_string)),
    Optional("license_id", default=None): All(DefaultTo("not-specified"), str),
    Optional("license_title", default=None): _nullable(str),
    Optional("tags", default=list): [tag],
    "metadata_created": _date_string(),
    "metadata_modified": _date_string(),
    Optional("groups"): [_nullable(group)],
    "resources": [resource],
    Optional("extras", default=list): [_extra],
    "private": boolean,
    "type": "Dataset",
    Optional("author"): _nullable(str),
    Optional("author_email"): _optional_email(),
    "maintainer": _nullable(str),
    "maintainer_email": _optional_email(),
    "state": _nullable(str),
}

# Every listed key is required unless wrapped in Optional; unknown keys are allowed
schema = Schema(_package, required=True, extra=True)
@@ -0,0 +1,67 @@
1
+ import json
2
+ from urllib.parse import urljoin
3
+
4
+ import pytest
5
+ import requests
6
+ from faker.providers import BaseProvider
7
+
8
+ from udata.utils import faker, faker_provider
9
+
10
+ CKAN_URL = "http://localhost:5000"
11
+
12
+
13
def pytest_configure(config):
    """Declare the `ckan_data` marker by adding a line to the `markers` ini value."""
    marker_line = (
        "ckan_data(fixture): specify the data fixture they rely on. "
        "This allows `data`, `result` and `kwargs` fixtures to be populated "
        "with the associated data harvest data."
    )
    config.addinivalue_line("markers", marker_line)
18
+
19
+
20
class CkanError(ValueError):
    """Error raised by the test CKAN client on a non-2xx response."""
22
+
23
+
24
class CkanClient:
    """Small HTTP client for the CKAN action API located at `CKAN_URL`, used by the tests."""

    BASE_URL = CKAN_URL
    API_URL = f"{BASE_URL}/api/3/action/"
    PACKAGE_LIST_URL = f"{API_URL}package_list"
    PACKAGE_SEARCH_URL = f"{API_URL}package_search"
    PACKAGE_SHOW_URL = f"{API_URL}package_show"

    @property
    def headers(self):
        """Headers sent with every request: JSON content type and a dummy API key."""
        return dict(
            [
                ("Content-Type", "application/json"),
                ("Authorization", "dummy_apikey"),
            ]
        )

    def get(self, url, **kwargs):
        """Issue a GET request with the client headers."""
        return requests.get(url, headers=self.headers, **kwargs)

    def post(self, url, data, **kwargs):
        """Issue a POST request with `data` serialized as JSON and the client headers."""
        payload = json.dumps(data)
        return requests.post(url, data=payload, headers=self.headers, **kwargs)

    def action_url(self, endpoint):
        """Build the URL of the `api/3/action/<endpoint>` action relative to BASE_URL."""
        return urljoin(self.BASE_URL, "/".join(["api/3/action", endpoint]))

    def action(self, endpoint, data=None, **kwargs):
        """
        Call an action and return the decoded JSON response.

        A truthy `data` triggers a POST, otherwise a GET is issued; `kwargs`
        are sent as query string parameters.
        Raises CkanError when the status code is outside the 2xx range.
        """
        url = self.action_url(endpoint)
        if not data:
            response = self.get(url, params=kwargs)
        else:
            response = self.post(url, data, params=kwargs)
        status = response.status_code
        if status < 200 or status >= 300:
            raise CkanError(response.text.strip('"'))
        return response.json()
57
+
58
+
59
@pytest.fixture(scope="session")
def ckan():
    """Session-scoped fixture providing a `CkanClient` instance."""
    client = CkanClient()
    return client
62
+
63
+
64
@faker_provider
class UdataCkanProvider(BaseProvider):
    """Faker provider registered through `faker_provider`, adding a `unique_url` generator."""

    def unique_url(self):
        """Return a fake URI with a `_` query parameter set to a unique string."""
        base_uri = faker.uri()
        token = faker.unique_string()
        return f"{base_uri}?_={token}"