ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,430 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Functions to clean data before upload.
5
+ """
6
+ from typing import Union, List, Any, Type
7
+ from collections import OrderedDict
8
+ import copy
9
+ import math
10
+ import numbers
11
+ from warnings import warn
12
+ import datetime
13
+ import json
14
+ import re
15
+ import base64
16
+
17
+ import pandas as pd
18
+ try:
19
+ import bson
20
+ except ImportError:
21
+ bson = None
22
+
23
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanField
24
+ from ckanapi_harvesters.auxiliary.ckan_defs import ckan_timestamp_sep
25
+ from ckanapi_harvesters.auxiliary.ckan_errors import IntegrityError
26
+ from ckanapi_harvesters.auxiliary.list_records import ListRecords, records_to_df
27
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import assert_or_raise
28
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import datastore_id_col
29
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_errors import CleanError, CleanerRequirementError
30
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
31
+
32
# DataStore column types that may legitimately carry non-finite floats (NaN/inf);
# for any other column type, non-finite values are flagged or replaced below.
non_finite_authorized_types = {"numeric", "float4", "float8", "float2"}
# Alias: the same set doubles as "real number" column types (used to decide
# whether a fractional value is acceptable for the column).
real_number_types = non_finite_authorized_types
# see also: ckan_api_2_readonly ckan_dtype_mapper
# Mapping from pandas dtype names to CKAN DataStore column types.
dtype_ckan_mapper = {
    "float64": "numeric",
    "int64": "numeric",
    "datetime64[ns]": "timestamp",
}
40
+
41
+
42
+ def _pd_series_type_detect(values: pd.Series, test_type:Type):
43
+ """
44
+ This function checks that the test_type matches all rows which are not NaN/None/NA in a pandas Series.
45
+ """
46
+ return values.map(lambda x: isinstance(x, test_type)).where(values.notna(), True).all()
47
+
48
+
49
class CkanDataCleanerUploadBasic(CkanDataCleanerABC):
    """
    Data cleaner for basic data types.

    Responsibilities visible in this class:
      * detect CKAN field types from pandas dtypes / Python values
        (``create_new_field``, ``detect_field_types_and_subs``);
      * sanitize individual values before upload — non-finite numbers,
        datetimes, nested json structures (``clean_value_field`` and the
        ``_clean_subvalue*`` helpers);
      * apply field-name substitutions and report warnings / suggested field
        changes (``clean_records``, ``_clean_final_steps``).

    NOTE(review): state such as ``warnings``, ``field_subs``, ``fields_new``,
    ``param_cast_types`` etc. is inherited from ``CkanDataCleanerABC`` — not
    visible here; the comments below assume the obvious semantics (verify in
    data_cleaner_abc.py).
    """
    def __init__(self):
        super().__init__()
        # options
        self.param_json_as_text:bool = False # option to convert json fields (dicts and lists) to str
        self.param_replace_nan:bool = True # option to replace non-authorized nan values by None
        self.param_round_values:bool = True # option to round values when treating an integer field
        self.param_rename_fields_underscore:bool = True # option to rename fields beginning with an underscore (in the subs step)

    def copy(self, dest=None) -> "CkanDataCleanerUploadBasic":
        """Return a copy of this cleaner (options included); ``dest`` allows subclasses to reuse the logic."""
        if dest is None:
            dest = CkanDataCleanerUploadBasic()
        super().copy(dest=dest)
        dest.param_json_as_text = self.param_json_as_text
        dest.param_replace_nan = self.param_replace_nan
        dest.param_round_values = self.param_round_values
        dest.param_rename_fields_underscore = self.param_rename_fields_underscore
        dest.param_field_subs = self.param_field_subs.copy()
        return dest

    ## field type detection
    def create_new_field(self, field_name:str, values: Union[Any, pd.Series]) -> CkanField:
        """Build a CkanField for *field_name*, inferring its type from *values*.

        A previously detected field of the same name is returned unchanged.
        For an ``object``-dtyped Series, per-row isinstance checks decide
        between text/bool/json/timestamp; other pandas dtypes go through
        ``dtype_ckan_mapper``.  Non-Series values fall back to ``str(type(...))``.
        """
        if field_name in self.fields_new.keys():
            return self.fields_new[field_name]
        else:
            # detect type
            if isinstance(values, pd.Series):
                dtype = str(values.dtype)
                if dtype == "object":
                    # give subclasses a chance to claim the column first
                    field_info = self._detect_standard_field_bypass(field_name, values)
                    if field_info is not None:
                        return field_info
                    elif _pd_series_type_detect(values, str):
                        return CkanField(field_name, "text")
                    elif _pd_series_type_detect(values, bool):
                        return CkanField(field_name, "bool")
                    elif (_pd_series_type_detect(values, dict)
                          or _pd_series_type_detect(values, list)):
                        # dicts/lists become json columns unless the user asked for text
                        if self.param_json_as_text:
                            return CkanField(field_name, "text")
                        else:
                            return CkanField(field_name, "json")
                    elif (_pd_series_type_detect(values, datetime.datetime)
                          or _pd_series_type_detect(values, pd.Timestamp)):
                        return CkanField(field_name, "timestamp")
                    else:
                        return self._detect_non_standard_field(field_name, values)
                elif dtype in dtype_ckan_mapper.keys():
                    return CkanField(field_name, dtype_ckan_mapper[dtype])
                else:
                    # unknown pandas dtype: pass the dtype name through as-is
                    return CkanField(field_name, dtype)
            else:
                return CkanField(field_name, str(type(values)))

    def _initial_field_subs(self, fields: OrderedDict[str, CkanField]) -> OrderedDict[str, CkanField]:
        """Populate ``self.field_subs`` with renames for the given fields.

        User-supplied substitutions (``param_field_subs``) win; otherwise fields
        beginning with '_' (except the datastore id column) are renamed to start
        at their first ASCII letter.  Raises NameError when a field name contains
        no letter at all.
        """
        # rename fields beginning with '_'
        for field_name, value in fields.items():
            if field_name not in self.field_subs.keys():
                if field_name in self.param_field_subs.keys():
                    self.field_subs[field_name] = self.param_field_subs[field_name]
                elif self.param_rename_fields_underscore and field_name.startswith("_") and not field_name == datastore_id_col:
                    index = re.search(r"[a-zA-Z]", field_name)
                    if index is not None:
                        self.field_subs[field_name] = field_name[index.start():]
                    else:
                        raise NameError(f"Field {field_name} is invalid")
        return fields

    def detect_field_types_and_subs(self, records: Union[List[dict], pd.DataFrame],
                                    known_fields:OrderedDict[str, CkanField]=None) -> OrderedDict[str, CkanField]:
        """Detect field types for *records* and register name substitutions.

        Known fields are carried over verbatim; new columns are typed through
        ``create_new_field`` and recorded in ``self.fields_new``.  Resets the
        per-dataframe outputs first.
        """
        self.clear_outputs_new_dataframe()
        fields = OrderedDict()
        if known_fields is not None:
            for field_name, value in known_fields.items():
                fields[field_name] = value
        if isinstance(records, list):
            df = records_to_df(records)
        else:
            df = records
        for column in df.columns:
            # apply an already-registered rename before looking the column up
            if column in self.field_subs.keys():
                column_new = self.field_subs[column]
            else:
                column_new = column
            if known_fields is None or column_new not in known_fields.keys():
                fields[column_new] = self.create_new_field(column_new, df[column])
                self.fields_new[column_new] = fields[column_new]
        fields = self._initial_field_subs(fields)
        return fields

    ## Data cleaning
    def _clean_subvalues_recursive(self, subvalue:Any, field:CkanField, path:str, level:int,
                                   *, field_data_type:str) -> Any:
        """Recursively clean a nested json value in place.

        *path* tracks the position inside the structure (dotted keys, [i]
        indices) so substituted sub-paths can be promoted to columns.
        Dict keys must be strings (TypeError otherwise).
        """
        if isinstance(subvalue, dict):
            for key, element in subvalue.items():
                if not isinstance(key, str):
                    raise TypeError(f"Key {key} is of invalid type")
                subvalue[key] = self._clean_subvalues_recursive(element, field, path + "." + str(key), level + 1,
                                                               field_data_type=field_data_type)
            return subvalue
        elif isinstance(subvalue, list):
            for i, element in enumerate(subvalue):
                subvalue[i] = self._clean_subvalues_recursive(element, field, path + "[" + str(i) + "]", level + 1,
                                                             field_data_type=field_data_type)
            return subvalue
        else:
            return self._clean_subvalue(subvalue, field, path, level, field_data_type=field_data_type)

    def _clean_subvalue(self, subvalue: Any, field: CkanField, path: str, level: int,
                        *, field_data_type: str) -> Any:
        """Clean one scalar inside a nested structure.

        Mirrors the scalar handling of ``clean_value_field``: NaN → None (or
        warning), inf → warning (optionally None), datetime → isoformat; other
        types are delegated to ``_replace_non_standard_subvalue``.  Also records
        the value for any sub-path registered in ``field_subs_path``.
        """
        field_name = field.name if field is not None else None
        new_subvalue, bypass = self._replace_standard_subvalue_bypass(subvalue, field, path, level, field_data_type=field_data_type)
        if bypass:
            pass # return new_subvalue
        else:
            new_subvalue = subvalue
            if isinstance(subvalue, numbers.Number):
                if not math.isfinite(subvalue):
                    if math.isnan(subvalue):
                        if self.param_replace_nan:
                            new_subvalue = None # replace nans with None when not authorized
                        else:
                            self.warnings[field_name].add("nan")
                    else:
                        self.warnings[field_name].add("inf") # infinite values are not authorized and no replacement can be made
                        if self.param_replace_forbidden:
                            new_subvalue = None
            elif isinstance(subvalue, datetime.datetime):
                if self.param_cast_types:
                    new_subvalue = subvalue.isoformat(sep=ckan_timestamp_sep)
            else:
                new_subvalue = self._replace_non_standard_subvalue(subvalue, field, path, level, field_data_type=field_data_type)
        if path in self.field_subs_path.keys():
            # remember this value so clean_records can copy it into the promoted column
            self._new_columns_in_row[path] = new_subvalue
        return new_subvalue

    def clean_value_field(self, value: Any, field:CkanField) -> Any:
        """Clean a single top-level value for *field* and return the cleaned value.

        Side effects: registers the field in ``fields_encountered``, collects
        anomaly tags in ``self.warnings`` ("nan", "inf", "float"), and records
        suggested type changes in ``self.field_changes``.
        """
        field_name = field.name if field is not None else None
        field_data_type = field.data_type if field is not None else None
        field_data_type = field_data_type.lower() if field_data_type is not None else None
        if field_name not in self.warnings:
            self.warnings[field_name] = set()
            self.fields_encountered[field_name] = None
        new_value, bypass = self._replace_standard_value_bypass(value, field, field_data_type=field_data_type)
        if bypass:
            pass # return new_value
        else:
            new_value = value
            if isinstance(value, dict) or isinstance(value, list):
                if field_data_type == "text" and self.param_cast_types:
                    return json.dumps(value, default=str)
                elif field_data_type == "bson":
                    if bson is None:
                        raise CleanerRequirementError("bson", "bson")
                    return base64.b64encode(bson.BSON.encode(value)) # TODO: confirm need to encode in base64
                else:
                    return self._clean_subvalues_recursive(subvalue=value, field=field, path=field_name, level=0,
                                                          field_data_type=field_data_type)
            elif isinstance(value, numbers.Number):
                # NOTE: bool is a numbers.Number subclass, so bools are handled
                # inside this branch (see the elif below).
                if (not math.isfinite(value)) and field_data_type not in non_finite_authorized_types:
                    if math.isnan(value):
                        if self.param_replace_nan:
                            return None # replace nans with None when not authorized
                        else:
                            self.warnings[field_name].add("nan")
                    else:
                        self.warnings[field_name].add("inf") # infinite values are not authorized and no replacement can be made
                        if self.param_replace_forbidden:
                            return None
                elif isinstance(value, bool):
                    if field_data_type == "text":
                        if self.param_cast_types:
                            return str(value)
                    elif field_data_type == "numeric":
                        if self.param_cast_types:
                            return int(value)
                    elif not field_data_type == "bool":
                        self.field_changes[field_name] = CkanField(field_name, "bool")
                elif field_data_type not in real_number_types and not round(value) == value:
                    # fractional value in a non-real-number column
                    if self.param_round_values:
                        return round(value)
                    else:
                        self.warnings[field_name].add("float")
            elif isinstance(value, datetime.datetime):
                if field_data_type == "timestamp":
                    if self.param_cast_types:
                        return value.isoformat(sep=ckan_timestamp_sep)
                elif not field_data_type == "timestamp":
                    self.field_changes[field_name] = CkanField(field_name, "timestamp")
            else:
                new_value = self._replace_non_standard_value(value, field, field_data_type=field_data_type)
        return new_value

    def clean_records(self, records: Union[List[dict], pd.DataFrame],
                      known_fields:Union[OrderedDict[str, CkanField], OrderedDict[str,dict], List[Union[dict,CkanField]], None],
                      *, inplace:bool=False) -> Union[List[dict], pd.DataFrame]:
        """Clean a batch of records (list of dicts or DataFrame) for upload.

        *known_fields* may be a list or mapping of CkanField/dicts; it is
        normalized to an OrderedDict of CkanField first.  Unless ``inplace``,
        records are deep-copied.  Returns the records after
        ``_clean_final_steps`` (which may wrap a list in ListRecords).
        """
        self.clear_outputs_new_dataframe()
        if known_fields is not None and isinstance(known_fields, list):
            # normalize list form into an OrderedDict keyed by field name
            fields_list = known_fields
            known_fields = OrderedDict()
            for field_info in fields_list:
                if isinstance(field_info, dict):
                    field_dict = field_info
                    field_info = CkanField.from_ckan_dict(field_dict)
                known_fields[field_info.name] = field_info
        elif known_fields is not None and isinstance(known_fields, dict):
            # convert plain-dict field descriptions into CkanField objects
            for field_name, field_info in known_fields.items():
                if isinstance(field_info, dict):
                    field_dict = field_info
                    field_info = CkanField.from_ckan_dict(field_dict)
                    if field_info.name is None:
                        field_info.name = field_name
                    else:
                        assert_or_raise(field_info.name == field_name, IntegrityError(f"Field name {field_info.name} neq {field_name}"))
                    known_fields[field_info.name] = field_info
        fields = self.detect_field_types_and_subs(records, known_fields=known_fields)
        if not inplace:
            records = copy.deepcopy(records)
        if not self.param_enable:
            return records
        # iterate on records
        mode_df = isinstance(records, pd.DataFrame)
        if mode_df:
            # pre-create the promoted columns for substituted sub-paths
            for new_field in self.field_subs_path.values():
                records[new_field] = None
            for column in records.columns:
                field = fields[column]
                # records[column] = records[column].apply(self.clean_value_field, field=field)
                # NOTE(review): .loc[index, ...] with enumerate assumes a default
                # RangeIndex on the DataFrame — verify for callers with custom indexes.
                for index, value in enumerate(records[column]):
                    self._new_columns_in_row = {}
                    records.loc[index, column] = self.clean_value_field(value, field=field)
                    for path, new_value in self._new_columns_in_row.items():
                        if path in self.field_subs_path.keys():
                            new_field = self.field_subs_path[path]
                            records.loc[index, new_field] = new_value
        else:
            for row in records:
                self._new_columns_in_row = {}
                for key, value in row.items():
                    field = fields[key]
                    row[key] = self.clean_value_field(value, field=field)
                for path, new_value in self._new_columns_in_row.items():
                    if path in self.field_subs_path.keys():
                        new_field = self.field_subs_path[path]
                        assert(new_field not in row.keys())
                        row[new_field] = new_value
                if self.param_apply_field_subs:
                    # list mode applies renames per-row; DataFrame mode renames in _clean_final_steps
                    for field_name, substitution in self.field_subs.items():
                        if field_name in row.keys():
                            assert_or_raise(substitution not in row.keys(), KeyError(substitution))
                            row[substitution] = row.pop(field_name)
        return self._clean_final_steps(records, fields, known_fields)

    def _clean_final_steps(self, records: Union[List[dict], pd.DataFrame], fields:Union[OrderedDict[str, CkanField], None],
                           known_fields:Union[OrderedDict[str, CkanField], None]) -> Union[List[dict], pd.DataFrame]:
        """Finish a cleaning pass: report anomalies, apply renames, rewrite bookkeeping.

        Raises CleanError/KeyError according to ``param_raise_error`` /
        ``param_apply_field_subs``; otherwise emits warnings when verbose.
        Renames keys of ``fields_encountered``, ``fields_new``, ``field_changes``
        and the suggested index / primary key to their substituted names.
        """
        # apply final modifications
        mode_df = isinstance(records, pd.DataFrame)
        self.warnings = {key: value for key, value in self.warnings.items() if len(value) > 0}
        if len(self.warnings) > 0:
            msg = "Some fields had anomalies: " + str(self.warnings)
            if self.param_raise_error:
                raise CleanError(msg)
            elif self.param_verbose:
                warn(msg)
        if len(self.field_subs) > 0:
            # a substitution target clashing with an existing field is fatal when renames are applied
            for field_name, substitution in self.field_subs.items():
                if substitution in self.fields_encountered.keys():
                    msg = f"Substitution cannot be done for field '{field_name}' because '{substitution}' already exists"
                    if self.param_raise_error or self.param_apply_field_subs:
                        raise KeyError(msg)
                    elif self.param_verbose:
                        warn(msg)
        if self.param_apply_field_subs:
            if mode_df:
                if len(self.field_subs) > 0:
                    records.rename(columns=self.field_subs, inplace=True)
                    # for field_name, substitution in self.field_subs.items():
                    #     records[substitution] = records.pop(field)
            else:
                pass # already done above
                # for row in records:
                #     for field_name, substitution in self.field_subs.items():
                #         if field_name in row.keys():
                #             row[substitution] = row.pop(field_name)
        # rebuild fields_encountered with substituted names
        new_encountered_fields = self.fields_encountered
        self.fields_encountered = OrderedDict()
        for field_name in new_encountered_fields.keys():
            if field_name in self.field_subs.keys():
                substitution = self.field_subs[field_name]
                self.fields_encountered[substitution] = None
            else:
                self.fields_encountered[field_name] = None
        # rebuild fields_new with substituted names, dropping already-known fields
        new_fields_copy = self.fields_new
        self.fields_new = OrderedDict()
        for field_name, field_info in new_fields_copy.items():
            if field_name in self.field_subs.keys():
                substitution = self.field_subs[field_name]
                if known_fields is None or substitution not in known_fields.keys():
                    self.fields_new[substitution] = field_info
                    self.fields_new[substitution].name = substitution
            elif known_fields is None or field_name not in known_fields.keys():
                self.fields_new[field_name] = field_info
            else:
                pass # field already known
        # propagate renames into field_changes and the suggested index / primary key
        for field_name, substitution in self.field_subs.items():
            if field_name in self.field_changes.keys():
                self.field_changes[substitution] = self.field_changes.pop(field_name)
                self.field_changes[substitution].name = substitution
            if field_name in self.field_suggested_index:
                self.field_suggested_index.remove(field_name)
                self.field_suggested_index.add(substitution)
            if self.field_suggested_primary_key is not None and field_name in self.field_suggested_primary_key:
                index = self.field_suggested_primary_key.index(field_name)
                self.field_suggested_primary_key[index] = substitution
        if not mode_df:
            # add columns attribute to List[dict]
            if not(isinstance(records, ListRecords)):
                records = ListRecords(records) # this is not compatible with the inplace=True argument
            records.columns = list(self.fields_encountered.keys())
        if len(self.field_changes) > 0:
            if self.param_verbose:
                # NOTE(review): ", ".join over a dict joins only its keys (field
                # names); the data types in the values are never printed — verify intent.
                msg = "Recommended field changes: " + ", ".join({field.name: field.data_type for field in self.field_changes.values()})
                print(msg)
        if self.field_suggested_primary_key is not None:
            if not all([field_name in self.fields_encountered.keys() for field_name in self.field_suggested_primary_key]):
                self.field_suggested_primary_key = None # cancel suggestion
        if self.field_suggested_primary_key is not None and self.field_suggested_index is not None:
            self.field_suggested_index = self.field_suggested_index - set(self.field_suggested_primary_key)
        if len(self.fields_new) > 0 and self.param_verbose:
            msg = ("The following new fields were detected: "
                   + str({field.name: field.data_type for field in self.fields_new.values()}))
            warn(msg)
        # user must call apply_new_fields_request in order to transmit new fields to CKAN
        self._extra_checks(records, fields)
        return records
387
+
388
+
389
+
390
def default_cleaner() -> CkanDataCleanerABC:
    """Build the data cleaner used by default for uploads (the basic cleaner)."""
    cleaner = CkanDataCleanerUploadBasic()
    return cleaner
392
+
393
+
394
if __name__ == "__main__":
    # Ad-hoc smoke test: run the cleaner over records mixing valid values,
    # None/NaN placeholders, fractional ints, infinities and an extra field,
    # once as a DataFrame and once as a list of dicts.
    NaN = math.nan
    date_example = datetime.datetime.today()
    timestamp_example = date_example.isoformat(ckan_timestamp_sep)

    A = {"text": "A", "int": 1, "number": 2, "json": {"key": "field"}, "timestamp": timestamp_example, "test": True}
    B = {"text": "B", "int": 1.5, "number": 2.5, "json": {"key": [1, 2, "A"]}, "timestamp": None, "test": None}
    C = {"text": None, "int": None, "number": None, "json": {"key": [1, 2, None]}, "timestamp": pd.NaT}
    D = {"text": 1, "int": NaN, "number": NaN, "json": {"key": [1, 2, NaN]}}
    E = {"text": "E", "int": 2, "number": 5.5, "json": None}
    F = {"text": NaN, "int": None, "number": None, "json": NaN}
    G = {"text": "G", "int": math.inf}
    H = {"text": "H", "extra_field": 2}

    records = [A, B, C, D, E, F, G, H]
    df = records_to_df(records)

    # declared schema; "extra_field" is intentionally left out so it is detected as new
    fields_list = [
        CkanField("text", "text"),
        CkanField("int", "int"),
        CkanField("number", "numeric"),
        CkanField("json", "json"),
        CkanField("timestamp", "timestamp"),
    ]
    fields = OrderedDict([(field_info.name, field_info) for field_info in fields_list])

    cleaner = CkanDataCleanerUploadBasic()
    auto_fields = cleaner.detect_field_types_and_subs(records, known_fields=None)
    df_cleaned = cleaner.clean_records(df, fields)
    df_warnings = cleaner.warnings
    fields_new = cleaner.fields_new
    df_records = df_cleaned.to_dict(orient="records")
    records_cleaned = cleaner.clean_records(records, fields)
    records_warnings = cleaner.warnings

    print("Done.")
430
+
@@ -0,0 +1,98 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Adding support for geometries
5
+ """
6
+ from typing import Any, Tuple, Union
7
+ from types import SimpleNamespace
8
+ import json
9
+ import re
10
+
11
+ try:
12
+ import shapely
13
+ except ImportError:
14
+ shapely = SimpleNamespace(Geometry=None)
15
+
16
+ try:
17
+ import pyproj
18
+ except ImportError:
19
+ pyproj = None
20
+
21
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanField
22
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import assert_or_raise
23
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_upload_1_basic import CkanDataCleanerUploadBasic
24
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_errors import UnexpectedGeometryError, FormatError, CleanerRequirementError
25
+
26
# mapping from Postgre geometric types to GeoJSON equivalents
# This does not enable the use of MultiPoint, MultiLine and MultiPolygon
# (keys are the lower-cased CKAN/PostgreSQL column type names checked in
# CkanDataCleanerUploadGeom._replace_standard_value_bypass)
postgre_geojson_mapping = {
    "point": "Point",
    "path": "LineString",
    "polygon": "Polygon",
}
33
+
34
def shapely_geometry_from_value(value:Any) -> Union[shapely.Geometry,None]:
    """Coerce *value* into a shapely geometry (or None).

    Accepted inputs:
      * None / empty string → None;
      * an existing shapely geometry → returned unchanged;
      * a GeoJSON string (starts with '{', '[' or '(');
      * a WKT string such as "POINT (1 2)" or "POINT(1 2)";
      * a WKB hex dump (upper- or lower-case hex);
      * a GeoJSON-like mapping (dict).

    Raises FormatError for anything else and CleanerRequirementError when
    shapely is not installed.
    """
    if shapely.Geometry is None:
        raise CleanerRequirementError("shapely", "geometry")
    if value is None:
        return None
    elif isinstance(value, shapely.Geometry):
        return value
    elif isinstance(value, str):
        if len(value) == 0:
            return None
        elif value[0] in {'{', '[', '('}:
            return shapely.from_geojson(value)
        elif re.match(r"[a-zA-Z]+\s*\(.+\)", value):
            # WKT: \s* accepts the canonical space between the type tag and the
            # coordinate list ("POINT (1 2)"), which the previous pattern rejected;
            # raw string avoids the invalid "\(" escape warning.
            return shapely.from_wkt(value)
        elif re.fullmatch(r"[0-9A-Fa-f]+", value):
            # WKB hex dump: require the WHOLE string to be hex (fullmatch) so
            # arbitrary text starting with a hex character is not misrouted to
            # from_wkb, and accept lower-case hex digits as well.
            return shapely.from_wkb(value)
        else:
            raise FormatError(value, "geometry")
    elif isinstance(value, dict):
        return shapely.geometry.shape(value)
    else:
        raise FormatError(value, "geometry")
56
+
57
+
58
class CkanDataCleanerUploadGeom(CkanDataCleanerUploadBasic):
    """Data cleaner extending the basic cleaner with geometry column support.

    PostGIS "geometry" columns are serialized to WKB hex (with an optional
    EPSG reprojection via pyproj); native PostgreSQL point/path/polygon
    columns are serialized to their textual tuple representations.
    """
    def __init__(self):
        super().__init__()

    def _replace_standard_value_bypass(self, value: Any, field: CkanField, *, field_data_type: str) -> Tuple[Any, bool]:
        """Intercept geometry-typed fields before the basic cleaning.

        Returns (new_value, True) when this class handled the value, otherwise
        defers to the parent implementation.
        NOTE(review): relies on field.internal_attrs (geometry_type,
        epsg_source/epsg_target, geometry_as_source) declared elsewhere —
        semantics assumed from the names, verify in ckan_model.
        """
        if field_data_type == "geometry" or field_data_type.startswith("geometry("): # and field.internal_attrs.geometry_as_source:
            value_shape = shapely_geometry_from_value(value)
            geojson_type = field.internal_attrs.geometry_type
            if geojson_type is not None:
                # enforce the declared geometry type (case-insensitive)
                assert_or_raise(value_shape.geom_type.casefold() == geojson_type.casefold(), UnexpectedGeometryError(value_shape.geom_type, geojson_type))
            if field.internal_attrs.epsg_source is not None and field.internal_attrs.epsg_target is not None:
                if not field.internal_attrs.epsg_source == field.internal_attrs.epsg_target:
                    # reproject coordinates from the source CRS to the target CRS
                    if pyproj is None:
                        raise CleanerRequirementError("pyproj", "geometry projection")
                    crs_source = pyproj.CRS.from_epsg(field.internal_attrs.epsg_source)
                    crs_target = pyproj.CRS.from_epsg(field.internal_attrs.epsg_target)
                    transformer = pyproj.Transformer.from_crs(crs_source, crs_target, always_xy=True)
                    value_shape = shapely.transform(value_shape, transformer.transform, interleaved=False)
            return shapely.to_wkb(value_shape, hex=True), True
        elif field_data_type in postgre_geojson_mapping.keys():
            if field.internal_attrs.geometry_as_source:
                value_shape = shapely_geometry_from_value(value)
                geojson_type = postgre_geojson_mapping[field_data_type]
                assert_or_raise(value_shape.geom_type == geojson_type, UnexpectedGeometryError(value_shape.geom_type, geojson_type))
                coordinates = shapely.get_coordinates(value_shape)
                # NOTE(review): the rows of `coordinates` are numpy arrays, so
                # str(tuple(...)) may render "array([x, y])" rather than "(x, y)"
                # for the point case — verify the emitted representation.
                if field_data_type == "point":
                    # representation: (x,y)
                    return str(tuple(coordinates)), True
                elif field_data_type == "path":
                    # representation: [(x1,y1),...]
                    return str([tuple(point) for point in coordinates]), True
                elif field_data_type == "polygon":
                    # representation: ((x1,y1),...)
                    return str(tuple([tuple(point) for point in coordinates])), True
                else:
                    raise NotImplementedError()
            else:
                # value is already in the target textual representation
                return str(value), True
        else:
            return super()._replace_standard_value_bypass(value, field, field_data_type=field_data_type)
98
+
@@ -0,0 +1,10 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Classes to read specific file formats to load DataStore DataFrame/records from a system file
5
+ """
6
+
7
+ from . import file_format_abc
8
+ from . import csv_format
9
+ from . import shp_format
10
+ from . import file_format_init
@@ -0,0 +1,43 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ The basic file format for DataStore: CSV
5
+ """
6
+ from typing import Union, Dict
7
+ import io
8
+
9
+ import pandas as pd
10
+
11
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanField
12
+ from ckanapi_harvesters.auxiliary.list_records import ListRecords
13
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import df_download_to_csv_kwargs
14
+ from ckanapi_harvesters.harvesters.file_formats.file_format_abc import FileFormatABC
15
+
16
+
17
# Default pandas.read_csv options for uploads: keep every cell as a raw string
# and do not turn empty cells into NaN.
csv_file_upload_read_csv_kwargs = dict(dtype=str, keep_default_na=False)
18
+
19
+
20
class CsvFileFormat(FileFormatABC):
    """CSV implementation of FileFormatABC, backed by pandas read_csv/to_csv.

    Parameters:
        read_csv_kwargs: extra keyword arguments for ``pd.read_csv`` (defaults
            to ``csv_file_upload_read_csv_kwargs``).
        to_csv_kwargs: extra keyword arguments for ``DataFrame.to_csv``
            (defaults to ``df_download_to_csv_kwargs``).
    """
    def __init__(self, read_csv_kwargs: dict=None, to_csv_kwargs: dict=None) -> None:
        if read_csv_kwargs is None: read_csv_kwargs = csv_file_upload_read_csv_kwargs
        if to_csv_kwargs is None: to_csv_kwargs = df_download_to_csv_kwargs
        # Copy the kwargs dicts: the previous code stored the module-level
        # default dicts by reference, so mutating one instance's options
        # silently mutated the shared defaults and every other instance
        # (including copies made via copy()).
        self.read_csv_kwargs:dict = dict(read_csv_kwargs)
        self.to_csv_kwargs:dict = dict(to_csv_kwargs)

    def read_file(self, file_path: str, fields: Union[Dict[str, CkanField],None]) -> Union[pd.DataFrame, ListRecords]:
        """Read a CSV file from disk into a DataFrame (``fields`` is unused here)."""
        return pd.read_csv(file_path, **self.read_csv_kwargs)

    def read_buffer(self, buffer: io.StringIO, fields: Union[Dict[str, CkanField],None]) -> Union[pd.DataFrame, ListRecords]:
        """Read CSV content from an in-memory text buffer into a DataFrame."""
        return pd.read_csv(buffer, **self.read_csv_kwargs)

    def write_file(self, df: pd.DataFrame, file_path: str, fields: Union[Dict[str, CkanField],None]) -> None:
        """Write *df* to a CSV file on disk, without the index column."""
        df.to_csv(file_path, index=False, **self.to_csv_kwargs)

    def write_in_memory(self, df: pd.DataFrame, fields: Union[Dict[str, CkanField],None]) -> bytes:
        """Serialize *df* to UTF-8 encoded CSV bytes."""
        buffer = io.StringIO()
        df.to_csv(buffer, index=False, **self.to_csv_kwargs)
        return buffer.getvalue().encode("utf8")

    def copy(self):
        """Return an independent CsvFileFormat with the same options (dicts are copied by __init__)."""
        return CsvFileFormat(self.read_csv_kwargs, self.to_csv_kwargs)
43
+
@@ -0,0 +1,39 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ File format base class
5
+ """
6
+ from abc import ABC, abstractmethod
7
+ from typing import Union, Dict
8
+ import io
9
+
10
+ import pandas as pd
11
+
12
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanField
13
+ from ckanapi_harvesters.auxiliary.list_records import ListRecords
14
+
15
+
16
class FileFormatABC(ABC):
    """Abstract interface for file formats used to load/store DataStore data.

    Concrete implementations (e.g. CSV, SHP) convert between on-disk or
    in-memory representations and DataFrame/ListRecords.
    """
    @abstractmethod
    def read_file(self, file_path: str, fields: Union[Dict[str, CkanField],None]) -> Union[pd.DataFrame, ListRecords]:
        """Read records from the file at *file_path*; *fields* optionally describes the expected schema."""
        raise NotImplementedError()

    @abstractmethod
    def read_buffer(self, buffer: io.IOBase, fields: Union[Dict[str, CkanField],None]) -> Union[pd.DataFrame, ListRecords]:
        """Read records from an already-open in-memory buffer."""
        raise NotImplementedError()

    @abstractmethod
    def write_file(self, df: Union[pd.DataFrame, ListRecords], file_path: str, fields: Union[Dict[str, CkanField],None]) -> None:
        """Write *df* to the file at *file_path*."""
        raise NotImplementedError()

    @abstractmethod
    def write_in_memory(self, df: Union[pd.DataFrame, ListRecords], fields: Union[Dict[str, CkanField],None]) -> bytes:
        """Serialize *df* and return the raw bytes of the file content."""
        raise NotImplementedError()

    @abstractmethod
    def copy(self):
        """Return an independent copy of this format object."""
        raise NotImplementedError()

    def __copy__(self):
        # standard-library copy.copy() support, delegating to the abstract copy()
        return self.copy()
39
+
@@ -0,0 +1,25 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ File format keyword selection
5
+ """
6
+ from ckanapi_harvesters.harvesters.file_formats.file_format_abc import FileFormatABC
7
+ from ckanapi_harvesters.harvesters.file_formats.csv_format import CsvFileFormat
8
+ from ckanapi_harvesters.harvesters.file_formats.shp_format import ShapeFileFormat
9
+
10
# Registry mapping a lower-cased format keyword to its FileFormatABC subclass;
# consulted by init_file_format_datastore below.
file_format_dict = {
    "csv": CsvFileFormat,
    "shp": ShapeFileFormat,
}
14
+
15
def init_file_format_datastore(format:str) -> FileFormatABC:
    """Instantiate the file-format handler for the given format keyword.

    A None or empty *format* defaults to CSV.  The keyword is matched
    case-insensitively (surrounding whitespace ignored).  Raises
    NotImplementedError for unknown formats.
    """
    if format is None or len(format) == 0:
        format = 'csv'
    format = format.lower().strip()
    # guard clause: fail early on unregistered formats
    if format not in file_format_dict:
        raise NotImplementedError('File format {} not implemented'.format(format))
    file_format_class = file_format_dict[format]
    return file_format_class()
24
+
25
+