ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,23 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Section of the package dedicated to the harvesting of data using APIs, or databases
5
+ """
6
+
7
+ from . import file_formats
8
+ from . import data_cleaner
9
+
10
+ from . import harvester_errors
11
+ from . import harvester_model
12
+ from . import harvester_params
13
+ from . import harvester_abc
14
+ from . import pymongo_data_cleaner
15
+ from . import pymongo_params
16
+ from . import pymongo_harvester
17
+ from . import postgre_params
18
+ from . import postgre_harvester
19
+ from . import harvester_init
20
+
21
+ # usage shortcuts
22
+
23
+
@@ -0,0 +1,17 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Section of the package dedicated to the conversion of records to a CKAN-compatible format.
5
+ This is linked to the data harvesters.
6
+ """
7
+
8
+ from . import data_cleaner_errors
9
+ from . import data_cleaner_abc
10
+ from . import data_cleaner_upload_1_basic
11
+ from . import data_cleaner_upload_2_geom
12
+ from . import data_cleaner_upload
13
+
14
+ # usage shortcuts
15
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_upload import CkanDataCleanerUpload
16
+
17
+
@@ -0,0 +1,240 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Functions to clean data before upload.
5
+ """
6
+ from typing import Union, List, Any, Dict, Set, Type, Tuple
7
+ from abc import ABC, abstractmethod
8
+ from collections import OrderedDict
9
+
10
+ import pandas as pd
11
+
12
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanField
13
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import assert_or_raise
14
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import dict_recursive_update
15
+
16
# Data type names listed as accepting non-finite values (going by the constant's name).
non_finite_authorized_types = {"float2", "float4", "float8", "numeric"}
# Alias: this is the very same set object, not a copy.
real_number_types = non_finite_authorized_types
# Lookup from a dtype string to a field data type name
# (keys look like pandas/numpy dtype names - TODO confirm against the callers).
dtype_mapper = {"float64": "numeric", "int64": "numeric", "datetime64[ns]": "timestamp"}
23
+
24
+
25
+ class CkanDataCleanerABC(ABC):
26
+ """
27
+ Data cleaner abstract base class.
28
+
29
+ A table is defined by a list of fields with a data type.
30
+ Each row can specify the value of all/some fields.
31
+ When a value is nested (dictionary or list), the functions iterate over the values of these elements with a recursive implementation.
32
+ These elements are called sub-values.
33
+ """
34
+ def __init__(self):
35
+ # options
36
+ self.param_enable:bool = True # global activation flag
37
+ self.param_replace_forbidden:bool = False # option to replace all other forbidden values (Infs) by None
38
+ self.param_cast_types:bool = True # option to cast to strings fields which have text data type
39
+ self.param_apply_field_subs:bool = True # option to apply suggested field renamings (True by default because these are suggested only when necessary)
40
+ self.param_apply_field_changes:bool = False # option to apply suggested field type changes
41
+ self.param_raise_error:bool = False # recommended: do not raise an error: the CKAN server will
42
+ self.param_create_new_fields:bool = True # option to enable the requests to create missing fields in the CKAN DataStore (this requires the specific function to be called)
43
+ self.param_verbose:bool = True
44
+ self.param_field_subs:Dict[str,str] = {} # user-imposed field name substitutions
45
+ self.param_field_primary_key:Union[List[str],None] = None
46
+ # outputs
47
+ self.fields_encountered:OrderedDict[str,None] = OrderedDict()
48
+ self.warnings:Dict[str,Set[str]] = {}
49
+ self.fields_new:OrderedDict[str,CkanField] = OrderedDict()
50
+ self.field_changes:Dict[str,CkanField] = {}
51
+ self.field_subs:Dict[str, str] = {}
52
+ self.field_subs_path:Dict[str, str] = {}
53
+ self.field_suggested_primary_key:Union[List[str],None] = None
54
+ self.field_suggested_index:Set[str] = set()
55
+ self._new_columns_in_row: Dict[str,Any] = None # is initialized at each row
56
+
57
def clear_outputs_new_dataframe(self):
    """
    Reset the output attributes to their empty state.

    The suggested primary key is reset to the user-imposed one (param_field_primary_key).
    """
    # ordered containers
    self.fields_encountered = OrderedDict()
    self.warnings = dict()
    self.fields_new = OrderedDict()
    # field changes and renamings
    self.field_changes = dict()
    self.field_subs = dict()
    self.field_subs_path = dict()
    # suggestions: the same object as the option is referenced, not a copy
    self.field_suggested_primary_key = self.param_field_primary_key
    self.field_suggested_index = set()
    # per-row state
    self._new_columns_in_row = None
67
+
68
def clear_all_outputs(self):
    """
    Clear every output, including the ones kept from one DataFrame upload to the next.

    The cleaner is stateful: some values must survive a DataFrame upload and are
    meant to be cleared only here. This implementation delegates to
    clear_outputs_new_dataframe and resets nothing more.
    """
    self.clear_outputs_new_dataframe()
74
+
75
@abstractmethod
def copy(self, dest=None):
    """
    Copy the options (param_*) of this cleaner onto dest and reset the outputs of dest.

    :param dest: object receiving the options. It is not created here: passing None
        fails on the first attribute assignment, so the caller has to provide it.
    :return: dest
    """
    dest.param_enable = self.param_enable
    dest.param_replace_forbidden = self.param_replace_forbidden
    dest.param_cast_types = self.param_cast_types
    dest.param_apply_field_subs = self.param_apply_field_subs
    dest.param_apply_field_changes = self.param_apply_field_changes
    dest.param_raise_error = self.param_raise_error
    dest.param_create_new_fields = self.param_create_new_fields
    dest.param_verbose = self.param_verbose
    # mutable options are duplicated so that the copy can be edited independently
    field_subs = self.param_field_subs
    dest.param_field_subs = dict(field_subs) if field_subs is not None else None
    primary_key = self.param_field_primary_key
    dest.param_field_primary_key = list(primary_key) if primary_key is not None else None
    # outputs are not transferred: dest starts from a clean state
    # (this also re-derives its suggested primary key from the copied option)
    dest.clear_outputs_new_dataframe()
    return dest
86
+
87
+ def __copy__(self):
88
+ return self.copy()
89
+
90
+ ## Field type detection ------------------
91
def _detect_standard_field_bypass(self, field_name: str, values: Union[Any, pd.Series]) -> Union[CkanField,None]:
    """
    Hook of create_new_field: field type detection meant to bypass the default criteria.

    The base implementation detects nothing and always returns None.

    :param field_name: name of the field
    :param values: value(s) of the field
    :return: None
    """
    return None
96
+
97
def _detect_non_standard_field(self, field_name: str, values: Union[Any, pd.Series]) -> CkanField:
    """
    Hook of create_new_field: field type detection used when the default criteria did not match any specific case.

    The base implementation ignores the values and declares the field as "text".

    :param field_name: name of the field
    :param values: value(s) of the field (unused here)
    :return: a CkanField named field_name with the "text" data type
    """
    fallback_field = CkanField(field_name, "text")
    return fallback_field
102
+
103
@abstractmethod
def create_new_field(self, field_name:str, values: Union[Any, pd.Series]) -> CkanField:
    """
    Abstract method: add a new field definition.

    :param field_name: name of the field to define
    :param values: value(s) of the field
    :return: the field definition
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
109
+
110
@abstractmethod
def detect_field_types_and_subs(self, records: Union[List[dict], pd.DataFrame]) -> OrderedDict[str, str]:
    """
    Abstract method: detect the initial fields and the necessary field renamings.

    :param records: records to inspect, as a list of dictionaries or a DataFrame
    :return: an ordered str-to-str mapping (see the concrete implementations for its meaning)
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
116
+
117
+ ## Records cleansing -------------
118
@abstractmethod
def clean_value_field(self, value: Any, field: CkanField) -> Any:
    """
    Abstract method: clean a value, i.e. directly the value of a cell.

    :param value: value of the cell
    :param field: definition of the field the cell belongs to
    :return: the cleaned value
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
124
+
125
def _replace_standard_value_bypass(self, value: Any, field: CkanField, *, field_data_type: str) -> Tuple[Any, bool]:
    """
    Hook of clean_value_field: type castings/checks meant to bypass the default criteria.

    The base implementation performs no replacement.

    :return: the pair (None, False); the boolean presumably tells whether the bypass applied - TODO confirm in the callers
    """
    return (None, False)
130
+
131
def _replace_non_standard_value(self, value: Any, field: CkanField, *, field_data_type: str) -> Any:
    """
    Hook of clean_value_field: type castings/checks used if none of the default criteria were met.

    The base implementation leaves the value untouched.

    :return: value, unchanged
    """
    unchanged = value
    return unchanged
136
+
137
@abstractmethod
def _clean_subvalue(self, subvalue: Any, field: CkanField, path: str, level: int,
                    *, field_data_type: str) -> Any:
    """
    Abstract method: clean a subvalue, i.e. a value within a nested cell.

    :param subvalue: nested value
    :param field: definition of the field the cell belongs to
    :param path: path of the subvalue (dot-separated, judging by _add_field_from_path)
    :param level: nesting level of the subvalue
    :param field_data_type: data type of the field
    :return: the cleaned subvalue
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
144
+
145
def _replace_standard_subvalue_bypass(self, subvalue:Any, field:CkanField, path:str, level:int,
                                      *, field_data_type:str) -> Tuple[Any,bool]:
    """
    Hook of _clean_subvalue: type castings/checks meant to bypass the default criteria.

    The base implementation performs no replacement.

    :return: the pair (None, False); the boolean presumably tells whether the bypass applied - TODO confirm in the callers
    """
    return (None, False)
151
+
152
def _replace_non_standard_subvalue(self, subvalue:Any, field:CkanField, path:str, level:int,
                                   *, field_data_type:str) -> Any:
    """
    Hook of _clean_subvalue: type castings/checks used if none of the default criteria were met.

    The base implementation leaves the subvalue untouched.

    :return: subvalue, unchanged
    """
    unchanged = subvalue
    return unchanged
158
+
159
def _add_field_from_path(self, path:str, data_type:str, new_field_name:str=None,
                         suggest_index:bool=True, notes:str=None) -> None:
    """
    Declare a new column fed by a nested object.

    :param path: path of the nested value
    :param data_type: data type of the new field
    :param new_field_name: name of the new field; by default, path with "." replaced by "_"
    :param suggest_index: when True, the new field is added to the suggested indexes
    :param notes: notes passed to the field definition
    :raises KeyError: passed to assert_or_raise when the field name was already encountered
    """
    # default name derived from the path
    new_field_name = path.replace(".", "_") if new_field_name is None else new_field_name
    name_is_free = new_field_name not in self.fields_encountered
    assert_or_raise(name_is_free, KeyError(f"{new_field_name} already exists and cannot be replaced"))
    # register the definition and remember which path feeds it
    self.fields_new[new_field_name] = CkanField(new_field_name, data_type, notes=notes)
    self.field_subs_path[path] = new_field_name
    if suggest_index:
        self.field_suggested_index.add(new_field_name)
    # mark the name as taken
    self.fields_encountered[new_field_name] = None
172
+
173
@abstractmethod
def clean_records(self, records: Union[List[dict], pd.DataFrame], known_fields:Union[OrderedDict[str, CkanField], None],
                  *, inplace:bool=False) -> Union[List[dict], pd.DataFrame]:
    """
    Abstract method: main entry point to clean a list of records.

    :param records: records to clean, as a list of dictionaries or a DataFrame
    :param known_fields: field definitions indexed by field name, or None
    :param inplace: keyword-only flag, False by default
    :return: records, as a list of dictionaries or a DataFrame
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
185
+
186
@abstractmethod
def _clean_final_steps(self, records: Union[List[dict], pd.DataFrame], fields:Union[OrderedDict[str, CkanField], None],
                       known_fields:Union[OrderedDict[str, CkanField], None]) -> Union[List[dict], pd.DataFrame]:
    """
    Abstract method called at the end of clean_records.

    :param records: records, as a list of dictionaries or a DataFrame
    :param fields: field definitions indexed by field name, or None
    :param known_fields: field definitions indexed by field name, or None
    :return: records, as a list of dictionaries or a DataFrame
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
193
+
194
def _extra_checks(self, records: Union[List[dict], pd.DataFrame], fields:Union[OrderedDict[str, CkanField], None]) -> None:
    """
    Hook called at the end of _clean_final_steps.

    The base implementation checks nothing.

    :param records: records, as a list of dictionaries or a DataFrame
    :param fields: field definitions indexed by field name, or None
    """
    return None
199
+
200
+ ### post-treatments -------------
201
def apply_new_fields_request(self, ckan, resource_id:str):
    """
    Patch the fields of a resource if a new field was detected.
    Call before upsert.

    Nothing is requested when param_create_new_fields is off or when fields_new is empty.

    :param ckan: object exposing datastore_field_patch(resource_id, fields_update=...)
    :param resource_id: identifier of the resource to patch
    """
    if not self.param_create_new_fields:
        return
    if len(self.fields_new) == 0:
        return
    ckan.datastore_field_patch(resource_id, fields_update=self.fields_new)
208
+
209
+ def merge_field_changes(self, fields:List[dict]=None) -> List[dict]:
210
+ """
211
+ Merge the fields argument of a datastore_create with the fields detected by the data cleaner: self.fields_new, then self.field_changes when param_apply_field_changes is set.
212
+ Fields already defined in the fields argument are not overwritten: a RuntimeError() sits next to both unexpected cases (new field already listed, changed field not listed). NOTE(review): this view strips indentation - confirm which if the final else/return fields pairs with.
213
+ """
214
+ if fields is not None:
215
+ fields_dict = OrderedDict([(field_dict["id"], CkanField.from_ckan_dict(field_dict)) for field_dict in fields])
216
+ else:
217
+ fields_dict = OrderedDict()
218
+ if len(self.fields_new) > 0:
219
+ for field_name, field_info in self.fields_new.items():
220
+ if field_name not in fields_dict.keys():
221
+ fields_dict[field_name] = field_info
222
+ else:
223
+ # unexpected case: the field was flagged as new but is already listed in fields (the merge is kept before the raise - TODO confirm intent)
224
+ fields_dict[field_name] = fields_dict[field_name].merge(field_info)
225
+ raise RuntimeError()
226
+ # fields_dict = dict_recursive_update(fields_dict, {field_info.name: field_info.to_ckan_dict() for field_info in self.fields_new.values()})
227
+ if self.param_apply_field_changes:
228
+ if len(self.field_changes) > 0:
229
+ for field_name, field_info in self.field_changes.items():
230
+ if field_name not in fields_dict.keys():
231
+ # unexpected case: a change was recorded for a field that is not listed in fields (TODO confirm intent)
232
+ fields_dict[field_name] = field_info
233
+ raise RuntimeError()
234
+ else:
235
+ fields_dict[field_name] = fields_dict[field_name].merge(field_info)
236
+ # fields_dict = dict_recursive_update(fields_dict, {field_info.name: field_info.to_ckan_dict() for field_info in self.field_changes.values()})
237
+ return [field_info.to_ckan_dict() for field_info in fields_dict.values()]
238
+ else:
239
+ return fields
240
+
@@ -0,0 +1,23 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Error codes for data cleaner
5
+ """
6
+
7
+ from ckanapi_harvesters.auxiliary.ckan_errors import RequirementError
8
+
9
+
10
class CleanError(Exception):
    """
    Exception declared for the data cleaner; adds nothing to Exception.
    """
12
+
13
class CleanerRequirementError(RequirementError):
    """
    Error reporting that a package is required to clean a given data type.
    """
    def __init__(self, requirement:str, data_type:str):
        """
        :param requirement: name of the required package
        :param data_type: data type which needs the package
        """
        message = f"The package {requirement} is required to clean using this data type ({data_type})."
        super().__init__(message)
16
+
17
class UnexpectedGeometryError(Exception):
    """
    Error reporting a GeoJSON type different from the expected one.
    """
    def __init__(self, found_type:str, expected_type:str):
        """
        :param found_type: GeoJSON type found
        :param expected_type: GeoJSON type expected
        """
        message = f"Unexpected GeoJSON type: {found_type}. Expected {expected_type}."
        super().__init__(message)
20
+
21
class FormatError(Exception):
    """
    Error reporting data whose format is not recognized for a data type.
    """
    def __init__(self, data:str, data_type:str):
        """
        :param data: data which could not be recognized
        :param data_type: data type the data was expected to match
        """
        message = f"Format not recognized for type {data_type}: {data}."
        super().__init__(message)
@@ -0,0 +1,9 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Alias
5
+ """
6
+
7
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_upload_1_basic import _pd_series_type_detect
8
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_upload_2_geom import CkanDataCleanerUploadGeom as CkanDataCleanerUpload # alias
9
+