ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,561 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to upload metadata to the CKAN server to create/update an existing package
5
+ The metadata is defined by the user in an Excel worksheet
6
+ This file implements functions to initiate a DataStore.
7
+ """
8
+ from abc import ABC, abstractmethod
9
+ from typing import Dict, List, Tuple, Union, Set, Any
10
+ import os
11
+ import io
12
+ from warnings import warn
13
+ from collections import OrderedDict
14
+ import copy
15
+
16
+ import pandas as pd
17
+
18
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
19
+ from ckanapi_harvesters.builder.builder_field import BuilderField
20
+ from ckanapi_harvesters.harvesters.file_formats.file_format_abc import FileFormatABC
21
+ from ckanapi_harvesters.harvesters.file_formats.file_format_init import init_file_format_datastore
22
+ from ckanapi_harvesters.builder.mapper_datastore import DataSchemeConversion
23
+ from ckanapi_harvesters.builder.builder_resource import BuilderResourceABC
24
+ from ckanapi_harvesters.auxiliary.ckan_errors import DuplicateNameError
25
+ from ckanapi_harvesters.auxiliary.path import resolve_rel_path
26
+ from ckanapi_harvesters.builder.builder_errors import RequiredDataFrameFieldsError, ResourceFileNotExistMessage, IncompletePatchError
27
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo, CkanDataStoreInfo
28
+ from ckanapi_harvesters.ckan_api import CkanApi
29
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element, find_duplicates, datastore_id_col
30
+ from ckanapi_harvesters.auxiliary.ckan_defs import ckan_tags_sep
31
+ from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice
32
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanField
33
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
34
+
35
# Number of rows uploaded first to initiate the DataStore with datapusher, before
# field data types and indexes are specified explicitly in patch_request.
num_rows_patch_first_upload_partial: Union[int,None] = 50 # set to None to upload directly the whole DataFrame before the DataStore creation


# Keyword recognized in the "aliases" parameter: when an alias equals this value
# (case-insensitive, stripped), it is replaced by the default alias computed from
# the resource and package names (see _get_alias_list).
default_alias_keyword:Union[str,None] = "default" # generate default alias if an alias with this value is found in parameters
42
class BuilderDataStoreABC(BuilderResourceABC, ABC):
    """
    Abstract builder for a CKAN resource backed by a DataStore table.

    Extends BuilderResourceABC with DataStore-specific metadata (field builders,
    primary key, indexes, aliases), DataFrame upload/download through the CKAN
    DataStore API, and optional data cleaning before upload.
    Concrete subclasses must implement ``load_sample_df`` and ``_to_dict``.
    """
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None):
        super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        # per-field metadata builders, keyed by field name (None when undocumented)
        self.field_builders: Union[Dict[str, BuilderField],None] = None
        # primary-key column names ([] means explicitly none; None means unspecified)
        self.primary_key: Union[List[str],None] = None
        # indexed column names ([] means explicitly none; None means unspecified)
        self.indexes: Union[List[str],None] = None
        # DataStore aliases; may contain default_alias_keyword (resolved later)
        self.aliases: Union[List[str],None] = None
        self.aux_upload_fun_name:str = ""
        self.aux_download_fun_name:str = ""
        # Functions input/outputs
        self.data_cleaner_upload: Union[CkanDataCleanerABC,None] = None
        self.reupload_on_update = False # do not reupload on update for DataStores
        self.reupload_if_needed: bool = True
        self.reupload_needed: Union[bool,None] = None
        # converts DataFrames between the local file layout and the database layout
        self.df_mapper = DataSchemeConversion()
        self.local_file_format: FileFormatABC = init_file_format_datastore(self.format)

    def copy(self, *, dest=None):
        """Copy this builder's attributes into dest and return it (deep-copying mutable state)."""
        super().copy(dest=dest)
        dest.field_builders = copy.deepcopy(self.field_builders)
        dest.primary_key = copy.deepcopy(self.primary_key)
        dest.indexes = copy.deepcopy(self.indexes)
        dest.aliases = copy.deepcopy(self.aliases)
        dest.aux_upload_fun_name = self.aux_upload_fun_name
        dest.aux_download_fun_name = self.aux_download_fun_name
        dest.reupload_on_update = self.reupload_on_update
        dest.reupload_if_needed = self.reupload_if_needed
        dest.reupload_needed = self.reupload_needed
        dest.df_mapper = self.df_mapper.copy()
        dest.local_file_format = self.local_file_format.copy()
        return dest

    def _init_file_format(self):
        # (Re)derive the local file format from self.format after loading from a row.
        self.local_file_format = init_file_format_datastore(self.format) # default file format is CSV (user can change)

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """
        Populate DataStore attributes from a worksheet row.

        Expected columns: "primary key", "indexes"; optional: "upload function",
        "download function", "aliases". The literal string "none" (any case)
        maps to an explicit empty list for primary key / indexes.
        """
        super()._load_from_df_row(row=row)
        primary_keys_string: Union[str,None] = _string_from_element(row["primary key"])
        indexes_string: Union[str,None] = _string_from_element(row["indexes"])
        aliases_string: Union[str,None] = None
        if "upload function" in row.keys():
            self.aux_upload_fun_name: str = _string_from_element(row["upload function"], empty_value="")
        if "download function" in row.keys():
            self.aux_download_fun_name: str = _string_from_element(row["download function"], empty_value="")
        if "aliases" in row.keys():
            aliases_string = _string_from_element(row["aliases"])
        if primary_keys_string is not None:
            if primary_keys_string.lower() == "none":
                self.primary_key = []
            else:
                self.primary_key = [field.strip() for field in primary_keys_string.split(ckan_tags_sep)]
        if indexes_string is not None:
            if indexes_string.lower() == "none":
                self.indexes = []
            else:
                self.indexes = [field.strip() for field in indexes_string.split(ckan_tags_sep)]
        if aliases_string is not None:
            self.aliases = aliases_string.split(ckan_tags_sep)
        self._init_file_format()

    @abstractmethod
    def _to_dict(self, include_id:bool=True) -> dict:
        """Serialize this builder to an ordered dict of worksheet-style columns."""
        d = super()._to_dict(include_id=include_id)
        # NOTE(review): primary_key uses truthiness (so [] serializes as "") while
        # indexes/aliases use `is not None` — confirm the asymmetry is intentional.
        d["Primary key"] = ckan_tags_sep.join(self.primary_key) if self.primary_key else ""
        d["Indexes"] = ckan_tags_sep.join(self.indexes) if self.indexes is not None else ""
        d["Upload function"] = self.aux_upload_fun_name
        d["Download function"] = self.aux_download_fun_name
        d["Aliases"] = ckan_tags_sep.join(self.aliases) if self.aliases is not None else ""
        return d

    def init_options_from_ckan(self, ckan:CkanApi) -> None:
        """Refresh builder options from the CKAN server, including per-field internal attributes."""
        super().init_options_from_ckan(ckan)
        if self.field_builders is not None:
            for field_builder in self.field_builders.values():
                field_builder.internal_attrs.update_from_ckan(ckan)

    def _check_field_duplicates(self):
        # Raise DuplicateNameError if two field builders share the same name.
        if self.field_builders is not None:
            duplicates = find_duplicates([field_builder.name for field_builder in self.field_builders.values()])
            if len(duplicates) > 0:
                raise DuplicateNameError("Field", duplicates)

    def _get_fields_dict(self) -> Dict[str, dict]:
        """Return field name -> serialized field dict (None when no fields are documented)."""
        self._check_field_duplicates()
        if self.field_builders is not None:
            fields_dict = OrderedDict([(field_builder.name, field_builder._to_dict()) for field_builder in self.field_builders.values()])
        else:
            fields_dict = None
        return fields_dict

    def _get_fields_info(self) -> Dict[str, CkanField]:
        """Return field name -> CkanField ({} when no fields are documented)."""
        self._check_field_duplicates()
        if self.field_builders is not None:
            builder_fields = OrderedDict([(field_builder.name, field_builder._to_ckan_field()) for field_builder in self.field_builders.values()])
        else:
            builder_fields = {}
        return builder_fields

    def _get_fields_df(self) -> pd.DataFrame:
        """Return the documented fields as a DataFrame (one row per field)."""
        fields_dict_list = [value for value in self._get_fields_dict().values()]
        fields_df = pd.DataFrame.from_records(fields_dict_list)
        return fields_df

    def _load_fields_df(self, fields_df: pd.DataFrame):
        """Rebuild self.field_builders from a fields DataFrame (column names normalized to lower/stripped)."""
        fields_df.columns = fields_df.columns.map(str.lower)
        fields_df.columns = fields_df.columns.map(str.strip)
        self.field_builders = {}
        for index, row in fields_df.iterrows():
            field_builder = BuilderField()
            field_builder._load_from_df_row(row=row)
            self.field_builders[field_builder.name] = field_builder

    def _to_ckan_resource_info(self, package_id:str, check_id:bool=True) -> CkanResourceInfo:
        """Build a CkanResourceInfo including the DataStore description (fields, indexes, aliases)."""
        resource_info = super()._to_ckan_resource_info(package_id=package_id, check_id=check_id)
        resource_info.datastore_info = CkanDataStoreInfo()
        resource_info.datastore_info.resource_id = resource_info.id
        if self.field_builders is not None:
            resource_info.datastore_info.fields_dict = OrderedDict()
            for name, field_builder in self.field_builders.items():
                resource_info.datastore_info.fields_dict[name] = field_builder._to_ckan_field()
        else:
            resource_info.datastore_info.fields_dict = None
        resource_info.datastore_info.fields_id_list = [name for name, field_builder in self.field_builders.items()] if self.field_builders is not None else []
        if self.indexes is not None:
            resource_info.datastore_info.index_fields = self.indexes.copy()
        # ckan=None: resolve the default-alias keyword offline (no server query)
        aliases = self._get_alias_list(None)
        if aliases is not None:
            resource_info.datastore_info.aliases = aliases.copy()
        return resource_info

    @abstractmethod
    def load_sample_df(self, resources_base_dir:str, *, upload_alter:bool=True) -> pd.DataFrame:
        """
        Function returning the data from the indicated resources as a pandas DataFrame.
        This is the DataFrame equivalent for load_sample_data.

        :param resources_base_dir: base directory to find the resources on the local machine
        :param upload_alter: when True, convert the DataFrame to the database (upload) format
        :return: the sample data as a DataFrame
        """
        raise NotImplementedError()

    @staticmethod
    def sample_file_path_is_url() -> bool:
        # DataStore sample data comes from local files by default, not URLs.
        return False

    def get_sample_file_path(self, resources_base_dir: str) -> None:
        # No single sample file at the ABC level; subclasses may override.
        return None

    def load_sample_data(self, resources_base_dir:str) -> bytes:
        """Return the sample data serialized in the local file format (bytes)."""
        df = self.load_sample_df(resources_base_dir=resources_base_dir)
        return self.local_file_format.write_in_memory(df, fields=self._get_fields_info())

    def upsert_request_df(self, ckan: CkanApi, df_upload:pd.DataFrame,
                          method:UpsertChoice=UpsertChoice.Upsert,
                          apply_last_condition:bool=None, always_last_condition:bool=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Call to ckan datastore_upsert.
        Before sending the DataFrame, a call to df_upload_alter is made.
        This method is overloaded in BuilderDataStoreMultiABC and BuilderDataStoreFolder

        :param ckan: CKAN API client
        :param df_upload: DataFrame to upsert (local format; converted before sending)
        :param method: upsert method (insert/upsert/...)
        :param apply_last_condition: forwarded to datastore_upsert
        :param always_last_condition: forwarded to datastore_upsert
        :return: (transformed upload DataFrame, DataFrame returned by the server)
        """
        resource_id = self.get_or_query_resource_id(ckan, error_not_found=True)
        df_upload_transformed = self.df_mapper.df_upload_alter(df_upload, fields=self._get_fields_info())
        ret_df = ckan.datastore_upsert(df_upload_transformed, resource_id, method=method,
                                       apply_last_condition=apply_last_condition,
                                       always_last_condition=always_last_condition, data_cleaner=self.data_cleaner_upload)
        return df_upload_transformed, ret_df

    def upsert_request_final(self, ckan: CkanApi, *, force:bool=False) -> None:
        """
        Final steps after the last upsert query.
        These steps are automatically done for a DataStore defined by one file.

        :param ckan: CKAN API client
        :param force: perform request anyways
        :return: None
        """
        if force:
            resource_id = self.get_or_query_resource_id(ckan, error_not_found=True)
            ckan.datastore_upsert_last_line(resource_id=resource_id)

    def _get_alias_list(self, ckan:Union[CkanApi,None]):
        """
        Return self.aliases with any occurrence of default_alias_keyword replaced
        by the default alias name. With ckan=None the default name is computed
        offline from the resource/package names; otherwise the server is queried.
        Note: replacement mutates the self.aliases list in place.
        """
        aliases = self.aliases
        if default_alias_keyword is not None:
            if ckan is not None:
                default_alias_name = ckan.datastore_default_alias(self.name, self.package_name, error_not_found=False)
            else:
                default_alias_name = CkanApi.datastore_default_alias_of_names(self.name, self.package_name)
            if aliases is not None:
                for i, alias in enumerate(aliases):
                    if alias.lower().strip() == default_alias_keyword:
                        aliases[i] = default_alias_name
        return aliases

    def _check_necessary_fields(self, current_fields: Set[str] = None, empty_datastore:bool=False, raise_error: bool = True) -> Set[str]:
        """
        Auxiliary function to list the fields which are required:
        - for df_mapper to determine the file names, associated requests, and recognize the last inserted row of a document.
        - to initialize the DataStore with the columns for the primary key and indexes

        The required fields are compared to current_fields, if provided.

        :param current_fields: fields actually present in the data (None to skip the comparison)
        :param empty_datastore: when True, no fields are required (returns an empty set)
        :param raise_error: raise RequiredDataFrameFieldsError on missing fields instead of warning
        :return: the set of required field names
        """
        if empty_datastore:
            return set()
        required_fields = self.df_mapper.get_necessary_fields()
        if self.primary_key is not None:
            required_fields = required_fields.union(set(self.primary_key))
        if self.indexes is not None:
            required_fields = required_fields.union(set(self.indexes))
        if current_fields is not None:
            missing_fields = required_fields - current_fields
            if len(missing_fields) > 0:
                msg = RequiredDataFrameFieldsError(missing_fields)
                if raise_error:
                    raise msg
                else:
                    warn(str(msg))
        return required_fields

    def _check_undocumented_fields(self, current_fields: Set[str]) -> None:
        """Warn about mismatches between documented fields and the fields present in the data."""
        # NOTE(review): the warning texts say "fields are left documented" — presumably
        # meant "left undocumented"; confirm before changing the runtime strings.
        if self.field_builders is not None:
            # list fields which are not documented
            fields_doc = set(self.field_builders.keys())
            missing_doc = current_fields - fields_doc
            extra_doc = fields_doc - current_fields
            if len(extra_doc) > 0:
                msg = f"{len(extra_doc)} extra fields were documented but absent of sample data for table {self.name}: {', '.join(extra_doc)}"
                warn(msg)
            if len(missing_doc) > 0:
                msg = f"{len(missing_doc)} fields are left documented for table {self.name}: {', '.join(missing_doc)}"
                warn(msg)
        else:
            msg = f"No field documentation was provided for table {self.name}. {len(current_fields)} fields are left documented: {', '.join(current_fields)}"
            warn(msg)

    def _get_fields_update(self, ckan: CkanApi, current_fields:Union[Set[str],None], data_cleaner_fields:Union[List[dict],None],
                           reupload:bool) -> Dict[str, dict]:
        """
        Merge documented fields with data-cleaner field changes into the field dict
        to send to CKAN. When the resource already exists and no reupload is planned,
        the existing DataStore fields are patched server-side; otherwise the dict is
        built locally.
        """
        if self.field_builders is not None:
            if current_fields is not None:
                builder_fields = [field_builder._to_ckan_field() for field_builder in self.field_builders.values() if field_builder.name in current_fields]
            else:
                # use case: get all known fields (before data_cleaner)
                builder_fields = [field_builder._to_ckan_field() for field_builder in self.field_builders.values()]
        else:
            builder_fields = None
        resource_id = self.get_or_query_resource_id(ckan, error_not_found=False)
        if resource_id is not None and not reupload:
            update_needed, fields_update = ckan.datastore_field_patch_dict(fields_merge=data_cleaner_fields, fields_update=builder_fields,
                                                                           return_list=False,
                                                                           resource_id=resource_id, error_not_found=False)
        else:
            fields_update = CkanApi.datastore_field_dict(fields_merge=data_cleaner_fields, fields_update=builder_fields, return_list=False)
        return fields_update

    def _collect_indexes_from_fields(self) -> Set[str]:
        # Field-level index requests (BuilderField.is_index) complement self.indexes.
        if self.field_builders is not None:
            return {field_builder.name for field_builder in self.field_builders.values() if field_builder.is_index}
        else:
            return set()

    def _get_primary_key_indexes(self, data_cleaner_index: Set[str], current_fields:Set[str], error_missing:bool, empty_datastore:bool=False) -> Tuple[Union[List[str],None], Union[List[str],None]]:
        """
        Compute the (primary_key, indexes) lists to send to CKAN.

        Indexes are the union of self.indexes, field-level index flags and
        data-cleaner suggestions, minus the primary-key columns. Both lists are
        validated against current_fields when provided; missing columns either
        raise RequiredDataFrameFieldsError (error_missing=True) or are dropped.

        :return: (primary_key, indexes); either may be None when not applicable
        """
        # update primary keys and indexes: only if present
        if empty_datastore:
            return None, None
        primary_key = None
        if current_fields is None:
            primary_key = self.primary_key
        elif self.primary_key is not None:
            extra_primary_key = set(self.primary_key) - current_fields
            if len(extra_primary_key) == 0:
                primary_key = self.primary_key
            elif error_missing:
                raise RequiredDataFrameFieldsError(extra_primary_key)
        indexes = None
        if self.indexes is not None:
            indexes_full_set = set(self.indexes).union(self._collect_indexes_from_fields()).union(data_cleaner_index)
        else:
            indexes_full_set = self._collect_indexes_from_fields().union(data_cleaner_index)
        if primary_key is not None:
            indexes_full_set = indexes_full_set - set(primary_key)
        if len(indexes_full_set) == 0:
            indexes_full = None
        else:
            indexes_full = list(indexes_full_set)
        if current_fields is None:
            indexes = indexes_full
        elif indexes_full is not None:
            extra_indexes = set(indexes_full) - current_fields
            if len(extra_indexes) == 0:
                indexes = indexes_full
            elif error_missing:
                raise RequiredDataFrameFieldsError(extra_indexes)
        return primary_key, indexes

    def _compare_fields_to_datastore_info(self, resource_info:CkanResourceInfo, current_fields: Set[str], ckan:CkanApi) -> None:
        """Warn/print discrepancies between the uploaded fields and the server-side DataStore info."""
        # compare fields with DataStore info (if present, for information)
        if resource_info.datastore_info is not None:
            fields_info = set(resource_info.datastore_info.fields_id_list)
            missing_info = current_fields - fields_info
            extra_info = fields_info - current_fields
            if len(extra_info) > 0:
                msg = f"{len(extra_info)} extra fields are in the database but absent of sample data for table {self.name}: {', '.join(extra_info)}"
                warn(msg)
            if len(missing_info) > 0 and ckan.params.verbose_request:
                msg = f"{len(missing_info)} fields are not in DataStore info because they are being added for table {self.name}: {', '.join(missing_info)}"
                print(msg)

    def _apply_data_cleaner_before_patch(self, ckan:CkanApi, df_upload: pd.DataFrame, reupload:bool) -> Tuple[pd.DataFrame, List[dict], Set[str]]:
        """
        Run the configured upload data cleaner on df_upload (in place) and collect
        its field changes and suggested indexes.

        :return: (cleaned DataFrame, field-change dicts or None, suggested index set)
        """
        if df_upload is not None and self.data_cleaner_upload is not None:
            fields_for_cleaner = self._get_fields_update(ckan, current_fields=None, data_cleaner_fields=None, reupload=reupload)
            df_upload = self.data_cleaner_upload.clean_records(df_upload, known_fields=fields_for_cleaner, inplace=True)
            data_cleaner_fields = self.data_cleaner_upload.merge_field_changes()
            data_cleaner_index = self.data_cleaner_upload.field_suggested_index
        else:
            data_cleaner_fields = None
            data_cleaner_index = set()
        return df_upload, data_cleaner_fields, data_cleaner_index

    def patch_request(self, ckan: CkanApi, package_id: str, *,
                      df_upload: pd.DataFrame=None, reupload: bool = None, resources_base_dir:str=None) -> CkanResourceInfo:
        """
        Create or update the resource and its DataStore on the CKAN server.

        A first partial upload (num_rows_patch_first_upload_partial rows) initiates
        the DataStore together with explicit field types, primary key, indexes and
        aliases; the remaining rows are then inserted with datastore_upsert.

        :param ckan: CKAN API client
        :param package_id: id of the package owning the resource
        :param df_upload: data already in database format (loaded from sample when None)
        :param reupload: force reupload of the data (defaults to self.reupload_on_update)
        :param resources_base_dir: base directory used to load the sample data
        :return: the CkanResourceInfo of the created/updated resource
        """
        if reupload is None: reupload = self.reupload_on_update
        if df_upload is None:
            df_upload = self.load_sample_df(resources_base_dir=resources_base_dir, upload_alter=True)
        else:
            pass # do not alter df_upload because it should already be in the database format
        df_upload, data_cleaner_fields, data_cleaner_index = self._apply_data_cleaner_before_patch(ckan, df_upload, reupload=reupload)
        current_fields = set(df_upload.columns) - {datastore_id_col} # _id field cannot be documented
        if num_rows_patch_first_upload_partial is not None and len(df_upload) > num_rows_patch_first_upload_partial:
            df_upload_partial = df_upload.iloc[:num_rows_patch_first_upload_partial]
            df_upload_upsert = df_upload.iloc[num_rows_patch_first_upload_partial:]
        else:
            df_upload_partial, df_upload_upsert = df_upload, None
        empty_datastore = df_upload is None or len(df_upload) == 0
        self._check_necessary_fields(current_fields, empty_datastore=empty_datastore, raise_error=True)
        self._check_undocumented_fields(current_fields)
        aliases = self._get_alias_list(ckan)
        primary_key, indexes = self._get_primary_key_indexes(data_cleaner_index, current_fields=current_fields,
                                                             error_missing=True, empty_datastore=empty_datastore)
        fields_update = self._get_fields_update(ckan, current_fields, data_cleaner_fields, reupload=reupload)
        fields = list(fields_update.values()) if len(fields_update) > 0 else None
        resource_info = ckan.resource_create(package_id, name=self.name, format=self.format, description=self.description, state=self.state,
                                             create_default_view=self.create_default_view,
                                             cancel_if_exists=True, update_if_exists=True, reupload=reupload,
                                             datastore_create=True, records=df_upload_partial, fields=fields,
                                             primary_key=primary_key, indexes=indexes, aliases=aliases)
        resource_id = resource_info.id
        self.known_id = resource_id
        reupload = reupload or resource_info.newly_created
        self._compare_fields_to_datastore_info(resource_info, current_fields, ckan)
        # NOTE(review): the inner `else` below is unreachable — the outer condition
        # already requires `reupload`, so IncompletePatchError can never be raised.
        # The comment suggests the outer test was meant to be on df_upload_upsert
        # alone (with reupload checked inside); confirm intended behavior.
        if df_upload_upsert is not None and reupload:
            if reupload:
                ckan.datastore_upsert(df_upload_upsert, resource_id, method=UpsertChoice.Insert,
                                      always_last_condition=None, data_cleaner=self.data_cleaner_upload, )
            else:
                # case where a reupload was needed but is not permitted by self.reupload_if_needed
                msg = f"Did not upload the remaining part of the resource {self.name}."
                raise IncompletePatchError(msg)
        return resource_info

    def download_sample_df(self, ckan: CkanApi, search_all:bool=True, download_alter:bool=True, **kwargs) -> Union[pd.DataFrame,None]:
        """
        Download the resource and return it as a DataFrame.
        This is the DataFrame equivalent for download_sample.

        :param ckan: CKAN API client
        :param search_all: download all rows (forwarded to datastore_dump)
        :param download_alter: convert the downloaded DataFrame back to the local format
        :param kwargs: forwarded to datastore_dump
        :return: the DataFrame, or None when the resource is not found and errors are disabled
        """
        resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=self.download_error_not_found)
        if resource_id is None and not self.download_error_not_found:
            return None
        df_download = ckan.datastore_dump(resource_id, search_all=search_all, **kwargs)
        if download_alter:
            df_local = self.df_mapper.df_download_alter(df_download, fields=self._get_fields_info())
            return df_local
        else:
            return df_download

    def download_sample(self, ckan:CkanApi, full_download:bool=True, **kwargs) -> bytes:
        """Download the resource and return it serialized in the local file format."""
        df = self.download_sample_df(ckan=ckan, search_all=full_download, **kwargs)
        return self.local_file_format.write_in_memory(df, fields=self._get_fields_info())
+
432
class BuilderDataStoreFile(BuilderDataStoreABC):
    """
    DataStore resource builder whose sample data is read from a single local file
    (path given relative to the resources base directory).
    """
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None, file_name:str=None):
        super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        # relative path of the local sample file
        self.file_name = file_name

    def copy(self, *, dest=None):
        """Copy this builder into dest (a fresh BuilderDataStoreFile when dest is None)."""
        target = BuilderDataStoreFile() if dest is None else dest
        super().copy(dest=target)
        target.file_name = self.file_name
        return target

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """Populate attributes from a worksheet row, including the "file/url" column."""
        super()._load_from_df_row(row=row)
        self.file_name: str = _string_from_element(row["file/url"])

    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
        """Return an error message when the local sample file is missing, None otherwise."""
        file_path = self.get_sample_file_path(resources_base_dir=resources_base_dir)
        if not os.path.isfile(file_path):
            return ResourceFileNotExistMessage(self.name, ErrorLevel.Error, f"Missing file for resource {self.name}: {file_path}")
        return None

    @staticmethod
    def sample_file_path_is_url() -> bool:
        """The sample data source is a local file, never a URL."""
        return False

    def get_sample_file_path(self, resources_base_dir:str) -> str:
        """Resolve the configured file name against the resources base directory."""
        return resolve_rel_path(resources_base_dir, self.file_name, field=f"File/URL of resource {self.name}")

    def load_sample_df(self, resources_base_dir:str, *, upload_alter:bool=True) -> pd.DataFrame:
        """
        Read the sample file into a DataFrame; optionally convert it to the upload
        (database) format via df_mapper. Records the source path in df.attrs.
        """
        self.sample_data_source = self.get_sample_file_path(resources_base_dir)
        fields_info = self._get_fields_info()
        local_df = self.local_file_format.read_file(self.sample_data_source, fields=fields_info)
        if isinstance(local_df, pd.DataFrame):
            local_df.attrs["source"] = self.sample_data_source
        if not upload_alter:
            return local_df
        return self.df_mapper.df_upload_alter(local_df, self.sample_data_source, fields=fields_info)

    @staticmethod
    def resource_mode_str() -> str:
        """Human-readable label of this resource mode."""
        return "DataStore from File"

    def _to_dict(self, include_id:bool=True) -> dict:
        """Serialize the builder, adding the File/URL column."""
        serialized = super()._to_dict(include_id=include_id)
        serialized["File/URL"] = self.file_name
        return serialized

    def download_request(self, ckan: CkanApi, out_dir: str, *, full_download:bool=True,
                         force:bool=False, threads:int=1) -> Union[pd.DataFrame,None]:
        """
        Download the DataStore content, convert it to the local format and, when
        out_dir is given, write it to disk. Returns the DataFrame, or None when
        the download is skipped (disabled, already present, or resource missing).
        """
        if not (self.enable_download or force):
            warn(f"Did not download resource {self.name} because download was disabled.")
            return None
        if out_dir is not None:
            self.downloaded_destination = resolve_rel_path(out_dir, self.file_name, field=f"File/URL of resource {self.name}")
            if self.download_skip_existing and os.path.exists(self.downloaded_destination):
                return None
        resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=self.download_error_not_found)
        if resource_id is None and not self.download_error_not_found:
            return None
        dumped_df = ckan.datastore_dump(resource_id, search_all=full_download)
        local_df = self.df_mapper.df_download_alter(dumped_df, fields=self._get_fields_info())
        if out_dir is not None:
            os.makedirs(out_dir, exist_ok=True)
            self.local_file_format.write_file(local_df, self.downloaded_destination, fields=self._get_fields_info())
        return local_df
502
+
503
+
504
class BuilderResourceIgnored(BuilderDataStoreABC):
    """
    Class to maintain a line in the resource builders list but has no action and can hold field metadata.

    Every upload/download entry point is a no-op returning None (or empty bytes),
    so the row is kept in worksheets without touching the CKAN server.
    """
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None, file_url:str=None):
        super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        # informational File/URL value carried through serialization
        self.file_url: Union[str, None] = file_url

    def copy(self, *, dest=None):
        """Copy this builder into dest (a fresh BuilderResourceIgnored when dest is None)."""
        target = BuilderResourceIgnored() if dest is None else dest
        super().copy(dest=target)
        target.file_url = self.file_url
        return target

    @staticmethod
    def resource_mode_str() -> str:
        """Human-readable label of this resource mode."""
        return "Ignored"

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """Populate attributes from a worksheet row and validate mandatory ones."""
        super()._load_from_df_row(row=row)
        self.file_url: str = _string_from_element(row["file/url"])
        self._check_mandatory_attributes()

    def _to_dict(self, include_id:bool=True) -> dict:
        """Serialize the builder, adding the File/URL column."""
        serialized = super()._to_dict(include_id=include_id)
        serialized["File/URL"] = self.file_url
        return serialized

    @staticmethod
    def sample_file_path_is_url() -> bool:
        return False

    def get_sample_file_path(self, resources_base_dir:str) -> Union[str,None]:
        # ignored resources have no sample file
        return None

    def load_sample_data(self, resources_base_dir:str) -> Union[bytes,None]:
        # no-op: nothing to load for an ignored resource
        return None

    def load_sample_df(self, resources_base_dir: str, *, upload_alter: bool = True) -> None:
        # no-op: nothing to load for an ignored resource
        return None

    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[ContextErrorLevelMessage,None]:
        # nothing to check: no upload is ever performed
        return None

    def patch_request(self, ckan:CkanApi, package_id:str, *,
                      reupload:bool=None, resources_base_dir:str=None,
                      payload:Union[bytes, io.BufferedIOBase]=None) -> None:
        # deliberately a no-op: the resource is ignored on the server side
        return None

    def download_request(self, ckan: CkanApi, out_dir: str, *, full_download: bool = True, force: bool = False,
                         threads: int = 1) -> Any:
        # no-op: nothing to download
        return None

    def download_sample(self, ckan: CkanApi, full_download: bool = True, **kwargs) -> bytes:
        # no-op: empty payload
        return bytes()
561
+