ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,278 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to initiate a DataStore defined by a large number of files to concatenate into one table.
5
+ This concrete implementation is linked to the file system.
6
+ """
7
+ from typing import Dict, List, Collection, Any, Tuple, Generator, Union, Set
8
+ from collections import OrderedDict
9
+ from warnings import warn
10
+ import glob
11
+ import copy
12
+
13
+ import pandas as pd
14
+
15
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
16
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import assert_or_raise
17
+ from ckanapi_harvesters.builder.mapper_datastore import DataSchemeConversion
18
+ # from ckanapi_harvesters.auxiliary.path import list_files_scandir
19
+ from ckanapi_harvesters.builder.builder_errors import ResourceFileNotExistMessage
20
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import BuilderDataStoreMultiABC
21
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import datastore_multi_apply_last_condition_intermediary
22
+ from ckanapi_harvesters.builder.builder_field import BuilderField
23
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanField, CkanResourceInfo, UpsertChoice
24
+ from ckanapi_harvesters.ckan_api import CkanApi
25
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element
26
+ from ckanapi_harvesters.builder.mapper_datastore_multi import RequestMapperABC, RequestFileMapperABC
27
+ from ckanapi_harvesters.builder.mapper_datastore_multi import default_file_mapper_from_primary_key
28
+ from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreFile
29
+ from ckanapi_harvesters.harvesters.harvester_abc import TableHarvesterABC
30
+ from ckanapi_harvesters.harvesters.harvester_init import init_table_harvester_from_options_string
31
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_folder import BuilderDataStoreFolder
32
+
33
+
34
class BuilderDataStoreHarvester(BuilderDataStoreFolder):
    """
    Builder for a CKAN DataStore resource whose contents come from an external
    table harvester (e.g. a database connector) instead of local files.

    Each "file" handled by the parent multi-file logic is actually a query
    understood by the configured harvester (see :class:`TableHarvesterABC`).
    NOTE(review): the harvester is configured through ``options_string`` and may
    only be set once (enforced by the ``harvester`` property setter).
    """
    def __init__(self, *, file_query_list: List[Tuple[str,dict]]=None, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None, dir_name:str=None, file_url_attr:str=None, options_string:str=None, base_dir:str=None):
        """
        :param file_query_list: optional explicit list of (file, query) pairs
        :param name: resource name
        :param format: resource format (defaults to "CSV" once harvester metadata is applied)
        :param description: resource description
        :param resource_id: known resource id, if any
        :param download_url: known download url, if any
        :param dir_name: local directory name (overridden by harvester metadata)
        :param file_url_attr: attribute/column holding the file URL in harvested data
        :param options_string: harvester configuration string; when non-empty the harvester is built immediately
        :param base_dir: base directory forwarded to the harvester initialization
        """
        super().__init__(file_query_list=file_query_list, dir_name=dir_name,
                         name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        self.options_string = options_string
        # harvester queries are sequential by nature: disable multi-threaded upload
        self.enable_multi_threaded_upload = False
        # specific attributes
        self.file_url_attr:Union[str,None] = file_url_attr
        self._harvester: Union[TableHarvesterABC,None] = None
        if self.options_string is not None and len(self.options_string) > 0:
            self._apply_options(base_dir=base_dir)

    @property
    def harvester(self) -> Union[TableHarvesterABC,None]:
        """The table harvester providing the data, or None if not configured yet."""
        return self._harvester

    @harvester.setter
    def harvester(self, harvester: Union[TableHarvesterABC,None]):
        # The harvester may only be set once; setting it applies its metadata
        # (dir_name, fields, primary key, ...) to this builder.
        assert_or_raise(self._harvester is None, RuntimeError("You can only set the harvester once"))
        self._harvester = harvester
        self._apply_harvester_metadata()

    def _apply_options(self, base_dir: str = None):
        """Build the harvester from ``self.options_string`` and attach it."""
        self.harvester = init_table_harvester_from_options_string(self.options_string, file_url_attr=self.file_url_attr, base_dir=base_dir)

    def init_options_from_ckan(self, ckan:CkanApi) -> None:
        """Refresh builder and harvester options from the CKAN server."""
        super().init_options_from_ckan(ckan)
        self.harvester.update_from_ckan(ckan)

    def _apply_harvester_metadata(self, base_dir:str=None):
        """
        Import default metadata (directory, mapper, primary key, fields, ...)
        from the attached harvester into this builder.  Builder attributes that
        were already set by the user are kept (only ``None`` values are filled in).
        """
        self.dir_name = self.name  # by default, take the resource name
        if self.harvester.params.output_dir is not None:
            self.dir_name = self.harvester.params.output_dir
        if self.harvester.params.enable_download is not None:
            self.enable_download = self.harvester.params.enable_download
        # import default metadata
        table_metadata = self.harvester.clean_table_metadata()
        if self.df_mapper.df_upload_fun is None:
            self.df_mapper.df_upload_fun = self.harvester.get_default_df_upload_fun()
        if self.data_cleaner_upload is None:
            self.data_cleaner_upload = self.harvester.get_default_data_cleaner()
        if self.primary_key is None:
            self.primary_key = self.harvester.get_default_primary_key()
        if self.indexes is None:
            self.indexes = table_metadata.indexes
        if self.description is None:
            self.description = table_metadata.description
        if self.format is None:
            self.format = "CSV"
        if table_metadata.fields is not None:
            if self.field_builders is None:
                self.field_builders = OrderedDict()
            for field_name, field_metadata in table_metadata.fields.items():
                if field_name in self.field_builders.keys():
                    field_builder = self.field_builders[field_name]
                    if field_builder.type_override is None:
                        field_builder.type_override = field_metadata.data_type
                else:
                    field_builder = BuilderField(name=field_metadata.name,
                                                 type_override=field_metadata.data_type)
                if field_builder.label is None:
                    field_builder.label = field_metadata.label
                if field_builder.description is None:
                    field_builder.description = field_metadata.description
                if field_builder.uniquekey is None:
                    field_builder.uniquekey = field_metadata.uniquekey or (table_metadata.unique_keys is not None and field_name in table_metadata.unique_keys)
                if field_builder.is_index is None:
                    field_builder.is_index = field_metadata.is_index
                if field_builder.notnull is None:
                    field_builder.notnull = field_metadata.notnull
                field_builder.internal_attrs = field_metadata.internal_attrs.merge(field_builder.internal_attrs)
                self.field_builders[field_name] = field_builder
        if table_metadata.unique_keys is not None and len(table_metadata.unique_keys) > 0:
            if self.field_builders is None:
                self.field_builders = OrderedDict()
            for field_name in table_metadata.unique_keys:
                if field_name in self.field_builders.keys():
                    field_builder = self.field_builders[field_name]
                    if field_builder.uniquekey is None:
                        field_builder.uniquekey = True
                else:
                    pass  # because we do not know the data type

    def copy(self, *, dest=None):
        """
        Return a copy of this builder.  The harvester instance itself is shared
        with the copy (not deep-copied).
        """
        if dest is None:
            dest = BuilderDataStoreHarvester()
        super().copy(dest=dest)
        dest.file_url_attr = self.file_url_attr
        if self._harvester is not None:
            # fix: guard against a None harvester -- assigning None through the
            # property setter would call _apply_harvester_metadata() and crash
            # on self.harvester.params with an AttributeError.
            dest.harvester = self._harvester
        return dest

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None) -> None:
        """Initialize this builder from a row of the metadata worksheet."""
        super()._load_from_df_row(row=row)
        self.df_mapper = default_file_mapper_from_primary_key(self.primary_key)
        self.dir_name = ""
        self.file_url_attr: str = _string_from_element(row["file/url"])
        if self.options_string is not None and len(self.options_string) > 0:
            self._apply_options(base_dir=base_dir)

    def _to_dict(self, include_id:bool=True) -> dict:
        """Serialize builder metadata (inverse of _load_from_df_row)."""
        d = super()._to_dict(include_id=include_id)
        d["File/URL"] = self.file_url_attr
        return d

    @staticmethod
    def resource_mode_str() -> str:
        """Human-readable label of this resource mode."""
        return "DataStore from Harvester"

    @staticmethod
    def from_file_datastore(resource_file: BuilderDataStoreFile,
                            *, dir_name:str=None, primary_key:List[str]=None,
                            file_query_list:Collection[Tuple[str,dict]]=None) -> "BuilderDataStoreHarvester":
        """
        Do not initialize a BuilderDataStoreHarvester with this method. Rather initialize a new instance of the class.

        :raises NotImplementedError:
        """
        raise NotImplementedError("This method must not be called for a DataStore from Harvester. Rather initialize a new BuilderDataStoreHarvester.")

    ## upload is specific to this class ---------------------------------------------------
    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
        """Pre-upload check: verify the harvester connection instead of local files."""
        return self.harvester.check_connection()

    def get_sample_file_path(self, resources_base_dir:str, file_index:int=0) -> Union[Any,None]:
        """Return one harvester query (the "file" abstraction of this class)."""
        self.list_local_files(resources_base_dir=resources_base_dir)
        return self.local_file_list[file_index]

    def load_local_df(self, file: str, *, upload_alter:bool=True, fields:OrderedDict[str,CkanField]=None) -> pd.DataFrame:
        """
        Load data for one harvester query and apply the upload transformation.

        :param file: a harvester query (not an actual file path)
        :param upload_alter: must be True for this class
        :param fields: known CKAN field information forwarded to the mapper
        :raises RuntimeError: when upload_alter is False
        """
        self.sample_data_source = file
        data_local = self.harvester.query_data(query=file)
        if upload_alter:
            df_upload = self.df_mapper.df_upload_alter(data_local, self.sample_data_source, fields=self._get_fields_info())
            return df_upload
        else:
            raise RuntimeError("upload_alter must be True for a DataStore from Harvester.")

    def get_local_file_generator(self, resources_base_dir:str, **kwargs) -> Generator[Any, None, None]:
        """Yield the harvester queries one by one."""
        self.list_local_files(resources_base_dir=resources_base_dir)
        for query in self.local_file_list:
            yield query

    def list_local_files(self, resources_base_dir:str, cancel_if_present:bool=True) -> List[Any]:
        """
        List the harvester queries (cached in self.local_file_list).

        :param cancel_if_present: when True, return the cached list if available
        """
        if cancel_if_present and self.local_file_list is not None:
            return self.local_file_list
        self.local_file_list = self.harvester.list_queries(new_connection=not cancel_if_present)
        return self.local_file_list

    def init_local_files_list(self, resources_base_dir:str, cancel_if_present:bool=True, **kwargs) -> List[str]:
        """Alias of list_local_files, kept for interface compatibility."""
        return self.list_local_files(resources_base_dir=resources_base_dir, cancel_if_present=cancel_if_present)

    def get_local_df_generator(self, resources_base_dir:str, *, fields:OrderedDict[str,CkanField], **kwargs) -> Generator[pd.DataFrame, None, None]:
        return super().get_local_df_generator(resources_base_dir=resources_base_dir, fields=fields, **kwargs)

    def get_local_file_len(self) -> int:
        """Number of harvester queries; requires list_local_files to have been called."""
        if self.local_file_list is None:
            raise RuntimeError("You must call list_local_files first")
        return len(self.local_file_list)

    def upsert_request_df(self, ckan: CkanApi, df_upload:pd.DataFrame,
                          method:UpsertChoice=UpsertChoice.Upsert,
                          apply_last_condition:bool=None, always_last_condition:bool=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Call to ckan datastore_upsert.
        Before sending the DataFrame, a call to df_upload_alter is made.
        This implementation optionally checks for the last line of the DataFrame based on the first columns of the primary key,
        so that an interrupted transfer can be resumed from the last inserted row.

        :param ckan: CKAN API client
        :param df_upload: data to send
        :param method: upsert method
        :param apply_last_condition: defaults to True when None
        :param always_last_condition: forwarded to datastore_upsert
        :return: (transformed upload DataFrame, DataFrame returned by the upsert or None if up to date)
        """
        if apply_last_condition is None:
            apply_last_condition = True  # datastore_multi_apply_last_condition_intermediary
        resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=True)
        df_upload_local = df_upload
        df_upload_transformed = self.df_mapper.df_upload_alter(df_upload_local, fields=self._get_fields_info())
        file_query = self.df_mapper.get_file_query_of_df(df_upload_transformed)
        if file_query is not None:
            i_restart, upload_needed, row_count, df_row = self.df_mapper.last_inserted_index_request(ckan=ckan,
                    resource_id=resource_id, df_upload=df_upload_transformed, file_query=file_query)
        else:
            i_restart, upload_needed, row_count, df_row = 0, True, -1, None
        if upload_needed:
            if i_restart > 0 and ckan.params.verbose_extra:
                print(f"Starting transfer from index {i_restart}")
            ret_df = ckan.datastore_upsert(df_upload_transformed.iloc[i_restart:], resource_id, method=method,
                                           apply_last_condition=apply_last_condition,
                                           always_last_condition=always_last_condition, data_cleaner=self.data_cleaner_upload)
        elif 0 <= row_count and row_count < len(df_row):
            # typo fix: "because is was shorter" -> "because it was shorter"
            msg = "Sending full dataframe because it was shorter on server side"
            warn(msg)
            ret_df = ckan.datastore_upsert(df_upload_transformed, resource_id, method=method,
                                           apply_last_condition=apply_last_condition,
                                           always_last_condition=always_last_condition, data_cleaner=self.data_cleaner_upload)
        else:
            if ckan.params.verbose_extra:
                print(f"File up to date on server side")
            ret_df = None
        return df_upload_transformed, ret_df
@@ -0,0 +1,145 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to upload metadata to the CKAN server to create/update an existing package
5
+ The metadata is defined by the user in an Excel worksheet
6
+ This file implements functions to initiate a DataStore without uploading any data.
7
+ """
8
+ import time
9
+ from abc import ABC, abstractmethod
10
+ from typing import Dict, List, Callable, Any, Tuple, Union, Set
11
+ import os
12
+ from io import StringIO
13
+ from warnings import warn
14
+ import copy
15
+
16
+ import pandas as pd
17
+
18
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
19
+ from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreFile, num_rows_patch_first_upload_partial
20
+ # from ckanapi_harvesters.builder.builder_resource import BuilderResourceUnmanagedABC
21
+ from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice
22
+ from ckanapi_harvesters.auxiliary.ckan_errors import NotMappedObjectNameError, DataStoreNotFoundError
23
+ from ckanapi_harvesters.builder.builder_errors import RequiredDataFrameFieldsError, IncompletePatchError
24
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo, CkanDataStoreInfo
25
+ from ckanapi_harvesters.ckan_api import CkanApi
26
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element, assert_or_raise, find_duplicates, datastore_id_col
27
+
28
+
29
class BuilderDataStoreUnmanaged(BuilderDataStoreFile):  # multiple inheritance with BuilderResourceUnmanagedABC avoided: it can give undefined results
    """
    Class representing a DataStore (resource metadata and fields metadata) without managing its contents during the upload process.

    Only the metadata and the field documentation are patched; the data itself is
    maintained by other means (optionally initiated from ``default_df_upload``).
    """
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None):
        """
        :param name: resource name (also used as file_name)
        :param format: resource format
        :param description: resource description
        :param resource_id: known resource id, if any
        :param download_url: known download url, if any
        """
        super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        self.reupload_on_update = False
        self.reupload_if_needed = True
        # when True, the DataStore is expected to be initiated by the user, not by this builder
        self.initiate_by_user:bool = False
        self.file_name = name
        # optional data used to initiate the DataStore when nothing exists server-side
        self.default_df_upload: Union[pd.DataFrame,None] = None

    def copy(self, *, dest=None):
        """Return a copy of this builder (default_df_upload is deep-copied)."""
        if dest is None:
            dest = BuilderDataStoreUnmanaged()
        super().copy(dest=dest)
        dest.reupload_on_update = self.reupload_on_update
        dest.reupload_if_needed = self.reupload_if_needed
        dest.initiate_by_user = self.initiate_by_user
        dest.file_name = self.file_name
        dest.default_df_upload = copy.deepcopy(self.default_df_upload)
        return dest

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """Initialize this builder from a row of the metadata worksheet."""
        super()._load_from_df_row(row=row)
        self.file_name = self.name

    def get_sample_file_path(self, resources_base_dir: str) -> None:
        """No local file is associated with an unmanaged DataStore."""
        return None

    def load_sample_df(self, resources_base_dir:str, *, upload_alter:bool=True) -> Union[pd.DataFrame,None]:
        """No local data: nothing to load."""
        return None

    @staticmethod
    def resource_mode_str() -> str:
        """Human-readable label of this resource mode."""
        return "Unmanaged DataStore"

    def _to_dict(self, include_id:bool=True) -> dict:
        """Serialize builder metadata; an unmanaged DataStore has no File/URL."""
        d = super()._to_dict(include_id=include_id)
        d["File/URL"] = ""
        return d

    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
        """Nothing to check: no local file, no harvester."""
        return None

    def patch_request(self, ckan: CkanApi, package_id: str, *,
                      df_upload: pd.DataFrame=None,
                      reupload: bool = None, resources_base_dir:str=None) -> CkanResourceInfo:
        """
        Specific implementation of patch_request which does not upload any data and only updates the fields currently present in the database

        :param resources_base_dir: unused here (no local files are involved)
        :param ckan: CKAN API client
        :param package_id: id of the package owning the resource
        :param df_upload: optional data used to initiate the DataStore; defaults to self.default_df_upload
        :param reupload: force re-upload of the initiating data
        :raises IncompletePatchError: when part of the data could not be sent
        :return: information on the created/updated resource
        """
        if df_upload is None:
            df_upload = self.default_df_upload
        if reupload is None: reupload = self.reupload_on_update and df_upload is not None
        resource_id = self.get_or_query_resource_id(ckan, error_not_found=False)
        if df_upload is None:
            # no data to upload: inspect the fields currently present server-side
            try:
                df_download = self.download_sample_df(ckan, search_all=False, download_alter=False, limit=1)
                if df_download is None:
                    assert_or_raise(resource_id is None, RuntimeError("Unexpected: resource_id should be None"))
                    raise NotMappedObjectNameError(self.name)
                current_fields = set(df_download.columns)
            except (NotMappedObjectNameError, DataStoreNotFoundError):
                # resource or datastore does not exist yet: no current fields
                # (the two original identical handlers were merged into one clause)
                df_download = None
                current_fields = set()
            df_upload_partial, df_upload_upsert = None, None
            data_cleaner_fields = None
            data_cleaner_index = set()
        else:
            df_upload, data_cleaner_fields, data_cleaner_index = self._apply_data_cleaner_before_patch(ckan, df_upload, reupload=reupload)
            df_download = df_upload
            current_fields = set(df_upload.columns)
            # large initiating data is split: a first part sent with resource_create,
            # the remainder upserted afterwards
            if num_rows_patch_first_upload_partial is not None and len(df_upload) > num_rows_patch_first_upload_partial:
                df_upload_partial = df_upload.iloc[:num_rows_patch_first_upload_partial]
                df_upload_upsert = df_upload.iloc[num_rows_patch_first_upload_partial:]
            else:
                df_upload_partial, df_upload_upsert = df_upload, None
        empty_datastore = df_download is None or len(df_download) == 0
        current_fields -= {datastore_id_col}  # _id does not require documentation
        execute_datastore_create = df_upload_partial is not None or not (self.initiate_by_user and (df_download is None or df_download.empty))
        aliases = self._get_alias_list(ckan)
        self._check_necessary_fields(current_fields, raise_error=False, empty_datastore=empty_datastore)
        self._check_undocumented_fields(current_fields)
        primary_key, indexes = self._get_primary_key_indexes(data_cleaner_index, current_fields=current_fields,
                                                             error_missing=False, empty_datastore=empty_datastore)
        fields_update = self._get_fields_update(ckan, current_fields, data_cleaner_fields, reupload=reupload)
        fields = list(fields_update.values()) if len(fields_update) > 0 else None
        resource_info = ckan.resource_create(package_id, name=self.name, format=self.format, description=self.description, state=self.state,
                                             create_default_view=self.create_default_view,
                                             cancel_if_exists=True, update_if_exists=True, reupload=reupload and df_upload_partial is not None,
                                             datastore_create=execute_datastore_create, records=df_upload_partial, fields=fields,
                                             primary_key=primary_key, indexes=indexes, aliases=aliases, data_cleaner=self.data_cleaner_upload)
        reupload = reupload or resource_info.newly_created
        resource_id = resource_info.id
        self.known_id = resource_id
        self._compare_fields_to_datastore_info(resource_info, current_fields, ckan)
        if df_upload_upsert is not None:
            # fix: the outer condition previously also required `reupload`, which
            # made the error branch below unreachable; remaining rows must either
            # be sent or reported as an incomplete patch.
            if reupload:
                ckan.datastore_upsert(df_upload_upsert, resource_id, method=UpsertChoice.Insert,
                                      always_last_condition=None, data_cleaner=self.data_cleaner_upload)
            else:
                # case where a reupload was needed but is not permitted by self.reupload_if_needed
                msg = f"Did not upload the remaining part of the resource {self.name}."
                raise IncompletePatchError(msg)
        return resource_info
+
@@ -0,0 +1,150 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to upload metadata to the CKAN server to create/update an existing package
5
+ The metadata is defined by the user in an Excel worksheet
6
+ This file implements functions to initiate a DataStore without uploading any data.
7
+ """
8
+ import time
9
+ from abc import ABC, abstractmethod
10
+ from typing import Dict, List, Callable, Any, Tuple, Union, Set
11
+ import os
12
+ import io
13
+ from warnings import warn
14
+
15
+ import pandas as pd
16
+
17
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
18
+ from ckanapi_harvesters.builder.builder_resource import builder_request_default_auth_if_ckan
19
+ from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreFile
20
+ from ckanapi_harvesters.auxiliary.ckan_errors import NotMappedObjectNameError, DataStoreNotFoundError
21
+ from ckanapi_harvesters.builder.builder_errors import RequiredDataFrameFieldsError, ResourceFileNotExistMessage
22
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo, CkanDataStoreInfo
23
+ from ckanapi_harvesters.auxiliary.ckan_errors import CkanArgumentError, FunctionMissingArgumentError, ExternalUrlLockedError
24
+ from ckanapi_harvesters.ckan_api import CkanApi
25
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element, assert_or_raise, find_duplicates, datastore_id_col
26
+ from ckanapi_harvesters.ckan_api.ckan_api_2_readonly import df_download_read_csv_kwargs
27
+
28
+
29
class BuilderDataStoreUrl(BuilderDataStoreFile): #, BuilderUrlABC): # multiple inheritance can give undefined results
    """
    Class representing a DataStore (resource metadata and fields metadata) defined by a url.

    Unlike a file-backed DataStore, no data payload is pushed from the client:
    the CKAN server fetches the resource content itself from ``self.url``
    (see :meth:`patch_request`, which calls ``resource_create`` with ``url=``
    and no payload).
    """
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None, url:str=None):
        # NOTE: super(BuilderDataStoreFile, self) deliberately SKIPS
        # BuilderDataStoreFile.__init__ and calls its parent's __init__ instead;
        # the commented-out second base class suggests this emulates multiple
        # inheritance without its pitfalls (see class-line comment).
        super(BuilderDataStoreFile, self).__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        # super(BuilderUrlABC, self).__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url, url=url)
        self.reupload_on_update = False  # default: do not re-submit data on update
        self.reupload_if_needed = False
        self.url:str = url  # remote location CKAN will pull the data from
        self.file_name = name  # for URL resources the "file name" is the resource name

    def copy(self, *, dest=None):
        """
        Copy this builder into *dest* (a new BuilderDataStoreUrl if omitted).

        :param dest: optional pre-built target instance (used by subclasses).
        :return: the populated *dest* instance.
        """
        if dest is None:
            dest = BuilderDataStoreUrl()
        super().copy(dest=dest)
        dest.reupload_on_update = self.reupload_on_update
        dest.reupload_if_needed = self.reupload_if_needed
        dest.url = self.url
        dest.file_name = self.file_name
        return dest

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """
        Populate this builder from one row of the user's metadata worksheet.

        :param row: worksheet row; the URL is read from the "file/url" column.
        :param base_dir: unused for URL-defined resources.
        """
        # Same deliberate super() skip as in __init__ (bypasses BuilderDataStoreFile).
        super(BuilderDataStoreFile, self)._load_from_df_row(row=row)
        # super(BuilderUrlABC, self)._load_from_df_row(row=row)
        self.url: str = _string_from_element(row["file/url"])
        self.file_name = self.name

    @staticmethod
    def sample_file_path_is_url() -> bool:
        # The "sample file path" reported by this builder is the remote URL itself.
        return True

    def get_sample_file_path(self, resources_base_dir: str) -> str:
        """Return the sample data location: the resource URL (*resources_base_dir* is unused)."""
        return self.url

    def load_sample_data(self, resources_base_dir:str, *, ckan:CkanApi=None,
                         proxies:dict=None, headers:dict=None) -> bytes:
        """
        Download the raw bytes of the resource from its URL via the CKAN proxy helper.

        :param resources_base_dir: unused for URL-defined resources.
        :param ckan: CKAN client used to perform the download (required).
        :param proxies: optional proxy configuration forwarded to the request.
        :param headers: optional HTTP headers forwarded to the request.
        :return: response body as bytes.
        :raises FunctionMissingArgumentError: if *ckan* is not provided.
        """
        self.sample_source = self.url
        if ckan is None:
            raise FunctionMissingArgumentError("BuilderDataStoreUrl.load_sample_data", "ckan")
        return ckan.download_url_proxy(self.url, proxies=proxies, headers=headers, auth_if_ckan=builder_request_default_auth_if_ckan).content

    def load_sample_df(self, resources_base_dir:str, *, upload_alter:bool=True) -> pd.DataFrame:
        """
        Download the sample data and parse it into a DataFrame.

        :param resources_base_dir: unused for URL-defined resources.
        :param upload_alter: when True, apply the upload mapping/cleaning step.
        :return: parsed (and optionally altered) DataFrame.
        """
        # NOTE(review): load_sample_data is called here without a ckan client,
        # which as written raises FunctionMissingArgumentError -- confirm whether
        # callers are expected to override or pre-bind the client.
        payload = self.load_sample_data(resources_base_dir=resources_base_dir)
        buffer = io.StringIO(payload.decode())
        response_df = self.local_file_format.read_buffer(buffer, fields=self._get_fields_info())
        if upload_alter:
            # NOTE(review): reads self.sample_data_source while load_sample_data
            # sets self.sample_source -- confirm the attribute name is intended.
            df_upload = self.df_mapper.df_upload_alter(response_df, self.sample_data_source, fields=self._get_fields_info())
            return df_upload
        else:
            return response_df

    @staticmethod
    def resource_mode_str() -> str:
        """Human-readable label of this resource mode (used in reports/logs)."""
        return "DataStore from URL"

    def _to_dict(self, include_id:bool=True) -> dict:
        """Serialize this builder, adding the URL under the "File/URL" key."""
        d = super()._to_dict(include_id=include_id)
        d["File/URL"] = self.url
        return d

    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
        """
        Check that the remote URL is reachable (HEAD request through the proxy).

        :return: a warning message object if *ckan* is missing, otherwise the
            result of the HEAD check (None on success, per the return annotation).
        """
        if ckan is None:
            return ResourceFileNotExistMessage(self.name, ErrorLevel.Warning, "Could not determine if resource url exists because ckan argument was not provided.")
        else:
            return ckan.download_url_proxy_test_head(self.url, **kwargs)

    def patch_request(self, ckan: CkanApi, package_id: str, *,
                      df_upload:pd.DataFrame=None, payload:Union[bytes, io.BufferedIOBase]=None,
                      reupload: bool = None, resources_base_dir:str=None) -> CkanResourceInfo:
        """
        Specific implementation of patch_request which does not upload any data and only updates the fields currently present in the database
        :param resources_base_dir: unused for URL-defined resources
        :param ckan: CKAN client used to issue the API actions
        :param package_id: id of the package the resource belongs to
        :param df_upload: must be None (URL resources carry no payload)
        :param payload: must be None (URL resources carry no payload)
        :param reupload: force a DataStore re-initialization; defaults to self.reupload_on_update
        :return: the CkanResourceInfo of the created/updated resource
        :raises CkanArgumentError: if a payload or DataFrame is supplied
        """
        if reupload is None: reupload = self.reupload_on_update
        if payload is not None or df_upload is not None:
            # URL-backed resources never carry an explicit data payload
            raise CkanArgumentError("payload", "datastore defined from URL patch")
        resource_id = self.get_or_query_resource_id(ckan, error_not_found=False)
        try:
            # probe the existing DataStore (limit=1) to learn the current columns
            df_download = self.download_sample_df(ckan, download_alter=False, search_all=False, limit=1)
            if df_download is None:
                assert_or_raise(resource_id is None, RuntimeError("Unexpected: resource_id should be None"))
                raise NotMappedObjectNameError(self.name)
            current_fields = set(df_download.columns)
        except NotMappedObjectNameError as e:
            # resource is not mapped/does not exist yet: start with no known fields
            df_download = None
            current_fields = set()
        except DataStoreNotFoundError as e:
            # resource exists but has no DataStore yet: start with no known fields
            df_download = None
            current_fields = set()
        empty_datastore = df_download is None or len(df_download) == 0
        data_cleaner_fields = None
        data_cleaner_index = set()
        current_fields -= {datastore_id_col} # _id does not require documentation
        aliases = self._get_alias_list(ckan)
        self._check_necessary_fields(current_fields, raise_error=False, empty_datastore=empty_datastore)
        self._check_undocumented_fields(current_fields)
        primary_key, indexes = self._get_primary_key_indexes(data_cleaner_index, current_fields=current_fields,
                                                            error_missing=False, empty_datastore=empty_datastore)
        fields_update = self._get_fields_update(ckan, current_fields, data_cleaner_fields, reupload=reupload)
        fields = list(fields_update.values()) if len(fields_update) > 0 else None
        # Create/update the resource pointing CKAN at the URL: no payload,
        # no datastore_create, no auto_submit (CKAN harvests the URL itself).
        resource_info = ckan.resource_create(package_id, name=self.name, format=self.format, description=self.description, state=self.state,
                                             url=self.url,
                                             datastore_create=False, auto_submit=False, create_default_view=self.create_default_view,
                                             cancel_if_exists=True, update_if_exists=True, aliases=aliases, reupload=False, data_cleaner=self.data_cleaner_upload)
        resource_id = resource_info.id
        self.known_id = resource_id
        self._compare_fields_to_datastore_info(resource_info, current_fields, ckan)
        if reupload:
            # re-initialize datastore to reupload from url
            # normally, data was automatically submitted to DataStore on resource_create (not needed)
            ckan.datastore_create(resource_id, fields=fields, primary_key=primary_key, indexes=indexes, aliases=aliases)
            ckan.datastore_submit(resource_id)
        return resource_info
148
+
149
+
150
+