ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/METADATA +74 -38
  103. ckanapi_harvesters-0.0.2.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/top_level.txt +0 -0
ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py
@@ -0,0 +1,367 @@
+ #!python3
+ # -*- coding: utf-8 -*-
+ """
+ Code to initiate a DataStore defined by a large number of files to concatenate into one table.
+ """
+ from concurrent.futures import ThreadPoolExecutor
+ import threading
+ from threading import current_thread
+ from abc import ABC, abstractmethod
+ from typing import Dict, List, Callable, Any, Tuple, Generator, Union, Set
+ from warnings import warn
+ import copy
+
+ import pandas as pd
+
+ from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreABC
+ from ckanapi_harvesters.builder.builder_aux import positive_end_index
+ from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice, CkanResourceInfo
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import datastore_id_col
+ from ckanapi_harvesters.ckan_api import CkanApi
+ from ckanapi_harvesters.builder.mapper_datastore_multi import RequestFileMapperABC, default_file_mapper_from_primary_key
+ from ckanapi_harvesters.builder.builder_resource_multi_file import BuilderMultiABC, default_progress_callback
+
+ # apply last_condition for each upsert request during a multi-threaded upload to the same DataStore:
+ datastore_multi_threaded_always_last_condition:bool = True
+ # when there are multiple files, apply the final insertion commands after each part? True: after each CSV file; False: only at the end
+ datastore_multi_apply_last_condition_intermediary:bool = False
+
+
+ class BuilderDataStoreMultiABC(BuilderDataStoreABC, BuilderMultiABC, ABC):
+     """
+     Generic class to manage a large DataStore divided into files/parts.
+     This abstract class is intended to be overloaded in order to generate data from the workspace without using CSV files.
+     """
+
+     def __init__(self, *, name:str=None, format:str=None, description:str=None,
+                  resource_id:str=None, download_url:str=None, dirname:str=None):
+         super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
+         # Functions inputs/outputs
+         self.df_mapper: RequestFileMapperABC = default_file_mapper_from_primary_key(self.primary_key)
+         self.reupload_if_needed = False  # do not reupload if needed because this could cause data loss (the upload function only uploads the first table)
+         self.upsert_method: UpsertChoice = UpsertChoice.Upsert
+         # BuilderMultiABC:
+         self.stop_event = threading.Event()
+         self.thread_ckan: Dict[str, CkanApi] = {}
+         self.progress_callback: Union[Callable[[int, int, Any], None], None] = default_progress_callback
+         self.progress_callback_kwargs: dict = {}
+         self.enable_multi_threaded_upload:bool = True
+         self.enable_multi_threaded_download:bool = True
+
+     def copy(self, *, dest=None):
+         super().copy(dest=dest)
+         dest.reupload_if_needed = self.reupload_if_needed
+         # BuilderMultiABC:
+         dest.progress_callback = self.progress_callback
+         dest.progress_callback_kwargs = copy.deepcopy(self.progress_callback_kwargs)
+         dest.enable_multi_threaded_upload = self.enable_multi_threaded_upload
+         dest.enable_multi_threaded_download = self.enable_multi_threaded_download
+         # do not copy stop_event
+         return dest
+
+     ## upload ---------
+     @abstractmethod
+     def get_local_df_generator(self, resources_base_dir:str) -> Generator[pd.DataFrame, None, None]:
+         """
+         Returns an iterator over the parts of the upload, loaded as DataFrames (not recommended in a multi-threaded context).
+         """
+         raise NotImplementedError()
+
+     @abstractmethod
+     def load_local_df(self, file: Any, **kwargs) -> pd.DataFrame:
+         """
+         Load the DataFrame pointed to by the upload part "file"
+         """
+         raise NotImplementedError()
+
+     # do not change default argument apply_last_condition=True
+     # def upsert_request_df(self, ckan: CkanApi, df_upload:pd.DataFrame,
+     #                       method:UpsertChoice=UpsertChoice.Upsert,
+     #                       apply_last_condition:bool=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
+     #     # calls super method, with apply_last_condition defaulting to datastore_multi_apply_last_condition_intermediary
+     #     if apply_last_condition is None:
+     #         apply_last_condition = True  # datastore_multi_apply_last_condition_intermediary
+     #     return super().upsert_request_df(ckan=ckan, df_upload=df_upload, method=method,
+     #                                      apply_last_condition=apply_last_condition)
+
+     def _get_primary_key_indexes(self, data_cleaner_index: Set[str], current_fields:Set[str], error_missing:bool, empty_datastore:bool=False) -> Tuple[Union[List[str],None], Union[List[str],None]]:
+         primary_key, indexes = super()._get_primary_key_indexes(data_cleaner_index, current_fields, error_missing, empty_datastore)
+         # it is highly recommended to specify a primary key: warning if not defined
+         if primary_key is None:
+             msg = f"It is highly recommended to specify the primary key for a DataStore defined from a directory to ensure no duplicate values are upserted to the database. Resource: {self.name}"
+             warn(msg)
+         else:
+             ultra_required_fields = set(primary_key)
+             missing_fields = ultra_required_fields
+             if current_fields is not None:
+                 missing_fields -= current_fields
+             if len(missing_fields) > 0:
+                 msg = f"The primary key {self.primary_key} is set for resource {self.name} but it is not present in the sample data."
+                 warn(msg)
+         if primary_key is None or len(primary_key) == 0:
+             self.upsert_method = UpsertChoice.Insert  # do not use upsert
+         return primary_key, indexes
+
+     def upsert_request_final(self, ckan: CkanApi, *, force:bool=False) -> None:
+         """
+         Final steps after the last upsert query.
+         This call is mandatory at the end of all requests if the user called upsert_request_df manually for a multi-part DataStore.
+
+         :param ckan:
+         :param force: perform the request anyway
+         :return:
+         """
+         force = force or not datastore_multi_apply_last_condition_intermediary
+         return super().upsert_request_final(ckan, force=force)
+
+     def upload_request_final(self, ckan: CkanApi, *, force:bool=False) -> None:
+         return self.upsert_request_final(ckan=ckan, force=force)
+
+     def upsert_request_df_no_return(self, ckan: CkanApi, df_upload:pd.DataFrame,
+                                     method:UpsertChoice=UpsertChoice.Upsert,
+                                     apply_last_condition:bool=None, always_last_condition:bool=None) -> None:
+         """
+         Calls upsert_request_df but does not return anything.
+
+         :return:
+         """
+         self.upsert_request_df(ckan=ckan, df_upload=df_upload, method=method,
+                                apply_last_condition=apply_last_condition, always_last_condition=always_last_condition)
+         return None
+
+     def _unit_upload_apply(self, *, ckan: CkanApi, file: str,
+                            index: int, start_index: int, end_index: int, total: int,
+                            method: UpsertChoice, **kwargs) -> None:
+         if index == 0 and self.upsert_method == UpsertChoice.Insert:
+             return  # do not reupload the first document, which was used for the initialization of the dataset
+         if start_index <= index and index < end_index:
+             df_upload_local = self.load_local_df(file, **kwargs)
+             self._call_progress_callback(index, total, info=df_upload_local,
+                                          context=f"{ckan.identifier} single-thread upload")
+             self.upsert_request_df_no_return(ckan=ckan, df_upload=df_upload_local, method=method,
+                                              apply_last_condition=datastore_multi_apply_last_condition_intermediary)
+         else:
+             # self._call_progress_callback(index, total, info=df_upload_local, context=f"{ckan.identifier} single-thread skip")
+             pass
+
+     def upload_request_full(self, ckan:CkanApi, resources_base_dir:str, *,
+                             method:UpsertChoice=None,
+                             threads:int=1, external_stop_event=None,
+                             only_missing:bool=False,
+                             start_index:int=0, end_index:int=None, **kwargs) -> None:
+         self.df_mapper.upsert_only_missing_rows = only_missing
+         if method is None:
+             if self.primary_key is None or len(self.primary_key) == 0:
+                 self.upsert_method = UpsertChoice.Insert  # do not use upsert if there is no primary key
+             method = self.upsert_method
+         super().upload_request_full(ckan=ckan, resources_base_dir=resources_base_dir,
+                                     threads=threads, external_stop_event=external_stop_event,
+                                     start_index=start_index, end_index=end_index,
+                                     method=method, **kwargs)
+         # if threads < 0:
+         #     # cancel large uploads in this case
+         #     return None
+         # elif threads is None or threads > 1:
+         #     return self.upload_request_full_multi_threaded(resources_base_dir=resources_base_dir, ckan=ckan, method=method,
+         #                                                    threads=threads, external_stop_event=external_stop_event,
+         #                                                    start_index=start_index, end_index=end_index)
+         # else:
+         #     self.init_local_files_list(resources_base_dir=resources_base_dir, cancel_if_present=True)
+         #     if ckan.verbose_extra:
+         #         print(f"Launching single-threaded upload of multi-file resource {self.name}")
+         #     total = self.get_local_file_len()
+         #     end_index = positive_end_index(end_index, total)
+         #     for index, file in enumerate(self.get_local_file_generator(resources_base_dir=resources_base_dir)):
+         #         if external_stop_event is not None and external_stop_event.is_set():
+         #             print(f"{ckan.identifier} Interrupted")
+         #             return
+         #         self._unit_upload_apply(ckan=ckan, file=file,
+         #                                 index=index, start_index=start_index, end_index=end_index, total=total,
+         #                                 method=method)
+         #     self._call_progress_callback(total, total, context=f"{ckan.identifier} single-thread upload")
+         #     # at last, apply final actions:
+         #     self.upload_request_final(ckan, force=not datastore_multi_apply_last_condition_intermediary)
+
+     # def upsert_request_file_graceful(self, ckan: CkanApi, file: Any, index:int,
+     #                                  method: UpsertChoice = UpsertChoice.Upsert, external_stop_event=None,
+     #                                  start_index:int=0, end_index:int=None) -> None:
+     #     """
+     #     Calls upsert_request_df_clear with checks specific to multi-threading.
+     #
+     #     :return:
+     #     """
+     #     # ckan.session_reset()
+     #     # ckan.identifier = current_thread().name
+     #     ckan = self.thread_ckan[current_thread().name]
+     #     total = self.get_local_file_len()
+     #     end_index = positive_end_index(end_index, total)
+     #     if self.stop_event.is_set():
+     #         return
+     #     if external_stop_event is not None and external_stop_event.is_set():
+     #         print(f"{ckan.identifier} Interrupted")
+     #         return
+     #     try:
+     #         self._unit_upload_apply(ckan=ckan, file=file,
+     #                                 index=index, start_index=start_index, end_index=end_index, total=total,
+     #                                 method=method)
+     #     except Exception as e:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print(f"Stopping all threads because an exception occurred in thread: {e}")
+     #         raise e from e
+
+     # def upload_request_full_multi_threaded(self, ckan: CkanApi, resources_base_dir: str, threads: int = None,
+     #                                        method: UpsertChoice = UpsertChoice.Upsert, external_stop_event=None,
+     #                                        start_index:int=0, end_index:int=None, **kwargs):
+     #     """
+     #     Multi-threaded implementation of upload_request_full, using ThreadPoolExecutor.
+     #     """
+     #     self.init_local_files_list(resources_base_dir=resources_base_dir, cancel_if_present=True)
+     #     resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=True)  # prepare CKAN object for multi-threading: perform mapping requests if necessary
+     #     self._prepare_for_multithreading(ckan)
+     #     try:
+     #         with ThreadPoolExecutor(max_workers=threads, initializer=self._init_thread, initargs=(ckan,)) as executor:
+     #             if ckan.verbose_extra:
+     #                 print(f"Launching multi-threaded upload of multi-file resource {self.name}")
+     #             futures = [executor.submit(self.upsert_request_file_graceful, ckan=ckan, file=file, method=method, index=index,
+     #                                        start_index=start_index, end_index=end_index, external_stop_event=external_stop_event)
+     #                        for index, file in enumerate(self.get_local_file_generator(resources_base_dir=resources_base_dir))]
+     #             for future in futures:
+     #                 future.result()  # This will propagate the exception
+     #         total = self.get_local_file_len()
+     #         self._call_progress_callback(total, total, context=f"{ckan.identifier} multi-thread upload")
+     #     except Exception as e:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print(f"Stopping all threads because an exception occurred: {e}")
+     #         raise e from e
+     #     finally:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print("End of multi-threaded upload...")
+     #     # at last, apply final actions:
+     #     self.upload_request_final(ckan, force=not datastore_multi_apply_last_condition_intermediary)
+
+
+     ## download -------
+     def download_request_df(self, ckan: CkanApi, file_query:dict) -> Union[pd.DataFrame,None]:
+         """
+         Download the DataFrame matching the file_query arguments.
+         """
+         resource_id = self.get_or_query_resource_id(ckan, error_not_found=self.download_error_not_found)
+         if resource_id is None and not self.download_error_not_found:
+             return None
+         df_download = self.df_mapper.download_file_query(ckan=ckan, resource_id=resource_id, file_query=file_query)
+         df = self.df_mapper.df_download_alter(df_download, file_query=file_query, fields=self._get_fields_info())
+         return df
+
+     def _unit_download_apply(self, ckan:CkanApi, file_query_item:Any, out_dir:str,
+                              index:int, start_index:int, end_index:int, total:int) -> Any:
+         if start_index <= index and index < end_index:
+             self._call_progress_callback(index, total, info=file_query_item,
+                                          context=f"{ckan.identifier} single-thread download")
+             self.download_file_query_item(ckan=ckan, out_dir=out_dir, file_query_item=file_query_item)
+         else:
+             pass
+             # self._call_progress_callback(index, total, info=file_query_item, context=f"{ckan.identifier} single-thread skip")
+
+     def download_request_full(self, ckan: CkanApi, out_dir: str, threads:int=1, external_stop_event=None,
+                               start_index:int=0, end_index:int=None, force:bool=False) -> None:
+         return super().download_request_full(ckan=ckan, out_dir=out_dir,
+                                              threads=threads, external_stop_event=external_stop_event,
+                                              start_index=start_index, end_index=end_index, force=force)
+         # if (not self.enable_download) and (not force):
+         #     msg = f"Did not download resource {self.name} because download was disabled."
+         #     warn(msg)
+         #     return None
+         # if threads < 0:
+         #     # do not download large datasets in this case
+         #     return None
+         # elif threads is None or threads > 1:
+         #     return self.download_request_full_multi_threaded(ckan=ckan, out_dir=out_dir,
+         #                                                      threads=threads, external_stop_event=external_stop_event,
+         #                                                      start_index=start_index, end_index=end_index)
+         # else:
+         #     self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True)
+         #     if ckan.verbose_extra:
+         #         print(f"Launching single-threaded download of multi-file resource {self.name}")
+         #     total = self.get_file_query_len()
+         #     end_index = positive_end_index(end_index, total)
+         #     for index, file_query_item in enumerate(self.get_file_query_generator()):
+         #         if external_stop_event is not None and external_stop_event.is_set():
+         #             print(f"{ckan.identifier} Interrupted")
+         #             return
+         #         self._unit_download_apply(ckan=ckan, file_query_item=file_query_item,
+         #                                   index=index, start_index=start_index, end_index=end_index, total=total)
+         #     self._call_progress_callback(total, total, context=f"{ckan.identifier} single-thread download")
+
+     def download_request_generator(self, ckan: CkanApi, out_dir: str) -> Generator[Tuple[Any, pd.DataFrame], Any, None]:
+         """
+         Iterator over the file_queries.
+         """
+         self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True)
+         for file_query_item in self.get_file_query_generator():
+             yield self.download_file_query_item(ckan=ckan, out_dir=out_dir, file_query_item=file_query_item)
+
+     # def download_file_query_item_graceful(self, ckan: CkanApi, out_dir: str, file_query_item: Any, index:int,
+     #                                       external_stop_event=None, start_index:int=0, end_index:int=None) -> None:
+     #     """
+     #     Implementation of download_file_query_item with checks for a multi-threaded download.
+     #     """
+     #     # ckan.session_reset()
+     #     # ckan.identifier = current_thread().name
+     #     ckan = self.thread_ckan[current_thread().name]
+     #     total = self.get_file_query_len()
+     #     end_index = positive_end_index(end_index, total)
+     #     if self.stop_event.is_set():
+     #         return
+     #     if external_stop_event is not None and external_stop_event.is_set():
+     #         print(f"{ckan.identifier} Interrupted")
+     #         return
+     #     try:
+     #         # self._unit_download_apply(ckan=ckan, file_query_item=file_query_item,
+     #         #                           index=index, start_index=start_index, end_index=end_index, total=total)
+     #     except Exception as e:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print(f"Stopping all threads because an exception occurred in thread: {e}")
+     #         raise e from e
+
+     # def download_request_full_multi_threaded(self, ckan: CkanApi, out_dir: str,
+     #                                          threads: int = None, external_stop_event=None,
+     #                                          start_index:int=0, end_index:int=-1) -> None:
+     #     """
+     #     Multi-threaded implementation of download_request_full using ThreadPoolExecutor.
+     #     """
+     #     self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True)
+     #     self._prepare_for_multithreading(ckan)
+     #     try:
+     #         with ThreadPoolExecutor(max_workers=threads, initializer=self._init_thread, initargs=(ckan,)) as executor:
+     #             if ckan.verbose_extra:
+     #                 print(f"Launching multi-threaded download of multi-file resource {self.name}")
+     #             futures = [executor.submit(self.download_file_query_item_graceful, ckan=ckan, out_dir=out_dir, file_query_item=file_query_item,
+     #                                        index=index, external_stop_event=external_stop_event, start_index=start_index, end_index=end_index)
+     #                        for index, file_query_item in enumerate(self.get_file_query_generator())]
+     #             for future in futures:
+     #                 future.result()  # This will propagate the exception
+     #         total = self.get_file_query_len()
+     #         self._call_progress_callback(total, total, context=f"multi-thread download")
+     #     except Exception as e:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print(f"Stopping all threads because an exception occurred: {e}")
+     #         raise e from e
+     #     finally:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print("End of multi-threaded download...")
+
+     def download_sample_df(self, ckan: CkanApi, search_all:bool=False, **kwargs) -> pd.DataFrame:
+         # alias with search_all=False by default
+         return super().download_sample_df(ckan=ckan, search_all=search_all, **kwargs)
+
+     def download_sample(self, ckan:CkanApi, full_download:bool=False, **kwargs) -> bytes:
+         # alias with full_download=False by default
+         return super().download_sample(ckan=ckan, full_download=full_download, **kwargs)
+
+
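For orientation, here is a minimal, hypothetical subclass sketch of BuilderDataStoreMultiABC that generates its upload parts in memory instead of reading them from disk. The class and attribute names are illustrative only, and depending on the abstract members inherited from BuilderMultiABC (which are not shown in this diff), further overrides such as get_local_file_generator or get_local_file_len may also be required.

```python
# Hypothetical sketch (not part of the package): a subclass whose upload parts
# are pre-built DataFrames rather than CSV files on disk.
from typing import Any, Generator, List

import pandas as pd

from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import BuilderDataStoreMultiABC


class InMemoryPartsBuilder(BuilderDataStoreMultiABC):  # hypothetical name
    def __init__(self, parts: List[pd.DataFrame], **kwargs):
        super().__init__(**kwargs)
        self._parts = parts  # the pre-built chunks to upsert, in order

    def get_local_df_generator(self, resources_base_dir: str) -> Generator[pd.DataFrame, None, None]:
        # resources_base_dir is unused here: the parts already live in memory
        yield from self._parts

    def load_local_df(self, file: Any, **kwargs) -> pd.DataFrame:
        # in this sketch, each upload part "file" is already a DataFrame
        return file
```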
ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py
@@ -0,0 +1,273 @@
+ #!python3
+ # -*- coding: utf-8 -*-
+ """
+ Code to initiate a DataStore defined by a large number of files to concatenate into one table.
+ This concrete implementation is linked to the file system.
+ """
+ from typing import Dict, List, Collection, Callable, Any, Tuple, Generator, Union
+ import os
+ from warnings import warn
+ import glob
+ import copy
+
+ import pandas as pd
+
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
+ from ckanapi_harvesters.builder.mapper_datastore import DataSchemeConversion
+ from ckanapi_harvesters.builder.builder_errors import ResourceFileNotExistMessage
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import BuilderDataStoreMultiABC
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import datastore_multi_apply_last_condition_intermediary
+ from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice
+ from ckanapi_harvesters.auxiliary.path import resolve_rel_path, glob_rm_glob, list_files_scandir
+ from ckanapi_harvesters.ckan_api import CkanApi
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element
+ from ckanapi_harvesters.builder.mapper_datastore_multi import RequestMapperABC, RequestFileMapperABC
+ from ckanapi_harvesters.builder.mapper_datastore_multi import default_file_mapper_from_primary_key
+ from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreFile
+
+
+ class BuilderDataStoreFolder(BuilderDataStoreMultiABC):
+     def __init__(self, *, file_query_list: List[Tuple[str,dict]]=None, name:str=None, format:str=None, description:str=None,
+                  resource_id:str=None, download_url:str=None, dir_name:str=None):
+         super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
+         self.dir_name = dir_name
+         # Functions inputs/outputs
+         self.df_mapper: RequestFileMapperABC = default_file_mapper_from_primary_key(self.primary_key, file_query_list)
+         self.local_file_list_base_dir:Union[str,None] = None
+         self.local_file_list:Union[List[str],None] = None
+         self.downloaded_file_query_list:Collection[Tuple[str,dict]] = file_query_list
+
+     def copy(self, *, dest=None):
+         if dest is None:
+             dest = BuilderDataStoreFolder()
+         super().copy(dest=dest)
+         dest.dir_name = self.dir_name
+         dest.local_file_list_base_dir = self.local_file_list_base_dir
+         dest.local_file_list = copy.deepcopy(self.local_file_list)
+         dest.downloaded_file_query_list = copy.deepcopy(self.downloaded_file_query_list)
+         return dest
+
+     def _load_from_df_row(self, row: pd.Series, base_dir:str=None) -> None:
+         super()._load_from_df_row(row=row)
+         self.df_mapper = default_file_mapper_from_primary_key(self.primary_key)
+         self.dir_name: str = _string_from_element(row["file/url"])
+
+     def setup_default_file_mapper(self, primary_key:List[str]=None, file_query_list:Collection[Tuple[str, dict]]=None) -> None:
+         """
+         This function enables the user to define the primary key and initializes the default file mapper.
+         :param primary_key: manually specify the primary key
+         :param file_query_list: optionally pre-set the list of (file name, file query) tuples
+         :return:
+         """
+         df_mapper_mem = self.df_mapper
+         if primary_key is not None:
+             self.primary_key = primary_key
+         self.df_mapper = default_file_mapper_from_primary_key(self.primary_key, file_query_list)
+         if file_query_list is not None:
+             self.downloaded_file_query_list = file_query_list
+         # preserve upload/download functions
+         self.df_mapper.df_upload_fun = df_mapper_mem.df_upload_fun
+         self.df_mapper.df_download_fun = df_mapper_mem.df_download_fun
+
+     @staticmethod
+     def resource_mode_str() -> str:
+         return "DataStore from Folder"
+
+     def _to_dict(self, include_id:bool=True) -> dict:
+         d = super()._to_dict(include_id=include_id)
+         d["File/URL"] = self.dir_name
+         return d
+
+     @staticmethod
+     def from_file_datastore(resource_file: BuilderDataStoreFile,
+                             *, dir_name:str=None, primary_key:List[str]=None,
+                             file_query_list:Collection[Tuple[str,dict]]=None) -> "BuilderDataStoreFolder":
+         resource_folder = BuilderDataStoreFolder()
+         resource_folder._load_from_df_row(resource_file._to_row())
+         resource_folder.field_builders = resource_file.field_builders
+         if dir_name is not None:
+             resource_folder.dir_name = dir_name
+         elif isinstance(resource_file, BuilderDataStoreFolder):
+             resource_folder.dir_name = resource_file.dir_name
+         else:
+             resource_folder.dir_name, _ = os.path.splitext(resource_file.file_name)
+         resource_folder.package_name = resource_file.package_name
+         if isinstance(resource_file.df_mapper, RequestMapperABC):
+             resource_folder.df_mapper = resource_file.df_mapper.copy()
+         else:
+             resource_folder.df_mapper.df_upload_fun = resource_file.df_mapper.df_upload_fun
+             resource_folder.df_mapper.df_download_fun = resource_file.df_mapper.df_download_fun
+         if primary_key is not None or file_query_list is not None:
+             resource_folder.setup_default_file_mapper(primary_key=primary_key, file_query_list=file_query_list)
+         resource_folder.downloaded_file_query_list = file_query_list
+         return resource_folder
+
+
+     ## upload ---------------------------------------------------
+     def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
+         if os.path.isdir(resolve_rel_path(resources_base_dir, glob_rm_glob(self.dir_name), field=f"File/URL of resource {self.name}")):
+             if len(self.list_local_files(resources_base_dir=resources_base_dir)) > 0:
+                 return None
+             else:
+                 return ResourceFileNotExistMessage(self.name, ErrorLevel.Error,
+                                                    f"Empty resource directory for resource {self.name}: {os.path.join(resources_base_dir, self.dir_name)}")
+         else:
+             return ResourceFileNotExistMessage(self.name, ErrorLevel.Error,
+                                                f"Missing directory for resource {self.name}: {os.path.join(resources_base_dir, self.dir_name)}")
+
+     def get_sample_file_path(self, resources_base_dir:str, file_index:int=0) -> Union[str,None]:
+         self.list_local_files(resources_base_dir=resources_base_dir)
+         return self.local_file_list[file_index]
+
+     def load_sample_df(self, resources_base_dir:str, *, upload_alter:bool=True, file_index:int=0, **kwargs) -> pd.DataFrame:
+         file_path:str = self.get_sample_file_path(resources_base_dir, file_index=file_index)
+         return self.load_local_df(file=file_path, upload_alter=upload_alter, **kwargs)
+
+     def load_local_df(self, file: str, *, upload_alter:bool=True, **kwargs) -> pd.DataFrame:
+         # self.sample_data_source = resolve_rel_path(resources_base_dir, self.dir_name, file, field=f"File/URL of resource {self.name}")
+         self.sample_data_source = file
+         df_local = self.local_file_format.read_file(self.sample_data_source, fields=self._get_fields_info())
+         if isinstance(df_local, pd.DataFrame):
+             df_local.attrs["source"] = self.sample_data_source
+         if upload_alter:
+             df_upload = self.df_mapper.df_upload_alter(df_local, self.sample_data_source, fields=self._get_fields_info())
+             return df_upload
+         else:
+             return df_local
+
+     def get_local_file_generator(self, resources_base_dir:str, **kwargs) -> Generator[str, None, None]:
+         self.list_local_files(resources_base_dir=resources_base_dir)
+         for file_name in self.local_file_list:
+             yield file_name
+
+     def get_local_df_generator(self, resources_base_dir:str, **kwargs) -> Generator[pd.DataFrame, None, None]:
+         self.list_local_files(resources_base_dir=resources_base_dir)
+         for file_name in self.local_file_list:
+             yield self.load_local_df(file_name, **kwargs)
+
+     def list_local_files(self, resources_base_dir:str, cancel_if_present:bool=True) -> List[str]:
+         if cancel_if_present and self.local_file_list is not None and self.local_file_list_base_dir == resources_base_dir:
+             return self.local_file_list
+         dir_search_path = resolve_rel_path(resources_base_dir, self.dir_name, field=f"File/URL of resource {self.name}")
+         # file_list = [os.path.join(dir_search_path, file_name) for file_name in os.listdir(dir_search_path)]
+         # file_list = [os.path.join(file.path, file.name) for file in list(os.scandir(dir_search_path)) if file.is_file()]
+         search_query = dir_search_path
+         file_list = glob.glob(search_query)
+         # file_list = list_files_scandir(dir_search_path)
+         file_list.sort()
+         self.local_file_list = file_list
+         self.local_file_list_base_dir = resources_base_dir
+         return file_list
+
+     def init_local_files_list(self, resources_base_dir:str, cancel_if_present:bool=True, **kwargs) -> List[str]:
+         return self.list_local_files(resources_base_dir=resources_base_dir, cancel_if_present=cancel_if_present)
+
+     def get_local_file_len(self) -> int:
+         if self.local_file_list is None:
+             raise RuntimeError("You must call list_local_files first")
+         return len(self.local_file_list)
+
+     def upsert_request_df(self, ckan: CkanApi, df_upload:pd.DataFrame,
+                           method:UpsertChoice=UpsertChoice.Upsert,
+                           apply_last_condition:bool=None, always_last_condition:bool=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
+         """
+         Call to ckan datastore_upsert.
+         Before sending the DataFrame, a call to df_upload_alter is made.
+         This implementation optionally checks for the last inserted line of the DataFrame based on the first columns of the primary key.
+
+         :param ckan:
+         :param df_upload:
+         :param method:
+         :return:
+         """
+         if apply_last_condition is None:
+             apply_last_condition = True  # datastore_multi_apply_last_condition_intermediary
+         resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=True)
+         df_upload_local = df_upload
+         df_upload_transformed = self.df_mapper.df_upload_alter(df_upload_local, fields=self._get_fields_info())
+         file_query = self.df_mapper.get_file_query_of_df(df_upload_transformed)
+         if file_query is not None:
+             i_restart, upload_needed, row_count, df_row = self.df_mapper.last_inserted_index_request(ckan=ckan,
+                                                                                                     resource_id=resource_id, df_upload=df_upload_transformed, file_query=file_query)
+         else:
+             i_restart, upload_needed, row_count, df_row = 0, True, -1, None
+         if upload_needed:
+             if i_restart > 0 and ckan.params.verbose_extra:
+                 print(f"Starting transfer from index {i_restart}")
+             ret_df = ckan.datastore_upsert(df_upload_transformed.iloc[i_restart:], resource_id, method=method,
+                                            apply_last_condition=apply_last_condition,
+                                            always_last_condition=always_last_condition, data_cleaner=self.data_cleaner_upload)
+         elif 0 <= row_count and row_count < len(df_row):
+             msg = f"Sending the full dataframe because it was shorter on the server side"
+             warn(msg)
+             ret_df = ckan.datastore_upsert(df_upload_transformed, resource_id, method=method,
+                                            apply_last_condition=apply_last_condition,
+                                            always_last_condition=always_last_condition, data_cleaner=self.data_cleaner_upload)
+         else:
+             if ckan.params.verbose_extra:
+                 print(f"File up to date on server side")
+             ret_df = None
+         return df_upload_transformed, ret_df
+
+
+     ## download ---------------------------------------------------------------------------
+     def download_file_query_list(self, ckan: CkanApi, cancel_if_present:bool=True) -> List[Tuple[str, dict]]:
+         resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=self.download_error_not_found)
+         if resource_id is None and self.download_error_not_found:
+             self.downloaded_file_query_list = []
+             return []
+         if not(cancel_if_present and self.downloaded_file_query_list is not None):
+             file_query_list = self.df_mapper.download_file_query_list(ckan=ckan, resource_id=resource_id)
+             self.downloaded_file_query_list = [(self.df_mapper.get_file_name_of_query(file_query), file_query) for file_query in file_query_list]
+         return self.downloaded_file_query_list
+
+     def setup_download_file_query_list(self, file_query_list: List[Tuple[str,dict]]) -> None:
+         self.downloaded_file_query_list = file_query_list
+
+     def init_download_file_query_list(self, ckan: CkanApi, out_dir: str, cancel_if_present:bool=True, **kwargs) -> List[Any]:
+         if out_dir is not None:
+             dir_tables = resolve_rel_path(out_dir, glob_rm_glob(self.dir_name, default_rec_dir=self.name), field=f"File/URL of resource {self.name}")
+             os.makedirs(dir_tables, exist_ok=True)
+         return self.download_file_query_list(ckan=ckan, cancel_if_present=cancel_if_present)
+
+     def get_file_query_len(self) -> int:
+         if self.downloaded_file_query_list is None:
+             raise RuntimeError("You must call download_file_query_list first")
+         return len(self.downloaded_file_query_list)
+
+     def get_file_query_generator(self) -> Generator[Tuple[str,dict], Any, None]:
+         for file_name, file_query in self.downloaded_file_query_list:
+             yield file_name, file_query
+
+     def download_file_query(self, ckan: CkanApi, out_dir: str, file_name:str, file_query:dict) \
+             -> Tuple[Union[str,None], Union[pd.DataFrame,None]]:
+         resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=self.download_error_not_found)
+         if resource_id is None and self.download_error_not_found:
+             return None, None
+         self.download_file_query_list(ckan, cancel_if_present=True)
+         file_out = None
+         if out_dir is not None:
+             file_out = resolve_rel_path(out_dir, glob_rm_glob(self.dir_name, default_rec_dir=self.name), file_name, field=f"File/URL of resource {self.name}")
+             if self.download_skip_existing and os.path.exists(file_out):
+                 if ckan.params.verbose_extra:
+                     print(f"Skipping existing file {file_out}")
+                 return file_out, None
+         df_download = self.df_mapper.download_file_query(ckan=ckan, resource_id=resource_id, file_query=file_query)
+         df = self.df_mapper.df_download_alter(df_download, file_query=file_query, fields=self._get_fields_info())
+         if out_dir is not None:
+             self.local_file_format.write_file(df, file_out, fields=self._get_fields_info())
+         else:
+             file_out = None
+         return file_out, df
+
+     def download_file_query_item(self, ckan: CkanApi, out_dir: str, file_query_item: Tuple[str,dict]) -> Tuple[str, pd.DataFrame]:
+         file_name, file_query = file_query_item
+         return self.download_file_query(ckan=ckan, file_name=file_name, file_query=file_query, out_dir=out_dir)
+
+     def download_request(self, ckan: CkanApi, out_dir: str, *, full_download:bool=False, force:bool=False, threads:int=1) -> None:
+         # limit download to first page by default
+         if not full_download:
+             super().download_request(ckan=ckan, out_dir=out_dir, full_download=False, force=force, threads=threads)
+         else:
+             self.download_request_full(ckan=ckan, out_dir=out_dir, threads=threads, force=force)
+
+
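To close, a hedged usage sketch of the concrete BuilderDataStoreFolder added above. The CkanApi construction, resource name, directory layout, and column names are assumptions for illustration only; the builder calls mirror the methods shown in this diff.

```python
# Usage sketch (assumptions marked): upsert a folder of CSV parts into one
# DataStore table, then mirror it back to disk, one file per file_query.
from ckanapi_harvesters.ckan_api import CkanApi
from ckanapi_harvesters.builder.builder_resource_datastore_multi_folder import BuilderDataStoreFolder

ckan = CkanApi()  # assumption: client configuration (URL, API key) is not shown in this diff

builder = BuilderDataStoreFolder(name="sensor-readings",            # hypothetical resource name
                                 format="CSV",
                                 description="Readings split across many CSV files",
                                 dir_name="sensor-readings/*.csv")  # glob resolved under resources_base_dir

# A primary key is strongly recommended so multi-part upserts stay idempotent;
# without one the builder falls back to plain inserts:
builder.setup_default_file_mapper(primary_key=["station_id", "timestamp"])  # hypothetical columns

# Upsert every matching file; the threads argument selects single- vs multi-threaded upload:
builder.upload_request_full(ckan, resources_base_dir="./resources", threads=4)

# Downloads fetch only the first page by default; full_download=True walks every file_query:
builder.download_request(ckan, out_dir="./downloaded", full_download=True)
```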