ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/METADATA +74 -38
  103. ckanapi_harvesters-0.0.2.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,505 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to upload metadata to the CKAN server to create/update an existing package
5
+ The metadata is defined by the user in an Excel worksheet
6
+ This file implements the basic resources. See builder_datastore for specific functions to initiate datastores.
7
+ """
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ import threading
10
+ from threading import current_thread
11
+ from typing import Any, Generator, Union, Callable, Set, List, Dict, Tuple
12
+ from abc import ABC, abstractmethod
13
+ import io
14
+ import os
15
+ import glob
16
+ import fnmatch
17
+ from warnings import warn
18
+ import copy
19
+
20
+ import pandas as pd
21
+ import requests
22
+
23
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
24
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element
25
+ from ckanapi_harvesters.ckan_api import CkanApi
26
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo
27
+ from ckanapi_harvesters.auxiliary.path import resolve_rel_path, glob_rm_glob, glob_name
28
+ from ckanapi_harvesters.builder.builder_aux import positive_end_index
29
+ from ckanapi_harvesters.builder.builder_errors import ResourceFileNotExistMessage
30
+ from ckanapi_harvesters.builder.builder_resource_multi_abc import BuilderMultiABC
31
+ from ckanapi_harvesters.builder.builder_resource import BuilderResourceABC
32
+
33
+ # Module-level boolean flag, enabled by default. It is not referenced in the visible part of this module;
+ # presumably read by other modules to keep multi-file resources from picking up unrelated files - TODO confirm.
+ multi_file_exclude_other_files:bool = True
34
+
35
+
36
def default_progress_callback(index:int, total:int, info:Any, *, context:str=None, **kwargs) -> None:
    """
    Print one progress line on standard output.

    :param index: position of the current request; when equal to ``total`` a "Finished" line is printed
    :param total: total number of requests
    :param info: optional description of the request: a string is printed as is, a DataFrame is
        described by ``attrs["source"]`` when present (``<DataFrame>`` otherwise), anything else through ``str()``
    :param context: prefix of the line (an empty prefix is used for None)
    :param kwargs: accepted and ignored
    """
    prefix = "" if context is None else context
    if index == total:
        # completion line: info is ignored here
        print(f"{prefix} Finished {index}/{total} (100%)")
        return
    percent = index / total * 100.0
    headline = f"{prefix} Request {index}/{total} ({percent:.2f}%)"
    if info is None:
        print(headline)
        return
    if isinstance(info, str):
        label = info
    elif isinstance(info, pd.DataFrame):
        label = str(info.attrs["source"]) if "source" in info.attrs.keys() else "<DataFrame>"
    else:
        label = str(info)
    print(headline + ": " + label)
55
+
56
+
57
+ class BuilderMultiFile(BuilderResourceABC, BuilderMultiABC):
58
+ """
59
+ Class to manage a set of files to upload as separate resources
60
+ """
61
def __init__(self, *, name:str=None, format:str=None, description:str=None,
             resource_id:str=None, download_url:str=None, dir_name:str=None):
    """
    Initialise a multi-file resource builder.

    :param name: forwarded to the parent constructor
    :param format: forwarded to the parent constructor
    :param description: forwarded to the parent constructor
    :param resource_id: forwarded to the parent constructor
    :param download_url: forwarded to the parent constructor
    :param dir_name: stored in ``self.dir_name``; the other methods of this class use it as the
        glob pattern selecting the local files
    """
    super().__init__(name=name, format=format, description=description,
                     resource_id=resource_id, download_url=download_url)
    # local side: pattern and cached listing filled by list_local_files
    self.dir_name: str = dir_name
    self.local_file_list: Union[List[str], None] = None
    self.local_file_list_base_dir: str = ""
    self.excluded_files: Set[str] = set()
    # remote side: cached listing filled by list_remote_resources
    self.remote_resource_names: Union[List[str], None] = None
    self.excluded_resource_names: Set[str] = set()
    # BuilderMultiABC:
    self.stop_event = threading.Event()
    self.thread_ckan: Dict[str, CkanApi] = {}
    self.progress_callback: Union[Callable[[int, int, Any], None], None] = default_progress_callback
    self.progress_callback_kwargs: dict = {}
    self.enable_multi_threaded_upload:bool = True
    self.enable_multi_threaded_download:bool = True
77
+
78
+ @staticmethod
79
+ def resource_mode_str() -> str:
80
+ return "MultiFile"
81
+
82
def copy(self, *, dest=None):
    """
    Copy the configuration of this builder into another instance.

    The cached file/resource listings are not transferred by this method and the stop event
    is deliberately left untouched, so the destination keeps its own.

    :param dest: instance to fill; a new BuilderMultiFile is created when None
    :return: the destination instance
    """
    target = BuilderMultiFile() if dest is None else dest
    super().copy(dest=target)
    target.dir_name = self.dir_name
    # BuilderMultiABC:
    target.progress_callback = self.progress_callback
    # deep copy so that both builders can alter their callback arguments independently
    target.progress_callback_kwargs = copy.deepcopy(self.progress_callback_kwargs)
    target.enable_multi_threaded_upload = self.enable_multi_threaded_upload
    target.enable_multi_threaded_download = self.enable_multi_threaded_download
    # do not copy stop_event
    return target
94
+
95
def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
    """
    Load the builder fields from one row of the resources table.

    :param row: row of the table; the ``file/url`` cell provides the file pattern
    :param base_dir: accepted but not used by this implementation (it is not forwarded to the parent)
    """
    super()._load_from_df_row(row=row)
    pattern = _string_from_element(row["file/url"], empty_value="")
    self.dir_name = pattern
98
+
99
def _to_dict(self, include_id:bool=True) -> dict:
    """
    Export the builder fields, adding the file pattern under the ``File/URL`` key.

    :param include_id: forwarded to the parent implementation
    :return: dictionary produced by the parent implementation, completed with ``File/URL``
    """
    fields = super()._to_dict(include_id=include_id)
    fields["File/URL"] = self.dir_name
    return fields
103
+
104
def get_or_query_resource_id(self, ckan: CkanApi, cancel_if_present:bool=True, error_not_found:bool=True) -> Union[None,str]:
    """
    A multi-file builder has no single resource id: this implementation always returns None
    and does not use its arguments.
    """
    return None
106
+
107
+
108
+ ## upload --------------------------------------------------------------------
109
def patch_request(self, ckan: CkanApi, package_id: str, *, reupload: bool = None, resources_base_dir:str=None,
                  payload:Union[bytes, io.BufferedIOBase]=None) -> Union[None, CkanResourceInfo]:
    """
    No-op for a multi-file builder: nothing is sent and None is always returned.
    The arguments are accepted and ignored.
    """
    return None
112
+
113
def upload_request_final(self, ckan:CkanApi, *, force:bool=False) -> None:
    """
    Final step of an upload: nothing to do for a multi-file builder (always returns None).
    """
    return None
115
+
116
+ @staticmethod
117
+ def sample_file_path_is_url() -> bool:
118
+ return False
119
+
120
def get_sample_file_path(self, resources_base_dir:str, file_index:int=0) -> Union[str,None]:
    """
    Return the path of one local file matching this resource.

    :param resources_base_dir: base directory passed to list_local_files
    :param file_index: index in the sorted list of local files
    :return: the selected path (an out-of-range index raises IndexError)
    """
    # make sure the listing is available (a cached listing is reused by list_local_files)
    self.list_local_files(resources_base_dir=resources_base_dir)
    known_files = self.local_file_list
    return known_files[file_index]
123
+
124
def load_sample_data(self, resources_base_dir:str, file_index:int=0) -> Union[bytes,None]:
    """
    Read the raw content of one local file of this resource.

    :param resources_base_dir: base directory used to locate the files
    :param file_index: index of the file, as understood by get_sample_file_path
    :return: the bytes of the selected file
    """
    file_path:str = self.get_sample_file_path(resources_base_dir, file_index=file_index)
    with open(file_path, "rb") as sample_file:
        content = sample_file.read()
    return content
128
+
129
def list_local_files(self, resources_base_dir:str, cancel_if_present:bool=True,
                     excluded_files:Set[str]=None) -> Union[List[str],None]:
    """
    List the local files which match the pattern of this multi-file resource and which are not
    already used by mono-resources. The result is cached on the instance.

    :param resources_base_dir: base directory against which the pattern is resolved
    :param cancel_if_present: reuse the cached list if it was built with the same base directory
        and the same exclusions
    :param excluded_files: files from mono-resources, removed from the result (None means no exclusion)
    :return: sorted list of matching paths
    """
    if excluded_files is None:
        excluded_files = set()
    if cancel_if_present and self.local_file_list is not None:
        same_base_dir = self.local_file_list_base_dir == resources_base_dir
        if same_base_dir and self.excluded_files == excluded_files:
            return self.local_file_list
    pattern = resolve_rel_path(resources_base_dir, self.dir_name, field=f"File/URL of resource {self.name}")
    matches = sorted(set(glob.glob(pattern)) - excluded_files)
    # remember the listing together with the inputs it depends on
    self.local_file_list = matches
    self.local_file_list_base_dir = resources_base_dir
    self.excluded_files = excluded_files
    return matches
155
+
156
def init_local_files_list(self, resources_base_dir:str, cancel_if_present:bool=True, excluded_files:Set[str]=None, **kwargs) -> List[str]:
    """
    Prepare the list of local files: thin wrapper around list_local_files.

    :param resources_base_dir: forwarded to list_local_files
    :param cancel_if_present: forwarded to list_local_files
    :param excluded_files: forwarded to list_local_files
    :param kwargs: accepted and ignored
    :return: the list returned by list_local_files
    """
    listing = self.list_local_files(resources_base_dir=resources_base_dir,
                                    cancel_if_present=cancel_if_present,
                                    excluded_files=excluded_files)
    return listing
159
+
160
def get_local_file_len(self) -> int:
    """
    :return: number of local files in the cached listing
    :raises RuntimeError: if the listing has not been built yet
    """
    known_files = self.local_file_list
    if known_files is None:
        raise RuntimeError("You must call list_local_files first")
    return len(known_files)
164
+
165
def get_local_file_generator(self, resources_base_dir:str, excluded_files:Set[str]=None, **kwargs) -> Generator[str, None, None]:
    """
    Iterate over the local files of this resource.

    The listing is built (or reused) through list_local_files when the iteration starts.

    :param resources_base_dir: forwarded to list_local_files
    :param excluded_files: forwarded to list_local_files
    :param kwargs: accepted and ignored
    :return: generator yielding each path of the listing
    """
    self.list_local_files(resources_base_dir=resources_base_dir, excluded_files=excluded_files)
    yield from self.local_file_list
169
+
170
def upload_file_checks(self, *, resources_base_dir: str = None, ckan: CkanApi = None, excluded_files:Set[str]=None, **kwargs) \
        -> Union[None, ContextErrorLevelMessage]:
    """
    Check that the directory of this resource exists and that at least one file matches.

    :param resources_base_dir: base directory against which the pattern is resolved
    :param ckan: not used by this check
    :param excluded_files: forwarded to list_local_files
    :param kwargs: accepted and ignored
    :return: None if at least one file was found, an error message object otherwise
    """
    # the directory is the pattern processed by glob_rm_glob, resolved against the base directory
    directory = resolve_rel_path(resources_base_dir, glob_rm_glob(self.dir_name), field=f"File/URL of resource {self.name}")
    if not os.path.isdir(directory):
        return ResourceFileNotExistMessage(self.name, ErrorLevel.Error,
                                           f"Missing directory for multi-file resource {self.name}: {os.path.join(resources_base_dir, self.dir_name)}")
    found = self.list_local_files(resources_base_dir=resources_base_dir, excluded_files=excluded_files)
    if len(found) > 0:
        return None
    return ResourceFileNotExistMessage(self.name, ErrorLevel.Error,
                                       f"Empty resource directory for multi-file resource {self.name}: {os.path.join(resources_base_dir, self.dir_name)}")
181
+
182
def upload_file(self, ckan:CkanApi, package_id:str, file_path:str, *,
                reupload:bool=False, cancel_if_present:bool=True) -> CkanResourceInfo:
    """
    Upload a file, using its name as resource name

    :param ckan: CKAN API object used to look up and create the resource
    :param package_id: package receiving the resource
    :param file_path: path of the file to upload; its last component is the resource name
    :param reupload: forwarded to ``ckan.resource_create``; also disables the shortcut below
    :param cancel_if_present: when the resource is already mapped and no re-upload is requested,
        return the known information without any request
    :return: information on the resource
    """
    resource_name = os.path.basename(file_path)
    known_info = ckan.map.get_resource_info(resource_name, package_name=package_id, error_not_mapped=False)
    reuse_known = known_info is not None and cancel_if_present and not reupload
    if not reuse_known:
        return ckan.resource_create(package_id, resource_name, format=self.format, description=self.description,
                                    state=self.state, file_path=file_path, reupload=reupload,
                                    cancel_if_exists=True, update_if_exists=True,
                                    create_default_view=True, auto_submit=False)
    # nothing was sent: flag the information accordingly
    known_info.newly_created = False
    known_info.newly_updated = False
    return known_info
196
+
197
+ def _unit_upload_apply(self, *, ckan:CkanApi, file:str,
198
+ index:int, start_index:int, end_index:int, total:int,
199
+ package_id:str, reupload:bool, only_missing:bool, excluded_files:Set[str]) -> None:
200
+ # For each file, this function initiates its own FileStore.
201
+ file_path = file
202
+ _, file_name = os.path.split(file_path)
203
+ if start_index <= index and index < end_index and file_path not in excluded_files:
204
+ self._call_progress_callback(index, total, info=file_path,
205
+ context=f"{ckan.identifier} single-thread upload")
206
+ self.upload_file(ckan=ckan, package_id=package_id, file_path=file_path,
207
+ reupload=reupload, cancel_if_present=only_missing)
208
+ else:
209
+ # self._call_progress_callback(index, total, info=df_upload_local, context=f"{ckan.identifier} single-thread skip")
210
+ pass
211
+
212
def upload_request_full(self, ckan:CkanApi, resources_base_dir:str, *,
                        threads:int=1, external_stop_event=None,
                        start_index:int=0, end_index:int=None,
                        reupload:bool=False, only_missing:bool=False, excluded_files:Set[str]=None) -> None:
    """
    Upload all the files of this multi-file resource.

    The work is delegated to the parent implementation; this method only normalises the
    exclusion set and resolves the package id, which are passed along as extra arguments.

    :param ckan: CKAN API object
    :param resources_base_dir: base directory of the local files
    :param threads: forwarded to the parent implementation
    :param external_stop_event: forwarded to the parent implementation
    :param start_index: forwarded to the parent implementation
    :param end_index: forwarded to the parent implementation
    :param reupload: forwarded to the parent implementation
    :param only_missing: forwarded to the parent implementation
    :param excluded_files: files which must not be uploaded (None means no exclusion)
    """
    exclusions = set() if excluded_files is None else excluded_files
    package_id = self.get_or_query_package_id(ckan)
    super().upload_request_full(ckan=ckan, resources_base_dir=resources_base_dir, threads=threads,
                                external_stop_event=external_stop_event,
                                start_index=start_index, end_index=end_index,
                                reupload=reupload, only_missing=only_missing,
                                package_id=package_id, excluded_files=exclusions)
223
+ # if threads < 0:
224
+ # # cancel large uploads in this case
225
+ # return None
226
+ # elif threads is None or threads > 1:
227
+ # return self.upload_request_full_multi_threaded(ckan=ckan, resources_base_dir=resources_base_dir,
228
+ # threads=threads, external_stop_event=external_stop_event,
229
+ # start_index=start_index, end_index=end_index,
230
+ # reupload=reupload, only_missing=only_missing,
231
+ # excluded_files=excluded_files)
232
+ # else:
233
+ # self.init_local_files_list(resources_base_dir=resources_base_dir, cancel_if_present=True, excluded_files=excluded_files)
234
+ # package_id = self.get_or_query_package_id(ckan)
235
+ # if ckan.verbose_extra:
236
+ # print(f"Launching single-threaded upload of multi-file resource {self.name}")
237
+ # total = self.get_local_file_len()
238
+ # end_index = positive_end_index(end_index, total)
239
+ # for index, file_path in enumerate(self.get_local_file_generator(resources_base_dir=resources_base_dir, excluded_files=excluded_files)):
240
+ # if external_stop_event is not None and external_stop_event.is_set():
241
+ # print(f"{ckan.identifier} Interrupted")
242
+ # return
243
+ # self._unit_upload_apply(ckan, file=file_path, package_id=package_id,
244
+ # reupload=reupload, only_missing=only_missing,
245
+ # index=index, start_index=start_index, end_index=end_index, total=total,
246
+ # excluded_files=excluded_files)
247
+ # self._call_progress_callback(total, total, context=f"{ckan.identifier} single-thread upload")
248
+ # # at last, apply final actions:
249
+ # self.upload_request_final(ckan)
250
+
251
+ # def upload_request_graceful(self, ckan:CkanApi, file_path: str, *, index:int, package_id:str,
252
+ # external_stop_event=None,
253
+ # start_index:int=0, end_index:int=None,
254
+ # reupload:bool=False, only_missing:bool=False, excluded_files:Set[str]=None) -> None:
255
+ # """
256
+ # Calls upload_file with checks specific to multi-threading.
257
+ #
258
+ # :return:
259
+ # """
260
+ # # ckan.session_reset()
261
+ # # ckan.identifier = current_thread().name
262
+ # ckan = self.thread_ckan[current_thread().name]
263
+ # total = self.get_local_file_len()
264
+ # end_index = positive_end_index(end_index, total)
265
+ # if self.stop_event.is_set():
266
+ # return
267
+ # if external_stop_event is not None and external_stop_event.is_set():
268
+ # print(f"{ckan.identifier} Interrupted")
269
+ # return
270
+ # try:
271
+ # self._unit_upload_apply(ckan, file=file_path, package_id=package_id,
272
+ # reupload=reupload, only_missing=only_missing,
273
+ # index=index, start_index=start_index, end_index=end_index, total=total,
274
+ # excluded_files=excluded_files)
275
+ # except Exception as e:
276
+ # self.stop_event.set() # Ensure all threads stop
277
+ # if ckan.verbose_extra:
278
+ # print(f"Stopping all threads because an exception occurred in thread: {e}")
279
+ # raise e from e
280
+
281
+ # def upload_request_full_multi_threaded(self, ckan:CkanApi, resources_base_dir:str,
282
+ # threads:int=1, external_stop_event=None,
283
+ # start_index:int=0, end_index:int=None,
284
+ # reupload:bool=False, only_missing:bool=False, excluded_files:Set[str]=None):
285
+ # """
286
+ # Multi-threaded implementation of upload_request_full, using ThreadPoolExecutor.
287
+ # """
288
+ # self.init_local_files_list(resources_base_dir=resources_base_dir, cancel_if_present=True, excluded_files=excluded_files)
289
+ # package_id = self.get_or_query_package_id(ckan)
290
+ # self._prepare_for_multithreading(ckan)
291
+ # try:
292
+ # with ThreadPoolExecutor(max_workers=threads, initializer=self._init_thread, initargs=(ckan,)) as executor:
293
+ # if ckan.verbose_extra:
294
+ # print(f"Launching multi-threaded upload of multi-file resource {self.name}")
295
+ # futures = [executor.submit(self.upload_request_graceful, ckan=ckan, file_path=file_path, index=index, package_id=package_id,
296
+ # start_index=start_index, end_index=end_index, external_stop_event=external_stop_event,
297
+ # excluded_files=excluded_files, reupload=reupload, only_missing=only_missing)
298
+ # for index, file_path in enumerate(self.get_local_file_generator(resources_base_dir=resources_base_dir, excluded_files=excluded_files))]
299
+ # for future in futures:
300
+ # future.result() # This will propagate the exception
301
+ # total = self.get_local_file_len()
302
+ # self._call_progress_callback(total, total, context=f"{ckan.identifier} multi-thread upload")
303
+ # except Exception as e:
304
+ # self.stop_event.set() # Ensure all threads stop
305
+ # if ckan.verbose_extra:
306
+ # print(f"Stopping all threads because an exception occurred: {e}")
307
+ # raise e from e
308
+ # finally:
309
+ # self.stop_event.set() # Ensure all threads stop
310
+ # if ckan.verbose_extra:
311
+ # print("End of multi-threaded upload...")
312
+ # # at last, apply final actions:
313
+ # self.upload_request_final(ckan)
314
+
315
+
316
+ ## download ------------------------------------------------
317
def list_remote_resources(self, ckan:CkanApi, *, excluded_resource_names:Set[str]=None,
                          cancel_if_present: bool = True) -> List[str]:
    """
    Defines the list of resources to download that correspond to the definition and are not used in mono-resources.
    The result is cached on the instance.

    :param ckan: CKAN API object used to obtain the package information
    :param excluded_resource_names: resource names of mono-resources (None means no exclusion)
    :param cancel_if_present: reuse the cached list if it was built with the same exclusions
    :return: sorted list of the matching resource names
    """
    # Normalise before the cache test: the cache stores a set, so comparing it with None
    # would never match and the package would be queried again on every default call.
    if excluded_resource_names is None:
        excluded_resource_names = set()
    if (cancel_if_present and self.remote_resource_names is not None
            and self.excluded_resource_names == excluded_resource_names):
        return self.remote_resource_names
    package_info = ckan.get_package_info_or_request(self.package_name)
    resource_names = set(package_info.resources_id_index.keys())
    # the wildcard is derived from dir_name (not from the resource name)
    resource_name_glob = glob_name(self.dir_name)
    matching_names = set(fnmatch.filter(resource_names, resource_name_glob))  # apply name as wildcard filter
    matching_names -= excluded_resource_names
    self.remote_resource_names = sorted(matching_names)
    self.excluded_resource_names = excluded_resource_names
    return self.remote_resource_names
340
+
341
def list_remote_resource_ids(self, ckan:CkanApi, *, excluded_resource_names:Set[str]=None,
                             cancel_if_present: bool = True) -> List[str]:
    """
    Same selection as list_remote_resources, returning resource ids instead of names.

    :param ckan: CKAN API object; its map translates each name into an id
    :param excluded_resource_names: forwarded to list_remote_resources
    :param cancel_if_present: forwarded to list_remote_resources
    :return: ids of the selected resources, in the order of the names
    """
    selected_names = self.list_remote_resources(ckan, excluded_resource_names=excluded_resource_names,
                                                cancel_if_present=cancel_if_present)
    resource_ids = []
    for selected_name in selected_names:
        resource_ids.append(ckan.map.get_resource_id(selected_name, package_name=self.package_name))
    return resource_ids
347
+
348
def init_download_file_query_list(self, ckan: CkanApi, out_dir: str=None,
                                  cancel_if_present: bool = True,
                                  excluded_resource_names:Set[str]=None, **kwargs) -> List[str]:
    """
    Prepare a download: create the output directory when one is requested and list the
    remote resources to download.

    :param ckan: CKAN API object
    :param out_dir: output base directory; when None no directory is created
    :param cancel_if_present: forwarded to list_remote_resources
    :param excluded_resource_names: forwarded to list_remote_resources
    :param kwargs: accepted and ignored
    :return: the list returned by list_remote_resources
    """
    if out_dir is not None:
        target_dir = resolve_rel_path(out_dir, glob_rm_glob(self.dir_name, default_rec_dir=self.name),
                                      field=f"File/URL of resource {self.name}")
        os.makedirs(target_dir, exist_ok=True)
    return self.list_remote_resources(ckan, cancel_if_present=cancel_if_present,
                                      excluded_resource_names=excluded_resource_names)
355
+
356
def get_file_query_generator(self) -> Generator[str, Any, None]:
    """
    :return: generator yielding the cached remote resource names, in order
    """
    yield from self.remote_resource_names
359
+
360
def get_file_query_len(self) -> int:
    """
    :return: number of remote resources in the cached listing
    :raises RuntimeError: if the listing has not been built yet
    """
    names = self.remote_resource_names
    if names is None:
        raise RuntimeError("init_download_file_query_list must be called first")
    return len(names)
364
+
365
def download_file_query_item(self, ckan: CkanApi, out_dir: str, file_query_item: str) \
        -> Tuple[Union[str,None], Union[requests.Response,None]]:
    """
    Download one resource of the multi-file resource.

    :param ckan: CKAN API object
    :param out_dir: output base directory; when None the content is downloaded but not written
    :param file_query_item: name of the resource to download
    :return: ``(file path or None, response or None)``; the response is None when an existing
        file was kept, the path is None when no output directory was given
    """
    resource_name = file_query_item
    write_to_disk = out_dir is not None
    file_out = None
    if write_to_disk:
        file_out = resolve_rel_path(out_dir, glob_rm_glob(self.dir_name, default_rec_dir=self.name), resource_name,
                                    field=f"File/URL of resource {self.name}")
        already_downloaded = self.download_skip_existing and os.path.exists(file_out)
        if already_downloaded:
            if ckan.params.verbose_extra:
                print(f"Skipping existing file {file_out}")
            return file_out, None
    resource_id = ckan.map.get_resource_id(resource_name, package_name=self.package_name)
    _, response = ckan.resource_download(resource_id)
    if write_to_disk:
        with open(file_out, 'wb') as out_file:
            out_file.write(response.content)
    return file_out, response
383
+
384
def download_request_generator(self, ckan: CkanApi, out_dir: str,
                               excluded_resource_names:Set[str]=None) -> Generator[Tuple[Union[str,None], Union[requests.Response,None]], Any, None]:
    """
    Download the resources one by one, lazily.

    :param ckan: CKAN API object
    :param out_dir: output base directory
    :param excluded_resource_names: resource names which must not be downloaded
    :return: generator yielding the result of download_file_query_item for each resource
    """
    self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True,
                                       excluded_resource_names=excluded_resource_names)
    yield from (self.download_file_query_item(ckan=ckan, out_dir=out_dir, file_query_item=query_item)
                for query_item in self.get_file_query_generator())
390
+
391
+ def _unit_download_apply(self, ckan:CkanApi, file_query_item:Any, out_dir:str,
392
+ index:int, start_index:int, end_index:int, total:int, excluded_resource_names:Set[str]) -> Any:
393
+ if start_index <= index and index < end_index and file_query_item not in excluded_resource_names:
394
+ self._call_progress_callback(index, total, info=file_query_item,
395
+ context=f"{ckan.identifier} single-thread download")
396
+ self.download_file_query_item(ckan=ckan, out_dir=out_dir, file_query_item=file_query_item)
397
+ else:
398
+ pass
399
+ # self._call_progress_callback(index, total, info=file_query_item, context=f"{ckan.identifier} single-thread skip")
400
+
401
def download_request_full(self, ckan: CkanApi, out_dir: str, threads:int=1, external_stop_event=None,
                          start_index:int=0, end_index:int=None, force:bool=False,
                          excluded_resource_names:Set[str]=None) -> None:
    """
    Download all the resources of this multi-file resource.

    Every argument is forwarded unchanged to the parent implementation, whose result is returned.

    :param ckan: CKAN API object
    :param out_dir: output base directory
    :param threads: forwarded to the parent implementation
    :param external_stop_event: forwarded to the parent implementation
    :param start_index: forwarded to the parent implementation
    :param end_index: forwarded to the parent implementation
    :param force: forwarded to the parent implementation
    :param excluded_resource_names: forwarded to the parent implementation
    """
    forwarded = dict(ckan=ckan, out_dir=out_dir, threads=threads,
                     external_stop_event=external_stop_event,
                     start_index=start_index, end_index=end_index, force=force,
                     excluded_resource_names=excluded_resource_names)
    return super().download_request_full(**forwarded)
408
+ # if (not self.enable_download) and (not force):
409
+ # msg = f"Did not download resource {self.name} because download was disabled."
410
+ # warn(msg)
411
+ # return None
412
+ # if threads < 0:
413
+ # # do not download large datasets in this case
414
+ # return None
415
+ # elif threads is None or threads > 1:
416
+ # return self.download_request_full_multi_threaded(ckan=ckan, out_dir=out_dir,
417
+ # threads=threads, external_stop_event=external_stop_event,
418
+ # start_index=start_index, end_index=end_index,
419
+ # excluded_resource_names=excluded_resource_names)
420
+ # else:
421
+ # self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True,
422
+ # excluded_resource_names=excluded_resource_names)
423
+ # if ckan.verbose_extra:
424
+ # print(f"Launching single-threaded download of multi-file resource {self.name}")
425
+ # total = self.get_file_query_len()
426
+ # end_index = positive_end_index(end_index, total)
427
+ # for index, file_query_item in enumerate(self.get_file_query_generator()):
428
+ # if external_stop_event is not None and external_stop_event.is_set():
429
+ # print(f"{ckan.identifier} Interrupted")
430
+ # return
431
+ # self._unit_download_apply(ckan=ckan, file_query_item=file_query_item, out_dir=out_dir,
432
+ # index=index, start_index=start_index, end_index=end_index, total=total,
433
+ # excluded_resource_names=excluded_resource_names)
434
+ # self._call_progress_callback(total, total, context=f"{ckan.identifier} single-thread download")
435
+
436
+ # def download_file_query_item_graceful(self, ckan: CkanApi, out_dir: str, resource_name: str, index:int,
437
+ # external_stop_event=None, start_index:int=0, end_index:int=None,
438
+ # excluded_resource_names:Set[str]=None) -> None:
439
+ # """
440
+ # Implementation of download_file_query_item with checks for a multi-threaded download.
441
+ # """
442
+ # # ckan.session_reset()
443
+ # # ckan.identifier = current_thread().name
444
+ # ckan = self.thread_ckan[current_thread().name]
445
+ # total = self.get_file_query_len()
446
+ # end_index = positive_end_index(end_index, total)
447
+ # if self.stop_event.is_set():
448
+ # return
449
+ # if external_stop_event is not None and external_stop_event.is_set():
450
+ # print(f"{ckan.identifier} Interrupted")
451
+ # return
452
+ # try:
453
+ # self._unit_download_apply(ckan=ckan, file_query_item=file_query_item, out_dir=out_dir,
454
+ # index=index, start_index=start_index, end_index=end_index, total=total,
455
+ # excluded_resource_names=excluded_resource_names)
456
+ # except Exception as e:
457
+ # self.stop_event.set() # Ensure all threads stop
458
+ # if ckan.verbose_extra:
459
+ # print(f"Stopping all threads because an exception occurred in thread: {e}")
460
+ # raise e from e
461
+
462
+ # def download_request_full_multi_threaded(self, ckan: CkanApi, out_dir: str,
463
+ # threads: int = None, external_stop_event=None,
464
+ # start_index:int=0, end_index:int=-1,
465
+ # excluded_resource_names:Set[str]=None) -> None:
466
+ # """
467
+ # Multi-threaded implementation of download_request_full using ThreadPoolExecutor.
468
+ # """
469
+ # self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True, excluded_resource_names=excluded_resource_names)
470
+ # self._prepare_for_multithreading(ckan)
471
+ # try:
472
+ # with ThreadPoolExecutor(max_workers=threads, initializer=self._init_thread, initargs=(ckan,)) as executor:
473
+ # if ckan.verbose_extra:
474
+ # print(f"Launching multi-threaded download of multi-file resource {self.name}")
475
+ # futures = [executor.submit(self.download_file_query_item_graceful, ckan=ckan, out_dir=out_dir, resource_name=resource_name,
476
+ # index=index, external_stop_event=external_stop_event, start_index=start_index, end_index=end_index,
477
+ # excluded_resource_names=excluded_resource_names)
478
+ # for index, resource_name in enumerate(self.get_file_query_generator())]
479
+ # for future in futures:
480
+ # future.result() # This will propagate the exception
481
+ # total = self.get_file_query_len()
482
+ # self._call_progress_callback(total, total, context=f"multi-thread download")
483
+ # except Exception as e:
484
+ # self.stop_event.set() # Ensure all threads stop
485
+ # if ckan.verbose_extra:
486
+ # print(f"Stopping all threads because an exception occurred: {e}")
487
+ # raise e from e
488
+ # finally:
489
+ # self.stop_event.set() # Ensure all threads stop
490
+ # if ckan.verbose_extra:
491
+ # print("End of multi-threaded download...")
492
+
493
def download_sample(self, ckan:CkanApi, full_download:bool=True, **kwargs) -> Union[bytes, None]:
    """
    No sample is downloaded for a multi-file builder: always returns None and ignores its arguments.
    """
    return None
495
+
496
def download_request(self, ckan: CkanApi, out_dir: str, *, full_download:bool=True, threads:int=1,
                     force:bool=False, excluded_resource_names:Set[str]=None, **kwargs) -> None:
    """
    Download the resources when a full download is requested; otherwise do nothing.

    :param ckan: CKAN API object
    :param out_dir: output base directory
    :param full_download: when False, nothing is downloaded and None is returned
    :param threads: forwarded to download_request_full
    :param force: forwarded to download_request_full
    :param excluded_resource_names: forwarded to download_request_full
    :param kwargs: extra arguments forwarded to download_request_full
    :return: the result of download_request_full, or None
    """
    if not full_download:
        return None
    return self.download_request_full(ckan=ckan, out_dir=out_dir, threads=threads, force=force,
                                      excluded_resource_names=excluded_resource_names, **kwargs)
501
+
502
def resource_info_request(self, ckan:CkanApi, error_not_found:bool=True) -> Union[CkanResourceInfo, None]:
    """
    Always returns None: a multi-file builder covers several resource ids, so no single
    resource information can be returned. The arguments are not used.
    """
    # there are multiple resource ids => do not return info
    return None
504
+ def _to_ckan_resource_info(self, package_id:str, check_id:bool=True) -> CkanResourceInfo:
505
+ return None
@@ -0,0 +1,21 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Section of the package dedicated to the initialization of a CKAN package
5
+ """
6
+
7
+ import os
8
+
9
# usage shortcuts
# self_dir: absolute directory of this module, computed from the symlink-resolved file path
self_dir = os.path.abspath(os.path.split(os.path.realpath(__file__))[0])
# example_package_dir: the "package" sub-directory located next to this module
example_package_dir = os.path.join(self_dir, "package")
12
+ from ..builder_package import example_package_xls
13
+
14
+ from . import builder_example
15
+ from . import builder_example_aux_fun
16
+ from . import builder_example_generate_data
17
+ from . import builder_example_patch_upload
18
+ from . import builder_example_tests
19
+ from . import builder_example_policy
20
+ from . import builder_example_download
21
+
@@ -0,0 +1,21 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Function to load the example package
5
+ """
6
+ import pandas as pd
7
+
8
+ from ckanapi_harvesters.builder.builder_package import BuilderPackage
9
+ from ckanapi_harvesters.builder.example import example_package_xls
10
+
11
def load_example_package() -> BuilderPackage:
    """
    Build the example package from the example Excel workbook.

    External code execution is unlocked on BuilderPackage before the workbook
    referenced by example_package_xls is loaded.

    :return: a new BuilderPackage constructed with the loaded object as ``src``
    """
    BuilderPackage.unlock_external_code_execution()
    loaded_package = BuilderPackage.from_excel(example_package_xls)
    return BuilderPackage(src=loaded_package)
15
+
16
def load_help_page_df(*, engine:str=None) -> pd.DataFrame:
    """
    Read the "help" sheet of the example package workbook into a DataFrame.

    :param engine: engine name handed to pandas.ExcelFile; None lets pandas choose
    :return: contents of the "help" sheet, read with header=None so that no row
        is consumed as column labels
    """
    # The context manager closes the workbook when the block exits, so an
    # explicit close() inside the block would only close it a second time.
    with pd.ExcelFile(example_package_xls, engine=engine) as help_file:
        return pd.read_excel(help_file, sheet_name="help", header=None)
21
+
@@ -0,0 +1,24 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Auxiliary functions for package upload/download example
5
+ """
6
+
7
+ import pandas as pd
8
+
9
def users_upload(df_users: pd.DataFrame, file_name:str, **kwargs) -> pd.DataFrame:
    """
    Example upload function: report the user ids and the file name, change nothing.

    :param df_users: DataFrame with a "user_id" column
    :param file_name: name of the file, echoed on standard output
    :param kwargs: accepted and ignored
    :return: the same DataFrame object that was passed in
    """
    joined_ids = ",".join(map(str, df_users["user_id"].to_list()))
    print("<<< Upload function example called on users dataframe containing ids " + joined_ids)
    print(f"<<< File {file_name}")
    return df_users
13
+
14
def users_download(df_users: pd.DataFrame, file_query, **kwargs) -> pd.DataFrame:
    """
    Example download function: report the user ids and the file query, change nothing.

    :param df_users: DataFrame with a "user_id" column
    :param file_query: query object, echoed on standard output through its string form
    :param kwargs: accepted and ignored
    :return: the same DataFrame object that was passed in
    """
    joined_ids = ",".join(map(str, df_users["user_id"].to_list()))
    print("<<< Download function example called on users dataframe containing ids " + joined_ids)
    print(f"<<< File query {file_query}")
    return df_users
18
+
19
+
20
if __name__ == '__main__':
    # Small self-demonstration of the upload hook on a three-user table.
    df_users = pd.DataFrame({"user_id": [1, 2, 3]})
    # users_upload requires a file name; omitting it raises TypeError.
    # The value below is an arbitrary placeholder for the demonstration.
    df_users = users_upload(df_users, file_name="users.csv")
    print(df_users)
24
+
@@ -0,0 +1,44 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Example code to download the builder example from a CKAN server
5
+ """
6
+ from typing import Tuple
7
+ import os
8
+ import re
9
+
10
+ import pandas as pd
11
+ import numpy as np
12
+
13
+ from ckanapi_harvesters.builder.builder_package import BuilderPackage
14
+ from ckanapi_harvesters.ckan_api import CkanApi
15
+
16
+ from ckanapi_harvesters.builder.example import example_package_xls
17
# Absolute directory of this module, computed from the symlink-resolved file path.
self_dir = os.path.abspath(os.path.split(os.path.realpath(__file__))[0])
# Download target: "package_download" made absolute against the current working
# directory at import time (it is not derived from self_dir).
example_package_download_dir = os.path.abspath("package_download")
19
+
20
+
21
def run(ckan:CkanApi = None):
    """
    Download the builder example package into example_package_download_dir.

    :param ckan: optional CKAN API instance; it is passed to the package's
        init_ckan and replaced by the value that call returns
    """
    BuilderPackage.unlock_external_code_execution()

    package_builder = BuilderPackage.from_excel(example_package_xls)
    ckan = package_builder.init_ckan(ckan)
    ckan.input_missing_info(input_args_if_necessary=True, input_owner_org=True)
    ckan.set_verbosity(True)

    # download into example_package_download_dir
    download_options = dict(
        full_download=True,
        threads=3,  # > 1: number of threads to download large datasets
        skip_existing=False,
        rm_dir=True,
    )
    package_builder.download_request_full(ckan, example_package_download_dir, **download_options)

    for message in ("Package downloaded in", example_package_download_dir):
        print(message)
36
+
37
+
38
if __name__ == '__main__':
    # Script entry point: create a CkanApi, let it configure itself from the
    # command-line arguments, then run the example download.
    ckan = CkanApi(None)
    ckan.initialize_from_cli_args()
    run(ckan)
42
+
43
+
44
+