ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,126 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to initialize a resource builder from a row
5
+ """
6
+ from typing import Union
7
+
8
+ import pandas as pd
9
+
10
+ from ckanapi_harvesters.ckan_api import CkanApiMap
11
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo, CkanDataStoreInfo
12
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import assert_or_raise
13
+ from ckanapi_harvesters.auxiliary.ckan_defs import ckan_tags_sep
14
+ from ckanapi_harvesters.auxiliary.ckan_errors import (UnexpectedError)
15
+ from ckanapi_harvesters.builder.builder_errors import MissingDataStoreInfoError
16
+ from ckanapi_harvesters.builder.builder_resource import (BuilderResourceABC, BuilderFileBinary, BuilderUrl,
17
+ BuilderResourceUnmanaged)
18
+ from ckanapi_harvesters.builder.builder_resource_multi_file import BuilderMultiFile
19
+ from ckanapi_harvesters.builder.builder_resource_datastore import (BuilderDataStoreABC, BuilderDataStoreFile,
20
+ BuilderResourceIgnored)
21
+ from ckanapi_harvesters.builder.builder_resource_multi_datastore import BuilderMultiDataStore
22
+ from ckanapi_harvesters.builder.builder_resource_datastore_url import BuilderDataStoreUrl
23
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_harvester import BuilderDataStoreHarvester
24
+ from ckanapi_harvesters.builder.builder_resource_datastore_unmanaged import BuilderDataStoreUnmanaged
25
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import BuilderDataStoreMultiABC
26
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_folder import BuilderDataStoreFolder
27
+ from ckanapi_harvesters.builder.builder_field import BuilderField
28
+
29
+
30
# Row-count limit read by init_resource_from_ckan: a CSV DataStore whose row count is
# strictly greater than this value is converted with BuilderDataStoreFolder.from_file_datastore.
# None (the default) disables that conversion.
import_as_folder_row_count_threshold: Union[int,None] = None
31
+
32
+
33
def init_resource_from_df(row: pd.Series, base_dir:str=None) -> BuilderResourceABC:
    """
    Create the resource builder designated by the "mode" entry of a row and load the row into it.

    The mode keyword is lower-cased and stripped of surrounding whitespace before being matched.

    :param row: row describing the resource; its "mode" entry selects the builder type
    :param base_dir: passed through unchanged to the builder's ``_load_from_df_row``
    :return: the new builder, after ``_load_from_df_row`` has been called on it
    :raises ValueError: if the mode keyword does not match any known mode
    """
    mode = row["mode"].lower().strip()
    # keyword -> builder type; the builder is only instantiated once the keyword is known to be valid
    builder_types = {
        "file": BuilderFileBinary,
        "url": BuilderUrl,
        "datastore from file": BuilderDataStoreFile,
        "datastore from folder": BuilderDataStoreFolder,
        "datastore from url": BuilderDataStoreUrl,
        "datastore from harvester": BuilderDataStoreHarvester,
        "unmanaged": BuilderResourceUnmanaged,
        "unmanaged datastore": BuilderDataStoreUnmanaged,
        "multifile": BuilderMultiFile,
        "multidatastore": BuilderMultiDataStore,
        "ignored": BuilderResourceIgnored,
    }
    builder_type = builder_types.get(mode)
    if builder_type is None:
        raise ValueError(f"{mode} is not a valid mode")
    resource_builder = builder_type()
    resource_builder._load_from_df_row(row=row, base_dir=base_dir)
    return resource_builder
67
+
68
+
69
def init_resource_from_ckan(ckan: CkanApiMap, resource_info: CkanResourceInfo) -> BuilderResourceABC:
    """
    Build a resource builder from the information the CKAN API returned for a resource.

    The builder type is chosen as follows:

    - resource with a populated DataStore (row count known and at least one field):
      ``BuilderDataStoreUrl`` if the download URL is external, otherwise
      ``BuilderDataStoreUnmanaged`` for the "csv" format (converted through
      ``BuilderDataStoreFolder.from_file_datastore`` when the row count exceeds
      ``import_as_folder_row_count_threshold``); field builders are then filled from the DataStore fields.
    - resource without DataStore but with an external download URL: ``BuilderUrl``.
    - anything else: ``BuilderResourceUnmanaged``.

    :param ckan: CKAN API object, used to tell internal URLs from external ones
    :param resource_info: resource description; its DataStore information must have been queried
    :return: the builder, with ``package_name`` set to the package id of the resource
    :raises MissingDataStoreInfoError: if the DataStore information was not queried
    :raises UnexpectedError: for a DataStore whose format is not handled, or for an
        external resource flagged as having an active DataStore
    """
    assert_or_raise(resource_info.datastore_queried(), MissingDataStoreInfoError())

    def has_external_url() -> bool:
        # evaluated lazily, inside the branch that needs it
        url = resource_info.download_url
        return len(url) > 0 and not ckan.is_url_internal(url)

    def load(builder):
        # feed the collected fields to the builder as if they came from a worksheet row
        builder._load_from_df_row(row=pd.Series(fields))
        return builder

    state = resource_info.state
    fields = {
        "name": resource_info.name,
        "format": resource_info.format,
        "description": resource_info.description,
        "state": "" if state is None else state.name,
        "file/url": resource_info.name,
        "primary key": "",
        "indexes": "",
        "known id": resource_info.id,
        "known url": resource_info.download_url,
    }
    datastore_info = resource_info.datastore_info
    is_datastore = (isinstance(datastore_info, CkanDataStoreInfo)
                    and datastore_info.row_count is not None
                    and len(datastore_info.fields_id_list) > 0)
    if is_datastore:
        # DataStore
        fields["indexes"] = ckan_tags_sep.join(datastore_info.index_fields)
        fields["aliases"] = ckan_tags_sep.join(datastore_info.aliases)
        if has_external_url():
            fields["file/url"] = resource_info.download_url
            resource = load(BuilderDataStoreUrl())
        elif resource_info.format.lower() == "csv":
            resource = load(BuilderDataStoreUnmanaged())
            threshold = import_as_folder_row_count_threshold
            if threshold is not None and datastore_info.row_count > threshold:
                resource = BuilderDataStoreFolder.from_file_datastore(resource)
        else:
            raise UnexpectedError(f"Format of data store {resource_info.name} ({resource_info.format}) is not recognized")
        # load fields information
        resource.field_builders = {
            field_id: BuilderField._from_ckan_field(datastore_info.fields_dict[field_id])
            for field_id in datastore_info.fields_id_list
        }
    elif has_external_url():
        # external resource
        fields["file/url"] = resource_info.download_url
        resource = load(BuilderUrl())
        # NOTE(review): the isinstance test is against CkanResourceInfo, not CkanDataStoreInfo;
        # this looks like a typo but is kept as found - confirm the intended type.
        assert_or_raise(not (resource_info.datastore_active
                             or isinstance(resource_info.datastore_info, CkanResourceInfo)),
                        UnexpectedError())
    else:
        # file
        resource = load(BuilderResourceUnmanaged())
    resource.package_name = resource_info.package_id
    return resource
126
+
@@ -0,0 +1,361 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to upload metadata to the CKAN server to create/update an existing package
5
+ The metadata is defined by the user in an Excel worksheet
6
+ This file implements the basic resources. See builder_datastore for specific functions to initiate datastores.
7
+ """
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ import threading
10
+ from threading import current_thread
11
+ from typing import Any, Generator, Union, Callable, Set, List, Dict, Tuple
12
+ from abc import ABC, abstractmethod
13
+ import io
14
+ import os
15
+ import glob
16
+ import fnmatch
17
+ from warnings import warn
18
+ import copy
19
+
20
+ import pandas as pd
21
+ import requests
22
+
23
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
24
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element
25
+ from ckanapi_harvesters.auxiliary.ckan_defs import ckan_tags_sep
26
+ from ckanapi_harvesters.ckan_api import CkanApi
27
+ from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice, CkanResourceInfo
28
+ from ckanapi_harvesters.builder.builder_aux import positive_end_index
29
+ from ckanapi_harvesters.builder.builder_errors import ResourceFileNotExistMessage
30
+ from ckanapi_harvesters.builder.builder_field import BuilderField
31
+ from ckanapi_harvesters.builder.builder_resource import BuilderResourceABC
32
+
33
# Module-level switch, enabled by default. It is not read anywhere in this module.
# NOTE(review): presumably consulted by the multi-file builders to leave out files
# that do not belong to the resource - confirm against its users.
multi_file_exclude_other_files:bool = True
34
+
35
+
36
def default_progress_callback(index:int, total:int, info:Any, *, context:str=None, **kwargs) -> None:
    """
    Default progress reporter: prints one line describing the current step to stdout.

    :param index: current step; when equal to ``total`` the "Finished" line is printed
    :param total: total number of steps
    :param info: optional description of the step: a string is printed as is, a DataFrame is
        shown by its ``attrs["source"]`` (or ``<DataFrame>`` if absent), a list as ``<records>``,
        anything else through ``str()``; None prints the progress without description
    :param context: text printed at the start of the line (empty if None)
    :param kwargs: accepted and ignored
    """
    prefix = "" if context is None else context
    if index == total:
        # info is None
        print(f"{prefix} Finished {index}/{total} (100%)")
        return
    progress = f"{prefix} Request {index}/{total} ({index/total*100.0:.2f}%)"
    if info is None:
        print(progress)
        return
    if isinstance(info, str):
        label = info
    elif isinstance(info, pd.DataFrame):
        label = str(info.attrs["source"]) if "source" in info.attrs else "<DataFrame>"
    elif isinstance(info, list):
        label = "<records>"
    else:
        label = str(info)
    print(progress + ": " + label)
57
+
58
+
59
class BuilderMultiABC(ABC):
    """
    Abstract base for builders whose upload or download is made of several parts.

    Subclasses describe the parts (local files for upload, file queries for download) and how a
    single part is processed; this class provides the single-threaded and multi-threaded
    (ThreadPoolExecutor) loops over those parts, progress reporting and interruption handling.
    """

    def __init__(self):
        # called as callback(index, total, info=..., context=..., **progress_callback_kwargs); None disables reporting
        self.progress_callback: Union[Callable[[int, int, Any], None], None] = default_progress_callback
        self.progress_callback_kwargs: dict = {}
        # set when a worker fails, so that the remaining queued parts are skipped
        self.stop_event = threading.Event()
        # one CKAN API copy per worker thread, keyed by thread name
        self.thread_ckan: Dict[str, CkanApi] = {}
        self.enable_multi_threaded_upload:bool = True
        self.enable_multi_threaded_download:bool = True
        # from Resource (for code validation)
        self.name:str = ""
        self.enable_download:bool = True

    def copy(self, *, dest=None):
        """
        Copy the progress callback and multi-threading switches to ``dest``.

        ``stop_event`` and the per-thread CKAN objects are not copied.

        :param dest: object receiving the settings; must be provided (None is not handled here)
        :return: ``dest``
        """
        dest.progress_callback = self.progress_callback
        dest.progress_callback_kwargs = copy.deepcopy(self.progress_callback_kwargs)
        dest.enable_multi_threaded_upload = self.enable_multi_threaded_upload
        dest.enable_multi_threaded_download = self.enable_multi_threaded_download
        # do not copy stop_event
        return dest

    def _call_progress_callback(self, index:int, total:int, *, info:Any=None, context:str=None) -> None:
        """
        Invoke the progress callback, if any, with the stored extra keyword arguments.
        """
        if self.progress_callback is not None:
            self.progress_callback(index, total, info=info, context=context, **self.progress_callback_kwargs)

    def _prepare_for_multithreading(self, ckan: CkanApi):
        """
        Reset the state shared between worker threads before starting a pool.
        """
        self.stop_event.clear()
        self.thread_ckan.clear()

    def _init_thread(self, ckan: CkanApi):
        """
        Worker initializer: register a copy of ``ckan`` dedicated to the current thread.
        """
        thread_name = current_thread().name
        ckan_thread = ckan.copy(new_identifier=thread_name)
        ckan_thread.prepare_for_multithreading(True)  # prepare CKAN object for multi-threading
        self.thread_ckan[thread_name] = ckan_thread

    def _terminate_thread(self):
        """
        Disconnect and forget every per-thread CKAN object.
        """
        for ckan in self.thread_ckan.values():
            ckan.disconnect()
        self.thread_ckan.clear()

    ## upload -----------------------------------------------------------------
    @abstractmethod
    def init_local_files_list(self, resources_base_dir:str, cancel_if_present:bool=True, **kwargs) -> List[str]:
        """
        Behavior to list parts of an upload.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_local_file_len(self) -> int:
        """
        Get the number of parts of the upload.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_local_file_generator(self, resources_base_dir:str, **kwargs) -> Generator[Any, None, None]:
        """
        Returns an iterator over the parts of the upload.
        """
        raise NotImplementedError()

    @abstractmethod
    def upload_request_final(self, ckan:CkanApi, *, force:bool=False) -> None:
        """
        Actions applied once, after all the parts were uploaded.
        """
        raise NotImplementedError()

    @abstractmethod
    def _unit_upload_apply(self, *, ckan:CkanApi, file:Any,
                           index:int, start_index:int, end_index:int, total:int, **kwargs) -> Any:
        """
        Unitary function deciding whether to perform upload and making the steps for the upload.
        """
        raise NotImplementedError()

    def upload_request_full(self, ckan:CkanApi, resources_base_dir:str, *,
                            threads:int=1, external_stop_event=None,
                            start_index:int=0, end_index:int=None, **kwargs) -> None:
        """
        Perform all the upload requests.

        :param ckan: CKAN API object
        :param resources_base_dir: base directory of the local files
        :param threads: negative: do nothing; None or greater than 1: multi-threaded upload
            (if enabled for this builder, None letting the executor choose the pool size);
            otherwise single-threaded upload
        :param external_stop_event: optional event; once set, the single-threaded loop stops
            before the next part and without applying the final actions
        :param start_index: forwarded to the unitary upload function
        :param end_index: forwarded to the unitary upload function, after conversion by positive_end_index
        :return:
        """
        # None must be tested first: comparing None to an int raises a TypeError
        if threads is not None and threads < 0:
            # cancel large uploads in this case
            return None
        if (threads is None or threads > 1) and self.enable_multi_threaded_upload:
            return self.upload_request_full_multi_threaded(ckan=ckan, resources_base_dir=resources_base_dir,
                                                           threads=threads, external_stop_event=external_stop_event,
                                                           start_index=start_index, end_index=end_index, **kwargs)
        self.init_local_files_list(resources_base_dir=resources_base_dir, cancel_if_present=True, **kwargs)
        if ckan.params.verbose_extra:
            print(f"Launching single-threaded upload of multi-file resource {self.name}")
        total = self.get_local_file_len()
        end_index = positive_end_index(end_index, total)
        for index, file_path in enumerate(self.get_local_file_generator(resources_base_dir=resources_base_dir, **kwargs)):
            if external_stop_event is not None and external_stop_event.is_set():
                print(f"{ckan.identifier} Interrupted")
                return
            self._unit_upload_apply(ckan=ckan, file=file_path, index=index,
                                    start_index=start_index, end_index=end_index, total=total, **kwargs)
        self._call_progress_callback(total, total, context=f"{ckan.identifier} single-thread upload")
        # at last, apply final actions:
        self.upload_request_final(ckan)

    def upload_request_graceful(self, ckan:CkanApi, file_path: str, *, index:int,
                                external_stop_event=None,
                                start_index:int=0, end_index:int=None, **kwargs) -> None:
        """
        Calls the unitary upload function with checks specific to multi-threading.

        The ``ckan`` argument is replaced by the CKAN object registered for the current thread.
        Nothing is done if a stop was requested. Any exception sets ``stop_event`` so that the
        other workers skip their remaining parts, then propagates.

        :return:
        """
        ckan = self.thread_ckan[current_thread().name]
        total = self.get_local_file_len()
        end_index = positive_end_index(end_index, total)
        if self.stop_event.is_set():
            return
        if external_stop_event is not None and external_stop_event.is_set():
            print(f"{ckan.identifier} Interrupted")
            return
        try:
            self._unit_upload_apply(ckan=ckan, file=file_path, index=index,
                                    start_index=start_index, end_index=end_index, total=total, **kwargs)
        except Exception as e:
            self.stop_event.set()  # Ensure all threads stop
            if ckan.params.verbose_extra:
                print(f"Stopping all threads because an exception occurred in thread: {e}")
            raise  # re-raise as is, keeping the original traceback

    def upload_request_full_multi_threaded(self, ckan:CkanApi, resources_base_dir:str,
                                           threads:int=1, external_stop_event=None,
                                           start_index:int=0, end_index:int=None, **kwargs):
        """
        Multi-threaded implementation of upload_request_full, using ThreadPoolExecutor.

        The per-thread CKAN objects are always disconnected at the end. The final actions
        (upload_request_final) are applied only when every part succeeded, as in the
        single-threaded implementation.
        """
        self.init_local_files_list(resources_base_dir=resources_base_dir, cancel_if_present=True, **kwargs)
        self._prepare_for_multithreading(ckan)
        try:
            with ThreadPoolExecutor(max_workers=threads, initializer=self._init_thread, initargs=(ckan,)) as executor:
                if ckan.params.verbose_extra:
                    print(f"Launching multi-threaded upload of multi-file resource {self.name}")
                futures = [executor.submit(self.upload_request_graceful, ckan=ckan, file_path=file_path, index=index,
                                           start_index=start_index, end_index=end_index, external_stop_event=external_stop_event,
                                           **kwargs)
                           for index, file_path in enumerate(self.get_local_file_generator(resources_base_dir=resources_base_dir, **kwargs))]
                for future in futures:
                    future.result()  # This will propagate the exception
            total = self.get_local_file_len()
            self._call_progress_callback(total, total, context=f"{ckan.identifier} multi-thread upload")
        except Exception as e:
            self.stop_event.set()  # Ensure all threads stop
            if ckan.params.verbose_extra:
                print(f"Stopping all threads because an exception occurred: {e}")
            raise  # re-raise as is, keeping the original traceback
        finally:
            if ckan.params.verbose_extra:
                print("End of multi-threaded upload...")
            # release the per-thread connections whether the upload succeeded or not
            self._terminate_thread()
        # at last, apply final actions:
        self.upload_request_final(ckan)

    ## download -------------------------------------------------------------
    @abstractmethod
    def init_download_file_query_list(self, ckan: CkanApi, out_dir: str, cancel_if_present: bool = True, **kwargs) -> List[Any]:
        """
        Determine the list of queries to download to reconstruct the uploaded parts.
        By default, the unique combinations of the first columns of the primary key are used.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_file_query_generator(self) -> Generator[Any, Any, None]:
        """
        Returns an iterator on all the file_queries.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_file_query_len(self) -> int:
        """
        Returns the total number of file_queries.
        """
        raise NotImplementedError()

    @abstractmethod
    def download_file_query_item(self, ckan: CkanApi, out_dir: str, file_query_item: Any) -> Any:
        """
        Download the file_query item with its arguments.
        """
        raise NotImplementedError()

    @abstractmethod
    def download_request_generator(self, ckan: CkanApi, out_dir: str) -> Generator[Any, Any, None]:
        """
        Generator to apply treatments after each request (single-threaded).

        :param ckan:
        :param out_dir:
        :return:
        """
        raise NotImplementedError()

    @abstractmethod
    def _unit_download_apply(self, ckan:CkanApi, file_query_item:Any, out_dir:str,
                             index:int, start_index:int, end_index:int, total:int, **kwargs) -> Any:
        """
        Unitary function deciding whether to perform download and making the steps for the request.
        """
        raise NotImplementedError()

    def download_request_full(self, ckan: CkanApi, out_dir: str, threads:int=1, external_stop_event=None,
                              start_index:int=0, end_index:int=None, force:bool=False, **kwargs) -> None:
        """
        Perform all the download requests.

        :param ckan: CKAN API object
        :param out_dir: output directory
        :param threads: negative: do nothing; None or greater than 1: multi-threaded download
            (if enabled for this builder, None letting the executor choose the pool size);
            otherwise single-threaded download
        :param external_stop_event: optional event; once set, the single-threaded loop stops before the next query
        :param start_index: forwarded to the unitary download function
        :param end_index: forwarded to the unitary download function, after conversion by positive_end_index
        :param force: download even if downloads are disabled for this resource
        :return:
        """
        if (not self.enable_download) and (not force):
            msg = f"Did not download resource {self.name} because download was disabled."
            warn(msg)
            return None
        # None must be tested first: comparing None to an int raises a TypeError
        if threads is not None and threads < 0:
            # do not download large datasets in this case
            return None
        if (threads is None or threads > 1) and self.enable_multi_threaded_download:
            return self.download_request_full_multi_threaded(ckan=ckan, out_dir=out_dir,
                                                             threads=threads, external_stop_event=external_stop_event,
                                                             start_index=start_index, end_index=end_index, **kwargs)
        self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True, **kwargs)
        if ckan.params.verbose_extra:
            print(f"Launching single-threaded download of multi-file resource {self.name}")
        total = self.get_file_query_len()
        end_index = positive_end_index(end_index, total)
        for index, file_query_item in enumerate(self.get_file_query_generator()):
            if external_stop_event is not None and external_stop_event.is_set():
                print(f"{ckan.identifier} Interrupted")
                return
            self._unit_download_apply(ckan=ckan, file_query_item=file_query_item, out_dir=out_dir,
                                      index=index, start_index=start_index, end_index=end_index, total=total, **kwargs)
        self._call_progress_callback(total, total, context=f"{ckan.identifier} single-thread download")

    def download_file_query_item_graceful(self, ckan: CkanApi, out_dir: str, file_query_item: Any, index:int,
                                          external_stop_event=None, start_index:int=0, end_index:int=None, **kwargs) -> None:
        """
        Calls the unitary download function with checks for a multi-threaded download.

        The ``ckan`` argument is replaced by the CKAN object registered for the current thread.
        Nothing is done if a stop was requested. Any exception sets ``stop_event`` so that the
        other workers skip their remaining queries, then propagates.
        """
        ckan = self.thread_ckan[current_thread().name]
        total = self.get_file_query_len()
        end_index = positive_end_index(end_index, total)
        if self.stop_event.is_set():
            return
        if external_stop_event is not None and external_stop_event.is_set():
            print(f"{ckan.identifier} Interrupted")
            return
        try:
            self._unit_download_apply(ckan=ckan, file_query_item=file_query_item, out_dir=out_dir,
                                      index=index, start_index=start_index, end_index=end_index, total=total, **kwargs)
        except Exception as e:
            self.stop_event.set()  # Ensure all threads stop
            if ckan.params.verbose_extra:
                print(f"Stopping all threads because an exception occurred in thread: {e}")
            raise  # re-raise as is, keeping the original traceback

    def download_request_full_multi_threaded(self, ckan: CkanApi, out_dir: str,
                                             threads: int = None, external_stop_event=None,
                                             start_index:int=0, end_index:int=-1, **kwargs) -> None:
        """
        Multi-threaded implementation of download_request_full using ThreadPoolExecutor.

        The per-thread CKAN objects are always disconnected at the end.
        """
        self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True, **kwargs)
        self._prepare_for_multithreading(ckan)
        try:
            with ThreadPoolExecutor(max_workers=threads, initializer=self._init_thread, initargs=(ckan,)) as executor:
                if ckan.params.verbose_extra:
                    print(f"Launching multi-threaded download of multi-file resource {self.name}")
                futures = [executor.submit(self.download_file_query_item_graceful, ckan=ckan, out_dir=out_dir, file_query_item=file_query_item,
                                           index=index, external_stop_event=external_stop_event, start_index=start_index, end_index=end_index, **kwargs)
                           for index, file_query_item in enumerate(self.get_file_query_generator())]
                for future in futures:
                    future.result()  # This will propagate the exception
            total = self.get_file_query_len()
            # same context format as the three other upload/download loops
            self._call_progress_callback(total, total, context=f"{ckan.identifier} multi-thread download")
        except Exception as e:
            self.stop_event.set()  # Ensure all threads stop
            if ckan.params.verbose_extra:
                print(f"Stopping all threads because an exception occurred: {e}")
            raise  # re-raise as is, keeping the original traceback
        finally:
            if ckan.params.verbose_extra:
                print("End of multi-threaded download...")
            # release the per-thread connections whether the download succeeded or not
            self._terminate_thread()
@@ -0,0 +1,146 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to upload metadata to the CKAN server to create/update an existing package
5
+ The metadata is defined by the user in an Excel worksheet
6
+ This file implements the basic resources. See builder_datastore for specific functions to initiate datastores.
7
+ """
8
+ from typing import Any, Generator, Union, Set, List, Dict, Tuple
9
+ import os
10
+ import requests
11
+ import copy
12
+
13
+ import pandas as pd
14
+
15
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element, find_duplicates
16
+ from ckanapi_harvesters.auxiliary.ckan_defs import ckan_tags_sep
17
+ from ckanapi_harvesters.auxiliary.ckan_errors import DuplicateNameError
18
+ from ckanapi_harvesters.auxiliary.path import resolve_rel_path, glob_rm_glob
19
+ from ckanapi_harvesters.ckan_api import CkanApi
20
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
21
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo
22
+ from ckanapi_harvesters.builder.builder_field import BuilderField
23
+ from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreFile
24
+ from ckanapi_harvesters.builder.builder_resource_multi_file import BuilderMultiFile
25
+
26
+
27
class BuilderMultiDataStore(BuilderMultiFile):
    """
    Multi-file resource builder in which every file is handled through its own BuilderDataStoreFile.

    The field builders, primary key, indexes, auxiliary function names and upload data cleaner
    held by this object are handed to the per-file builder created by _data_store_builder_of_file.
    """

    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None):
        """
        All arguments are forwarded to the parent constructor; the DataStore-specific attributes start unset.
        """
        super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        # field name -> field builder; None until fields are loaded
        self.field_builders: Union[Dict[str, BuilderField],None] = None
        # None means "not specified"; an empty list is set when the sheet contains the literal "none"
        self.primary_key: Union[List[str],None] = None
        self.indexes: Union[List[str],None] = None
        self.aux_upload_fun_name:str = ""
        self.aux_download_fun_name:str = ""
        self.data_cleaner_upload:Union[CkanDataCleanerABC,None] = None

    def copy(self, *, dest=None):
        """
        Copy this builder into dest (a new BuilderMultiDataStore when dest is None) and return it.

        Field builders, primary key and indexes are deep-copied.
        """
        if dest is None:
            dest = BuilderMultiDataStore()
        super().copy(dest=dest)
        dest.field_builders = copy.deepcopy(self.field_builders)
        dest.primary_key = copy.deepcopy(self.primary_key)
        dest.indexes = copy.deepcopy(self.indexes)
        dest.aux_upload_fun_name = self.aux_upload_fun_name
        dest.aux_download_fun_name = self.aux_download_fun_name
        # the cleaner is shared by reference, as in _data_store_builder_of_file
        dest.data_cleaner_upload = self.data_cleaner_upload
        return dest

    @staticmethod
    def _split_field_names(names_string: str) -> List[str]:
        """
        Parse a separated list of field names; the literal "none" (any case) gives an empty list.
        """
        if names_string.lower() == "none":
            return []
        return [field.strip() for field in names_string.split(ckan_tags_sep)]

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """
        Load the resource definition from a row; columns "primary key" and "indexes" are required,
        "upload function" and "download function" are optional.

        :param row: row of the resources table
        :param base_dir: not used by this method
        """
        # NOTE(review): base_dir is not forwarded to the parent loader - confirm this is intended
        super()._load_from_df_row(row=row)
        primary_keys_string: str = _string_from_element(row["primary key"])
        indexes_string: str = _string_from_element(row["indexes"])
        # attributes are left untouched when the cell yields None
        if primary_keys_string is not None:
            self.primary_key = self._split_field_names(primary_keys_string)
        if indexes_string is not None:
            self.indexes = self._split_field_names(indexes_string)
        if "upload function" in row.keys():
            self.aux_upload_fun_name = _string_from_element(row["upload function"], empty_value="")
        if "download function" in row.keys():
            self.aux_download_fun_name = _string_from_element(row["download function"], empty_value="")

    def _load_fields_df(self, fields_df: pd.DataFrame):
        """
        Rebuild field_builders from a fields table, one BuilderField per row.

        The column names of fields_df are lower-cased and stripped in place.
        """
        fields_df.columns = fields_df.columns.map(str.lower)
        fields_df.columns = fields_df.columns.map(str.strip)
        self.field_builders = {}
        for _, row in fields_df.iterrows():
            field_builder = BuilderField()
            field_builder._load_from_df_row(row=row)
            self.field_builders[field_builder.name] = field_builder

    def _check_field_duplicates(self):
        """
        Raise DuplicateNameError if two field builders share a name; no-op when no fields are defined.
        """
        if self.field_builders is None:
            # nothing loaded yet: there is nothing to check
            return
        duplicates = find_duplicates([field_builder.name for field_builder in self.field_builders.values()])
        if len(duplicates) > 0:
            raise DuplicateNameError("Field", duplicates)

    def _get_fields_dict(self) -> Union[Dict[str, dict], None]:
        """
        Return field name -> field dictionary, or None when no fields are defined.

        :raises DuplicateNameError: when two field builders share a name
        """
        self._check_field_duplicates()
        if self.field_builders is None:
            return None
        return {field_builder.name: field_builder._to_dict() for field_builder in self.field_builders.values()}

    def _get_fields_df(self) -> pd.DataFrame:
        """
        Return the field dictionaries as a DataFrame, one row per field (empty when no fields are defined).
        """
        fields_dict = self._get_fields_dict()
        fields_dict_list = [] if fields_dict is None else list(fields_dict.values())
        return pd.DataFrame.from_records(fields_dict_list)

    @staticmethod
    def resource_mode_str() -> str:
        """
        Return the mode name of this builder: "MultiDataStore".
        """
        return "MultiDataStore"

    def _to_dict(self, include_id:bool=True) -> dict:
        """
        Extend the parent dictionary with the "Primary key" and "Indexes" entries.

        Both are joined with ckan_tags_sep; an unset or empty list gives an empty string.
        """
        d = super()._to_dict(include_id=include_id)
        d["Primary key"] = ckan_tags_sep.join(self.primary_key) if self.primary_key else ""
        d["Indexes"] = ckan_tags_sep.join(self.indexes) if self.indexes else ""
        return d

    def _data_store_builder_of_file(self, file_path:str) -> Tuple[BuilderDataStoreFile, str]:
        """
        Create the single-file DataStore builder for file_path.

        The builder is named after the file name and shares (by reference) the field builders,
        primary key, indexes and data cleaner of this object.

        :return: the builder and the directory part of file_path
        """
        file_dir, file_name = os.path.split(file_path)
        ds_builder = BuilderDataStoreFile(name=file_name, description=self.description, download_url=self.download_url,
                                          format=self.format, file_name=file_name)
        ds_builder.field_builders = self.field_builders
        ds_builder.primary_key = self.primary_key
        ds_builder.indexes = self.indexes
        ds_builder.package_name = self.package_name
        ds_builder.aux_upload_fun_name = self.aux_upload_fun_name
        ds_builder.aux_download_fun_name = self.aux_download_fun_name
        ds_builder.aliases = None
        ds_builder.data_cleaner_upload = self.data_cleaner_upload
        return ds_builder, file_dir


    ## Upload ----------------
    def upload_file(self, ckan:CkanApi, package_id:str, file_path:str, *,
                    reupload:bool=False, cancel_if_present:bool=True) -> CkanResourceInfo:
        """
        Upload one file through the patch_request of its per-file DataStore builder.

        :param cancel_if_present: accepted for interface compatibility; not used by this method
        """
        ds_builder, file_dir = self._data_store_builder_of_file(file_path=file_path)
        return ds_builder.patch_request(ckan=ckan, package_id=package_id, reupload=reupload,
                                        resources_base_dir=file_dir)


    ## Download --------------
    def download_file_query_item_df(self, ckan: CkanApi, out_dir: str, file_query_item: str, full_download:bool=True) -> Tuple[str, pd.DataFrame]:
        """
        Download the resource named file_query_item through its per-file DataStore builder.

        :return: the downloaded_destination of the per-file builder and the result of its download_request
        """
        resource_name = file_query_item
        ds_builder, _ = self._data_store_builder_of_file(file_path=resource_name)
        file_dir = resolve_rel_path(out_dir, glob_rm_glob(self.dir_name), field=f"File/URL of resource {self.name}")
        df = ds_builder.download_request(ckan, out_dir=file_dir, full_download=full_download)
        return ds_builder.downloaded_destination, df

    def download_file_query_item(self, ckan: CkanApi, out_dir: str, file_query_item: str, full_download:bool=True) -> Tuple[Union[str,None], Union[requests.Response,None]]:
        """
        Same as download_file_query_item_df, but the second element of the result is always None.
        """
        downloaded_destination, _ = self.download_file_query_item_df(ckan=ckan, out_dir=out_dir, file_query_item=file_query_item,full_download=full_download)
        return downloaded_destination, None

    def download_request_generator_df(self, ckan: CkanApi, out_dir: str,
                                      excluded_resource_names:Set[str]=None) -> Generator[Tuple[Union[str,None], Union[pd.DataFrame,None]], Any, None]:
        """
        Initialize the file query list, then yield the result of download_file_query_item_df for each item.
        """
        self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True,
                                           excluded_resource_names=excluded_resource_names)
        for file_query_item in self.get_file_query_generator():
            yield self.download_file_query_item_df(ckan=ckan, out_dir=out_dir, file_query_item=file_query_item)
146
+