ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,40 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to initiate a package builder from a Dataset harvester
5
+ """
6
+ from typing import List
7
+
8
+ from ckanapi_harvesters.builder.builder_package_1_basic import BuilderPackageBasic
9
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanState
10
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_harvester import BuilderDataStoreHarvester
11
+ from ckanapi_harvesters.harvesters.harvester_abc import DatasetHarvesterABC
12
+
13
+
14
class BuilderPackageWithHarvesters(BuilderPackageBasic):
    """
    Package builder which can be initialized from a dataset harvester.
    """

    @staticmethod
    def init_from_harvester(dataset_harvester: DatasetHarvesterABC) -> "BuilderPackageWithHarvesters":
        """
        Create a package builder from the parameters and the tables of a dataset harvester.
        The package is marked as private and one resource builder is registered per table name.

        :param dataset_harvester: harvester providing the parameters, the table list and the table harvesters
        :return: the new package builder
        """
        package = BuilderPackageWithHarvesters()
        source = dataset_harvester.params
        # package metadata derived from the harvester parameters
        package.package_name = f"harvest_{source.harvest_method}_{source.database}_{source.dataset}".lower()
        package.package_attributes.title = f"Harvest result of {source.harvest_method} / database {source.database} / dataset {source.dataset}"
        package.package_attributes.description = f"Harvested from {source.url} / database {source.database} / dataset {source.dataset}"
        package.package_attributes.private = True
        # package.package_attributes.state = CkanState.Draft
        table_names: List[str] = dataset_harvester.list_tables(return_metadata=False)
        for table_name in table_names:
            harvester_of_table = dataset_harvester.get_table_harvester(table_name)
            table_builder = BuilderDataStoreHarvester()
            table_builder.name = table_name
            table_builder.harvester = harvester_of_table
            # only provide a default description if none is present after the harvester was assigned
            if table_builder.description is None:
                table_builder.description = f"dataset {source.dataset} / table {table_name}"
            # metadata is imported after a clean of metadata
            package.resource_builders[table_name] = table_builder
        return package

    def copy(self, dest=None) -> "BuilderPackageWithHarvesters":
        """
        Copy this builder into dest.

        :param dest: object receiving the copy. A new BuilderPackageWithHarvesters is created if None.
        :return: the destination object
        """
        target = BuilderPackageWithHarvesters() if dest is None else dest
        super().copy(dest=target)
        return target
@@ -0,0 +1,45 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to upload metadata to the CKAN server, with one thread per resource
5
+ """
6
+ from typing import List, Union, Dict, Callable, Any
7
+ import threading
8
+ import copy
9
+
10
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanState
11
+ from ckanapi_harvesters.builder.builder_package_2_harvesters import BuilderPackageWithHarvesters
12
+ from ckanapi_harvesters.builder.builder_resource_multi_abc import BuilderMultiABC, default_progress_callback
13
+ from ckanapi_harvesters.ckan_api import CkanApi
14
+
15
+
16
class BuilderPackageMultiThreaded(BuilderPackageWithHarvesters, BuilderMultiABC):
    """
    Package builder carrying the attributes used for multi-threaded transfers
    (progress callback, stop event, per-thread CkanApi objects, enable flags).
    """
    def __init__(self, package_name: str = None, *, package_id: str = None,
                 title: str = None, description: str = None, private: bool = None, state: CkanState = None,
                 version: str = None,
                 url: str = None, tags: List[str] = None,
                 organization_name: str = None, license_name: str = None):
        """
        All arguments are forwarded unchanged to the parent constructor.
        The multi-threading attributes are then initialized with their default values.
        """
        super().__init__(package_name=package_name, package_id=package_id,
                         title=title, description=description, private=private, state=state, version=version,
                         url=url, tags=tags, organization_name=organization_name, license_name=license_name)
        # BuilderMultiABC:
        self.progress_callback: Union[Callable[[int, int, Any], None], None] = default_progress_callback
        self.progress_callback_kwargs: dict = {}
        self.stop_event = threading.Event()
        self.thread_ckan: Dict[str, CkanApi] = {}
        self.enable_multi_threaded_upload:bool = True
        self.enable_multi_threaded_download:bool = True

    def copy(self, dest=None) -> "BuilderPackageWithHarvesters":
        """
        Copy this builder into dest, including the progress callback settings and the multi-threading flags.
        The stop event and the per-thread CkanApi objects are not copied.

        :param dest: object receiving the copy. If None, a new object of the same class as self is created
            (through its constructor without arguments) so the copy keeps its own stop event and per-thread
            CkanApi dictionary.
        :return: the destination object
        """
        if dest is None:
            # instantiate the class of self rather than the parent class: the copy of a
            # multi-threaded builder must remain a multi-threaded builder
            dest = type(self)()
        super().copy(dest=dest)
        dest.progress_callback = self.progress_callback
        dest.progress_callback_kwargs = copy.deepcopy(self.progress_callback_kwargs)
        dest.enable_multi_threaded_upload = self.enable_multi_threaded_upload
        dest.enable_multi_threaded_download = self.enable_multi_threaded_download
        return dest
42
+
43
+ # TODO: implement abstract methods
44
+
45
+
@@ -0,0 +1,589 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to upload metadata to the CKAN server to create/update an existing package
5
+ The metadata is defined by the user in an Excel worksheet
6
+ This file implements the basic resources. See builder_datastore for specific functions to initiate datastores.
7
+ """
8
+ from typing import Union, Any
9
+ from abc import ABC, abstractmethod
10
+ import os
11
+ from warnings import warn
12
+ import copy
13
+ import io
14
+
15
+ import pandas as pd
16
+
17
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
18
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import upload_prepare_requests_files_arg
19
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo
20
+ from ckanapi_harvesters.auxiliary.path import resolve_rel_path
21
+ from ckanapi_harvesters.ckan_api import CkanApi
22
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element, _bool_from_string
23
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanState
24
+ from ckanapi_harvesters.auxiliary.ckan_errors import CkanArgumentError, MissingIdError, FunctionMissingArgumentError, MandatoryAttributeError
25
+ from ckanapi_harvesters.builder.builder_errors import ResourceFileNotExistMessage, EmptyPackageNameException
26
+
27
+
28
+ builder_request_default_auth_if_ckan:Union[bool,None] = True # fill authentication headers for requests with the CkanApi requests proxy method if the same domain is used (default behavior)
29
+
30
+
31
class BuilderResourceABC(ABC):
    """
    Abstract base class of the resource builders.

    A resource builder holds the metadata of one resource of a package (name, format, description, state...)
    and declares the operations implemented by the concrete builders: loading from / exporting to a row,
    checking and uploading the source data, downloading and deleting the resource on the CKAN server.
    """
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 state:CkanState=None, enable_download:bool=True,
                 resource_id:str=None, download_url:str=None):
        """
        :param name: resource name
        :param format: resource format
        :param description: resource description
        :param state: resource state
        :param enable_download: False to skip the resource in download requests (unless forced)
        :param resource_id: resource ID, if already known
        :param download_url: download URL, if already known
        """
        self.name: Union[str,None] = name
        self.format: Union[str,None] = format
        self.description: Union[str,None] = description
        self.state:Union[CkanState,None] = state
        self.enable_download:bool = enable_download
        self.options_string: Union[str,None] = None
        # Map information, if present
        self.package_name: str = ""  # parent package name (update before any operation)
        self.known_id: Union[str,None] = resource_id
        self.download_url: Union[str,None] = download_url
        self.comment: Union[str,None] = None
        # Functions inputs/outputs
        self.sample_data_source: str = ""
        self.reupload_on_update: bool = True
        self.downloaded_destination: str = ""
        self.download_skip_existing:bool = True  # True: do not overwrite files
        self.download_error_not_found:bool = True
        self.create_default_view:bool = True

    def __copy__(self):
        return self.copy()

    @abstractmethod
    def copy(self, *, dest=None):
        """
        Copy the attributes defined by this class into dest.
        This base implementation does not create the destination object: dest must be provided.

        :param dest: object receiving the attribute values
        :return: dest
        """
        dest.name = self.name
        dest.format = self.format
        dest.description = self.description
        dest.state = self.state
        dest.enable_download = self.enable_download
        dest.options_string = self.options_string
        dest.package_name = self.package_name
        dest.known_id = self.known_id
        dest.download_url = self.download_url
        dest.comment = self.comment
        dest.sample_data_source = self.sample_data_source
        dest.reupload_on_update = self.reupload_on_update
        dest.downloaded_destination = self.downloaded_destination
        dest.download_skip_existing = self.download_skip_existing
        dest.download_error_not_found = self.download_error_not_found
        dest.create_default_view = self.create_default_view
        return dest

    def _check_mandatory_attributes(self):
        """
        :raises MandatoryAttributeError: if the resource name is not defined
        """
        if self.name is None:
            raise MandatoryAttributeError("Resource", "name")

    def init_options_from_ckan(self, ckan:CkanApi) -> None:
        """
        Function to initialize some parameters from the ckan object
        """
        pass

    @abstractmethod
    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """
        Load the attributes of the resource from a row.
        The columns "name" and "format" are read unconditionally. The columns "description", "options",
        "download", "state", "known id", "known url" and "comment" are read only if present.

        :param row: row describing the resource
        :param base_dir: not used by this base implementation
        """
        # abstract method because does not take into account file/url field
        self.name = _string_from_element(row["name"]).strip()
        self.format = _string_from_element(row["format"]).upper().strip()
        self.description = None
        if "description" in row.keys():
            self.description = _string_from_element(row["description"])
        self.enable_download = True
        if "options" in row.keys():
            self.options_string = _string_from_element(row["options"], empty_value="")
        if "download" in row.keys():
            self.enable_download = _bool_from_string(row["download"])
        self.state = None
        if "state" in row.keys():
            state = _string_from_element(row["state"])
            if state is not None:
                self.state = CkanState.from_str(state)
        # Map information, if present
        self.known_id = None
        self.download_url = None
        if "known id" in row.keys():
            self.known_id = _string_from_element(row["known id"])
        if "known url" in row.keys():
            self.download_url = _string_from_element(row["known url"])
        if "comment" in row.keys():
            self.comment = _string_from_element(row["comment"])

    def get_or_query_resource_id(self, ckan: CkanApi, cancel_if_present:bool=True, error_not_found:bool=True) -> str:
        """
        Store/retrieve resource ID in the class attributes.

        :param ckan: object used to map the resources of the package and to look up the resource ID
        :param cancel_if_present: True to return the ID already stored in known_id without a new lookup
        :param error_not_found: forwarded to the lookup as error_not_mapped
        :return: the value stored in known_id
        :raises EmptyPackageNameException: if the package_name attribute is empty
        """
        package_name = self.package_name
        if package_name == "":
            raise EmptyPackageNameException()
        if self.known_id is None or not cancel_if_present:
            ckan.map_resources(package_name, only_missing=True)
            self.known_id = ckan.map.get_resource_id(self.name, package_name=package_name, error_not_mapped=error_not_found)
        return self.known_id

    def get_or_query_package_id(self, ckan: CkanApi) -> str:
        """
        Obtain package ID from the package name. This can lead to a request to the API.

        :raises EmptyPackageNameException: if the package_name attribute is empty
        """
        package_name = self.package_name
        if package_name == "":
            raise EmptyPackageNameException()
        ckan.map_resources(package_name, only_missing=True)
        package_id = ckan.map.get_package_id(package_name=package_name)
        return package_id

    @staticmethod
    @abstractmethod
    def resource_mode_str() -> str:
        """
        :return: the name of the mode of the resource builder (used in __str__ and in the "Mode" field of _to_dict)
        """
        raise NotImplementedError()

    def __str__(self):
        return f"Resource builder for {self.name} in mode {self.resource_mode_str()}"

    @abstractmethod
    def _to_dict(self, include_id:bool=True) -> dict:
        """
        Export the attributes of the resource to a dictionary.

        :param include_id: True to add the "Known ID" and "Known URL" fields when these values are known
        :return: the dictionary. The "File/URL" field is None in this base implementation.
        """
        d = {
            "Name": self.name,
            "Format": self.format if self.format else "",
            "State": self.state.name if self.state is not None else "",
            "Mode": self.resource_mode_str(),
            "File/URL": None,  # concrete implementations must fill this field
            "Options": self.options_string,
            "Download": str(self.enable_download),
            "Description": self.description if self.description else "",
            "Primary key": "",
            "Indexes": "",
            "Upload function": "",
            "Download function": "",
            "Aliases": "",
            "Comment": self.comment if self.comment else "",
        }
        if include_id and self.known_id is not None:
            d["Known ID"] = self.known_id
        if include_id and self.download_url is not None:
            d["Known URL"] = self.download_url
        return d

    def _to_row(self) -> pd.Series:
        """
        :return: the result of _to_dict as a Series, with lower-case and stripped index labels
        """
        row = pd.Series(self._to_dict())
        row.index = row.index.map(str.lower)
        row.index = row.index.map(str.strip)
        return row

    @staticmethod
    @abstractmethod
    def sample_file_path_is_url() -> bool:
        raise NotImplementedError()

    @abstractmethod
    def get_sample_file_path(self, resources_base_dir:str) -> Union[str,None]:
        """
        Function returning the local resource file name for the sample file.

        :param resources_base_dir: base directory to find the resources on the local machine
        :return:
        """
        raise NotImplementedError()

    @abstractmethod
    def load_sample_data(self, resources_base_dir:str) -> Union[bytes,None]:
        """
        Function returning the data from the indicated resources.

        :param resources_base_dir: base directory to find the resources on the local machine
        :return:
        """
        raise NotImplementedError()

    @abstractmethod
    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
        """
        Test the presence of the files/urls used in the upload/patch requests.

        :param resources_base_dir:
        :param ckan:
        :return: None if success, error message otherwise
        """
        raise NotImplementedError()

    @abstractmethod
    def patch_request(self, ckan:CkanApi, package_id:str, *,
                      reupload:bool=None, resources_base_dir:str=None) -> CkanResourceInfo:
        """
        Function to perform all the necessary requests to initiate/reupload the resource on the CKAN server.

        :param ckan:
        :param package_id:
        :param reupload: option to reupload the resource
        :param resources_base_dir:
        :return:
        """
        # TODO: call to API resource_patch
        # ckan.resource_patch
        raise NotImplementedError()

    def upload_request(self, resources_base_dir:str, ckan:CkanApi, package_id:str):
        """
        Call patch_request with reupload=True.
        """
        # might be dead code
        # this function (patch_request) gets specialized in certain cases
        return self.patch_request(ckan, package_id, resources_base_dir=resources_base_dir, reupload=True)

    @abstractmethod
    def download_sample(self, ckan:CkanApi, full_download:bool=True, **kwargs) -> bytes:
        """
        Download the resource and return the data as bytes.

        :param ckan:
        :param full_download: Some resources like URLs are not downloaded by default. Large datasets are also limited to one request for this function by default.
        :param kwargs: additional arguments, defined by the implementations
        :return:
        """
        raise NotImplementedError()

    @abstractmethod
    def download_request(self, ckan:CkanApi, out_dir:str, *, full_download:bool=True, force:bool=False, threads:int=1) -> Any:
        """
        Download the resource and save in a file pointed by out_dir.
        In most implementations, this calls the download_sample method.

        :param ckan:
        :param out_dir:
        :param full_download: Some resources like URLs are not downloaded by default. Large datasets are treated with a multi-threaded approach.
        :param threads:
        :param force: option to bypass the enable_download attribute of resources
        :return:
        """
        raise NotImplementedError()

    def _to_ckan_resource_info(self, package_id:str, check_id:bool=True) -> CkanResourceInfo:
        """
        Return resource info object from the information of the Excel workbook.
        No requests are made but to use this data in the ckan object, the ID and name of the resource are mandatory.

        :param package_id:
        :param check_id: True to raise an error if the resource ID is not known
        :return:
        :raises MissingIdError: if check_id is True and known_id is None
        """
        if self.known_id is None and check_id:
            msg = MissingIdError("resource", self.name)
            raise msg
        resource_info = CkanResourceInfo()
        resource_info.id = self.known_id
        resource_info.package_id = package_id
        resource_info.name = self.name
        resource_info.description = self.description
        resource_info.download_url = self.download_url
        return resource_info

    def resource_info_request(self, ckan:CkanApi, error_not_found:bool=True) -> Union[CkanResourceInfo, None]:
        """
        Look up the resource ID again (the stored ID is not reused) and return the resource information.

        :param ckan:
        :param error_not_found: forwarded to the resource ID lookup
        :return: the resource information, or None if no resource ID was found and this is tolerated
            either by the caller (error_not_found=False) or by the builder (download_error_not_found=False)
        """
        resource_id = self.get_or_query_resource_id(ckan, cancel_if_present=False, error_not_found=error_not_found)
        # honor the error_not_found argument: a missing resource must not be forwarded to the API as a None ID
        if resource_id is None and not (error_not_found and self.download_error_not_found):
            return None
        res_info = ckan.get_resource_info_or_request(resource_id)
        self.known_id = resource_id
        return res_info

    def delete_request(self, ckan:CkanApi, package_id:str, *, error_not_found:bool=False):
        """
        Delete the resource from the CKAN server.
        Nothing is deleted if no resource ID is found.

        :param ckan:
        :param package_id: stored in the package_name attribute before the resource ID lookup
        :param error_not_found: forwarded to the resource ID lookup
        :return:
        """
        # NOTE(review): the package ID overwrites the package name here - confirm this is intended
        self.package_name = package_id
        resource_id = self.get_or_query_resource_id(ckan, error_not_found=error_not_found)
        if resource_id is not None:
            ckan.resource_delete(resource_id)
297
+
298
+
299
class BuilderFileABC(BuilderResourceABC, ABC):
    """
    Abstract class defining the behavior for a resource represented by a file (not a DataStore)
    """
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None, file_name:str=None):
        """
        :param file_name: name of the file of the resource (value of the "File/URL" field).
            The other arguments are forwarded to the parent constructor.
        """
        super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        self.file_name: str = file_name

    def copy(self, *, dest=None):
        """
        Copy the attributes into dest, including the file name.
        The destination object is not created here: dest must be provided.
        """
        super().copy(dest=dest)
        dest.file_name = self.file_name
        return dest

    def _check_mandatory_attributes(self):
        """
        :raises MandatoryAttributeError: if the name or the file name is not defined
        """
        super()._check_mandatory_attributes()
        if self.file_name is None:
            raise MandatoryAttributeError(self.resource_mode_str(), "File")

    @abstractmethod
    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """
        Load the attributes from a row. The file name is read from the "file/url" column.
        """
        super()._load_from_df_row(row=row, base_dir=base_dir)
        self.file_name = _string_from_element(row["file/url"])

    def patch_request(self, ckan: CkanApi, package_id: str, *, reupload: bool = None, resources_base_dir:str=None,
                      payload:Union[bytes, io.BufferedIOBase]=None) -> CkanResourceInfo:
        """
        Perform a patch of the resource on the CKAN server.
        A patch is a full update of the metadata of the resource, and of the DataStore if appropriate.
        The source file of the resource is also uploaded (or a first file for large DataStores).

        :param ckan:
        :param package_id:
        :param reupload: defaults to the reupload_on_update attribute if None
        :param resources_base_dir: base directory used to load the sample data when no payload is given
        :param payload: data to upload. If None, the result of load_sample_data is used.
        :return: the resource information returned by the resource creation call
        """
        if reupload is None:
            reupload = self.reupload_on_update
        if payload is None:
            payload = self.load_sample_data(resources_base_dir=resources_base_dir)
        payload_file_name = self.file_name
        files = upload_prepare_requests_files_arg(payload=payload, payload_name=payload_file_name)
        res_info = ckan.resource_create(package_id, name=self.name, format=self.format, description=self.description, state=self.state,
                                        files=files, datastore_create=False, auto_submit=False, create_default_view=self.create_default_view,
                                        cancel_if_exists=True, update_if_exists=True, reupload=reupload)
        self.known_id = res_info.id
        return res_info

    def download_sample(self, ckan:CkanApi, full_download:bool=True, **kwargs) -> Union[bytes, None]:
        """
        Download the resource and return its contents.

        :param ckan:
        :param full_download: not used by this implementation
        :param kwargs: forwarded to the resource download call
        :return: the contents of the response, or None if there is no response or
            if the resource was not found and download_error_not_found is False
        """
        resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=self.download_error_not_found)
        if resource_id is None and not self.download_error_not_found:
            return None
        resource_info, response = ckan.resource_download(resource_id, **kwargs)
        if response is None:
            return None
        return response.content

    def download_request(self, ckan: CkanApi, out_dir: str, *, full_download:bool=True, threads:int=1,
                         force:bool=False, **kwargs) -> None:
        """
        Download the resource and write it to a file located from out_dir and the file name.
        The path of the file is stored in the downloaded_destination attribute.

        :param ckan:
        :param out_dir: output directory. If None, the download is performed but nothing is written.
        :param full_download: forwarded to download_sample
        :param threads: not used by this implementation
        :param force: option to bypass the enable_download attribute of resources
        :param kwargs: forwarded to download_sample
        """
        if (not self.enable_download) and (not force):
            msg = f"Did not download resource {self.name} because download was disabled."
            warn(msg)
            return
        if out_dir is not None:
            self.downloaded_destination = resolve_rel_path(out_dir, self.file_name, field=f"File/URL of resource {self.name}")
            # do not overwrite a file which is already present
            if self.download_skip_existing and os.path.exists(self.downloaded_destination):
                return
        content = self.download_sample(ckan=ckan, full_download=full_download, **kwargs)
        if out_dir is not None and content is not None:
            os.makedirs(out_dir, exist_ok=True)
            # the file name can contain sub-directories: make sure the parent directory of the file exists
            parent_dir = os.path.dirname(self.downloaded_destination)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(self.downloaded_destination, "wb") as f:
                f.write(content)
374
+
375
+
376
+ # class BuilderResourceUnmanagedABC(BuilderResourceABC, ABC):
377
+ # # dead code
378
+ # def __init__(self, *, name:str=None, format:str=None, description:str=None,
379
+ # resource_id:str=None, download_url:str=None):
380
+ # super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
381
+ # self.file_name: str = name
382
+ #
383
+ # def copy(self, *, dest=None):
384
+ # super().copy(dest=dest)
385
+ # dest.file_name = self.file_name
386
+ # return dest
387
+ #
388
+ # def _load_from_df_row(self, row: pd.Series):
389
+ # super()._load_from_df_row(row=row)
390
+ # self.file_name = self.name
391
+ # self._check_mandatory_attributes()
392
+ #
393
+ # def _to_dict(self, include_id:bool=True) -> dict:
394
+ # d = super()._to_dict(include_id=include_id)
395
+ # d["File/URL"] = ""
396
+ # return d
397
+ #
398
+ # def load_sample_data(self, resources_base_dir:str) -> bytes:
399
+ # return None
400
+
401
+
402
class BuilderResourceUnmanaged(BuilderFileABC):  #, BuilderResourceUnmanagedABC):  # multiple inheritance can give undefined results
    """
    Resource builder handling the metadata of a resource only: no contents are loaded from the local machine.
    A payload can still be given to patch_request, or stored in the default_payload attribute.
    """
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None):
        super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        # the resource name is used as file name
        self.file_name: str = name
        self.default_payload: Union[bytes, io.BufferedIOBase, None] = None

    def copy(self, *, dest=None):
        """
        Copy this builder into dest (a new BuilderResourceUnmanaged if None). The default payload is deep-copied.
        """
        target = dest if dest is not None else BuilderResourceUnmanaged()
        super().copy(dest=target)
        target.file_name = self.file_name
        target.default_payload = copy.deepcopy(self.default_payload)
        return target

    @staticmethod
    def resource_mode_str() -> str:
        return "Unmanaged"

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """
        Load the attributes from a row, then replace the file name by the resource name
        and check the mandatory attributes.
        """
        super()._load_from_df_row(row=row)
        self.file_name = self.name
        self._check_mandatory_attributes()

    def _to_dict(self, include_id:bool=True) -> dict:
        """
        Export the attributes to a dictionary, with an empty "File/URL" field.
        """
        fields = super()._to_dict(include_id=include_id)
        fields["File/URL"] = ""
        return fields

    @staticmethod
    def sample_file_path_is_url() -> bool:
        return False

    def get_sample_file_path(self, resources_base_dir:str) -> Union[str,None]:
        # no local file for this mode
        return None

    def load_sample_data(self, resources_base_dir:str) -> Union[bytes,None]:
        # no local data for this mode
        return None

    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[ContextErrorLevelMessage,None]:
        # nothing to check: always returns None
        return None

    def patch_request(self, ckan:CkanApi, package_id:str, *,
                      reupload:bool=None, resources_base_dir:str=None,
                      payload:Union[bytes, io.BufferedIOBase]=None) -> CkanResourceInfo:
        """
        Create or update the resource on the CKAN server.

        :param ckan:
        :param package_id:
        :param reupload: if None, derived from the reupload_on_update attribute and the presence of a payload
        :param resources_base_dir: not used by this implementation
        :param payload: data to upload. If None, the default_payload attribute is used. No file is sent if both are None.
        :return: the resource information returned by the resource creation call
        """
        data = self.default_payload if payload is None else payload
        if reupload is None:
            reupload = self.reupload_on_update and data is not None
        if data is None:
            files = None
        else:
            files = upload_prepare_requests_files_arg(payload=data, payload_name=self.file_name)
        created = ckan.resource_create(package_id, name=self.name, format=self.format, description=self.description, state=self.state,
                                       files=files, datastore_create=False, auto_submit=False, create_default_view=self.create_default_view,
                                       cancel_if_exists=True, update_if_exists=True, reupload=reupload)
        self.known_id = created.id
        return created
460
+
461
+
462
class BuilderFileBinary(BuilderFileABC):
    """
    Concrete implementation for a binary file.
    """
    def copy(self, *, dest=None):
        """
        Copy this builder into dest (a new BuilderFileBinary if None).
        """
        if dest is None:
            dest = BuilderFileBinary()
        super().copy(dest=dest)
        return dest

    @staticmethod
    def sample_file_path_is_url() -> bool:
        return False

    def get_sample_file_path(self, resources_base_dir:str) -> str:
        """
        :param resources_base_dir: base directory to find the resources on the local machine
        :return: the path resolved from the base directory and the file name
        """
        return resolve_rel_path(resources_base_dir, self.file_name, field=f"File/URL of resource {self.name}")

    def load_sample_data(self, resources_base_dir:str) -> bytes:
        """
        Read the file of the resource and return its contents.
        The path of the file is stored in the sample_data_source attribute.

        :param resources_base_dir: base directory to find the resources on the local machine
        :return: the contents of the file
        """
        file_path = self.get_sample_file_path(resources_base_dir)
        # sample_data_source is the attribute declared and copied by the base class
        self.sample_data_source = file_path
        # sample_source was the attribute written by previous versions: kept for backward compatibility
        self.sample_source = file_path
        with open(file_path, "rb") as f:
            contents = f.read()
        return contents

    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
        """
        Check that the file of the resource exists.

        :param resources_base_dir: base directory to find the resources on the local machine
        :param ckan: not used by this implementation
        :return: None if the file exists, an error message otherwise
        """
        file_path = self.get_sample_file_path(resources_base_dir=resources_base_dir)
        if os.path.isfile(file_path):
            return None
        else:
            return ResourceFileNotExistMessage(self.name, ErrorLevel.Error, f"Missing file for resource {self.name}: {file_path}")

    @staticmethod
    def resource_mode_str() -> str:
        return "File"

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """
        Load the attributes from a row and check the mandatory attributes.
        """
        super()._load_from_df_row(row=row)
        self._check_mandatory_attributes()

    def _to_dict(self, include_id:bool=True) -> dict:
        """
        Export the attributes to a dictionary. The "File/URL" field holds the file name.
        """
        d = super()._to_dict(include_id=include_id)
        d["File/URL"] = self.file_name
        return d
505
+
506
+
507
class BuilderUrlABC(BuilderFileABC, ABC):
    """
    Abstract behavior for a resource defined by an external URL.
    """
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None, url:str=None):
        """
        Initialize the builder; every argument except url is forwarded to the parent constructor.

        :param url: external URL defining the resource
        """
        super().__init__(
            name=name,
            format=format,
            description=description,
            resource_id=resource_id,
            download_url=download_url,
        )
        self.url = url
        # the resource name is used as file name
        self.file_name: str = name

    def copy(self, *, dest=None):
        """
        Copy this builder into dest, including the url and file_name attributes.

        :param dest: object receiving the copy
        :return: the destination object
        """
        super().copy(dest=dest)
        # attributes specific to URL resources
        dest.url = self.url
        dest.file_name = self.file_name
        return dest

    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
        """
        Check the resource URL through ckan.download_url_proxy_test_head.

        :param resources_base_dir: not used by this implementation
        :param ckan: CKAN API object performing the check; without it, a warning message is returned
        :param kwargs: forwarded to ckan.download_url_proxy_test_head
        :return: a ResourceFileNotExistMessage of level Warning if ckan is None,
            otherwise the result of ckan.download_url_proxy_test_head
        """
        if ckan is not None:
            return ckan.download_url_proxy_test_head(self.url, **kwargs)
        return ResourceFileNotExistMessage(self.name, ErrorLevel.Warning, "Could not determine if resource url exists because ckan argument was not provided.")

    def _check_mandatory_attributes(self):
        """
        Run the parent checks, then require the url attribute.

        :raises MandatoryAttributeError: if self.url is None
        """
        super()._check_mandatory_attributes()
        url_missing = self.url is None
        if url_missing:
            raise MandatoryAttributeError(self.resource_mode_str(), "URL")

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """
        Load the attributes from a DataFrame row: the URL is read from the "file/url" field
        and the file name is set to the resource name.

        :param row: row describing the resource
        :param base_dir: accepted for signature compatibility; not forwarded to the parent implementation
        """
        super()._load_from_df_row(row=row)
        raw_url = row["file/url"]
        self.url = _string_from_element(raw_url)
        self.file_name = self.name
        self._check_mandatory_attributes()

    def download_request(self, ckan: CkanApi, out_dir: str, *, full_download:bool=False, threads:int=1,
                         force:bool=False, **kwargs) -> None:
        """
        Download the resource with the parent implementation, only if full_download is set.

        :param ckan: CKAN API object, forwarded to the parent implementation
        :param out_dir: output directory, forwarded to the parent implementation
        :param full_download: URLs are not downloaded unless this flag is True
        :param threads: forwarded to the parent implementation
        :param force: forwarded to the parent implementation
        :param kwargs: forwarded to the parent implementation
        """
        if not full_download:
            # URLs are skipped by default
            return
        super().download_request(
            ckan=ckan,
            out_dir=out_dir,
            full_download=full_download,
            force=force,
            threads=threads,
            **kwargs,
        )

    def _to_dict(self, include_id:bool=True) -> dict:
        """
        Export the builder as a dictionary, storing the URL under the "File/URL" key.

        :param include_id: forwarded to the parent implementation
        :return: dictionary produced by the parent implementation, completed with "File/URL"
        """
        exported = super()._to_dict(include_id=include_id)
        exported["File/URL"] = self.url
        return exported
551
+
552
+
553
class BuilderUrl(BuilderUrlABC):
    """
    Class for a resource defined by an external URL.
    """
    @staticmethod
    def resource_mode_str() -> str:
        """
        :return: the string identifying this kind of resource ("URL")
        """
        return "URL"

    def copy(self, *, dest=None):
        """
        Copy this builder into dest.

        :param dest: object receiving the copy; a new BuilderUrl is created when None
        :return: the destination object
        """
        if dest is None:
            dest = BuilderUrl()
        super().copy(dest=dest)
        return dest

    @staticmethod
    def sample_file_path_is_url() -> bool:
        """
        :return: True, the sample of this resource is designated by a URL
        """
        return True

    def get_sample_file_path(self, resources_base_dir: str) -> str:
        """
        :param resources_base_dir: not used by this implementation
        :return: the URL of the resource
        """
        return self.url

    def load_sample_data(self, resources_base_dir:str, *, ckan:CkanApi=None,
                         proxies:dict=None, headers:dict=None) -> bytes:
        """
        Download the contents found at the resource URL with ckan.download_url_proxy.

        The URL is stored in self.sample_source before the download.

        :param resources_base_dir: not used by this implementation
        :param ckan: CKAN API object performing the download (required)
        :param proxies: forwarded to ckan.download_url_proxy
        :param headers: forwarded to ckan.download_url_proxy
        :return: the content attribute of the download response
        :raises FunctionMissingArgumentError: if ckan is None
        """
        self.sample_source = self.url
        if ckan is None:
            # report the class actually in use rather than a hard-coded class name
            raise FunctionMissingArgumentError(f"{type(self).__name__}.load_sample_data", "ckan")
        response = ckan.download_url_proxy(self.url, proxies=proxies, headers=headers,
                                           auth_if_ckan=builder_request_default_auth_if_ckan)
        return response.content

    def patch_request(self, ckan: CkanApi, package_id: str, *, reupload: bool = None, resources_base_dir:str=None,
                      payload:Union[bytes, io.BufferedIOBase]=None) -> CkanResourceInfo:
        """
        Send the URL resource to CKAN through ckan.resource_create and remember its id.

        The request is issued with cancel_if_exists=True and update_if_exists=True,
        without DataStore creation and without automatic submission.

        :param ckan: CKAN API object used to perform the request
        :param package_id: identifier of the package receiving the resource
        :param reupload: when None, defaults to self.reupload_on_update
        :param resources_base_dir: not used by this implementation
        :param payload: must be None, a resource defined by a URL has no payload
        :return: the resource information returned by ckan.resource_create
        :raises CkanArgumentError: if a payload is provided
        """
        if reupload is None:
            reupload = self.reupload_on_update
        if payload is not None:
            raise CkanArgumentError("payload", "resource defined from URL patch")
        res_info = ckan.resource_create(package_id, name=self.name, format=self.format, description=self.description, state=self.state,
                                        url=self.url, auto_submit=False, datastore_create=False, create_default_view=self.create_default_view,
                                        cancel_if_exists=True, update_if_exists=True, reupload=reupload)
        # record the id attributed by CKAN, as the file-based patch_request of this module does
        self.known_id = res_info.id
        return res_info
589
+