ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/METADATA +74 -38
  103. ckanapi_harvesters-0.0.2.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1291 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Code to upload metadata to the CKAN server to create/update an existing package
5
+ The metadata is defined by the user in an Excel worksheet
6
+ This file implements the package definition.
7
+ """
8
+ from typing import Dict, List, Tuple, Union, Callable
9
+ from warnings import warn
10
+ import os
11
+ import shutil
12
+ import json
13
+ import re
14
+ from collections import OrderedDict
15
+
16
+ import pandas as pd
17
+ import numpy as np
18
+
19
+ from ckanapi_harvesters.policies.data_format_policy_errors import DataPolicyError
20
+ from ckanapi_harvesters.policies.data_format_policy import CkanPackageDataFormatPolicy
21
+ from ckanapi_harvesters.ckan_api import CkanApi, CkanApiMap
22
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
23
+ from ckanapi_harvesters.auxiliary.proxy_config import ProxyConfig
24
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanVisibility, CkanState, CkanPackageInfo, CkanResourceInfo, CkanDataStoreInfo, CkanLicenseInfo
25
+ from ckanapi_harvesters.auxiliary.path import sanitize_path, path_rel_to_dir, make_path_relative
26
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element, assert_or_raise, find_duplicates
27
+ from ckanapi_harvesters.auxiliary.ckan_defs import ckan_tags_sep
28
+ from ckanapi_harvesters.auxiliary.ckan_errors import (UnexpectedError, DuplicateNameError, ForbiddenNameError, MissingIdError,
29
+ MandatoryAttributeError, FileOrDirNotExistError)
30
+ from ckanapi_harvesters.auxiliary.ckan_configuration import unlock_external_url_resource_download, unlock_no_ca
31
+ from ckanapi_harvesters.builder.builder_errors import MissingDataStoreInfoError, UnsupportedBuilderVersionError
32
+ from ckanapi_harvesters.builder import BUILDER_FILE_FORMAT_VERSION as BUILDER_VER
33
+ from ckanapi_harvesters.builder.builder_resource import BuilderResourceABC
34
+ from ckanapi_harvesters.builder.builder_resource_multi_file import BuilderMultiFile, multi_file_exclude_other_files
35
+ from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreABC
36
+ from ckanapi_harvesters.builder.builder_resource_multi_datastore import BuilderMultiDataStore
37
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import BuilderDataStoreMultiABC
38
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_harvester import BuilderDataStoreHarvester
39
+ from ckanapi_harvesters.builder.builder_resource_init import init_resource_from_df, init_resource_from_ckan
40
+ from ckanapi_harvesters.builder.builder_ckan import BuilderCkan
41
+ from ckanapi_harvesters.auxiliary.external_code_import import PythonUserCode, unlock_external_code_execution
42
+
43
# Absolute path of the directory containing this module (realpath resolves symbolic links first).
self_dir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
# Path of the example builder workbook, looked up next to this module.
example_package_xls = os.path.join(self_dir, "builder_package_example.xlsx")

# Resource names which are not allowed.
# NOTE(review): presumably these collide with the reserved sheet names of the builder workbook - confirm against the code using this set.
forbidden_resource_names = {"ckan", "info", "package", "resources", "validation", "help"}
47
# Substitution applied to resource names before they are used as Excel sheet names:
# every character matched by the pattern is replaced by the destination character.
excel_subs_characters_re = r"[\*\?\[\]\+]"  # characters used in wildcards (MultiFile & MultiDataStore), forbidden in Excel sheet names
excel_subs_dest_character = '#'


def excel_name_of_sheet(resource_name: str) -> str:
    """
    Derive an Excel sheet name from a resource name.

    Each wildcard character matched by excel_subs_characters_re is replaced
    by excel_subs_dest_character; all other characters are kept as they are.

    :param resource_name: name of the resource
    :return: the name with the wildcard characters substituted
    """
    wildcard_pattern = re.compile(excel_subs_characters_re)
    return wildcard_pattern.sub(excel_subs_dest_character, resource_name)
53
+
54
def excel_name_of_builder(resource_builder: BuilderResourceABC) -> str:
    """
    Return the Excel sheet name associated with a resource builder.

    Only the name of a BuilderMultiDataStore goes through excel_name_of_sheet();
    the name of any other builder is returned unchanged.

    :param resource_builder: the resource builder
    :return: the sheet name
    """
    builder_name = resource_builder.name
    if not isinstance(resource_builder, BuilderMultiDataStore):
        return builder_name
    return excel_name_of_sheet(builder_name)
59
+
60
+
61
+ class BuilderPackageBasic:
62
+ """
63
+ Class to store an image of a CKAN package defined by an Excel worksheet
64
+
65
+
66
+ __NB__: There are several paths to distinguish:
67
+
68
+ - the path of the Excel worksheet
69
+ - base_dir: the base directory for relative paths
70
+ - resources_base_dir: the base directory for resources (for upload), which is generally defined relative to base_dir
71
+ - out_dir: the output directory, for download, absolute or relative to the cwd (current working directory)
72
+
73
+ __NB__: A builder can refer to the following external files:
74
+
75
+ - CKAN API key file (.txt)
76
+ - Proxy authentication file (.txt)
77
+ - CKAN CA certificate file (.pem)
78
+ - CA certificate for external connexions (.pem)
79
+ - Data format policy file (.json)
80
+ - External Python module (.py) containing DataFrame modification functions for upload/download of a DataStore
81
+ """
82
+ default_to_json_reduced_size:bool = False
83
+
84
def __init__(self, package_name:str=None, *, package_id:str=None,
             title: str = None, description: str = None, private: bool = None, state: CkanState = None,
             version: str = None,
             url: str = None, tags: List[str] = None,
             organization_name:str=None, license_name:str=None, src=None):
    """
    Initialize a package builder, either from the given attributes or as a copy of another builder.

    :param package_name: name of the package
    :param package_id: known id of the package
    :param title: title of the package
    :param description: description of the package
    :param private: True if the package is private
    :param state: state of the package
    :param version: version of the package
    :param url: URL attribute of the package
    :param tags: list of tags
    :param organization_name: value stored in organization_name
    :param license_name: value stored in license_name
    :param src: optional builder to copy. When given, its copy() method is called with this
        object as destination, after the default initialization, so the values of src take
        precedence over the other arguments.
    """
    self.builder_source_file: Union[str, None] = None
    self.builder_format_version: Union[str, None] = None
    # package attributes
    self.package_attributes: CkanPackageInfo = CkanPackageInfo(package_name=package_name, package_id=package_id,
                                                               title=title, description=description, private=private, state=state,
                                                               version=version, url=url, tags=tags)
    self.organization_name: Union[str, None] = organization_name
    self.license_name: Union[str, None] = license_name
    # package resources
    self._resources_base_dir_src: Union[str, None] = None  # source of the resources_base_dir
    self._resources_base_dir: Union[str, None] = None
    self.resource_builders:OrderedDict[str,BuilderResourceABC] = OrderedDict()
    self._default_out_dir_src: Union[str, None] = None
    self._default_out_dir: Union[str, None] = None
    # auxiliary builders
    self.ckan_builder: BuilderCkan = BuilderCkan()
    self.external_python_code: Union[PythonUserCode, None] = None
    self.comment: str = ""
    # The copy must come last: performed before the default initialization above,
    # every value copied from src would immediately be overwritten.
    if src is not None:
        src.copy(dest=self)
109
+
110
def __str__(self):
    """Return a one-line description: package name and number of resource builders."""
    resource_count = len(self.resource_builders)
    return f"Package builder for {self.package_name} ({resource_count} resources)"
112
+
113
def copy(self, dest=None) -> "BuilderPackageBasic":
    """
    Copy this builder into dest, or into a new BuilderPackageBasic if dest is None.

    The package attributes, the resource builders, the CKAN builder and the external Python code
    are duplicated through their own copy() method; the other attributes are assigned directly.
    The external Python code of dest is left untouched when this builder has none.

    :param dest: destination object (optional)
    :return: the destination object
    """
    target = BuilderPackageBasic() if dest is None else dest
    directly_assigned = (
        "builder_source_file",
        "builder_format_version",
        "organization_name",
        "license_name",
        "_resources_base_dir_src",
        "_resources_base_dir",
        "_default_out_dir_src",
        "_default_out_dir",
        "comment",
    )
    for attribute_name in directly_assigned:
        setattr(target, attribute_name, getattr(self, attribute_name))
    target.package_attributes = self.package_attributes.copy()
    # resource builders are copied one by one, keeping their order
    target.resource_builders = OrderedDict()
    for resource_key, resource_builder in self.resource_builders.items():
        target.resource_builders[resource_key] = resource_builder.copy()
    target.ckan_builder = self.ckan_builder.copy()
    user_code = self.external_python_code
    if user_code is not None:
        target.external_python_code = user_code.copy()
    return target
133
+
134
def _check_mandatory_attributes(self):
    """
    Check that the mandatory attributes of the package are defined.

    :raises MandatoryAttributeError: if the package name is None
    """
    name_is_defined = self.package_name is not None
    if not name_is_defined:
        raise MandatoryAttributeError("Package", "name")
    # organization can be non-mandatory depending on CKAN configuration
    # if self.organization_name is None:
    #     raise MissingMandatoryAttributeError("Package", "owner_org")
140
+
141
def clear_ids(self):
    """
    Forget every known id: the id of the package attributes and the known_id of each resource builder
    are set to None.

    :return:
    """
    self.package_attributes.id = None
    builders = self.resource_builders.values()
    for builder in builders:
        builder.known_id = None
149
+
150
@staticmethod
def unlock_external_code_execution(value:bool=True):
    """
    This function enables external code execution for the PythonUserCode class.
    It is necessary to load builders which specify an Auxiliary functions file.

    __Warning__:
    only run code if you trust the source!

    :param value: forwarded unchanged to the module-level function unlock_external_code_execution()
    :return:
    """
    # inside the method body, this name refers to the function imported at module level
    unlock_external_code_execution(value)
162
+
163
@staticmethod
def unlock_no_ca(value:bool=True):
    """
    This function enables you to disable the CA verification of the CKAN server.

    __Warning__:
    Only allow in a local environment!

    :param value: forwarded unchanged to the module-level function unlock_no_ca()
    """
    # inside the method body, this name refers to the function imported at module level
    unlock_no_ca(value)
173
+
174
@staticmethod
def unlock_external_url_resource_download(value:bool=True):
    """
    This function enables the download of resources external from the CKAN server.

    :param value: forwarded unchanged to the module-level function unlock_external_url_resource_download()
    """
    # inside the method body, this name refers to the function imported at module level
    unlock_external_url_resource_download(value)
180
+
181
@property
def package_name(self) -> str:
    """Name of the package, read from the package attributes."""
    return self.package_attributes.name
@package_name.setter
def package_name(self, value:str):
    """Set the name in the package attributes and propagate it to the resource builders."""
    self.package_attributes.name = value
    # keep the package_name attribute of every resource builder in sync
    self.update_package_name_in_resources()
188
+
189
@property
def resources_base_dir(self) -> str:
    """Resolved base directory of the resources, as last computed by _apply_resources_base_dir_src()."""
    resolved_dir = self._resources_base_dir
    return resolved_dir
def set_resources_base_dir(self, value:str, base_dir:str=None):
    """
    Define the source of the resources base directory and resolve it.

    :param value: new source of the resources base directory
    :param base_dir: passed to get_base_dir(), whose result is used for the resolution
    """
    self._resources_base_dir_src = value
    resolution_dir = self.get_base_dir(base_dir=base_dir)
    self._apply_resources_base_dir_src(base_dir=resolution_dir)
195
+
196
@property
def default_out_dir(self) -> str:
    """Resolved default output directory, as last computed by _apply_out_dir_src()."""
    resolved_dir = self._default_out_dir
    return resolved_dir
def set_default_out_dir(self, value:str, base_dir:str=None):
    """
    Define the source of the default output directory and resolve it.

    :param value: new source of the default output directory
    :param base_dir: passed to get_base_dir(), whose result is used for the resolution
    """
    self._default_out_dir_src = value
    resolution_dir = self.get_base_dir(base_dir=base_dir)
    self._apply_out_dir_src(base_dir=resolution_dir)
202
+
203
def update_package_name_in_resources(self):
    """
    Copy the current package name into the package_name attribute of every resource builder.
    Call before any operation on resources
    """
    current_name = self.package_name
    builders = self.resource_builders.values()
    for builder in builders:
        builder.package_name = current_name
211
+
212
def update_ckan_options_name_in_resources(self, ckan:CkanApi):
    """
    Call init_options_from_ckan(ckan) on every resource builder.
    Call before any operation on resources

    :param ckan: CKAN API object handed to each resource builder
    """
    builders = self.resource_builders.values()
    for builder in builders:
        builder.init_options_from_ckan(ckan)
219
+
220
def _apply_resources_base_dir_src(self, base_dir:str):
    """
    The resources base directory is specified in a field of the Excel workbook.
    This function resolves the directory name, based on the location of the Excel file
    or the base_dir, if provided.

    Resolution rules:

    - no source defined: base_dir is used
    - absolute source (after expansion of the user directory): used as is
    - relative source: joined to base_dir; the joined path also replaces the stored source
    - if the resulting path exists but is not a directory, it is read as a text file
      whose first line gives the directory

    The result goes through sanitize_path() and is stored in self._resources_base_dir.

    :param base_dir: base directory for relative paths (required if the source is relative)
    :raises FileOrDirNotExistError: if the resulting path does not exist
    :return:
    """
    source = self._resources_base_dir_src
    if source is None:
        candidate = base_dir
    else:
        source = os.path.expanduser(source)
        if not os.path.isabs(source):
            assert(base_dir is not None)
            source = os.path.join(base_dir, source)
            self._resources_base_dir_src = source
        candidate = source
    if candidate is not None and not os.path.isdir(candidate):
        if not os.path.exists(candidate):
            raise FileOrDirNotExistError(candidate)
        # the field points to a text file containing the resources_base_dir
        with open(candidate, "r") as pointer_file:
            candidate = pointer_file.readline().strip()
    self._resources_base_dir = sanitize_path(candidate)
248
+
249
def _get_resources_base_dir_src(self, base_dir:str):
    """
    Return the resolved resources base directory, passed through make_path_relative() with base_dir.

    :param base_dir: directory handed to make_path_relative()
    :return: the result of make_path_relative()
    """
    resolved_dir = self._resources_base_dir
    return make_path_relative(resolved_dir, base_dir)
    # alternative kept from the original implementation (disabled):
    # elif self._resources_base_dir_src is not None and os.path.exists(self._resources_base_dir_src) and not os.path.isdir(self._resources_base_dir_src):
    #     return self._resources_base_dir_src if base_dir is None else os.path.relpath(self._resources_base_dir_src, base_dir)
    # else:
    #     return self._resources_base_dir if base_dir is None else os.path.relpath(self._resources_base_dir, base_dir)
255
+
256
def _apply_out_dir_src(self, base_dir:str, not_exist_error:bool=False):
    """
    The default download directory is specified in a field of the Excel workbook.
    This function resolves the directory name, based on the location of the Excel file
    or the base_dir, if provided.

    Resolution rules:

    - no source defined, or source equal to "none" (case-insensitive, surrounding spaces ignored):
      no default output directory
    - absolute source (after expansion of the user directory): used as is
    - relative source: joined to base_dir; the joined path also replaces the stored source
    - if the resulting path does not exist: error or warning, depending on not_exist_error
    - if the resulting path exists but is not a directory, it is read as a text file
      whose first line gives the directory

    :param base_dir: base directory for relative paths (required if the source is relative)
    :param not_exist_error: if True, a missing path raises an error; otherwise a warning is issued
        and the path is stored as is
    :raises FileOrDirNotExistError: if the path does not exist and not_exist_error is True
    :return:
    """
    source = self._default_out_dir_src
    candidate = None  # by default, do not define an output dir
    if source is not None:
        keyword = source.lower().strip()
        source = os.path.expanduser(source)
        if keyword == "none":
            candidate = None  # by default, do not define an output dir
        elif os.path.isabs(source):
            candidate = source
        else:
            assert(base_dir is not None)
            candidate = os.path.join(base_dir, source)
            self._default_out_dir_src = candidate
    if candidate is not None and not os.path.isdir(candidate):
        if not os.path.exists(candidate):
            if not_exist_error:
                raise FileOrDirNotExistError(candidate)
            msg = f"Default output directory {candidate} does not exist! It will be created if you call the download function with no out_dir."
            warn(msg)
            self._default_out_dir = candidate
            return
        # the field points to a text file containing the out_dir
        with open(candidate, "r") as pointer_file:
            candidate = pointer_file.readline().strip()
    self._default_out_dir = sanitize_path(candidate)
293
+
294
def _get_out_dir_src(self, base_dir:str):
    """
    Return the source of the default output directory, passed through make_path_relative() with base_dir.

    :param base_dir: directory handed to make_path_relative()
    :return: the result of make_path_relative()
    """
    out_dir_source = self._default_out_dir_src
    return make_path_relative(out_dir_source, base_dir)
296
+
297
+
298
def _load_from_df(self, info_df: pd.DataFrame, package_df: pd.DataFrame, base_dir:str=None) -> None:
    """
    Function to load builder parameters from a DataFrame, usually from an Excel worksheet

    Column names are matched in lower case, with surrounding spaces removed. The columns "name"
    and "title" are required; the other known columns are optional. Columns which are not
    recognized are stored as custom fields of the package, under their original name.

    :param info_df: builder information (optional), concatenated column-wise to package_df
    :param package_df: package attributes
    :param base_dir: passed to get_base_dir(), whose result is used to resolve relative paths
    :return:
    """
    if info_df is not None:
        package_df = pd.concat([package_df, info_df], axis=1)
    original_columns = list(package_df.columns)
    package_df.columns = package_df.columns.map(str.lower).map(str.strip)
    renamed_columns = list(package_df.columns)

    def is_present(column: str) -> bool:
        return column in package_df.columns

    def pop_text(column: str, **kwargs):
        return _string_from_element(package_df.pop(column), **kwargs)

    def pop_path_or_none(column: str):
        return sanitize_path(pop_text(column)) if is_present(column) else None

    # info
    base_dir = self.get_base_dir(base_dir=base_dir)
    if is_present("builder format version"):
        self.builder_format_version = pop_text("builder format version").strip()
        assert_or_raise(self.builder_format_version == BUILDER_VER, UnsupportedBuilderVersionError(self.builder_format_version))
    self._resources_base_dir_src = pop_path_or_none("resources local directory")
    self._apply_resources_base_dir_src(base_dir=base_dir)
    self._default_out_dir_src = pop_path_or_none("download directory")
    self._apply_out_dir_src(base_dir=base_dir)
    if is_present("auxiliary functions file"):
        auxiliary_functions_file = sanitize_path(pop_text("auxiliary functions file"))
        if auxiliary_functions_file is not None:
            self.external_python_code = PythonUserCode(auxiliary_functions_file, base_dir=base_dir)
    if is_present("comment"):
        self.comment = pop_text("comment", empty_value="")

    # package attributes
    attributes = self.package_attributes
    self.package_name = pop_text("name").strip()
    attributes.title = pop_text("title")
    for column, attribute_name in (("known id", "id"), ("description", "description"), ("version", "version")):
        if is_present(column):
            setattr(attributes, attribute_name, pop_text(column))
    if is_present("visibility"):
        visibility = pop_text("visibility")
        if visibility is not None:
            attributes.private = CkanVisibility.from_str(visibility).to_bool_is_private()
    if is_present("state"):
        state = pop_text("state")
        if state is not None:
            attributes.state = CkanState.from_str(state)
    if is_present("url"):
        # field not in the default Excel file
        attributes.url = pop_text("url")
    if is_present("tags"):
        tags_string = pop_text("tags")
        if tags_string is not None:
            attributes.tags = [label.strip() for label in tags_string.split(ckan_tags_sep)]
    for column, attribute_name in (("author", "author"), ("author email", "author_email"),
                                   ("maintainer", "maintainer"), ("maintainer email", "maintainer_email")):
        if is_present(column):
            setattr(attributes, attribute_name, pop_text(column))

    # fields which may require additional CKAN requests to obtain ids of the designated objects
    if is_present("license"):
        self.license_name = pop_text("license")
    if is_present("organization"):
        self.organization_name = pop_text("organization")

    # other fields = user custom fields
    if is_present("attribute"):
        package_df.pop("attribute")  # reserved name for table header
    # original spelling of each normalized column name (first occurrence wins)
    original_name_of = {}
    for renamed, original in zip(renamed_columns, original_columns):
        original_name_of.setdefault(renamed, original)
    for column in list(package_df.columns):
        attributes.custom_fields[original_name_of[column]] = _string_from_element(package_df[column])
    self._check_mandatory_attributes()
380
+
381
def _to_dict(self, base_dir:str=None, include_id:bool=True) -> Tuple[dict, dict]:
    """
    Function to export builder parameters to an Excel worksheet, using the same fields as the input format

    Attributes which are None are exported as an empty string, except the name and title
    which are exported as they are.

    :see: _load_from_df
    :see: to_xls
    :param base_dir: directory used to express the exported paths as relative paths
    :param include_id: if True, the known id of the package is exported when it is defined
    :return: the builder information and the package attributes, as two dictionaries
    """
    def blank_if_none(value):
        return value if value is not None else ""

    attributes = self.package_attributes
    user_code = self.external_python_code
    if user_code is not None:
        auxiliary_functions_file = make_path_relative(user_code.python_file, to_base_dir=base_dir)
    else:
        auxiliary_functions_file = ""
    info_dict = {
        "Builder format version": BUILDER_VER,
        "Auxiliary functions file": auxiliary_functions_file,
        "Resources local directory": self._get_resources_base_dir_src(base_dir=base_dir),
        "Download directory": self._get_out_dir_src(base_dir=base_dir),
        "Comment": self.comment,
    }

    package_dict = {"Name": self.package_name, "Title": attributes.title}
    if include_id and attributes.id:
        package_dict["Known Id"] = attributes.id
    package_dict["Description"] = blank_if_none(attributes.description)
    package_dict["Version"] = blank_if_none(attributes.version)
    if attributes.private is not None:
        package_dict["Visibility"] = CkanVisibility.from_bool_is_private(attributes.private).name
    else:
        package_dict["Visibility"] = ""
    package_dict["State"] = attributes.state.name if attributes.state is not None else ""
    package_dict["Organization"] = blank_if_none(self.organization_name)
    package_dict["License"] = blank_if_none(self.license_name)
    package_dict["URL"] = blank_if_none(attributes.url)
    package_dict["Tags"] = ckan_tags_sep.join(attributes.tags) if attributes.tags is not None else ""
    package_dict["Author"] = blank_if_none(attributes.author)
    package_dict["Author Email"] = blank_if_none(attributes.author_email)
    package_dict["Maintainer"] = blank_if_none(attributes.maintainer)
    package_dict["Maintainer Email"] = blank_if_none(attributes.maintainer_email)
    # user custom fields, exported under their own key
    for custom_key, custom_value in attributes.custom_fields.items():
        package_dict[custom_key] = blank_if_none(custom_value)
    return info_dict, package_dict
415
+
416
def _get_builder_df_help_dict(self) -> Tuple[dict, dict]:
    """
    Return the help texts displayed next to the builder information and package attributes.

    The help of "Known Id" is only included if the package id is defined. Each custom field
    of the package receives a generic help text.

    :return: the help of the builder information and the help of the package attributes
    """
    info_help_dict = dict([
        ("Builder format version", "Version of the file format for the script that processes this file"),
        ("Auxiliary functions file", "Path to a Python file containing auxiliary functions, relative to this Excel workbook folder\n"
                                     "Warning: only execute code if you trust the source !"),
        ("Resources local directory", "Path to the local directory containing the resources to upload or text file defining this directory, relative to this Excel workbook folder"),
        ("Download directory", "Default path to download the resources to, relative to this Excel workbook folder"),
        ("Comment", "Place to add a comment on this file"),
    ])
    package_help_dict = dict([
        ("Name", "Name used in the URL (short name)"),
        ("Title", "Title of the resource"),
        ("Description", "Description can use Markdown formatting"),
        ("Visibility", "Private/Public"),
        ("State", "Active/Draft/Deleted"),
        ("Organization", "Organization title, name or ID (mandatory)"),
        ("License", "License title or ID"),
        ("URL", "A URL for the dataset's source"),
        ("Tags", "Comma-separated list of tags (refer to data format policy)"),
    ])
    if self.package_attributes.id:
        package_help_dict["Known Id"] = "ID of the resource in the CKAN database, last requested"
    for custom_key in self.package_attributes.custom_fields.keys():
        package_help_dict[custom_key] = "Custom key-value pair (refer to data format policy)"
    return info_help_dict, package_help_dict
440
+
441
def _load_from_dict(self, info_dict: dict, package_dict: dict, base_dir:str=None) -> None:
    """
    Load builder parameters from dictionaries, by converting them to DataFrames for _load_from_df().

    Each dictionary becomes a DataFrame with a single row labelled "Value" and one column per key,
    the column axis being named "Attribute".

    :param info_dict: builder information (optional)
    :param package_dict: package attributes
    :param base_dir: passed on to _load_from_df()
    """
    def single_row_frame(values: dict) -> pd.DataFrame:
        by_attribute = pd.DataFrame([values], index=["Value"]).transpose()
        by_attribute.index.name = "Attribute"
        return by_attribute.transpose()

    info_df = single_row_frame(info_dict) if info_dict is not None else None
    package_df = single_row_frame(package_dict)
    self._load_from_df(info_df, package_df, base_dir=base_dir)
454
+
455
def _get_builder_df(self, base_dir:str=None, include_id:bool=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Converts the result of method _to_dict() into a DataFrame

    Each returned DataFrame has one row per attribute (index named "Attribute") and
    two columns: "Value" and "Help".

    :param base_dir: passed on to _to_dict()
    :param include_id: passed on to _to_dict()
    :return: the builder information and the package attributes, as two DataFrames
    """
    def attribute_table(values: dict, help_texts: dict) -> pd.DataFrame:
        table = pd.DataFrame([values, help_texts], index=["Value", "Help"]).transpose()
        table.index.name = "Attribute"
        return table

    info_dict, package_dict = self._to_dict(base_dir=base_dir, include_id=include_id)
    info_help_dict, package_help_dict = self._get_builder_df_help_dict()
    package_df = attribute_table(package_dict, package_help_dict)
    info_df = attribute_table(info_dict, info_help_dict)
    return info_df, package_df
470
+
471
def _check_resource_duplicates(self):
    """
    Check that no name is shared by several resource builders.

    :raises DuplicateNameError: if find_duplicates() reports at least one duplicated name
    """
    resource_names = []
    for builder in self.resource_builders.values():
        resource_names.append(builder.name)
    repeated_names = find_duplicates(resource_names)
    if len(repeated_names) > 0:
        raise DuplicateNameError("Resource", repeated_names)
475
+
476
def _get_resources_dict(self, include_id:bool=True) -> Dict[str, dict]:
    """
    Call the method _to_dict() on all resources and index the results by resource name.

    :param include_id: forwarded to the _to_dict() method of each resource builder
    :return: dictionary resource name -> resource definition
    :raises DuplicateNameError: if resource names are duplicated (see _check_resource_duplicates)
    """
    self._check_resource_duplicates()
    resources_dict = {}
    for builder in self.resource_builders.values():
        name = builder.name
        resources_dict[name] = builder._to_dict(include_id=include_id)
    return resources_dict
480
+
481
def _get_resources_df(self, include_id:bool=True) -> pd.DataFrame:
    """
    Build the DataFrame listing the resources of the package, one record per resource.

    The records are the values of _get_resources_dict().

    :param include_id: forwarded to _get_resources_dict()
    :return: DataFrame built with pandas.DataFrame.from_records
    """
    records = list(self._get_resources_dict(include_id=include_id).values())
    return pd.DataFrame.from_records(records)
490
+
491
def _get_datastores_dict(self) -> Dict[str, dict]:
    """
    Call the method _get_fields_dict() on all resources which are DataStores.

    Only builders which are instances of BuilderDataStoreABC or BuilderMultiDataStore and whose
    field_builders attribute is not None are included.

    :return: dictionary resource name -> result of _get_fields_dict()
    """
    datastore_types = (BuilderDataStoreABC, BuilderMultiDataStore)
    fields_by_resource = {}
    for resource in self.resource_builders.values():
        if not isinstance(resource, datastore_types):
            continue
        if resource.field_builders is None:
            continue
        name = resource.name
        fields_by_resource[name] = resource._get_fields_dict()
    return fields_by_resource
500
+
501
def _get_datastores_df(self) -> Dict[str, pd.DataFrame]:
    """
    Call the method _get_fields_df() on all resources which are DataStores.

    Only builders which are instances of BuilderDataStoreABC or BuilderMultiDataStore and whose
    field_builders attribute is not None are included.

    :return: dictionary resource name -> result of _get_fields_df()
    """
    datastore_types = (BuilderDataStoreABC, BuilderMultiDataStore)
    frames_by_resource = {}
    for resource in self.resource_builders.values():
        if not isinstance(resource, datastore_types):
            continue
        if resource.field_builders is None:
            continue
        name = resource.name
        frames_by_resource[name] = resource._get_fields_df()
    return frames_by_resource
510
+
511
def get_all_df(self, base_dir:str=None, include_id:bool=True) -> Dict[str, pd.DataFrame]:
    """
    Return all the DataFrames used to define the object and its components.

    The result holds the keys "info", "ckan", "package" and "resources", followed by one entry
    per DataStore returned by _get_datastores_df().

    :param base_dir: forwarded to _get_builder_df() of the package and of the CKAN builder
    :param include_id: forwarded to _get_builder_df() and _get_resources_df()
    :return: dictionary name -> DataFrame
    """
    info_df, package_df = self._get_builder_df(base_dir=base_dir, include_id=include_id)
    ckan_df = self.ckan_builder._get_builder_df(base_dir=base_dir)
    resources_df = self._get_resources_df(include_id=include_id)
    all_frames = dict(info=info_df, ckan=ckan_df, package=package_df, resources=resources_df)
    for name, fields_df in self._get_datastores_df().items():
        all_frames[name] = fields_df
    return all_frames
524
+
525
def to_excel(self, path_or_buffer, *, engine:str=None, include_id:bool=True, include_help:bool=True, **kwargs) -> None:
    """
    Export the builder parameters to an Excel workbook.

    The workbook contains the sheets "ckan", "info", "package" and "resources", one sheet per DataStore
    returned by _get_datastores_df() and, optionally, a copy of the "help" sheet of the example workbook.

    :param path_or_buffer: destination passed to pandas.ExcelWriter. If it is a path (str or os.PathLike),
        its directory is used as base_dir when exporting the builder attributes; otherwise base_dir is None.
    :param engine: engine passed to pandas.ExcelWriter and, if include_help is set, to pandas.ExcelFile
    :param include_id: forwarded to _get_builder_df() and _get_resources_df()
    :param include_help: True to append the "help" sheet read from example_package_xls
    :param kwargs: extra keyword arguments passed to pandas.ExcelWriter
    :return:
    """
    if isinstance(path_or_buffer, (str, os.PathLike)):
        # os.fspath() lets path objects (e.g. pathlib.Path) define base_dir the same way plain strings do
        base_dir, _ = os.path.split(os.fspath(path_or_buffer))
    else:
        base_dir = None
    info_df, package_df = self._get_builder_df(base_dir=base_dir, include_id=include_id)
    ckan_df = self.ckan_builder._get_builder_df(base_dir=base_dir)
    resources_df = self._get_resources_df(include_id=include_id)
    datastores_df = self._get_datastores_df()
    with pd.ExcelWriter(path_or_buffer, engine=engine, **kwargs) as writer:
        # attribute sheets keep their index (attribute names); tabular sheets do not
        ckan_df.to_excel(writer, sheet_name="ckan", index=True)
        info_df.to_excel(writer, sheet_name="info", index=True)
        package_df.to_excel(writer, sheet_name="package", index=True)
        resources_df.to_excel(writer, sheet_name="resources", index=False)
        for name, df in datastores_df.items():
            df.to_excel(writer, sheet_name=excel_name_of_sheet(name), index=False)
        if include_help:
            # the context manager closes the example workbook, no explicit close() is required
            with pd.ExcelFile(example_package_xls, engine=engine) as help_file:
                help_df = pd.read_excel(help_file, sheet_name="help", header=None)
            help_df.to_excel(writer, sheet_name="help", index=False, header=False)
554
+
555
def to_dict(self, base_dir:str=None, include_id:bool=True) -> dict:
    """
    Export the builder parameters to a dictionary.

    The result has the keys "Info", "Package", "CKAN" and "Resources". "Resources" is the list of
    resource definitions; for the resources returned by _get_datastores_dict(), the field
    definitions are added under the key "fields".

    :param base_dir: forwarded to _to_dict() of the package and of the CKAN builder
    :param include_id: forwarded to _to_dict() and _get_resources_dict()
    :return: dictionary describing the package builder
    """
    info_dict, package_dict = self._to_dict(base_dir=base_dir, include_id=include_id)
    ckan_dict = self.ckan_builder._to_dict(base_dir=base_dir)
    resources_dict = self._get_resources_dict(include_id=include_id)
    for name, fields_dict in self._get_datastores_dict().items():
        resources_dict[name]["fields"] = list(fields_dict.values())
    return {
        "Info": info_dict,
        "Package": package_dict,
        "CKAN": ckan_dict,
        "Resources": list(resources_dict.values()),
    }
570
+
571
def to_json(self, json_file:str, *, include_id:bool=True, reduced_size:bool=None) -> None:
    """
    Export the builder parameters to a JSON file, encoded in UTF-8 without escaping non-ASCII characters.

    :param json_file: path of the file to write. Its directory is used as base_dir for to_dict().
    :param include_id: forwarded to to_dict()
    :param reduced_size: True to write compact JSON, False to indent with 4 spaces.
        None uses the attribute default_to_json_reduced_size.
    :return:
    """
    if reduced_size is None:
        reduced_size = self.default_to_json_reduced_size
    base_dir, _ = os.path.split(json_file)
    builder_dict = self.to_dict(base_dir=base_dir, include_id=include_id)
    # indent=None is the json default (compact output)
    indent = None if reduced_size else 4
    # the context manager closes the file, no explicit close() is required
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(builder_dict, f, ensure_ascii=False, indent=indent)
582
+
583
def to_jsons(self, *, base_dir:str=None, include_id:bool=True, reduced_size:bool=None) -> str:
    """
    Export the builder parameters to a JSON string (non-ASCII characters are not escaped).

    :param base_dir: forwarded to to_dict()
    :param include_id: forwarded to to_dict()
    :param reduced_size: True for compact JSON, False to indent with 4 spaces.
        None uses the attribute default_to_json_reduced_size.
    :return: the JSON document
    """
    compact = self.default_to_json_reduced_size if reduced_size is None else reduced_size
    builder_dict = self.to_dict(base_dir=base_dir, include_id=include_id)
    if not compact:
        return json.dumps(builder_dict, ensure_ascii=False, indent=4)
    return json.dumps(builder_dict, ensure_ascii=False)
591
+
592
@staticmethod
def from_ckan(ckan: CkanApiMap, package_info: Union[CkanPackageInfo, str]) -> "BuilderPackageBasic":
    """
    Initialize a BuilderPackageBasic from information requested by the CKAN API.

    :param ckan: CKAN API object used to request the package (if given by name) and the license
    :param package_info: the package to import or the package name. A name is resolved with
        ckan.get_package_info_or_request(..., datastore_info=True).
    :return: a builder whose builder_source_file is set to "ckan"
    """
    if isinstance(package_info, str):
        package_info = ckan.get_package_info_or_request(package_info, datastore_info=True)
    # from here on, package_info is a CkanPackageInfo
    builder = BuilderPackageBasic()
    builder.package_attributes = package_info
    if package_info.organization_info is not None:
        builder.organization_name = package_info.organization_info.get_owner_org()
    else:
        builder.organization_name = None
    # start from the license id (None if empty), then replace it by the name returned by get_license_name()
    builder.license_name = package_info.license_id or None
    builder.license_name = builder.get_license_name(ckan)
    for resource in package_info.package_resources.values():
        builder.resource_builders[resource.name] = init_resource_from_ckan(ckan, resource)
    builder.update_package_name_in_resources()
    builder.update_ckan_options_name_in_resources(ckan)
    builder.builder_source_file = "ckan"
    return builder
615
+
616
def update_from_ckan(self, ckan:CkanApiMap, *, error_not_found:bool=True) -> None:
    """
    Update IDs from CKAN mapped objects.
    Objects must be mapped first.

    :param ckan: CKAN API object whose map is queried
    :param error_not_found: forwarded as error_not_mapped to the map lookups.
        If the package lookup returns None, the package id and all resource ids are reset to None.
        A resource whose lookup returns None gets its id reset to None.
    :return:
    """
    package_info = ckan.map.get_package_info(self.package_name, error_not_mapped=error_not_found)
    if package_info is None:
        # Unmapped package: no id can be resolved. Reset the ids like it is done below for a
        # resource which is not found, instead of failing on package_info.id.
        self.package_attributes.id = None
        for resource_builder in self.resource_builders.values():
            resource_builder.id = None
        return
    package_id = package_info.id
    self.package_attributes.id = package_id
    for resource_builder in self.resource_builders.values():
        resource_info = ckan.map.get_resource_info(resource_builder.name, package_id, error_not_mapped=error_not_found)
        resource_builder.id = resource_info.id if resource_info is not None else None
627
+
628
def _init_resource_from_df_aux_fun(self, resource_builder: BuilderResourceABC) -> None:
    """
    Connect the auxiliary upload/download functions of a DataStore builder.

    Builders which are not instances of BuilderDataStoreABC are left untouched.

    :param resource_builder: the resource builder to connect to self.external_python_code
    :return:
    """
    if not isinstance(resource_builder, BuilderDataStoreABC):
        return
    upload_fun_name = resource_builder.aux_upload_fun_name
    download_fun_name = resource_builder.aux_download_fun_name
    resource_builder.df_mapper._connect_aux_functions(self.external_python_code,
                                                      aux_upload_fun_name=upload_fun_name,
                                                      aux_download_fun_name=download_fun_name)
633
+
634
def to_ckan_package_info(self, *, check_id:bool=True) -> CkanPackageInfo:
    """
    Build the package information to insert into the CKAN map from the builder.
    Requires the IDs of the package and resources to be known.
    This enables to use the stored IDs instead of querying the CKAN API for these IDs.

    BuilderMultiFile builders are skipped with a warning because more than one id is expected for them.

    :param check_id: True to raise an error if the package id is unknown; also forwarded to the
        _to_ckan_resource_info() method of the resource builders
    :return: a copy of the package attributes completed with the resources and their name indexes
    :raises MissingIdError: if check_id is set and the package id is None
    """
    package_id = self.package_attributes.id
    package_info: CkanPackageInfo = self.package_attributes.copy()
    if check_id and package_id is None:
        raise MissingIdError("package", self.package_name)
    for resource_builder in self.resource_builders.values():
        if not isinstance(resource_builder, BuilderMultiFile):
            package_info.package_resources[resource_builder.known_id] = resource_builder._to_ckan_resource_info(package_id, check_id=check_id)
            continue
        warn(f"Multi-resource builder is not compatible with updating CKAN resource ids from known ids because more than one id is expected (resource builder {resource_builder.name})")
    id_by_name = {}  # resource name -> id
    count_by_name = {}  # resource name -> counter
    for resource_info in package_info.package_resources.values():
        id_by_name[resource_info.name] = resource_info.id
        count_by_name[resource_info.name] = count_by_name.get(resource_info.name, 0) + 1
    package_info.resources_id_index = id_by_name
    package_info.resources_id_index_counts = count_by_name
    return package_info
661
+
662
def update_ckan_map(self, ckan: CkanApiMap) -> CkanPackageInfo:
    """
    Update the CKAN map from the information contained in this builder.
    For this to work, the package and resource ids must be known.
    This is not the case if the package was not initialized.
    Use if the builder was initialized from ckan or use with precaution.

    :param ckan: CKAN API object whose map is updated
    :return: a copy of the package information inserted in the map
    """
    inserted_info = self.to_ckan_package_info(check_id=True)
    ckan.map._update_package_info(inserted_info)
    # hand out a copy so that the caller does not share the object given to the map
    return inserted_info.copy()
675
+
676
def map_resources(self, ckan: CkanApiMap, *, error_not_found:bool=True, cancel_if_exists:bool=True,
                  datastore_info:bool=True) -> Union[CkanPackageInfo,None]:
    """
    Proxy call to ckan.map_resources which returns the package information from the CKAN map
    and updates the ids stored in the builder.

    :param ckan:
    :param error_not_found: forwarded to ckan.map_resources and, as error_not_mapped, to the map lookups
    :param cancel_if_exists: forwarded to ckan.map_resources as only_missing
    :param datastore_info: forwarded to ckan.map_resources
    :return: the package information of the CKAN map, or None if the map lookup returned None
    """
    ckan.map_resources(self.package_name, datastore_info=datastore_info, error_not_found=error_not_found, only_missing=cancel_if_exists)
    package_info = ckan.map.get_package_info(self.package_name, error_not_mapped=error_not_found)
    if package_info is not None:
        self.package_attributes.id = package_info.id
        self.update_from_ckan(ckan, error_not_found=error_not_found)
    return package_info
693
+
694
def _load_package_resources_list_df(self, resources_df: pd.DataFrame, base_dir:str=None) -> None:
    """
    Rebuild self.resource_builders from the DataFrame listing the resources (one row per resource).

    The column names of resources_df are lower-cased and stripped in place.
    update_package_name_in_resources() is not called here: the caller does it once its
    initialization is complete.

    :param resources_df: resources definition, passed row by row to init_resource_from_df()
    :param base_dir: forwarded to init_resource_from_df()
    :return:
    :raises DuplicateNameError: if two rows define the same resource name
    :raises ForbiddenNameError: if a lower-cased resource name belongs to forbidden_resource_names
    """
    resources_df.columns = resources_df.columns.map(str.lower).map(str.strip)
    builders = self.resource_builders = OrderedDict()
    for _, row in resources_df.iterrows():
        builder = init_resource_from_df(row, base_dir=base_dir)
        self._init_resource_from_df_aux_fun(builder)
        if builder.name in builders:
            raise DuplicateNameError("resource_builder", builder.name)
        if builder.name.lower() in forbidden_resource_names:
            raise ForbiddenNameError("resource_builder", builder.name)
        builders[builder.name] = builder
707
+
708
@staticmethod
def from_excel(path_or_buffer, *, proxies:dict=None, engine:str=None, **kwargs) -> "BuilderPackageBasic":
    """
    Load package definition from an Excel workbook.

    Sheet names are matched case-insensitively, ignoring surrounding spaces. The sheets "package" and
    "resources" are required; "info" and "ckan" are optional. The other sheets are expected to hold
    the fields of a DataStore resource, under the resource name or its Excel equivalent name
    (excel_name_of_builder). A warning is issued for the sheets which are not used.

    :param path_or_buffer: path to the Excel workbook
    :param proxies: forwarded to the CKAN builder when a "ckan" sheet is present
    :param engine: Engine used by pandas.read_excel(). Supported engines: xlrd, openpyxl, odf, pyxlsb, calamine.
        openpyxl makes part of this package's optional requirements
    :param kwargs: extra keyword arguments passed to pandas.ExcelFile
    :return:
    """
    def _read_attribute_sheet(workbook, sheet_name) -> pd.DataFrame:
        # attribute sheets list one attribute per row, with the attribute names in the first column
        sheet_df = pd.read_excel(workbook, sheet_name=sheet_name, header=None)
        sheet_df.set_index(0, inplace=True, verify_integrity=True)
        return sheet_df.T

    mdl = BuilderPackageBasic()
    mdl.builder_source_file = path_or_buffer
    # the context manager closes the workbook, no explicit close() is required
    with pd.ExcelFile(path_or_buffer, engine=engine, **kwargs) as xls:
        sheet_names = set(xls.sheet_names)
        sheet_names_lower_index = {sheet_name.lower().strip(): sheet_name for sheet_name in sheet_names}
        package_df = _read_attribute_sheet(xls, sheet_names_lower_index["package"])
        if "info" in sheet_names_lower_index:
            info_df = _read_attribute_sheet(xls, sheet_names_lower_index["info"])
        else:
            info_df = None
        base_dir = mdl.get_base_dir(None)
        mdl._load_from_df(info_df, package_df, base_dir=base_dir)
        if "ckan" in sheet_names_lower_index:
            ckan_df = _read_attribute_sheet(xls, sheet_names_lower_index["ckan"])
            mdl.ckan_builder._load_from_df(ckan_df, base_dir=base_dir, proxies=proxies)
        resources_df = pd.read_excel(xls, sheet_name=sheet_names_lower_index["resources"])
        mdl._load_package_resources_list_df(resources_df, base_dir=base_dir)
        # sheets left once the reserved ones are removed: candidates for DataStore field definitions
        resource_sheets = sheet_names - {sheet_names_lower_index[name] for name in forbidden_resource_names if name in sheet_names_lower_index}
        for resource_builder in mdl.resource_builders.values():
            resource_sheet = None
            equiv_name = excel_name_of_builder(resource_builder)
            if resource_builder.name in resource_sheets:
                resource_sheet = resource_builder.name
            elif equiv_name in resource_sheets:
                resource_sheet = equiv_name
            if resource_sheet is not None:
                fields_df = pd.read_excel(xls, sheet_name=resource_sheet)
                assert isinstance(resource_builder, (BuilderDataStoreABC, BuilderMultiDataStore))
                resource_builder._load_fields_df(fields_df)
                resource_sheets.remove(resource_sheet)
        mdl.update_package_name_in_resources()
        if len(resource_sheets) > 0:
            # sorted: the order of the message does not depend on the iteration order of the set
            msg = f"Sheets present but not used: {', '.join(sorted(resource_sheets))}"
            warn(msg)
    return mdl
760
+
761
@staticmethod
def from_dict(d:dict, base_dir:str=None, *, proxies:dict=None) -> "BuilderPackageBasic":
    """
    Load package definition from a dictionary.
    In this case, the base directory used to specify the resources locations must be given manually.
    This is usually the directory of the file where the dictionary comes from.

    Top-level keys are matched case-insensitively, ignoring surrounding spaces. "package" and
    "resources" are required; "info" and "ckan" are optional. The fields of a DataStore are read
    from the "fields" key of its resource definition or, failing that, from a top-level entry named
    after the resource (or its Excel equivalent name). A warning is issued for unused top-level entries.

    :param d: dictionary describing the package
    :param base_dir: base directory forwarded to the loading methods
    :param proxies: forwarded to the CKAN builder when a "ckan" entry is present
    :return:
    """
    datastore_types = (BuilderDataStoreABC, BuilderMultiDataStore)
    mdl = BuilderPackageBasic()
    mdl.builder_source_file = None
    sheet_names = set(d.keys())
    sheet_names_lower_index = {}
    for sheet_name in sheet_names:
        sheet_names_lower_index[sheet_name.lower().strip()] = sheet_name
    info_dict = None
    if "info" in sheet_names_lower_index:
        info_dict = d[sheet_names_lower_index["info"]]
    mdl._load_from_dict(info_dict, d[sheet_names_lower_index["package"]], base_dir=base_dir)
    if "ckan" in sheet_names_lower_index:
        mdl.ckan_builder._load_from_dict(d[sheet_names_lower_index["ckan"]], base_dir=base_dir, proxies=proxies)
    # resource definitions with normalized keys, indexed by resource name
    resources_dict = {}
    for resource_dict in d[sheet_names_lower_index["resources"]]:
        normalized = {key.lower().strip(): value for key, value in resource_dict.items()}
        resources_dict[normalized["name"]] = normalized
    resources_df = pd.DataFrame(list(resources_dict.values()))
    mdl._load_package_resources_list_df(resources_df, base_dir=base_dir)
    reserved_sheets = {sheet_names_lower_index[name] for name in forbidden_resource_names if name in sheet_names_lower_index}
    resource_sheets = sheet_names - reserved_sheets
    for resource_builder in mdl.resource_builders.values():
        if "fields" in resources_dict[resource_builder.name]:
            # fields defined inline, in the resource definition
            assert isinstance(resource_builder, datastore_types)
            resource_builder._load_fields_df(pd.DataFrame(resources_dict[resource_builder.name]["fields"]))
            continue
        # otherwise, look for a top-level entry named after the resource
        equiv_name = excel_name_of_builder(resource_builder)
        if resource_builder.name in resource_sheets:
            resource_sheet = resource_builder.name
        elif equiv_name in resource_sheets:
            resource_sheet = equiv_name
        else:
            resource_sheet = None
        if resource_sheet is None:
            continue
        assert isinstance(resource_builder, datastore_types)
        resource_builder._load_fields_df(pd.DataFrame(list(d[resource_sheet].values())))
        resource_sheets.remove(resource_sheet)
    mdl.update_package_name_in_resources()
    if len(resource_sheets) > 0:
        warn(f"Sheets present but not used: {', '.join(resource_sheets)}")
    return mdl
811
+
812
@staticmethod
def from_json(json_file, *, proxies:dict=None) -> "BuilderPackageBasic":
    """
    Load package definition from a JSON file.

    The directory of the file is used as base directory and the file path is stored in
    builder_source_file.

    :param json_file: path to the JSON file, read as UTF-8
    :param proxies: forwarded to from_dict()
    :return:
    """
    base_dir, _ = os.path.split(json_file)
    # to_json() writes UTF-8 with ensure_ascii=False: read with the same encoding rather than the
    # locale default, which is not UTF-8 on every platform
    with open(json_file, "r", encoding="utf-8") as f:
        builder_dict = json.load(f)
    mdl = BuilderPackageBasic.from_dict(builder_dict, base_dir=base_dir, proxies=proxies)
    mdl.builder_source_file = json_file
    return mdl
821
+
822
@staticmethod
def from_jsons(stream:str, *, source_file:str=None, proxies:dict=None) -> "BuilderPackageBasic":
    """
    Load package definition from a JSON string.

    :param stream: the JSON document
    :param source_file: optional path of the file the document comes from. Its directory is used as
        base directory (None if not given) and the path is stored in builder_source_file.
    :param proxies: forwarded to from_dict()
    :return:
    """
    if source_file is None:
        base_dir = None
    else:
        base_dir = os.path.split(source_file)[0]
    loaded = BuilderPackageBasic.from_dict(json.loads(stream), base_dir=base_dir, proxies=proxies)
    loaded.builder_source_file = source_file
    return loaded
829
+
830
def get_owner_org(self, ckan: CkanApiMap) -> str:
    """
    Returns the owner organization for the package.
    The owner organization can be specified by its name, title or id

    :param ckan: CKAN API object used to list the organizations and to query the map
    :return: the value of get_owner_org() of the mapped organization, or None if no organization name is defined
    """
    if self.organization_name is None:
        return None
    # make sure the organizations are listed before looking up the map
    ckan.organization_list_all(cancel_if_present=True)
    organization_info = ckan.map.get_organization_info(self.organization_name, error_not_mapped=True)
    return organization_info.get_owner_org()
846
+
847
def get_license_id(self, ckan: CkanApiMap) -> str:
    """
    Returns the license for the package.
    The license can be specified by its title or id

    :param ckan: CKAN API object used to list the licenses and to query the map
    :return: the license id found in the map, or None if no license name is defined
    """
    if self.license_name is None:
        return None
    # make sure the licenses are listed before looking up the map
    ckan.license_list(cancel_if_present=True)
    return ckan.map.get_license_id(self.license_name, error_not_mapped=True)
861
+
862
def get_license_info(self, ckan: CkanApiMap) -> CkanLicenseInfo:
    """
    Returns the license information of the package from the CKAN map.

    :param ckan:
    :return: the mapped license information, or None if get_license_id() returns None
    """
    license_id = self.get_license_id(ckan)
    if license_id is None:
        return None
    return ckan.map.get_license_info(license_id)
866
+
867
def get_license_name(self, ckan: CkanApiMap) -> str:
    """
    Returns the title of the license of the package.

    :param ckan:
    :return: the title of the license information, or None if get_license_info() returns None
    """
    license_info = self.get_license_info(ckan)
    if license_info is None:
        return None
    return license_info.title
870
+
871
def patch_request_package(self, ckan:CkanApi) -> CkanPackageInfo:
    """
    Perform all the necessary requests to initiate/reupload the package on the CKAN server.
    This function does not upload the package resources.
    NB: the organization must be provided, especially if the package is private

    :param ckan:
    :return: the result of ckan.package_create, called with cancel_if_exists=True and update_if_exists=True
    """
    owner_org = self.get_owner_org(ckan)
    license_id = self.get_license_id(ckan)
    attributes = self.package_attributes
    return ckan.package_create(
        self.package_name,
        private=attributes.private,
        state=attributes.state,
        title=attributes.title,
        notes=attributes.description,
        owner_org=owner_org,
        tags=attributes.tags,
        custom_fields=attributes.custom_fields,
        url=attributes.url,
        version=attributes.version,
        author=attributes.author,
        author_email=attributes.author_email,
        maintainer=attributes.maintainer,
        maintainer_email=attributes.maintainer_email,
        license_id=license_id,
        cancel_if_exists=True,
        update_if_exists=True,
    )
890
+
891
def patch_request_full(self, ckan:CkanApi, *,
                       reupload:bool=False, resources_base_dir:str=None,
                       create_default_view:bool=True) \
        -> Tuple[CkanPackageInfo, Dict[str, CkanResourceInfo]]:
    """
    Perform necessary requests to initiate/reupload the package and resources on the CKAN server.
    For folder resources, this only uploads the first file of the resource.

    :param ckan:
    :param reupload: forwarded to the patch_request() method of each resource builder
    :param resources_base_dir: base directory of the resources, resolved with get_resources_base_dir()
    :param create_default_view: if not None, value assigned to the create_default_view attribute of
        each resource builder before patching it
    :return: tuple (package information, dictionary resource name -> resource information)
    """
    # local checks first: policy (if enabled) and presence of the files to upload
    if ckan.params.policy_check_pre:
        self.local_policy_check()
    resources_base_dir = self.get_resources_base_dir(resources_base_dir)
    self.upload_file_checks(resources_base_dir=resources_base_dir, ckan=ckan, verbose=True, raise_error=True)
    # package request, then one request per resource
    pkg_info = self.patch_request_package(ckan)
    ckan.map_resources(self.package_name, datastore_info=True)
    package_id = pkg_info.id
    self.package_attributes.id = package_id
    resource_info_dict: Dict[str, CkanResourceInfo] = {}
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    override_default_view = create_default_view is not None
    for builder in self.resource_builders.values():
        if override_default_view:
            builder.create_default_view = create_default_view
        patched_info = builder.patch_request(ckan, package_id, reupload=reupload, resources_base_dir=resources_base_dir)
        resource_info_dict[builder.name] = patched_info
        if patched_info is None:
            # no resource information is only expected from a BuilderMultiFile
            assert isinstance(builder, BuilderMultiFile)
        else:
            pkg_info.update_resource(patched_info)
    self.package_resource_reorder(ckan)
    if ckan.params.policy_check_post:
        self.remote_policy_check(ckan)
    return pkg_info, resource_info_dict
927
+
928
def _get_mono_resource_used_files(self, resources_base_dir:str):
    """
    List the files used by the resource builders other than BuilderMultiFile.

    - BuilderDataStoreMultiABC builders (except BuilderDataStoreHarvester): the files returned by
      init_local_files_list()
    - other builders, except BuilderMultiFile: the sample file path, if defined and not an url

    :param resources_base_dir: base directory forwarded to the resource builders
    :return: set of files
    """
    used_files = set()
    for builder in self.resource_builders.values():
        if isinstance(builder, BuilderDataStoreMultiABC):
            if isinstance(builder, BuilderDataStoreHarvester):
                continue
            local_files = builder.init_local_files_list(resources_base_dir=resources_base_dir)
            used_files.update(set(local_files))
            continue
        if isinstance(builder, BuilderMultiFile):
            continue
        if builder.get_sample_file_path(resources_base_dir) is None:
            continue
        if builder.sample_file_path_is_url():
            continue
        used_files.add(builder.get_sample_file_path(resources_base_dir))
    return used_files
945
+
946
def upload_file_checks(self, resource_name:Union[str, List[str]]=None, *, resources_base_dir:str=None,
                       messages:Dict[str, ContextErrorLevelMessage]=None,
                       verbose:bool=True, raise_error:bool=False, ckan:CkanApi=None, **kwargs) -> bool:
    """
    Check the presence of all needed files before uploading or patching resources.

    :param resource_name: name or list of names of the resources to check. None checks all the resources.
    :param resources_base_dir: base directory of the resources, resolved with get_resources_base_dir()
    :param messages: optional dictionary resource name -> message, filled in place with the result of
        the check of each resource (None if the resource builder returned no message)
    :param verbose: True to print the messages which are not None
    :param raise_error: True to raise an error if at least one message has the level ErrorLevel.Error
    :param ckan: Optional CkanApi object used to parameterize the requests to test the presence of resources defined by an url.
    :param kwargs: keyword arguments to specify connexion parameters for querying the urls.
    :return: True if no message has the level ErrorLevel.Error
    :raises FileNotFoundError: if raise_error is set and the check failed
    """
    if resource_name is None:
        resource_name = list(self.resource_builders.keys())
    elif isinstance(resource_name, str):
        resource_name = [resource_name]
    if messages is None:
        messages = {}
    self.update_package_name_in_resources()
    resources_base_dir = self.get_resources_base_dir(resources_base_dir)
    mono_resource_used_files = self._get_mono_resource_used_files(resources_base_dir)
    for name in resource_name:
        builder = self.resource_builders[name]
        if not isinstance(builder, BuilderMultiFile):
            messages[name] = builder.upload_file_checks(resources_base_dir=resources_base_dir, ckan=ckan, **kwargs)
            continue
        # multi-file builders can ignore the files already used by the other builders
        messages[name] = builder.upload_file_checks(resources_base_dir=resources_base_dir, ckan=ckan,
                                                    excluded_files=mono_resource_used_files if multi_file_exclude_other_files else None, **kwargs)
    reported = [(key, message) for key, message in messages.items() if message is not None]
    errors = [(key, message) for key, message in reported if message.error_level == ErrorLevel.Error]
    success = len(errors) == 0
    if verbose and len(reported) > 0:
        print("\n".join([f"for resource {key}: {message}" for key, message in reported]))
    if raise_error and not success:
        raise FileNotFoundError("\n".join([f"for resource {key}: {message}" for key, message in errors]))
    return success
980
+
981
def upload_large_datasets(self, ckan:CkanApi, *, resources_base_dir:str=None, threads:int=1,
                          progress_callback:Callable=None, only_missing:bool=False) -> None:
    """
    Upload the large datasets of the package.
    The small datasets are to be uploaded with the patch_request_full method.

    The BuilderDataStoreMultiABC resources are uploaded first, then the BuilderMultiFile resources.
    The file checks run beforehand only cover the BuilderDataStoreMultiABC resources.

    :param ckan:
    :param resources_base_dir: base directory of the resources, resolved with get_resources_base_dir()
    :param threads: forwarded to the upload_request_full() method of the resource builders
    :param progress_callback: if not None, assigned to the progress_callback attribute of the resource builders
    :param only_missing: upsert only missing rows for DataStores and only missing files for MultiFile
    :return:
    """
    self.info_request_package(ckan=ckan)
    resources_base_dir = self.get_resources_base_dir(resources_base_dir)
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    checked_names = []
    for name, builder in self.resource_builders.items():
        if isinstance(builder, BuilderDataStoreMultiABC):
            checked_names.append(name)
    self.upload_file_checks(checked_names, resources_base_dir=resources_base_dir, ckan=ckan, verbose=True, raise_error=True)
    mono_resource_used_files = self._get_mono_resource_used_files(resources_base_dir)
    # first pass: DataStores defined by several files
    for builder in self.resource_builders.values():
        if not isinstance(builder, BuilderDataStoreMultiABC):
            continue
        if progress_callback is not None:
            builder.progress_callback = progress_callback
        builder.upload_request_full(ckan=ckan, resources_base_dir=resources_base_dir, threads=threads,
                                    only_missing=only_missing)
    # second pass: multi-file resources, which can ignore the files used by the other builders
    for builder in self.resource_builders.values():
        if not isinstance(builder, BuilderMultiFile):
            continue
        if progress_callback is not None:
            builder.progress_callback = progress_callback
        builder.upload_request_full(ckan=ckan, resources_base_dir=resources_base_dir, threads=threads,
                                    only_missing=only_missing,
                                    excluded_files=mono_resource_used_files if multi_file_exclude_other_files else None)
    self.package_resource_reorder(ckan)
1015
+
1016
def download_resource_df(self, ckan:CkanApi, resource_name:str, search_all:bool=False, **kwargs) -> pd.DataFrame:
    """
    Proxy for download_sample_df for a DataStore

    :param ckan:
    :param resource_name: name of the resource, which must be a BuilderDataStoreABC
    :param search_all: forwarded to download_sample_df() of the resource builder
    :param kwargs: extra keyword arguments forwarded to download_sample_df() of the resource builder
    :return: the result of download_sample_df() of the resource builder
    """
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    datastore_builder = self.resource_builders[resource_name]
    assert isinstance(datastore_builder, BuilderDataStoreABC)
    return datastore_builder.download_sample_df(ckan=ckan, search_all=search_all, **kwargs)
1024
+
1025
def download_resource(self, ckan:CkanApi, resource_name:str, full_download:bool=False, **kwargs) -> bytes:
    """
    Proxy for download_sample for a resource

    :param ckan:
    :param resource_name: name of the resource
    :param full_download: forwarded to download_sample() of the resource builder
    :param kwargs: extra keyword arguments forwarded to download_sample() of the resource builder
    :return: the result of download_sample() of the resource builder
    """
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    selected_builder = self.resource_builders[resource_name]
    return selected_builder.download_sample(ckan=ckan, full_download=full_download, **kwargs)
1032
+
1033
def get_or_query_resource_id(self, ckan:CkanApi, resource_name:str, error_not_found:bool=True) -> str:
    """
    Proxy for get_or_query_resource_id of a resource builder

    :param ckan:
    :param resource_name: name of the resource
    :param error_not_found: forwarded to get_or_query_resource_id() of the resource builder
    :return: the result of get_or_query_resource_id() of the resource builder
    """
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    selected_builder = self.resource_builders[resource_name]
    return selected_builder.get_or_query_resource_id(ckan, error_not_found=error_not_found)
1037
+
1038
def _get_mono_resource_names(self):
    """
    List resource names of mono-resource builders, i.e. the builders which are not a BuilderMultiFile.

    :return: set of resource names (keys of self.resource_builders)
    """
    mono_names = set()
    for name, builder in self.resource_builders.items():
        if isinstance(builder, BuilderMultiFile):
            continue
        mono_names.add(name)
    return mono_names
1045
+
1046
def download_request_full(self, ckan:CkanApi, out_dir:str=None, enforce_none_out_dir:bool=False, resource_name:str=None, full_download:bool=False,
                          threads:int=1, skip_existing:bool=True, progress_callback:Callable=None,
                          force:bool=False, rm_dir:bool=False) -> None:
    """
    Downloads the full package resources into out_dir.

    The resources are processed in three passes: first the builders which are neither a
    BuilderDataStoreMultiABC nor a BuilderMultiFile, then the BuilderDataStoreMultiABC builders,
    and finally the BuilderMultiFile builders.

    :param ckan:
    :param out_dir: download directory
    :param rm_dir: remove directory if exists before downloading
    :param skip_existing: skip download of existing resources
    :param enforce_none_out_dir: if no out_dir is provided, True: files will not be saved after download, False: default output dir will be used, if defined
    :param resource_name: name of the only resource to download. None downloads all the resources.
    :param full_download: option to fully download the resources. If False, only a partial download is made.
    :param threads: forwarded to the download_request() method of the resource builders
    :param progress_callback: if not None, assigned to the progress_callback attribute of the
        BuilderDataStoreMultiABC and BuilderMultiFile builders
    :param force: option to bypass the enable_download attribute of resources
    :return:
    """
    out_dir = self.get_default_out_dir(out_dir, enforce_none=enforce_none_out_dir)
    if out_dir is not None and os.path.isdir(out_dir) and rm_dir:
        shutil.rmtree(out_dir)
    self.info_request_package(ckan=ckan)
    if resource_name is not None:
        selected_builders = {resource_name: self.resource_builders[resource_name]}
    else:
        selected_builders = self.resource_builders
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    mono_resource_names = self._get_mono_resource_names()
    multi_types = (BuilderDataStoreMultiABC, BuilderMultiFile)
    # first pass: configure all the builders and download the mono-resource ones
    for builder in selected_builders.values():
        if skip_existing is not None:
            builder.download_skip_existing = skip_existing
        if isinstance(builder, multi_types):
            continue
        builder.download_request(ckan, out_dir=out_dir, full_download=full_download,
                                 threads=threads, force=force)
    # second pass: DataStores defined by several files
    for builder in selected_builders.values():
        if not isinstance(builder, BuilderDataStoreMultiABC):
            continue
        if progress_callback is not None:
            builder.progress_callback = progress_callback
        builder.download_request(ckan, out_dir=out_dir, full_download=full_download,
                                 threads=threads, force=force)
    # third pass: multi-file resources, excluding the resources handled by the other builders
    for builder in selected_builders.values():
        if not isinstance(builder, BuilderMultiFile):
            continue
        if progress_callback is not None:
            builder.progress_callback = progress_callback
        builder.download_request(ckan, out_dir=out_dir, full_download=full_download,
                                 threads=threads, force=force, excluded_resource_names=mono_resource_names)
1094
+
1095
def download_sample_df(self, ckan:CkanApi, resource_name:str=None, *, search_all:bool=False, **kwargs) -> Dict[str, pd.DataFrame]:
    """
    Download a sample DataFrame for the DataStore type resources.

    The package information is requested first, then the package name and the CKAN options
    are propagated to the resource builders before any sample is downloaded.
    Resource builders which are not instances of BuilderDataStoreABC are ignored.

    :param ckan: CKAN API instance used for the requests
    :param resource_name: name of a single resource builder to sample. If None, all resource builders are considered.
    :param search_all: forwarded to the download_sample_df method of each resource builder
    :param kwargs: additional keyword arguments forwarded to the download_sample_df method of each resource builder
    :return: dictionary mapping the name of each sampled resource builder to its sample
    """
    self.info_request_package(ckan=ckan)
    if resource_name is not None:
        selected_builders = {resource_name: self.resource_builders[resource_name]}
    else:
        selected_builders = self.resource_builders
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    samples = {}
    for builder in selected_builders.values():
        if not isinstance(builder, BuilderDataStoreABC):
            continue
        sample_df = builder.download_sample_df(ckan, search_all=search_all, **kwargs)
        samples[builder.name] = sample_df
    return samples
1115
+
1116
def download_sample(self, ckan:CkanApi, resource_name:str=None, *, datastores_as_df:bool=True, search_all:bool=False, **kwargs) -> Dict[str, Union[bytes, pd.DataFrame]]:
    """
    Download samples from all resources.

    The package information is requested first, then the package name and the CKAN options
    are propagated to the resource builders before any sample is downloaded.

    :param ckan: CKAN API instance used for the requests
    :param resource_name: name of a single resource builder to sample. If None, all resource builders are considered.
    :param datastores_as_df: if True, builders which are instances of BuilderDataStoreABC are sampled
        with their download_sample_df method. All other cases use the download_sample method.
    :param search_all: forwarded to the sampling method of each resource builder
    :param kwargs: additional keyword arguments forwarded to the sampling method of each resource builder
    :return: dictionary mapping the name of each resource builder to its sample
    """
    self.info_request_package(ckan=ckan)
    if resource_name is not None:
        selected_builders = {resource_name: self.resource_builders[resource_name]}
    else:
        selected_builders = self.resource_builders
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    samples = {}
    for builder in selected_builders.values():
        as_dataframe = datastores_as_df and isinstance(builder, BuilderDataStoreABC)
        # choose the sampling method once, then call it with the shared arguments
        fetch_sample = builder.download_sample_df if as_dataframe else builder.download_sample
        sample = fetch_sample(ckan, search_all=search_all, **kwargs)
        samples[builder.name] = sample
    return samples
1138
+
1139
def info_request_package(self, ckan:CkanApi) -> CkanPackageInfo:
    """
    Obtain the information of the package designated by self.package_name and record its id.

    :param ckan: CKAN API instance, queried through get_package_info_or_request
    :return: the package information. Its id is also stored in self.package_attributes.id
    """
    package_info = ckan.get_package_info_or_request(package_name=self.package_name)
    self.package_attributes.id = package_info.id
    return package_info
1143
+
1144
def info_request_full(self, ckan:CkanApi) -> Tuple[CkanPackageInfo, List[CkanResourceInfo]]:
    """
    Obtain the information of the package and of each resource defined in this builder.

    The package name and the CKAN options are propagated to the resource builders
    before the resource information is requested.

    :param ckan: CKAN API instance used for the requests
    :return: the package information and the list of results of resource_info_request,
        in the order of self.resource_builders
    """
    package_info = self.info_request_package(ckan)
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    resources_info = []
    for builder in self.resource_builders.values():
        resources_info.append(builder.resource_info_request(ckan))
    return package_info, resources_info
1150
+
1151
def get_base_dir(self, base_dir:str=None) -> str:
    """
    Returns the default base_dir if not specified. The base_dir is the location of the Excel workbook.
    If this was initialized from a dictionary, the current working directory will be used (cwd).
    The current working directory is also used if the path of the source file has no directory component.

    :param base_dir: explicit base directory. If not None, it is returned unchanged.
    :return: the base directory
    """
    if base_dir is not None:
        return base_dir
    if self.builder_source_file is None:
        return os.path.abspath(".")
    source_dir = os.path.dirname(self.builder_source_file)
    if not source_dir:
        # a bare file name (e.g. "builder.xlsx") has an empty directory part:
        # resolve it to the current working directory instead of returning ""
        return os.path.abspath(".")
    return source_dir
1164
+
1165
def get_resources_base_dir(self, resources_base_dir:str) -> str:
    """
    This returns the base directory for the resource files.
    It is distinct from the base_dir and can be defined relative to the base_dir in the Excel workbook (see comment at the top of the class).

    :param resources_base_dir: explicit directory. If None, the value stored in self._resources_base_dir is used.
    :return: the argument if it is not None, otherwise self._resources_base_dir
    """
    if resources_base_dir is not None:
        return resources_base_dir
    return self._resources_base_dir
1176
+
1177
def get_default_out_dir(self, out_dir:str, enforce_none:bool=False) -> str:
    """
    This returns the default download directory.

    :param out_dir: explicit output directory. If not None, it is returned unchanged.
    :param enforce_none: if True, a None out_dir is returned as is instead of being replaced by self._default_out_dir
    :return: the output directory to use (can be None)
    """
    use_default = out_dir is None and not enforce_none
    return self._default_out_dir if use_default else out_dir
1187
+
1188
def init_ckan(self, ckan:CkanApi=None, *, base_dir:str=None, set_owner_org:bool=False,
              default_proxies:dict=None, proxies:Union[str,dict,ProxyConfig]=None) -> CkanApi:
    """
    Initialize the CKAN instance from the parameters defined in the "ckan" tab of the Excel workbook.

    :param ckan: existing CKAN instance, forwarded to self.ckan_builder.init_ckan
    :param base_dir: base directory, resolved with get_base_dir if None
    :param default_proxies: forwarded to self.ckan_builder.init_ckan
    :param proxies: forwarded to self.ckan_builder.init_ckan
    :param set_owner_org: Option to set the owner_org of the CKAN instance.
        This can be problematic because it requires some requests as the proxies are not set.
        It can be omitted because it has no influence on the patch_request_package function.
    :return: the CKAN instance returned by self.ckan_builder.init_ckan
    """
    # the base directory is necessary to find the API key file, if provided
    resolved_base_dir = self.get_base_dir(base_dir)
    ckan = self.ckan_builder.init_ckan(resolved_base_dir, ckan=ckan,
                                       default_proxies=default_proxies, proxies=proxies)
    if not set_owner_org or self.organization_name is None:
        return ckan
    ckan.owner_org = self.get_owner_org(ckan)
    return ckan
1207
+
1208
def get_or_query_package_id(self, ckan: CkanApi) -> str:
    """
    Return the id of the package designated by self.package_name and record it.

    :param ckan: CKAN API instance, queried through get_package_info_or_request
    :return: the package id, which is also stored in self.package_attributes.id
    """
    package_id = ckan.get_package_info_or_request(self.package_name).id
    self.package_attributes.id = package_id
    return package_id
1212
+
1213
def list_resource_ids(self, ckan: CkanApi) -> List[str]:
    """
    List resource ids on CKAN server, following the order of the package builder

    Builders which are not instances of BuilderMultiFile contribute the result of get_or_query_resource_id.
    BuilderMultiFile builders contribute the result of list_remote_resource_ids, called with the names
    of the other builders as excluded_resource_names.
    Duplicate ids are removed, keeping the position of their first occurrence.

    :param ckan: CKAN API instance used for the requests
    :return: list of unique resource ids, in the order of self.resource_builders
    """
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    mono_resource_names = {resource_name for resource_name, resource_builder in self.resource_builders.items()
                           if not isinstance(resource_builder, BuilderMultiFile)}
    resource_ids = []
    for resource_builder in self.resource_builders.values():
        if isinstance(resource_builder, BuilderMultiFile):
            resource_ids.extend(resource_builder.list_remote_resource_ids(ckan, excluded_resource_names=mono_resource_names,
                                                                          cancel_if_present=False))
        else:
            resource_ids.append(resource_builder.get_or_query_resource_id(ckan))
    # dict keys keep their insertion order: this removes duplicates while keeping the first occurrence,
    # without converting the ids through a numpy array
    return list(dict.fromkeys(resource_ids))
1237
+
1238
def package_resource_reorder(self, ckan: CkanApi) -> None:
    """
    Apply the order of the resources defined in the Excel workbook.

    The package id is obtained first, then the ordered resource ids are listed
    and both are sent to the package resource reorder API call.

    :param ckan: CKAN API instance used for the requests
    :return:
    """
    # OrderedDict ensures the order of resources is preserved
    target_package_id = self.get_or_query_package_id(ckan=ckan)
    ordered_resource_ids = self.list_resource_ids(ckan=ckan)
    ckan._api_package_resource_reorder(package_id=target_package_id, resource_ids=ordered_resource_ids)
1249
+
1250
def remote_policy_check(self, ckan: CkanApi, policy:CkanPackageDataFormatPolicy=None,
                        *, buffer:Dict[str, List[DataPolicyError]]=None, raise_error:bool=False,
                        verbose:bool=None) -> bool:
    """
    Check the package defined by this builder against a data format policy, based on the information from the API.

    :param ckan: CKAN API instance performing the check through its policy_check method
    :param policy: policy to check against. If None, self.ckan_builder.policy is used.
    :param buffer: forwarded to ckan.policy_check
    :param raise_error: forwarded to ckan.policy_check
    :param verbose: forwarded to ckan.policy_check
    :return: the result of ckan.policy_check
    """
    selected_policy = policy if policy is not None else self.ckan_builder.policy
    return ckan.policy_check(package_list=self.package_name, policy=selected_policy, buffer=buffer,
                             verbose=verbose, raise_error=raise_error)
1267
+
1268
def local_policy_check(self, policy:CkanPackageDataFormatPolicy=None,
                       *, buffer:Dict[str, List[DataPolicyError]]=None, raise_error:bool=False,
                       verbose:bool=True) -> bool:
    """
    Check if the package builder respects a data format policy (only on local definition).

    :param policy: policy to check against. If None, self.ckan_builder.policy is used.
        If that is also None, the check is skipped and True is returned.
    :param buffer: if not None, the list of errors collected for the package is stored under the package name
    :param raise_error: forwarded to policy.policy_check_package
    :param verbose: forwarded as display_message to policy.policy_check_package. If True, the result is also printed.
    :return: True if no policy is loaded, otherwise the result of policy.policy_check_package
    """
    active_policy = policy if policy is not None else self.ckan_builder.policy
    if active_policy is None:
        # no policy loaded at all
        return True
    package_info = self.to_ckan_package_info(check_id=False)
    collected_errors = []
    success = active_policy.policy_check_package(package_info, display_message=verbose,
                                                 package_buffer=collected_errors, raise_error=raise_error)
    if buffer is not None:
        buffer[package_info.name] = collected_errors
    if verbose:
        print(f"Data format policy {active_policy.label} success: {success}")
    return success
1290
+
1291
+