ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ckanapi_harvesters/__init__.py +32 -10
- ckanapi_harvesters/auxiliary/__init__.py +26 -0
- ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
- ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
- ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
- ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
- ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
- ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
- ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
- ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
- ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
- ckanapi_harvesters/auxiliary/deprecated.py +82 -0
- ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
- ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
- ckanapi_harvesters/auxiliary/list_records.py +60 -0
- ckanapi_harvesters/auxiliary/login.py +163 -0
- ckanapi_harvesters/auxiliary/path.py +208 -0
- ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
- ckanapi_harvesters/auxiliary/urls.py +40 -0
- ckanapi_harvesters/builder/__init__.py +40 -0
- ckanapi_harvesters/builder/builder_aux.py +20 -0
- ckanapi_harvesters/builder/builder_ckan.py +238 -0
- ckanapi_harvesters/builder/builder_errors.py +36 -0
- ckanapi_harvesters/builder/builder_field.py +122 -0
- ckanapi_harvesters/builder/builder_package.py +9 -0
- ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
- ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
- ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
- ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
- ckanapi_harvesters/builder/builder_resource.py +589 -0
- ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
- ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
- ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
- ckanapi_harvesters/builder/builder_resource_init.py +126 -0
- ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
- ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
- ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
- ckanapi_harvesters/builder/example/__init__.py +21 -0
- ckanapi_harvesters/builder/example/builder_example.py +21 -0
- ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
- ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
- ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
- ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
- ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
- ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
- ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
- ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
- ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
- ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
- ckanapi_harvesters/builder/mapper_datastore.py +93 -0
- ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
- ckanapi_harvesters/builder/specific/__init__.py +11 -0
- ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
- ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
- ckanapi_harvesters/ckan_api/__init__.py +20 -0
- ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
- ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
- ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
- ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
- ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
- ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
- ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
- ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
- ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
- ckanapi_harvesters/harvesters/__init__.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
- ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
- ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
- ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
- ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
- ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
- ckanapi_harvesters/harvesters/harvester_init.py +30 -0
- ckanapi_harvesters/harvesters/harvester_model.py +49 -0
- ckanapi_harvesters/harvesters/harvester_params.py +323 -0
- ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
- ckanapi_harvesters/harvesters/postgre_params.py +86 -0
- ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
- ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
- ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
- ckanapi_harvesters/policies/__init__.py +20 -0
- ckanapi_harvesters/policies/data_format_policy.py +269 -0
- ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
- ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
- ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
- ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
- ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
- ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
- ckanapi_harvesters/reports/__init__.py +11 -0
- ckanapi_harvesters/reports/admin_report.py +292 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/METADATA +74 -38
- ckanapi_harvesters-0.0.2.dist-info/RECORD +105 -0
- ckanapi_harvesters/divider/__init__.py +0 -27
- ckanapi_harvesters/divider/divider.py +0 -53
- ckanapi_harvesters/divider/divider_error.py +0 -59
- ckanapi_harvesters/main.py +0 -30
- ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/WHEEL +0 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1291 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Code to upload metadata to the CKAN server to create/update an existing package
|
|
5
|
+
The metadata is defined by the user in an Excel worksheet
|
|
6
|
+
This file implements the package definition.
|
|
7
|
+
"""
|
|
8
|
+
from typing import Dict, List, Tuple, Union, Callable
|
|
9
|
+
from warnings import warn
|
|
10
|
+
import os
|
|
11
|
+
import shutil
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
from collections import OrderedDict
|
|
15
|
+
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
from ckanapi_harvesters.policies.data_format_policy_errors import DataPolicyError
|
|
20
|
+
from ckanapi_harvesters.policies.data_format_policy import CkanPackageDataFormatPolicy
|
|
21
|
+
from ckanapi_harvesters.ckan_api import CkanApi, CkanApiMap
|
|
22
|
+
from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
|
|
23
|
+
from ckanapi_harvesters.auxiliary.proxy_config import ProxyConfig
|
|
24
|
+
from ckanapi_harvesters.auxiliary.ckan_model import CkanVisibility, CkanState, CkanPackageInfo, CkanResourceInfo, CkanDataStoreInfo, CkanLicenseInfo
|
|
25
|
+
from ckanapi_harvesters.auxiliary.path import sanitize_path, path_rel_to_dir, make_path_relative
|
|
26
|
+
from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element, assert_or_raise, find_duplicates
|
|
27
|
+
from ckanapi_harvesters.auxiliary.ckan_defs import ckan_tags_sep
|
|
28
|
+
from ckanapi_harvesters.auxiliary.ckan_errors import (UnexpectedError, DuplicateNameError, ForbiddenNameError, MissingIdError,
|
|
29
|
+
MandatoryAttributeError, FileOrDirNotExistError)
|
|
30
|
+
from ckanapi_harvesters.auxiliary.ckan_configuration import unlock_external_url_resource_download, unlock_no_ca
|
|
31
|
+
from ckanapi_harvesters.builder.builder_errors import MissingDataStoreInfoError, UnsupportedBuilderVersionError
|
|
32
|
+
from ckanapi_harvesters.builder import BUILDER_FILE_FORMAT_VERSION as BUILDER_VER
|
|
33
|
+
from ckanapi_harvesters.builder.builder_resource import BuilderResourceABC
|
|
34
|
+
from ckanapi_harvesters.builder.builder_resource_multi_file import BuilderMultiFile, multi_file_exclude_other_files
|
|
35
|
+
from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreABC
|
|
36
|
+
from ckanapi_harvesters.builder.builder_resource_multi_datastore import BuilderMultiDataStore
|
|
37
|
+
from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import BuilderDataStoreMultiABC
|
|
38
|
+
from ckanapi_harvesters.builder.builder_resource_datastore_multi_harvester import BuilderDataStoreHarvester
|
|
39
|
+
from ckanapi_harvesters.builder.builder_resource_init import init_resource_from_df, init_resource_from_ckan
|
|
40
|
+
from ckanapi_harvesters.builder.builder_ckan import BuilderCkan
|
|
41
|
+
from ckanapi_harvesters.auxiliary.external_code_import import PythonUserCode, unlock_external_code_execution
|
|
42
|
+
|
|
43
|
+
# Absolute directory of this module; files shipped next to it are located from here.
self_dir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
# Path of the example builder workbook expected alongside this module.
example_package_xls = os.path.join(self_dir, "builder_package_example.xlsx")

# Names which are not accepted as resource names
# (presumably because they are reserved by the builder workbook - TODO confirm).
forbidden_resource_names = {
    "ckan",
    "info",
    "package",
    "resources",
    "validation",
    "help",
}
# Characters used in wildcards (MultiFile & MultiDataStore), forbidden in Excel sheet names ...
excel_subs_characters_re = r"[\*\?\[\]\+]"
# ... and the character they are replaced with when a sheet name is derived.
excel_subs_dest_character = '#'
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def excel_name_of_sheet(resource_name: str) -> str:
    """
    Derive a worksheet name from a resource name.

    Every character matched by ``excel_subs_characters_re`` is replaced
    by ``excel_subs_dest_character``.

    :param resource_name: name of the resource
    :return: the resource name with the matched characters substituted
    """
    pattern = re.compile(excel_subs_characters_re)
    return pattern.sub(excel_subs_dest_character, resource_name)
|
|
53
|
+
|
|
54
|
+
def excel_name_of_builder(resource_builder: BuilderResourceABC) -> str:
    """
    Return the name under which a resource builder is referred to in the Excel workbook.

    Only builders of type BuilderMultiDataStore have their name passed through
    excel_name_of_sheet; the name of any other builder is returned unchanged.

    :param resource_builder: the resource builder to name
    :return: the (possibly substituted) resource name
    """
    if not isinstance(resource_builder, BuilderMultiDataStore):
        return resource_builder.name
    return excel_name_of_sheet(resource_builder.name)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class BuilderPackageBasic:
|
|
62
|
+
"""
|
|
63
|
+
Class to store an image of a CKAN package defined by an Excel worksheet
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
__NB__: There are several paths to distinguish:
|
|
67
|
+
|
|
68
|
+
- the path of the Excel worksheet
|
|
69
|
+
- base_dir: the base directory for relative paths
|
|
70
|
+
- resources_base_dir: the base directory for resources (for upload), which is generally defined relative to base_dir
|
|
71
|
+
- out_dir: the output directory, for download, absolute or relative to the cwd (current working directory)
|
|
72
|
+
|
|
73
|
+
__NB__: A builder can refer to the following external files:
|
|
74
|
+
|
|
75
|
+
- CKAN API key file (.txt)
|
|
76
|
+
- Proxy authentication file (.txt)
|
|
77
|
+
- CKAN CA certificate file (.pem)
|
|
78
|
+
- CA certificate for external connexions (.pem)
|
|
79
|
+
- Data format policy file (.json)
|
|
80
|
+
- External Python module (.py) containing DataFrame modification functions for upload/download of a DataStore
|
|
81
|
+
"""
|
|
82
|
+
default_to_json_reduced_size:bool = False
|
|
83
|
+
|
|
84
|
+
def __init__(self, package_name:str=None, *, package_id:str=None,
             title: str = None, description: str = None, private: bool = None, state: CkanState = None,
             version: str = None,
             url: str = None, tags: List[str] = None,
             organization_name:str=None, license_name:str=None, src=None):
    """
    Initialise a package builder, either from explicit attributes or as a copy of another builder.

    :param package_name: name of the package
    :param package_id: known id of the package
    :param title: package title
    :param description: package description
    :param private: package visibility flag
    :param state: package state
    :param version: package version
    :param url: package url
    :param tags: list of tags
    :param organization_name: designation of the owner organization
    :param license_name: designation of the license
    :param src: optional builder to copy from. When given, its ``copy(dest=self)`` method
        is applied after the default initialisation, so the copied values take precedence
        over the other arguments.
    """
    self.builder_source_file: Union[str, None] = None
    self.builder_format_version: Union[str, None] = None
    # package attributes
    self.package_attributes: CkanPackageInfo = CkanPackageInfo(package_name=package_name, package_id=package_id,
                                                               title=title, description=description, private=private, state=state,
                                                               version=version, url=url, tags=tags)
    self.organization_name: Union[str, None] = organization_name
    self.license_name: Union[str, None] = license_name
    # package resources
    self._resources_base_dir_src: Union[str, None] = None  # source of the resources_base_dir
    self._resources_base_dir: Union[str, None] = None
    self.resource_builders:OrderedDict[str,BuilderResourceABC] = OrderedDict()
    self._default_out_dir_src: Union[str, None] = None
    self._default_out_dir: Union[str, None] = None
    # auxiliary builders
    self.ckan_builder: BuilderCkan = BuilderCkan()
    self.external_python_code: Union[PythonUserCode, None] = None
    self.comment: str = ""
    # The copy must come last: all attributes above are assigned unconditionally,
    # so copying from src before them would be overwritten immediately.
    if src is not None:
        src.copy(dest=self)
|
|
109
|
+
|
|
110
|
+
def __str__(self):
    """Return a one-line summary: the package name and the number of resource builders."""
    summary = f"Package builder for {self.package_name} ({len(self.resource_builders)} resources)"
    return summary
|
|
112
|
+
|
|
113
|
+
def copy(self, dest=None) -> "BuilderPackageBasic":
    """
    Copy this builder into another object.

    The package attributes, each resource builder, the CKAN builder and the external
    Python code are duplicated through their own ``copy()`` method; the other
    attributes are assigned as-is.

    :param dest: object receiving the copy. A new BuilderPackageBasic is created if None.
    :return: dest
    """
    if dest is None:
        dest = BuilderPackageBasic()
    dest.builder_source_file = self.builder_source_file
    dest.builder_format_version = self.builder_format_version
    dest.package_attributes = self.package_attributes.copy()
    dest.organization_name = self.organization_name
    dest.license_name = self.license_name
    dest._resources_base_dir_src = self._resources_base_dir_src
    dest._resources_base_dir = self._resources_base_dir
    dest._default_out_dir_src = self._default_out_dir_src
    dest._default_out_dir = self._default_out_dir
    dest.comment = self.comment
    dest.resource_builders = OrderedDict(
        (key, resource_builder.copy()) for key, resource_builder in self.resource_builders.items()
    )
    dest.ckan_builder = self.ckan_builder.copy()
    # Always assign this attribute: an existing dest must not keep its own
    # external code when the source builder has none.
    if self.external_python_code is not None:
        dest.external_python_code = self.external_python_code.copy()
    else:
        dest.external_python_code = None
    return dest
|
|
133
|
+
|
|
134
|
+
def _check_mandatory_attributes(self):
    """
    Verify that the attributes required to define the package are present.

    Only the package name is checked. The organization is deliberately not checked
    because it can be non-mandatory depending on the CKAN configuration.

    :raises MandatoryAttributeError: if the package name is None
    """
    name_is_defined = self.package_name is not None
    if not name_is_defined:
        raise MandatoryAttributeError("Package", "name")
|
|
140
|
+
|
|
141
|
+
def clear_ids(self):
    """
    Forget every known id: the package id and the known_id of each resource builder are set to None.

    :return:
    """
    self.package_attributes.id = None
    for builder in self.resource_builders.values():
        builder.known_id = None
|
|
149
|
+
|
|
150
|
+
@staticmethod
def unlock_external_code_execution(value:bool=True):
    """
    Enable (or disable) external code execution for the PythonUserCode class.
    This is necessary to load builders which specify an Auxiliary functions file.

    __Warning__:
    only run code if you trust the source!

    :param value: forwarded unchanged to the module-level function of the same name
    :return:
    """
    # Inside the class body, this name resolves to the function imported at module level
    # (the class namespace is not searched from within a method), not to this static method.
    unlock_external_code_execution(value)
|
|
162
|
+
|
|
163
|
+
@staticmethod
def unlock_no_ca(value:bool=True):
    """
    Allow the CA verification of the CKAN server to be disabled.

    __Warning__:
    Only allow in a local environment!

    :param value: forwarded unchanged to the module-level function of the same name
    """
    # Inside the class body, this name resolves to the function imported at module level
    # (the class namespace is not searched from within a method), not to this static method.
    unlock_no_ca(value)
|
|
173
|
+
|
|
174
|
+
@staticmethod
def unlock_external_url_resource_download(value:bool=True):
    """
    Enable the download of resources which are external from the CKAN server.

    :param value: forwarded unchanged to the module-level function of the same name
    """
    # Inside the class body, this name resolves to the function imported at module level
    # (the class namespace is not searched from within a method), not to this static method.
    unlock_external_url_resource_download(value)
|
|
180
|
+
|
|
181
|
+
@property
def package_name(self) -> str:
    """Name of the package, as stored in ``package_attributes.name``."""
    attributes = self.package_attributes
    return attributes.name

@package_name.setter
def package_name(self, value:str):
    # Store the new name, then propagate it to the resource builders.
    attributes = self.package_attributes
    attributes.name = value
    self.update_package_name_in_resources()
|
|
188
|
+
|
|
189
|
+
@property
def resources_base_dir(self) -> str:
    """Resolved base directory of the resources (read-only; see set_resources_base_dir)."""
    resolved_dir = self._resources_base_dir
    return resolved_dir
|
|
192
|
+
def set_resources_base_dir(self, value:str, base_dir:str=None):
    """
    Define the base directory of the resources.

    The value is stored as the source definition, then resolved by
    _apply_resources_base_dir_src against the directory returned by get_base_dir.

    :param value: source definition of the resources base directory
    :param base_dir: base directory, forwarded to get_base_dir
    """
    self._resources_base_dir_src = value
    reference_dir = self.get_base_dir(base_dir=base_dir)
    self._apply_resources_base_dir_src(base_dir=reference_dir)
|
|
195
|
+
|
|
196
|
+
@property
def default_out_dir(self) -> str:
    """Resolved default output directory (read-only; see set_default_out_dir)."""
    resolved_dir = self._default_out_dir
    return resolved_dir
|
|
199
|
+
def set_default_out_dir(self, value:str, base_dir:str=None):
    """
    Define the default output directory.

    The value is stored as the source definition, then resolved by
    _apply_out_dir_src against the directory returned by get_base_dir.

    :param value: source definition of the default output directory
    :param base_dir: base directory, forwarded to get_base_dir
    """
    self._default_out_dir_src = value
    reference_dir = self.get_base_dir(base_dir=base_dir)
    self._apply_out_dir_src(base_dir=reference_dir)
|
|
202
|
+
|
|
203
|
+
def update_package_name_in_resources(self):
    """
    Propagate the package name to the package_name attribute of every resource builder.
    Call before any operation on resources.
    """
    current_name = self.package_name
    for builder in self.resource_builders.values():
        builder.package_name = current_name
|
|
211
|
+
|
|
212
|
+
def update_ckan_options_name_in_resources(self, ckan:CkanApi):
    """
    Let every resource builder initialise its options from the given CKAN object,
    through its init_options_from_ckan method.
    Call before any operation on resources.

    :param ckan: CKAN object passed to each resource builder
    """
    for builder in self.resource_builders.values():
        builder.init_options_from_ckan(ckan)
|
|
219
|
+
|
|
220
|
+
def _apply_resources_base_dir_src(self, base_dir:str):
    """
    The resources base directory is specified in a field of the Excel workbook.
    This function resolves the directory name, based on the location of the Excel file
    or the base_dir, if provided.

    Resolution rules:

    - no source value: base_dir is used,
    - absolute source value (after user home expansion): used as-is,
    - relative source value: joined to base_dir; the joined path also replaces the stored source value,
    - if the result exists but is not a directory, it is read as a text file whose
      first line contains the resources base directory.

    The result is passed through sanitize_path and stored in _resources_base_dir.

    :param base_dir: directory used to resolve a relative (or missing) source value
    :raises FileOrDirNotExistError: if the resolved path does not exist
    :return:
    """
    resources_base_dir_src = self._resources_base_dir_src
    if resources_base_dir_src is None:
        resources_base_dir = base_dir
    else:
        resources_base_dir_src = os.path.expanduser(resources_base_dir_src)
        if os.path.isabs(resources_base_dir_src):
            resources_base_dir = resources_base_dir_src
        else:
            assert base_dir is not None, "base_dir is required to resolve a relative resources directory"
            self._resources_base_dir_src = os.path.join(base_dir, resources_base_dir_src)
            resources_base_dir = self._resources_base_dir_src
    if resources_base_dir is not None and not os.path.isdir(resources_base_dir):
        if not os.path.exists(resources_base_dir):
            raise FileOrDirNotExistError(resources_base_dir)
        # the field points to a text file containing the resources_base_dir
        # (the with statement closes the file; no explicit close is needed)
        with open(resources_base_dir, "r") as f:
            resources_base_dir = f.readline().strip()
    self._resources_base_dir = sanitize_path(resources_base_dir)
|
|
248
|
+
|
|
249
|
+
def _get_resources_base_dir_src(self, base_dir:str):
    """
    Return the value to export for the resources base directory.

    The resolved directory (_resources_base_dir), not the stored source value,
    is passed to make_path_relative together with base_dir.

    :param base_dir: base directory given to make_path_relative
    :return: result of make_path_relative
    """
    resolved_dir = self._resources_base_dir
    return make_path_relative(resolved_dir, base_dir)
|
|
255
|
+
|
|
256
|
+
def _apply_out_dir_src(self, base_dir:str, not_exist_error:bool=False):
    """
    The default download directory is specified in a field of the Excel workbook.
    This function resolves the directory name, based on the location of the Excel file
    or the base_dir, if provided.

    Resolution rules:

    - no source value, or the keyword "none" (case-insensitive, surrounding spaces ignored): no output directory,
    - absolute source value (after user home expansion): used as-is,
    - relative source value: joined to base_dir; the joined path also replaces the stored source value,
    - if the result does not exist: an error is raised or a warning is emitted (see not_exist_error);
      in the latter case the path is stored as-is,
    - if the result exists but is not a directory, it is read as a text file whose
      first line contains the output directory.

    Except in the warning case, the result is passed through sanitize_path and stored in _default_out_dir.

    :param base_dir: directory used to resolve a relative source value
    :param not_exist_error: if True, a missing path raises an error instead of a warning
    :raises FileOrDirNotExistError: if the resolved path does not exist and not_exist_error is True
    :return:
    """
    out_dir_src = self._default_out_dir_src
    if out_dir_src is None:
        out_dir = None  # by default, do not define an output dir
    else:
        out_dir_keyword = out_dir_src.lower().strip()
        out_dir_src = os.path.expanduser(out_dir_src)
        if out_dir_keyword == "none":
            out_dir = None  # by default, do not define an output dir
        elif os.path.isabs(out_dir_src):
            out_dir = out_dir_src
        else:
            assert base_dir is not None, "base_dir is required to resolve a relative output directory"
            self._default_out_dir_src = os.path.join(base_dir, out_dir_src)
            out_dir = self._default_out_dir_src
    if out_dir is not None and not os.path.isdir(out_dir):
        if not os.path.exists(out_dir):
            if not_exist_error:
                raise FileOrDirNotExistError(out_dir)
            msg = f"Default output directory {out_dir} does not exist! It will be created if you call the download function with no out_dir."
            warn(msg)
            self._default_out_dir = out_dir
            return
        # the field points to a text file containing the out_dir
        # (the with statement closes the file; no explicit close is needed)
        with open(out_dir, "r") as f:
            out_dir = f.readline().strip()
    self._default_out_dir = sanitize_path(out_dir)
|
|
293
|
+
|
|
294
|
+
def _get_out_dir_src(self, base_dir:str):
    """
    Return the value to export for the default output directory.

    The stored source value (_default_out_dir_src) is passed to make_path_relative
    together with base_dir.

    :param base_dir: base directory given to make_path_relative
    :return: result of make_path_relative
    """
    source_value = self._default_out_dir_src
    return make_path_relative(source_value, base_dir)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _load_from_df(self, info_df: pd.DataFrame, package_df: pd.DataFrame, base_dir:str=None) -> None:
    """
    Function to load builder parameters from a DataFrame, usually from an Excel worksheet

    Columns are matched on their lower-cased, stripped label. Each recognised column is
    removed from the DataFrame once read; the columns left at the end are stored as
    custom fields under their original label. The "name" and "title" columns are read
    unconditionally, every other column is optional.

    :param info_df: optional DataFrame of builder information, concatenated column-wise to package_df
    :param package_df: DataFrame of package attributes
    :param base_dir: base directory, resolved through get_base_dir
    :return:
    """
    if info_df is not None:
        package_df = pd.concat([package_df, info_df], axis=1)
    original_columns = list(package_df.columns)
    package_df.columns = package_df.columns.map(str.lower).map(str.strip)
    renamed_columns = list(package_df.columns)

    def has_column(column: str) -> bool:
        # evaluated on the current columns: read columns are removed as we go
        return column in package_df.columns

    def pop_text(column: str, **kwargs):
        # remove the column from the DataFrame and convert it to a string
        return _string_from_element(package_df.pop(column), **kwargs)

    # info
    base_dir = self.get_base_dir(base_dir=base_dir)
    if has_column("builder format version"):
        self.builder_format_version = pop_text("builder format version").strip()
        assert_or_raise(self.builder_format_version == BUILDER_VER, UnsupportedBuilderVersionError(self.builder_format_version))
    resources_base_dir_src = None
    if has_column("resources local directory"):
        resources_base_dir_src = sanitize_path(pop_text("resources local directory"))
    self._resources_base_dir_src = resources_base_dir_src
    self._apply_resources_base_dir_src(base_dir=base_dir)
    out_dir_src = None
    if has_column("download directory"):
        out_dir_src = sanitize_path(pop_text("download directory"))
    self._default_out_dir_src = out_dir_src
    self._apply_out_dir_src(base_dir=base_dir)
    if has_column("auxiliary functions file"):
        auxiliary_functions_file = sanitize_path(pop_text("auxiliary functions file"))
        if auxiliary_functions_file is not None:
            self.external_python_code = PythonUserCode(auxiliary_functions_file, base_dir=base_dir)
    if has_column("comment"):
        self.comment = pop_text("comment", empty_value="")

    # package attributes
    self.package_attributes: CkanPackageInfo
    self.package_name = pop_text("name").strip()
    self.package_attributes.title = pop_text("title")
    for column, attribute in (("known id", "id"),
                              ("description", "description"),
                              ("version", "version")):
        if has_column(column):
            setattr(self.package_attributes, attribute, pop_text(column))
    if has_column("visibility"):
        visibility = pop_text("visibility")
        if visibility is not None:
            self.package_attributes.private = CkanVisibility.from_str(visibility).to_bool_is_private()
    if has_column("state"):
        state = pop_text("state")
        if state is not None:
            self.package_attributes.state = CkanState.from_str(state)
    if has_column("url"):
        # field not in the default Excel file
        self.package_attributes.url = pop_text("url")
    if has_column("tags"):
        tags_string = pop_text("tags")
        if tags_string is not None:
            self.package_attributes.tags = [label.strip() for label in tags_string.split(ckan_tags_sep)]
    for column, attribute in (("author", "author"),
                              ("author email", "author_email"),
                              ("maintainer", "maintainer"),
                              ("maintainer email", "maintainer_email")):
        if has_column(column):
            setattr(self.package_attributes, attribute, pop_text(column))

    # fields which may require additional CKAN requests to obtain ids of the designated objects
    if has_column("license"):
        self.license_name = pop_text("license")
    if has_column("organization"):
        self.organization_name = pop_text("organization")

    # other fields = user custom fields
    if has_column("attribute"):
        package_df.pop("attribute")  # reserved name for table header
    for column in list(package_df.columns):
        # custom fields keep the label as originally written by the user
        original_column = original_columns[renamed_columns.index(column)]
        self.package_attributes.custom_fields[original_column] = _string_from_element(package_df[column])
    self._check_mandatory_attributes()
|
|
380
|
+
|
|
381
|
+
def _to_dict(self, base_dir:str=None, include_id:bool=True) -> Tuple[dict, dict]:
    """
    Function to export builder parameters to an Excel worksheet, using the same fields as the input format

    Optional attributes which are None are exported as empty strings.

    :see: _load_from_df
    :see: to_xls
    :param base_dir: base directory the exported paths are made relative to
    :param include_id: if True, the package id is exported when it is known
    :return: tuple (builder information, package attributes)
    """
    def or_blank(value):
        return "" if value is None else value

    attributes = self.package_attributes
    python_code = self.external_python_code
    info_dict = {
        "Builder format version": BUILDER_VER,
        "Auxiliary functions file": "" if python_code is None else make_path_relative(python_code.python_file, to_base_dir=base_dir),
        "Resources local directory": self._get_resources_base_dir_src(base_dir=base_dir),
        "Download directory": self._get_out_dir_src(base_dir=base_dir),
        "Comment": self.comment,
    }
    package_dict = {"Name": self.package_name, "Title": attributes.title}
    if include_id and attributes.id:
        package_dict["Known Id"] = attributes.id
    package_dict["Description"] = or_blank(attributes.description)
    package_dict["Version"] = or_blank(attributes.version)
    package_dict["Visibility"] = "" if attributes.private is None else CkanVisibility.from_bool_is_private(attributes.private).name
    package_dict["State"] = "" if attributes.state is None else attributes.state.name
    package_dict["Organization"] = or_blank(self.organization_name)
    package_dict["License"] = or_blank(self.license_name)
    package_dict["URL"] = or_blank(attributes.url)
    package_dict["Tags"] = "" if attributes.tags is None else ckan_tags_sep.join(attributes.tags)
    package_dict["Author"] = or_blank(attributes.author)
    package_dict["Author Email"] = or_blank(attributes.author_email)
    package_dict["Maintainer"] = or_blank(attributes.maintainer)
    package_dict["Maintainer Email"] = or_blank(attributes.maintainer_email)
    # user custom fields come last, under their own key
    for key, value in attributes.custom_fields.items():
        package_dict[key] = or_blank(value)
    return info_dict, package_dict
|
|
415
|
+
|
|
416
|
+
def _get_builder_df_help_dict(self) -> Tuple[dict, dict]:
    """
    Build the help texts displayed next to the values of the exported fields.

    An entry is added for the known id only if the package id is set, and one entry
    is added per custom field.

    :return: tuple (help for the builder information, help for the package attributes)
    """
    info_help_dict = dict([
        ("Builder format version", "Version of the file format for the script that processes this file"),
        ("Auxiliary functions file", "Path to a Python file containing auxiliary functions, relative to this Excel workbook folder\n"
                                     "Warning: only execute code if you trust the source !"),
        ("Resources local directory", "Path to the local directory containing the resources to upload or text file defining this directory, relative to this Excel workbook folder"),
        ("Download directory", "Default path to download the resources to, relative to this Excel workbook folder"),
        ("Comment", "Place to add a comment on this file"),
    ])
    package_help_dict = dict([
        ("Name", "Name used in the URL (short name)"),
        ("Title", "Title of the resource"),
        ("Description", "Description can use Markdown formatting"),
        ("Visibility", "Private/Public"),
        ("State", "Active/Draft/Deleted"),
        ("Organization", "Organization title, name or ID (mandatory)"),
        ("License", "License title or ID"),
        ("URL", "A URL for the dataset's source"),
        ("Tags", "Comma-separated list of tags (refer to data format policy)"),
    ])
    attributes = self.package_attributes
    if attributes.id:
        package_help_dict["Known Id"] = "ID of the resource in the CKAN database, last requested"
    custom_help = "Custom key-value pair (refer to data format policy)"
    package_help_dict.update(dict.fromkeys(attributes.custom_fields.keys(), custom_help))
    return info_help_dict, package_help_dict
|
|
440
|
+
|
|
441
|
+
def _load_from_dict(self, info_dict: dict, package_dict: dict, base_dir:str=None) -> None:
    """
    Load the builder attributes from dictionaries by converting them to the
    DataFrame layout expected by _load_from_df().

    :param info_dict: attributes of the "info" section, or None if absent
    :param package_dict: attributes of the "package" section
    :param base_dir: forwarded to _load_from_df()
    :return:
    """
    def _single_row_frame(attributes: dict) -> pd.DataFrame:
        # one row labelled "Value", one column per attribute; the double transposition
        # names the column axis "Attribute"
        frame = pd.DataFrame([attributes], index=["Value"]).transpose()
        frame.index.name = "Attribute"
        return frame.transpose()

    info_df = None if info_dict is None else _single_row_frame(info_dict)
    package_df = _single_row_frame(package_dict)
    self._load_from_df(info_df, package_df, base_dir=base_dir)
|
|
454
|
+
|
|
455
|
+
def _get_builder_df(self, base_dir:str=None, include_id:bool=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Convert the result of method _to_dict() into DataFrames, with the help texts
    of _get_builder_df_help_dict() in a second column.

    :param base_dir: forwarded to _to_dict()
    :param include_id: forwarded to _to_dict()
    :return: tuple (info DataFrame, package DataFrame), indexed by "Attribute" with columns "Value" and "Help"
    """
    def _attribute_frame(values: dict, help_texts: dict) -> pd.DataFrame:
        # one row per attribute, columns "Value" and "Help"
        frame = pd.DataFrame([values, help_texts], index=["Value", "Help"]).transpose()
        frame.index.name = "Attribute"
        return frame

    info_dict, package_dict = self._to_dict(base_dir=base_dir, include_id=include_id)
    info_help_dict, package_help_dict = self._get_builder_df_help_dict()
    package_df = _attribute_frame(package_dict, package_help_dict)
    info_df = _attribute_frame(info_dict, info_help_dict)
    return info_df, package_df
|
|
470
|
+
|
|
471
|
+
def _check_resource_duplicates(self):
    """
    Check that no two resource builders share the same name.

    :raises DuplicateNameError: if find_duplicates() reports at least one duplicated name
    """
    builder_names = []
    for builder in self.resource_builders.values():
        builder_names.append(builder.name)
    duplicates = find_duplicates(builder_names)
    if len(duplicates) > 0:
        raise DuplicateNameError("Resource", duplicates)
|
|
475
|
+
|
|
476
|
+
def _get_resources_dict(self, include_id:bool=True) -> Dict[str, dict]:
    """
    Call the method _to_dict() on all resource builders, after checking for duplicate names.

    :param include_id: forwarded to the _to_dict() method of each resource builder
    :return: dictionary mapping each resource name to its dictionary representation
    """
    self._check_resource_duplicates()
    resources_dict = {}
    for builder in self.resource_builders.values():
        resources_dict[builder.name] = builder._to_dict(include_id=include_id)
    return resources_dict
|
|
480
|
+
|
|
481
|
+
def _get_resources_df(self, include_id:bool=True) -> pd.DataFrame:
    """
    List the resources of the package in a DataFrame, one row per resource,
    from the dictionaries returned by _get_resources_dict().

    :param include_id: forwarded to _get_resources_dict()
    :return: DataFrame listing the resources of the package
    """
    resources_dict = self._get_resources_dict(include_id=include_id)
    records = list(resources_dict.values())
    return pd.DataFrame.from_records(records)
|
|
490
|
+
|
|
491
|
+
def _get_datastores_dict(self) -> Dict[str, dict]:
    """
    Call the method _get_fields_dict() on all resources which are DataStores.

    Only builders which are instances of BuilderDataStoreABC or BuilderMultiDataStore
    and whose field_builders attribute is not None are listed.

    :return: dictionary mapping the resource name to the result of _get_fields_dict()
    """
    datastore_types = (BuilderDataStoreABC, BuilderMultiDataStore)
    fields_by_resource = {}
    for builder in self.resource_builders.values():
        if not isinstance(builder, datastore_types):
            continue
        if builder.field_builders is None:
            continue
        fields_by_resource[builder.name] = builder._get_fields_dict()
    return fields_by_resource
|
|
500
|
+
|
|
501
|
+
def _get_datastores_df(self) -> Dict[str, pd.DataFrame]:
    """
    Call the method _get_fields_df() on all resources which are DataStores.

    Only builders which are instances of BuilderDataStoreABC or BuilderMultiDataStore
    and whose field_builders attribute is not None are listed.

    :return: dictionary mapping the resource name to the DataFrame returned by _get_fields_df()
    """
    datastore_types = (BuilderDataStoreABC, BuilderMultiDataStore)
    frames_by_resource = {}
    for builder in self.resource_builders.values():
        if not isinstance(builder, datastore_types):
            continue
        if builder.field_builders is None:
            continue
        frames_by_resource[builder.name] = builder._get_fields_df()
    return frames_by_resource
|
|
510
|
+
|
|
511
|
+
def get_all_df(self, base_dir:str=None, include_id:bool=True) -> Dict[str, pd.DataFrame]:
    """
    Return all the DataFrames used to define the object and its components.

    :param base_dir: forwarded to the package and CKAN builder exports
    :param include_id: forwarded to the package and resources exports
    :return: dictionary with the keys "info", "ckan", "package" and "resources",
        followed by one entry per DataStore (keyed by resource name)
    """
    info_df, package_df = self._get_builder_df(base_dir=base_dir, include_id=include_id)
    ckan_df = self.ckan_builder._get_builder_df(base_dir=base_dir)
    resources_df = self._get_resources_df(include_id=include_id)
    df_dict = dict(info=info_df, ckan=ckan_df, package=package_df, resources=resources_df)
    # a DataStore named like one of the keys above would replace that entry
    for name, fields_df in self._get_datastores_df().items():
        df_dict[name] = fields_df
    return df_dict
|
|
524
|
+
|
|
525
|
+
def to_excel(self, path_or_buffer, *, engine:str=None, include_id:bool=True, include_help:bool=True, **kwargs) -> None:
    """
    Call this function to export the builder parameters to an Excel workbook.

    The workbook contains the sheets "ckan", "info", "package" and "resources", one sheet
    per DataStore listing its fields and, optionally, a copy of the "help" sheet of the
    example workbook distributed with the package.

    :param path_or_buffer: destination of the workbook. If it is a string, its directory is used
        as the base directory for the export; otherwise no base directory is given.
    :param engine: engine used by pandas.ExcelWriter to write the workbook
    :param include_id: forwarded to the package and resources exports
    :param include_help: if True, append the "help" sheet of the example workbook
    :param kwargs: additional keyword arguments for pandas.ExcelWriter
    :return:
    """
    if isinstance(path_or_buffer, str):
        base_dir, _ = os.path.split(path_or_buffer)
    else:
        base_dir = None
    info_df, package_df = self._get_builder_df(base_dir=base_dir, include_id=include_id)
    ckan_df = self.ckan_builder._get_builder_df(base_dir=base_dir)
    resources_df = self._get_resources_df(include_id=include_id)
    datastores_df = self._get_datastores_df()
    help_df = None
    if include_help:
        # The writer engine is not passed to the reader: writer-only engines (e.g. xlsxwriter)
        # are rejected by pandas.ExcelFile. The reader engine is auto-detected instead.
        with pd.ExcelFile(example_package_xls) as help_file:
            help_df = pd.read_excel(help_file, sheet_name="help", header=None)
    with pd.ExcelWriter(path_or_buffer, engine=engine, **kwargs) as writer:
        ckan_df.to_excel(writer, sheet_name="ckan", index=True)
        info_df.to_excel(writer, sheet_name="info", index=True)
        package_df.to_excel(writer, sheet_name="package", index=True)
        resources_df.to_excel(writer, sheet_name="resources", index=False)
        for name, df in datastores_df.items():
            df.to_excel(writer, sheet_name=excel_name_of_sheet(name), index=False)
        if help_df is not None:
            help_df.to_excel(writer, sheet_name="help", index=False, header=False)
|
|
554
|
+
|
|
555
|
+
def to_dict(self, base_dir:str=None, include_id:bool=True) -> dict:
    """
    Call this function to export the builder parameters to a dictionary.

    :param base_dir: forwarded to the package and CKAN builder exports
    :param include_id: forwarded to the package and resources exports
    :return: dictionary with the keys "Info", "Package", "CKAN" and "Resources".
        "Resources" is a list of resource dictionaries; the DataStores among them
        carry their field definitions under the key "fields".
    """
    info_dict, package_dict = self._to_dict(base_dir=base_dir, include_id=include_id)
    ckan_dict = self.ckan_builder._to_dict(base_dir=base_dir)
    resources_dict = self._get_resources_dict(include_id=include_id)
    # attach the field definitions to the resource they belong to
    for name, fields_dict in self._get_datastores_dict().items():
        resources_dict[name]["fields"] = list(fields_dict.values())
    return {
        "Info": info_dict,
        "Package": package_dict,
        "CKAN": ckan_dict,
        "Resources": list(resources_dict.values()),
    }
|
|
570
|
+
|
|
571
|
+
def to_json(self, json_file:str, *, include_id:bool=True, reduced_size:bool=None) -> None:
    """
    Export the builder parameters to a JSON file, encoded in UTF-8.

    :param json_file: path of the file to write. Its directory is used as the base directory for the export.
    :param include_id: forwarded to to_dict()
    :param reduced_size: if True, write compact JSON; if False, indent with 4 spaces.
        If None, the attribute default_to_json_reduced_size is used.
    :return:
    """
    if reduced_size is None:
        reduced_size = self.default_to_json_reduced_size
    base_dir, _ = os.path.split(json_file)
    builder_dict = self.to_dict(base_dir=base_dir, include_id=include_id)
    indent = None if reduced_size else 4
    # the context manager closes the file, no explicit close() is needed
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(builder_dict, f, ensure_ascii=False, indent=indent)
|
|
582
|
+
|
|
583
|
+
def to_jsons(self, *, base_dir:str=None, include_id:bool=True, reduced_size:bool=None) -> str:
    """
    Export the builder parameters to a JSON string.

    :param base_dir: forwarded to to_dict()
    :param include_id: forwarded to to_dict()
    :param reduced_size: if True, return compact JSON; if False, indent with 4 spaces.
        If None, the attribute default_to_json_reduced_size is used.
    :return: the JSON document
    """
    compact = self.default_to_json_reduced_size if reduced_size is None else reduced_size
    builder_dict = self.to_dict(base_dir=base_dir, include_id=include_id)
    if not compact:
        return json.dumps(builder_dict, ensure_ascii=False, indent=4)
    return json.dumps(builder_dict, ensure_ascii=False)
|
|
591
|
+
|
|
592
|
+
@staticmethod
def from_ckan(ckan: CkanApiMap, package_info: Union[CkanPackageInfo, str]) -> "BuilderPackageBasic":
    """
    Initialize a BuilderPackageBasic from information requested by the CKAN API.

    :param ckan: CKAN API object used to request the package (if given by name), the license and the resources
    :param package_info: the package to import or the package name
    :return: the builder, with builder_source_file set to "ckan"
    """
    if isinstance(package_info, str):
        # only a name was given: obtain the package information, including DataStore information
        package_info = ckan.get_package_info_or_request(package_info, datastore_info=True)
    builder = BuilderPackageBasic()
    builder.package_attributes = package_info
    organization_info = package_info.organization_info
    if organization_info is None:
        builder.organization_name = None
    else:
        builder.organization_name = organization_info.get_owner_org()
    # the license id is stored first so that get_license_name() can resolve it to the license title
    builder.license_name = package_info.license_id or None
    builder.license_name = builder.get_license_name(ckan)
    for resource in package_info.package_resources.values():
        builder.resource_builders[resource.name] = init_resource_from_ckan(ckan, resource)
    builder.update_package_name_in_resources()
    builder.update_ckan_options_name_in_resources(ckan)
    builder.builder_source_file = "ckan"
    return builder
|
|
615
|
+
|
|
616
|
+
def update_from_ckan(self, ckan:CkanApiMap, *, error_not_found:bool=True) -> None:
    """
    Update IDs from CKAN mapped objects.
    Objects must be mapped first.

    :param ckan: CKAN API object holding the map
    :param error_not_found: forwarded as error_not_mapped to the map lookups. If False and the
        package is not mapped, the builder is left unchanged; a resource which is not mapped
        gets its id reset to None.
    :return:
    """
    package_info = ckan.map.get_package_info(self.package_name, error_not_mapped=error_not_found)
    if package_info is None:
        # the package is not mapped and errors were disabled: there is no id to read
        return
    package_id = package_info.id
    self.package_attributes.id = package_id
    for resource_builder in self.resource_builders.values():
        resource_info = ckan.map.get_resource_info(resource_builder.name, package_id, error_not_mapped=error_not_found)
        resource_builder.id = resource_info.id if resource_info is not None else None
|
|
627
|
+
|
|
628
|
+
def _init_resource_from_df_aux_fun(self, resource_builder: BuilderResourceABC) -> None:
    """
    Connect the auxiliary upload/download functions of a DataStore builder to the
    external Python code of this package builder. Other builders are left untouched.

    :param resource_builder: the resource builder to initialize
    :return:
    """
    if not isinstance(resource_builder, BuilderDataStoreABC):
        return
    resource_builder.df_mapper._connect_aux_functions(
        self.external_python_code,
        aux_upload_fun_name=resource_builder.aux_upload_fun_name,
        aux_download_fun_name=resource_builder.aux_download_fun_name,
    )
|
|
633
|
+
|
|
634
|
+
def to_ckan_package_info(self, *, check_id:bool=True) -> CkanPackageInfo:
    """
    Build the package information to insert in the CKAN map from the builder.
    Requires the IDs of the package and resources to be known.
    This enables to use the stored IDs instead of querying the CKAN API for these IDs.

    Multi-file builders are skipped with a warning.

    :param check_id: if True, raise an error when the package id is unknown; also forwarded
        to the _to_ckan_resource_info() method of the resource builders
    :raises MissingIdError: if the package id is None and check_id is True
    :return: a copy of the package attributes completed with the resources and their name indexes
    """
    package_id = self.package_attributes.id
    package_info: CkanPackageInfo = self.package_attributes.copy()
    if check_id and package_id is None:
        raise MissingIdError("package", self.package_name)
    for builder in self.resource_builders.values():
        if not isinstance(builder, BuilderMultiFile):
            package_info.package_resources[builder.known_id] = builder._to_ckan_resource_info(package_id, check_id=check_id)
            continue
        warn(f"Multi-resource builder is not compatible with updating CKAN resource ids from known ids because more than one id is expected (resource builder {builder.name})")
    # rebuild the indexes: resource name -> id and resource name -> number of occurrences
    id_index = {}
    name_counts = {}
    for info in package_info.package_resources.values():
        id_index[info.name] = info.id
        name_counts[info.name] = name_counts.get(info.name, 0) + 1
    package_info.resources_id_index = id_index
    package_info.resources_id_index_counts = name_counts
    return package_info
|
|
661
|
+
|
|
662
|
+
def update_ckan_map(self, ckan: CkanApiMap) -> CkanPackageInfo:
    """
    Update the CKAN map from the information contained in this builder.
    For this to work, the package and resource ids must be known.
    This is not the case if the package was not initialized.
    Use if the builder was initialized from ckan or use with precaution.

    :param ckan: CKAN API object whose map is updated
    :return: a copy of the package information inserted in the map
    """
    mapped_info = self.to_ckan_package_info(check_id=True)
    ckan.map._update_package_info(mapped_info)
    # return a copy rather than the object stored in the map
    info_copy = mapped_info.copy()
    return info_copy
|
|
675
|
+
|
|
676
|
+
def map_resources(self, ckan: CkanApiMap, *, error_not_found:bool=True, cancel_if_exists:bool=True,
                  datastore_info:bool=True) -> Union[CkanPackageInfo,None]:
    """
    Proxy call to ckan.map_resources for this package, then update the IDs of the builder
    from the map.

    :param ckan: CKAN API object
    :param error_not_found: forwarded to ckan.map_resources and to the map lookups
    :param cancel_if_exists: forwarded to ckan.map_resources as only_missing
    :param datastore_info: forwarded to ckan.map_resources
    :return: the package information found in the map, or None if the package is not mapped
    """
    ckan.map_resources(self.package_name, datastore_info=datastore_info,
                       error_not_found=error_not_found, only_missing=cancel_if_exists)
    package_info = ckan.map.get_package_info(self.package_name, error_not_mapped=error_not_found)
    if package_info is not None:
        self.package_attributes.id = package_info.id
        self.update_from_ckan(ckan, error_not_found=error_not_found)
    return package_info
|
|
693
|
+
|
|
694
|
+
def _load_package_resources_list_df(self, resources_df: pd.DataFrame, base_dir:str=None) -> None:
    """
    Replace the resource builders of the package by those defined in the rows of a DataFrame.

    The column names of resources_df are lowercased and stripped in place.

    :param resources_df: DataFrame listing the resources, one row per resource
    :param base_dir: forwarded to init_resource_from_df()
    :raises DuplicateNameError: if two rows define the same resource name
    :raises ForbiddenNameError: if a resource name (lowercased) is in forbidden_resource_names
    :return:
    """
    for normalize in (str.lower, str.strip):
        resources_df.columns = resources_df.columns.map(normalize)
    self.resource_builders = OrderedDict()
    for _, row in resources_df.iterrows():
        builder = init_resource_from_df(row, base_dir=base_dir)
        self._init_resource_from_df_aux_fun(builder)
        if builder.name in self.resource_builders.keys():
            raise DuplicateNameError("resource_builder", builder.name)
        if builder.name.lower() in forbidden_resource_names:
            raise ForbiddenNameError("resource_builder", builder.name)
        self.resource_builders[builder.name] = builder
    # the package name is propagated to the resources by the caller, after full initialization
|
|
707
|
+
|
|
708
|
+
@staticmethod
def from_excel(path_or_buffer, *, proxies:dict=None, engine:str=None, **kwargs) -> "BuilderPackageBasic":
    """
    Load package definition from an Excel workbook.

    The sheets "package" and "resources" are mandatory, "info" and "ckan" are optional
    (sheet names are matched case-insensitively). The remaining sheets are expected to list
    the fields of a DataStore and are matched with the resources by name; unused sheets
    are reported with a warning.

    :param path_or_buffer: path to the Excel workbook
    :param proxies: forwarded to the CKAN builder when loading the "ckan" sheet
    :param engine: Engine used by pandas.read_excel(). Supported engines: xlrd, openpyxl, odf, pyxlsb, calamine.
        openpyxl makes part of this package's optional requirements
    :param kwargs: additional keyword arguments for pandas.ExcelFile
    :return: the builder
    """
    def _read_attribute_sheet(workbook, sheet_name: str) -> pd.DataFrame:
        # attribute sheets hold one attribute per row, named in the first column (must be unique);
        # the result has one column per attribute
        sheet_df = pd.read_excel(workbook, sheet_name=sheet_name, header=None)
        sheet_df.set_index(0, inplace=True, verify_integrity=True)
        return sheet_df.T

    datastore_types = (BuilderDataStoreABC, BuilderMultiDataStore)
    mdl = BuilderPackageBasic()
    mdl.builder_source_file = path_or_buffer
    # the context manager closes the workbook, no explicit close() is needed
    with pd.ExcelFile(path_or_buffer, engine=engine, **kwargs) as xls:
        sheet_names = set(xls.sheet_names)
        sheet_names_lower_index = {sheet_name.lower().strip(): sheet_name for sheet_name in sheet_names}
        package_df = _read_attribute_sheet(xls, sheet_names_lower_index["package"])
        if "info" in sheet_names_lower_index:
            info_df = _read_attribute_sheet(xls, sheet_names_lower_index["info"])
        else:
            info_df = None
        base_dir = mdl.get_base_dir(None)
        mdl._load_from_df(info_df, package_df, base_dir=base_dir)
        if "ckan" in sheet_names_lower_index:
            ckan_df = _read_attribute_sheet(xls, sheet_names_lower_index["ckan"])
            mdl.ckan_builder._load_from_df(ckan_df, base_dir=base_dir, proxies=proxies)
        resources_df = pd.read_excel(xls, sheet_name=sheet_names_lower_index["resources"])
        mdl._load_package_resources_list_df(resources_df, base_dir=base_dir)
        # sheets with a reserved name cannot define the fields of a resource
        resource_sheets = sheet_names - {sheet_names_lower_index[name] for name in forbidden_resource_names
                                         if name in sheet_names_lower_index}
        for resource_builder in mdl.resource_builders.values():
            resource_sheet = None
            equiv_name = excel_name_of_builder(resource_builder)
            if resource_builder.name in resource_sheets:
                resource_sheet = resource_builder.name
            elif equiv_name in resource_sheets:
                resource_sheet = equiv_name
            if resource_sheet is not None:
                fields_df = pd.read_excel(xls, sheet_name=resource_sheet)
                assert isinstance(resource_builder, datastore_types)
                resource_builder._load_fields_df(fields_df)
                resource_sheets.remove(resource_sheet)
        mdl.update_package_name_in_resources()
        if len(resource_sheets) > 0:
            msg = f"Sheets present but not used: {', '.join(resource_sheets)}"
            warn(msg)
    return mdl
|
|
760
|
+
|
|
761
|
+
@staticmethod
def from_dict(d:dict, base_dir:str=None, *, proxies:dict=None) -> "BuilderPackageBasic":
    """
    Load package definition from a dictionary.
    In this case, the base directory used to specify the resources locations must be given manually.
    This is usually the directory of the file where the dictionary comes from.

    The keys "package" and "resources" are mandatory, "info" and "ckan" are optional
    (keys are matched case-insensitively). The fields of a DataStore are read from the key
    "fields" of its resource or, failing that, from a top-level key named after the resource.

    :param d: dictionary defining the package
    :param base_dir: base directory used to locate the resources
    :param proxies: forwarded to the CKAN builder when loading the "ckan" section
    :raises DuplicateNameError: if two resources share the same name
    :return: the builder
    """
    datastore_types = (BuilderDataStoreABC, BuilderMultiDataStore)
    mdl = BuilderPackageBasic()
    mdl.builder_source_file = None
    sheet_names = set(d.keys())
    sheet_names_lower_index = {sheet_name.lower().strip(): sheet_name for sheet_name in sheet_names}
    info_dict = d[sheet_names_lower_index["info"]] if "info" in sheet_names_lower_index else None
    mdl._load_from_dict(info_dict, d[sheet_names_lower_index["package"]], base_dir=base_dir)
    if "ckan" in sheet_names_lower_index:
        ckan_dict = d[sheet_names_lower_index["ckan"]]
        mdl.ckan_builder._load_from_dict(ckan_dict, base_dir=base_dir, proxies=proxies)
    # Keep every resource row: collapsing the rows into a name-keyed dictionary first would
    # silently drop duplicated names instead of letting the loader raise DuplicateNameError.
    resource_rows = []
    resources_dict = dict()
    for resource_dict in d[sheet_names_lower_index["resources"]]:
        resource_dict_alt = {k.lower().strip(): v for k, v in resource_dict.items()}
        resource_rows.append(resource_dict_alt)
        resources_dict[resource_dict_alt["name"]] = resource_dict_alt
    resources_df = pd.DataFrame(resource_rows)
    mdl._load_package_resources_list_df(resources_df, base_dir=base_dir)
    # top-level keys with a reserved name cannot define the fields of a resource
    resource_sheets = sheet_names - {sheet_names_lower_index[name] for name in forbidden_resource_names
                                     if name in sheet_names_lower_index}
    for resource_builder in mdl.resource_builders.values():
        if "fields" in resources_dict[resource_builder.name]:
            assert isinstance(resource_builder, datastore_types)
            fields_df = pd.DataFrame(resources_dict[resource_builder.name]["fields"])
            resource_builder._load_fields_df(fields_df)
        else:
            resource_sheet = None
            equiv_name = excel_name_of_builder(resource_builder)
            if resource_builder.name in resource_sheets:
                resource_sheet = resource_builder.name
            elif equiv_name in resource_sheets:
                resource_sheet = equiv_name
            if resource_sheet is not None:
                assert isinstance(resource_builder, datastore_types)
                fields_df = pd.DataFrame(list(d[resource_sheet].values()))
                resource_builder._load_fields_df(fields_df)
                resource_sheets.remove(resource_sheet)
    mdl.update_package_name_in_resources()
    if len(resource_sheets) > 0:
        msg = f"Sheets present but not used: {', '.join(resource_sheets)}"
        warn(msg)
    return mdl
|
|
811
|
+
|
|
812
|
+
@staticmethod
def from_json(json_file, *, proxies:dict=None) -> "BuilderPackageBasic":
    """
    Load package definition from a JSON file.

    The file is read as UTF-8, the encoding used by to_json(). The directory of the file
    is used as the base directory to locate the resources.

    :param json_file: path to the JSON file
    :param proxies: forwarded to from_dict()
    :return: the builder, with builder_source_file set to json_file
    """
    base_dir, _ = os.path.split(json_file)
    # explicit encoding: the platform default is not UTF-8 everywhere, and to_json() writes
    # non-ASCII characters unescaped
    with open(json_file, "r", encoding="utf-8") as f:
        builder_dict = json.load(f)
    mdl = BuilderPackageBasic.from_dict(builder_dict, base_dir=base_dir, proxies=proxies)
    mdl.builder_source_file = json_file
    return mdl
|
|
821
|
+
|
|
822
|
+
@staticmethod
def from_jsons(stream:str, *, source_file:str=None, proxies:dict=None) -> "BuilderPackageBasic":
    """
    Load package definition from a JSON string.

    :param stream: the JSON document
    :param source_file: path of the file the document comes from, if any. Its directory is used
        as the base directory to locate the resources.
    :param proxies: forwarded to from_dict()
    :return: the builder, with builder_source_file set to source_file
    """
    if source_file is None:
        base_dir = None
    else:
        base_dir = os.path.split(source_file)[0]
    parsed = json.loads(stream)
    builder = BuilderPackageBasic.from_dict(parsed, base_dir=base_dir, proxies=proxies)
    builder.builder_source_file = source_file
    return builder
|
|
829
|
+
|
|
830
|
+
def get_owner_org(self, ckan: CkanApiMap) -> str:
    """
    Return the owner organization for the package.
    The owner organization can be specified by its name, title or id.

    :param ckan: CKAN API object, used to list the organizations and look one up in the map
    :return: the result of get_owner_org() on the mapped organization, or None if no organization is specified
    """
    if self.organization_name is None:
        return None
    # make sure the organizations are listed before looking one up in the map
    ckan.organization_list_all(cancel_if_present=True)
    organization_info = ckan.map.get_organization_info(self.organization_name, error_not_mapped=True)
    return organization_info.get_owner_org()
|
|
846
|
+
|
|
847
|
+
def get_license_id(self, ckan: CkanApiMap) -> str:
    """
    Return the license id for the package.
    The license can be specified by its title or id.

    :param ckan: CKAN API object, used to list the licenses and look one up in the map
    :return: the license id found in the map, or None if no license is specified
    """
    if self.license_name is None:
        return None
    # make sure the licenses are listed before looking one up in the map
    ckan.license_list(cancel_if_present=True)
    return ckan.map.get_license_id(self.license_name, error_not_mapped=True)
|
|
861
|
+
|
|
862
|
+
def get_license_info(self, ckan: CkanApiMap) -> CkanLicenseInfo:
    """
    Return the license information for the package.

    :param ckan: CKAN API object
    :return: the license information found in the map, or None if get_license_id() returns None
    """
    license_id = self.get_license_id(ckan)
    if license_id is None:
        return None
    return ckan.map.get_license_info(license_id)
|
|
866
|
+
|
|
867
|
+
def get_license_name(self, ckan: CkanApiMap) -> str:
    """
    Return the title of the license of the package.

    :param ckan: CKAN API object
    :return: the title of the license information, or None if get_license_info() returns None
    """
    license_info = self.get_license_info(ckan)
    if license_info is None:
        return None
    return license_info.title
|
|
870
|
+
|
|
871
|
+
def patch_request_package(self, ckan:CkanApi) -> CkanPackageInfo:
    """
    Perform the requests to initiate/reupload the package on the CKAN server.
    This function does not upload the package resources.
    NB: the organization must be provided, especially if the package is private

    :param ckan: CKAN API object
    :return: the result of ckan.package_create()
    """
    attributes = self.package_attributes
    owner_org = self.get_owner_org(ckan)
    license_id = self.get_license_id(ckan)
    return ckan.package_create(
        self.package_name,
        private=attributes.private,
        state=attributes.state,
        title=attributes.title,
        notes=attributes.description,
        owner_org=owner_org,
        tags=attributes.tags,
        custom_fields=attributes.custom_fields,
        url=attributes.url,
        version=attributes.version,
        author=attributes.author,
        author_email=attributes.author_email,
        maintainer=attributes.maintainer,
        maintainer_email=attributes.maintainer_email,
        license_id=license_id,
        cancel_if_exists=True,
        update_if_exists=True,
    )
|
|
890
|
+
|
|
891
|
+
def patch_request_full(self, ckan:CkanApi, *,
                       reupload:bool=False, resources_base_dir:str=None,
                       create_default_view:bool=True) \
        -> Tuple[CkanPackageInfo, Dict[str, CkanResourceInfo]]:
    """
    Perform necessary requests to initiate/reupload the package and resources on the CKAN server.
    For folder resources, this only uploads the first file of the resource.

    :param ckan: CKAN API object
    :param reupload: forwarded to the patch_request() method of the resource builders
    :param resources_base_dir: base directory of the resources, resolved with get_resources_base_dir()
    :param create_default_view: if not None, value assigned to the attribute create_default_view
        of every resource builder before its request
    :return: tuple (package information, dictionary resource builder name -> resource information)
    """
    if ckan.params.policy_check_pre:
        self.local_policy_check()
    resources_base_dir = self.get_resources_base_dir(resources_base_dir)
    # all the needed files must be present before any request is sent
    self.upload_file_checks(resources_base_dir=resources_base_dir, ckan=ckan, verbose=True, raise_error=True)
    pkg_info = self.patch_request_package(ckan)
    ckan.map_resources(self.package_name, datastore_info=True)
    package_id = pkg_info.id
    self.package_attributes.id = package_id
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    resource_info_dict: Dict[str, CkanResourceInfo] = {}
    override_default_view = create_default_view is not None
    for builder in self.resource_builders.values():
        if override_default_view:
            builder.create_default_view = create_default_view
        resource_info = builder.patch_request(ckan, package_id, reupload=reupload, resources_base_dir=resources_base_dir)
        resource_info_dict[builder.name] = resource_info
        if resource_info is None:
            # only multi-file builders are expected to return no resource information
            assert(isinstance(builder, BuilderMultiFile))
        else:
            pkg_info.update_resource(resource_info)
    self.package_resource_reorder(ckan)
    if ckan.params.policy_check_post:
        self.remote_policy_check(ckan)
    return pkg_info, resource_info_dict
|
|
927
|
+
|
|
928
|
+
def _get_mono_resource_used_files(self, resources_base_dir:str):
    """
    List files used by mono-resource builders.

    Builders deriving from BuilderDataStoreMultiABC contribute their local files list, except
    harvesters. Multi-file builders contribute nothing. Any other builder contributes its
    sample file, when defined and not a URL.

    :param resources_base_dir: base directory of the resources
    :return: set of the files used
    """
    used_files = set()
    for builder in self.resource_builders.values():
        if isinstance(builder, BuilderDataStoreMultiABC):
            if isinstance(builder, BuilderDataStoreHarvester):
                continue
            local_files = builder.init_local_files_list(resources_base_dir=resources_base_dir)
            used_files.update(set(local_files))
            continue
        if isinstance(builder, BuilderMultiFile):
            continue
        if builder.get_sample_file_path(resources_base_dir) is None:
            continue
        if builder.sample_file_path_is_url():
            continue
        used_files.add(builder.get_sample_file_path(resources_base_dir))
    return used_files
|
|
945
|
+
|
|
946
|
+
def upload_file_checks(self, resource_name:Union[str, List[str]]=None, *, resources_base_dir:str=None,
                       messages:Dict[str, ContextErrorLevelMessage]=None,
                       verbose:bool=True, raise_error:bool=False, ckan:CkanApi=None, **kwargs) -> bool:
    """
    Check the presence of all needed files before uploading or patching resources.

    :param resource_name: name or list of names of the resource builders to check (all if None)
    :param resources_base_dir: base directory of the resources, resolved with get_resources_base_dir()
    :param messages: dictionary receiving the message of each checked resource, by resource name.
        Entries already present take part in the result.
    :param verbose: if True, print the messages
    :param raise_error: if True, raise an error when at least one message has the Error level
    :param ckan: Optional CkanApi object used to parameterize the requests to test the presence of resources defined by an url.
    :param kwargs: keyword arguments to specify connexion parameters for querying the urls.
    :raises FileNotFoundError: if raise_error is True and a message has the Error level
    :return: True if no message has the Error level
    """
    if resource_name is None:
        resource_name = list(self.resource_builders.keys())
    elif isinstance(resource_name, str):
        resource_name = [resource_name]
    if messages is None:
        messages = {}
    self.update_package_name_in_resources()
    resources_base_dir = self.get_resources_base_dir(resources_base_dir)
    mono_resource_used_files = self._get_mono_resource_used_files(resources_base_dir)
    for builder_name in resource_name:
        builder = self.resource_builders[builder_name]
        if not isinstance(builder, BuilderMultiFile):
            messages[builder_name] = builder.upload_file_checks(resources_base_dir=resources_base_dir, ckan=ckan, **kwargs)
            continue
        # multi-file builders can exclude the files already used by the other builders
        messages[builder_name] = builder.upload_file_checks(
            resources_base_dir=resources_base_dir, ckan=ckan,
            excluded_files=mono_resource_used_files if multi_file_exclude_other_files else None, **kwargs)
    reported = {key: message for key, message in messages.items() if message is not None}
    failures = {key: message for key, message in reported.items() if message.error_level == ErrorLevel.Error}
    success = len(failures) == 0
    if verbose and len(reported) > 0:
        print("\n".join([f"for resource {key}: {message}" for key, message in reported.items()]))
    if raise_error and not success:
        raise FileNotFoundError("\n".join([f"for resource {key}: {message}" for key, message in failures.items()]))
    return success
|
|
980
|
+
|
|
981
|
+
def upload_large_datasets(self, ckan:CkanApi, *, resources_base_dir:str=None, threads:int=1,
                          progress_callback:Callable=None, only_missing:bool=False) -> None:
    """
    Upload the large datasets of the package.
    The small datasets are to be uploaded with the patch_request_full method.

    The builders deriving from BuilderDataStoreMultiABC are uploaded first, then the
    multi-file builders.

    :param ckan: CKAN API object
    :param resources_base_dir: base directory of the resources, resolved with get_resources_base_dir()
    :param threads: forwarded to the upload_request_full() method of the resource builders
    :param progress_callback: if not None, assigned to the attribute progress_callback of the
        resource builders before their upload
    :param only_missing: upsert only missing rows for DataStores and only missing files for MultiFile
    :return:
    """
    self.info_request_package(ckan=ckan)
    resources_base_dir = self.get_resources_base_dir(resources_base_dir)
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    resource_names = []
    for key, builder in self.resource_builders.items():
        if isinstance(builder, BuilderDataStoreMultiABC):
            resource_names.append(key)
    self.upload_file_checks(resource_names, resources_base_dir=resources_base_dir, ckan=ckan, verbose=True, raise_error=True)
    mono_resource_used_files = self._get_mono_resource_used_files(resources_base_dir)
    for builder in self.resource_builders.values():
        if not isinstance(builder, BuilderDataStoreMultiABC):
            continue
        if progress_callback is not None:
            builder.progress_callback = progress_callback
        builder.upload_request_full(ckan=ckan, resources_base_dir=resources_base_dir, threads=threads,
                                    only_missing=only_missing)
    for builder in self.resource_builders.values():
        if not isinstance(builder, BuilderMultiFile):
            continue
        if progress_callback is not None:
            builder.progress_callback = progress_callback
        # multi-file builders can exclude the files already used by the other builders
        builder.upload_request_full(ckan=ckan, resources_base_dir=resources_base_dir, threads=threads,
                                    only_missing=only_missing,
                                    excluded_files=mono_resource_used_files if multi_file_exclude_other_files else None)
    self.package_resource_reorder(ckan)
|
|
1015
|
+
|
|
1016
|
+
def download_resource_df(self, ckan:CkanApi, resource_name:str, search_all:bool=False, **kwargs) -> pd.DataFrame:
    """
    Proxy for the download_sample_df method of a single DataStore resource builder.

    The package name and CKAN options are first propagated to the resource builders.

    :param ckan: CKAN API object, forwarded to the resource builder
    :param resource_name: key of the resource builder in self.resource_builders
    :param search_all: forwarded unchanged to download_sample_df
    :param kwargs: additional keyword arguments forwarded unchanged to download_sample_df
    :return: the value returned by the resource builder's download_sample_df
    """
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    datastore_builder = self.resource_builders[resource_name]
    # only DataStore builders expose a DataFrame download
    assert isinstance(datastore_builder, BuilderDataStoreABC)
    return datastore_builder.download_sample_df(ckan=ckan, search_all=search_all, **kwargs)
|
|
1024
|
+
|
|
1025
|
+
def download_resource(self, ckan:CkanApi, resource_name:str, full_download:bool=False, **kwargs) -> bytes:
    """
    Proxy for the download_sample method of a single resource builder.

    The package name and CKAN options are first propagated to the resource builders.

    :param ckan: CKAN API object, forwarded to the resource builder
    :param resource_name: key of the resource builder in self.resource_builders
    :param full_download: forwarded unchanged to download_sample
    :param kwargs: additional keyword arguments forwarded unchanged to download_sample
    :return: the value returned by the resource builder's download_sample
    """
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    selected_builder = self.resource_builders[resource_name]
    return selected_builder.download_sample(ckan=ckan, full_download=full_download, **kwargs)
|
|
1032
|
+
|
|
1033
|
+
def get_or_query_resource_id(self, ckan:CkanApi, resource_name:str, error_not_found:bool=True) -> str:
    """
    Proxy for the get_or_query_resource_id method of a single resource builder.

    The package name and CKAN options are first propagated to the resource builders.

    :param ckan: CKAN API object, forwarded to the resource builder
    :param resource_name: key of the resource builder in self.resource_builders
    :param error_not_found: forwarded unchanged to the resource builder
    :return: the value returned by the resource builder's get_or_query_resource_id
    """
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    selected_builder = self.resource_builders[resource_name]
    return selected_builder.get_or_query_resource_id(ckan, error_not_found=error_not_found)
|
|
1037
|
+
|
|
1038
|
+
def _get_mono_resource_names(self):
|
|
1039
|
+
"""
|
|
1040
|
+
List resource names of mono-resource builders.
|
|
1041
|
+
|
|
1042
|
+
:return:
|
|
1043
|
+
"""
|
|
1044
|
+
return {resource_name for resource_name, resource_builder in self.resource_builders.items() if not isinstance(resource_builder, BuilderMultiFile)}
|
|
1045
|
+
|
|
1046
|
+
def download_request_full(self, ckan:CkanApi, out_dir:str=None, enforce_none_out_dir:bool=False, resource_name:str=None, full_download:bool=False,
                          threads:int=1, skip_existing:bool=True, progress_callback:Callable=None,
                          force:bool=False, rm_dir:bool=False) -> None:
    """
    Downloads the full package resources into out_dir.

    The resources are processed in three successive passes: first the builders which are neither
    BuilderDataStoreMultiABC nor BuilderMultiFile, then the BuilderDataStoreMultiABC builders and
    finally the BuilderMultiFile builders.

    :param ckan:
    :param out_dir: download directory
    :param enforce_none_out_dir: if no out_dir is provided, True: files will not be saved after download, False: default output dir will be used, if defined
    :param resource_name: if provided, only this resource is downloaded; otherwise all resources are
    :param full_download: option to fully download the resources. If False, only a partial download is made.
    :param threads: forwarded to the download_request method of each resource builder
    :param skip_existing: skip download of existing resources
    :param progress_callback: if not None, assigned to the progress_callback attribute of the multi-resource builders
    :param force: option to bypass the enable_download attribute of resources
    :param rm_dir: remove directory if exists before downloading
    :return:
    """
    out_dir = self.get_default_out_dir(out_dir, enforce_none=enforce_none_out_dir)
    out_dir_exists = out_dir is not None and os.path.isdir(out_dir)
    if out_dir_exists and rm_dir:
        shutil.rmtree(out_dir)
    self.info_request_package(ckan=ckan)
    if resource_name is not None:
        selected_builders = {resource_name: self.resource_builders[resource_name]}
    else:
        selected_builders = self.resource_builders
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    mono_resource_names = self._get_mono_resource_names()
    # arguments shared by every download_request call below
    shared_args = dict(out_dir=out_dir, full_download=full_download, threads=threads, force=force)

    # pass 1: apply the skip_existing option to all builders and download the mono-resource builders
    for builder in selected_builders.values():
        if skip_existing is not None:
            builder.download_skip_existing = skip_existing
        if isinstance(builder, (BuilderDataStoreMultiABC, BuilderMultiFile)):
            continue
        builder.download_request(ckan, **shared_args)

    # pass 2: DataStores defined by multiple files
    for builder in selected_builders.values():
        if not isinstance(builder, BuilderDataStoreMultiABC):
            continue
        if progress_callback is not None:
            builder.progress_callback = progress_callback
        builder.download_request(ckan, **shared_args)

    # pass 3: multi-file builders, excluding the resources owned by mono-resource builders
    for builder in selected_builders.values():
        if not isinstance(builder, BuilderMultiFile):
            continue
        if progress_callback is not None:
            builder.progress_callback = progress_callback
        builder.download_request(ckan, excluded_resource_names=mono_resource_names, **shared_args)
|
|
1094
|
+
|
|
1095
|
+
def download_sample_df(self, ckan:CkanApi, resource_name:str=None, *, search_all:bool=False, **kwargs) -> Dict[str, pd.DataFrame]:
    """
    Download a sample DataFrame for the DataStore type resources.

    Builders which are not instances of BuilderDataStoreABC are ignored.

    :param ckan:
    :param resource_name: if provided, only this resource is considered; otherwise all resources are
    :param search_all: forwarded unchanged to the download_sample_df method of each builder
    :param kwargs: additional keyword arguments forwarded unchanged to each builder
    :return: dictionary indexed by the name attribute of each DataStore builder
    """
    self.info_request_package(ckan=ckan)
    if resource_name is not None:
        selected_builders = {resource_name: self.resource_builders[resource_name]}
    else:
        selected_builders = self.resource_builders
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    samples = {}
    for builder in selected_builders.values():
        if not isinstance(builder, BuilderDataStoreABC):
            continue
        sample = builder.download_sample_df(ckan, search_all=search_all, **kwargs)
        samples[builder.name] = sample
    return samples
|
|
1115
|
+
|
|
1116
|
+
def download_sample(self, ckan:CkanApi, resource_name:str=None, *, datastores_as_df:bool=True, search_all:bool=False, **kwargs) -> Dict[str, Union[bytes, pd.DataFrame]]:
    """
    Download samples from all resources.

    :param ckan:
    :param resource_name: if provided, only this resource is considered; otherwise all resources are
    :param datastores_as_df: if True, BuilderDataStoreABC builders are queried with download_sample_df;
        every other case uses download_sample
    :param search_all: forwarded unchanged to the method called on each builder
    :param kwargs: additional keyword arguments forwarded unchanged to each builder
    :return: dictionary indexed by the name attribute of each builder
    """
    self.info_request_package(ckan=ckan)
    if resource_name is not None:
        selected_builders = {resource_name: self.resource_builders[resource_name]}
    else:
        selected_builders = self.resource_builders
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    samples = {}
    for builder in selected_builders.values():
        as_dataframe = isinstance(builder, BuilderDataStoreABC) and datastores_as_df
        fetch = builder.download_sample_df if as_dataframe else builder.download_sample
        samples[builder.name] = fetch(ckan, search_all=search_all, **kwargs)
    return samples
|
|
1138
|
+
|
|
1139
|
+
def info_request_package(self, ckan:CkanApi) -> CkanPackageInfo:
    """
    Obtain the package information through ckan.get_package_info_or_request and
    store the returned id in self.package_attributes.id.

    :param ckan:
    :return: the package information object returned by the CKAN API object
    """
    package_info = ckan.get_package_info_or_request(package_name=self.package_name)
    # keep the id known by the builder in sync with the information obtained above
    self.package_attributes.id = package_info.id
    return package_info
|
|
1143
|
+
|
|
1144
|
+
def info_request_full(self, ckan:CkanApi) -> Tuple[CkanPackageInfo, List[CkanResourceInfo]]:
    """
    Obtain the package information and the information of each resource builder.

    :param ckan:
    :return: tuple (package information, list of the values returned by resource_info_request
        for each resource builder, in the order of self.resource_builders)
    """
    package_info = self.info_request_package(ckan)
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    resources_info = []
    for builder in self.resource_builders.values():
        resources_info.append(builder.resource_info_request(ckan))
    return package_info, resources_info
|
|
1150
|
+
|
|
1151
|
+
def get_base_dir(self, base_dir:str=None) -> str:
    """
    Resolve the base directory.

    An explicit base_dir is returned unchanged. Otherwise, the directory part of
    self.builder_source_file is used (location of the Excel workbook). If no source file
    is known (e.g. initialization from a dictionary), the current working directory is used.

    :param base_dir: explicit base directory, or None to use the default
    :return: the resolved base directory
    """
    if base_dir is not None:
        return base_dir
    if self.builder_source_file is None:
        return os.path.abspath(".")
    return os.path.dirname(self.builder_source_file)
|
|
1164
|
+
|
|
1165
|
+
def get_resources_base_dir(self, resources_base_dir:str) -> str:
    """
    Resolve the base directory for the resource files.

    It is distinct from the base_dir and can be defined relative to the base_dir in the Excel workbook (see comment at the top of the class).

    :param resources_base_dir: explicit directory, or None to use self._resources_base_dir
    :return: the explicit directory if provided, self._resources_base_dir otherwise
    """
    return self._resources_base_dir if resources_base_dir is None else resources_base_dir
|
|
1176
|
+
|
|
1177
|
+
def get_default_out_dir(self, out_dir:str, enforce_none:bool=False) -> str:
    """
    Resolve the download directory.

    :param out_dir: explicit download directory, or None
    :param enforce_none: if True, a None out_dir is returned as-is instead of being replaced
        by self._default_out_dir
    :return: out_dir if provided (or if enforce_none is set), self._default_out_dir otherwise
    """
    if out_dir is not None or enforce_none:
        return out_dir
    return self._default_out_dir
|
|
1187
|
+
|
|
1188
|
+
def init_ckan(self, ckan:CkanApi=None, *, base_dir:str=None, set_owner_org:bool=False,
              default_proxies:dict=None, proxies:Union[str,dict,ProxyConfig]=None) -> CkanApi:
    """
    Initialize the CKAN instance from the parameters defined in the "ckan" tab of the Excel workbook.

    The initialization itself is delegated to self.ckan_builder.init_ckan.

    :param ckan: forwarded to self.ckan_builder.init_ckan
    :param base_dir: base directory, resolved with get_base_dir if None
    :param default_proxies: forwarded to self.ckan_builder.init_ckan
    :param proxies: forwarded to self.ckan_builder.init_ckan
    :param set_owner_org: Option to set the owner_org of the CKAN instance.
        This can be problematic because it requires some requests as the proxies are not set.
        It can be omitted because it has no influence on the patch_request_package function.
    :return: the object returned by self.ckan_builder.init_ckan
    """
    # base_dir is necessary to find the API key file, if provided
    resolved_base_dir = self.get_base_dir(base_dir)
    ckan = self.ckan_builder.init_ckan(resolved_base_dir, ckan=ckan,
                                       default_proxies=default_proxies, proxies=proxies)
    owner_org_requested = set_owner_org and self.organization_name is not None
    if owner_org_requested:
        ckan.owner_org = self.get_owner_org(ckan)
    return ckan
|
|
1207
|
+
|
|
1208
|
+
def get_or_query_package_id(self, ckan: CkanApi) -> str:
    """
    Obtain the package id through ckan.get_package_info_or_request and
    store it in self.package_attributes.id.

    :param ckan:
    :return: the id attribute of the package information
    """
    info = ckan.get_package_info_or_request(self.package_name)
    self.package_attributes.id = info.id
    return info.id
|
|
1212
|
+
|
|
1213
|
+
def list_resource_ids(self, ckan: CkanApi) -> List[str]:
    """
    List resource ids on CKAN server, following the order of the package builder.

    Mono-resource builders contribute the id returned by their get_or_query_resource_id method.
    BuilderMultiFile builders contribute the ids returned by list_remote_resource_ids, called with
    the names of the mono-resource builders as excluded_resource_names.
    Duplicate ids are removed: only the first occurrence of each id is kept.

    :param ckan:
    :return: list of unique resource ids, in order of first appearance
    """
    self.update_package_name_in_resources()
    self.update_ckan_options_name_in_resources(ckan)
    # reuse the shared helper rather than duplicating its set comprehension
    mono_resource_names = self._get_mono_resource_names()
    resource_ids = []
    for resource_builder in self.resource_builders.values():
        if isinstance(resource_builder, BuilderMultiFile):
            resource_ids.extend(resource_builder.list_remote_resource_ids(
                ckan, excluded_resource_names=mono_resource_names, cancel_if_present=False))
        else:
            resource_ids.append(resource_builder.get_or_query_resource_id(ckan))
    # dict keys keep insertion order: order-preserving de-duplication without
    # converting the ids to a NumPy array and back
    return list(dict.fromkeys(resource_ids))
|
|
1237
|
+
|
|
1238
|
+
def package_resource_reorder(self, ckan: CkanApi) -> None:
    """
    Apply the order of the resources defined in the Excel workbook.

    The package id and the ordered resource ids are obtained from get_or_query_package_id and
    list_resource_ids, then passed to ckan._api_package_resource_reorder.

    :param ckan:
    :return:
    """
    # OrderedDict ensures the order of resources is preserved
    # keyword arguments are evaluated left to right: package id first, then resource ids
    ckan._api_package_resource_reorder(package_id=self.get_or_query_package_id(ckan=ckan),
                                       resource_ids=self.list_resource_ids(ckan=ckan))
|
|
1249
|
+
|
|
1250
|
+
def remote_policy_check(self, ckan: CkanApi, policy:CkanPackageDataFormatPolicy=None,
                        *, buffer:Dict[str, List[DataPolicyError]]=None, raise_error:bool=False,
                        verbose:bool=None) -> bool:
    """
    Check the package defined by this builder against a data format policy, based on the information from the API.

    The check is delegated to ckan.policy_check, called for self.package_name.

    :param ckan:
    :param policy: policy to check against; if None, self.ckan_builder.policy is used
    :param buffer: forwarded unchanged to ckan.policy_check
    :param raise_error: forwarded unchanged to ckan.policy_check
    :param verbose: forwarded unchanged to ckan.policy_check
    :return: the value returned by ckan.policy_check
    """
    effective_policy = self.ckan_builder.policy if policy is None else policy
    return ckan.policy_check(package_list=self.package_name, policy=effective_policy,
                             buffer=buffer, verbose=verbose, raise_error=raise_error)
|
|
1267
|
+
|
|
1268
|
+
def local_policy_check(self, policy:CkanPackageDataFormatPolicy=None,
                       *, buffer:Dict[str, List[DataPolicyError]]=None, raise_error:bool=False,
                       verbose:bool=True) -> bool:
    """
    Check if the package builder respects a data format policy (only on local definition).

    :param policy: policy to check against; if None, self.ckan_builder.policy is used
    :param buffer: if not None, the list of errors collected for this package is stored
        in this dictionary under the package name
    :param raise_error: forwarded unchanged to policy.policy_check_package
    :param verbose: forwarded as display_message to policy.policy_check_package;
        if True, the result is also printed
    :return: True if no policy is available, otherwise the value returned by policy.policy_check_package
    """
    effective_policy = policy if policy is not None else self.ckan_builder.policy
    if effective_policy is None:
        # no policy loaded at all
        return True
    package_info = self.to_ckan_package_info(check_id=False)
    collected_errors: List[DataPolicyError] = []
    success = effective_policy.policy_check_package(package_info, display_message=verbose,
                                                    package_buffer=collected_errors, raise_error=raise_error)
    if buffer is not None:
        buffer[package_info.name] = collected_errors
    if verbose:
        print(f"Data format policy {effective_policy.label} success: {success}")
    return success
|
|
1290
|
+
|
|
1291
|
+
|