ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ckanapi_harvesters/__init__.py +32 -10
- ckanapi_harvesters/auxiliary/__init__.py +26 -0
- ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
- ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
- ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
- ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
- ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
- ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
- ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
- ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
- ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
- ckanapi_harvesters/auxiliary/deprecated.py +82 -0
- ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
- ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
- ckanapi_harvesters/auxiliary/list_records.py +60 -0
- ckanapi_harvesters/auxiliary/login.py +163 -0
- ckanapi_harvesters/auxiliary/path.py +208 -0
- ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
- ckanapi_harvesters/auxiliary/urls.py +40 -0
- ckanapi_harvesters/builder/__init__.py +40 -0
- ckanapi_harvesters/builder/builder_aux.py +20 -0
- ckanapi_harvesters/builder/builder_ckan.py +238 -0
- ckanapi_harvesters/builder/builder_errors.py +36 -0
- ckanapi_harvesters/builder/builder_field.py +122 -0
- ckanapi_harvesters/builder/builder_package.py +9 -0
- ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
- ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
- ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
- ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
- ckanapi_harvesters/builder/builder_resource.py +589 -0
- ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
- ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
- ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
- ckanapi_harvesters/builder/builder_resource_init.py +126 -0
- ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
- ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
- ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
- ckanapi_harvesters/builder/example/__init__.py +21 -0
- ckanapi_harvesters/builder/example/builder_example.py +21 -0
- ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
- ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
- ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
- ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
- ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
- ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
- ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
- ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
- ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
- ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
- ckanapi_harvesters/builder/mapper_datastore.py +93 -0
- ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
- ckanapi_harvesters/builder/specific/__init__.py +11 -0
- ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
- ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
- ckanapi_harvesters/ckan_api/__init__.py +20 -0
- ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
- ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
- ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
- ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
- ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
- ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
- ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
- ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
- ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
- ckanapi_harvesters/harvesters/__init__.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
- ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
- ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
- ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
- ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
- ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
- ckanapi_harvesters/harvesters/harvester_init.py +30 -0
- ckanapi_harvesters/harvesters/harvester_model.py +49 -0
- ckanapi_harvesters/harvesters/harvester_params.py +323 -0
- ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
- ckanapi_harvesters/harvesters/postgre_params.py +86 -0
- ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
- ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
- ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
- ckanapi_harvesters/policies/__init__.py +20 -0
- ckanapi_harvesters/policies/data_format_policy.py +269 -0
- ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
- ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
- ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
- ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
- ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
- ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
- ckanapi_harvesters/reports/__init__.py +11 -0
- ckanapi_harvesters/reports/admin_report.py +292 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
- ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
- ckanapi_harvesters/divider/__init__.py +0 -27
- ckanapi_harvesters/divider/divider.py +0 -53
- ckanapi_harvesters/divider/divider_error.py +0 -59
- ckanapi_harvesters/main.py +0 -30
- ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,561 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Code to upload metadata to the CKAN server to create/update an existing package
|
|
5
|
+
The metadata is defined by the user in an Excel worksheet
|
|
6
|
+
This file implements functions to initiate a DataStore.
|
|
7
|
+
"""
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Dict, List, Tuple, Union, Set, Any
|
|
10
|
+
import os
|
|
11
|
+
import io
|
|
12
|
+
from warnings import warn
|
|
13
|
+
from collections import OrderedDict
|
|
14
|
+
import copy
|
|
15
|
+
|
|
16
|
+
import pandas as pd
|
|
17
|
+
|
|
18
|
+
from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
|
|
19
|
+
from ckanapi_harvesters.builder.builder_field import BuilderField
|
|
20
|
+
from ckanapi_harvesters.harvesters.file_formats.file_format_abc import FileFormatABC
|
|
21
|
+
from ckanapi_harvesters.harvesters.file_formats.file_format_init import init_file_format_datastore
|
|
22
|
+
from ckanapi_harvesters.builder.mapper_datastore import DataSchemeConversion
|
|
23
|
+
from ckanapi_harvesters.builder.builder_resource import BuilderResourceABC
|
|
24
|
+
from ckanapi_harvesters.auxiliary.ckan_errors import DuplicateNameError
|
|
25
|
+
from ckanapi_harvesters.auxiliary.path import resolve_rel_path
|
|
26
|
+
from ckanapi_harvesters.builder.builder_errors import RequiredDataFrameFieldsError, ResourceFileNotExistMessage, IncompletePatchError
|
|
27
|
+
from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo, CkanDataStoreInfo
|
|
28
|
+
from ckanapi_harvesters.ckan_api import CkanApi
|
|
29
|
+
from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element, find_duplicates, datastore_id_col
|
|
30
|
+
from ckanapi_harvesters.auxiliary.ckan_defs import ckan_tags_sep
|
|
31
|
+
from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice
|
|
32
|
+
from ckanapi_harvesters.auxiliary.ckan_model import CkanField
|
|
33
|
+
from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
|
|
34
|
+
|
|
35
|
+
# Number of rows to upload when initiating a DataStore with datapusher, before
# explicitly specifying field data types and indexes; the remaining rows are
# upserted afterwards (see BuilderDataStoreABC.patch_request).
num_rows_patch_first_upload_partial: Union[int,None] = 50 # set to None to upload directly the whole DataFrame before the DataStore creation


# Keyword that users may put in the alias list to request the auto-generated
# default alias (resolved in BuilderDataStoreABC._get_alias_list).
default_alias_keyword:Union[str,None] = "default" # generate default alias if an alias with this value is found in parameters
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class BuilderDataStoreABC(BuilderResourceABC, ABC):
|
|
43
|
+
    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None):
        """Initialize a DataStore resource builder.

        :param name: resource name
        :param format: resource format (drives the local file format handler)
        :param description: resource description
        :param resource_id: known CKAN resource id, if any
        :param download_url: download URL of the resource, if any
        """
        super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
        # Documented fields of the DataStore, keyed by field name (None = undocumented)
        self.field_builders: Union[Dict[str, BuilderField],None] = None
        # Primary-key column names ([] = explicitly none, None = unspecified)
        self.primary_key: Union[List[str],None] = None
        # Indexed column names ([] = explicitly none, None = unspecified)
        self.indexes: Union[List[str],None] = None
        # DataStore aliases; may contain the default_alias_keyword placeholder
        self.aliases: Union[List[str],None] = None
        # Names of user-provided external upload/download helper functions ("" = none)
        self.aux_upload_fun_name:str = ""
        self.aux_download_fun_name:str = ""
        # Functions input/outputs
        self.data_cleaner_upload: Union[CkanDataCleanerABC,None] = None
        self.reupload_on_update = False # do not reupload on update for DataStores
        self.reupload_if_needed: bool = True
        self.reupload_needed: Union[bool,None] = None
        # Converts DataFrames between local and database representations
        self.df_mapper = DataSchemeConversion()
        # Handler for reading/writing the sample file in the resource's format
        self.local_file_format: FileFormatABC = init_file_format_datastore(self.format)
|
|
59
|
+
|
|
60
|
+
def copy(self, *, dest=None):
|
|
61
|
+
super().copy(dest=dest)
|
|
62
|
+
dest.field_builders = copy.deepcopy(self.field_builders)
|
|
63
|
+
dest.primary_key = copy.deepcopy(self.primary_key)
|
|
64
|
+
dest.indexes = copy.deepcopy(self.indexes)
|
|
65
|
+
dest.aliases = copy.deepcopy(self.aliases)
|
|
66
|
+
dest.aux_upload_fun_name = self.aux_upload_fun_name
|
|
67
|
+
dest.aux_download_fun_name = self.aux_download_fun_name
|
|
68
|
+
dest.reupload_on_update = self.reupload_on_update
|
|
69
|
+
dest.reupload_if_needed = self.reupload_if_needed
|
|
70
|
+
dest.reupload_needed = self.reupload_needed
|
|
71
|
+
dest.df_mapper = self.df_mapper.copy()
|
|
72
|
+
dest.local_file_format = self.local_file_format.copy()
|
|
73
|
+
return dest
|
|
74
|
+
|
|
75
|
+
    def _init_file_format(self):
        # Re-derive the local file format handler from the current format string.
        self.local_file_format = init_file_format_datastore(self.format) # default file format is CSV (user can change)
|
|
77
|
+
|
|
78
|
+
def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
|
|
79
|
+
super()._load_from_df_row(row=row)
|
|
80
|
+
primary_keys_string: Union[str,None] = _string_from_element(row["primary key"])
|
|
81
|
+
indexes_string: Union[str,None] = _string_from_element(row["indexes"])
|
|
82
|
+
aliases_string: Union[str,None] = None
|
|
83
|
+
if "upload function" in row.keys():
|
|
84
|
+
self.aux_upload_fun_name: str = _string_from_element(row["upload function"], empty_value="")
|
|
85
|
+
if "download function" in row.keys():
|
|
86
|
+
self.aux_download_fun_name: str = _string_from_element(row["download function"], empty_value="")
|
|
87
|
+
if "aliases" in row.keys():
|
|
88
|
+
aliases_string = _string_from_element(row["aliases"])
|
|
89
|
+
if primary_keys_string is not None:
|
|
90
|
+
if primary_keys_string.lower() == "none":
|
|
91
|
+
self.primary_key = []
|
|
92
|
+
else:
|
|
93
|
+
self.primary_key = [field.strip() for field in primary_keys_string.split(ckan_tags_sep)]
|
|
94
|
+
if indexes_string is not None:
|
|
95
|
+
if indexes_string.lower() == "none":
|
|
96
|
+
self.indexes = []
|
|
97
|
+
else:
|
|
98
|
+
self.indexes = [field.strip() for field in indexes_string.split(ckan_tags_sep)]
|
|
99
|
+
if aliases_string is not None:
|
|
100
|
+
self.aliases = aliases_string.split(ckan_tags_sep)
|
|
101
|
+
self._init_file_format()
|
|
102
|
+
|
|
103
|
+
@abstractmethod
|
|
104
|
+
def _to_dict(self, include_id:bool=True) -> dict:
|
|
105
|
+
d = super()._to_dict(include_id=include_id)
|
|
106
|
+
d["Primary key"] = ckan_tags_sep.join(self.primary_key) if self.primary_key else ""
|
|
107
|
+
d["Indexes"] = ckan_tags_sep.join(self.indexes) if self.indexes is not None else ""
|
|
108
|
+
d["Upload function"] = self.aux_upload_fun_name
|
|
109
|
+
d["Download function"] = self.aux_download_fun_name
|
|
110
|
+
d["Aliases"] = ckan_tags_sep.join(self.aliases) if self.aliases is not None else ""
|
|
111
|
+
return d
|
|
112
|
+
|
|
113
|
+
def init_options_from_ckan(self, ckan:CkanApi) -> None:
|
|
114
|
+
super().init_options_from_ckan(ckan)
|
|
115
|
+
if self.field_builders is not None:
|
|
116
|
+
for field_builder in self.field_builders.values():
|
|
117
|
+
field_builder.internal_attrs.update_from_ckan(ckan)
|
|
118
|
+
|
|
119
|
+
def _check_field_duplicates(self):
|
|
120
|
+
if self.field_builders is not None:
|
|
121
|
+
duplicates = find_duplicates([field_builder.name for field_builder in self.field_builders.values()])
|
|
122
|
+
if len(duplicates) > 0:
|
|
123
|
+
raise DuplicateNameError("Field", duplicates)
|
|
124
|
+
|
|
125
|
+
def _get_fields_dict(self) -> Dict[str, dict]:
|
|
126
|
+
self._check_field_duplicates()
|
|
127
|
+
if self.field_builders is not None:
|
|
128
|
+
fields_dict = OrderedDict([(field_builder.name, field_builder._to_dict()) for field_builder in self.field_builders.values()])
|
|
129
|
+
else:
|
|
130
|
+
fields_dict = None
|
|
131
|
+
return fields_dict
|
|
132
|
+
|
|
133
|
+
def _get_fields_info(self) -> Dict[str, CkanField]:
|
|
134
|
+
self._check_field_duplicates()
|
|
135
|
+
if self.field_builders is not None:
|
|
136
|
+
builder_fields = OrderedDict([(field_builder.name, field_builder._to_ckan_field()) for field_builder in self.field_builders.values()])
|
|
137
|
+
else:
|
|
138
|
+
builder_fields = {}
|
|
139
|
+
return builder_fields
|
|
140
|
+
|
|
141
|
+
def _get_fields_df(self) -> pd.DataFrame:
|
|
142
|
+
fields_dict_list = [value for value in self._get_fields_dict().values()]
|
|
143
|
+
fields_df = pd.DataFrame.from_records(fields_dict_list)
|
|
144
|
+
return fields_df
|
|
145
|
+
|
|
146
|
+
def _load_fields_df(self, fields_df: pd.DataFrame):
|
|
147
|
+
fields_df.columns = fields_df.columns.map(str.lower)
|
|
148
|
+
fields_df.columns = fields_df.columns.map(str.strip)
|
|
149
|
+
self.field_builders = {}
|
|
150
|
+
for index, row in fields_df.iterrows():
|
|
151
|
+
field_builder = BuilderField()
|
|
152
|
+
field_builder._load_from_df_row(row=row)
|
|
153
|
+
self.field_builders[field_builder.name] = field_builder
|
|
154
|
+
|
|
155
|
+
    def _to_ckan_resource_info(self, package_id:str, check_id:bool=True) -> CkanResourceInfo:
        """Build a CkanResourceInfo for this resource, including its DataStore description.

        :param package_id: id of the package owning the resource
        :param check_id: forwarded to the parent implementation
        :return: resource info with datastore_info populated from the field builders
        """
        resource_info = super()._to_ckan_resource_info(package_id=package_id, check_id=check_id)
        resource_info.datastore_info = CkanDataStoreInfo()
        resource_info.datastore_info.resource_id = resource_info.id
        if self.field_builders is not None:
            resource_info.datastore_info.fields_dict = OrderedDict()
            for name, field_builder in self.field_builders.items():
                resource_info.datastore_info.fields_dict[name] = field_builder._to_ckan_field()
        else:
            resource_info.datastore_info.fields_dict = None
        # the id list falls back to [] (not None) when no fields are documented
        resource_info.datastore_info.fields_id_list = [name for name, field_builder in self.field_builders.items()] if self.field_builders is not None else []
        if self.indexes is not None:
            resource_info.datastore_info.index_fields = self.indexes.copy()
        # ckan=None: the default alias is derived from names only, without a server query
        aliases = self._get_alias_list(None)
        if aliases is not None:
            resource_info.datastore_info.aliases = aliases.copy()
        return resource_info
|
|
172
|
+
|
|
173
|
+
    @abstractmethod
    def load_sample_df(self, resources_base_dir:str, *, upload_alter:bool=True) -> pd.DataFrame:
        """
        Function returning the data from the indicated resources as a pandas DataFrame.
        This is the DataFrame equivalent for load_sample_data.

        :param resources_base_dir: base directory to find the resources on the local machine
        :param upload_alter: presumably controls whether the upload (database-format)
            transformation is applied to the result — confirm in subclasses
        :return: the sample data as a DataFrame
        """
        raise NotImplementedError()
|
|
183
|
+
|
|
184
|
+
    @staticmethod
    def sample_file_path_is_url() -> bool:
        # The sample data path is not a URL by default.
        return False
|
|
187
|
+
|
|
188
|
+
    def get_sample_file_path(self, resources_base_dir: str) -> None:
        # No backing file by default; file-backed subclasses override this.
        return None
|
|
190
|
+
|
|
191
|
+
def load_sample_data(self, resources_base_dir:str) -> bytes:
|
|
192
|
+
df = self.load_sample_df(resources_base_dir=resources_base_dir)
|
|
193
|
+
return self.local_file_format.write_in_memory(df, fields=self._get_fields_info())
|
|
194
|
+
|
|
195
|
+
    def upsert_request_df(self, ckan: CkanApi, df_upload:pd.DataFrame,
                          method:UpsertChoice=UpsertChoice.Upsert,
                          apply_last_condition:bool=None, always_last_condition:bool=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Call to ckan datastore_upsert.
        Before sending the DataFrame, a call to df_upload_alter is made.
        This method is overloaded in BuilderDataStoreMultiABC and BuilderDataStoreFolder

        :param ckan: CKAN API client
        :param df_upload: data to upload (transformed to database format before sending)
        :param method: upsert method (insert/upsert/...)
        :param apply_last_condition: forwarded to datastore_upsert
        :param always_last_condition: forwarded to datastore_upsert
        :return: (transformed uploaded DataFrame, DataFrame returned by the server)
        """
        resource_id = self.get_or_query_resource_id(ckan, error_not_found=True)
        df_upload_transformed = self.df_mapper.df_upload_alter(df_upload, fields=self._get_fields_info())
        ret_df = ckan.datastore_upsert(df_upload_transformed, resource_id, method=method,
                                       apply_last_condition=apply_last_condition,
                                       always_last_condition=always_last_condition, data_cleaner=self.data_cleaner_upload)
        return df_upload_transformed, ret_df
|
|
214
|
+
|
|
215
|
+
    def upsert_request_final(self, ckan: CkanApi, *, force:bool=False) -> None:
        """
        Final steps after the last upsert query.
        These steps are automatically done for a DataStore defined by one file.

        NOTE(review): without force=True this method does nothing — confirm the
        no-op default is intended.

        :param ckan: CKAN API client
        :param force: perform request anyways
        :return: None
        """
        if force:
            resource_id = self.get_or_query_resource_id(ckan, error_not_found=True)
            ckan.datastore_upsert_last_line(resource_id=resource_id)
|
|
227
|
+
|
|
228
|
+
    def _get_alias_list(self, ckan:Union[CkanApi,None]):
        """Return the alias list with the default_alias_keyword resolved to the actual default alias.

        :param ckan: when provided, the default alias is queried from the server;
            otherwise it is derived from the resource/package names only.
        :return: the alias list (possibly None)

        NOTE(review): matching entries are replaced inside self.aliases itself
        (no copy is made), so resolving the keyword mutates this builder —
        confirm this caching behaviour is intended.
        """
        aliases = self.aliases
        if default_alias_keyword is not None:
            if ckan is not None:
                default_alias_name = ckan.datastore_default_alias(self.name, self.package_name, error_not_found=False)
            else:
                default_alias_name = CkanApi.datastore_default_alias_of_names(self.name, self.package_name)
            if aliases is not None:
                for i, alias in enumerate(aliases):
                    if alias.lower().strip() == default_alias_keyword:
                        aliases[i] = default_alias_name
        return aliases
|
|
240
|
+
|
|
241
|
+
    def _check_necessary_fields(self, current_fields: Set[str] = None, empty_datastore:bool=False, raise_error: bool = True) -> Set[str]:
        """
        Auxiliary function to list the fields which are required:
        - for df_mapper to determine the file names, associated requests, and recognize the last inserted row of a document.
        - to initialize the DataStore with the columns for the primary key and indexes

        The required fields are compared to current_fields, if provided.

        :param current_fields: fields actually present in the data (no comparison when None)
        :param empty_datastore: when True, nothing is required (returns an empty set)
        :param raise_error: raise on missing fields instead of emitting a warning
        :return: the set of required field names
        :raises RequiredDataFrameFieldsError: when fields are missing and raise_error is True
        """
        if empty_datastore:
            return set()
        required_fields = self.df_mapper.get_necessary_fields()
        if self.primary_key is not None:
            required_fields = required_fields.union(set(self.primary_key))
        if self.indexes is not None:
            required_fields = required_fields.union(set(self.indexes))
        if current_fields is not None:
            missing_fields = required_fields - current_fields
            if len(missing_fields) > 0:
                msg = RequiredDataFrameFieldsError(missing_fields)
                if raise_error:
                    raise msg
                else:
                    warn(str(msg))
        return required_fields
|
|
265
|
+
|
|
266
|
+
def _check_undocumented_fields(self, current_fields: Set[str]) -> None:
|
|
267
|
+
if self.field_builders is not None:
|
|
268
|
+
# list fields which are not documented
|
|
269
|
+
fields_doc = set(self.field_builders.keys())
|
|
270
|
+
missing_doc = current_fields - fields_doc
|
|
271
|
+
extra_doc = fields_doc - current_fields
|
|
272
|
+
if len(extra_doc) > 0:
|
|
273
|
+
msg = f"{len(extra_doc)} extra fields were documented but absent of sample data for table {self.name}: {', '.join(extra_doc)}"
|
|
274
|
+
warn(msg)
|
|
275
|
+
if len(missing_doc) > 0:
|
|
276
|
+
msg = f"{len(missing_doc)} fields are left documented for table {self.name}: {', '.join(missing_doc)}"
|
|
277
|
+
warn(msg)
|
|
278
|
+
else:
|
|
279
|
+
msg = f"No field documentation was provided for table {self.name}. {len(current_fields)} fields are left documented: {', '.join(current_fields)}"
|
|
280
|
+
warn(msg)
|
|
281
|
+
|
|
282
|
+
    def _get_fields_update(self, ckan: CkanApi, current_fields:Union[Set[str],None], data_cleaner_fields:Union[List[dict],None],
                           reupload:bool) -> Dict[str, dict]:
        """Compute the field-description update to send with the DataStore request.

        :param ckan: CKAN API client
        :param current_fields: restrict documented fields to these names (all fields when None)
        :param data_cleaner_fields: field changes suggested by the data cleaner, merged in
        :param reupload: when True, the server-side state is ignored and the
            full field dictionary is rebuilt from scratch
        :return: mapping of field name to field description dict
        """
        if self.field_builders is not None:
            if current_fields is not None:
                builder_fields = [field_builder._to_ckan_field() for field_builder in self.field_builders.values() if field_builder.name in current_fields]
            else:
                # use case: get all known fields (before data_cleaner)
                builder_fields = [field_builder._to_ckan_field() for field_builder in self.field_builders.values()]
        else:
            builder_fields = None
        resource_id = self.get_or_query_resource_id(ckan, error_not_found=False)
        if resource_id is not None and not reupload:
            # resource already exists on the server: build a patch against its field definitions
            update_needed, fields_update = ckan.datastore_field_patch_dict(fields_merge=data_cleaner_fields, fields_update=builder_fields,
                                                                           return_list=False,
                                                                           resource_id=resource_id, error_not_found=False)
        else:
            fields_update = CkanApi.datastore_field_dict(fields_merge=data_cleaner_fields, fields_update=builder_fields, return_list=False)
        return fields_update
|
|
300
|
+
|
|
301
|
+
def _collect_indexes_from_fields(self) -> Set[str]:
|
|
302
|
+
if self.field_builders is not None:
|
|
303
|
+
return {field_builder.name for field_builder in self.field_builders.values() if field_builder.is_index}
|
|
304
|
+
else:
|
|
305
|
+
return set()
|
|
306
|
+
|
|
307
|
+
    def _get_primary_key_indexes(self, data_cleaner_index: Set[str], current_fields:Set[str], error_missing:bool, empty_datastore:bool=False) -> Tuple[Union[List[str],None], Union[List[str],None]]:
        """Compute the primary key and index lists to send to the DataStore.

        Indexes are collected from three sources: self.indexes, documented fields
        flagged is_index, and indexes suggested by the data cleaner; fields that
        belong to the primary key are removed from the index list.

        :param data_cleaner_index: index fields suggested by the data cleaner
        :param current_fields: fields present in the data (no membership check when None)
        :param error_missing: raise when a key/index field is absent from current_fields
        :param empty_datastore: skip everything and return (None, None)
        :return: (primary_key, indexes), each possibly None
        :raises RequiredDataFrameFieldsError: on missing fields when error_missing is True
        """
        # update primary keys and indexes: only if present
        if empty_datastore:
            return None, None
        primary_key = None
        if current_fields is None:
            primary_key = self.primary_key
        elif self.primary_key is not None:
            extra_primary_key = set(self.primary_key) - current_fields
            if len(extra_primary_key) == 0:
                primary_key = self.primary_key
            elif error_missing:
                raise RequiredDataFrameFieldsError(extra_primary_key)
        indexes = None
        if self.indexes is not None:
            indexes_full_set = set(self.indexes).union(self._collect_indexes_from_fields()).union(data_cleaner_index)
        else:
            indexes_full_set = self._collect_indexes_from_fields().union(data_cleaner_index)
        if primary_key is not None:
            # do not also declare primary-key fields as plain indexes
            indexes_full_set = indexes_full_set - set(primary_key)
        if len(indexes_full_set) == 0:
            indexes_full = None
        else:
            indexes_full = list(indexes_full_set)
        if current_fields is None:
            indexes = indexes_full
        elif indexes_full is not None:
            extra_indexes = set(indexes_full) - current_fields
            if len(extra_indexes) == 0:
                indexes = indexes_full
            elif error_missing:
                raise RequiredDataFrameFieldsError(extra_indexes)
        return primary_key, indexes
|
|
340
|
+
|
|
341
|
+
    def _compare_fields_to_datastore_info(self, resource_info:CkanResourceInfo, current_fields: Set[str], ckan:CkanApi) -> None:
        """Report differences between the data's fields and the server DataStore info.

        :param resource_info: server-side resource info (datastore_info may be None)
        :param current_fields: fields present in the uploaded data
        :param ckan: used only to read the verbose_request setting
        """
        # compare fields with DataStore info (if present, for information)
        if resource_info.datastore_info is not None:
            fields_info = set(resource_info.datastore_info.fields_id_list)
            missing_info = current_fields - fields_info
            extra_info = fields_info - current_fields
            if len(extra_info) > 0:
                msg = f"{len(extra_info)} extra fields are in the database but absent of sample data for table {self.name}: {', '.join(extra_info)}"
                warn(msg)
            if len(missing_info) > 0 and ckan.params.verbose_request:
                # informational only (print, not warn): these fields are being added
                msg = f"{len(missing_info)} fields are not in DataStore info because they are being added for table {self.name}: {', '.join(missing_info)}"
                print(msg)
|
|
353
|
+
|
|
354
|
+
    def _apply_data_cleaner_before_patch(self, ckan:CkanApi, df_upload: pd.DataFrame, reupload:bool) -> Tuple[pd.DataFrame, List[dict], Set[str]]:
        """Run the upload data cleaner (when one is configured) before a patch request.

        :param ckan: CKAN API client
        :param df_upload: data about to be uploaded (cleaned in place when a cleaner is set)
        :param reupload: forwarded to _get_fields_update
        :return: (possibly cleaned df, field changes from the cleaner or None,
            index fields suggested by the cleaner)
        """
        if df_upload is not None and self.data_cleaner_upload is not None:
            # the cleaner receives the full set of documented fields (no current_fields filter)
            fields_for_cleaner = self._get_fields_update(ckan, current_fields=None, data_cleaner_fields=None, reupload=reupload)
            df_upload = self.data_cleaner_upload.clean_records(df_upload, known_fields=fields_for_cleaner, inplace=True)
            data_cleaner_fields = self.data_cleaner_upload.merge_field_changes()
            data_cleaner_index = self.data_cleaner_upload.field_suggested_index
        else:
            data_cleaner_fields = None
            data_cleaner_index = set()
        return df_upload, data_cleaner_fields, data_cleaner_index
|
|
364
|
+
|
|
365
|
+
def patch_request(self, ckan: CkanApi, package_id: str, *,
|
|
366
|
+
df_upload: pd.DataFrame=None, reupload: bool = None, resources_base_dir:str=None) -> CkanResourceInfo:
|
|
367
|
+
if reupload is None: reupload = self.reupload_on_update
|
|
368
|
+
if df_upload is None:
|
|
369
|
+
df_upload = self.load_sample_df(resources_base_dir=resources_base_dir, upload_alter=True)
|
|
370
|
+
else:
|
|
371
|
+
pass # do not alter df_upload because it should already be in the database format
|
|
372
|
+
df_upload, data_cleaner_fields, data_cleaner_index = self._apply_data_cleaner_before_patch(ckan, df_upload, reupload=reupload)
|
|
373
|
+
current_fields = set(df_upload.columns) - {datastore_id_col} # _id field cannot be documented
|
|
374
|
+
if num_rows_patch_first_upload_partial is not None and len(df_upload) > num_rows_patch_first_upload_partial:
|
|
375
|
+
df_upload_partial = df_upload.iloc[:num_rows_patch_first_upload_partial]
|
|
376
|
+
df_upload_upsert = df_upload.iloc[num_rows_patch_first_upload_partial:]
|
|
377
|
+
else:
|
|
378
|
+
df_upload_partial, df_upload_upsert = df_upload, None
|
|
379
|
+
empty_datastore = df_upload is None or len(df_upload) == 0
|
|
380
|
+
self._check_necessary_fields(current_fields, empty_datastore=empty_datastore, raise_error=True)
|
|
381
|
+
self._check_undocumented_fields(current_fields)
|
|
382
|
+
aliases = self._get_alias_list(ckan)
|
|
383
|
+
primary_key, indexes = self._get_primary_key_indexes(data_cleaner_index, current_fields=current_fields,
|
|
384
|
+
error_missing=True, empty_datastore=empty_datastore)
|
|
385
|
+
fields_update = self._get_fields_update(ckan, current_fields, data_cleaner_fields, reupload=reupload)
|
|
386
|
+
fields = list(fields_update.values()) if len(fields_update) > 0 else None
|
|
387
|
+
resource_info = ckan.resource_create(package_id, name=self.name, format=self.format, description=self.description, state=self.state,
|
|
388
|
+
create_default_view=self.create_default_view,
|
|
389
|
+
cancel_if_exists=True, update_if_exists=True, reupload=reupload,
|
|
390
|
+
datastore_create=True, records=df_upload_partial, fields=fields,
|
|
391
|
+
primary_key=primary_key, indexes=indexes, aliases=aliases)
|
|
392
|
+
resource_id = resource_info.id
|
|
393
|
+
self.known_id = resource_id
|
|
394
|
+
reupload = reupload or resource_info.newly_created
|
|
395
|
+
self._compare_fields_to_datastore_info(resource_info, current_fields, ckan)
|
|
396
|
+
if df_upload_upsert is not None and reupload:
|
|
397
|
+
if reupload:
|
|
398
|
+
ckan.datastore_upsert(df_upload_upsert, resource_id, method=UpsertChoice.Insert,
|
|
399
|
+
always_last_condition=None, data_cleaner=self.data_cleaner_upload, )
|
|
400
|
+
else:
|
|
401
|
+
# case where a reupload was needed but is not permitted by self.reupload_if_needed
|
|
402
|
+
msg = f"Did not upload the remaining part of the resource {self.name}."
|
|
403
|
+
raise IncompletePatchError(msg)
|
|
404
|
+
return resource_info
|
|
405
|
+
|
|
406
|
+
    def download_sample_df(self, ckan: CkanApi, search_all:bool=True, download_alter:bool=True, **kwargs) -> Union[pd.DataFrame,None]:
        """
        Download the resource and return it as a DataFrame.
        This is the DataFrame equivalent for download_sample.

        :param ckan: CKAN API client
        :param search_all: download all records (forwarded to datastore_dump)
        :param download_alter: apply df_download_alter to convert to the local format
        :param kwargs: forwarded to datastore_dump
        :return: the downloaded DataFrame, or None when the resource is not found
            and self.download_error_not_found is False
        """
        resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=self.download_error_not_found)
        if resource_id is None and not self.download_error_not_found:
            return None
        df_download = ckan.datastore_dump(resource_id, search_all=search_all, **kwargs)
        if download_alter:
            df_local = self.df_mapper.df_download_alter(df_download, fields=self._get_fields_info())
            return df_local
        else:
            return df_download
|
|
426
|
+
|
|
427
|
+
def download_sample(self, ckan:CkanApi, full_download:bool=True, **kwargs) -> bytes:
|
|
428
|
+
df = self.download_sample_df(ckan=ckan, search_all=full_download, **kwargs)
|
|
429
|
+
return self.local_file_format.write_in_memory(df, fields=self._get_fields_info())
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
class BuilderDataStoreFile(BuilderDataStoreABC):
    """
    DataStore resource builder whose sample data lives in a local file.

    The file location is kept in ``file_name`` and is resolved relative to a
    resources base directory when reading or writing samples.
    """

    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None, file_name:str=None):
        super().__init__(name=name, format=format, description=description,
                         resource_id=resource_id, download_url=download_url)
        # relative path (or name) of the local sample file
        self.file_name = file_name

    def copy(self, *, dest=None):
        """Duplicate this builder into ``dest`` (a fresh instance when dest is None)."""
        target = dest if dest is not None else BuilderDataStoreFile()
        super().copy(dest=target)
        target.file_name = self.file_name
        return target

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """Populate attributes from one row of the builder table."""
        super()._load_from_df_row(row=row)
        self.file_name: str = _string_from_element(row["file/url"])

    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
        """Verify the sample file exists; return an error message object when it does not."""
        file_path = self.get_sample_file_path(resources_base_dir=resources_base_dir)
        if not os.path.isfile(file_path):
            return ResourceFileNotExistMessage(self.name, ErrorLevel.Error,
                                               f"Missing file for resource {self.name}: {file_path}")
        return None

    @staticmethod
    def sample_file_path_is_url() -> bool:
        """The sample location is a local path, never a URL, for this builder type."""
        return False

    def get_sample_file_path(self, resources_base_dir:str) -> str:
        """Resolve the sample file path relative to the resources base directory."""
        return resolve_rel_path(resources_base_dir, self.file_name, field=f"File/URL of resource {self.name}")

    def load_sample_df(self, resources_base_dir:str, *, upload_alter:bool=True) -> pd.DataFrame:
        """
        Read the sample file into a DataFrame, optionally applying the upload mapping.

        :param resources_base_dir: base directory used to resolve the sample file path
        :param upload_alter: when True, pass the frame through df_mapper.df_upload_alter
        :return: the (optionally altered) sample DataFrame
        """
        self.sample_data_source = self.get_sample_file_path(resources_base_dir)
        frame = self.local_file_format.read_file(self.sample_data_source, fields=self._get_fields_info())
        if isinstance(frame, pd.DataFrame):
            # remember where the data came from for later diagnostics
            frame.attrs["source"] = self.sample_data_source
        if not upload_alter:
            return frame
        return self.df_mapper.df_upload_alter(frame, self.sample_data_source, fields=self._get_fields_info())

    @staticmethod
    def resource_mode_str() -> str:
        """Human-readable mode label for this builder type."""
        return "DataStore from File"

    def _to_dict(self, include_id:bool=True) -> dict:
        """Serialize to the dict representation used by the builder export."""
        serialized = super()._to_dict(include_id=include_id)
        serialized["File/URL"] = self.file_name
        return serialized

    def download_request(self, ckan: CkanApi, out_dir: str, *, full_download:bool=True,
                         force:bool=False, threads:int=1) -> Union[pd.DataFrame,None]:
        """
        Download the resource datastore, map it locally and optionally write it to disk.

        :param ckan: CKAN API client
        :param out_dir: destination directory; when None nothing is written to disk
        :param full_download: forwarded as search_all to ckan.datastore_dump
        :param force: download even when self.enable_download is False
        :param threads: accepted for interface compatibility; not used here
        :return: the downloaded DataFrame, or None when skipped / not found
        """
        if not (self.enable_download or force):
            warn(f"Did not download resource {self.name} because download was disabled.")
            return None
        if out_dir is not None:
            self.downloaded_destination = resolve_rel_path(out_dir, self.file_name,
                                                           field=f"File/URL of resource {self.name}")
            if self.download_skip_existing and os.path.exists(self.downloaded_destination):
                # a previous download already produced this file; keep it
                return None
        resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=self.download_error_not_found)
        if resource_id is None and not self.download_error_not_found:
            return None
        remote_frame = ckan.datastore_dump(resource_id, search_all=full_download)
        local_frame = self.df_mapper.df_download_alter(remote_frame, fields=self._get_fields_info())
        if out_dir is not None:
            os.makedirs(out_dir, exist_ok=True)
            self.local_file_format.write_file(local_frame, self.downloaded_destination,
                                              fields=self._get_fields_info())
        return local_frame

class BuilderResourceIgnored(BuilderDataStoreABC):
    """
    Placeholder builder: keeps a row in the resource builders list without
    performing any upload/download action, while still holding field metadata.
    """

    def __init__(self, *, name:str=None, format:str=None, description:str=None,
                 resource_id:str=None, download_url:str=None, file_url:str=None):
        super().__init__(name=name, format=format, description=description,
                         resource_id=resource_id, download_url=download_url)
        # original File/URL cell content, kept only for round-tripping to dict form
        self.file_url: Union[str, None] = file_url

    def copy(self, *, dest=None):
        """Duplicate this builder into ``dest`` (a fresh instance when dest is None)."""
        target = dest if dest is not None else BuilderResourceIgnored()
        super().copy(dest=target)
        target.file_url = self.file_url
        return target

    @staticmethod
    def resource_mode_str() -> str:
        """Human-readable mode label for this builder type."""
        return "Ignored"

    def _load_from_df_row(self, row: pd.Series, base_dir:str=None):
        """Populate attributes from one row of the builder table."""
        super()._load_from_df_row(row=row)
        self.file_url: str = _string_from_element(row["file/url"])
        self._check_mandatory_attributes()

    def _to_dict(self, include_id:bool=True) -> dict:
        """Serialize to the dict representation used by the builder export."""
        serialized = super()._to_dict(include_id=include_id)
        serialized["File/URL"] = self.file_url
        return serialized

    @staticmethod
    def sample_file_path_is_url() -> bool:
        """The sample location is never a URL for ignored resources."""
        return False

    def get_sample_file_path(self, resources_base_dir:str) -> Union[str,None]:
        """No local sample file is associated with an ignored resource."""
        return None

    def load_sample_data(self, resources_base_dir:str) -> Union[bytes,None]:
        """No sample data to load: always None."""
        return None

    def load_sample_df(self, resources_base_dir: str, *, upload_alter: bool = True) -> None:
        """No sample DataFrame to load: always None."""
        return None

    def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[ContextErrorLevelMessage,None]:
        """Nothing to check for an ignored resource."""
        return None

    def patch_request(self, ckan:CkanApi, package_id:str, *,
                      reupload:bool=None, resources_base_dir:str=None,
                      payload:Union[bytes, io.BufferedIOBase]=None) -> None:
        """Ignored resources are never patched on the CKAN server."""
        return None

    def download_request(self, ckan: CkanApi, out_dir: str, *, full_download: bool = True, force: bool = False,
                         threads: int = 1) -> Any:
        """Ignored resources are never downloaded."""
        return None

    def download_sample(self, ckan: CkanApi, full_download: bool = True, **kwargs) -> bytes:
        """Return an empty payload: there is no sample to download."""
        return b""
|
|
561
|
+
|