ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ckanapi_harvesters/__init__.py +32 -10
- ckanapi_harvesters/auxiliary/__init__.py +26 -0
- ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
- ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
- ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
- ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
- ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
- ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
- ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
- ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
- ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
- ckanapi_harvesters/auxiliary/deprecated.py +82 -0
- ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
- ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
- ckanapi_harvesters/auxiliary/list_records.py +60 -0
- ckanapi_harvesters/auxiliary/login.py +163 -0
- ckanapi_harvesters/auxiliary/path.py +208 -0
- ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
- ckanapi_harvesters/auxiliary/urls.py +40 -0
- ckanapi_harvesters/builder/__init__.py +40 -0
- ckanapi_harvesters/builder/builder_aux.py +20 -0
- ckanapi_harvesters/builder/builder_ckan.py +238 -0
- ckanapi_harvesters/builder/builder_errors.py +36 -0
- ckanapi_harvesters/builder/builder_field.py +122 -0
- ckanapi_harvesters/builder/builder_package.py +9 -0
- ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
- ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
- ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
- ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
- ckanapi_harvesters/builder/builder_resource.py +589 -0
- ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
- ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
- ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
- ckanapi_harvesters/builder/builder_resource_init.py +126 -0
- ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
- ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
- ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
- ckanapi_harvesters/builder/example/__init__.py +21 -0
- ckanapi_harvesters/builder/example/builder_example.py +21 -0
- ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
- ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
- ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
- ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
- ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
- ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
- ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
- ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
- ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
- ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
- ckanapi_harvesters/builder/mapper_datastore.py +93 -0
- ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
- ckanapi_harvesters/builder/specific/__init__.py +11 -0
- ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
- ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
- ckanapi_harvesters/ckan_api/__init__.py +20 -0
- ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
- ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
- ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
- ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
- ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
- ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +577 -0
- ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
- ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
- ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
- ckanapi_harvesters/harvesters/__init__.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
- ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
- ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
- ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
- ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
- ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
- ckanapi_harvesters/harvesters/harvester_init.py +30 -0
- ckanapi_harvesters/harvesters/harvester_model.py +49 -0
- ckanapi_harvesters/harvesters/harvester_params.py +323 -0
- ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
- ckanapi_harvesters/harvesters/postgre_params.py +86 -0
- ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
- ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
- ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
- ckanapi_harvesters/policies/__init__.py +20 -0
- ckanapi_harvesters/policies/data_format_policy.py +269 -0
- ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
- ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
- ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
- ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
- ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
- ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
- ckanapi_harvesters/reports/__init__.py +11 -0
- ckanapi_harvesters/reports/admin_report.py +292 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
- ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
- ckanapi_harvesters/divider/__init__.py +0 -27
- ckanapi_harvesters/divider/divider.py +0 -53
- ckanapi_harvesters/divider/divider_error.py +0 -59
- ckanapi_harvesters/main.py +0 -30
- ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,577 @@
+#!python3
+# -*- coding: utf-8 -*-
+"""
+CKAN API read/write layer: DataStore upserts, resource patching and DataPusher submission.
+"""
+from typing import List, Union, Tuple
+import time
+from warnings import warn
+import io
+
+import pandas as pd
+
+from ckanapi_harvesters.auxiliary.proxy_config import ProxyConfig
+from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo
+from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice, CkanState
+from ckanapi_harvesters.auxiliary.ckan_auxiliary import assert_or_raise
+from ckanapi_harvesters.auxiliary.ckan_action import CkanActionResponse
+# from ckanapi_harvesters.auxiliary.list_records import records_to_df
+from ckanapi_harvesters.auxiliary.ckan_auxiliary import upload_prepare_requests_files_arg, RequestType, json_encode_params
+from ckanapi_harvesters.auxiliary.ckan_errors import (ReadOnlyError, IntegrityError, MaxRequestsCountError,
+                                                      UnexpectedError, InvalidParameterError, DataStoreNotFoundError)
+from ckanapi_harvesters.policies.data_format_policy import CkanPackageDataFormatPolicy
+
+from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
+from ckanapi_harvesters.ckan_api.ckan_api_3_policy import CkanApiPolicyParams
+from ckanapi_harvesters.ckan_api.ckan_api_3_policy import CkanApiPolicy
+
+from ckanapi_harvesters.auxiliary.ckan_map import CkanMap
+from ckanapi_harvesters.auxiliary.ckan_api_key import CkanApiKey
+from ckanapi_harvesters.auxiliary.ckan_auxiliary import df_upload_to_csv_kwargs
+
+
+
+
+class CkanApiReadWriteParams(CkanApiPolicyParams):
+    # not read-only by default
+    default_readonly: bool = False
+
+    def __init__(self, *, proxies: Union[str, dict, ProxyConfig] = None,
+                 ckan_headers: dict = None, http_headers: dict = None):
+        super().__init__(proxies=proxies, ckan_headers=ckan_headers, http_headers=http_headers)
+        self.default_limit_write: Union[int, None] = self.default_limit_read  # limit on the number of entries per upsert (used as the default value)
+        self.default_force: bool = True  # set to True to edit a read-only resource
+        self.read_only: bool = self.default_readonly
+        self.submit_delay: float = 2.0  # delay between requests when running datapusher_submit
+        self.submit_timeout: float = 90.0  # maximum wait time after datapusher_submit
+
+    def copy(self, new_identifier: str = None, *, dest=None):
+        if dest is None:
+            dest = CkanApiReadWriteParams()
+        super().copy(dest=dest)
+        dest.default_limit_write = self.default_limit_write
+        dest.default_force = self.default_force
+        dest.read_only = self.read_only
+        dest.submit_delay = self.submit_delay
+        dest.submit_timeout = self.submit_timeout
+        return dest
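The params object above centralizes the write-side defaults (default_limit_write, default_force, read_only, submit_delay, submit_timeout) on top of the inherited read-side ones. A minimal sketch of tuning it once, up front, before handing it to the client constructed below; the values are illustrative:

    # Illustrative values only - tune write-side defaults once, up front
    params = CkanApiReadWriteParams()
    params.default_limit_write = 1000  # cap each upsert request at 1000 records
    params.default_force = False       # never force-edit read-only resources
    params.submit_timeout = 300.0      # tolerate DataPusher runs of up to 5 minutes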
+
+
+class CkanApiReadWrite(CkanApiPolicy):
+    """
+    CKAN Database API interface to CKAN server with helper functions using pandas DataFrames.
+    This class implements requests to write data to the CKAN server resources / DataStores.
+    """
+
+    def __init__(self, url: str = None, *, proxies: Union[str, dict, ProxyConfig] = None,
+                 apikey: Union[str, CkanApiKey] = None, apikey_file: str = None,
+                 owner_org: str = None, params: CkanApiPolicyParams = None,
+                 map: CkanMap = None, policy: CkanPackageDataFormatPolicy = None, policy_file: str = None,
+                 data_cleaner_upload: CkanDataCleanerABC = None,
+                 identifier=None):
+        """
+        CKAN Database API interface to CKAN server with helper functions using pandas DataFrames.
+
+        :param url: url of the CKAN server
+        :param proxies: proxies to use for requests
+        :param apikey: way to provide the API key directly (optional)
+        :param apikey_file: path to a file containing a valid API key in the first line of text (optional)
+        :param owner_org: name of the organization to limit package_search (optional)
+        :param params: other connection/behavior parameters
+        :param map: map of known resources
+        :param policy: data format policy to be used with the policy_check function
+        :param policy_file: path to a JSON file containing the data format policy to load
+        :param data_cleaner_upload: data cleaner object to use before uploading to a CKAN DataStore
+        :param identifier: identifier of the ckan client
+        """
+        super().__init__(url=url, proxies=proxies, apikey=apikey, apikey_file=apikey_file,
+                         owner_org=owner_org, map=map, policy=policy, policy_file=policy_file, identifier=identifier)
+        self.data_cleaner_upload: Union[CkanDataCleanerABC, None] = data_cleaner_upload
+        if params is None:
+            params = CkanApiReadWriteParams()
+        if proxies is not None:
+            params.proxies = proxies
+        self.params: CkanApiReadWriteParams = params
+
+    def copy(self, new_identifier: str = None, *, dest=None):
+        if dest is None:
+            dest = CkanApiReadWrite()
+        super().copy(new_identifier=new_identifier, dest=dest)
+        dest.data_cleaner_upload = self.data_cleaner_upload.copy() if self.data_cleaner_upload is not None else None
+        return dest
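Based on the signature above, a hedged construction sketch; the URL, key-file path and organization name are placeholders:

    # Placeholders: replace url, apikey_file and owner_org with real values
    api = CkanApiReadWrite("https://ckan.example.org",
                           apikey_file="~/.ckan/apikey.txt",  # first line of the file holds the key
                           owner_org="my-organization",
                           params=params)
    worker = api.copy(new_identifier="worker-1")  # independent client with the same settings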
+
+    def full_unlock(self, unlock: bool = True,
+                    *, no_ca: bool = None, external_url_resource_download: bool = None) -> None:
+        """
+        Function to unlock the full capabilities of the CKAN API.
+
+        :param unlock:
+        :return:
+        """
+        super().full_unlock(unlock, no_ca=no_ca, external_url_resource_download=external_url_resource_download)
+        self.params.default_force = unlock
+        self.params.read_only = not unlock
+
+    def set_limits(self, limit_read: Union[int, None], limit_write: int = None) -> None:
+        """
+        Set the default query limits. If only one argument is provided, it applies to both limits.
+
+        :param limit_read: default limit for read requests
+        :param limit_write: default limit for upsert (write) requests
+        :return:
+        """
+        super().set_limits(limit_read)
+        if limit_write is not None:
+            self.params.default_limit_write = limit_write
+        else:
+            self.params.default_limit_write = limit_read
+
+    def set_submit_timeout(self, submit_timeout: float, submit_delay: float = None) -> None:
+        """
+        Set the timeout for the datastore_wait method, which is called after datastore_submit.
+
+        :param submit_timeout: timeout after which a TimeoutError is raised
+        :param submit_delay: delay between requests that poll for DataStore initialization
+        :return:
+        """
+        self.params.submit_timeout = submit_timeout
+        if submit_delay is not None:
+            self.params.submit_delay = submit_delay
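A short sketch of these setters in use (arbitrary values); note that full_unlock flips both default_force and read_only in one call:

    api.full_unlock(True)                            # allow force-edits, clear read_only
    api.set_limits(5000)                             # 5000 rows per read and per write request
    api.set_submit_timeout(120.0, submit_delay=1.0)  # poll every second, give up after 2 minutes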
+
+    ## DataStore insertions ------------------
+    def _api_datastore_upsert_raw(self, records: Union[dict, List[dict], pd.DataFrame], resource_id: str, *,
+                                  method: Union[UpsertChoice, str], params: dict = None, force: bool = None, dry_run: bool = False,
+                                  last_insertion: bool = True) -> CkanActionResponse:
+        """
+        API call to api_datastore_upsert.
+
+        :param records: records, preferably in a pandas DataFrame - they will be converted to a list of dictionaries.
+        :param resource_id: destination resource id
+        :param method: see UpsertChoice (insert, update or upsert)
+        :param force: set to True to edit a read-only resource. If not provided, this defaults to self.params.default_force
+        :param params: additional parameters
+        :param dry_run: set to True to abort the transaction instead of committing it, e.g. to check for validation or type errors
+        :param last_insertion: trigger for calculate_record_count
+            (doc: updates the stored count of records, used to optimize datastore_search in combination with the
+            total_estimation_threshold parameter. If doing a series of requests to change a resource, you only need to set
+            this to True on the last request.)
+        :return: the inserted records as a pandas DataFrame, from the server response
+        """
+        assert_or_raise(not self.params.read_only, ReadOnlyError())
+        if params is None: params = {}
+        if force is None: force = self.params.default_force
+        if method is not None: method = str(method)
+        params["resource_id"] = resource_id
+        params["force"] = force
+        params["dry_run"] = dry_run
+        # doc calculate_record_count: updates the stored count of records, used to optimize datastore_search in combination with the
+        # total_estimation_threshold parameter. If doing a series of requests to change a resource, you only need to set
+        # this to True on the last request.
+        params["calculate_record_count"] = last_insertion
+        # params["force_indexing"] = last_insertion
+        N = self.map.get_datastore_len(resource_id, error_not_mapped=False)
+        has_datastore_info = N is not None
+        format = None  # API does not support csv or other formats
+        mode_df = True
+        if records is not None:
+            method = method.lower()
+            params["method"] = method
+            if isinstance(records, dict):
+                records = pd.DataFrame.from_dict(records)
+            elif isinstance(records, list):
+                # records = records_to_df(records)
+                mode_df = False
+            else:
+                assert(isinstance(records, pd.DataFrame))
+            n_upsert = len(records)
+            if not mode_df:
+                params["records"] = records
+                format = None
+            elif format is None or format == "objects":
+                params["records"] = records.to_dict(orient='records')
+            else:
+                # dead code
+                fields_id_list = records.columns.tolist()
+                params["fields"] = fields_id_list  # [{"id": id} for id in fields_id_list]
+                params["records_format"] = format
+                if format == "csv":
+                    params["records"] = records.to_csv(index=False, header=False, **df_upload_to_csv_kwargs)
+                elif format == "lists":
+                    params["records"] = records.values.tolist()
+                else:
+                    raise NotImplementedError()
+        else:
+            # possibility to call with None records and method to trigger row counts etc.
+            # this request may not be useful
+            assert(method is None)
+            n_upsert = 0
+        # json encode here in case there are NaN values, which are not supported by the requests encoder
+        data_payload, json_headers = json_encode_params(params)
+        response = self._api_action_request("datastore_upsert", method=RequestType.Post,
+                                            data=data_payload, headers=json_headers)
+        # response = self._api_action_request("datastore_upsert", method=RequestType.Post,
+        #                                     json=params)
+        if response.success:
+            if method is not None:
+                n_return = len(response.result["records"])
+                if has_datastore_info and not dry_run and method == "insert":
+                    # in modes other than insert, rows could be updated rather than inserted
+                    self.map._update_datastore_len(resource_id, N + n_return)
+                assert_or_raise(n_return == n_upsert, IntegrityError("Returned dataframe does not match number of requested rows"))
+            return response
+        else:
+            raise response.default_error(self)
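The manual JSON step exists because pandas NaN cells are not valid JSON, and the requests json= path would emit them verbatim. json_encode_params is defined in ckan_auxiliary and is not part of this hunk; a plausible minimal equivalent of the pattern (an assumption, not the package's implementation):

    import json
    import math

    def json_encode_params_sketch(params: dict):
        # Hypothetical stand-in: map NaN to null, then pre-encode the request body
        def clean(value):
            if isinstance(value, float) and math.isnan(value):
                return None
            if isinstance(value, dict):
                return {k: clean(v) for k, v in value.items()}
            if isinstance(value, list):
                return [clean(v) for v in value]
            return value
        body = json.dumps(clean(params), allow_nan=False)
        return body.encode("utf-8"), {"Content-Type": "application/json"}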
+
+    def _api_datastore_upsert(self, records: Union[dict, List[dict], pd.DataFrame], resource_id: str, *,
+                              method: Union[UpsertChoice, str], params: dict = None, force: bool = None, dry_run: bool = False,
+                              last_insertion: bool = True, return_df: bool = None) -> Union[pd.DataFrame, List[dict], dict]:
+        mode_df = True
+        if isinstance(records, list):
+            mode_df = False
+        if return_df is None:
+            return_df = mode_df
+        response = self._api_datastore_upsert_raw(records=records, resource_id=resource_id, method=method,
+                                                  params=params, force=force, dry_run=dry_run,
+                                                  last_insertion=last_insertion)
+        if method is not None:
+            if return_df:
+                response_df = pd.DataFrame.from_dict(response.result["records"])
+                self._rx_records_df_clean(response_df)
+                return response_df
+            else:
+                return response.result["records"]
+        else:
+            return response.result
+
+    def datastore_upsert_last_line(self, resource_id: str):
+        """
+        Apply last-line treatments to a resource.
+        """
+        return self._api_datastore_upsert(None, resource_id=resource_id, method=None, last_insertion=True)
+
+    def datastore_upsert(self, records: Union[dict, List[dict], pd.DataFrame], resource_id: str, *,
+                         dry_run: bool = False, limit: int = None, offset: int = 0, force: bool = None,
+                         method: Union[UpsertChoice, str] = UpsertChoice.Upsert, apply_last_condition: bool = True,
+                         always_last_condition: bool = None, return_df: bool = None,
+                         data_cleaner: CkanDataCleanerABC = None, params: dict = None) -> Union[pd.DataFrame, List[dict]]:
+        """
+        Encapsulation of _api_datastore_upsert that cuts the upload into requests of a limited number of rows each.
+
+        :see: _api_datastore_upsert()
+        :param records: records, preferably in a pandas DataFrame - they will be converted to a list of dictionaries.
+        :param resource_id: destination resource id
+        :param method: by default, set to Upsert
+        :param force: set to True to edit a read-only resource. If not provided, this defaults to self.params.default_force
+        :param limit: number of records per transaction
+        :param offset: number of records to skip - use it to restart the transfer
+        :param params: additional parameters
+        :param dry_run: set to True to abort the transaction instead of committing it, e.g. to check for validation or type errors
+        :param apply_last_condition: if True, the last upsert request applies the last insert operations (calculate_record_count and force_indexing).
+        :param always_last_condition: if True, each request applies the last insert operations - default is False
+        :return: the inserted records as a pandas DataFrame, from the server response
+        """
+        method_str = str(method)
+        if apply_last_condition is None:
+            apply_last_condition = True
+        if always_last_condition is None:
+            always_last_condition = False
+        mode_df = True
+        assert(records is not None)
+        if isinstance(records, dict):
+            records = pd.DataFrame.from_dict(records)
+        elif isinstance(records, list):
+            # records = records_to_df(records)
+            mode_df = False
+        else:
+            assert(isinstance(records, pd.DataFrame))
+        if data_cleaner is None:
+            data_cleaner = self.data_cleaner_upload
+        if data_cleaner is not None:
+            datastore_info = self.get_datastore_info_or_request_of_id(resource_id=resource_id, error_not_found=True)
+            records = data_cleaner.clean_records(records, known_fields=datastore_info.fields_dict, inplace=True)
+            data_cleaner.apply_new_fields_request(self, resource_id=resource_id)
+        if return_df is None:
+            return_df = mode_df
+        if limit is None: limit = self.params.default_limit_write
+        if limit is None:
+            # direct API call with one request
+            if self.params.store_last_response_debug_info:
+                self.debug.multi_requests_last_successful_offset = offset
+            return self._api_datastore_upsert(records, return_df=return_df,
+                                              method=method_str, dry_run=dry_run, resource_id=resource_id,
+                                              force=force, params=params,
+                                              last_insertion=apply_last_condition or always_last_condition)
+        assert_or_raise(limit > 0, InvalidParameterError("limit"))
+        n = len(records)
+        if self.params.store_last_response_debug_info:
+            self.debug.multi_requests_last_successful_offset = offset
+        requests_count = 0
+        last_insertion = True
+        df, returned_rows = None, None
+        if return_df:
+            df = None
+        else:
+            returned_rows = []
+        start = time.time()
+        current = start
+        timeout = False
+        n_cum = 0
+        while offset < n and requests_count < self.params.max_requests_count and not timeout:
+            last_insertion = offset + limit >= n
+            i_end_add = min(n, offset + limit)
+            n_add = i_end_add - offset
+            if self.params.verbose_multi_requests:
+                print(f"{self.identifier} Multi-requests upsert {requests_count} to add {n_add} records ...")
+            if mode_df:
+                df_upsert = records.iloc[offset:i_end_add]
+            else:
+                df_upsert = records[offset:i_end_add]
+            df_add = self._api_datastore_upsert(df_upsert, return_df=return_df,
+                                                method=method_str, dry_run=dry_run, resource_id=resource_id,
+                                                force=force, params=params,
+                                                last_insertion=(last_insertion and apply_last_condition) or always_last_condition)
+            n_cum += len(df_add)
+            assert_or_raise(len(df_add) == n_add, IntegrityError("Second check on response len failed in datastore_upsert"))  # consistency check, duplicating the one in _api_datastore_upsert
+            if self.params.store_last_response_debug_info:
+                self.debug.multi_requests_last_successful_offset = offset
+            if return_df:
+                if df is None:
+                    # 1st execution: pandas cannot concatenate with an empty DataFrame => use None as indicator
+                    assert(df_add is not None)
+                    df = df_add
+                else:
+                    df = pd.concat([df, df_add], ignore_index=True)
+            else:
+                returned_rows = returned_rows + df_add
+            if self.params.multi_requests_time_between_requests > 0 and not last_insertion:
+                time.sleep(self.params.multi_requests_time_between_requests)
+            if not last_insertion:
+                assert_or_raise(n_add == limit, IntegrityError("datastore_upsert implementation is wrong"))
+            offset += limit
+            requests_count += 1
+            current = time.time()
+            timeout = current - start > self.params.multi_requests_timeout
+        if return_df:
+            if df is None:
+                df = pd.DataFrame()  # always return a DataFrame object and not None
+            df.attrs["requests_count"] = requests_count
+            df.attrs["elapsed_time"] = current - start
+            df.attrs["offset"] = offset
+        if self.params.verbose_multi_requests:
+            print(f"{self.identifier} Multi-requests upsert to add {n_cum} records done in {requests_count} requests and {round(current - start, 2)} seconds.")
+        if timeout:
+            raise TimeoutError()
+        if requests_count >= self.params.max_requests_count:
+            raise MaxRequestsCountError()
+        assert_or_raise(last_insertion, UnexpectedError("last_insertion should be True at the last iteration"))
+        if return_df:
+            return df
+        else:
+            return returned_rows
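The loop above turns one large upload into bounded transactions with restart support: after a MaxRequestsCountError or TimeoutError, debug.multi_requests_last_successful_offset holds the offset to resume from. A hedged usage sketch with a placeholder resource id:

    import pandas as pd

    df = pd.DataFrame({"id": range(10_000), "value": [i * 0.5 for i in range(10_000)]})
    api.datastore_upsert(df, "RESOURCE-ID", dry_run=True, limit=1000)  # type-check without committing
    out = api.datastore_upsert(df, "RESOURCE-ID", limit=1000)          # 10 requests of 1000 rows each
    print(out.attrs["requests_count"], out.attrs["elapsed_time"])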
+
+    def datastore_insert(self, records: Union[dict, List[dict], pd.DataFrame], resource_id: str, *,
+                         dry_run: bool = False, limit: int = None, offset: int = 0, apply_last_condition: bool = True,
+                         always_last_condition: bool = None,
+                         data_cleaner: CkanDataCleanerABC = None, force: bool = None, params: dict = None) -> pd.DataFrame:
+        """
+        Alias function to insert data in a DataStore using datastore_upsert.
+
+        :see: _api_datastore_upsert()
+        :param records: records, preferably in a pandas DataFrame - they will be converted to a list of dictionaries.
+        :param resource_id: destination resource id
+        :param force: set to True to edit a read-only resource. If not provided, this defaults to self.params.default_force
+        :param params: additional parameters
+        :param dry_run: set to True to abort the transaction instead of committing it, e.g. to check for validation or type errors
+        :return: the inserted records as a pandas DataFrame, from the server response
+        """
+        return self.datastore_upsert(records, resource_id, dry_run=dry_run, limit=limit, offset=offset,
+                                     method=UpsertChoice.Insert, apply_last_condition=apply_last_condition,
+                                     always_last_condition=always_last_condition, data_cleaner=data_cleaner,
+                                     force=force, params=params)
+
+    def datastore_update(self, records: Union[dict, List[dict], pd.DataFrame], resource_id: str, *,
+                         dry_run: bool = False, limit: int = None, offset: int = 0, apply_last_condition: bool = True,
+                         always_last_condition: bool = None,
+                         data_cleaner: CkanDataCleanerABC = None, force: bool = None, params: dict = None) -> pd.DataFrame:
+        """
+        Alias function to update data in a DataStore using datastore_upsert.
+        The update is performed based on the DataStore primary keys.
+
+        :see: _api_datastore_upsert()
+        :param records: records, preferably in a pandas DataFrame - they will be converted to a list of dictionaries.
+        :param resource_id: destination resource id
+        :param force: set to True to edit a read-only resource. If not provided, this defaults to self.params.default_force
+        :param params: additional parameters
+        :param dry_run: set to True to abort the transaction instead of committing it, e.g. to check for validation or type errors
+        :return: the updated records as a pandas DataFrame, from the server response
+        """
+        return self.datastore_upsert(records, resource_id, dry_run=dry_run, limit=limit, offset=offset,
+                                     method=UpsertChoice.Update, apply_last_condition=apply_last_condition,
+                                     always_last_condition=always_last_condition, data_cleaner=data_cleaner,
+                                     force=force, params=params)
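The two aliases differ only in the UpsertChoice they forward; a one-line sketch of each, where new_rows_df, changed_rows_df and the resource id are placeholders (per the docstring above, updates match rows on the DataStore primary keys):

    api.datastore_insert(new_rows_df, "RESOURCE-ID")      # append rows as new records
    api.datastore_update(changed_rows_df, "RESOURCE-ID")  # rewrite rows matched by primary key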
+
+
+    ## Resource updates ------------------
+    def _api_resource_patch(self, resource_id: str, *, name: str = None, format: str = None, description: str = None, title: str = None,
+                            state: CkanState = None,
+                            df: pd.DataFrame = None, file_path: str = None, url: str = None, files=None,
+                            payload: Union[bytes, io.BufferedIOBase] = None, payload_name: str = None,
+                            params: dict = None) -> CkanResourceInfo:
+        """
+        Call to the resource_patch API. This call can be used to change the resource parameters via params (cf. API documentation)
+        or to re-upload the resource file into the FileStore.
+        The latter action replaces the current resource. If it is a DataStore, it is reset to the new contents of the file.
+        The file can be transmitted either as a url, a file path or a pandas DataFrame.
+        The files argument can pass these arguments through to the requests.post function.
+        A call to datapusher_submit() may be required for the newly uploaded file to be taken into account immediately.
+
+        :see: _api_resource_create
+        :see: resource_create
+        :param resource_id: resource id
+        :param url: url of the resource file to replace the current resource with
+        :param params: parameters such as name, format, resource_type can be changed
+
+        For file uploads, the following parameters are taken, in order of priority.
+        See upload_prepare_requests_files_arg for an example of formatting.
+
+        :param files: files pass-through argument to the requests.post function. Use it to send other data formats.
+        :param payload: bytes to upload as a file
+        :param payload_name: name of the payload to use (associated with the payload argument) - this determines the format recognized by CKAN viewers.
+        :param file_path: path of the file to transmit (binary and text files are supported here)
+        :param df: pandas DataFrame to replace the resource with
+
+        :return:
+        """
+        assert_or_raise(not self.params.read_only, ReadOnlyError())
+        if params is None: params = {}
+        params["id"] = resource_id
+        if description is not None:
+            params["description"] = description
+        if title is not None:
+            params["title"] = title
+        if name is not None:
+            params["name"] = name
+        if format is not None:
+            params["format"] = format
+        if state is not None:
+            params["state"] = str(state)
+        files = upload_prepare_requests_files_arg(files=files, file_path=file_path, df=df, payload=payload, payload_name=payload_name)
+        if url is not None:
+            params["url"] = url
+            params["clear_upload"] = True
+            assert(files is None)
+        if files is not None:
+            response = self._api_action_request("resource_patch", method=RequestType.Post,
+                                                files=files, data=params)
+        else:
+            response = self._api_action_request("resource_patch", method=RequestType.Post, json=params)
+        if response.success:
+            resource_info = CkanResourceInfo(response.result)
+            self.map._record_resource_update(resource_info)
+            return resource_info
+        else:
+            raise response.default_error(self)
+
+    def resource_patch(self, resource_id: str, *, name: str = None, format: str = None, description: str = None, title: str = None,
+                       state: CkanState = None,
+                       df: pd.DataFrame = None, file_path: str = None, url: str = None, files=None,
+                       payload: Union[bytes, io.BufferedIOBase] = None, payload_name: str = None,
+                       params: dict = None) -> CkanResourceInfo:
+        # function alias
+        return self._api_resource_patch(resource_id, name=name, format=format, description=description, state=state,
+                                        title=title, df=df, file_path=file_path, url=url, files=files,
+                                        payload=payload, payload_name=payload_name, params=params)
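resource_patch takes at most one upload source among df, file_path, payload (with payload_name), files, or url. A hedged sketch, with placeholder ids, names and paths (the DataFrame path presumably serializes via to_csv, given the df_upload_to_csv_kwargs import):

    # Replace the file behind a resource with a DataFrame upload
    info = api.resource_patch("RESOURCE-ID", df=df, name="measurements.csv", format="CSV")
    # Or point the resource at an external URL instead of an upload
    info = api.resource_patch("RESOURCE-ID", url="https://data.example.org/latest.csv")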
+
+    ### DataPusher submit ------------------
+    def _api_datapusher_submit(self, resource_id: str, *, params: dict = None) -> bool:
+        """
+        Call to the API action datapusher_submit. This triggers the normally asynchronous DataPusher service for a given resource.
+
+        :param resource_id: resource id
+        :param params:
+        :return:
+        """
+        if params is None: params = {}
+        params["resource_id"] = resource_id
+        response = self._api_action_request("datapusher_submit", method=RequestType.Post, json=params)
+        if response.success:
+            return response.result
+        else:
+            raise response.default_error(self)
+
+    def datastore_wait(self, resource_id: str, *,
+                       apply_delay: bool = True, error_timeout: bool = True) -> Tuple[int, float]:
+        """
+        Wait until a DataStore has at least one row.
+        The delay between requests that poll for the presence of the DataStore is given by the class attribute submit_delay.
+        If the loop exceeds submit_timeout, an exception is raised.
+
+        :param resource_id:
+        :param apply_delay:
+        :param error_timeout: option to raise an exception in case of timeout
+        :return:
+        """
+        if self.params.submit_delay <= 0 or not apply_delay:
+            return 0, 0.0
+        # resource_info = self.resource_show(resource_id)
+        # init_timestamp = resource_info.last_modified
+        # current_timestamp = init_timestamp
+        if self.params.verbose_request:
+            print(f"Waiting for data treatments on DataStore {resource_id}...")
+        start = time.time()
+        current = start
+        timeout = False
+        counter = 0
+        df = pd.DataFrame()  # empty DataFrame
+        while not timeout and df.empty:  # current_timestamp <= init_timestamp:
+            time.sleep(self.params.submit_delay)
+            # resource_info = self.resource_show(resource_id)
+            # current_timestamp = resource_info.last_modified
+            try:
+                df = self.datastore_search(resource_id, limit=1, search_all=False, search_method=True)
+            except DataStoreNotFoundError:
+                pass
+            current = time.time()
+            timeout = (current - start) > self.params.submit_timeout
+            counter += 1
+        if timeout:
+            if error_timeout:
+                raise TimeoutError("datastore_wait")
+            else:
+                msg = str(TimeoutError("datastore_wait"))
+                warn(msg)
+        if self.params.verbose_request:
+            print(f"Resource updated after {current - start} seconds ({counter} iterations)")
+        return counter, current - start
+
+    def datastore_submit(self, resource_id: str,
+                         *, apply_delay: bool = True, error_timeout: bool = True,
+                         params: dict = None) -> bool:
+        """
+        Submit the file to re-initialize the DataStore, using the preferred method.
+        The current method is datapusher_submit.
+        This encapsulation includes a call to datastore_wait.
+
+        :param resource_id:
+        :param apply_delay: keep True to wait until the datastore is ready (a datastore_search query is performed as a test)
+        :param params:
+        :return:
+        """
+        result = self._api_datapusher_submit(resource_id, params=params)
+        self.datastore_wait(resource_id, apply_delay=apply_delay, error_timeout=error_timeout)
+        return result
+
+    # def datapusher_submit_insert(self, resource_id: str, *, params: dict = None) -> dict:
+    #     # idea: modify datapusher so that it upserts data instead of replacing the entire datastore
+    #     raise NotImplementedError()
+    #     if params is None: params = {}
+    #     params["insert"] = True
+    #     return self._api_datapusher_submit(resource_id, params)
+
+
+
+
+
+
+
+
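End to end, a file re-upload is typically followed by the submit/wait pair defined above, which re-triggers the DataPusher and then polls with one-row datastore_search calls until the DataStore answers. A hedged sketch with a placeholder id and path:

    api.resource_patch("RESOURCE-ID", file_path="data/latest.csv")
    api.datastore_submit("RESOURCE-ID")  # datapusher_submit + datastore_wait
    polls, elapsed = api.datastore_wait("RESOURCE-ID", error_timeout=False)  # extra poll; warns instead of raising on timeout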