ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,579 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+
5
+ """
6
+ from typing import List, Union, Tuple
7
+ import time
8
+ from warnings import warn
9
+ import io
10
+
11
+ import pandas as pd
12
+
13
+ from ckanapi_harvesters.auxiliary.proxy_config import ProxyConfig
14
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo
15
+ from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice, CkanState
16
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import assert_or_raise
17
+ from ckanapi_harvesters.auxiliary.ckan_action import CkanActionResponse
18
+ # from ckanapi_harvesters.auxiliary.list_records import records_to_df
19
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import upload_prepare_requests_files_arg, RequestType, json_encode_params
20
+ from ckanapi_harvesters.auxiliary.ckan_errors import (ReadOnlyError, IntegrityError, MaxRequestsCountError,
21
+ UnexpectedError, InvalidParameterError, DataStoreNotFoundError)
22
+ from ckanapi_harvesters.policies.data_format_policy import CkanPackageDataFormatPolicy
23
+
24
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
25
+ from ckanapi_harvesters.ckan_api.ckan_api_3_policy import CkanApiPolicyParams
26
+ from ckanapi_harvesters.ckan_api.ckan_api_3_policy import CkanApiPolicy
27
+
28
+ from ckanapi_harvesters.auxiliary.ckan_map import CkanMap
29
+ from ckanapi_harvesters.auxiliary.ckan_api_key import CkanApiKey
30
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import df_upload_to_csv_kwargs
31
+
32
+
33
+
34
+
35
class CkanApiReadWriteParams(CkanApiPolicyParams):
    """
    Connection/behavior parameters for read-write CKAN API clients.

    Extends CkanApiPolicyParams with write-specific settings: the upsert
    batch limit, the default ``force`` flag, the read-only switch, and the
    polling delay/timeout used after datapusher_submit.
    """
    # not read-only by default (read-write client)
    default_readonly:bool = False

    def __init__(self, *, proxies:Union[str,dict,ProxyConfig]=None,
                 ckan_headers:dict=None, http_headers:dict=None):
        """
        :param proxies: proxies to use for requests
        :param ckan_headers: extra headers for CKAN API requests
        :param http_headers: extra headers for plain HTTP requests
        """
        super().__init__(proxies=proxies, ckan_headers=ckan_headers, http_headers=http_headers)
        self.default_limit_write: Union[int,None] = self.default_limit_read # limit the number of entries per upsert (used as default value)
        self.default_force: bool = True # set to True to edit a read-only resource
        self.read_only: bool = self.default_readonly
        self.submit_delay: float = 2.0 # delay between requests when running datapusher_submit
        self.submit_timeout: float = 90.0 # maximum wait time after datapusher_submit

    def copy(self, new_identifier:str=None, *, dest=None):
        """
        Copy these parameters into ``dest`` (created if None) and return it.

        NOTE(review): ``new_identifier`` is accepted but not forwarded to
        ``super().copy()`` here -- confirm against CkanApiPolicyParams.copy.
        """
        if dest is None:
            dest = CkanApiReadWriteParams()
        super().copy(dest=dest)
        dest.default_limit_write = self.default_limit_write
        dest.default_force = self.default_force
        dest.read_only = self.read_only
        dest.submit_delay = self.submit_delay
        dest.submit_timeout = self.submit_timeout
        return dest
58
+
59
+
60
class CkanApiReadWrite(CkanApiPolicy):
    """
    CKAN Database API interface to CKAN server with helper functions using pandas DataFrames.
    This class implements requests to write data to the CKAN server resources / DataStores.
    """

    def __init__(self, url:str=None, *, proxies:Union[str,dict,ProxyConfig]=None,
                 apikey:Union[str,CkanApiKey]=None, apikey_file:str=None,
                 owner_org: str = None, params:CkanApiPolicyParams=None,
                 map:CkanMap=None, policy: CkanPackageDataFormatPolicy = None, policy_file:str=None,
                 data_cleaner_upload:CkanDataCleanerABC=None,
                 identifier=None):
        """
        CKAN Database API interface to CKAN server with helper functions using pandas DataFrames.

        :param url: url of the CKAN server
        :param proxies: proxies to use for requests
        :param apikey: way to provide the API key directly (optional)
        :param apikey_file: path to a file containing a valid API key in the first line of text (optional)
        :param owner_org: name of the organization to limit package_search (optional)
        :param params: other connection/behavior parameters
        :param map: map of known resources
        :param policy: data format policy to be used with the policy_check function
        :param policy_file: path to a JSON file containing the data format policy to load
        :param data_cleaner_upload: data cleaner object to use before uploading to a CKAN DataStore
        :param identifier: identifier of the ckan client
        """
        super().__init__(url=url, proxies=proxies, apikey=apikey, apikey_file=apikey_file,
                         owner_org=owner_org, map=map, policy=policy, policy_file=policy_file, identifier=identifier)
        self.data_cleaner_upload: Union[CkanDataCleanerABC, None] = data_cleaner_upload
        if params is None:
            params = CkanApiReadWriteParams()
        if proxies is not None:
            # an explicit proxies argument takes precedence over the one stored in params
            params.proxies = proxies
        self.params: CkanApiReadWriteParams = params
97
+
98
+ def copy(self, new_identifier: str = None, *, dest=None):
99
+ if dest is None:
100
+ dest = CkanApiReadWrite()
101
+ super().copy(new_identifier=new_identifier, dest=dest)
102
+ dest.data_cleaner_upload = self.data_cleaner_upload.copy() if self.data_cleaner_upload is not None else None
103
+ return dest
104
+
105
    def full_unlock(self, unlock:bool=True,
                    *, no_ca:bool=None, external_url_resource_download:bool=None) -> None:
        """
        Function to unlock full capabilities of the CKAN API.

        In addition to the parent behavior, this sets ``default_force`` to
        ``unlock`` and ``read_only`` to ``not unlock`` on the client parameters.

        :param unlock: True to unlock (enable writes and force mode), False to relock
        :param no_ca: forwarded to the parent implementation
        :param external_url_resource_download: forwarded to the parent implementation
        :return:
        """
        super().full_unlock(unlock, no_ca=no_ca, external_url_resource_download=external_url_resource_download)
        self.params.default_force = unlock
        self.params.read_only = not unlock
116
+
117
+ def set_limits(self, limit_read:Union[int,None], limit_write:int=None) -> None:
118
+ """
119
+ Set default query limits. If only one argument is provided, it applies to both limits.
120
+
121
+ :param limit_read: default limit for read requests
122
+ :param limit_write: default limit for upsert (write) requests
123
+ :return:
124
+ """
125
+ super().set_limits(limit_read)
126
+ if limit_write is not None:
127
+ self.params.default_limit_write = limit_write
128
+ else:
129
+ self.params.default_limit_write = limit_read
130
+
131
+ def set_submit_timeout(self, submit_timeout:float, submit_delay:float=None) -> None:
132
+ """
133
+ Set timeout for the datastore_wait method. This is called after datastore_submit.
134
+
135
+ :param submit_timeout: timeout after which a TimeoutError is raised
136
+ :param submit_delay: delay between requests to peer on DataStore initialization
137
+ :return:
138
+ """
139
+ self.params.submit_timeout = submit_timeout
140
+ if submit_delay is not None:
141
+ self.params.submit_delay = submit_delay
142
+
143
    ## DataStore insertions ------------------
    def _api_datastore_upsert_raw(self, records:Union[dict, List[dict], pd.DataFrame], resource_id:str, *,
                                  method:Union[UpsertChoice,str], params:dict=None, force:bool=None, dry_run:bool=False,
                                  last_insertion:bool=True) -> CkanActionResponse:
        """
        API call to api_datastore_upsert.

        :param records: records, preferably in a pandas DataFrame - they will be converted to a list of dictionaries.
        :param resource_id: destination resource id
        :param method: see UpsertChoice (insert, update or upsert)
        :param force: set to True to edit a read-only resource. If not provided, this is overridden by self.default_force
        :param params: additional parameters
        :param dry_run: set to True to abort transaction instead of committing, e.g. to check for validation or type errors
        :param last_insertion: trigger for calculate_record_count
            (doc: updates the stored count of records, used to optimize datastore_search in combination with the
            total_estimation_threshold parameter. If doing a series of requests to change a resource, you only need to set
            this to True on the last request.)
        :return: the raw CkanActionResponse of the request
        :raises ReadOnlyError: when the client is configured read-only
        :raises IntegrityError: when the server returns a different number of records than sent
        """
        assert_or_raise(not self.params.read_only, ReadOnlyError())
        if params is None: params = {}
        if force is None: force = self.params.default_force
        if method is not None: method = str(method)
        params["resource_id"] = resource_id
        params["force"] = force
        params["dry_run"] = dry_run
        # doc calculate_record_count: updates the stored count of records, used to optimize datastore_search in combination with the
        # total_estimation_threshold parameter. If doing a series of requests to change a resource, you only need to set
        # this to True on the last request.
        params["calculate_record_count"] = last_insertion
        # params["force_indexing"] = last_insertion
        # N is the locally-mapped DataStore length (None when the resource is not mapped)
        N = self.map.get_datastore_len(resource_id, error_not_mapped=False)
        has_datastore_info = N is not None
        format = None # API does not support csv or other formats (kept None on purpose; see dead code below)
        mode_df = True
        if records is not None:
            method = method.lower()
            params["method"] = method
            if isinstance(records, dict):
                records = pd.DataFrame.from_dict(records)
            elif isinstance(records, list):
                # records = records_to_df(records)
                mode_df = False  # lists are sent as-is, without DataFrame conversion
            else:
                assert(isinstance(records, pd.DataFrame))
            n_upsert = len(records)
            if not mode_df:
                params["records"] = records
                format = None
            elif format is None or format == "objects":
                params["records"] = records.to_dict(orient='records')
            else:
                # dead code: unreachable while format is hard-coded to None above
                fields_id_list = records.columns.tolist()
                params["fields"] = fields_id_list # [{"id": id} for id in fields_id_list]
                params["records_format"] = format
                if format == "csv":
                    params["records"] = records.to_csv(index=False, header=False, **df_upload_to_csv_kwargs)
                elif format == "lists":
                    params["records"] = records.values.tolist()
                else:
                    raise NotImplementedError()
        else:
            # possibility to call with None records and method to trigger row counts etc.
            # this request may not be useful
            assert(method is None)
            n_upsert = 0
        # json encode here in the case there are NaN values, not supported by the requests encoder
        data_payload, json_headers = json_encode_params(params)
        response = self._api_action_request(f"datastore_upsert", method=RequestType.Post,
                                            data=data_payload, headers=json_headers)
        # response = self._api_action_request(f"datastore_upsert", method=RequestType.Post,
        #                                     json=params)
        if response.success:
            if method is not None:
                n_return = len(response.result["records"])
                if has_datastore_info and not dry_run and method == "insert":
                    # in modes other than insert could be updated rather than inserted
                    self.map._update_datastore_len(resource_id, N + n_return)
                assert_or_raise(n_return == n_upsert, IntegrityError("Returned dataframe does not match number of requested rows"))
            # the response is returned whether or not records were sent (method may be None)
            return response
        else:
            raise response.default_error(self)
226
+
227
    def _api_datastore_upsert(self, records:Union[dict, List[dict], pd.DataFrame], resource_id:str, *,
                              method:Union[UpsertChoice,str], params:dict=None, force:bool=None, dry_run:bool=False,
                              last_insertion:bool=True, return_df:bool=None) -> Union[pd.DataFrame, List[dict], dict]:
        """
        Wrapper around _api_datastore_upsert_raw that converts the server response.

        :see: _api_datastore_upsert_raw()
        :param return_df: if True, return the response records as a pandas DataFrame;
            defaults to True unless ``records`` is a plain list.
        :return: the response records (DataFrame or list of dicts) when ``method`` is given,
            otherwise the raw response result.
        """
        mode_df = True
        if isinstance(records, list):
            mode_df = False
        if return_df is None:
            return_df = mode_df
        response = self._api_datastore_upsert_raw(records=records, resource_id=resource_id, method=method,
                                                  params=params, force=force, dry_run=dry_run,
                                                  last_insertion=last_insertion)
        if method is not None:
            if return_df:
                response_df = pd.DataFrame.from_dict(response.result["records"])
                # apply the client's post-reception cleanup to the returned records
                self._rx_records_df_clean(response_df)
                return response_df
            else:
                return response.result["records"]
        else:
            # record-less call (e.g. trigger calculate_record_count): return the raw result
            return response.result
247
+
248
+ def datastore_upsert_last_line(self, resource_id:str):
249
+ """
250
+ Apply last line treatments to a resource.
251
+ """
252
+ return self._api_datastore_upsert(None, resource_id=resource_id, method=None, last_insertion=True)
253
+
254
+ def datastore_upsert(self, records:Union[dict, List[dict], pd.DataFrame], resource_id:str, *,
255
+ dry_run:bool=False, limit:int=None, offset:int=0, force:bool=None,
256
+ method:Union[UpsertChoice,str]=UpsertChoice.Upsert, apply_last_condition:bool=True,
257
+ always_last_condition:bool=None, return_df:bool=None,
258
+ data_cleaner:CkanDataCleanerABC=None, params:dict=None) -> Union[pd.DataFrame, List[dict]]:
259
+ """
260
+ Encapsulation of _api_datastore_upsert to cut the requests to a limited number of rows.
261
+
262
+ :see: _api_datastore_upsert()
263
+ :param records: records, preferably in a pandas DataFrame - they will be converted to a list of dictionaries.
264
+ :param resource_id: destination resource id
265
+ :param method: by default, set to Upsert
266
+ :param force: set to True to edit a read-only resource. If not provided, this is overridden by self.default_force
267
+ :param limit: number of records per transaction
268
+ :param offset: number of records to skip - use to restart the transfer
269
+ :param params: additional parameters
270
+ :param dry_run: set to True to abort transaction instead of committing, e.g. to check for validation or type errors
271
+ :param apply_last_condition: if True, the last upsert request applies the last insert operations (calculate_record_count and force_indexing).
272
+ :param always_last_condition: if True, each request applies the last insert operations - default is False
273
+ :return: the inserted records as a pandas DataFrame, from the server response
274
+ """
275
+ method_str = str(method)
276
+ if apply_last_condition is None:
277
+ apply_last_condition = True
278
+ if always_last_condition is None:
279
+ always_last_condition = False
280
+ mode_df = True
281
+ assert(records is not None)
282
+ if isinstance(records, dict):
283
+ records = pd.DataFrame.from_dict(records)
284
+ elif isinstance(records, list):
285
+ # records = records_to_df(records)
286
+ mode_df = False
287
+ else:
288
+ assert(isinstance(records, pd.DataFrame))
289
+ if data_cleaner is None:
290
+ data_cleaner = self.data_cleaner_upload
291
+ if data_cleaner is not None:
292
+ datastore_info = self.get_datastore_info_or_request_of_id(resource_id=resource_id, error_not_found=True)
293
+ records = data_cleaner.clean_records(records, known_fields=datastore_info.fields_dict, inplace=True)
294
+ data_cleaner.apply_new_fields_request(self, resource_id=resource_id)
295
+ if return_df is None:
296
+ return_df = mode_df
297
+ if limit is None: limit = self.params.default_limit_write
298
+ if limit is None:
299
+ # direct API call with one request
300
+ if self.params.store_last_response_debug_info:
301
+ self.debug.multi_requests_last_successful_offset = offset
302
+ return self._api_datastore_upsert(records, return_df=return_df,
303
+ method=method_str, dry_run=dry_run, resource_id=resource_id,
304
+ force=force, params=params,
305
+ last_insertion=apply_last_condition or always_last_condition)
306
+ assert_or_raise(limit > 0, InvalidParameterError("limit"))
307
+ n = len(records)
308
+ if self.params.store_last_response_debug_info:
309
+ self.debug.multi_requests_last_successful_offset = offset
310
+ requests_count = 0
311
+ last_insertion = True
312
+ df, returned_rows = None, None
313
+ if return_df:
314
+ df = None
315
+ else:
316
+ returned_rows = []
317
+ start = time.time()
318
+ current = start
319
+ timeout = False
320
+ n_cum = 0
321
+ while offset < n and requests_count < self.params.max_requests_count and not timeout:
322
+ last_insertion = offset+limit >= n
323
+ i_end_add = min(n, offset+limit)
324
+ n_add = i_end_add-1 - offset + 1
325
+ if self.params.verbose_multi_requests:
326
+ print(f"{self.identifier} Multi-requests upsert {requests_count} to add {n_add} records ...")
327
+ if mode_df:
328
+ df_upsert = records.iloc[offset:i_end_add]
329
+ else:
330
+ df_upsert = records[offset:i_end_add]
331
+ df_add = self._api_datastore_upsert(df_upsert, return_df=return_df,
332
+ method=method_str, dry_run=dry_run, resource_id=resource_id,
333
+ force=force, params=params,
334
+ last_insertion=(last_insertion and apply_last_condition) or always_last_condition)
335
+ n_cum += len(df_add)
336
+ assert_or_raise(len(df_add) == n_add, IntegrityError("Second check on response len failed in datastore_upsert")) # consistency check, in double of _api_datastore_upsert
337
+ if self.params.store_last_response_debug_info:
338
+ self.debug.multi_requests_last_successful_offset = offset
339
+ if return_df:
340
+ if df is None:
341
+ # 1st execution: pandas cannot concatenate with an empty DataFrame => use None as indicator
342
+ assert(df_add is not None)
343
+ df = df_add
344
+ else:
345
+ df = pd.concat([df, df_add], ignore_index=True)
346
+ else:
347
+ returned_rows = returned_rows + df_add
348
+ if self.params.multi_requests_time_between_requests > 0 and not last_insertion:
349
+ time.sleep(self.params.multi_requests_time_between_requests)
350
+ if not last_insertion:
351
+ assert_or_raise(n_add == limit, IntegrityError("datastore_upsert implementation is wrong"))
352
+ offset += limit
353
+ requests_count += 1
354
+ current = time.time()
355
+ timeout = current - start > self.params.multi_requests_timeout
356
+ if return_df:
357
+ if df is None:
358
+ df = pd.DataFrame() # always return a DataFrame object and not None
359
+ df.attrs["requests_count"] = requests_count
360
+ df.attrs["elapsed_time"] = current - start
361
+ df.attrs["offset"] = offset
362
+ if self.params.verbose_multi_requests:
363
+ print(f"{self.identifier} Multi-requests upsert done to add {n_cum} records done in {requests_count} requests and {round(current - start, 2)} seconds.")
364
+ if timeout:
365
+ raise TimeoutError()
366
+ if requests_count >= self.params.max_requests_count:
367
+ raise MaxRequestsCountError()
368
+ assert_or_raise(last_insertion, UnexpectedError("last_insertion should be True at last iteration"))
369
+ if mode_df:
370
+ return df
371
+ else:
372
+ return returned_rows
373
+
374
+ def datastore_insert(self, records:Union[dict, List[dict], pd.DataFrame], resource_id:str, *,
375
+ dry_run:bool=False, limit:int=None, offset:int=0, apply_last_condition:bool=True,
376
+ always_last_condition:bool=None,
377
+ data_cleaner:CkanDataCleanerABC=None, force:bool=None, params:dict=None) -> pd.DataFrame:
378
+ """
379
+ Alias function to insert data in a DataStore using datastore_upsert.
380
+
381
+ :see: _api_datastore_upsert()
382
+ :param records: records, preferably in a pandas DataFrame - they will be converted to a list of dictionaries.
383
+ :param resource_id: destination resource id
384
+ :param force: set to True to edit a read-only resource. If not provided, this is overridden by self.default_force
385
+ :param params: additional parameters
386
+ :param dry_run: set to True to abort transaction instead of committing, e.g. to check for validation or type errors
387
+ :return: the inserted records as a pandas DataFrame, from the server response
388
+ """
389
+ return self.datastore_upsert(records, resource_id, dry_run=dry_run, limit=limit, offset=offset,
390
+ method=UpsertChoice.Insert, apply_last_condition=apply_last_condition,
391
+ always_last_condition=always_last_condition, data_cleaner=data_cleaner,
392
+ force=force, params=params)
393
+
394
+ def datastore_update(self, records:Union[dict, List[dict], pd.DataFrame], resource_id:str, *,
395
+ dry_run:bool=False, limit:int=None, offset:int=0, apply_last_condition:bool=True,
396
+ always_last_condition:bool=None,
397
+ data_cleaner:CkanDataCleanerABC=None, force:bool=None, params:dict=None) -> pd.DataFrame:
398
+ """
399
+ Alias function to update data in a DataStore using datastore_upsert.
400
+ The update is performed based on the DataStore primary keys
401
+
402
+ :see: _api_datastore_upsert()
403
+ :param records: records, preferably in a pandas DataFrame - they will be converted to a list of dictionaries.
404
+ :param resource_id: destination resource id
405
+ :param force: set to True to edit a read-only resource. If not provided, this is overridden by self.default_force
406
+ :param params: additional parameters
407
+ :param dry_run: set to True to abort transaction instead of committing, e.g. to check for validation or type errors
408
+ :return: the inserted records as a pandas DataFrame, from the server response
409
+ """
410
+ return self.datastore_upsert(records, resource_id, dry_run=dry_run, limit=limit, offset=offset,
411
+ method=UpsertChoice.Update, apply_last_condition=apply_last_condition,
412
+ always_last_condition=always_last_condition, data_cleaner=data_cleaner,
413
+ force=force, params=params)
414
+
415
+
416
    ## Resource updates ------------------
    def _api_resource_patch(self, resource_id:str, *, name:str=None, format:str=None, description:str=None, title:str=None,
                            state:CkanState=None,
                            df:pd.DataFrame=None, file_path:str=None, url:str=None, files=None,
                            payload: Union[bytes, io.BufferedIOBase] = None, payload_name: str = None,
                            params:dict=None) -> CkanResourceInfo:
        """
        Call to resource_patch API. This call can be used to change the resource parameters via params (cf. API documentation)
        or to reupload the resource file into FileStore.
        The latter action replaces the current resource. If it is a DataStore, it is reset to the new contents of the file.
        The file can be transmitted either as an url, a file path or a pandas DataFrame.
        The files argument can pass through these arguments to the requests.post function.
        A call to datapusher_submit() could be required to take immediately into account the newly downloaded file.

        :see: _api_resource_create
        :see: resource_create
        :param resource_id: resource id
        :param url: url of the resource to replace resource (mutually exclusive with a file upload)
        :param params: parameters such as name, format, resource_type can be changed

        For file uploads, the following parameters are taken, by order of priority:
        See upload_prepare_requests_files_arg for an example of formatting.

        :param files: files pass through argument to the requests.post function. Use to send other data formats.
        :param payload: bytes to upload as a file
        :param payload_name: name of the payload to use (associated with the payload argument) - this determines the format recognized in CKAN viewers.
        :param file_path: path of the file to transmit (binary and text files are supported here)
        :param df: pandas DataFrame to replace resource

        :return: the updated resource info (also recorded in the local map)
        :raises ReadOnlyError: when the client is configured read-only
        """
        assert_or_raise(not self.params.read_only, ReadOnlyError())
        if params is None: params = {}
        params["id"] = resource_id
        if description is not None:
            params["description"] = description
        if title is not None:
            params["title"] = title
        if name is not None:
            params["name"] = name
        if format is not None:
            params["format"] = format
        if state is not None:
            params["state"] = str(state)
        # build the requests 'files' argument from whichever upload source was provided
        files = upload_prepare_requests_files_arg(files=files, file_path=file_path, df=df, payload=payload, payload_name=payload_name)
        if url is not None:
            params["url"] = url
            params["clear_upload"] = True
            # url replacement and file upload are mutually exclusive
            assert(files is None)
        if files is not None:
            # multipart upload: params go in the form data alongside the file
            response = self._api_action_request(f"resource_patch", method=RequestType.Post,
                                                files=files, data=params)
        else:
            response = self._api_action_request(f"resource_patch", method=RequestType.Post, json=params)
        if response.success:
            resource_info = CkanResourceInfo(response.result)
            self.map._record_resource_update(resource_info)
            return resource_info
        else:
            raise response.default_error(self)
476
+
477
+ def resource_patch(self, resource_id:str, *, name:str=None, format:str=None, description:str=None, title:str=None,
478
+ state:CkanState=None,
479
+ df:pd.DataFrame=None, file_path:str=None, url:str=None, files=None,
480
+ payload: Union[bytes, io.BufferedIOBase] = None, payload_name: str = None,
481
+ params:dict=None) -> CkanResourceInfo:
482
+ # function alias
483
+ return self._api_resource_patch(resource_id, name=name, format=format, description=description, state=state,
484
+ title=title, df=df, file_path=file_path, url=url, files=files,
485
+ payload=payload, payload_name=payload_name, params=params)
486
+
487
+ ### DataPusher submit ------------------
488
+ def _api_datapusher_submit(self, resource_id: str, *, params: dict = None) -> bool:
489
+ """
490
+ Call to API action datapusher_submit. This triggers the normally asynchronous DataPusher service for a given resource.
491
+
492
+ :param resource_id: resource id
493
+ :param params:
494
+ :return:
495
+ """
496
+ if params is None: params = {}
497
+ params["resource_id"] = resource_id
498
+ response = self._api_action_request(f"datapusher_submit", method=RequestType.Post, json=params)
499
+ if response.success:
500
+ return response.result
501
+ else:
502
+ raise response.default_error(self)
503
+
504
    def datastore_wait(self, resource_id: str, *,
                       apply_delay:bool=True, error_timeout:bool=True) -> Tuple[int, float]:
        """
        Wait until a DataStore has at least one row.
        The delay between requests to peer on the presence of the DataStore is given by the class attribute submit_delay.
        If the loop exceeds submit_timeout, an exception is raised.

        :param resource_id: resource id of the DataStore to poll
        :param apply_delay: set to False to skip waiting entirely
        :param error_timeout: option to raise an exception in case of timeout (otherwise a warning is emitted)
        :return: (number of polling iterations, elapsed seconds)
        :raises TimeoutError: on timeout, when error_timeout is True
        """
        if self.params.submit_delay <= 0 or not apply_delay:
            # waiting disabled: report zero iterations / zero elapsed time
            return 0, 0.0
        # resource_info = self.resource_show(resource_id)
        # init_timestamp = resource_info.last_modified
        # current_timestamp = init_timestamp
        if self.params.verbose_request:
            print(f"Waiting for data treatments on DataStore {resource_id}...")
        start = time.time()
        current = start
        timeout = False
        counter = 0
        df = pd.DataFrame() # empty DataFrame
        # poll with datastore_search until at least one row is returned or the timeout expires
        while not timeout and df.empty: # current_timestamp <= init_timestamp:
            time.sleep(self.params.submit_delay)
            # resource_info = self.resource_show(resource_id)
            # current_timestamp = resource_info.last_modified
            try:
                df = self.datastore_search(resource_id, limit=1, search_all=False, search_method=True)
            except DataStoreNotFoundError:
                # DataStore not created yet: keep polling
                pass
            current = time.time()
            timeout = (current - start) > self.params.submit_timeout
            counter += 1
        if timeout:
            if error_timeout:
                raise TimeoutError("datastore_wait")
            else:
                msg = str(TimeoutError("datastore_wait"))
                warn(msg)
        if self.params.verbose_request:
            print(f"Resource updated after {current - start} seconds ({counter} iterations)")
        return counter, current - start
548
+
549
+ def datastore_submit(self, resource_id: str,
550
+ *, apply_delay:bool=True, error_timeout:bool=True,
551
+ params: dict = None) -> bool:
552
+ """
553
+ Submit file to re-initiate DataStore, using the preferred method.
554
+ Current method is datapusher_submit.
555
+ This encapsulation includes a call to datastore_wait.
556
+
557
+ :param resource_id:
558
+ :param apply_delay: Keep true to wait until the datastore is ready (a datastore_search query is performed as a test)
559
+ :param params:
560
+ :return:
561
+ """
562
+ result = self._api_datapusher_submit(resource_id, params=params)
563
+ self.datastore_wait(resource_id, apply_delay=apply_delay, error_timeout=error_timeout)
564
+ return result
565
+
566
+ # def datapusher_submit_insert(self, resource_id: str, *, params: dict = None) -> dict:
567
+ # # idea: modify datapusher such as it would upsert data instead of replacing the entire datastore
568
+ # raise NotImplementedError()
569
+ # if params is None: params = {}
570
+ # params["insert"] = True
571
+ # return self._api_datapusher_submit(resource_id, params)
572
+
573
+
574
+
575
+
576
+
577
+
578
+
579
+