ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,934 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+
5
+ """
6
+ from typing import List, Dict, Tuple, Generator, Any, Union, OrderedDict
7
+ import io
8
+ import json
9
+ from warnings import warn
10
+
11
+ import numpy as np
12
+ import requests
13
+ from requests.auth import AuthBase
14
+ import pandas as pd
15
+
16
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
17
+ from ckanapi_harvesters.auxiliary.list_records import ListRecords, records_to_df
18
+ from ckanapi_harvesters.auxiliary.proxy_config import ProxyConfig
19
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanResourceInfo, CkanAliasInfo, CkanField
20
+ from ckanapi_harvesters.auxiliary.ckan_map import CkanMap
21
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import bytes_to_megabytes
22
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import assert_or_raise, CkanIdFieldTreatment
23
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import datastore_id_col
24
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import RequestType
25
+ from ckanapi_harvesters.auxiliary.ckan_action import CkanActionResponse, CkanNotFoundError, CkanSqlCapabilityError
26
+ from ckanapi_harvesters.auxiliary.ckan_errors import (IntegrityError, CkanServerError, CkanArgumentError, SearchAllNoCountsError,
27
+ DataStoreNotFoundError, RequestError)
28
+ from ckanapi_harvesters.ckan_api.ckan_api_params import CkanApiParamsBasic
29
+ from ckanapi_harvesters.auxiliary.ckan_api_key import CkanApiKey
30
+ from ckanapi_harvesters.ckan_api.ckan_api_0_base import ckan_request_proxy_default_auth_if_ckan
31
+
32
+ from ckanapi_harvesters.ckan_api.ckan_api_1_map import CkanApiMap
33
+
34
# Keyword arguments added to the pandas.read_csv calls which parse downloaded CSV/TSV data.
# keep_default_na=False: pandas does not add its default list of NaN markers when parsing.
df_download_read_csv_kwargs = {"keep_default_na": False}

# Correspondence between CKAN DataStore field types (keys) and pandas dtype names (values).
ckan_dtype_mapper = dict([
    ("text", "str"),
    ("numeric", "float"),
    ("timestamp", "datetime64"),
    ("int", "int"),
    ("name", "str"),
    ("oid", "str"),  # to confirm
    ("bool", "object"),  # enable None values but if they are present, booleans are converted to str...
    ("json", "object"),
])
46
+
47
class CkanApiReadOnlyParams(CkanApiParamsBasic):
    """
    Connection/behavior parameters of the read-only CKAN API client.

    Adds to CkanApiParamsBasic the treatment applied to the DataStore id column
    of the DataFrames which are downloaded.
    """
    map_all_aliases:bool = True
    # class-level default, copied to the attribute df_download_id_field_treatment of each new instance
    default_df_download_id_field_treatment: CkanIdFieldTreatment = CkanIdFieldTreatment.SetIndex

    def __init__(self, *, proxies:Union[str,dict,ProxyConfig]=None,
                 ckan_headers:dict=None, http_headers:dict=None):
        """
        :param proxies: forwarded to the parent class constructor
        :param ckan_headers: forwarded to the parent class constructor
        :param http_headers: forwarded to the parent class constructor
        """
        super().__init__(proxies=proxies, ckan_headers=ckan_headers, http_headers=http_headers)
        self.df_download_id_field_treatment: CkanIdFieldTreatment = self.default_df_download_id_field_treatment

    def copy(self, new_identifier:str=None, *, dest=None):
        """
        Copy the parameters to another object.

        :param new_identifier: not used in this method
            NOTE(review): it is not forwarded to the parent copy method - confirm this is intended
        :param dest: object receiving the copy. A new CkanApiReadOnlyParams is created if None.
        :return: the object which received the copy
        """
        target = CkanApiReadOnlyParams() if dest is None else dest
        super().copy(dest=target)
        target.df_download_id_field_treatment = self.df_download_id_field_treatment
        return target
62
+
63
+
64
+ ## Main class ------------------
65
+ class CkanApiReadOnly(CkanApiMap):
66
+ """
67
+ CKAN Database API interface to CKAN server with helper functions using pandas DataFrames.
68
+ This class implements requests to read data from the CKAN server resources / DataStores.
69
+ """
70
+
71
def __init__(self, url:str=None, *, proxies:Union[str,dict,ProxyConfig]=None,
             apikey:Union[str,CkanApiKey]=None, apikey_file:str=None,
             owner_org:str=None, params:CkanApiReadOnlyParams=None,
             map:CkanMap=None,
             identifier=None):
    """
    Initialize the read-only CKAN API client.

    The connection arguments are forwarded to the parent class constructor.
    When no parameters object is given, one is created with default values.

    :param url: url of the CKAN server
    :param proxies: proxies to use for requests. If given, it also replaces the proxies of the parameters object.
    :param apikey: way to provide the API key directly (optional)
    :param apikey_file: path to a file containing a valid API key in the first line of text (optional)
    :param owner_org: name of the organization to limit package_search (optional)
    :param params: other connection/behavior parameters
    :param map: map of known resources
    :param identifier: identifier of the ckan client
    """
    super().__init__(url=url, proxies=proxies, apikey=apikey, apikey_file=apikey_file,
                     owner_org=owner_org, map=map, identifier=identifier)
    client_params = CkanApiReadOnlyParams() if params is None else params
    if proxies is not None:
        # an explicit proxies argument takes precedence over the value held by the parameters object
        client_params.proxies = proxies
    self.params: CkanApiReadOnlyParams = client_params
95
+
96
def _rx_records_df_clean(self, df: pd.DataFrame) -> None:
    """
    Apply the configured treatment of the DataStore id column to a received DataFrame.
    The DataFrame is modified in place. Nothing is done if it is empty or if it has no id column.

    :param df: DataFrame obtained from a DataStore request
    :return: None
    """
    if len(df) == 0 or datastore_id_col not in df.columns:
        return
    id_treatment = self.params.df_download_id_field_treatment
    if id_treatment == CkanIdFieldTreatment.SetIndex:
        # the id column becomes the index and is also kept as a column (drop=False)
        # verify_integrity=True: pandas raises an error if the ids are not unique
        df.set_index(datastore_id_col, drop=False, inplace=True, verify_integrity=True)
    elif id_treatment == CkanIdFieldTreatment.Remove:
        # the id column is removed from the DataFrame
        df.pop(datastore_id_col)
110
+
111
+ @staticmethod
112
+ def read_fields_type_dict(fields_list_dict: List[dict]) -> OrderedDict:
113
+ return OrderedDict([(field_dict["id"], field_dict["type"]) for field_dict in fields_list_dict])
114
+
115
+ @staticmethod
116
+ def read_fields_df_args(fields_type_dict: OrderedDict) -> dict:
117
+ if fields_type_dict is None:
118
+ return {}
119
+ # fields_dtype_dict = fields_type_dict.copy()
120
+ # for key, ckan_type in fields_type_dict.items():
121
+ # if ckan_type in ckan_dtype_mapper:
122
+ # fields_dtype_dict[key] = ckan_dtype_mapper[ckan_type]
123
+ # else:
124
+ # fields_dtype_dict[key] = "object"
125
+ # return dict(names=list(fields_dtype_dict.keys()), dtype=fields_dtype_dict)
126
+ return dict(names=list(fields_type_dict.keys()))
127
+
128
@staticmethod
def from_dict_df_args(fields_type_dict: OrderedDict) -> dict:
    """
    Build the keyword arguments for pandas.DataFrame.from_dict from the field types.

    These are the arguments of read_fields_df_args without the "names" entry.

    :param fields_type_dict: ordered dictionary {field id: field type}, or None
    :return: dictionary of keyword arguments
    """
    df_args_dict = CkanApiReadOnly.read_fields_df_args(fields_type_dict)
    # "names" is absent when fields_type_dict is None: a default value avoids a KeyError in this case
    df_args_dict.pop("names", None)
    return df_args_dict
133
+
134
+ ## Data queries ------------------
135
+ ### Dump method ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
136
+ # NB: dump methods are not exposed to the user by default. Only datastore_search and resource_download methods are exposed.
137
def _api_datastore_dump_raw(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                            sort:str=None, limit:int=None, offset:int=0, format:str=None, bom:bool=None, params:dict=None,
                            compute_len:bool=False) -> requests.Response:
    """
    URL call to datastore/dump URL. Dumps successive lines in the DataStore.

    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional). A dict is serialized to JSON, a str is sent as is (not recommended).
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param sort: Argument to sort results (optional)
    :param limit: Limit the number of records to return. If None, self.params.default_limit_read is used.
    :param offset: Offset in the returned records (None is treated as 0)
    :param format: The return format in the returned response (default=csv, tsv, json, xml) (optional)
    :param bom: value of the bom argument of the request. If None and a format is given, it is True except for the json and xml formats.
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
        This dictionary is not modified.
    :param compute_len: not supported by this method: an error is raised if True
    :return: raw response
    :raises SearchAllNoCountsError: if compute_len is True
    :raises DataStoreNotFoundError: if the server answers 404 with the text "DataStore resource not found"
    :raises CkanServerError: for any other response with a status code different from 200
    """
    if compute_len:
        raise SearchAllNoCountsError("datastore_dump", f"format={format}")
    # work on a copy so that the dictionary provided by the caller is left untouched
    params = {} if params is None else dict(params)
    params["offset"] = 0 if offset is None else offset
    if limit is None:
        limit = self.params.default_limit_read
    if limit is not None:
        params["limit"] = limit
    if filters is not None:
        # a str is sent as is (not recommended), other values are serialized to JSON
        params["filters"] = filters if isinstance(filters, str) else json.dumps(filters)
    if q is not None:
        params["q"] = q
    if fields is not None:
        params["fields"] = fields
    if sort is not None:
        params["sort"] = sort
    if format is not None:
        format = format.lower()
        params["format"] = format
        if bom is None:
            # default value: True except for the json and xml formats
            bom = format not in {"json", "xml"}
    if bom is not None:
        params["bom"] = bom
    response = self._url_request(f"datastore/dump/{resource_id}", method=RequestType.Get, params=params)
    if response.status_code == 200:
        return response
    elif response.status_code == 404 and "DataStore resource not found" in response.text:
        raise DataStoreNotFoundError(resource_id, response.content.decode())
    else:
        raise CkanServerError(self, response, response.content.decode())
189
+
190
def _api_datastore_dump_df(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                           sort:str=None, limit:int=None, offset:int=0, format:str=None, bom:bool=None, params:dict=None) -> pd.DataFrame:
    """
    Convert output of _api_datastore_dump_raw to pandas DataFrame.

    The arguments are forwarded to _api_datastore_dump_raw.

    :param format: format requested to the server and used to parse the response: csv (default), tsv, json or xml
    :return: DataFrame of the received records, after the treatment of the id column (_rx_records_df_clean).
        For the json format, the field types are stored in the attribute attrs["fields"].
    :raises NotImplementedError: if the format is not one of csv, tsv, json, xml
    """
    response = self._api_datastore_dump_raw(resource_id=resource_id, filters=filters, q=q, fields=fields,
                                            sort=sort, limit=limit, offset=offset, format=format, bom=bom,
                                            params=params, compute_len=False)
    if format is not None:
        format = format.lower()
    # utf-8-sig decodes UTF-8 and removes a leading byte order mark, if present (see bom argument)
    buffer = io.StringIO(response.content.decode("utf-8-sig"))
    if format is None or format == "csv":
        response_df = pd.read_csv(buffer, **df_download_read_csv_kwargs)
    elif format == "tsv":
        response_df = pd.read_csv(buffer, sep="\t", **df_download_read_csv_kwargs)  # not tested
    elif format == "json":
        response_dict = json.load(buffer)
        fields_type_dict = CkanApiReadOnly.read_fields_type_dict(response_dict["fields"])
        df_args = CkanApiReadOnly.read_fields_df_args(fields_type_dict)
        response_df = records_to_df(response_dict["records"], df_args)
        response_df.attrs["fields"] = fields_type_dict
    elif format == "xml":
        # partially tested. The etree parser is used because the default parser requires the installation of lxml
        response_df = pd.read_xml(buffer, parser="etree")
    else:
        raise NotImplementedError(f"Unsupported datastore dump format: {format}")
    self._rx_records_df_clean(response_df)
    return response_df
217
+
218
def _api_datastore_dump_all(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                            sort:str=None, limit:int=None, offset:int=0, format:str=None, bom:bool=None,
                            params:dict=None, search_all:bool=True, return_df:bool=True) \
        -> Union[pd.DataFrame, requests.Response]:
    """
    Dump a DataStore, either as a DataFrame or as the raw response of a single request.

    With return_df, the requests are delegated to _request_all_results_df with _api_datastore_dump_df.
    Without return_df, a single call to _api_datastore_dump_raw is made. In this mode, search_all is
    refused because the number of records of an unparsed response is not known.

    :see: _api_datastore_dump_df(), _api_datastore_dump_raw()
    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional)
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param sort: Argument to sort results (optional)
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param format: The return format in the returned response (default=csv, tsv, json, xml) (optional)
    :param bom: value of the bom argument of the request (optional)
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
    :param search_all: if False, only the first request is operated
    :param return_df: True to return a DataFrame, False to return the raw response
    :return: DataFrame or raw response
    :raises SearchAllNoCountsError: if return_df is False and search_all is True
    """
    query_kwargs = dict(resource_id=resource_id, filters=filters, q=q, fields=fields,
                        sort=sort, format=format, bom=bom)
    if return_df:
        return self._request_all_results_df(api_fun=self._api_datastore_dump_df, params=params,
                                            limit=limit, offset=offset, search_all=search_all,
                                            **query_kwargs)
    if search_all:
        # the responses would have to be parsed to count the records and to concatenate them
        raise SearchAllNoCountsError("datastore_dump")
    # search_all is False at this point
    return self._api_datastore_dump_raw(limit=limit, offset=offset, params=params,
                                        compute_len=search_all, **query_kwargs)
250
+
251
def _api_datastore_dump_all_generator(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                                      sort:str=None, limit:int=None, offset:int=0, format:str=None, bom:bool=None,
                                      params:dict=None, search_all:bool=True, return_df:bool=True) \
        -> Union[Generator[pd.DataFrame, Any, None], Generator[requests.Response, Any, None]]:
    """
    Generator version of the DataStore dump.
    The requests are delegated to _request_all_results_generator, with _api_datastore_dump_df
    if return_df is True and with _api_datastore_dump_raw otherwise.

    :see: _api_datastore_dump_df(), _api_datastore_dump_raw()
    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional)
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param sort: Argument to sort results (optional)
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param format: The return format in the returned response (default=csv, tsv, json, xml) (optional)
    :param bom: value of the bom argument of the request (optional)
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
    :param search_all: if False, only the first request is operated
    :param return_df: True to obtain DataFrames, False to obtain raw responses
    :return: the generator returned by _request_all_results_generator
    """
    shared_kwargs = dict(params=params, limit=limit, offset=offset, search_all=search_all,
                         resource_id=resource_id, filters=filters, q=q, fields=fields,
                         sort=sort, format=format, bom=bom)
    if not return_df:
        # raw mode: compute_len is requested to _api_datastore_dump_raw when search_all is True
        return self._request_all_results_generator(api_fun=self._api_datastore_dump_raw,
                                                   compute_len=search_all, **shared_kwargs)
    return self._request_all_results_generator(api_fun=self._api_datastore_dump_df, **shared_kwargs)
278
+
279
+
280
+ ### Search method ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
281
def _api_datastore_search_raw(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                              distinct:bool=None, sort:str=None, limit:int=None, offset:int=0, format:str=None,
                              params:dict=None, compute_len:bool=False) -> CkanActionResponse:
    """
    API call to datastore_search. Performs queries on the DataStore.

    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional). A dict is serialized to JSON, a str is sent as is (not recommended).
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param distinct: return only distinct rows (optional, default: false) e.g. to return distinct ids: fields="id", distinct=True
    :param sort: Argument to sort results e.g. sort="index, quantity desc" or sort="index asc"
    :param limit: Limit the number of records to return. If None, self.params.default_limit_read is used.
    :param offset: Offset in the returned records (None is treated as 0)
    :param format: The return format in the returned response (default=objects, csv, tsv, lists) (optional)
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
        This dictionary is not modified.
    :param compute_len: if True, an error is raised when the number of received records cannot be counted (formats other than objects and lists)
    :return: response of the API call. For the objects and lists formats, the attribute len is set to the number of received records.
    :raises SearchAllNoCountsError: if compute_len is True and the format is neither objects nor lists
    :raises DataStoreNotFoundError: if the server answers 404 with an error of type "Not Found Error"
    """
    # work on a copy so that the dictionary provided by the caller is left untouched
    params = {} if params is None else dict(params)
    params["offset"] = 0 if offset is None else offset
    if limit is None:
        limit = self.params.default_limit_read
    if limit is not None:
        params["limit"] = limit
    params["resource_id"] = resource_id
    if filters is not None:
        # a str is sent as is (not recommended), other values are serialized to JSON
        params["filters"] = filters if isinstance(filters, str) else json.dumps(filters)
    if q is not None:
        params["q"] = q
    if fields is not None:
        params["fields"] = fields
    if distinct is not None:
        params["distinct"] = distinct
    if sort is not None:
        params["sort"] = sort
    if format is not None:
        format = format.lower()
        params["records_format"] = format
    response = self._api_action_request("datastore_search", method=RequestType.Get, params=params)
    if response.success:
        if response.dry_run:
            return response
        elif format is None or format in ["objects", "lists"]:
            response.len = len(response.result["records"])
        elif compute_len:
            # the records are received as text (csv, tsv): they cannot be counted without parsing
            raise SearchAllNoCountsError("datastore_search", f"format={format}")
        return response
    elif (response.status_code == 404 and response.success_json_loads
          and response.error_message["__type"] == "Not Found Error"):
        # success_json_loads is checked first, as in _api_datastore_search_sql_raw, before indexing the error message
        raise DataStoreNotFoundError(resource_id, response.error_message)
    else:
        raise response.default_error(self)
339
+
340
def _api_datastore_search_df(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                             distinct:bool=None, sort:str=None, limit:int=None, offset:int=0, format:str=None, params:dict=None) -> pd.DataFrame:
    """
    Call _api_datastore_search_raw and convert the received records to a pandas DataFrame.

    The arguments are forwarded to _api_datastore_search_raw.
    The returned DataFrame carries the following attributes (attrs): "result" (result of the API call
    without the records), "fields" (field types), "total", "total_was_estimated" and "limit".

    :return: DataFrame of the received records, or an empty DataFrame in dry run mode
    :raises NotImplementedError: if the format is not one of objects, lists, csv, tsv
    """
    response = self._api_datastore_search_raw(resource_id=resource_id, filters=filters, q=q, fields=fields, format=format,
                                              distinct=distinct, sort=sort, limit=limit, offset=offset,
                                              params=params, compute_len=False)
    if response.dry_run:
        return pd.DataFrame()
    records_format = None if format is None else format.lower()
    fields_type_dict = CkanApiReadOnly.read_fields_type_dict(response.result["fields"])
    if records_format is None or records_format == "objects":
        # records received as a list of dictionaries
        from_dict_kwargs = CkanApiReadOnly.from_dict_df_args(fields_type_dict)
        response_df = pd.DataFrame.from_dict(response.result["records"], **from_dict_kwargs)
    else:
        df_args = CkanApiReadOnly.read_fields_df_args(fields_type_dict)
        if records_format == "lists":
            response_df = records_to_df(response.result["records"], df_args)
        else:
            # records received as text
            text_buffer = io.StringIO(response.result["records"])
            if records_format == "csv":
                response_df = pd.read_csv(text_buffer, **df_args, **df_download_read_csv_kwargs)
            elif records_format == "tsv":
                response_df = pd.read_csv(text_buffer, sep='\t', **df_args, **df_download_read_csv_kwargs)
            else:
                raise NotImplementedError()
    self._rx_records_df_clean(response_df)
    # the records are removed from the result before it is attached to the DataFrame
    response.result.pop("records")
    response_df.attrs["result"] = response.result
    response_df.attrs["fields"] = fields_type_dict
    for attr_name in ("total", "total_was_estimated", "limit"):
        response_df.attrs[attr_name] = response.result[attr_name]
    return response_df
376
+
377
def _api_datastore_search_all(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                              distinct:bool=None, sort:str=None, limit:int=None, offset:int=0, format:str=None,
                              search_all:bool=True, params:dict=None, return_df:bool=True, compute_len:bool=False) \
        -> Union[pd.DataFrame, Tuple[ListRecords, OrderedDict], Any]:
    """
    Successive calls to datastore_search, with aggregation of the results.

    With return_df, the requests are delegated to _request_all_results_df with _api_datastore_search_df.
    Otherwise, they are delegated to _request_all_results_list with _api_datastore_search_raw and the
    records of the responses are aggregated according to the format.

    :see: _api_datastore_search_df(), _api_datastore_search_raw()
    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional)
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param distinct: return only distinct rows (optional, default: false) e.g. to return distinct ids: fields="id", distinct=True
    :param sort: Argument to sort results e.g. sort="index, quantity desc" or sort="index asc"
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param format: The return format in the returned response (default=objects, csv, tsv, lists) (optional)
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
    :param search_all: if False, only the first request is operated
    :param return_df: True to return a DataFrame, False to return the aggregated records and the field types
    :param compute_len: forwarded to _api_datastore_search_raw (only used if return_df is False)
    :return: if return_df, a DataFrame. Otherwise a tuple (records, field types), where the records are
        a ListRecords (objects format), a list (lists format) or a str joined with line feeds (other formats).
        In dry run mode, the tuple ([], {}) is returned. The field types are None if no response was received.
    :raises IntegrityError: if the total field is not the same in all the responses (return_df mode)
    """
    if return_df:
        df = self._request_all_results_df(api_fun=self._api_datastore_search_df, params=params, limit=limit, offset=offset,
                                          search_all=search_all, resource_id=resource_id, filters=filters, q=q, fields=fields, distinct=distinct, sort=sort, format=format)
        # these attributes are indexed per request here: the value of the first request is kept
        if "fields" in df.attrs:
            df.attrs["fields"] = df.attrs["fields"][0]
        if "total" in df.attrs:
            assert_or_raise(np.all(np.array(df.attrs["total"]) == df.attrs["total"][0]), IntegrityError("total field varied in the responses"))
            df.attrs["total"] = df.attrs["total"][0]
        return df
    responses = self._request_all_results_list(api_fun=self._api_datastore_search_raw, params=params, limit=limit, offset=offset,
                                               search_all=search_all, resource_id=resource_id, filters=filters, q=q, fields=fields, distinct=distinct, sort=sort, format=format, compute_len=compute_len)
    # aggregate results, depending on the format
    if self.params.dry_run:
        return [], {}
    if format is not None:
        format = format.lower()
    if len(responses) > 0:
        # the field types are read from the first response
        fields_type_dict = CkanApiReadOnly.read_fields_type_dict(responses[0].result["fields"])
    else:
        fields_type_dict = None
    if format is None or format == "objects" or format == "lists":
        # flatten the lists of records in one pass (linear in the number of records)
        all_records = [record for response in responses for record in response.result["records"]]
        if format == "lists":
            return all_records, fields_type_dict
        return ListRecords(all_records), fields_type_dict
    # text formats: the records of each response are a str
    return "\n".join([response.result["records"] for response in responses]), fields_type_dict
429
+
430
def _api_datastore_search_all_generator(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                                        distinct:bool=None, sort:str=None, limit:int=None, offset:int=0,
                                        format:str=None, search_all:bool=True, params:dict=None, return_df:bool=True) \
        -> Union[Generator[pd.DataFrame, Any, None], Generator[CkanActionResponse, Any, None]]:
    """
    Generator version of the successive calls to datastore_search.
    The requests are delegated to _request_all_results_generator, with _api_datastore_search_df
    if return_df is True and with _api_datastore_search_raw otherwise.

    :see: _api_datastore_search_df(), _api_datastore_search_raw()
    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional)
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param distinct: return only distinct rows (optional, default: false) e.g. to return distinct ids: fields="id", distinct=True
    :param sort: Argument to sort results e.g. sort="index, quantity desc" or sort="index asc"
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param format: The return format in the returned response (default=objects, csv, tsv, lists) (optional)
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
    :param search_all: if False, only the first request is operated
    :param return_df: True to obtain DataFrames, False to obtain API responses
    :return: the generator returned by _request_all_results_generator
    """
    shared_kwargs = dict(params=params, limit=limit, offset=offset, search_all=search_all,
                         resource_id=resource_id, filters=filters, q=q, fields=fields,
                         distinct=distinct, sort=sort, format=format)
    if not return_df:
        return self._request_all_results_generator(api_fun=self._api_datastore_search_raw,
                                                   compute_len=search_all, **shared_kwargs)
    # NOTE(review): compute_len=True is given here although the signature of _api_datastore_search_df
    # has no compute_len parameter - confirm how _request_all_results_generator handles this argument
    return self._request_all_results_generator(api_fun=self._api_datastore_search_df,
                                               compute_len=True, **shared_kwargs)
461
+
462
+
463
+ ### search_sql method ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
464
def _api_datastore_search_sql_raw(self, sql:str, *, params:dict=None, limit:int=None, offset:int=0) -> CkanActionResponse:
    """
    API call to datastore_search_sql. Performs SQL queries on the DataStore. These queries can be more complex than
    with datastore_search. The DataStores are referenced by their resource_id, surrounded by quotes. The field names
    are referred by their name in upper case, surrounded by quotes.
    __NB__: This action is not available when ckan.datastore.sqlsearch.enabled is set to false

    :param sql: SQL query e.g. f'SELECT * FROM "{resource_id}" WHERE "USER_ID" < 0'
    :param limit: Limit the number of records to return. If None, self.params.default_limit_read is used.
    :param offset: Offset in the returned records (None is treated as 0)
    :param params: Additional parameters of the request (optional). This dictionary is not modified.
    :return: response of the API call
    :raises CkanSqlCapabilityError: if the server answers 400 because the action datastore_search_sql is not known
    :raises CkanNotFoundError: if the server answers 404 with an error of type "Not Found Error"
    """
    # work on a copy so that the dictionary provided by the caller is left untouched
    params = {} if params is None else dict(params)
    params["sql"] = sql
    params["offset"] = 0 if offset is None else offset
    if limit is None:
        limit = self.params.default_limit_read
    if limit is not None:
        params["limit"] = limit
    response = self._api_action_request("datastore_search_sql", method=RequestType.Post, params=params)
    if response.success:
        return response
    elif response.status_code == 400 and response.success_json_loads and response.response.text == '"Bad request - Action name not known: datastore_search_sql"':
        raise CkanSqlCapabilityError(self, response)
    elif response.status_code == 404 and response.success_json_loads and response.error_message["__type"] == "Not Found Error":
        raise CkanNotFoundError(self, "SQL", response)
    else:
        raise response.default_error(self)
496
+
497
def _api_datastore_search_sql_df(self, sql:str, *, params:dict=None, limit:int=None, offset:int=0) -> pd.DataFrame:
    """
    Run _api_datastore_search_sql_raw and return the records as a pandas DataFrame.

    The remaining response content is attached to the DataFrame attributes:
    attrs["result"] (the result without its records), attrs["fields"] (fields type dictionary)
    and attrs["limit"]. No "total" attribute is set by this function.

    :param sql: SQL query, forwarded to _api_datastore_search_sql_raw
    :param params: forwarded to _api_datastore_search_sql_raw
    :param limit: forwarded to _api_datastore_search_sql_raw
    :param offset: forwarded to _api_datastore_search_sql_raw
    :return: DataFrame of the records
    """
    response = self._api_datastore_search_sql_raw(sql=sql, params=params, limit=limit, offset=offset)
    result = response.result
    fields_types = CkanApiReadOnly.read_fields_type_dict(result["fields"])
    frame_kwargs = CkanApiReadOnly.from_dict_df_args(fields_types)
    frame = pd.DataFrame.from_dict(result["records"], **frame_kwargs)
    # the records now live in the DataFrame: do not keep them a second time in the attributes
    result.pop("records")
    frame.attrs["result"] = result
    frame.attrs["fields"] = fields_types
    frame.attrs["limit"] = result["limit"]
    self._rx_records_df_clean(frame)
    return frame
513
+
514
+ def _api_datastore_search_sql_all(self, sql:str, *, params:dict=None,
515
+ search_all:bool=True, limit:int=None, offset:int=0, return_df:bool=True) \
516
+ -> Union[pd.DataFrame, Tuple[ListRecords, dict]]:
517
+ """
518
+ Successive calls to _api_datastore_search_sql until an empty list is received.
519
+
520
+ :see: _api_datastore_search_sql()
521
+ :param sql: SQL query e.g. f'SELECT * IN "{resource_id}" WHERE "USER_ID" < 0'
522
+ :param limit: Limit the number of records to return
523
+ :param offset: Offset in the returned records
524
+ :param params: N/A
525
+ :param search_all: if False, only the first request is operated
526
+ :return:
527
+ """
528
+ if return_df:
529
+ df = self._request_all_results_df(api_fun=self._api_datastore_search_sql_df, params=params,
530
+ limit=limit, offset=offset, search_all=search_all, sql=sql)
531
+ if "fields" in df.attrs.keys():
532
+ df.attrs["fields"] = df.attrs["fields"][0]
533
+ # if "total" in df.attrs.keys():
534
+ # assert_or_raise(np.all(np.array(df.attrs["total"]) == df.attrs["total"][0]), IntegrityError("total field varied in the responses"))
535
+ # df.attrs["total"] = df.attrs["total"][0]
536
+ return df
537
+ else:
538
+ responses = self._request_all_results_list(api_fun=self._api_datastore_search_sql_raw, params=params,
539
+ limit=limit, offset=offset, search_all=search_all, sql=sql)
540
+ # TODO: test
541
+ if len(responses) > 0:
542
+ response = responses[0]
543
+ fields_type_dict = CkanApiReadOnly.read_fields_type_dict(response.result["fields"])
544
+ else:
545
+ fields_type_dict = None
546
+ return ListRecords(sum([response.result["records"] for response in responses], [])), fields_type_dict
547
+
548
+ def _api_datastore_search_sql_all_generator(self, sql:str, *, params:dict=None,
549
+ search_all:bool=True, limit:int=None, offset:int=0, return_df:bool=True) \
550
+ -> Union[Generator[pd.DataFrame, Any, None], Generator[CkanActionResponse, Any, None]]:
551
+ """
552
+ Successive calls to _api_datastore_search_sql until an empty list is received.
553
+ Generator implementation which yields one DataFrame per request.
554
+
555
+ :see: _api_datastore_search_sql()
556
+ :param sql: SQL query e.g. f'SELECT * IN "{resource_id}" WHERE "USER_ID" < 0'
557
+ :param limit: Limit the number of records to return
558
+ :param offset: Offset in the returned records
559
+ :param params: N/A
560
+ :param search_all: if False, only the first request is operated
561
+ :return:
562
+ """
563
+ if return_df:
564
+ return self._request_all_results_generator(api_fun=self._api_datastore_search_sql_df, params=params,
565
+ limit=limit, offset=offset, search_all=search_all, sql=sql)
566
+ else:
567
+ return self._request_all_results_generator(api_fun=self._api_datastore_search_sql_raw, params=params,
568
+ limit=limit, offset=offset, search_all=search_all, sql=sql)
569
+
570
+
571
+ ## Function aliases to limit the entry-points for the user -------------------------------------------------------
572
def datastore_search(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                     distinct:bool=None, sort:str=None, limit:int=None, offset:int=0, params:dict=None,
                     search_all:bool=False, search_method:bool=True, format:str=None, return_df:bool=True) \
        -> Union[pd.DataFrame, ListRecords, Any, List[CkanActionResponse]]:
    """
    Preferred entry-point for a DataStore read request.
    Uses the API datastore_search (or the DataStore dump if search_method is False).

    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional)
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param distinct: return only distinct rows (optional, default: false) e.g. to return distinct ids: fields="id", distinct=True.
        Must be left to None when search_method is False.
    :param sort: Argument to sort results e.g. sort="index, quantity desc" or sort="index asc"
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
    :param search_all: Option to renew the request until there are no more records.
    :param search_method: API method selection (True=datastore_search, False=datastore_dump)
    :param format: The return format in the returned response (optional). If None and return_df is True, "csv" is requested.
    :param return_df: True to parse the result as a DataFrame
    :return: the output of _api_datastore_search_all or _api_datastore_dump_all
    """
    if search_method:
        if return_df and format is None:
            format = "csv"
        return self._api_datastore_search_all(resource_id, filters=filters, q=q, fields=fields, distinct=distinct, sort=sort,
                                              limit=limit, offset=offset, format=format, params=params, search_all=search_all, return_df=return_df)
    assert_or_raise(distinct is None, CkanArgumentError("DataStore dump", "distinct"))
    # bom must always be bound: it used to be assigned only when a CSV DataFrame was requested,
    # which raised UnboundLocalError for return_df=False or an explicit format.
    bom = False
    if return_df and format is None:
        format, bom = "csv", True
    return self._api_datastore_dump_all(resource_id, filters=filters, q=q, fields=fields, sort=sort,
                                        limit=limit, offset=offset, format=format, bom=bom, params=params, search_all=search_all, return_df=return_df)
602
+
603
def datastore_dump(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                   distinct:bool=None, sort:str=None, limit:int=None, offset:int=0, params:dict=None,
                   search_all:bool=True, search_method:bool=True, format:str=None, return_df:bool=True) \
        -> Union[pd.DataFrame, ListRecords, Any, List[CkanActionResponse]]:
    """
    Same as datastore_search, except that search_all is True by default.
    All the arguments are forwarded unchanged to datastore_search.

    :see: datastore_search()
    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional)
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param distinct: return only distinct rows (optional, default: false) e.g. to return distinct ids: fields="id", distinct=True
    :param sort: Argument to sort results e.g. sort="index, quantity desc" or sort="index asc"
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
    :param search_all: Option to renew the request until there are no more records.
    :param search_method: API method selection (True=datastore_search, False=datastore_dump)
    :param format: forwarded to datastore_search
    :param return_df: forwarded to datastore_search
    :return: the output of datastore_search
    """
    forwarded = dict(filters=filters, q=q, fields=fields, distinct=distinct, sort=sort,
                     limit=limit, offset=offset, params=params,
                     search_all=search_all, search_method=search_method,
                     format=format, return_df=return_df)
    return self.datastore_search(resource_id, **forwarded)
628
+
629
def datastore_search_generator(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                               distinct:bool=None, sort:str=None, limit:int=None, offset:int=0, params:dict=None,
                               search_all:bool=False, search_method:bool=True, format:str=None, return_df:bool=True) \
        -> Union[Generator[pd.DataFrame, Any, None], Generator[CkanActionResponse, Any, None], Generator[requests.Response, Any, None]]:
    """
    Preferred entry-point for a DataStore read request, generator version.
    Uses the API datastore_search (or the DataStore dump if search_method is False).

    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional)
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param distinct: return only distinct rows (optional, default: false) e.g. to return distinct ids: fields="id", distinct=True.
        Must be left to None when search_method is False.
    :param sort: Argument to sort results e.g. sort="index, quantity desc" or sort="index asc"
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
    :param search_all: Option to renew the request until there are no more records.
    :param search_method: API method selection (True=datastore_search, False=datastore_dump)
    :param format: The return format in the returned response (optional). If None and return_df is True, "csv" is requested.
    :param return_df: True to parse the results as DataFrames
    :return: the output of _api_datastore_search_all_generator or _api_datastore_dump_all_generator
    """
    if search_method:
        if return_df and format is None:
            format = "csv"
        return self._api_datastore_search_all_generator(resource_id, filters=filters, q=q, fields=fields, distinct=distinct, sort=sort,
                                                        limit=limit, offset=offset, format=format, params=params, search_all=search_all, return_df=return_df)
    assert_or_raise(distinct is None, CkanArgumentError("DataStore dump", "distinct"))
    # bom must always be bound: it used to be assigned only when a CSV DataFrame was requested,
    # which raised UnboundLocalError for return_df=False or an explicit format.
    bom = False
    if return_df and format is None:
        format, bom = "csv", True
    return self._api_datastore_dump_all_generator(resource_id, filters=filters, q=q, fields=fields, sort=sort,
                                                  limit=limit, offset=offset, format=format, bom=bom, params=params, search_all=search_all, return_df=return_df)
659
+
660
def datastore_search_cursor(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                            distinct:bool=None, sort:str=None, limit:int=None, offset:int=0, params:dict=None,
                            search_all:bool=False, search_method:bool=True, format:str=None, return_df:bool=True) \
        -> Generator[Union[pd.Series, Tuple[dict,dict], Tuple[list,dict], Tuple[str,dict]], Any, None]:
    """
    Cursor on rows: iterates over the results of datastore_search_generator one row at a time.

    :see: datastore_search_generator() for the arguments, which are forwarded unchanged
    :return: if return_df is True, yields the rows of each DataFrame (pd.Series).
        Otherwise, yields tuples (record, fields type dictionary) for each record of each response.
    :raises TypeError: if return_df and search_method are both False (a raw dump cannot be iterated by rows)
    """
    if not return_df and not search_method:
        # rejected before any request generator is built for this unsupported combination
        raise TypeError("dumping datastore without parsing with a DataFrame does not return an iterable object")
    generator = self.datastore_search_generator(resource_id, filters=filters, q=q, fields=fields,
                                                distinct=distinct, sort=sort, limit=limit, offset=offset, params=params,
                                                search_all=search_all, search_method=search_method, format=format, return_df=return_df)
    if return_df:
        df: pd.DataFrame
        row: pd.Series
        for df in generator:
            for _, row in df.iterrows():
                yield row
    else:
        # the records are iterated the same way whatever the requested format
        response: CkanActionResponse
        for response in generator:
            fields_type_dict = CkanApiReadOnly.read_fields_type_dict(response.result["fields"])
            for element in response.result["records"]:
                yield element, fields_type_dict
693
+
694
def datastore_dump_generator(self, resource_id:str, *, filters:dict=None, q:str=None, fields:List[str]=None,
                             distinct:bool=None, sort:str=None, limit:int=None, offset:int=0, params:dict=None,
                             search_all:bool=True, search_method:bool=True, format:str=None, return_df:bool=True) \
        -> Union[Generator[pd.DataFrame, Any, None], Generator[CkanActionResponse, Any, None]]:
    """
    Same as datastore_search_generator, except that search_all is True by default.
    All the arguments are forwarded unchanged to datastore_search_generator.

    :see: datastore_search_generator
    :param resource_id: resource id.
    :param filters: The base argument to filter values in a table (optional)
    :param q: Full text query (optional)
    :param fields: The base argument to filter columns (optional)
    :param distinct: return only distinct rows (optional, default: false) e.g. to return distinct ids: fields="id", distinct=True
    :param sort: Argument to sort results e.g. sort="index, quantity desc" or sort="index asc"
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param params: Additional parameters such as filters, q, sort and fields can be given. See DataStore API documentation.
    :param search_all: Option to renew the request until there are no more records.
    :param search_method: API method selection (True=datastore_search, False=datastore_dump)
    :param format: forwarded to datastore_search_generator
    :param return_df: forwarded to datastore_search_generator
    :return: the output of datastore_search_generator
    """
    forwarded = dict(filters=filters, q=q, fields=fields, distinct=distinct, sort=sort,
                     limit=limit, offset=offset, params=params,
                     search_all=search_all, search_method=search_method,
                     format=format, return_df=return_df)
    return self.datastore_search_generator(resource_id, **forwarded)
719
+
720
def datastore_search_sql(self, sql:str, *, params:dict=None, search_all:bool=False,
                         limit:int=None, offset:int=0, return_df:bool=True) -> Union[pd.DataFrame, Tuple[ListRecords, dict]]:
    """
    Preferred entry-point for a DataStore SQL request.
    The arguments are forwarded to _api_datastore_search_sql_all.
    __NB__: This action is not available when ckan.datastore.sqlsearch.enabled is set to false

    :see: _api_datastore_search_sql_all()
    :param sql: SQL query e.g. f'SELECT * FROM "{resource_id}" WHERE "USER_ID" < 0'
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param params: Additional request parameters
    :param search_all: Option to renew the request until there are no more records.
    :param return_df: True to return a DataFrame
    :return: the output of _api_datastore_search_sql_all
    """
    return self._api_datastore_search_sql_all(
        sql,
        params=params,
        limit=limit,
        offset=offset,
        search_all=search_all,
        return_df=return_df,
    )
735
+
736
def datastore_search_sql_generator(self, sql:str, *, params:dict=None, search_all:bool=False,
                                   limit:int=None, offset:int=0, return_df:bool=True) \
        -> Union[Generator[pd.DataFrame, Any, None], Generator[CkanActionResponse, Any, None]]:
    """
    Preferred entry-point for a DataStore SQL request, generator version.
    The arguments are forwarded to _api_datastore_search_sql_all_generator.

    __NB__: This action is not available when ckan.datastore.sqlsearch.enabled is set to false

    :see: _api_datastore_search_sql_all_generator()
    :param sql: SQL query e.g. f'SELECT * FROM "{resource_id}" WHERE "USER_ID" < 0'
    :param limit: Limit the number of records to return
    :param offset: Offset in the returned records
    :param params: Additional request parameters
    :param search_all: Option to renew the request until there are no more records.
    :param return_df: True to obtain DataFrames
    :return: the output of _api_datastore_search_sql_all_generator
    """
    return self._api_datastore_search_sql_all_generator(
        sql,
        params=params,
        limit=limit,
        offset=offset,
        search_all=search_all,
        return_df=return_df,
    )
753
+
754
def datastore_search_sql_cursor(self, sql:str, *, params:dict=None, search_all:bool=False,
                                limit:int=None, offset:int=0, return_df:bool=True) \
        -> Generator[Union[pd.Series,Tuple[dict,dict]], Any, None]:
    """
    Cursor on rows: iterates over the results of datastore_search_sql_generator one row at a time.

    :see: datastore_search_sql_generator() for the arguments, which are forwarded unchanged
    :return: if return_df is True, yields the rows of each DataFrame (pd.Series).
        Otherwise, yields tuples (record, fields type dictionary) for each record of each response.
    """
    batches = self.datastore_search_sql_generator(sql, params=params, search_all=search_all,
                                                  limit=limit, offset=offset, return_df=return_df)
    if not return_df:
        batch: CkanActionResponse
        for batch in batches:
            # the fields type dictionary is read once per response and shared by its records
            fields_types = CkanApiReadOnly.read_fields_type_dict(batch.result["fields"])
            for record in batch.result["records"]:
                yield record, fields_types
        return
    frame: pd.DataFrame
    for frame in batches:
        for _, series in frame.iterrows():
            yield series
773
+
774
def datastore_search_sql_find_one(self, sql:str, *, params:dict=None,
                                  offset:int=0, return_df:bool=True) -> Union[pd.DataFrame, Tuple[ListRecords, dict]]:
    """
    Request one result for a SQL query: single call to datastore_search_sql with limit=1.

    :param sql: SQL query
    :param params: forwarded to datastore_search_sql
    :param offset: Offset in the returned records
    :param return_df: True to return a DataFrame
    :return: the output of datastore_search_sql
    """
    return self.datastore_search_sql(sql, limit=1, search_all=False,
                                     offset=offset, params=params, return_df=return_df)
778
+
779
def datastore_search_sql_fields_type_dict(self, sql:str, *, params:dict=None) -> OrderedDict:
    """
    Return the fields type dictionary of a SQL query, obtained by requesting one record without DataFrame.

    :param sql: SQL query
    :param params: forwarded to datastore_search_sql_find_one
    :return: second element of the tuple returned by datastore_search_sql_find_one
    """
    _records, fields_types = self.datastore_search_sql_find_one(sql, offset=0, params=params, return_df=False)
    return fields_types
782
+
783
def datastore_search_sql_row_count(self, sql:str, *, params:dict=None) -> int:
    """
    Return the "total" attribute of the DataFrame obtained by requesting one record of a SQL query.

    :param sql: SQL query
    :param params: forwarded to datastore_search_sql_find_one
    :return: value of attrs["total"]
    """
    # NOTE(review): _api_datastore_search_sql_df does not set attrs["total"]; unless the paging helper adds it,
    # this lookup raises KeyError - to be confirmed.
    single_row = self.datastore_search_sql_find_one(sql, offset=0, params=params, return_df=True)
    return single_row.attrs["total"]
786
+
787
def datastore_search_find_one(self, resource_id:str, *, filters:dict=None, q:str=None, distinct:bool=None,
                              fields:List[str]=None, offset:int=0, return_df:bool=True) \
        -> Union[pd.DataFrame, ListRecords, Any, List[CkanActionResponse]]:
    """
    Request one result for a query: single call to datastore_search with limit=1.

    :param resource_id: resource id
    :param filters: forwarded to datastore_search
    :param q: forwarded to datastore_search
    :param distinct: forwarded to datastore_search
    :param fields: forwarded to datastore_search
    :param offset: Offset in the returned records
    :param return_df: True to return a DataFrame
    :return: the output of datastore_search
    """
    return self.datastore_search(resource_id, limit=1, search_all=False,
                                 filters=filters, q=q, distinct=distinct,
                                 fields=fields, offset=offset, return_df=return_df)
801
+
802
def datastore_search_fields_type_dict(self, resource_id:str, *,
                                      filters:dict=None, q:str=None, distinct:bool=None, fields:List[str]=None,
                                      request_missing:bool=True, error_not_mapped:bool=False,
                                      error_not_found:bool=True) -> OrderedDict:
    """
    Return the fields type dictionary applicable to a DataStore query.

    :param resource_id: resource id
    :param filters: used only if fields is given, forwarded to datastore_search_find_one
    :param q: used only if fields is given, forwarded to datastore_search_find_one
    :param distinct: used only if fields is given, forwarded to datastore_search_find_one
    :param fields: restriction of the columns. If None, the fields of the DataStore are used.
    :param request_missing: used only if fields is None, forwarded to get_datastore_fields_or_request
    :param error_not_mapped: used only if fields is None, forwarded to get_datastore_fields_or_request
    :param error_not_found: used only if fields is None, forwarded to get_datastore_fields_or_request
    :return: fields type dictionary
    """
    if fields is not None:
        # a restriction was provided: request one record and use the fields of the response
        _records, fields_types = self.datastore_search_find_one(resource_id, filters=filters, q=q,
                                                                distinct=distinct, fields=fields, return_df=False)
        return fields_types
    # no restriction: refer to the fields of the DataStore
    datastore_fields = self.get_datastore_fields_or_request(resource_id, return_list=True,
                                                            request_missing=request_missing,
                                                            error_not_mapped=error_not_mapped,
                                                            error_not_found=error_not_found)
    return CkanApiReadOnly.read_fields_type_dict(datastore_fields)
817
+
818
def datastore_search_row_count(self, resource_id:str, *, filters:dict=None, q:str=None, distinct:bool=None,
                               fields:List[str]=None) -> int:
    """
    Request the number of rows in a DataStore: reads the "total" attribute of the DataFrame
    obtained by requesting one record.

    :param resource_id: resource id
    :param filters: forwarded to datastore_search_find_one
    :param q: forwarded to datastore_search_find_one
    :param distinct: forwarded to datastore_search_find_one
    :param fields: forwarded to datastore_search_find_one
    :return: value of attrs["total"]
    """
    single_row = self.datastore_search_find_one(resource_id, filters=filters, q=q,
                                                distinct=distinct, fields=fields, return_df=True)
    return single_row.attrs["total"]
829
+
830
def test_sql_capabilities(self, *, raise_error:bool=False) -> bool:
    """
    Test the availability of the API datastore_search_sql, by requesting the help of this action.

    :param raise_error: True to raise an error instead of returning False
    :return: True if the help request succeeded, False if it raised CkanNotFoundError (and raise_error is False)
    :raises CkanSqlCapabilityError: if the help request raised CkanNotFoundError and raise_error is True
    """
    try:
        self.api_help_show("datastore_search_sql", print_output=False)
    except CkanNotFoundError:
        if not raise_error:
            return False
        # no response object is available here: the error is built around an empty response
        raise CkanSqlCapabilityError(self, CkanActionResponse(requests.Response()))
    return True
843
+
844
+
845
+ ## Resource download by direct link (FileStore) -----------------------------------------------
846
def resource_download(self, resource_id:str, *, method:str=None,
                      proxies:dict=None, headers:dict=None, auth: Union[AuthBase, Tuple[str,str]]=None, verify:Union[bool,str,None]=None) \
        -> Tuple[CkanResourceInfo, Union[requests.Response,None]]:
    """
    Uses the link provided in resource_show to download a resource.

    :param resource_id: resource id
    :param method: forwarded to download_url_proxy
    :param proxies: forwarded to download_url_proxy
    :param headers: forwarded to download_url_proxy
    :param auth: forwarded to download_url_proxy
    :param verify: forwarded to download_url_proxy
    :return: tuple (resource information, response). The response is None if the resource has no download url.
    """
    resource_info = self.get_resource_info_or_request(resource_id)
    url = resource_info.download_url
    if not url:
        # covers an empty string as well as None (len(None) raised TypeError)
        return resource_info, None
    response = self.download_url_proxy(url, method=method, auth_if_ckan=ckan_request_proxy_default_auth_if_ckan,
                                       proxies=proxies, headers=headers, auth=auth, verify=verify)
    return resource_info, response
862
+
863
def resource_download_test_head(self, resource_id:str, *, raise_error:bool=False,
                                proxies:dict=None, headers:dict=None, auth: Union[AuthBase, Tuple[str,str]]=None, verify:Union[bool,str,None]=None) \
        -> Union[None,ContextErrorLevelMessage]:
    """
    This sends a HEAD request to the resource download url using the CKAN connexion parameters via resource_download.
    The resource is not downloaded but the headers indicate if the url is valid.

    :param resource_id: resource id
    :param raise_error: True to raise an error instead of returning an error message
    :param proxies: forwarded to resource_download
    :param headers: forwarded to resource_download
    :param auth: forwarded to resource_download
    :param verify: forwarded to resource_download
    :return: None if successful (status 200), otherwise an error message
    :raises RequestError: if raise_error is True and the status is not 200 or the resource has no download url.
        An exception raised by resource_download is re-raised unchanged if raise_error is True.
    """
    resource_info = self.get_resource_info_or_request_of_id(resource_id)
    context = f"Resource from URL {resource_info.name}"
    try:
        _, response = self.resource_download(resource_id, method="HEAD", proxies=proxies, headers=headers, auth=auth, verify=verify)
    except Exception as e:
        if raise_error:
            # bare raise: keeps the original traceback without making the exception its own cause
            raise
        return ContextErrorLevelMessage(context, ErrorLevel.Error, f"Failed to query download url for resource id {resource_id}: {str(e)}")
    if response is None:
        # resource_download returns no response when the resource has no download url
        detail = f"Failed to query download url for resource id {resource_id}: no download url"
    elif response.ok and response.status_code == 200:
        return None
    else:
        detail = f"Failed to query download url for resource id {resource_id}: status {response.status_code} {response.reason}"
    if raise_error:
        raise RequestError(detail)
    return ContextErrorLevelMessage(context, ErrorLevel.Error, detail)
885
+
886
def resource_download_df(self, resource_id:str, *, method:str=None,
                         proxies:dict=None, headers:dict=None, auth: Union[AuthBase, Tuple[str,str]]=None, verify:Union[bool,str,None]=None) \
        -> Tuple[CkanResourceInfo, Union[pd.DataFrame,None]]:
    """
    Uses the link provided in resource_show to download a resource and reads it as a CSV into a DataFrame.

    :param resource_id: resource id
    :param method: forwarded to resource_download
    :param proxies: forwarded to resource_download
    :param headers: forwarded to resource_download
    :param auth: forwarded to resource_download
    :param verify: forwarded to resource_download
    :return: tuple (resource information, DataFrame). The DataFrame is None if resource_download returned no response.
    """
    resource_info, response = self.resource_download(resource_id, method=method, proxies=proxies,
                                                     headers=headers, auth=auth, verify=verify)
    if response is None:
        return resource_info, None
    # the payload is decoded to text and parsed with the download options of the module
    text = response.content.decode()
    frame = pd.read_csv(io.StringIO(text), **df_download_read_csv_kwargs)
    self._rx_records_df_clean(frame)
    return resource_info, frame
902
+
903
def map_file_resource_sizes(self, cancel_if_present:bool=True) -> None:
    """
    Fill the download size of the mapped resources having a download url, using the
    content-length header of a HEAD request.

    :param cancel_if_present: True to skip the resources for which download_size_mb is already known
    """
    for resource_id, resource_info in self.map.resources.items():
        if not resource_info.download_url:
            continue
        if cancel_if_present and resource_info.download_size_mb is not None:
            continue
        _, head_response = self.resource_download(resource_id, method="HEAD")
        # raise error if not found or bad format
        size_bytes = int(head_response.headers.get("content-length", None))
        resource_info.download_size_mb = bytes_to_megabytes(size_bytes)
910
+
911
+
912
+ ## Mapping of resource aliases from table
913
def list_datastore_aliases(self) -> List[CkanAliasInfo]:
    """
    Read the table "_table_metadata" and register the aliases it lists in self.map.resource_alias_index.

    :return: one CkanAliasInfo per record of the table
    """
    # resource name of table containing CKAN aliases
    metadata_table = "_table_metadata"
    records, _ = self.datastore_search(metadata_table, search_all=True, return_df=False,
                                       format="objects", search_method=True)
    alias_list = [CkanAliasInfo(record) for record in records]
    for alias_info in alias_list:
        if alias_info.alias_of is None:
            # this entry is not an alias of another one: nothing to index
            continue
        self.map.resource_alias_index[alias_info.name] = alias_info.alias_of
    return alias_list
921
+
922
def map_resources(self, package_list:Union[str, List[str]]=None, *, params:dict=None,
                  datastore_info:bool=None, resource_view_list:bool=None, organization_info:bool=None, license_list:bool=None,
                  only_missing:bool=True, error_not_found:bool=True,
                  owner_org:str=None) -> CkanMap:
    """
    Overload of the parent map_resources, preceded by a call listing all the DataStore aliases.
    The aliases are listed only if the alias index is empty and self.params.map_all_aliases is set.
    All the arguments are forwarded unchanged to the parent implementation.

    :return: the output of the parent map_resources
    """
    if len(self.map.resource_alias_index) == 0:
        if self.params.map_all_aliases:
            self.list_datastore_aliases()
    return super().map_resources(package_list=package_list, params=params, datastore_info=datastore_info,
                                 resource_view_list=resource_view_list, organization_info=organization_info,
                                 license_list=license_list, only_missing=only_missing, error_not_found=error_not_found,
                                 owner_org=owner_org)
934
+