ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ckanapi_harvesters/__init__.py +32 -10
- ckanapi_harvesters/auxiliary/__init__.py +26 -0
- ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
- ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
- ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
- ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
- ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
- ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
- ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
- ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
- ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
- ckanapi_harvesters/auxiliary/deprecated.py +82 -0
- ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
- ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
- ckanapi_harvesters/auxiliary/list_records.py +60 -0
- ckanapi_harvesters/auxiliary/login.py +163 -0
- ckanapi_harvesters/auxiliary/path.py +208 -0
- ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
- ckanapi_harvesters/auxiliary/urls.py +40 -0
- ckanapi_harvesters/builder/__init__.py +40 -0
- ckanapi_harvesters/builder/builder_aux.py +20 -0
- ckanapi_harvesters/builder/builder_ckan.py +238 -0
- ckanapi_harvesters/builder/builder_errors.py +36 -0
- ckanapi_harvesters/builder/builder_field.py +122 -0
- ckanapi_harvesters/builder/builder_package.py +9 -0
- ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
- ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
- ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
- ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
- ckanapi_harvesters/builder/builder_resource.py +589 -0
- ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
- ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
- ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
- ckanapi_harvesters/builder/builder_resource_init.py +126 -0
- ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
- ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
- ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
- ckanapi_harvesters/builder/example/__init__.py +21 -0
- ckanapi_harvesters/builder/example/builder_example.py +21 -0
- ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
- ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
- ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
- ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
- ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
- ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
- ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
- ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
- ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
- ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
- ckanapi_harvesters/builder/mapper_datastore.py +93 -0
- ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
- ckanapi_harvesters/builder/specific/__init__.py +11 -0
- ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
- ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
- ckanapi_harvesters/ckan_api/__init__.py +20 -0
- ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
- ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
- ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
- ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
- ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
- ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
- ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
- ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
- ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
- ckanapi_harvesters/harvesters/__init__.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
- ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
- ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
- ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
- ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
- ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
- ckanapi_harvesters/harvesters/harvester_init.py +30 -0
- ckanapi_harvesters/harvesters/harvester_model.py +49 -0
- ckanapi_harvesters/harvesters/harvester_params.py +323 -0
- ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
- ckanapi_harvesters/harvesters/postgre_params.py +86 -0
- ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
- ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
- ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
- ckanapi_harvesters/policies/__init__.py +20 -0
- ckanapi_harvesters/policies/data_format_policy.py +269 -0
- ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
- ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
- ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
- ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
- ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
- ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
- ckanapi_harvesters/reports/__init__.py +11 -0
- ckanapi_harvesters/reports/admin_report.py +292 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
- ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
- ckanapi_harvesters/divider/__init__.py +0 -27
- ckanapi_harvesters/divider/divider.py +0 -53
- ckanapi_harvesters/divider/divider_error.py +0 -59
- ckanapi_harvesters/main.py +0 -30
- ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Code to define the binding between a file and a database query
|
|
5
|
+
in the context of a large DataStore defined by the concatenation of multiple files.
|
|
6
|
+
"""
|
|
7
|
+
from warnings import warn
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Dict, List, Iterable, Callable, Any, Tuple, Generator, Set, Union
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
from ckanapi_harvesters.builder.builder_resource_datastore import DataSchemeConversion
|
|
15
|
+
from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice
|
|
16
|
+
from ckanapi_harvesters.auxiliary.ckan_defs import ckan_tags_sep
|
|
17
|
+
from ckanapi_harvesters.ckan_api import CkanApi
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class RequestMapperABC(DataSchemeConversion, ABC):
    """
    Base class describing how one file is rebuilt from the full DataStore.

    It extends the data scheme conversion class and adds hooks used when
    preparing upsert requests and when splitting a download into files.
    The hooks implemented here return neutral values; derive from this class
    to provide custom data treatments.
    """

    def __init__(self,
                 *, df_upload_fun: Callable[[pd.DataFrame], Any] = None,
                 df_download_fun: Callable[[pd.DataFrame], Any] = None):
        """
        :param df_upload_fun: forwarded to the parent constructor
        :param df_download_fun: forwarded to the parent constructor
        """
        super().__init__(df_upload_fun=df_upload_fun, df_download_fun=df_download_fun)
        # False by default; RequestFileMapperIndexKeys only queries the last inserted row when this is True
        self.upsert_only_missing_rows: bool = False

    ## upsert request preparation ----------------
    def get_file_query_of_df(self, df_upload: pd.DataFrame) -> Union[dict, None]:
        """
        Return the {field: value} query arguments which identify the file held in df_upload.

        :param df_upload: the DataFrame representing the file
        :return: this base implementation always returns None
        """
        return None

    def last_inserted_row_request(self, ckan: CkanApi, resource_id: str, file_query: dict) -> Union[pd.DataFrame, None]:
        """
        Ask CKAN for the last inserted row(s) matching a given file_query.

        :param ckan:
        :param resource_id:
        :param file_query: a dict of {field: value} combinations representing the arguments of the query to reconstruct a file
        :return: the last row(s) in the database, or None when no specific method is defined (as is the case here)
        """
        return None

    def last_inserted_index_request(self, ckan: CkanApi, resource_id: str, file_query: dict,
                                    df_upload: pd.DataFrame) -> Tuple[int, bool, int, Union[pd.DataFrame, None]]:
        """
        Compare the data to upload with the last known row(s) and tell where the upload should restart.

        :param ckan:
        :param resource_id:
        :param file_query: a dict of {field: value} combinations representing the arguments of the query to reconstruct a file
        :param df_upload: the known data corresponding to the file_query to be sent
        :return: a tuple (i_restart, upload_needed, row_count, df_last_row):
            - i_restart: the last known index in the dataframe
            - upload_needed: a boolean indicating if an update is necessary
            - row_count: the number of rows corresponding to the file_query
            - df_last_row: the last found row in the dataframe

            This base implementation returns (0, True, -1, None).
        """
        i_restart = 0
        upload_needed = True
        row_count = -1
        df_last_row = None
        return i_restart, upload_needed, row_count, df_last_row

    ## download preparation ----------------
    @abstractmethod
    def download_file_query_list(self, ckan: CkanApi, resource_id: str) -> List[dict]:
        """
        List the {key: value} combinations present in the CKAN datastore, one per file to reconstruct.

        :param ckan:
        :param resource_id:
        :return: a list of query arguments defining each file
        """
        raise NotImplementedError()

    def download_file_query_generator(self, ckan: CkanApi, resource_id: str) -> Generator[dict, Any, None]:
        """
        Iterate over the result of download_file_query_list; override to customize.

        :param ckan:
        :param resource_id:
        :return: a generator yielding each file query in turn
        """
        query_list = self.download_file_query_list(ckan=ckan, resource_id=resource_id)
        for query in query_list:
            yield query

    def download_file_query(self, ckan: CkanApi, resource_id: str, file_query: dict) -> pd.DataFrame:
        """
        Download the rows of one file.

        :param ckan:
        :param resource_id:
        :param file_query: its entries are passed as keyword arguments to ckan.datastore_search
        :return: the result of ckan.datastore_search called with search_all=True
        """
        df_download = ckan.datastore_search(resource_id=resource_id, **file_query, search_all=True)
        return df_download
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class RequestFileMapperABC(RequestMapperABC, ABC):
    """
    Request mapper oriented towards files of the file system.

    On top of the parent class, it builds a file name for each file query
    from a prefix, a query-dependent part and a suffix.
    """

    def __init__(self,
                 *, df_upload_fun: Callable[[pd.DataFrame], Any] = None,
                 df_download_fun: Callable[[pd.DataFrame], Any] = None):
        """
        :param df_upload_fun: forwarded to the parent constructor
        :param df_download_fun: forwarded to the parent constructor
        """
        super().__init__(df_upload_fun=df_upload_fun, df_download_fun=df_download_fun)
        self.file_name_prefix: str = "table_"
        self.file_name_suffix: str = ".csv"
        # optional user function building the middle part of the file name from the file query
        self.file_name_function: Union[Callable[[dict], str], None] = None

    def get_file_name_of_query(self, file_query: dict) -> str:
        """
        Build the file name associated with a file query.

        :param file_query: a dict of {field: value} combinations
        :return: prefix + middle part + suffix, where the middle part comes from file_name_function
            if it is set, or else is the "key_value" pairs of the query joined by underscores
        """
        name_builder = self.file_name_function
        if name_builder is not None:
            middle_part = name_builder(file_query)
        else:
            middle_part = '_'.join(f"{field!s}_{value!s}" for field, value in file_query.items())
        return f"{self.file_name_prefix}{middle_part}{self.file_name_suffix}"
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class RequestFileMapperUser(RequestFileMapperABC):
    """
    Minimal mapper for the case where the user supplies the file query list,
    or where the builder is only used to upload files.
    """

    def __init__(self, file_query_list: Iterable[Tuple[str, dict]],
                 *, df_upload_fun: Callable[[pd.DataFrame], Any] = None,
                 df_download_fun: Callable[[pd.DataFrame], Any] = None):
        """
        :param file_query_list: accepted but not kept by this object
            (it must be stored in the BuilderDataStoreMultiAbc instance)
        :param df_upload_fun: forwarded to the parent constructor
        :param df_download_fun: forwarded to the parent constructor
        """
        super().__init__(df_upload_fun=df_upload_fun, df_download_fun=df_download_fun)
        # file_query_list must be stored in the BuilderDataStoreMultiAbc instance

    def download_file_query_list(self, ckan: CkanApi, resource_id: str) -> List[dict]:
        """
        Not available for this mapper: the list is not derived from CKAN.

        :raises RuntimeError: always
        """
        raise RuntimeError("File query list is provided by user")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class RequestFileMapperLimit(RequestFileMapperABC):
    """
    In this implementation, a file is defined by a fixed number of rows:
    each file query is an {"offset": ..., "limit": ...} window over the datastore.
    """
    # number of rows per file used when no limit is given to the constructor
    default_limit = 10000

    def __init__(self, limit: int = None,
                 *, df_upload_fun: Callable[[pd.DataFrame], Any] = None,
                 df_download_fun: Callable[[pd.DataFrame], Any] = None):
        """
        :param limit: number of rows per file; default_limit is used when None
        :param df_upload_fun: forwarded to the parent constructor
        :param df_download_fun: forwarded to the parent constructor
        """
        super().__init__(df_upload_fun=df_upload_fun, df_download_fun=df_download_fun)
        if limit is None:
            limit = RequestFileMapperLimit.default_limit
        self.limit: int = limit

    ## download preparation ----------------
    def get_file_name_of_query(self, file_query: dict) -> str:
        """
        Build the file name associated with an offset/limit window.

        :param file_query: a dict holding at least the "offset" key
        :return: prefix + middle part + suffix, where the middle part comes from file_name_function
            if it is set, or else is "<first row offset>_<last row offset of the window>"
        """
        if self.file_name_function is None:
            first_row = file_query["offset"]
            # the upper bound is the end of the window, even if the datastore holds fewer rows
            file_filters_str = f'{first_row}_{first_row + self.limit - 1}'
        else:
            file_filters_str = self.file_name_function(file_query)
        return f"{self.file_name_prefix}{file_filters_str}{self.file_name_suffix}"

    def download_file_query_list(self, ckan: CkanApi, resource_id: str) -> List[dict]:
        """
        Split the datastore in consecutive windows of self.limit rows.

        :param ckan:
        :param resource_id:
        :return: a list of {"offset": ..., "limit": ...} dicts covering the row count reported by
            ckan.datastore_search_row_count. An empty datastore still yields one window at offset 0.
        """
        row_count = ckan.datastore_search_row_count(resource_id)
        file_count, remaining_rows = divmod(row_count, self.limit)
        # Add a window for the partial last file. When row_count is an exact multiple of the limit,
        # no extra window is needed (an extra one would only produce an empty file).
        # Keep a single window for an empty datastore, as before.
        if remaining_rows or file_count == 0:
            file_count += 1
        return [{"offset": self.limit * counter, "limit": self.limit} for counter in range(file_count)]

    def download_file_query(self, ckan: CkanApi, resource_id: str, file_query: dict) -> pd.DataFrame:
        """
        Download the rows of one window.

        :param ckan:
        :param resource_id:
        :param file_query: a dict holding the "offset" and "limit" keys
        :return: the result of ckan.datastore_search for this offset and limit, called with search_all=True
        """
        return ckan.datastore_search(resource_id=resource_id, offset=file_query["offset"],
                                     limit=file_query["limit"], search_all=True)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class RequestFileMapperIndexKeys(RequestFileMapperABC):
    """
    In this implementation, a file is defined by a combination of group_by_keys values.
    It is optionally ordered by sort_by_keys, which enables restarting a transfer when interrupted.
    When built by default_file_mapper_from_primary_key, sort_by_keys is the last field of the primary key
    and group_by_keys are the fields preceding it in the primary key.
    """
    # number of rows requested when looking for the last inserted row
    last_rows_limit = 1

    def __init__(self, group_by_keys: List[str], sort_by_keys: List[str] = None,
                 *, df_upload_fun: Callable[[pd.DataFrame], Any] = None,
                 df_download_fun: Callable[[pd.DataFrame], Any] = None):
        """
        :param group_by_keys: fields to filter on to obtain one file
        :param sort_by_keys: fields used to order the rows of a file (optional)
        :param df_upload_fun: forwarded to the parent constructor
        :param df_download_fun: forwarded to the parent constructor
        """
        super().__init__(df_upload_fun=df_upload_fun, df_download_fun=df_download_fun)
        self.group_by_keys: List[str] = group_by_keys  # fields to filter to obtain one file
        self.sort_by_keys: Union[List[str], None] = sort_by_keys  # fields to order the document

    def get_necessary_fields(self) -> Set[str]:
        """
        :return: the set of field names used by this mapper (group_by_keys and, if defined, sort_by_keys)
        """
        fields = set(self.group_by_keys)
        if self.sort_by_keys is not None:
            fields = fields.union(set(self.sort_by_keys))
        return fields

    def df_upload_alter(self, df_local: pd.DataFrame, file_name: str = None, mapper_kwargs: dict = None, **kwargs) -> pd.DataFrame:
        """
        Overload of df_upload_alter: after the parent treatment, rows are ordered by sort_by_keys
        before being sent, in order to be able to restart a transfer from the last transmitted index.

        :param df_local: the local data
        :param file_name: forwarded to the parent implementation
        :param mapper_kwargs: forwarded to the parent implementation
        :return: the DataFrame in the database format, sorted by sort_by_keys when they are defined
        """
        df_database = super().df_upload_alter(df_local, file_name=file_name, mapper_kwargs=mapper_kwargs, **kwargs)
        if self.sort_by_keys is not None:
            if self.df_upload_fun is None:
                # presumably the parent returns df_local itself when no upload function is set:
                # work on a copy so that the in-place sort does not reorder the caller's DataFrame
                df_database = df_database.copy()
            df_database.sort_values(self.sort_by_keys, inplace=True)
        return df_database

    ## upsert request preparation ----------------
    def get_file_query_of_df(self, df_upload: pd.DataFrame) -> Union[dict, None]:
        """
        Deduce the file query from the data to upload.

        :param df_upload: the DataFrame representing the file
        :return: {"filters": {field: value}} when df_upload holds exactly one combination of
            group_by_keys values, None otherwise
        """
        df_file_query = df_upload[self.group_by_keys].drop_duplicates(subset=self.group_by_keys)
        if len(df_file_query) == 1:
            return {"filters": df_file_query.to_dict(orient="records")[0]}
        else:
            return None

    def last_inserted_row_request(self, ckan: CkanApi, resource_id: str, file_query: dict) -> Union[pd.DataFrame, None]:
        """
        Request in CKAN the last row(s) of a file, according to the sort_by_keys order.

        :param ckan:
        :param resource_id:
        :param file_query: a dict with a "filters" entry identifying the file
        :return: the last row(s) found, or None if sort_by_keys is not defined
            or upsert_only_missing_rows is False
        """
        if self.sort_by_keys is None or not self.upsert_only_missing_rows:
            return None
        # Every key needs its own "desc": a direction given once at the end of the sort string
        # applies to the last field only, so the other keys would be sorted in ascending order
        # and the row returned would not be the last one of the file.
        sort_desc = ckan_tags_sep.join(f"{key} desc" for key in self.sort_by_keys)
        return ckan.datastore_search(resource_id, filters=file_query["filters"], sort=sort_desc,
                                     limit=self.last_rows_limit, search_all=False)

    def last_inserted_index_request(self, ckan: CkanApi, resource_id: str, file_query: dict,
                                    df_upload: pd.DataFrame) -> Tuple[int, bool, int, Union[pd.DataFrame, None]]:
        """
        Locate the last inserted row in df_upload and return the index at which the upload must restart.

        :param ckan:
        :param resource_id:
        :param file_query: a dict with a "filters" entry identifying the file
        :param df_upload: the data to send, in the database format (df_upload_fun has been applied)
        :return: a tuple (i_restart, upload_needed, row_count, df_last_row):
            - i_restart: index following the row of df_upload matching the last inserted row
              (0 if it could not be determined)
            - upload_needed: False only when the matching row is the last row of df_upload
            - row_count: the "total" attribute of the CKAN response (0 if no request was made)
            - df_last_row: the last row found in the database, if any
        """
        # df_last_row has just been downloaded but no field typing has been applied
        df_last_row = self.last_inserted_row_request(ckan=ckan, resource_id=resource_id, file_query=file_query)
        if df_last_row is None:
            return 0, True, 0, None
        if df_last_row.empty:
            return 0, True, df_last_row.attrs["total"], df_last_row

        for key in self.sort_by_keys:
            if key in df_upload.columns:
                # apply field typing from df_upload in order to perform line-by-line comparison
                df_last_row[key] = df_last_row[key].astype(df_upload[key].dtype)
        # one boolean column per sort key; a row matches when all its sort keys equal those of the last row
        match_table = np.column_stack([df_upload[key] == df_last_row[key].iloc[0] for key in self.sort_by_keys])
        match_array = np.logical_and.reduce(match_table, 1)
        match_positions = np.flatnonzero(match_array)
        row_count = df_last_row.attrs["total"]
        if len(match_positions) == 1:
            i_restart_py = int(match_positions[0]) + 1
            return i_restart_py, i_restart_py < len(df_upload), row_count, df_last_row
        if len(match_positions) == 0:
            msg = "No row of the data to upload matches the last inserted row"
        else:
            msg = "Multiple results obtained when querying the last inserted index"
        warn(msg)
        # ambiguous situation: restart the upload from the beginning
        return 0, True, row_count, df_last_row

    ## download preparation ----------------
    def get_file_name_of_query(self, file_query: dict) -> str:
        """
        Build the file name associated with a file query.

        :param file_query: a dict with a "filters" entry
        :return: prefix + middle part + suffix, where the middle part comes from file_name_function
            applied to the filters if it is set, or else is the "key_value" pairs of the filters
            joined by underscores
        """
        if self.file_name_function is None:
            file_filters_str = '_'.join([str(key)+'_'+str(value) for key, value in file_query['filters'].items()])
        else:
            file_filters_str = self.file_name_function(file_query['filters'])
        return f"{self.file_name_prefix}{file_filters_str}{self.file_name_suffix}"

    def download_file_query_list(self, ckan: CkanApi, resource_id: str) -> List[dict]:
        """
        List the files, which are defined by the distinct group_by_keys combinations in the database.

        :param ckan:
        :param resource_id:
        :return: a list of {"filters": {field: value}} dicts, one per combination
        """
        df_list = ckan.datastore_search(resource_id, fields=self.group_by_keys, search_all=True, distinct=True)
        filters_list = df_list.to_dict(orient="records")
        return [{"filters": file_filter} for file_filter in filters_list]
|
|
249
|
+
|
|
250
|
+
# def download_file_query(self, ckan: CkanApi, resource_id: str, file_query:dict) -> pd.DataFrame:
|
|
251
|
+
# return ckan.datastore_search(resource_id=resource_id, filters=file_query["filters"], search_all=True)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def default_file_mapper_from_primary_key(primary_key: List[str] = None, file_query_list: Iterable[Tuple[str, dict]] = None) -> RequestFileMapperABC:
    """
    Choose a default file mapper from the primary key of a DataStore.

    :param primary_key: the fields of the primary key, if any
    :param file_query_list: a user-provided file query list, if any
    :return:
        - a RequestFileMapperIndexKeys when the primary key has at least two fields:
          the last field orders the rows and the preceding fields define the files
        - otherwise a RequestFileMapperUser if file_query_list is given
        - otherwise a RequestFileMapperLimit with its default limit
    """
    has_multi_field_key = primary_key is not None and len(primary_key) > 1
    if has_multi_field_key:
        *file_fields, order_field = primary_key[:-1], primary_key[-1]
        return RequestFileMapperIndexKeys(group_by_keys=file_fields[0], sort_by_keys=[order_field])
    if file_query_list is None:
        return RequestFileMapperLimit()
    return RequestFileMapperUser(file_query_list)
|
|
262
|
+
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
CKAN configuration builder
|
|
5
|
+
"""
|
|
6
|
+
from typing import Union, List, Dict
|
|
7
|
+
|
|
8
|
+
from ckanapi_harvesters.auxiliary import ckan_configuration
|
|
9
|
+
from ckanapi_harvesters.ckan_api import CkanApi
|
|
10
|
+
from ckanapi_harvesters.policies.data_format_policy import CkanPackageDataFormatPolicy
|
|
11
|
+
from ckanapi_harvesters.policies.data_format_policy_errors import DataPolicyError
|
|
12
|
+
from ckanapi_harvesters.builder.builder_resource import BuilderResourceUnmanaged
|
|
13
|
+
from ckanapi_harvesters.builder.specific_builder_abc import SpecificBuilderABC
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ConfigurationBuilder(SpecificBuilderABC):
    """
    Builder of the private CKAN package holding the configuration for Python scripts.
    It declares one unmanaged JSON resource for the data format policy.
    """

    def __init__(self, ckan: CkanApi, organization_name: str):
        """
        :param ckan: forwarded to the parent constructor
        :param organization_name: organization owning the configuration package
        """
        super().__init__(
            ckan,
            package_name=ckan_configuration.configuration_package_name,
            organization_name=organization_name,
            title="Configuration for scripts",
            description="Configuration for use with Python scripts",
            private=True,
        )
        policy_resource_builder = BuilderResourceUnmanaged(
            name=ckan_configuration.policy_resource,
            format="JSON",
            description="CKAN Data format policy for use with Python scripts",
        )
        self.resource_builders[ckan_configuration.policy_resource] = policy_resource_builder

    def patch_policy(self, ckan: CkanApi, policy: CkanPackageDataFormatPolicy,
                     *, reduced_size: bool = None, update_ckan: bool = True):
        """
        Upload the data format policy to the configuration package, or delete it.

        :param ckan:
        :param policy: the policy to upload; None deletes the policy resource
        :param reduced_size: forwarded to policy.to_jsons
        :param update_ckan: if True, the policy is also assigned to ckan.policy
        """
        package_id = self.patch_request_package(ckan).id
        if policy is None:
            # delete data format policy
            self.resource_builders[ckan_configuration.policy_resource].delete_request(ckan, package_id)
        else:
            policy_bytes = policy.to_jsons(reduced_size=reduced_size).encode()
            policy_builder: BuilderResourceUnmanaged = self.resource_builders[ckan_configuration.policy_resource]
            policy_builder.patch_request(ckan, package_id, payload=policy_bytes, reupload=True)
        if update_ckan:
            ckan.policy = policy

    def load_default_policy(self, ckan: CkanApi) -> CkanPackageDataFormatPolicy:
        """
        :param ckan:
        :return: the result of ckan.load_default_policy called with force=True
        """
        default_policy = ckan.load_default_policy(force=True)
        return default_policy

    def policy_check(self, ckan: CkanApi,
                     package_list: Union[str, List[str]] = None, *, owner_org: str = None,
                     policy: CkanPackageDataFormatPolicy = None, buffer: Dict[str, List[DataPolicyError]] = None,
                     raise_error: bool = False, verbose: bool = None) -> bool:
        """
        Check a package list against the data format policy currently loaded in CKAN (or the one provided by argument).
        If not provided, the package list is the full list of packages, restricted to an organization (requires an API request).
        It is recommended to run load_default_policy beforehand.

        :param ckan:
        :param package_list:
        :param owner_org:
        :param policy:
        :param buffer:
        :param raise_error:
        :param verbose:
        :return: the result of ckan.policy_check
        """
        # recommended to run load_default_policy before
        checked_packages = ckan.complete_package_list(package_list, owner_org=owner_org)
        ckan.map_resources(checked_packages, owner_org=owner_org)
        check_result = ckan.policy_check(checked_packages, policy=policy, buffer=buffer,
                                         verbose=verbose, raise_error=raise_error)
        return check_result
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Abstract class to implement specific builders from code
|
|
5
|
+
"""
|
|
6
|
+
from abc import ABC
|
|
7
|
+
from typing import List
|
|
8
|
+
|
|
9
|
+
from ckanapi_harvesters.auxiliary.ckan_model import CkanState
|
|
10
|
+
from ckanapi_harvesters.ckan_api import CkanApi
|
|
11
|
+
from ckanapi_harvesters.builder.builder_package import BuilderPackage
|
|
12
|
+
|
|
13
|
+
class SpecificBuilderABC(BuilderPackage, ABC):
    """
    Base class for package builders which are defined from code.
    The package attributes are given to the constructor and the CKAN builder
    is initialized from an existing CkanApi object.
    """

    def __init__(self, ckan: CkanApi, package_name: str, organization_name: str, *,
                 title: str = None, description: str = None, private: bool = None, state: CkanState = None,
                 version: str = None,
                 url: str = None, tags: List[str] = None,
                 license_name: str = None):
        """
        :param ckan: passed to self.ckan_builder.from_ckan once the package builder is initialized
        :param package_name: forwarded to the parent constructor, as are all the following arguments
        :param organization_name:
        :param title:
        :param description:
        :param private:
        :param state:
        :param version:
        :param url:
        :param tags:
        :param license_name:
        """
        super().__init__(
            package_name=package_name,
            title=title,
            description=description,
            private=private,
            state=state,
            version=version,
            url=url,
            tags=tags,
            organization_name=organization_name,
            license_name=license_name,
        )
        self.ckan_builder.from_ckan(ckan)
|
|
23
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Package with helper functions for CKAN requests using pandas DataFrames.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from . import ckan_api_params
|
|
8
|
+
from . import ckan_api_0_base
|
|
9
|
+
from . import ckan_api_1_map
|
|
10
|
+
from . import ckan_api_2_readonly
|
|
11
|
+
from . import ckan_api_3_policy
|
|
12
|
+
from . import ckan_api_4_readwrite
|
|
13
|
+
from . import ckan_api_5_manage
|
|
14
|
+
from . import ckan_api
|
|
15
|
+
# from . import deprecated
|
|
16
|
+
|
|
17
|
+
# usage shortcuts
|
|
18
|
+
from ckanapi_harvesters.ckan_api.ckan_api import CkanApi, CkanApiParams, CkanApiABC, CKAN_API_VERSION, CkanApiMap
|
|
19
|
+
|
|
20
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Alias to most complete CkanApi implementation
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from ckanapi_harvesters.ckan_api.ckan_api_0_base import CkanApiABC, CKAN_API_VERSION
|
|
8
|
+
from ckanapi_harvesters.ckan_api.ckan_api_1_map import CkanApiMap
|
|
9
|
+
from ckanapi_harvesters.ckan_api.ckan_api_5_manage import CkanApiManage as CkanApi # alias
|
|
10
|
+
from ckanapi_harvesters.ckan_api.ckan_api_5_manage import CkanApiExtendedParams as CkanApiParams # alias
|
|
11
|
+
|