ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ckanapi_harvesters/__init__.py +32 -10
- ckanapi_harvesters/auxiliary/__init__.py +26 -0
- ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
- ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
- ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
- ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
- ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
- ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
- ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
- ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
- ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
- ckanapi_harvesters/auxiliary/deprecated.py +82 -0
- ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
- ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
- ckanapi_harvesters/auxiliary/list_records.py +60 -0
- ckanapi_harvesters/auxiliary/login.py +163 -0
- ckanapi_harvesters/auxiliary/path.py +208 -0
- ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
- ckanapi_harvesters/auxiliary/urls.py +40 -0
- ckanapi_harvesters/builder/__init__.py +40 -0
- ckanapi_harvesters/builder/builder_aux.py +20 -0
- ckanapi_harvesters/builder/builder_ckan.py +238 -0
- ckanapi_harvesters/builder/builder_errors.py +36 -0
- ckanapi_harvesters/builder/builder_field.py +122 -0
- ckanapi_harvesters/builder/builder_package.py +9 -0
- ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
- ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
- ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
- ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
- ckanapi_harvesters/builder/builder_resource.py +589 -0
- ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
- ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
- ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
- ckanapi_harvesters/builder/builder_resource_init.py +126 -0
- ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
- ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
- ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
- ckanapi_harvesters/builder/example/__init__.py +21 -0
- ckanapi_harvesters/builder/example/builder_example.py +21 -0
- ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
- ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
- ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
- ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
- ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
- ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
- ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
- ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
- ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
- ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
- ckanapi_harvesters/builder/mapper_datastore.py +93 -0
- ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
- ckanapi_harvesters/builder/specific/__init__.py +11 -0
- ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
- ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
- ckanapi_harvesters/ckan_api/__init__.py +20 -0
- ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
- ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
- ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
- ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
- ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
- ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
- ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
- ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
- ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
- ckanapi_harvesters/harvesters/__init__.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
- ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
- ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
- ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
- ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
- ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
- ckanapi_harvesters/harvesters/harvester_init.py +30 -0
- ckanapi_harvesters/harvesters/harvester_model.py +49 -0
- ckanapi_harvesters/harvesters/harvester_params.py +323 -0
- ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
- ckanapi_harvesters/harvesters/postgre_params.py +86 -0
- ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
- ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
- ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
- ckanapi_harvesters/policies/__init__.py +20 -0
- ckanapi_harvesters/policies/data_format_policy.py +269 -0
- ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
- ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
- ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
- ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
- ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
- ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
- ckanapi_harvesters/reports/__init__.py +11 -0
- ckanapi_harvesters/reports/admin_report.py +292 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
- ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
- ckanapi_harvesters/divider/__init__.py +0 -27
- ckanapi_harvesters/divider/divider.py +0 -53
- ckanapi_harvesters/divider/divider_error.py +0 -59
- ckanapi_harvesters/main.py +0 -30
- ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
ckanapi_harvesters/harvesters/postgre_harvester.py (new file)
@@ -0,0 +1,495 @@
#!python3
# -*- coding: utf-8 -*-
"""
Harvest from a PostgreSQL database using sqlalchemy
"""
from typing import Union, List, Any, Dict
from types import SimpleNamespace
from collections import OrderedDict
import urllib.parse

import pandas as pd

try:
    import sqlalchemy
    import psycopg2
except ImportError:
    sqlalchemy = SimpleNamespace(Engine=None, Connection=None)
    psycopg2 = None


from ckanapi_harvesters.harvesters.harvester_errors import (HarvesterRequirementError, HarvesterArgumentRequiredError)
from ckanapi_harvesters.harvesters.harvester_abc import TableHarvesterABC, DatasetHarvesterABC, DatabaseHarvesterABC
from ckanapi_harvesters.harvesters.harvester_model import FieldMetadata, TableMetadata, DatasetMetadata
from ckanapi_harvesters.harvesters.harvester_params import DatabaseParams
from ckanapi_harvesters.harvesters.postgre_params import DatasetParamsPostgreSchema, TableParamsPostgre
from ckanapi_harvesters.auxiliary.urls import url_join, url_insert_login
from ckanapi_harvesters.auxiliary.ckan_auxiliary import ssl_arguments_decompose
from ckanapi_harvesters.auxiliary.ckan_auxiliary import parse_geometry_native_type
from ckanapi_harvesters.auxiliary.ckan_errors import UrlError
from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_upload_2_geom import CkanDataCleanerUploadGeom

postgre_type_mapper = {}


class DatabaseHarvesterPostgre(DatabaseHarvesterABC):
    """
    This class manages the connection to a PostgreSQL database server.
    It can list schemas (corresponding to CKAN datasets).
    """
    def __init__(self, params: DatabaseParams = None):
        super().__init__(params)
        if sqlalchemy.Engine is None:
            raise HarvesterRequirementError("sqlalchemy", "postgre")
        if psycopg2 is None:
            raise HarvesterRequirementError("psycopg2", "postgre")
        self.alchemy_engine: Union[sqlalchemy.Engine, None] = None
        self.alchemy_connection: Union[sqlalchemy.Connection, None] = None
        if self.params.auth_url is None and self.params.host is None and self.params.port is None and self.params.database is None:
            raise HarvesterArgumentRequiredError("auth-url", "postgre", "This argument defines the url used to authenticate.")

    @staticmethod
    def init_from_options_string(options_string: str, base_dir: str = None) -> "DatabaseHarvesterPostgre":
        params = DatabaseParams()
        params.parse_options_string(options_string, base_dir=base_dir)
        return DatabaseHarvesterPostgre(params)

    def copy(self, *, dest=None):
        if dest is None:
            dest = DatabaseHarvesterPostgre()
        return super().copy(dest=dest)

    def connect(self, *, cancel_if_connected: bool = True) -> Any:
        if cancel_if_connected and self.alchemy_engine is not None:
            return self.alchemy_engine
        else:
            if self.alchemy_engine is not None:
                self.alchemy_connection.close()
                self.alchemy_engine.dispose()
                self.alchemy_connection = None
                self.alchemy_engine = None
            ssl, ssl_certfile = ssl_arguments_decompose(self.params.verify_ca)
            auth_url = self.params.auth_url
            if auth_url is None:
                if self.params.url is not None:
                    auth_url = self.params.url
                elif self.params.host is not None:
                    auth_url = f"postgresql+psycopg2://{self.params.host}"
                    if self.params.port is not None:
                        auth_url += f":{self.params.port}"
                else:
                    raise UrlError("No Postgre URL provided")
                if self.params.auth_url_suffix is not None:
                    auth_url = url_join(auth_url, self.params.auth_url_suffix)
                elif self.params.database is not None:
                    auth_url = url_join(auth_url, self.params.database)
                self.params.auth_url = auth_url
            auth_url_with_login = url_insert_login(auth_url, self.params.login)
            self.alchemy_engine = sqlalchemy.create_engine(auth_url_with_login)
            # ssl=ssl, tlscafile=ssl_certfile,
            # timeoutMS=self.params.timeout*1000.0 if self.params.timeout is not None else None)
            self.alchemy_connection = self.alchemy_engine.connect()
            if self.params.host is None and self.params.port is None:
                # complete with host and port parsed by sqlalchemy
                parsed_url = urllib.parse.urlparse(auth_url)
                self.params.host, self.params.port = parsed_url.hostname, parsed_url.port
            return self.alchemy_engine

    def is_connected(self) -> bool:
        return self.alchemy_engine is not None

    def disconnect(self) -> None:
        if self.alchemy_engine is not None:
            self.alchemy_connection.close()
            self.alchemy_engine.dispose()
            self.alchemy_engine = None
            self.alchemy_connection = None

    def check_connection(self, *, new_connection: bool = False, raise_error: bool = False) -> Union[None, ContextErrorLevelMessage]:
        try:
            self.connect(cancel_if_connected=not new_connection)
            remote_collections = self.list_datasets(return_metadata=False)
        except Exception as e:
            if raise_error:
                raise e from e
            else:
                return ContextErrorLevelMessage("Postgre Harvester", ErrorLevel.Error, f"Failed to connect to {self.params.auth_url}: {e}")

    def get_dataset_harvester(self, dataset_name: str) -> "DatasetHarvesterPostgre":
        params_dataset = self.params.copy(dest=DatasetParamsPostgreSchema())
        params_dataset.dataset = dataset_name
        dataset_harvester = DatasetHarvesterPostgre(params_dataset)
        self.copy(dest=dataset_harvester)
        dataset_harvester.params = params_dataset
        dataset_harvester._finalize_connection()
        return dataset_harvester

    def list_datasets(self, return_metadata: bool = True) -> Union[List[str], OrderedDict[str, DatasetMetadata]]:
        self.connect()
        query = "SELECT schema_name FROM information_schema.schemata;"
        df_schemas = pd.read_sql(query, self.alchemy_engine)
        dataset_list = df_schemas["schema_name"].tolist()
        if return_metadata:
            return OrderedDict([(name, self.get_dataset_harvester(name).query_dataset_metadata()) for name in dataset_list])
        else:
            return dataset_list


class DatasetHarvesterPostgre(DatabaseHarvesterPostgre, DatasetHarvesterABC):
    """
    A CKAN dataset corresponds to a PostgreSQL schema (set of tables).
    """
    def __init__(self, params: DatasetParamsPostgreSchema = None):
        super().__init__(params)
        self.dataset_metadata: Union[DatasetMetadata, None] = None  # DatasetHarvesterABC
        if self.params.dataset is None:
            raise HarvesterArgumentRequiredError("dataset", "postgre", "This argument defines the Postgre schema to be used")

    @staticmethod
    def init_from_options_string(options_string: str, base_dir: str = None) -> "DatasetHarvesterPostgre":
        params = DatasetParamsPostgreSchema()
        params.parse_options_string(options_string, base_dir=base_dir)
        return DatasetHarvesterPostgre(params)

    def _finalize_connection(self):
        if super().is_connected():
            pass

    def connect(self, *, cancel_if_connected: bool = True) -> Any:
        if not (cancel_if_connected and self.is_connected()):
            super().connect(cancel_if_connected=cancel_if_connected)
            self._finalize_connection()
        return self.alchemy_connection

    def is_connected(self) -> bool:
        return super().is_connected()

    def disconnect(self) -> None:
        if super().is_connected():
            super().disconnect()

    def check_connection(self, *, new_connection: bool = False, raise_error: bool = False) -> Union[None, ContextErrorLevelMessage]:
        try:
            super().check_connection(new_connection=new_connection, raise_error=raise_error)
            tables_list = self.list_tables(return_metadata=False)
        except Exception as e:
            if raise_error:
                raise e from e
            else:
                return ContextErrorLevelMessage("Postgre Harvester", ErrorLevel.Error,
                                                f"Failed to connect to {self.params.auth_url}: {e}")

    def query_dataset_metadata(self, cancel_if_present: bool = True) -> DatasetMetadata:
        self.connect()
        if cancel_if_present and self.dataset_metadata is not None:
            return self.dataset_metadata
        else:
            # query schema comment
            postgre_schema_name = self.params.dataset
            query = f"""
            SELECT
                n.nspname AS {postgre_schema_name},
                d.description AS schema_comment
            FROM
                pg_namespace n
            LEFT JOIN
                pg_description d ON n.oid = d.objoid
            WHERE
                n.nspname = '{postgre_schema_name}';
            """
            table_df = pd.read_sql(query, self.alchemy_engine)
            schema_comment = table_df.iloc[0]['schema_comment']
            self.dataset_metadata = DatasetMetadata()
            self.dataset_metadata.name = self.params.dataset
            self.dataset_metadata.description = schema_comment
            self.dataset_metadata.tables = self.list_tables(return_metadata=True)
            return self.dataset_metadata

    def clean_dataset_metadata(self) -> DatasetMetadata:
        return self.query_dataset_metadata().copy()

    def get_table_harvester(self, table_name: str) -> "TableHarvesterPostgre":
        params_table = self.params.copy(dest=TableParamsPostgre())
        if self.params.options_string is not None:
            # reparse options_string for table-specific arguments
            params_table.parse_options_string(self.params.options_string, base_dir=self.params.base_dir)
        params_table.table = table_name
        table_harvester = TableHarvesterPostgre(params_table)
        self.copy(dest=table_harvester)
        table_harvester.params = params_table
        table_harvester._finalize_connection()
        return table_harvester

    def list_tables(self, return_metadata: bool = True) -> Union[List[str], OrderedDict[str, TableMetadata]]:
        self.connect()
        postgre_schema_name = self.params.dataset
        query = f"""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = '{postgre_schema_name}'
            AND table_type = 'BASE TABLE';
        """
        df_tables = pd.read_sql(query, self.alchemy_engine)
        tables_list = df_tables["table_name"].tolist()
        if return_metadata:
            return OrderedDict([(name, self.get_table_harvester(name).query_table_metadata()) for name in tables_list])
        else:
            return tables_list


class TableHarvesterPostgre(DatasetHarvesterPostgre, TableHarvesterABC):
    """
    A CKAN table (DataStore) corresponds to a PostgreSQL table.
    """
    _default_upload_fun = None
    _default_primary_key = None

    def __init__(self, params: TableParamsPostgre = None):
        super().__init__(params)
        self.params: TableParamsPostgre = params
        self.table_metadata: Union[TableMetadata, None] = None  # TableHarvesterABC
        if self.params.file_url_attr is not None:
            # File/URL attribute has priority over CLI
            self.params.table = self.params.file_url_attr
        if self.params.table is None:
            raise HarvesterArgumentRequiredError("table", "postgre", "This argument defines the Postgre table used")

    @staticmethod
    def init_from_options_string(options_string: str, *, base_dir: str = None, file_url_attr: str = None) -> "TableHarvesterPostgre":
        params = TableParamsPostgre()
        params.parse_options_string(options_string, file_url_attr=file_url_attr, base_dir=base_dir)
        return TableHarvesterPostgre(params)

    def copy(self, *, dest=None):
        if dest is None:
            dest = TableHarvesterPostgre()
        super().copy(dest=dest)
        return dest

    def disconnect(self) -> None:
        if super().is_connected():
            super().disconnect()

    def _finalize_connection(self):
        super()._finalize_connection()
        if super().is_connected():
            pass

    def connect(self, *, cancel_if_connected: bool = True) -> Any:
        if not (cancel_if_connected and self.is_connected()):
            super().connect()
            self._finalize_connection()
        return self.alchemy_engine

    def check_connection(self, *, new_connection: bool = False, raise_error: bool = False) -> Union[None, ContextErrorLevelMessage]:
        super().check_connection(new_connection=new_connection, raise_error=raise_error)

    def query_table_metadata(self, cancel_if_present: bool = True) -> TableMetadata:
        self.connect()
        if cancel_if_present and self.table_metadata is not None:
            return self.table_metadata
        else:
            postgre_schema_name = self.params.dataset
            postgre_table_name = self.params.table
            # request comment on table
            query = f"""
            SELECT
                obj_description('{postgre_schema_name}.{postgre_table_name}'::regclass) AS table_comment;
            """
            table_df = pd.read_sql(query, self.alchemy_engine)
            table_comment = table_df.iloc[0]['table_comment']
            # request information on fields
            query = f"""
            SELECT
                cols.column_name,
                cols.ordinal_position,
                cols.is_nullable = 'NO' AS is_not_null,
                cols.data_type AS apparent_data_type,
                COALESCE(pt.typname, cols.data_type) AS full_data_type, -- Use user-defined type name if available
                pgd.description AS column_comment,
                EXISTS (
                    SELECT 1
                    FROM pg_index i
                    JOIN pg_attribute a ON a.attnum = ANY(i.indkey)
                    WHERE i.indrelid = ('{postgre_schema_name}.' || cols.table_name)::regclass
                        AND a.attname = cols.column_name
                ) AS is_indexed,
                con.constraint_type = 'UNIQUE' AS is_unique
            FROM
                information_schema.columns AS cols
            LEFT JOIN
                pg_catalog.pg_statio_all_tables AS st
                ON cols.table_schema = st.schemaname AND cols.table_name = st.relname
            LEFT JOIN
                pg_catalog.pg_description AS pgd
                ON pgd.objoid = st.relid AND pgd.objsubid = cols.ordinal_position
            LEFT JOIN
                information_schema.key_column_usage AS kcu
                ON cols.table_schema = kcu.table_schema
                AND cols.table_name = kcu.table_name
                AND cols.column_name = kcu.column_name
            LEFT JOIN
                information_schema.table_constraints AS con
                ON kcu.constraint_name = con.constraint_name
                AND kcu.table_schema = con.table_schema
            LEFT JOIN
                pg_type pt
                ON cols.udt_name = pt.typname -- Match user-defined type name
            LEFT JOIN
                pg_namespace pn
                ON pt.typnamespace = pn.oid
            WHERE
                cols.table_schema = '{postgre_schema_name}'
                AND cols.table_name = '{postgre_table_name}'
            ORDER BY
                cols.ordinal_position;
            """
            # DataFrame with columns: ["column_name", "order", "is_not_null", "apparent_data_type", "full_data_type", "description", "is_indexed", "is_unique"]
            fields_df = pd.read_sql(query, self.alchemy_engine)
            fields_df.set_index("column_name", inplace=True, drop=False, verify_integrity=True)
            # querying details on column types
            fields_df["definitive_data_type"] = fields_df["full_data_type"]
            fields_df["geo_type"] = ""
            fields_df["srid"] = 0
            # PostGIS geometry type
            if any(fields_df["full_data_type"] == "geometry"):
                query = f"""
                SELECT
                    f_geometry_column,
                    type,
                    srid
                FROM geometry_columns
                WHERE f_table_schema = '{postgre_schema_name}'
                    AND f_table_name = '{postgre_table_name}';
                """
                geo_df = pd.read_sql(query, self.alchemy_engine)
                for index, row in geo_df.iterrows():
                    column_name = row["f_geometry_column"]
                    fields_df.loc[column_name, "definitive_data_type"] = f"geometry({row['type']}, {row['srid']})"
                    fields_df.loc[column_name, "geo_type"] = row['type']
                    fields_df.loc[column_name, "geo_srid"] = row['srid']
            # query primary key
            query = f"""
            SELECT
                a.attname AS column_name
            FROM
                pg_index i
            JOIN
                pg_attribute a ON a.attnum = ANY(i.indkey)
            WHERE
                i.indrelid = '{postgre_schema_name}.{postgre_table_name}'::regclass
                AND i.indisprimary;
            """
            primary_key_df = pd.read_sql(query, self.alchemy_engine)
            primary_key = primary_key_df["column_name"].tolist()
            if len(primary_key) == 0:
                primary_key = None
            self.table_metadata = TableMetadata()
            self.table_metadata.name = self.params.table
            self.table_metadata.description = table_comment
            self.table_metadata.fields = OrderedDict()
            for index, row in fields_df.iterrows():
                field_metadata = FieldMetadata()
                field_metadata.name = row["column_name"]
                field_metadata.data_type = row["definitive_data_type"]
                field_metadata.harvester_attrs["datatype_keyword"] = row["full_data_type"]
                field_metadata.internal_attrs.geometry_as_source = row["geo_srid"] > 0
                field_metadata.internal_attrs.geometry_type = row["geo_type"] if row["geo_type"] else None
                field_metadata.internal_attrs.epsg_source = row["geo_srid"] if row["geo_srid"] > 0 else None
                field_metadata.internal_attrs.init_from_native_type(field_metadata.data_type)
                field_metadata.description = row["column_comment"]
                field_metadata.uniquekey = row["is_unique"] if row["is_unique"] is not None else False
                field_metadata.is_index = row["is_indexed"]
                field_metadata.notnull = row["is_not_null"]
                self.table_metadata.fields[field_metadata.name] = field_metadata
            if primary_key is None:
                # first field with unicity can be used as primary key
                primary_key = [field_metadata.name for field_metadata in self.table_metadata.fields.values() if field_metadata.uniquekey]
                if len(primary_key) > 0:
                    primary_key = primary_key[0]
                else:
                    primary_key = None
            self.table_metadata.primary_key = primary_key
            self.table_metadata.indexes = [field_metadata.name for field_metadata in self.table_metadata.fields.values() if field_metadata.is_index]
            return self.table_metadata

    def _data_type_map_to_ckan(self, field_metadata: FieldMetadata) -> None:
        """
        Some data types need to be translated
        """
        if field_metadata.harvester_attrs["datatype_keyword"] == "geometry":
            if self.params.ckan_postgis:
                if self.params.ckan_default_target_epsg is not None:
                    # TODO: at this point, the ckan_default_target_epsg does not inherit from ckan
                    geometry_type, geo_epsg = parse_geometry_native_type(field_metadata.data_type)
                    field_metadata.data_type = f"geometry({geometry_type},{self.params.ckan_default_target_epsg})"
                    field_metadata.internal_attrs.init_from_native_type(field_metadata.data_type)
            else:
                field_metadata.data_type = "json"
        return

    def _get_field_query_function(self, field_metadata: FieldMetadata) -> str:
        """
        Force some data types to return as text
        """
        if field_metadata.harvester_attrs["datatype_keyword"] == "geometry":
            if self.params.ckan_postgis:
                return f"{field_metadata.name}"  # TODO: test if transfer is successful without converting to a GeoJSON string
            else:
                return f"ST_AsGeoJSON({field_metadata.name})"
        elif field_metadata.data_type == "jsonb":
            return f"{field_metadata.name}::text"
        else:
            return field_metadata.name

    def clean_table_metadata(self) -> TableMetadata:
        table_metadata = self.query_table_metadata().copy()
        for field_metadata in table_metadata.fields.values():
            self._data_type_map_to_ckan(field_metadata)
        return table_metadata

    def update_from_ckan(self, ckan):
        super().update_from_ckan(ckan)
        for field_name, field_metadata in self.table_metadata.fields.items():
            field_metadata.internal_attrs.update_from_ckan(ckan)

    def get_default_primary_key(self) -> List[str]:
        table_metadata = self.query_table_metadata()
        return table_metadata.primary_key

    def _get_sql_fields_query(self):
        return ", ".join([self._get_field_query_function(field_metadata) for field_metadata in self.table_metadata.fields.values()])

    def get_default_data_cleaner(self) -> CkanDataCleanerABC:
        data_cleaner = CkanDataCleanerUploadGeom()
        return data_cleaner

    def list_queries(self, *, new_connection: bool = False) -> List[str]:
        self.connect(cancel_if_connected=not new_connection)
        postgre_schema_name = self.params.dataset
        postgre_table_name = self.params.table
        if self.params.verbose_harvester:
            print(f"Counting documents of table {self.params.table}")
        count_query = f"SELECT COUNT(*) FROM {postgre_schema_name}.{postgre_table_name}"
        if self.params.query_string is not None:
            count_query += " " + self.params.query_string
        count_df = pd.read_sql(count_query, self.alchemy_engine)
        num_rows = count_df["count"].iloc[0]
        fields_query = self._get_sql_fields_query()
        request_query_base = f"SELECT {fields_query} FROM {postgre_schema_name}.{postgre_table_name}"
        if self.params.query_string is not None:
            request_query_base += " " + self.params.query_string
        num_queries = num_rows // self.params.limit + 1
        if self.params.single_request:
            return [f"{request_query_base} LIMIT {self.params.limit} OFFSET {i * self.params.limit}" for i in range(1)]
        else:
            queries_exact = [f"{request_query_base} LIMIT {self.params.limit} OFFSET {i * self.params.limit}" for i in range(num_queries)]
            query_extra = f"{request_query_base} LIMIT {self.params.limit} OFFSET {num_queries * self.params.limit}"
            return queries_exact + [query_extra]

    def query_data(self, query: Dict[str, Any]) -> pd.DataFrame:
        df = pd.read_sql(query, self.alchemy_engine)
        return df
ckanapi_harvesters/harvesters/postgre_params.py (new file)
@@ -0,0 +1,86 @@
#!python3
# -*- coding: utf-8 -*-
"""
Harvest from a PostgreSQL database
"""
import argparse

from ckanapi_harvesters.harvesters.harvester_params import DatasetParams, TableParams


class DatasetParamsPostgreSchema(DatasetParams):
    """
    A CKAN dataset corresponds to a PostgreSQL schema (set of tables).
    This subclass of DatasetParams implements an alias attribute for dataset name called schema.
    """
    def __init__(self, source: "DatasetParamsPostgreSchema" = None):
        super().__init__(source)
        if source is not None:
            source.copy(dest=self)

    # alias property for the dataset name setting: schema in PostgreSQL
    @property
    def schema(self) -> str:
        return self.dataset
    @schema.setter
    def schema(self, value: str):
        self.dataset = value

    def copy(self, *, dest=None):
        if dest is None:
            dest = DatasetParamsPostgreSchema()
        super().copy(dest=dest)
        return dest

    @staticmethod
    def setup_cli_harvester_parser(parser: argparse.ArgumentParser = None) -> argparse.ArgumentParser:
        parser = DatasetParams.setup_cli_harvester_parser(parser=parser)
        parser.add_argument("--schema", type=str,
                            help="PostgreSQL schema name")
        # parser.add_argument("--dataset", help=argparse.SUPPRESS)  # do not display in help ==> conflict
        return parser

    def initialize_from_cli_args(self, args: argparse.Namespace, base_dir: str = None, error_not_found: bool = True,
                                 default_proxies: dict = None, proxy_headers: dict = None) -> None:
        super().initialize_from_cli_args(args, base_dir=base_dir, error_not_found=error_not_found,
                                         default_proxies=default_proxies, proxy_headers=proxy_headers)
        if args.schema is not None:
            self.schema = args.schema


class TableParamsPostgre(TableParams):  # , DatasetParamsPostgreSchema):
    def __init__(self, source: "TableParamsPostgre" = None):
        super().__init__(source)

    def copy(self, *, dest=None):
        if dest is None:
            dest = TableParamsPostgre()
        super().copy(dest=dest)
        return dest

    # DatasetParamsPostgreSchema:
    @property
    def schema(self) -> str:
        return self.dataset
    @schema.setter
    def schema(self, value: str):
        self.dataset = value

    @staticmethod
    def setup_cli_harvester_parser(parser: argparse.ArgumentParser = None) -> argparse.ArgumentParser:
        parser = TableParams.setup_cli_harvester_parser(parser=parser)
        # DatasetParamsPostgreSchema:
        # parser = DatasetParamsPostgreSchema.setup_cli_harvester_parser(parser=parser):
        parser.add_argument("--schema", type=str,
                            help="PostgreSQL schema name")
        # parser.add_argument("--dataset", help=argparse.SUPPRESS)  # do not display in help ==> conflict
        return parser

    def initialize_from_cli_args(self, args: argparse.Namespace, base_dir: str = None, error_not_found: bool = True,
                                 default_proxies: dict = None, proxy_headers: dict = None) -> None:
        super().initialize_from_cli_args(args, base_dir=base_dir, error_not_found=error_not_found,
                                         default_proxies=default_proxies, proxy_headers=proxy_headers)
        # DatasetParamsPostgreSchema:
        if args.schema is not None:
            self.schema = args.schema
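Both parameter classes expose the schema alias on the command line as well: --schema writes through to the underlying dataset attribute. A minimal sketch of the round trip, not shipped with the package; it assumes the TableParams base method creates a fresh ArgumentParser when none is passed and that initialize_from_cli_args tolerates the base parser's defaults:

    from ckanapi_harvesters.harvesters.postgre_params import TableParamsPostgre

    # build a parser carrying the base TableParams flags plus --schema
    parser = TableParamsPostgre.setup_cli_harvester_parser()
    args = parser.parse_args(["--schema", "public"])   # hypothetical invocation

    params = TableParamsPostgre()
    params.initialize_from_cli_args(args)
    assert params.dataset == "public"   # --schema lands on the dataset attribute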