ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
ckanapi_harvesters/harvesters/postgre_harvester.py
@@ -0,0 +1,495 @@
+#!python3
+# -*- coding: utf-8 -*-
+"""
+Harvest from a PostgreSQL database using sqlalchemy
+"""
+from typing import Union, List, Any, Dict
+from types import SimpleNamespace
+from collections import OrderedDict
+import urllib.parse
+
+import pandas as pd
+
+try:
+    import sqlalchemy
+    import psycopg2
+except ImportError:
+    sqlalchemy = SimpleNamespace(Engine=None, Connection=None)
+    psycopg2 = None
+
+
+from ckanapi_harvesters.harvesters.harvester_errors import (HarvesterRequirementError, HarvesterArgumentRequiredError)
+from ckanapi_harvesters.harvesters.harvester_abc import TableHarvesterABC, DatasetHarvesterABC, DatabaseHarvesterABC
+from ckanapi_harvesters.harvesters.harvester_model import FieldMetadata, TableMetadata, DatasetMetadata
+from ckanapi_harvesters.harvesters.harvester_params import DatabaseParams
+from ckanapi_harvesters.harvesters.postgre_params import DatasetParamsPostgreSchema, TableParamsPostgre
+from ckanapi_harvesters.auxiliary.urls import url_join, url_insert_login
+from ckanapi_harvesters.auxiliary.ckan_auxiliary import ssl_arguments_decompose
+from ckanapi_harvesters.auxiliary.ckan_auxiliary import parse_geometry_native_type
+from ckanapi_harvesters.auxiliary.ckan_errors import UrlError
+from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
+from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
+from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_upload_2_geom import CkanDataCleanerUploadGeom
+
+postgre_type_mapper = {}
+
+
+class DatabaseHarvesterPostgre(DatabaseHarvesterABC):
+    """
+    This class manages the connection to a PostgreSQL database server.
+    It can list schemas (corresponding to CKAN datasets).
+    """
+    def __init__(self, params:DatabaseParams=None):
+        super().__init__(params)
+        if sqlalchemy.Engine is None:
+            raise HarvesterRequirementError("sqlalchemy", "postgre")
+        if psycopg2 is None:
+            raise HarvesterRequirementError("psycopg2", "postgre")
+        self.alchemy_engine: Union[sqlalchemy.Engine,None] = None
+        self.alchemy_connection: Union[sqlalchemy.Connection,None] = None
+        if self.params.auth_url is None and self.params.host is None and self.params.port is None and self.params.database is None:
+            raise HarvesterArgumentRequiredError("auth-url", "postgre", "This argument defines the url used to authenticate.")
+
+    @staticmethod
+    def init_from_options_string(options_string:str, base_dir:str=None) -> "DatabaseHarvesterPostgre":
+        params = DatabaseParams()
+        params.parse_options_string(options_string, base_dir=base_dir)
+        return DatabaseHarvesterPostgre(params)
+
+    def copy(self, *, dest=None):
+        if dest is None:
+            dest = DatabaseHarvesterPostgre()
+        return super().copy(dest=dest)
+
+    def connect(self, *, cancel_if_connected:bool=True) -> Any:
+        if cancel_if_connected and self.alchemy_engine is not None:
+            return self.alchemy_engine
+        else:
+            if self.alchemy_engine is not None:
+                self.alchemy_connection.close()
+                self.alchemy_engine.dispose()
+                self.alchemy_connection = None
+                self.alchemy_engine = None
+            ssl, ssl_certfile = ssl_arguments_decompose(self.params.verify_ca)
+            auth_url = self.params.auth_url
+            if auth_url is None:
+                if self.params.url is not None:
+                    auth_url = self.params.url
+                elif self.params.host is not None:
+                    auth_url = f"postgresql+psycopg2://{self.params.host}"
+                    if self.params.port is not None:
+                        auth_url += f":{self.params.port}"
+                else:
+                    raise UrlError("No Postgre URL provided")
+                if self.params.auth_url_suffix is not None:
+                    auth_url = url_join(auth_url, self.params.auth_url_suffix)
+                elif self.params.database is not None:
+                    auth_url = url_join(auth_url, self.params.database)
+                self.params.auth_url = auth_url
+            auth_url_with_login = url_insert_login(auth_url, self.params.login)
+            self.alchemy_engine = sqlalchemy.create_engine(auth_url_with_login)
+            # ssl=ssl, tlscafile=ssl_certfile,
+            # timeoutMS=self.params.timeout*1000.0 if self.params.timeout is not None else None)
+            self.alchemy_connection = self.alchemy_engine.connect()
+            if self.params.host is None and self.params.port is None:
+                # complete with host and port parsed by sqlalchemy
+                parsed_url = urllib.parse.urlparse(auth_url)
+                self.params.host, self.params.port = parsed_url.hostname, parsed_url.port
+            return self.alchemy_engine
+
+    def is_connected(self) -> bool:
+        return self.alchemy_engine is not None
+
+    def disconnect(self) -> None:
+        if self.alchemy_engine is not None:
+            self.alchemy_connection.close()
+            self.alchemy_engine.dispose()
+            self.alchemy_engine = None
+            self.alchemy_connection = None
+
+    def check_connection(self, *, new_connection:bool=False, raise_error:bool=False) -> Union[None, ContextErrorLevelMessage]:
+        try:
+            self.connect(cancel_if_connected=not new_connection)
+            remote_collections = self.list_datasets(return_metadata=False)
+        except Exception as e:
+            if raise_error:
+                raise
+            else:
+                return ContextErrorLevelMessage("Postgre Harvester", ErrorLevel.Error, f"Failed to connect to {self.params.auth_url}: {e}")
+
+    def get_dataset_harvester(self, dataset_name: str) -> "DatasetHarvesterPostgre":
+        params_dataset = self.params.copy(dest=DatasetParamsPostgreSchema())
+        params_dataset.dataset = dataset_name
+        dataset_harvester = DatasetHarvesterPostgre(params_dataset)
+        self.copy(dest=dataset_harvester)
+        dataset_harvester.params = params_dataset
+        dataset_harvester._finalize_connection()
+        return dataset_harvester
+
+    def list_datasets(self, return_metadata: bool = True) -> Union[List[str], OrderedDict[str, DatasetMetadata]]:
+        self.connect()
+        query = "SELECT schema_name FROM information_schema.schemata;"
+        df_schemas = pd.read_sql(query, self.alchemy_engine)
+        dataset_list = df_schemas["schema_name"].tolist()
+        if return_metadata:
+            return OrderedDict([(name, self.get_dataset_harvester(name).query_dataset_metadata()) for name in dataset_list])
+        else:
+            return dataset_list
+
+
+class DatasetHarvesterPostgre(DatabaseHarvesterPostgre, DatasetHarvesterABC):
+    """
+    A CKAN dataset corresponds to a PostgreSQL schema (set of tables).
+    """
+    def __init__(self, params:DatasetParamsPostgreSchema=None):
+        super().__init__(params)
+        self.dataset_metadata: Union[DatasetMetadata, None] = None  # DatasetHarvesterABC
+        if self.params.dataset is None:
+            raise HarvesterArgumentRequiredError("dataset", "postgre", "This argument defines the Postgre schema to be used")
+
+    @staticmethod
+    def init_from_options_string(options_string:str, base_dir:str=None) -> "DatasetHarvesterPostgre":
+        params = DatasetParamsPostgreSchema()
+        params.parse_options_string(options_string, base_dir=base_dir)
+        return DatasetHarvesterPostgre(params)
+
+    def _finalize_connection(self):
+        if super().is_connected():
+            pass
+
+    def connect(self, *, cancel_if_connected:bool=True) -> Any:
+        if not (cancel_if_connected and self.is_connected()):
+            super().connect(cancel_if_connected=cancel_if_connected)
+            self._finalize_connection()
+        return self.alchemy_connection
+
+    def is_connected(self) -> bool:
+        return super().is_connected()
+
+    def disconnect(self) -> None:
+        if super().is_connected():
+            super().disconnect()
+
+    def check_connection(self, *, new_connection: bool = False, raise_error: bool = False) -> Union[None, ContextErrorLevelMessage]:
+        try:
+            super().check_connection(new_connection=new_connection, raise_error=raise_error)
+            tables_list = self.list_tables(return_metadata=False)
+        except Exception as e:
+            if raise_error:
+                raise
+            else:
+                return ContextErrorLevelMessage("Postgre Harvester", ErrorLevel.Error,
+                                                f"Failed to connect to {self.params.auth_url}: {e}")
+
+    def query_dataset_metadata(self, cancel_if_present:bool=True) -> DatasetMetadata:
+        self.connect()
+        if cancel_if_present and self.dataset_metadata is not None:
+            return self.dataset_metadata
+        else:
+            # query schema comment
+            postgre_schema_name = self.params.dataset
+            query = f"""
+                SELECT
+                    n.nspname AS schema_name,
+                    d.description AS schema_comment
+                FROM
+                    pg_namespace n
+                LEFT JOIN
+                    pg_description d ON n.oid = d.objoid
+                WHERE
+                    n.nspname = '{postgre_schema_name}';
+            """
+            table_df = pd.read_sql(query, self.alchemy_engine)
+            schema_comment = table_df.iloc[0]['schema_comment']
+            self.dataset_metadata = DatasetMetadata()
+            self.dataset_metadata.name = self.params.dataset
+            self.dataset_metadata.description = schema_comment
+            self.dataset_metadata.tables = self.list_tables(return_metadata=True)
+            return self.dataset_metadata
+
+    def clean_dataset_metadata(self) -> DatasetMetadata:
+        return self.query_dataset_metadata().copy()
+
+    def get_table_harvester(self, table_name:str) -> "TableHarvesterPostgre":
+        params_table = self.params.copy(dest=TableParamsPostgre())
+        if self.params.options_string is not None:
+            # reparse options_string for table-specific arguments
+            params_table.parse_options_string(self.params.options_string, base_dir=self.params.base_dir)
+        params_table.table = table_name
+        table_harvester = TableHarvesterPostgre(params_table)
+        self.copy(dest=table_harvester)
+        table_harvester.params = params_table
+        table_harvester._finalize_connection()
+        return table_harvester
+
+    def list_tables(self, return_metadata:bool=True) -> Union[List[str], OrderedDict[str, TableMetadata]]:
+        self.connect()
+        postgre_schema_name = self.params.dataset
+        query = f"""
+            SELECT table_name
+            FROM information_schema.tables
+            WHERE table_schema = '{postgre_schema_name}'
+            AND table_type = 'BASE TABLE';
+        """
+        df_tables = pd.read_sql(query, self.alchemy_engine)
+        tables_list = df_tables["table_name"].tolist()
+        if return_metadata:
+            return OrderedDict([(name, self.get_table_harvester(name).query_table_metadata()) for name in tables_list])
+        else:
+            return tables_list
+
+
+class TableHarvesterPostgre(DatasetHarvesterPostgre, TableHarvesterABC):
+    """
+    A CKAN table (DataStore) corresponds to a PostgreSQL table.
+    """
+    _default_upload_fun = None
+    _default_primary_key = None
+
+    def __init__(self, params:TableParamsPostgre=None):
+        super().__init__(params)
+        self.params: TableParamsPostgre = params
+        self.table_metadata: Union[TableMetadata, None] = None  # TableHarvesterABC
+        if self.params.file_url_attr is not None:
+            # File/URL attribute has priority over CLI
+            self.params.table = self.params.file_url_attr
+        if self.params.table is None:
+            raise HarvesterArgumentRequiredError("table", "postgre", "This argument defines the Postgre table used")
+
+    @staticmethod
+    def init_from_options_string(options_string:str, *, base_dir:str=None, file_url_attr:str=None) -> "TableHarvesterPostgre":
+        params = TableParamsPostgre()
+        params.parse_options_string(options_string, file_url_attr=file_url_attr, base_dir=base_dir)
+        return TableHarvesterPostgre(params)
+
+    def copy(self, *, dest=None):
+        if dest is None:
+            dest = TableHarvesterPostgre()
+        super().copy(dest=dest)
+        return dest
+
+    def disconnect(self) -> None:
+        if super().is_connected():
+            super().disconnect()
+
+    def _finalize_connection(self):
+        super()._finalize_connection()
+        if super().is_connected():
+            pass
+
+    def connect(self, *, cancel_if_connected:bool=True) -> Any:
+        if not (cancel_if_connected and self.is_connected()):
+            super().connect()
+            self._finalize_connection()
+        return self.alchemy_engine
+
+    def check_connection(self, *, new_connection:bool=False, raise_error:bool=False) -> Union[None, ContextErrorLevelMessage]:
+        return super().check_connection(new_connection=new_connection, raise_error=raise_error)
+
+    def query_table_metadata(self, cancel_if_present:bool=True) -> TableMetadata:
+        self.connect()
+        if cancel_if_present and self.table_metadata is not None:
+            return self.table_metadata
+        else:
+            postgre_schema_name = self.params.dataset
+            postgre_table_name = self.params.table
+            # request comment on table
+            query = f"""
+                SELECT
+                    obj_description('{postgre_schema_name}.{postgre_table_name}'::regclass) AS table_comment;
+            """
+            table_df = pd.read_sql(query, self.alchemy_engine)
+            table_comment = table_df.iloc[0]['table_comment']
+            # request information on fields
+            query = f"""
+                SELECT
+                    cols.column_name,
+                    cols.ordinal_position,
+                    cols.is_nullable = 'NO' AS is_not_null,
+                    cols.data_type AS apparent_data_type,
+                    COALESCE(pt.typname, cols.data_type) AS full_data_type, -- Use user-defined type name if available
+                    pgd.description AS column_comment,
+                    EXISTS (
+                        SELECT 1
+                        FROM pg_index i
+                        JOIN pg_attribute a ON a.attnum = ANY(i.indkey)
+                        WHERE i.indrelid = ('{postgre_schema_name}.' || cols.table_name)::regclass
+                        AND a.attname = cols.column_name
+                    ) AS is_indexed,
+                    con.constraint_type = 'UNIQUE' AS is_unique
+                FROM
+                    information_schema.columns AS cols
+                LEFT JOIN
+                    pg_catalog.pg_statio_all_tables AS st
+                    ON cols.table_schema = st.schemaname AND cols.table_name = st.relname
+                LEFT JOIN
+                    pg_catalog.pg_description AS pgd
+                    ON pgd.objoid = st.relid AND pgd.objsubid = cols.ordinal_position
+                LEFT JOIN
+                    information_schema.key_column_usage AS kcu
+                    ON cols.table_schema = kcu.table_schema
+                    AND cols.table_name = kcu.table_name
+                    AND cols.column_name = kcu.column_name
+                LEFT JOIN
+                    information_schema.table_constraints AS con
+                    ON kcu.constraint_name = con.constraint_name
+                    AND kcu.table_schema = con.table_schema
+                LEFT JOIN
+                    pg_type pt
+                    ON cols.udt_name = pt.typname -- Match user-defined type name
+                LEFT JOIN
+                    pg_namespace pn
+                    ON pt.typnamespace = pn.oid
+                WHERE
+                    cols.table_schema = '{postgre_schema_name}'
+                    AND cols.table_name = '{postgre_table_name}'
+                ORDER BY
+                    cols.ordinal_position;
+            """
+            # DataFrame with columns: ["column_name", "order", "is_not_null", "apparent_data_type", "full_data_type", "description", "is_indexed", "is_unique"]
+            fields_df = pd.read_sql(query, self.alchemy_engine)
+            fields_df.set_index("column_name", inplace=True, drop=False, verify_integrity=True)
+            # querying details on column types
+            fields_df["definitive_data_type"] = fields_df["full_data_type"]
+            fields_df["geo_type"] = ""
+            fields_df["geo_srid"] = 0
+            # PostGIS geometry type
+            if any(fields_df["full_data_type"] == "geometry"):
+                query = f"""
+                    SELECT
+                        f_geometry_column,
+                        type,
+                        srid
+                    FROM geometry_columns
+                    WHERE f_table_schema = '{postgre_schema_name}'
+                    AND f_table_name = '{postgre_table_name}';
+                """
+                geo_df = pd.read_sql(query, self.alchemy_engine)
+                for index, row in geo_df.iterrows():
+                    column_name = row["f_geometry_column"]
+                    fields_df.loc[column_name, "definitive_data_type"] = f"geometry({row['type']}, {row['srid']})"
+                    fields_df.loc[column_name, "geo_type"] = row['type']
+                    fields_df.loc[column_name, "geo_srid"] = row['srid']
+            # query primary key
+            query = f"""
+                SELECT
+                    a.attname AS column_name
+                FROM
+                    pg_index i
+                JOIN
+                    pg_attribute a ON a.attnum = ANY(i.indkey)
+                WHERE
+                    i.indrelid = '{postgre_schema_name}.{postgre_table_name}'::regclass
+                    AND i.indisprimary;
+            """
+            primary_key_df = pd.read_sql(query, self.alchemy_engine)
+            primary_key = primary_key_df["column_name"].tolist()
+            if len(primary_key) == 0:
+                primary_key = None
+            self.table_metadata = TableMetadata()
+            self.table_metadata.name = self.params.table
+            self.table_metadata.description = table_comment
+            self.table_metadata.fields = OrderedDict()
+            for index, row in fields_df.iterrows():
+                field_metadata = FieldMetadata()
+                field_metadata.name = row["column_name"]
+                field_metadata.data_type = row["definitive_data_type"]
+                field_metadata.harvester_attrs["datatype_keyword"] = row["full_data_type"]
+                field_metadata.internal_attrs.geometry_as_source = row["geo_srid"] > 0
+                field_metadata.internal_attrs.geometry_type = row["geo_type"] if row["geo_type"] else None
+                field_metadata.internal_attrs.epsg_source = row["geo_srid"] if row["geo_srid"] > 0 else None
+                field_metadata.internal_attrs.init_from_native_type(field_metadata.data_type)
+                field_metadata.description = row["column_comment"]
+                field_metadata.uniquekey = row["is_unique"] if row["is_unique"] is not None else False
+                field_metadata.is_index = row["is_indexed"]
+                field_metadata.notnull = row["is_not_null"]
+                self.table_metadata.fields[field_metadata.name] = field_metadata
+            if primary_key is None:
+                # first field with unicity can be used as primary key
+                primary_key = [field_metadata.name for field_metadata in self.table_metadata.fields.values() if field_metadata.uniquekey]
+                if len(primary_key) > 0:
+                    primary_key = primary_key[0]
+                else:
+                    primary_key = None
+            self.table_metadata.primary_key = primary_key
+            self.table_metadata.indexes = [field_metadata.name for field_metadata in self.table_metadata.fields.values() if field_metadata.is_index]
+            return self.table_metadata
+
+    def _data_type_map_to_ckan(self, field_metadata:FieldMetadata) -> None:
+        """
+        Some data types need to be translated
+        """
+        if field_metadata.harvester_attrs["datatype_keyword"] == "geometry":
+            if self.params.ckan_postgis:
+                if self.params.ckan_default_target_epsg is not None:
+                    # TODO: at this point, the ckan_default_target_epsg does not inherit from ckan
+                    geometry_type, geo_epsg = parse_geometry_native_type(field_metadata.data_type)
+                    field_metadata.data_type = f"geometry({geometry_type},{self.params.ckan_default_target_epsg})"
+                    field_metadata.internal_attrs.init_from_native_type(field_metadata.data_type)
+            else:
+                field_metadata.data_type = "json"
+        return
+
+    def _get_field_query_function(self, field_metadata: FieldMetadata) -> str:
+        """
+        Force some data types to return as text
+        """
+        if field_metadata.harvester_attrs["datatype_keyword"] == "geometry":
+            if self.params.ckan_postgis:
+                return f"{field_metadata.name}"  # TODO: test if transfer is successful without converting to a GeoJSON string
+            else:
+                return f"ST_AsGeoJSON({field_metadata.name})"
+        elif field_metadata.data_type == "jsonb":
+            return f"{field_metadata.name}::text"
+        else:
+            return field_metadata.name
+
+    def clean_table_metadata(self) -> TableMetadata:
+        table_metadata = self.query_table_metadata().copy()
+        for field_metadata in table_metadata.fields.values():
+            self._data_type_map_to_ckan(field_metadata)
+        return table_metadata
+
+    def update_from_ckan(self, ckan):
+        super().update_from_ckan(ckan)
+        for field_name, field_metadata in self.table_metadata.fields.items():
+            field_metadata.internal_attrs.update_from_ckan(ckan)
+
+    def get_default_primary_key(self) -> List[str]:
+        table_metadata = self.query_table_metadata()
+        return table_metadata.primary_key
+
+    def _get_sql_fields_query(self):
+        return ", ".join([self._get_field_query_function(field_metadata) for field_metadata in self.table_metadata.fields.values()])
+
+    def get_default_data_cleaner(self) -> CkanDataCleanerABC:
+        data_cleaner = CkanDataCleanerUploadGeom()
+        return data_cleaner
+
+    def list_queries(self, *, new_connection:bool=False) -> List[str]:
+        self.connect(cancel_if_connected=not new_connection)
+        postgre_schema_name = self.params.dataset
+        postgre_table_name = self.params.table
+        if self.params.verbose_harvester:
+            print(f"Counting documents of table {self.params.table}")
+        count_query = f"SELECT COUNT(*) FROM {postgre_schema_name}.{postgre_table_name}"
+        if self.params.query_string is not None:
+            count_query += " " + self.params.query_string
+        count_df = pd.read_sql(count_query, self.alchemy_engine)
+        num_rows = count_df["count"].iloc[0]
+        fields_query = self._get_sql_fields_query()
+        request_query_base = f"SELECT {fields_query} FROM {postgre_schema_name}.{postgre_table_name}"
+        if self.params.query_string is not None:
+            request_query_base += " " + self.params.query_string
+        num_queries = num_rows // self.params.limit + 1
+        if self.params.single_request:
+            return [f"{request_query_base} LIMIT {self.params.limit} OFFSET {i * self.params.limit}" for i in range(1)]
+        else:
+            queries_exact = [f"{request_query_base} LIMIT {self.params.limit} OFFSET {i * self.params.limit}" for i in range(num_queries)]
+            query_extra = f"{request_query_base} LIMIT {self.params.limit} OFFSET {num_queries * self.params.limit}"
+            return queries_exact + [query_extra]
+
+    def query_data(self, query: str) -> pd.DataFrame:
+        df = pd.read_sql(query, self.alchemy_engine)
+        return df
+
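The hunk above (ckanapi_harvesters/harvesters/postgre_harvester.py) maps PostgreSQL onto CKAN: DatabaseHarvesterPostgre manages the server connection, a schema becomes a CKAN dataset (DatasetHarvesterPostgre), a table becomes a DataStore resource (TableHarvesterPostgre), and bulk extraction is paginated with LIMIT/OFFSET queries. A minimal usage sketch, assuming a reachable PostgreSQL server and default TableParams settings; the connection URL, schema, and table names are placeholders, not values from this package:

```python
# Hypothetical usage sketch of the classes added above; the connection URL,
# schema name, and table name are placeholders, not values from this package.
from ckanapi_harvesters.harvesters.harvester_params import DatabaseParams
from ckanapi_harvesters.harvesters.postgre_harvester import DatabaseHarvesterPostgre

params = DatabaseParams()
params.auth_url = "postgresql+psycopg2://user:secret@localhost:5432/mydb"  # placeholder
db = DatabaseHarvesterPostgre(params)
db.connect()

print(db.list_datasets(return_metadata=False))       # PostgreSQL schema names

dataset = db.get_dataset_harvester("public")         # schema -> CKAN dataset
table = dataset.get_table_harvester("observations")  # table -> DataStore resource
metadata = table.query_table_metadata()              # comments, types, keys, indexes

for query in table.list_queries():                   # LIMIT/OFFSET pagination
    df = table.query_data(query)                     # one pandas DataFrame per page

db.disconnect()
```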
ckanapi_harvesters/harvesters/postgre_params.py
@@ -0,0 +1,86 @@
+#!python3
+# -*- coding: utf-8 -*-
+"""
+Harvest from a PostgreSQL database
+"""
+import argparse
+
+from ckanapi_harvesters.harvesters.harvester_params import DatasetParams, TableParams
+
+
+class DatasetParamsPostgreSchema(DatasetParams):
+    """
+    A CKAN dataset corresponds to a PostgreSQL schema (set of tables).
+    This subclass of DatasetParams implements an alias attribute for dataset name called schema.
+    """
+    def __init__(self, source: "DatasetParamsPostgreSchema" = None):
+        super().__init__(source)
+        if source is not None:
+            source.copy(dest=self)
+
+    # alias property for the dataset name setting: schema in PostgreSQL
+    @property
+    def schema(self) -> str:
+        return self.dataset
+    @schema.setter
+    def schema(self, value: str):
+        self.dataset = value
+
+    def copy(self, *, dest=None):
+        if dest is None:
+            dest = DatasetParamsPostgreSchema()
+        super().copy(dest=dest)
+        return dest
+
+    @staticmethod
+    def setup_cli_harvester_parser(parser: argparse.ArgumentParser = None) -> argparse.ArgumentParser:
+        parser = DatasetParams.setup_cli_harvester_parser(parser=parser)
+        parser.add_argument("--schema", type=str,
+                            help="PostgreSQL schema name")
+        # parser.add_argument("--dataset", help=argparse.SUPPRESS)  # do not display in help ==> conflict
+        return parser
+
+    def initialize_from_cli_args(self, args: argparse.Namespace, base_dir: str = None, error_not_found: bool = True,
+                                 default_proxies: dict = None, proxy_headers: dict = None) -> None:
+        super().initialize_from_cli_args(args, base_dir=base_dir, error_not_found=error_not_found,
+                                         default_proxies=default_proxies, proxy_headers=proxy_headers)
+        if args.schema is not None:
+            self.schema = args.schema
+
+
+class TableParamsPostgre(TableParams):  #, DatasetParamsPostgreSchema):
+    def __init__(self, source: "TableParamsPostgre" = None):
+        super().__init__(source)
+
+    def copy(self, *, dest=None):
+        if dest is None:
+            dest = TableParamsPostgre()
+        super().copy(dest=dest)
+        return dest
+
+    # DatasetParamsPostgreSchema:
+    @property
+    def schema(self) -> str:
+        return self.dataset
+    @schema.setter
+    def schema(self, value: str):
+        self.dataset = value
+
+    @staticmethod
+    def setup_cli_harvester_parser(parser: argparse.ArgumentParser = None) -> argparse.ArgumentParser:
+        parser = TableParams.setup_cli_harvester_parser(parser=parser)
+        # DatasetParamsPostgreSchema:
+        # parser = DatasetParamsPostgreSchema.setup_cli_harvester_parser(parser=parser)
+        parser.add_argument("--schema", type=str,
+                            help="PostgreSQL schema name")
+        # parser.add_argument("--dataset", help=argparse.SUPPRESS)  # do not display in help ==> conflict
+        return parser
+
+    def initialize_from_cli_args(self, args: argparse.Namespace, base_dir: str = None, error_not_found: bool = True,
+                                 default_proxies: dict = None, proxy_headers: dict = None) -> None:
+        super().initialize_from_cli_args(args, base_dir=base_dir, error_not_found=error_not_found,
+                                         default_proxies=default_proxies, proxy_headers=proxy_headers)
+        # DatasetParamsPostgreSchema:
+        if args.schema is not None:
+            self.schema = args.schema
+
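The second hunk (ckanapi_harvesters/harvesters/postgre_params.py) is essentially a naming shim: both classes expose a schema property that reads and writes the inherited dataset attribute, so the CLI can accept --schema while the rest of the code keeps speaking in CKAN terms. A stand-alone sketch of this alias-property pattern; GenericParams is an illustrative stand-in, not a class from this package:

```python
# Stand-alone illustration of the alias-property pattern used above.
# GenericParams is a hypothetical stand-in for DatasetParams/TableParams.
class GenericParams:
    def __init__(self):
        self.dataset = None  # generic CKAN-side name


class PostgreParams(GenericParams):
    @property
    def schema(self) -> str:       # PostgreSQL-flavoured alias...
        return self.dataset

    @schema.setter
    def schema(self, value: str):  # ...that writes through to the generic attribute
        self.dataset = value


params = PostgreParams()
params.schema = "public"
assert params.dataset == "public"  # both names address the same state
```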