ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ckanapi_harvesters/__init__.py +32 -10
- ckanapi_harvesters/auxiliary/__init__.py +26 -0
- ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
- ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
- ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
- ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
- ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
- ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
- ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
- ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
- ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
- ckanapi_harvesters/auxiliary/deprecated.py +82 -0
- ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
- ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
- ckanapi_harvesters/auxiliary/list_records.py +60 -0
- ckanapi_harvesters/auxiliary/login.py +163 -0
- ckanapi_harvesters/auxiliary/path.py +208 -0
- ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
- ckanapi_harvesters/auxiliary/urls.py +40 -0
- ckanapi_harvesters/builder/__init__.py +40 -0
- ckanapi_harvesters/builder/builder_aux.py +20 -0
- ckanapi_harvesters/builder/builder_ckan.py +238 -0
- ckanapi_harvesters/builder/builder_errors.py +36 -0
- ckanapi_harvesters/builder/builder_field.py +122 -0
- ckanapi_harvesters/builder/builder_package.py +9 -0
- ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
- ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
- ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
- ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
- ckanapi_harvesters/builder/builder_resource.py +589 -0
- ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
- ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
- ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
- ckanapi_harvesters/builder/builder_resource_init.py +126 -0
- ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
- ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
- ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
- ckanapi_harvesters/builder/example/__init__.py +21 -0
- ckanapi_harvesters/builder/example/builder_example.py +21 -0
- ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
- ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
- ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
- ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
- ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
- ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
- ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
- ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
- ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
- ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
- ckanapi_harvesters/builder/mapper_datastore.py +93 -0
- ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
- ckanapi_harvesters/builder/specific/__init__.py +11 -0
- ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
- ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
- ckanapi_harvesters/ckan_api/__init__.py +20 -0
- ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
- ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
- ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
- ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
- ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
- ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
- ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
- ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
- ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
- ckanapi_harvesters/harvesters/__init__.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
- ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
- ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
- ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
- ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
- ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
- ckanapi_harvesters/harvesters/harvester_init.py +30 -0
- ckanapi_harvesters/harvesters/harvester_model.py +49 -0
- ckanapi_harvesters/harvesters/harvester_params.py +323 -0
- ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
- ckanapi_harvesters/harvesters/postgre_params.py +86 -0
- ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
- ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
- ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
- ckanapi_harvesters/policies/__init__.py +20 -0
- ckanapi_harvesters/policies/data_format_policy.py +269 -0
- ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
- ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
- ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
- ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
- ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
- ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
- ckanapi_harvesters/reports/__init__.py +11 -0
- ckanapi_harvesters/reports/admin_report.py +292 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/METADATA +74 -38
- ckanapi_harvesters-0.0.2.dist-info/RECORD +105 -0
- ckanapi_harvesters/divider/__init__.py +0 -27
- ckanapi_harvesters/divider/divider.py +0 -53
- ckanapi_harvesters/divider/divider_error.py +0 -59
- ckanapi_harvesters/main.py +0 -30
- ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/WHEEL +0 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Shapefile format support
|
|
5
|
+
"""
|
|
6
|
+
from typing import Union, Dict
|
|
7
|
+
from types import SimpleNamespace
|
|
8
|
+
import io
|
|
9
|
+
from warnings import warn
|
|
10
|
+
from enum import IntEnum
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
try:
|
|
14
|
+
import geopandas as gpd
|
|
15
|
+
except ImportError:
|
|
16
|
+
gpd = SimpleNamespace(GeoDataFrame=None)
|
|
17
|
+
try:
|
|
18
|
+
import pyproj
|
|
19
|
+
except ImportError:
|
|
20
|
+
pyproj = None
|
|
21
|
+
|
|
22
|
+
from ckanapi_harvesters.auxiliary.list_records import ListRecords
|
|
23
|
+
from ckanapi_harvesters.auxiliary.ckan_model import CkanField
|
|
24
|
+
from ckanapi_harvesters.auxiliary.ckan_errors import FileFormatRequirementError, UnknownTargetCRSError
|
|
25
|
+
from ckanapi_harvesters.auxiliary.ckan_configuration import default_ckan_target_epsg
|
|
26
|
+
from ckanapi_harvesters.harvesters.file_formats.file_format_abc import FileFormatABC
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# default keyword arguments forwarded to geopandas.read_file when loading a shapefile before upload
shp_upload_read_file_kwargs = {"encoding": "utf-8"}
|
|
30
|
+
|
|
31
|
+
class DownloadedShapeFileConversion(IntEnum):
    """
    Conversion applied to a DataFrame downloaded from CKAN before it is saved by ShapeFileFormat.
    """
    # leave the geometric columns in WKB format and ignore the CRS information
    CsvWkb = 0
    # decode the WKB columns and project them back to the source EPSG of the field, when it is known
    ShapefileProjection = 2
    # decode the WKB columns without the projection step performed for ShapefileProjection
    ShapefileAsIs = 3
|
|
35
|
+
|
|
36
|
+
class ShapeFileFormat(FileFormatABC):
    """
    ESRI Shapefile support, relying on geopandas and pyproj.

    Upload: the file is read with geopandas, each geometry column is projected to the target EPSG of the
    matching CKAN field and the geometries are serialized as hexadecimal WKB.
    Download: the WKB columns are decoded back to geometries, according to DownloadedShapeFileConversion.
    """
    def __init__(self, read_file_kwargs=None) -> None:
        """
        :param read_file_kwargs: keyword arguments forwarded to geopandas.read_file.
            Defaults to a copy of shp_upload_read_file_kwargs.
        :raises FileFormatRequirementError: if geopandas or pyproj could not be imported.
        """
        if gpd.GeoDataFrame is None:
            raise FileFormatRequirementError("geopandas", "SHP")
        if pyproj is None:
            raise FileFormatRequirementError("pyproj", "SHP")
        if read_file_kwargs is None:
            # copy the defaults: editing the options of one instance must not alter the module-level dict
            read_file_kwargs = dict(shp_upload_read_file_kwargs)
        self.read_file_kwargs: dict = read_file_kwargs
        self.require_field_crs: bool = True
        self.download_conversion = DownloadedShapeFileConversion.ShapefileProjection

    @staticmethod
    def _is_geometry_data_type(data_type) -> bool:
        # True for the PostGIS type "geometry" or a parametrized "geometry(...)" type
        return isinstance(data_type, str) and (data_type == "geometry" or data_type.startswith("geometry("))

    # loading a file before upload ----------------
    def read_file(self, file_path: Union[str,io.StringIO], fields: Union[Dict[str, CkanField],None]) -> Union[pd.DataFrame, ListRecords]:
        """
        Read a shapefile and return a standard DataFrame with the geometries encoded as hexadecimal WKB.

        Target EPSG = EPSG used in CKAN, source EPSG = read from the SHP file.

        :param file_path: path (or buffer) given to geopandas.read_file.
        :param fields: CKAN fields by name. A geometry column absent from this dict is registered in it with
            the default type geometry(geometry,<default EPSG>) and a warning is emitted. If None, these
            defaults are only used locally.
        :raises UnknownTargetCRSError: if a field has no target EPSG and require_field_crs is set.
        :raises NotImplementedError: if the field type of a geometry column is not a geometry type.
        """
        if fields is None:
            # the signature allows None: fall back on defaults for every geometry column
            fields = {}
        gdf = gpd.read_file(file_path, **self.read_file_kwargs)
        geo_columns = list(gdf.select_dtypes('geometry'))
        for field_name in geo_columns:
            gdf.set_geometry(field_name, inplace=True)  # select the current column for geometry computations
            crs_source = gdf.crs
            if field_name in fields:
                field = fields[field_name]
                field_data_type = field.data_type
                epsg_target = field.internal_attrs.epsg_target
                epsg_source_from_params = field.internal_attrs.epsg_source
            else:
                # default field data type, with a generic geometry type and the default EPSG
                epsg_target = default_ckan_target_epsg
                field_data_type = f"geometry(geometry,{epsg_target})"
                fields[field_name] = CkanField(field_name, field_data_type)  # TODO: update field data type in caller? user can change data type afterwards?
                epsg_source_from_params = None
                msg = f"PostGIS geometric destination type was not specified and will not be transmitted to CKAN. Assuming default {field_data_type}."
                warn(msg)
            if not self._is_geometry_data_type(field_data_type):
                raise NotImplementedError(f"Field {field_data_type} is not implemented or not compatible with geometric representations.")
            if epsg_target is not None:
                crs_target = pyproj.CRS.from_epsg(epsg_target)
                if not crs_source == crs_target:
                    gdf.to_crs(crs_target, inplace=True)
            elif self.require_field_crs:
                raise UnknownTargetCRSError(crs_source, file_path)
            if epsg_source_from_params is not None:
                # only informative: the CRS found in the file is the one used for the projection
                crs_source_from_params = pyproj.CRS.from_epsg(epsg_source_from_params)
                if not crs_source_from_params == crs_source:
                    msg = f"EPSG in SHP file ({crs_source}) does not match given source EPSG ({crs_source_from_params}). The downloaded result will differ from original format."
                    warn(msg)
        df = gdf.to_wkb(hex=True)  # converts all geometric fields to WKB and returns a standard DataFrame object
        return df

    def read_buffer(self, buffer: io.StringIO, fields: Union[Dict[str, CkanField],None]) -> Union[pd.DataFrame, ListRecords]:
        """
        Same as read_file, the buffer being passed in place of the file path.
        """
        return self.read_file(buffer, fields=fields)

    # saving a file after download -------------
    def downloaded_df_to_gdf(self, df: pd.DataFrame, *, fields: Union[Dict[str, CkanField],None], context:str=None) -> gpd.GeoDataFrame:
        """
        Convert a downloaded DataFrame to a GeoDataFrame, decoding the WKB columns declared as geometry in fields.

        NB: target EPSG = CRS in database (required), source = option to recover original CRS.

        :param df: downloaded data.
        :param fields: CKAN fields by name. Columns without a geometry field are left untouched.
            If None, no column is decoded.
        :param context: information passed to UnknownTargetCRSError (write_file passes the file path).
        :raises UnknownTargetCRSError: if a geometry field has no target EPSG and require_field_crs is set.
        """
        gdf = gpd.GeoDataFrame(df)
        if self.download_conversion == DownloadedShapeFileConversion.CsvWkb:
            # do not look at CRS information and leave in WKB format
            return gdf
        if fields is None:
            fields = {}
        for field_name in df.columns:
            if field_name not in fields:
                continue
            field = fields[field_name]
            if not self._is_geometry_data_type(field.data_type):
                continue
            epsg_target = field.internal_attrs.epsg_target
            epsg_source = field.internal_attrs.epsg_source
            if epsg_target is None:
                # test the EPSG code before building the CRS: CRS.from_epsg never returns None
                if self.require_field_crs:
                    raise UnknownTargetCRSError(epsg_source, context)
                crs_target = None
            else:
                crs_target = pyproj.CRS.from_epsg(epsg_target)
            gdf[field_name] = gpd.geoseries.from_wkb(df[field_name], crs=crs_target)
            if (self.download_conversion == DownloadedShapeFileConversion.ShapefileProjection
                    and epsg_source is not None and crs_target is not None
                    and not epsg_target == epsg_source):
                crs_source = pyproj.CRS.from_epsg(epsg_source)
                if not crs_source == crs_target:
                    # to_crs operates on the active geometry: select the current column first, as in read_file
                    gdf.set_geometry(field_name, inplace=True)
                    gdf.to_crs(crs_source, inplace=True)
        return gdf

    def write_file(self, df: pd.DataFrame, file_path: str, fields: Union[Dict[str, CkanField],None]) -> None:
        """
        Save a downloaded DataFrame as a shapefile.
        This writes the shp file and auxiliary shx, dbf, cpg, prj files.
        """
        gdf = self.downloaded_df_to_gdf(df, fields=fields, context=file_path)
        gdf.to_file(file_path, driver="ESRI Shapefile")

    def write_in_memory(self, df: pd.DataFrame, fields: Union[Dict[str, CkanField],None]) -> bytes:
        """
        Return the shapefile as bytes.

        A shapefile is made of several binary files, which cannot be written to a single text buffer.
        The files are written to a temporary directory and the returned bytes are a ZIP archive
        containing all of them (data.shp and its auxiliary files).
        """
        import os
        import tempfile
        import zipfile

        gdf = self.downloaded_df_to_gdf(df, fields=fields)
        archive = io.BytesIO()
        with tempfile.TemporaryDirectory() as tmp_dir:
            gdf.to_file(os.path.join(tmp_dir, "data.shp"), driver="ESRI Shapefile")
            with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zip_file:
                for file_name in sorted(os.listdir(tmp_dir)):
                    zip_file.write(os.path.join(tmp_dir, file_name), arcname=file_name)
        return archive.getvalue()

    def copy(self):
        """
        Return a new ShapeFileFormat with the same options. The read_file_kwargs dict is duplicated
        so that the copy can be modified independently.
        """
        dest = ShapeFileFormat(dict(self.read_file_kwargs))
        dest.download_conversion = self.download_conversion
        dest.require_field_crs = self.require_field_crs
        return dest
|
|
129
|
+
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Harvester base class
|
|
5
|
+
"""
|
|
6
|
+
from typing import Union, List, Any, Callable
|
|
7
|
+
from collections import OrderedDict
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage
|
|
13
|
+
from ckanapi_harvesters.harvesters.harvester_model import DatasetMetadata, TableMetadata
|
|
14
|
+
from ckanapi_harvesters.harvesters.harvester_params import DatabaseParams, DatasetParams, TableParams
|
|
15
|
+
from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class HarvesterConnectABC(ABC):
    """
    Abstract interface for the connection handling of a harvester.
    All methods except the destructor must be implemented by subclasses.
    """
    def __del__(self):
        # disconnect when the object is destroyed
        self.disconnect()

    @abstractmethod
    def connect(self, *, cancel_if_connected: bool = True) -> Any:
        """
        Connect to the data source.

        :param cancel_if_connected: presumably skips the connection if one already exists (to be confirmed in implementations)
        """
        raise NotImplementedError()

    @abstractmethod
    def _finalize_connection(self):
        """
        Hook for the implementation-specific steps of the connection.
        """
        raise NotImplementedError()

    @abstractmethod
    def is_connected(self) -> bool:
        """
        Return the state of the connection.
        """
        raise NotImplementedError()

    @abstractmethod
    def check_connection(self, *, new_connection: bool = False, raise_error: bool = False) -> Union[None, ContextErrorLevelMessage]:
        """
        Test the connection. Returns None or a ContextErrorLevelMessage.
        """
        raise NotImplementedError()

    @abstractmethod
    def disconnect(self) -> None:
        """
        Disconnect from the data source. Called by the destructor.
        """
        raise NotImplementedError()

    @abstractmethod
    def update_from_ckan(self, ckan):
        """
        Update the harvester from a ckan object.
        """
        raise NotImplementedError()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class DatabaseHarvesterABC(HarvesterConnectABC, ABC):
    """
    Abstract harvester at the database level: holds the parameters and gives access to the datasets.
    """
    def __init__(self, params: DatabaseParams = None):
        """
        :param params: harvester parameters. A default DatabaseParams is created if None.
        """
        self.params: DatabaseParams = DatabaseParams() if params is None else params

    @abstractmethod
    def copy(self, *, dest=None):
        """
        Copy the parameters of this object into dest and return dest.
        The destination object must be provided by the caller (subclass implementation).
        """
        params_copy = self.params.copy()
        dest.params = params_copy
        return dest

    @staticmethod
    @abstractmethod
    def init_from_options_string(options_string: str, *, base_dir: str = None) -> "DatabaseHarvesterABC":
        """
        Create a harvester from an options string.
        """
        raise NotImplementedError()

    def _finalize_connection(self):
        # nothing to do at this level
        pass

    ## query methods interface ---------------
    @abstractmethod
    def get_dataset_harvester(self, dataset_name: str) -> "DatasetHarvesterABC":
        """
        Return the harvester for the dataset of the given name.
        """
        raise NotImplementedError()

    @abstractmethod
    def list_datasets(self, return_metadata: bool = True) -> Union[List[str], OrderedDict[str, DatasetMetadata]]:
        """
        List the datasets, either as names or as an ordered mapping to their metadata.
        """
        raise NotImplementedError()

    def update_from_ckan(self, ckan):
        """
        Forward the ckan object to the parameters.
        """
        self.params._update_from_ckan(ckan)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class DatasetHarvesterABC(DatabaseHarvesterABC, ABC):
    """
    Abstract harvester at the dataset level: adds the dataset metadata and the access to the tables.
    """
    def __init__(self, params: DatasetParams = None):
        """
        :param params: harvester parameters. A default DatasetParams is created if None.
        """
        dataset_params = DatasetParams() if params is None else params
        super().__init__(dataset_params)
        self.params: DatasetParams = dataset_params
        # filled by query_dataset_metadata
        self.dataset_metadata: Union[DatasetMetadata, None] = None

    def __del__(self):
        # disconnect when the object is destroyed
        self.disconnect()

    @abstractmethod
    def _finalize_connection(self):
        """
        Hook for the implementation-specific steps of the connection.
        """
        raise NotImplementedError()

    @abstractmethod
    def copy(self, *, dest=None):
        """
        Copy the parameters and the dataset metadata of this object into dest and return dest.
        """
        super().copy(dest=dest)
        if self.dataset_metadata is None:
            dest.dataset_metadata = None
        else:
            dest.dataset_metadata = self.dataset_metadata.copy()
        return dest

    @staticmethod
    @abstractmethod
    def init_from_options_string(options_string: str, *, base_dir: str = None) -> "DatasetHarvesterABC":
        """
        Create a harvester from an options string.
        """
        raise NotImplementedError()

    ## metadata interface ---------------
    @abstractmethod
    def query_dataset_metadata(self, cancel_if_present: bool = True) -> DatasetMetadata:
        """
        Connect, then return the dataset metadata.
        Base implementation to be completed by subclasses.

        :param cancel_if_present: if True, metadata already stored in the object is returned as is
        """
        self.connect()
        if not cancel_if_present or self.dataset_metadata is None:
            self.dataset_metadata = DatasetMetadata()
            # user needs to complete here
            self.dataset_metadata.tables = self.list_tables(return_metadata=True)
        return self.dataset_metadata

    def clean_dataset_metadata(self) -> DatasetMetadata:
        """
        Return a copy of the result of query_dataset_metadata.
        """
        metadata = self.query_dataset_metadata()
        return metadata.copy()

    ## query methods interface ---------------
    @abstractmethod
    def get_table_harvester(self, table_name: str) -> "TableHarvesterABC":
        """
        Return the harvester for the table of the given name.
        """
        raise NotImplementedError()

    @abstractmethod
    def list_tables(self, return_metadata: bool = True) -> Union[List[str], OrderedDict[str, TableMetadata]]:
        """
        List the tables, either as names or as an ordered mapping to their metadata.
        """
        raise NotImplementedError()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class TableHarvesterABC(DatasetHarvesterABC, ABC):
    """
    Abstract harvester at the table level: adds the table metadata and the data queries.
    """
    # class-level defaults
    _default_upload_fun: Union[Callable[[Any], pd.DataFrame], None] = None
    _default_primary_key: Union[List[str], None] = None

    def __init__(self, params: TableParams = None):
        """
        :param params: harvester parameters. A default TableParams is created if None.
        """
        table_params = TableParams() if params is None else params
        super().__init__(table_params)
        self.params: TableParams = table_params
        # filled by query_table_metadata
        self.table_metadata: Union[TableMetadata, None] = None

    def __del__(self):
        # disconnect when the object is destroyed
        self.disconnect()

    @abstractmethod
    def copy(self, *, dest=None):
        """
        Copy the parameters, the dataset metadata and the table metadata of this object into dest and return dest.
        """
        super().copy(dest=dest)
        if self.table_metadata is None:
            dest.table_metadata = None
        else:
            dest.table_metadata = self.table_metadata.copy()
        return dest

    @staticmethod
    @abstractmethod
    def init_from_options_string(options_string: str, *, base_dir: str = None, file_url_attr: str = None) -> "TableHarvesterABC":
        """
        Create a harvester from an options string.
        """
        raise NotImplementedError()

    ## metadata interface ---------------
    @abstractmethod
    def query_table_metadata(self, cancel_if_present: bool = True) -> TableMetadata:
        """
        Connect, then return the table metadata.
        Base implementation to be completed by subclasses.

        :param cancel_if_present: if True, metadata already stored in the object is returned as is
        """
        self.connect()
        if not cancel_if_present or self.table_metadata is None:
            self.table_metadata = TableMetadata()
            # user needs to complete here
        return self.table_metadata

    def clean_table_metadata(self) -> TableMetadata:
        """
        Return a copy of the result of query_table_metadata.
        """
        metadata = self.query_table_metadata()
        return metadata.copy()

    @classmethod
    def get_default_df_upload_fun(cls) -> Union[Callable[[Any], pd.DataFrame], None]:
        """
        Return the class attribute _default_upload_fun.
        """
        return cls._default_upload_fun

    def get_default_data_cleaner(self) -> Union[CkanDataCleanerABC, None]:
        """
        No default data cleaner at this level.
        """
        return None

    @abstractmethod
    def get_default_primary_key(self) -> List[str]:
        """
        Base implementation: empty primary key.
        """
        return []

    ## query methods interface ---------------
    @abstractmethod
    def list_queries(self, *, new_connection: bool = False) -> List[Any]:
        """
        Base implementation: connects (a connection is requested again if new_connection is set),
        the list of queries must be produced by subclasses.
        """
        self.connect(cancel_if_connected=not new_connection)
        raise NotImplementedError()

    @abstractmethod
    def query_data(self, query: Any) -> Union[List[dict], pd.DataFrame]:
        """
        Run one query and return its data as a list of dicts or a DataFrame.
        """
        raise NotImplementedError()
|
|
189
|
+
|
|
190
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Errors specific to harvesting data
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from ckanapi_harvesters.auxiliary.ckan_errors import RequirementError
|
|
8
|
+
from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_errors import CleanError, CleanerRequirementError # alias
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class HarvestMethodRequiredError(Exception):
    """
    Raised when the --harvester argument is missing.
    """
    def __init__(self):
        message = "The harvesting method argument --harvester is required."
        super().__init__(message)
|
|
14
|
+
|
|
15
|
+
class HarvesterArgumentError(Exception):
    """
    Base class for the errors on harvester arguments.
    """
|
|
17
|
+
|
|
18
|
+
class HarvesterArgumentRequiredError(HarvesterArgumentError):
    """
    Raised when an argument required by a harvest method is missing.
    """
    def __init__(self, argument: str, harvest_method: str, help: str = None):
        """
        :param argument: name of the missing argument
        :param harvest_method: harvest method requiring the argument
        :param help: optional text appended to the message
        """
        suffix = "" if help is None else help
        super().__init__(f"The argument {argument} is required for harvest method {harvest_method}. " + suffix)
|
|
22
|
+
|
|
23
|
+
class HarvesterRequirementError(RequirementError):
    """
    Raised when a package required by a harvester is not available.
    """
    def __init__(self, requirement: str, harvest_method: str):
        """
        :param requirement: name of the required package
        :param harvest_method: harvest method requiring the package
        """
        message = f"The package {requirement} is required for this harvester ({harvest_method})."
        super().__init__(message)
|
|
26
|
+
|
|
27
|
+
class ResourceNotFoundError(Exception):
    """
    Raised when a resource could not be found on the host.
    """
    def __init__(self, resource_type: str, table_name: str, host: str):
        """
        :param resource_type: kind of resource, placed at the beginning of the message
        :param table_name: name of the resource
        :param host: host on which the resource was looked for
        """
        message = f"{resource_type} {table_name} was not found on host ({host})."
        super().__init__(message)
|
|
30
|
+
|
|
31
|
+
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Harvester initialization from the options_string arguments
|
|
5
|
+
"""
|
|
6
|
+
from ckanapi_harvesters.harvesters.harvester_abc import TableHarvesterABC, DatasetHarvesterABC
|
|
7
|
+
from ckanapi_harvesters.harvesters.harvester_params import TableParams
|
|
8
|
+
from ckanapi_harvesters.harvesters.harvester_params import DatasetParams
|
|
9
|
+
from ckanapi_harvesters.harvesters.postgre_harvester import TableHarvesterPostgre, DatasetHarvesterPostgre
|
|
10
|
+
from ckanapi_harvesters.harvesters.pymongo_harvester import TableHarvesterMongoCollection, DatasetHarvesterMongoDatabase
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def init_table_harvester_from_options_string(options_string:str, *, file_url_attr:str, base_dir:str=None) -> TableHarvesterABC:
    """
    Create the table harvester matching the harvest method found in the options string.

    :param options_string: harvester options, parsed by TableParams.parse_harvest_method
    :param file_url_attr: forwarded to the init_from_options_string of the selected harvester
    :param base_dir: forwarded to the init_from_options_string of the selected harvester
    :raises NotImplementedError: if the harvest method is neither "pymongo" nor "postgre"
    """
    harvest_method = TableParams.parse_harvest_method(options_string)
    if harvest_method == "pymongo":
        harvester_class = TableHarvesterMongoCollection
    elif harvest_method == "postgre":
        harvester_class = TableHarvesterPostgre
    else:
        raise NotImplementedError(f"harvester method {harvest_method} not implemented")
    return harvester_class.init_from_options_string(options_string, file_url_attr=file_url_attr, base_dir=base_dir)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def init_dataset_harvester_from_options_string(options_string:str, *, base_dir:str=None) -> DatasetHarvesterABC:
    """
    Create the dataset harvester matching the harvest method found in the options string.

    :param options_string: harvester options, parsed by DatasetParams.parse_harvest_method
    :param base_dir: forwarded to the init_from_options_string of the selected harvester
    :raises NotImplementedError: if the harvest method is neither "pymongo" nor "postgre"
    """
    harvest_method = DatasetParams.parse_harvest_method(options_string)
    if harvest_method == "pymongo":
        harvester_class = DatasetHarvesterMongoDatabase
    elif harvest_method == "postgre":
        harvester_class = DatasetHarvesterPostgre
    else:
        raise NotImplementedError(f"harvester method {harvest_method} not implemented")
    return harvester_class.init_from_options_string(options_string, base_dir=base_dir)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Harvester base class
|
|
5
|
+
"""
|
|
6
|
+
from typing import Union, List
|
|
7
|
+
from collections import OrderedDict
|
|
8
|
+
import copy
|
|
9
|
+
|
|
10
|
+
from ckanapi_harvesters.auxiliary.ckan_auxiliary import CkanFieldInternalAttrs
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FieldMetadata:
    """
    Metadata of one field, as collected by a harvester.
    """
    def __init__(self):
        # identification and documentation
        self.name: str = ""
        self.description: Union[str, None] = None
        self.label: Union[str, None] = None
        # type and constraints (None = not specified)
        self.data_type: Union[str, None] = None
        self.is_index: Union[bool, None] = None
        self.uniquekey: Union[bool, None] = None
        self.notnull: Union[bool, None] = None
        # additional attributes
        self.internal_attrs: CkanFieldInternalAttrs = CkanFieldInternalAttrs()
        self.harvester_attrs: dict = {}

    def copy(self):
        """
        Return a deep copy of this object.
        """
        duplicate = copy.deepcopy(self)
        return duplicate
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TableMetadata:
    """
    Metadata of one table, as collected by a harvester.
    """
    def __init__(self):
        # identification and documentation
        self.name: str = ""
        self.description: Union[str, None] = None
        # keys and indexes (None = not specified)
        self.primary_key: Union[List[str], None] = None
        self.indexes: Union[List[str], None] = None
        self.unique_keys: Union[List[str], None] = None
        # fields of the table by name
        self.fields: Union[OrderedDict[str, FieldMetadata], None] = None

    def copy(self):
        """
        Return a deep copy of this object.
        """
        duplicate = copy.deepcopy(self)
        return duplicate
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DatasetMetadata:
    """
    Metadata of one dataset, as collected by a harvester.
    """
    def __init__(self):
        # identification and documentation
        self.name: str = ""
        self.description: Union[str, None] = None
        # tables of the dataset by name
        self.tables: Union[OrderedDict[str, TableMetadata], None] = None

    def copy(self):
        """
        Return a deep copy of this object.
        """
        duplicate = copy.deepcopy(self)
        return duplicate
|