ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ckanapi_harvesters/__init__.py +32 -10
- ckanapi_harvesters/auxiliary/__init__.py +26 -0
- ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
- ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
- ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
- ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
- ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
- ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
- ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
- ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
- ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
- ckanapi_harvesters/auxiliary/deprecated.py +82 -0
- ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
- ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
- ckanapi_harvesters/auxiliary/list_records.py +60 -0
- ckanapi_harvesters/auxiliary/login.py +163 -0
- ckanapi_harvesters/auxiliary/path.py +208 -0
- ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
- ckanapi_harvesters/auxiliary/urls.py +40 -0
- ckanapi_harvesters/builder/__init__.py +40 -0
- ckanapi_harvesters/builder/builder_aux.py +20 -0
- ckanapi_harvesters/builder/builder_ckan.py +238 -0
- ckanapi_harvesters/builder/builder_errors.py +36 -0
- ckanapi_harvesters/builder/builder_field.py +122 -0
- ckanapi_harvesters/builder/builder_package.py +9 -0
- ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
- ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
- ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
- ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
- ckanapi_harvesters/builder/builder_resource.py +589 -0
- ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
- ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
- ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
- ckanapi_harvesters/builder/builder_resource_init.py +126 -0
- ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
- ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
- ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
- ckanapi_harvesters/builder/example/__init__.py +21 -0
- ckanapi_harvesters/builder/example/builder_example.py +21 -0
- ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
- ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
- ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
- ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
- ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
- ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
- ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
- ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
- ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
- ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
- ckanapi_harvesters/builder/mapper_datastore.py +93 -0
- ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
- ckanapi_harvesters/builder/specific/__init__.py +11 -0
- ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
- ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
- ckanapi_harvesters/ckan_api/__init__.py +20 -0
- ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
- ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
- ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
- ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
- ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
- ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
- ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
- ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
- ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
- ckanapi_harvesters/harvesters/__init__.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
- ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
- ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
- ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
- ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
- ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
- ckanapi_harvesters/harvesters/harvester_init.py +30 -0
- ckanapi_harvesters/harvesters/harvester_model.py +49 -0
- ckanapi_harvesters/harvesters/harvester_params.py +323 -0
- ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
- ckanapi_harvesters/harvesters/postgre_params.py +86 -0
- ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
- ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
- ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
- ckanapi_harvesters/policies/__init__.py +20 -0
- ckanapi_harvesters/policies/data_format_policy.py +269 -0
- ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
- ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
- ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
- ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
- ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
- ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
- ckanapi_harvesters/reports/__init__.py +11 -0
- ckanapi_harvesters/reports/admin_report.py +292 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
- ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
- ckanapi_harvesters/divider/__init__.py +0 -27
- ckanapi_harvesters/divider/divider.py +0 -53
- ckanapi_harvesters/divider/divider_error.py +0 -59
- ckanapi_harvesters/main.py +0 -30
- ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Harvest from a mongo database using pymongo
|
|
5
|
+
"""
|
|
6
|
+
from typing import Union, List, Any, Dict, Set
|
|
7
|
+
from types import SimpleNamespace
|
|
8
|
+
from collections import OrderedDict
|
|
9
|
+
from warnings import warn
|
|
10
|
+
import copy
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
import bson
|
|
17
|
+
except ImportError:
|
|
18
|
+
bson = SimpleNamespace(ObjectId=None, DBRef=None)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
from ckanapi_harvesters.auxiliary.ckan_model import CkanField
|
|
22
|
+
from ckanapi_harvesters.auxiliary.list_records import ListRecords, records_to_df
|
|
23
|
+
from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_errors import CleanerRequirementError
|
|
24
|
+
from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_upload import CkanDataCleanerUpload, _pd_series_type_detect
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# --- Module-level tuning constants for the MongoDB data cleaner ---
# keep a trace of the Mongo "_id" index when cleaning table metadata (rename instead of drop)
mongodb_keep_id_column_trace:bool = True
# NOTE(review): not referenced in this module — presumably consumed by a harvester; verify before removing
mongodb_keep_class_column_trace:bool = True
# source column name used by MongoDB for document identifiers
mongodb_id_column:str = "_id"
# target column name the "_id" column is renamed to on upload
mongodb_id_new_column:str = "ObjectId"
mongodb_id_datatype_numeric:bool = False # option to store ids as a numeric datatype - there is no numeric datatype which corresponds
mongodb_id_alt_type_numeric:str = "int12" # the ObjectIds are 12 byte integers (int96) - there is no such data type in Postgre (oid?)
mongodb_dbref_extract_new_id_column_max_level:int = 1 # option to create a new column if a DBRef is found in a json field
mongodb_dbref_alt_type:str = "json" # used if not resumed in one column
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def pymongo_default_df_conversion(documents: List[dict], **kwargs) -> Union[pd.DataFrame, ListRecords]:
    """
    Default conversion of raw pymongo documents into an uploadable container.

    :param documents: list of MongoDB documents (dicts) as returned by pymongo
    :param kwargs: ignored; accepted for interface compatibility with other converters
    :return: the documents wrapped in a ListRecords container
    """
    # Column renaming ("_id" -> ObjectId) is handled by the data cleaner,
    # not here; this function only wraps the documents.
    return ListRecords(documents)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class BrokenMongoRefError(Exception):
    """Raised when the DBRef values found under a single path point to more
    than one collection or database, so they cannot be resumed in one column."""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class MongoDataCleanerUpload(CkanDataCleanerUpload):
    """
    Data cleaner operations specific to MongoDB objects.

    Converts bson.ObjectId and bson.DBRef values into CKAN-compatible
    representations, renames the Mongo "_id" column, and tracks which
    collections/databases the DBRefs of each path point to.
    """
    def __init__(self):
        """Initialize the cleaner; raises CleanerRequirementError if bson is not importable."""
        super().__init__()
        # change default parameters
        self.param_field_subs[mongodb_id_column] = mongodb_id_new_column
        self.param_field_primary_key = [mongodb_id_new_column]
        self.param_apply_field_changes = True
        # specific options
        self.param_mongodb_dbref_as_one_column:bool = True # option to extract only the ObjectId referenced by the DBRefs
        # per-path sets of collection/database names referenced by DBRefs
        self.collection_refs:Dict[str,Set[str]] = {}
        self.database_refs:Dict[str,Set[str]] = {}
        # paths whose DBRefs point to more than one collection/database (filled by _extra_checks)
        self.broken_collection_refs:List[str] = []
        self.broken_database_refs:List[str] = []
        if bson.DBRef is None:
            # module-level bson import failed; the SimpleNamespace fallback set DBRef to None
            raise CleanerRequirementError("bson", "DBRef, ObjectId")

    def clear_outputs_new_dataframe(self):
        """Reset per-dataframe outputs; keeps the accumulated ref maps."""
        super().clear_outputs_new_dataframe()
        self.broken_collection_refs:List[str] = []
        self.broken_database_refs:List[str] = []

    def clear_all_outputs(self):
        """Reset all accumulated outputs, including the DBRef collection/database maps."""
        super().clear_all_outputs()
        self.collection_refs:Dict[str,Set[str]] = {}
        self.database_refs:Dict[str,Set[str]] = {}

    def copy(self, dest=None) -> "MongoDataCleanerUpload":
        """Deep-copy this cleaner's Mongo-specific state into dest (a new instance if None)."""
        if dest is None:
            dest = MongoDataCleanerUpload()
        super().copy(dest=dest)
        dest.param_mongodb_dbref_as_one_column = self.param_mongodb_dbref_as_one_column
        dest.collection_refs = copy.deepcopy(self.collection_refs)
        dest.database_refs = copy.deepcopy(self.database_refs)
        return dest

    def _detect_standard_field_bypass(self, field_name: str, values: Union[Any, pd.Series]) -> Union[CkanField,None]:
        """Return a CkanField for columns holding DBRef/ObjectId values, or None to fall through."""
        if _pd_series_type_detect(values, bson.DBRef):
            if self.param_mongodb_dbref_as_one_column:
                # only the referenced id is kept, so the column becomes text (or the numeric alt type)
                return CkanField(field_name, mongodb_id_alt_type_numeric if mongodb_id_datatype_numeric else "text")
            else:
                # the full ref (id + collection + database) is kept as a json column
                return CkanField(field_name, mongodb_dbref_alt_type)
        elif _pd_series_type_detect(values, bson.ObjectId):
            return CkanField(field_name, mongodb_id_alt_type_numeric if mongodb_id_datatype_numeric else "text")
        return None

    def _replace_non_standard_subvalue(self, subvalue:Any, field:CkanField, path:str, level:int,
                                       *, field_data_type:str) -> Any:
        """
        Replace a nested non-standard value.

        ObjectId -> hex string (or int if mongodb_id_datatype_numeric).
        DBRef    -> the referenced id alone (one-column mode) or a dict with
                    id/collection/database; may register a new extracted column.
        Anything else is delegated to the parent class.

        :param subvalue: the value found at `path`
        :param field: top-level field the value belongs to (may be None)
        :param path: dotted path of the value inside the document
        :param level: nesting depth (0 = top-level value)
        """
        field_name = field.name if field is not None else None
        if isinstance(subvalue, bson.ObjectId):
            if mongodb_id_datatype_numeric:
                # ObjectId hex representation parsed as an integer
                new_subvalue = int(str(subvalue), 16)
                # new_subvalue = str(subvalue)
            else:
                new_subvalue = str(subvalue)
            if level == 0:
                # top-level id columns are good index candidates
                self.field_suggested_index.add(field_name)
            return new_subvalue
        elif isinstance(subvalue, bson.DBRef):
            id_field = path.replace(".", "_")
            # decide where (if anywhere) an extracted id column should live
            if level == 0 and self.param_mongodb_dbref_as_one_column:
                id_path = None  # the value itself becomes the id: no extra column needed
            elif self.param_mongodb_dbref_as_one_column: # and level > 0
                id_path = path
            elif level == 0 and not self.param_mongodb_dbref_as_one_column:
                id_path = path + "." + mongodb_id_new_column
                id_field = id_field + "_id"
            else:
                id_path = path + "." + mongodb_id_new_column
            if (level <= mongodb_dbref_extract_new_id_column_max_level and
                    id_path is not None and id_path not in self.field_subs_path.keys()):
                self._add_field_from_path(id_path,
                                          data_type=mongodb_id_alt_type_numeric if mongodb_id_datatype_numeric else "text",
                                          new_field_name=id_field,
                                          notes=f"Column extracted from {id_path}")
            if mongodb_id_datatype_numeric:
                id_value = int(str(subvalue.id), 16)
                # id_value = str(subvalue.id)
            else:
                id_value = str(subvalue.id)
            if self.param_mongodb_dbref_as_one_column:
                # record every collection/database this path has referenced;
                # _extra_checks later verifies each path targets exactly one
                if path in self.collection_refs.keys():
                    self.collection_refs[path].add(str(subvalue.collection))
                    self.database_refs[path].add(str(subvalue.database))
                else:
                    self.collection_refs[path] = {str(subvalue.collection)}
                    self.database_refs[path] = {str(subvalue.database)}
                new_subvalue = id_value
            else:
                new_subvalue = {mongodb_id_new_column: id_value,
                                "collection": subvalue.collection,
                                "database": subvalue.database,
                                }
                # NOTE(review): this registration is placed in the dict branch because
                # new_subvalue is only subscriptable here — confirm against upstream history
                if id_path in self.field_subs_path.keys():
                    self._new_columns_in_row[id_path] = new_subvalue[mongodb_id_new_column]
            return new_subvalue
        elif level == 0:
            return super()._replace_non_standard_value(subvalue, field, field_data_type=field_data_type)
        else:
            return super()._replace_non_standard_subvalue(subvalue, field, path, level, field_data_type=field_data_type)

    def _replace_non_standard_value(self, value:Any, field:CkanField,
                                    *, field_data_type:str) -> Any:
        """Top-level entry point: same as _replace_non_standard_subvalue at level 0."""
        field_name = field.name if field is not None else None
        return self._replace_non_standard_subvalue(value, field, path=field_name,
                                                   level=0, field_data_type=field_data_type)

    def _extra_checks(self, records: Union[List[dict], pd.DataFrame], fields:Union[OrderedDict[str, CkanField], None]) -> None:
        """
        Verify that each DBRef path targets a single collection and database.

        Raises BrokenMongoRefError when refs are ambiguous and either
        param_raise_error is set or one-column mode is active (where the
        collection information has been discarded); otherwise only warns.
        """
        self.broken_collection_refs = [path for path, refs in self.collection_refs.items() if len(refs) > 1]
        self.broken_database_refs = [path for path, refs in self.database_refs.items() if len(refs) > 1]
        if len(self.broken_collection_refs) > 0 or len(self.broken_database_refs) > 0:
            broken_refs = set(self.broken_collection_refs).union(set(self.broken_database_refs))
            msg = f"DBRefs do not point to an unique collection: {', '.join(broken_refs)}"
            if self.param_raise_error or self.param_mongodb_dbref_as_one_column:
                raise BrokenMongoRefError(msg)
            else:
                warn(msg)
|
|
170
|
+
|
|
171
|
+
def pymongo_default_data_cleaner() -> MongoDataCleanerUpload:
    """Build the default data cleaner used by the pymongo harvesters."""
    cleaner = MongoDataCleanerUpload()
    return cleaner
|
|
173
|
+
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Harvest from a mongo database using pymongo
|
|
5
|
+
"""
|
|
6
|
+
from typing import Union, List, Any, Dict
|
|
7
|
+
from types import SimpleNamespace
|
|
8
|
+
from collections import OrderedDict
|
|
9
|
+
import json
|
|
10
|
+
import argparse
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
import pymongo
|
|
15
|
+
import pymongo.client_session
|
|
16
|
+
import pymongo.database
|
|
17
|
+
except ImportError:
|
|
18
|
+
pymongo = SimpleNamespace(MongoClient=None, client_session=SimpleNamespace(ClientSession=None),
|
|
19
|
+
database=SimpleNamespace(Database=None), collection=SimpleNamespace(Collection=None))
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
from ckanapi_harvesters.harvesters.harvester_errors import (HarvesterRequirementError, HarvesterArgumentRequiredError, ResourceNotFoundError)
|
|
23
|
+
from ckanapi_harvesters.harvesters.harvester_abc import TableHarvesterABC, DatasetHarvesterABC, DatabaseHarvesterABC
|
|
24
|
+
from ckanapi_harvesters.harvesters.harvester_model import TableMetadata, DatasetMetadata
|
|
25
|
+
from ckanapi_harvesters.harvesters.harvester_params import DatasetParams, DatabaseParams
|
|
26
|
+
from ckanapi_harvesters.harvesters.pymongo_data_cleaner import pymongo_default_data_cleaner, pymongo_default_df_conversion
|
|
27
|
+
from ckanapi_harvesters.harvesters.pymongo_data_cleaner import mongodb_keep_id_column_trace, mongodb_id_new_column
|
|
28
|
+
from ckanapi_harvesters.harvesters.pymongo_params import TableParamsMongoCollection
|
|
29
|
+
from ckanapi_harvesters.auxiliary.urls import url_join
|
|
30
|
+
from ckanapi_harvesters.auxiliary.ckan_auxiliary import ssl_arguments_decompose, assert_or_raise
|
|
31
|
+
from ckanapi_harvesters.auxiliary.ckan_errors import UrlError
|
|
32
|
+
from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
|
|
33
|
+
from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# internal MongoDB collections that must never be listed/harvested as tables
mongodb_excluded_collections = {"system.profile"}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class DatabaseHarvesterMongoServer(DatabaseHarvesterABC):
    """
    This class manages the connection to a MongoDB server.
    It can list datasets (MongoDB databases) but this call could lead to an error
    (listing databases requires admin rights on most deployments).
    """
    def __init__(self, params:DatabaseParams=None):
        """
        :param params: connection parameters; must provide auth_url or host/port
        :raises HarvesterRequirementError: if pymongo is not installed
        :raises HarvesterArgumentRequiredError: if no server address is configured
        """
        super().__init__(params)
        if pymongo.MongoClient is None:
            # module-level pymongo import failed; the SimpleNamespace fallback set MongoClient to None
            raise HarvesterRequirementError("pymongo", "pymongo")
        self.params.harvest_method = "Pymongo"
        self.mongo_client: Union[pymongo.MongoClient,None] = None
        self.mongo_session: Union[pymongo.client_session.ClientSession,None] = None
        if self.params.auth_url is None and self.params.port is None and self.params.host is None:
            raise HarvesterArgumentRequiredError("auth-url", "pymongo", "This argument defines the url used to authenticate.")

    @staticmethod
    def init_from_options_string(options_string:str, base_dir:str=None) -> "DatabaseHarvesterMongoServer":
        """Alternate constructor: parse a CLI-style options string into DatabaseParams."""
        params = DatabaseParams()
        params.parse_options_string(options_string, base_dir=base_dir)
        return DatabaseHarvesterMongoServer(params)

    def copy(self, *, dest=None):
        """Copy this harvester's state into dest (a new instance if None)."""
        if dest is None:
            dest = DatabaseHarvesterMongoServer()
        return super().copy(dest=dest)

    def connect(self, *, cancel_if_connected:bool=True) -> Any:
        """
        Open (or reuse) the MongoClient and start a client session.

        :param cancel_if_connected: when True and already connected, reuse the
            existing client; when False, tear down and reconnect.
        :return: the connected pymongo.MongoClient
        :raises UrlError: if no URL/host could be derived from the parameters
        """
        if cancel_if_connected and self.mongo_client is not None:
            return self.mongo_client
        else:
            if self.mongo_client is not None:
                # forced reconnection: close session and client first
                self.mongo_session.end_session()
                self.mongo_client.close()
                self.mongo_session = None
                self.mongo_client = None
            ssl, ssl_certfile = ssl_arguments_decompose(self.params.verify_ca)
            # derive the authentication URL from url/host/port when not given explicitly
            auth_url = self.params.auth_url
            if auth_url is None:
                if self.params.url is not None:
                    auth_url = self.params.url
                elif self.params.host is not None:
                    auth_url = f"mongodb://{self.params.host}"
                    if self.params.port is not None:
                        auth_url += f":{self.params.port}"
                else:
                    raise UrlError("No Mongo URL provided")
            # NOTE(review): suffix handling assumed to apply to explicit auth_url too —
            # original indentation was ambiguous; confirm against upstream
            if self.params.auth_url_suffix is not None:
                auth_url = url_join(auth_url, self.params.auth_url_suffix)
            self.params.auth_url = auth_url
            self.mongo_client = pymongo.MongoClient(auth_url, username=self.params.login.username, password=self.params.login.password,
                                                    ssl=ssl, tlscafile=ssl_certfile,
                                                    timeoutMS=self.params.timeout*1000.0 if self.params.timeout is not None else None)
            self.mongo_session = self.mongo_client.start_session()
            if self.params.host is None and self.params.port is None:
                # complete with host and port parsed by MongoClient
                mongo_address = self.mongo_client.address
                if mongo_address is not None:
                    self.params.host, self.params.port = mongo_address
            return self.mongo_client

    def is_connected(self) -> bool:
        """Whether a MongoClient is currently held."""
        return self.mongo_client is not None

    def disconnect(self) -> None:
        """End the session and close the client; safe to call when not connected."""
        if self.mongo_client is not None:
            if self.mongo_session is not None:
                self.mongo_session.end_session()
            self.mongo_client.close()
            self.mongo_client = None
            self.mongo_session = None

    def check_connection(self, *, new_connection:bool=False, raise_error:bool=False) -> Union[None, ContextErrorLevelMessage]:
        """
        Try to connect; return None on success, an error message object on failure
        (or re-raise when raise_error is True).
        """
        try:
            self.connect(cancel_if_connected=not new_connection)
            # the following line requires specific admin rights (unexpected)
            # remote_collections = self.mongo_client.list_database_names(self.mongo_session) # this tests the database connection
        except Exception as e:
            if raise_error:
                raise e from e
            else:
                return ContextErrorLevelMessage("Mongo Harvester", ErrorLevel.Error, f"Failed to connect to {self.params.auth_url}: {e}")

    def get_dataset_harvester(self, dataset_name: str) -> "DatasetHarvesterMongoDatabase":
        """Build a dataset-level harvester for one MongoDB database, sharing this connection."""
        params_dataset = self.params.copy(dest=DatasetParams())
        params_dataset.dataset = dataset_name
        dataset_harvester = DatasetHarvesterMongoDatabase(params_dataset)
        self.copy(dest=dataset_harvester)
        dataset_harvester.params = params_dataset
        dataset_harvester._finalize_connection()
        return dataset_harvester

    def list_datasets(self, return_metadata: bool = True) -> Union[List[str], OrderedDict[str, DatasetMetadata]]:
        """
        List the server's databases (names, or name -> metadata when return_metadata).
        """
        # this would raise an unauthorized error !
        self.connect()
        dataset_list = self.mongo_client.list_database_names(self.mongo_session)
        if return_metadata:
            return OrderedDict([(name, self.get_dataset_harvester(name).query_dataset_metadata()) for name in dataset_list])
        else:
            return dataset_list
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class DatasetHarvesterMongoDatabase(DatabaseHarvesterMongoServer, DatasetHarvesterABC):
    """
    A CKAN dataset corresponds to a MongoDB database (set of collections).
    """
    def __init__(self, params:DatasetParams=None):
        """
        :param params: dataset parameters; `dataset` (or `database`) selects the Mongo database
        :raises HarvesterArgumentRequiredError: if neither dataset nor database is given
        """
        super().__init__(params)
        self.mongo_database: Union[pymongo.database.Database,None] = None
        self.dataset_metadata: Union[DatasetMetadata, None] = None # DatasetHarvesterABC
        # use database attribute if dataset is not specified (ambiguity on name - database attribute is not used above)
        if self.params.dataset is None:
            self.params.dataset = self.params.database
        if self.params.dataset is None:
            raise HarvesterArgumentRequiredError("dataset", "pymongo", "This argument defines the mongo database used")

    @staticmethod
    def init_from_options_string(options_string:str, base_dir:str=None) -> "DatasetHarvesterMongoDatabase":
        """Alternate constructor: parse a CLI-style options string into DatasetParams."""
        params = DatasetParams()
        params.parse_options_string(options_string, base_dir=base_dir)
        return DatasetHarvesterMongoDatabase(params)

    def _finalize_connection(self):
        """Bind the Database handle once the underlying client is connected."""
        if super().is_connected() and self.mongo_database is None:
            # remote_datasets = self.mongo_client.list_database_names(self.mongo_session)
            # assert_or_raise(self.params.dataset in remote_datasets, ResourceNotFoundError("Database", self.params.dataset, self.params.auth_url))
            self.mongo_database = self.mongo_client[self.params.dataset]

    def connect(self, *, cancel_if_connected:bool=True) -> Any:
        """Connect the client (unless already connected and reuse allowed) and bind the database."""
        if not (cancel_if_connected and self.is_connected()):
            super().connect(cancel_if_connected=cancel_if_connected)
            self._finalize_connection()
        return self.mongo_client

    def is_connected(self) -> bool:
        """Whether the underlying MongoClient is held."""
        return super().is_connected()

    def disconnect(self) -> None:
        """Drop the database handle and close the client."""
        if super().is_connected():
            self.mongo_database = None
        super().disconnect()

    def check_connection(self, *, new_connection: bool = False, raise_error: bool = False) -> Union[None, ContextErrorLevelMessage]:
        """
        Test the connection down to the database level; None on success,
        ContextErrorLevelMessage on failure (or re-raise when raise_error).
        """
        try:
            super().check_connection(new_connection=new_connection, raise_error=raise_error)
            remote_collections = self.mongo_database.list_collection_names(self.mongo_session) # this tests the database connection
        except Exception as e:
            if raise_error:
                raise e from e
            else:
                return ContextErrorLevelMessage("Mongo Harvester", ErrorLevel.Error,
                                                f"Failed to connect to {self.params.auth_url}: {e}")
        if self.mongo_database is None:
            return ContextErrorLevelMessage("Mongo Harvester", ErrorLevel.Error, f"Failed to connect to {self.params.auth_url}: <no error message>")
        else:
            return None

    def query_dataset_metadata(self, cancel_if_present:bool=True) -> DatasetMetadata:
        """Query (and cache) the dataset metadata: database name plus table metadata."""
        self.connect()
        if cancel_if_present and self.dataset_metadata is not None:
            return self.dataset_metadata
        else:
            self.dataset_metadata = DatasetMetadata()
            self.dataset_metadata.name = self.mongo_database.name
            self.dataset_metadata.tables = self.list_tables(return_metadata=True)
            return self.dataset_metadata

    def clean_dataset_metadata(self) -> DatasetMetadata:
        """Return a copy of the queried dataset metadata."""
        return self.query_dataset_metadata().copy()

    def get_table_harvester(self, table_name:str) -> "TableHarvesterMongoCollection":
        """Build a table-level harvester for one collection, sharing this connection."""
        params_table = self.params.copy(dest=TableParamsMongoCollection())
        if self.params.options_string is not None:
            # reparse options_string for table-specific arguments
            params_table.parse_options_string(self.params.options_string, base_dir=self.params.base_dir)
        params_table.table = table_name
        table_harvester = TableHarvesterMongoCollection(params_table)
        self.copy(dest=table_harvester)
        table_harvester.params = params_table
        table_harvester._finalize_connection()
        return table_harvester

    def list_tables(self, return_metadata:bool=True) -> Union[List[str], OrderedDict[str, TableMetadata]]:
        """List the database's collections, excluding internal system collections."""
        self.connect()
        remote_collections = [collection_name for collection_name in self.mongo_database.list_collection_names(session=self.mongo_session) if collection_name not in mongodb_excluded_collections]
        if return_metadata:
            return OrderedDict([(name, self.get_table_harvester(name).query_table_metadata()) for name in remote_collections])
        else:
            return remote_collections
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class TableHarvesterMongoCollection(DatasetHarvesterMongoDatabase, TableHarvesterABC):
    """
    A table (CKAN DataStore) corresponds to a MongoDB collection.
    """
    # default conversion of pymongo documents into an uploadable container
    _default_upload_fun = pymongo_default_df_conversion
    # by default the renamed Mongo ObjectId column acts as primary key
    _default_primary_key = [mongodb_id_new_column]
|
|
235
|
+
|
|
236
|
+
    def __init__(self, params:TableParamsMongoCollection=None):
        """
        :param params: table parameters; `table` (or the file/url attribute) selects the collection
        :raises HarvesterArgumentRequiredError: if no collection name is configured
        """
        super().__init__(params)
        self.params: TableParamsMongoCollection = params
        self.mongo_collection: Union[pymongo.collection.Collection,None] = None
        self.table_metadata: Union[TableMetadata, None] = None # TableHarvesterABC
        if self.params.file_url_attr is not None:
            # File/URL attribute has priority over CLI
            self.params.table = self.params.file_url_attr
        if self.params.table is None:
            raise HarvesterArgumentRequiredError("table", "pymongo", "This argument defines the mongo collection used")
|
|
246
|
+
|
|
247
|
+
    @staticmethod
    def init_from_options_string(options_string:str, *, base_dir:str=None, file_url_attr:str=None) -> "TableHarvesterMongoCollection":
        """Alternate constructor: parse a CLI-style options string into TableParamsMongoCollection."""
        params = TableParamsMongoCollection()
        params.parse_options_string(options_string, file_url_attr=file_url_attr, base_dir=base_dir)
        return TableHarvesterMongoCollection(params)
|
|
252
|
+
|
|
253
|
+
    def copy(self, *, dest=None):
        """Copy this harvester's state into dest (a new instance if None)."""
        if dest is None:
            dest = TableHarvesterMongoCollection()
        super().copy(dest=dest)
        return dest
|
|
258
|
+
|
|
259
|
+
    def disconnect(self) -> None:
        """Drop the collection handle, then disconnect the database/client."""
        if super().is_connected():
            self.mongo_collection = None
        super().disconnect()
|
|
263
|
+
|
|
264
|
+
    def _finalize_connection(self):
        """Bind the Collection handle once connected; verifies the collection exists remotely."""
        super()._finalize_connection()
        if super().is_connected() and self.mongo_collection is None:
            mongo_database = self.mongo_database
            remote_collections = mongo_database.list_collection_names()
            collection_name = self.params.table
            # fail fast with an explicit error when the collection does not exist
            assert_or_raise(collection_name in remote_collections, ResourceNotFoundError("Collection", self.params.dataset + '.' + collection_name, self.params.auth_url))
            collection = mongo_database[collection_name]
            self.mongo_collection = collection
|
|
273
|
+
|
|
274
|
+
def connect(self, *, cancel_if_connected:bool=True) -> Any:
|
|
275
|
+
if not (cancel_if_connected and self.is_connected()):
|
|
276
|
+
super().connect()
|
|
277
|
+
self._finalize_connection()
|
|
278
|
+
return self.mongo_client
|
|
279
|
+
|
|
280
|
+
def check_connection(self, *, new_connection:bool=False, raise_error:bool=False) -> Union[None, ContextErrorLevelMessage]:
|
|
281
|
+
super().check_connection(new_connection=new_connection, raise_error=raise_error)
|
|
282
|
+
if self.mongo_collection is None:
|
|
283
|
+
return ContextErrorLevelMessage("Mongo Harvester", ErrorLevel.Error, f"Failed to connect to {self.params.auth_url}: <no error message>")
|
|
284
|
+
else:
|
|
285
|
+
return None
|
|
286
|
+
|
|
287
|
+
    def query_table_metadata(self, cancel_if_present:bool=True) -> TableMetadata:
        """Query (and cache) table metadata: collection name and flattened index key names."""
        self.connect()
        if cancel_if_present and self.table_metadata is not None:
            return self.table_metadata
        else:
            # TODO: query at least primary key and indexes
            self.table_metadata = TableMetadata()
            index_dict = self.mongo_collection.index_information(session=self.mongo_session)
            self.table_metadata.name = self.mongo_collection.name
            # flatten the per-index key tuples into one list of field names
            self.table_metadata.indexes = sum([[key[0] for key in index["key"]] for index in index_dict.values()], [])
            return self.table_metadata
|
|
298
|
+
|
|
299
|
+
def clean_table_metadata(self) -> TableMetadata:
    """Return a copy of the collection metadata cleaned for CKAN use.

    The index list is cleaned by renaming or dropping "_id" and dropping
    dotted (sub-document) field names — but the cleaned list is then
    discarded entirely (see the final assignment).

    :return: a copy of the queried TableMetadata with ``indexes`` set to None.
    """
    clean_metadata = self.query_table_metadata().copy()
    if clean_metadata.indexes is not None:
        i_rm = []  # positions to drop, removed back-to-front below
        for i, name in enumerate(clean_metadata.indexes):
            if name == "_id":
                if mongodb_keep_id_column_trace:
                    # keep a trace of the Mongo _id under its renamed column
                    clean_metadata.indexes[i] = mongodb_id_new_column
                else:
                    i_rm.append(i)
            elif "." in name:
                # dotted names index sub-document fields: not representable as columns
                i_rm.append(i)
        # pop in reverse so earlier removals do not shift later positions
        for i in reversed(i_rm):
            clean_metadata.indexes.pop(i)
        # NOTE(review): this discards the list cleaned above, making the loop
        # effectively dead code — unless TableMetadata.copy() is shallow, in
        # which case the in-place edits above also mutated the cached
        # self.table_metadata.indexes. Confirm copy() semantics.
        clean_metadata.indexes = None # finally, do not specify indexes at all
    return clean_metadata
|
|
315
|
+
|
|
316
|
+
def get_default_primary_key(self) -> List[str]:
    """Return the primary key reported by the collection metadata,
    falling back to the class-level default when none is declared."""
    metadata = self.query_table_metadata()
    pk = metadata.primary_key
    return pk if pk is not None else TableHarvesterMongoCollection._default_primary_key
|
|
322
|
+
|
|
323
|
+
def get_default_data_cleaner(self) -> Union[CkanDataCleanerABC, None]:
    """Build the default pymongo data cleaner, configured from the
    harvester parameters."""
    cleaner = pymongo_default_data_cleaner()
    # a DBRef stays a single column unless expansion was requested
    cleaner.param_mongodb_dbref_as_one_column = not self.params.dbref_expand
    return cleaner
|
|
327
|
+
|
|
328
|
+
def list_queries(self, *, new_connection:bool=False) -> List[Dict[str,Any]]:
    """Build the list of paginated queries covering the whole collection.

    Each query is an OrderedDict with "$match", "$skip" and (except for the
    open-ended tail query) "$limit" entries, later consumed by query_data.

    :param new_connection: if True, force a fresh connection before counting.
    :return: the list of query dictionaries.
    """
    self.connect(cancel_if_connected=not new_connection)
    assert(self.mongo_collection is not None)
    query = json.loads(self.params.query_string) if self.params.query_string is not None else {}
    if self.params.verbose_harvester:
        print(f"Counting documents of table {self.params.table}")
    # count_documents(query) would be exact but slower; the estimate suffices
    # because an open-ended tail query below catches any remaining documents
    # num_rows = self.mongo_collection.count_documents(query, session=self.mongo_session)
    num_rows = self.mongo_collection.estimated_document_count()
    num_queries = num_rows // self.params.limit + 1
    if self.params.single_request:
        # a single first page honoring the configured limit
        # (replaces the original one-iteration range(1) comprehension)
        return [OrderedDict([("$match", query), ("$skip", 0), ("$limit", self.params.limit)])]
    else:
        queries_exact = [OrderedDict([("$match", query), ("$skip", i * self.params.limit), ("$limit", self.params.limit)]) for i in range(num_queries)]
        # open-ended tail: documents beyond the (estimated) count
        query_extra = OrderedDict([("$match", query), ("$skip", num_queries * self.params.limit)])
        return queries_exact + [query_extra]
|
|
343
|
+
|
|
344
|
+
def query_data(self, query:Union[Dict[str,Any], str]) -> List[dict]:
    """Fetch the documents selected by one paginated query.

    :param query: a query dict from list_queries (keys "$match", "$skip" and
        optionally "$limit"), or its JSON string representation
        (annotation widened to match the existing isinstance handling).
    :return: the matching documents as a list of dicts.
    """
    assert(self.mongo_collection is not None)
    if isinstance(query, str):
        # accept a JSON-encoded query as well
        query = json.loads(query)
    if self.params.verbose_harvester:
        print(f"Pymongo request {query} on table {self.params.table}")
    cursor = self.mongo_collection.find(query["$match"], session=self.mongo_session).skip(query["$skip"])
    if "$limit" in query:
        cursor = cursor.limit(query["$limit"])
    return list(cursor)
|
|
355
|
+
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Harvest from a mongo database using pymongo
|
|
5
|
+
"""
|
|
6
|
+
import argparse
|
|
7
|
+
|
|
8
|
+
from ckanapi_harvesters.harvesters.harvester_params import TableParams
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TableParamsMongoCollection(TableParams):
    """
    Table parameters for a MongoDB source: one CKAN DataStore table maps to
    one MongoDB collection. Adds a ``collection`` alias for the table name
    and a ``dbref_expand`` option for the data cleaner.
    """
    def __init__(self, source: "TableParamsMongoCollection" = None):
        super().__init__(source)
        self.dbref_expand: bool = False  # expand DBRefs instead of a single column
        if source is not None:
            source.copy(dest=self)

    @property
    def collection(self) -> str:
        """Alias of the table name (a MongoDB collection)."""
        return self.table

    @collection.setter
    def collection(self, value: str):
        self.table = value

    def copy(self, *, dest=None):
        """Copy this parameter set into ``dest`` (a fresh instance when None)."""
        target = TableParamsMongoCollection() if dest is None else dest
        super().copy(dest=target)
        target.dbref_expand = self.dbref_expand
        return target

    @staticmethod
    def setup_cli_harvester_parser(parser: argparse.ArgumentParser = None) -> argparse.ArgumentParser:
        """Register the MongoDB-specific CLI options on top of the base table options."""
        parser = TableParams.setup_cli_harvester_parser(parser=parser)
        parser.add_argument("--collection", type=str,
                            help="MongoDB collection name") # normally specified in the File/URL attribute of builder
        # parser.add_argument("--table", help=argparse.SUPPRESS) # do not display in help ==> conflict
        parser.add_argument("--dbref-expand",
                            help="Option to expand DBRefs",
                            action="store_true", default=False) # applies to data cleaner
        return parser

    def initialize_from_cli_args(self, args: argparse.Namespace, base_dir: str = None, error_not_found: bool = True,
                                 default_proxies: dict = None, proxy_headers: dict = None) -> None:
        """Apply parsed CLI arguments to this parameter set."""
        super().initialize_from_cli_args(args, base_dir=base_dir, error_not_found=error_not_found,
                                         default_proxies=default_proxies, proxy_headers=proxy_headers)
        self.dbref_expand = args.dbref_expand
        if args.collection is not None:
            self.collection = args.collection
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Package to enforce CKAN data policies.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
POLICY_FILE_FORMAT_VERSION = "0.0.0" # version of the data format policy file format
|
|
8
|
+
|
|
9
|
+
from . import data_format_policy_defs
|
|
10
|
+
from . import data_format_policy_errors
|
|
11
|
+
from . import data_format_policy_abc
|
|
12
|
+
from . import data_format_policy_lists
|
|
13
|
+
from . import data_format_policy_tag_groups
|
|
14
|
+
from . import data_format_policy_custom_fields
|
|
15
|
+
from . import data_format_policy
|
|
16
|
+
|
|
17
|
+
# usage shortcuts
|
|
18
|
+
from ckanapi_harvesters.policies.data_format_policy import CkanPackageDataFormatPolicy
|
|
19
|
+
|
|
20
|
+
|