ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,173 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Harvest from a mongo database using pymongo
5
+ """
6
+ from typing import Union, List, Any, Dict, Set
7
+ from types import SimpleNamespace
8
+ from collections import OrderedDict
9
+ from warnings import warn
10
+ import copy
11
+
12
+ import pandas as pd
13
+
14
+
15
+ try:
16
+ import bson
17
+ except ImportError:
18
+ bson = SimpleNamespace(ObjectId=None, DBRef=None)
19
+
20
+
21
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanField
22
+ from ckanapi_harvesters.auxiliary.list_records import ListRecords, records_to_df
23
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_errors import CleanerRequirementError
24
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_upload import CkanDataCleanerUpload, _pd_series_type_detect
25
+
26
+
27
# Module-level defaults used by the MongoDB data cleaner and harvester.
mongodb_keep_id_column_trace:bool = True
mongodb_keep_class_column_trace:bool = True
mongodb_id_column:str = "_id"  # name of the identifier field as found in the MongoDB documents
mongodb_id_new_column:str = "ObjectId"  # column name substituted for mongodb_id_column by the data cleaner
mongodb_id_datatype_numeric:bool = False  # option to store ids as a numeric datatype - no existing numeric datatype corresponds (see next line)
mongodb_id_alt_type_numeric:str = "int12"  # the ObjectIds are 12 byte integers (int96) - there is no such data type in Postgre (oid?)
mongodb_dbref_extract_new_id_column_max_level:int = 1  # maximum nesting level at which a DBRef found in a json field creates a new column
mongodb_dbref_alt_type:str = "json"  # data type of a DBRef field when it is not reduced to a single id column
35
+
36
+
37
def pymongo_default_df_conversion(documents: List[dict], **kwargs) -> Union[pd.DataFrame, ListRecords]:
    """
    Default conversion of the documents returned by a MongoDB query into a table-like object.

    The documents are wrapped in a ListRecords object without any other transformation
    (renaming of the id column is left to the data cleaner).

    :param documents: documents returned by the query
    :param kwargs: accepted for signature compatibility, not used
    :return: the documents wrapped in a ListRecords
    """
    return ListRecords(documents)
45
+
46
+
47
class BrokenMongoRefError(Exception):
    """
    Error raised when the DBRefs found under a same field path
    do not all point to a single collection or database.
    """
49
+
50
+
51
class MongoDataCleanerUpload(CkanDataCleanerUpload):
    """
    Data cleaner operations specific to MongoDB objects.

    ObjectId values are converted to text (or to an integer if mongodb_id_datatype_numeric is set).
    DBRef values are either reduced to the id they reference (param_mongodb_dbref_as_one_column)
    or expanded to a dictionary holding the id, collection and database.
    """
    def __init__(self):
        """
        :raises CleanerRequirementError: if the bson package could not be imported
        """
        super().__init__()
        # change default parameters
        self.param_field_subs[mongodb_id_column] = mongodb_id_new_column
        self.param_field_primary_key = [mongodb_id_new_column]
        self.param_apply_field_changes = True
        # specific options
        self.param_mongodb_dbref_as_one_column:bool = True  # option to extract only the ObjectId referenced by the DBRefs
        # collections / databases referenced by the DBRefs, per field path
        self.collection_refs:Dict[str,Set[str]] = {}
        self.database_refs:Dict[str,Set[str]] = {}
        # field paths referencing more than one collection / database (filled by _extra_checks)
        self.broken_collection_refs:List[str] = []
        self.broken_database_refs:List[str] = []
        if bson.DBRef is None:
            raise CleanerRequirementError("bson", "DBRef, ObjectId")

    def clear_outputs_new_dataframe(self):
        """Reset the outputs which are specific to one DataFrame."""
        super().clear_outputs_new_dataframe()
        self.broken_collection_refs = []
        self.broken_database_refs = []

    def clear_all_outputs(self):
        """Reset all outputs, including the references accumulated per field path."""
        super().clear_all_outputs()
        self.collection_refs = {}
        self.database_refs = {}

    def copy(self, dest=None) -> "MongoDataCleanerUpload":
        """
        Copy the parameters and accumulated references to another data cleaner.

        :param dest: destination object; a new MongoDataCleanerUpload is created if None
        :return: the destination object
        """
        if dest is None:
            dest = MongoDataCleanerUpload()
        super().copy(dest=dest)
        dest.param_mongodb_dbref_as_one_column = self.param_mongodb_dbref_as_one_column
        dest.collection_refs = copy.deepcopy(self.collection_refs)
        dest.database_refs = copy.deepcopy(self.database_refs)
        return dest

    @staticmethod
    def _object_id_data_type() -> str:
        """Data type of a column holding ObjectIds, according to mongodb_id_datatype_numeric."""
        return mongodb_id_alt_type_numeric if mongodb_id_datatype_numeric else "text"

    @staticmethod
    def _convert_object_id(object_id: Any) -> Union[int, str]:
        """Convert an id to its text form, or to an integer parsed from this hexadecimal text in numeric mode."""
        id_text = str(object_id)
        if mongodb_id_datatype_numeric:
            return int(id_text, 16)
        return id_text

    def _detect_standard_field_bypass(self, field_name: str, values: Union[Any, pd.Series]) -> Union[CkanField,None]:
        """
        Propose a field definition if the values are DBRefs or ObjectIds.

        :return: a CkanField, or None if the values are not of a MongoDB-specific type
        """
        if _pd_series_type_detect(values, bson.DBRef):
            if self.param_mongodb_dbref_as_one_column:
                return CkanField(field_name, self._object_id_data_type())
            return CkanField(field_name, mongodb_dbref_alt_type)
        if _pd_series_type_detect(values, bson.ObjectId):
            return CkanField(field_name, self._object_id_data_type())
        return None

    def _replace_dbref(self, dbref: Any, path: str, level: int) -> Any:
        """
        Replace a DBRef by the id it references or by a dictionary (id, collection, database).

        :param dbref: the DBRef value
        :param path: path of the value in the document (dot-separated)
        :param level: nesting level of the value (0 for a top-level field)
        :return: the replacement value
        """
        as_one_column = self.param_mongodb_dbref_as_one_column
        id_field = path.replace(".", "_")
        if as_one_column:
            # at level 0, the field itself holds the id: no extra column is needed
            id_path = None if level == 0 else path
        else:
            id_path = path + "." + mongodb_id_new_column
            if level == 0:
                id_field = id_field + "_id"
        if (level <= mongodb_dbref_extract_new_id_column_max_level and
                id_path is not None and id_path not in self.field_subs_path):
            self._add_field_from_path(id_path,
                                      data_type=self._object_id_data_type(),
                                      new_field_name=id_field,
                                      notes=f"Column extracted from {id_path}")
        id_value = self._convert_object_id(dbref.id)
        if as_one_column:
            # keep track of the referenced collections/databases: they are checked in _extra_checks
            self.collection_refs.setdefault(path, set()).add(str(dbref.collection))
            self.database_refs.setdefault(path, set()).add(str(dbref.database))
            return id_value
        new_subvalue = {mongodb_id_new_column: id_value,
                        "collection": dbref.collection,
                        "database": dbref.database,
                        }
        if id_path in self.field_subs_path:
            self._new_columns_in_row[id_path] = id_value
        return new_subvalue

    def _replace_non_standard_subvalue(self, subvalue:Any, field:CkanField, path:str, level:int,
                                       *, field_data_type:str) -> Any:
        """
        Replace ObjectId and DBRef values; other values are delegated to the parent class.

        :param subvalue: value to replace
        :param field: field definition (can be None)
        :param path: path of the value in the document
        :param level: nesting level of the value (0 for a top-level field)
        :param field_data_type: data type of the field, passed on to the parent class
        :return: the replacement value
        """
        field_name = field.name if field is not None else None
        if isinstance(subvalue, bson.ObjectId):
            new_subvalue = self._convert_object_id(subvalue)
            if level == 0:
                self.field_suggested_index.add(field_name)
            return new_subvalue
        if isinstance(subvalue, bson.DBRef):
            return self._replace_dbref(subvalue, path, level)
        if level == 0:
            return super()._replace_non_standard_value(subvalue, field, field_data_type=field_data_type)
        return super()._replace_non_standard_subvalue(subvalue, field, path, level, field_data_type=field_data_type)

    def _replace_non_standard_value(self, value:Any, field:CkanField,
                                    *, field_data_type:str) -> Any:
        """Entry point for top-level values: treated as a sub-value of level 0 whose path is the field name."""
        field_name = field.name if field is not None else None
        return self._replace_non_standard_subvalue(value, field, path=field_name,
                                                   level=0, field_data_type=field_data_type)

    def _extra_checks(self, records: Union[List[dict], pd.DataFrame], fields:Union[OrderedDict[str, CkanField], None]) -> None:
        """
        Check that the DBRefs of each field path point to one collection and one database.

        :raises BrokenMongoRefError: if a path references several collections or databases and
            either param_raise_error or param_mongodb_dbref_as_one_column is set
            (otherwise, a warning is issued)
        """
        self.broken_collection_refs = [path for path, refs in self.collection_refs.items() if len(refs) > 1]
        self.broken_database_refs = [path for path, refs in self.database_refs.items() if len(refs) > 1]
        if self.broken_collection_refs or self.broken_database_refs:
            # sorted: the message must not depend on the iteration order of a set
            broken_refs = sorted(set(self.broken_collection_refs) | set(self.broken_database_refs))
            msg = f"DBRefs do not point to an unique collection: {', '.join(broken_refs)}"
            if self.param_raise_error or self.param_mongodb_dbref_as_one_column:
                raise BrokenMongoRefError(msg)
            else:
                warn(msg)
170
+
171
def pymongo_default_data_cleaner() -> MongoDataCleanerUpload:
    """
    Create the data cleaner used by default for data harvested from MongoDB.

    :return: a new MongoDataCleanerUpload with its default parameters
    """
    cleaner = MongoDataCleanerUpload()
    return cleaner
173
+
@@ -0,0 +1,355 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Harvest from a mongo database using pymongo
5
+ """
6
+ from typing import Union, List, Any, Dict
7
+ from types import SimpleNamespace
8
+ from collections import OrderedDict
9
+ import json
10
+ import argparse
11
+
12
+
13
+ try:
14
+ import pymongo
15
+ import pymongo.client_session
16
+ import pymongo.database
17
+ except ImportError:
18
+ pymongo = SimpleNamespace(MongoClient=None, client_session=SimpleNamespace(ClientSession=None),
19
+ database=SimpleNamespace(Database=None), collection=SimpleNamespace(Collection=None))
20
+
21
+
22
+ from ckanapi_harvesters.harvesters.harvester_errors import (HarvesterRequirementError, HarvesterArgumentRequiredError, ResourceNotFoundError)
23
+ from ckanapi_harvesters.harvesters.harvester_abc import TableHarvesterABC, DatasetHarvesterABC, DatabaseHarvesterABC
24
+ from ckanapi_harvesters.harvesters.harvester_model import TableMetadata, DatasetMetadata
25
+ from ckanapi_harvesters.harvesters.harvester_params import DatasetParams, DatabaseParams
26
+ from ckanapi_harvesters.harvesters.pymongo_data_cleaner import pymongo_default_data_cleaner, pymongo_default_df_conversion
27
+ from ckanapi_harvesters.harvesters.pymongo_data_cleaner import mongodb_keep_id_column_trace, mongodb_id_new_column
28
+ from ckanapi_harvesters.harvesters.pymongo_params import TableParamsMongoCollection
29
+ from ckanapi_harvesters.auxiliary.urls import url_join
30
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import ssl_arguments_decompose, assert_or_raise
31
+ from ckanapi_harvesters.auxiliary.ckan_errors import UrlError
32
+ from ckanapi_harvesters.harvesters.data_cleaner.data_cleaner_abc import CkanDataCleanerABC
33
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
34
+
35
+
36
# Collection names ignored when listing the tables (collections) of a MongoDB database.
mongodb_excluded_collections = {"system.profile"}
37
+
38
+
39
class DatabaseHarvesterMongoServer(DatabaseHarvesterABC):
    """
    This class manages the connection to a MongoDB server.
    It can list datasets (MongoDB databases) but this call could lead to an error.
    """
    def __init__(self, params:DatabaseParams=None):
        """
        :param params: connection parameters
        :raises HarvesterRequirementError: if the pymongo package could not be imported
        :raises HarvesterArgumentRequiredError: if none of auth_url, host and port is specified
        """
        super().__init__(params)
        if pymongo.MongoClient is None:
            raise HarvesterRequirementError("pymongo", "pymongo")
        self.params.harvest_method = "Pymongo"
        self.mongo_client: Union[pymongo.MongoClient,None] = None
        self.mongo_session: Union[pymongo.client_session.ClientSession,None] = None
        if self.params.auth_url is None and self.params.port is None and self.params.host is None:
            raise HarvesterArgumentRequiredError("auth-url", "pymongo", "This argument defines the url used to authenticate.")

    @staticmethod
    def init_from_options_string(options_string:str, base_dir:str=None) -> "DatabaseHarvesterMongoServer":
        """Create a harvester from an options string parsed by DatabaseParams."""
        params = DatabaseParams()
        params.parse_options_string(options_string, base_dir=base_dir)
        return DatabaseHarvesterMongoServer(params)

    def copy(self, *, dest=None):
        """Copy this harvester to dest (a new DatabaseHarvesterMongoServer if None)."""
        if dest is None:
            dest = DatabaseHarvesterMongoServer()
        return super().copy(dest=dest)

    def connect(self, *, cancel_if_connected:bool=True) -> Any:
        """
        Open the MongoDB client and a session.

        :param cancel_if_connected: if True, an existing client is returned as is.
            If False, the existing connection is closed and a new one is opened.
        :return: the MongoDB client
        :raises UrlError: if no url can be derived from the parameters
        """
        if self.mongo_client is not None:
            if cancel_if_connected:
                return self.mongo_client
            # Reconnection: go through disconnect() so the session is only ended if it exists
            # and the handles held by subclasses (bound to the closed client) are reset too.
            self.disconnect()
        ssl, ssl_certfile = ssl_arguments_decompose(self.params.verify_ca)
        auth_url = self.params.auth_url
        if auth_url is None:
            # derive the authentication url from the other parameters and remember it
            if self.params.url is not None:
                auth_url = self.params.url
            elif self.params.host is not None:
                auth_url = f"mongodb://{self.params.host}"
                if self.params.port is not None:
                    auth_url += f":{self.params.port}"
            else:
                raise UrlError("No Mongo URL provided")
            if self.params.auth_url_suffix is not None:
                auth_url = url_join(auth_url, self.params.auth_url_suffix)
            self.params.auth_url = auth_url
        timeout_ms = self.params.timeout*1000.0 if self.params.timeout is not None else None
        self.mongo_client = pymongo.MongoClient(auth_url, username=self.params.login.username, password=self.params.login.password,
                                                ssl=ssl, tlscafile=ssl_certfile,
                                                timeoutMS=timeout_ms)
        self.mongo_session = self.mongo_client.start_session()
        if self.params.host is None and self.params.port is None:
            # complete with host and port parsed by MongoClient
            mongo_address = self.mongo_client.address
            if mongo_address is not None:
                self.params.host, self.params.port = mongo_address
        return self.mongo_client

    def is_connected(self) -> bool:
        """Return True if a MongoDB client is currently held."""
        return self.mongo_client is not None

    def disconnect(self) -> None:
        """End the session and close the client, if any."""
        if self.mongo_client is not None:
            if self.mongo_session is not None:
                self.mongo_session.end_session()
            self.mongo_client.close()
            self.mongo_client = None
            self.mongo_session = None

    def check_connection(self, *, new_connection:bool=False, raise_error:bool=False) -> Union[None, ContextErrorLevelMessage]:
        """
        Test the connection to the server.

        :param new_connection: force a new connection
        :param raise_error: re-raise the error instead of returning a message
        :return: None on success, an error message otherwise
        """
        try:
            self.connect(cancel_if_connected=not new_connection)
            # listing the database names would test the connection but requires specific admin rights (unexpected)
        except Exception as e:
            if raise_error:
                raise  # bare raise: keeps the original traceback without a self-referencing cause
            return ContextErrorLevelMessage("Mongo Harvester", ErrorLevel.Error, f"Failed to connect to {self.params.auth_url}: {e}")
        return None

    def get_dataset_harvester(self, dataset_name: str) -> "DatasetHarvesterMongoDatabase":
        """
        Create a harvester for a MongoDB database, sharing the state copied from this object.

        :param dataset_name: name of the MongoDB database
        """
        params_dataset = self.params.copy(dest=DatasetParams())
        params_dataset.dataset = dataset_name
        dataset_harvester = DatasetHarvesterMongoDatabase(params_dataset)
        self.copy(dest=dataset_harvester)
        # copy() may have overwritten the parameters: restore the dataset parameters
        dataset_harvester.params = params_dataset
        dataset_harvester._finalize_connection()
        return dataset_harvester

    def list_datasets(self, return_metadata: bool = True) -> Union[List[str], OrderedDict[str, DatasetMetadata]]:
        """
        List the databases of the server.

        :param return_metadata: if True, return the metadata of each database, indexed by name
        :return: list of names, or ordered dictionary of metadata
        """
        # this would raise an unauthorized error !
        self.connect()
        dataset_list = self.mongo_client.list_database_names(session=self.mongo_session)
        if return_metadata:
            return OrderedDict([(name, self.get_dataset_harvester(name).query_dataset_metadata()) for name in dataset_list])
        else:
            return dataset_list
138
+
139
+
140
class DatasetHarvesterMongoDatabase(DatabaseHarvesterMongoServer, DatasetHarvesterABC):
    """
    A CKAN dataset corresponds to a MongoDB database (set of collections).
    """
    def __init__(self, params:DatasetParams=None):
        """
        :param params: connection parameters, including the dataset (MongoDB database) name
        :raises HarvesterArgumentRequiredError: if neither dataset nor database is specified
        """
        super().__init__(params)
        self.mongo_database: Union[pymongo.database.Database,None] = None
        self.dataset_metadata: Union[DatasetMetadata, None] = None  # DatasetHarvesterABC
        # use database attribute if dataset is not specified (ambiguity on name - database attribute is not used above)
        if self.params.dataset is None:
            self.params.dataset = self.params.database
        if self.params.dataset is None:
            raise HarvesterArgumentRequiredError("dataset", "pymongo", "This argument defines the mongo database used")

    @staticmethod
    def init_from_options_string(options_string:str, base_dir:str=None) -> "DatasetHarvesterMongoDatabase":
        """Create a harvester from an options string parsed by DatasetParams."""
        params = DatasetParams()
        params.parse_options_string(options_string, base_dir=base_dir)
        return DatasetHarvesterMongoDatabase(params)

    def _finalize_connection(self):
        """Bind the database handle once a client is available (the existence of the database is not checked)."""
        if super().is_connected() and self.mongo_database is None:
            self.mongo_database = self.mongo_client[self.params.dataset]

    def connect(self, *, cancel_if_connected:bool=True) -> Any:
        """
        Connect to the server and bind the database handle.

        :param cancel_if_connected: if False, an existing connection is closed and reopened
        :return: the MongoDB client
        """
        if not (cancel_if_connected and self.is_connected()):
            # a reconnection must not keep handles bound to the previous (closed) client;
            # disconnect() does nothing if there is no connection
            self.disconnect()
            super().connect(cancel_if_connected=cancel_if_connected)
            self._finalize_connection()
        return self.mongo_client

    def is_connected(self) -> bool:
        """Return True if a MongoDB client is currently held."""
        return super().is_connected()

    def disconnect(self) -> None:
        """Release the database handle and close the connection."""
        if super().is_connected():
            self.mongo_database = None
            super().disconnect()

    def check_connection(self, *, new_connection: bool = False, raise_error: bool = False) -> Union[None, ContextErrorLevelMessage]:
        """
        Test the connection by listing the collections of the database.

        :param new_connection: force a new connection
        :param raise_error: re-raise the error instead of returning a message
        :return: None on success, an error message otherwise
        """
        # report the failure of the server connection as is, instead of masking it with a later error
        message = super().check_connection(new_connection=new_connection, raise_error=raise_error)
        if message is not None:
            return message
        try:
            self.mongo_database.list_collection_names(session=self.mongo_session)  # this tests the database connection
        except Exception as e:
            if raise_error:
                raise
            return ContextErrorLevelMessage("Mongo Harvester", ErrorLevel.Error,
                                            f"Failed to connect to {self.params.auth_url}: {e}")
        return None

    def query_dataset_metadata(self, cancel_if_present:bool=True) -> DatasetMetadata:
        """
        Query the name of the database and the metadata of its collections.

        :param cancel_if_present: return the metadata of a previous call, if any
        """
        self.connect()
        if cancel_if_present and self.dataset_metadata is not None:
            return self.dataset_metadata
        self.dataset_metadata = DatasetMetadata()
        self.dataset_metadata.name = self.mongo_database.name
        self.dataset_metadata.tables = self.list_tables(return_metadata=True)
        return self.dataset_metadata

    def clean_dataset_metadata(self) -> DatasetMetadata:
        """Return a copy of the dataset metadata."""
        return self.query_dataset_metadata().copy()

    def get_table_harvester(self, table_name:str) -> "TableHarvesterMongoCollection":
        """
        Create a harvester for a collection of this database, sharing the state copied from this object.

        :param table_name: name of the MongoDB collection
        """
        params_table = self.params.copy(dest=TableParamsMongoCollection())
        if self.params.options_string is not None:
            # reparse options_string for table-specific arguments
            params_table.parse_options_string(self.params.options_string, base_dir=self.params.base_dir)
        params_table.table = table_name
        table_harvester = TableHarvesterMongoCollection(params_table)
        self.copy(dest=table_harvester)
        # copy() may have overwritten the parameters: restore the table parameters
        table_harvester.params = params_table
        table_harvester._finalize_connection()
        return table_harvester

    def list_tables(self, return_metadata:bool=True) -> Union[List[str], OrderedDict[str, TableMetadata]]:
        """
        List the collections of the database, except those in mongodb_excluded_collections.

        :param return_metadata: if True, return the metadata of each collection, indexed by name
        :return: list of names, or ordered dictionary of metadata
        """
        self.connect()
        remote_collections = [collection_name
                              for collection_name in self.mongo_database.list_collection_names(session=self.mongo_session)
                              if collection_name not in mongodb_excluded_collections]
        if return_metadata:
            return OrderedDict([(name, self.get_table_harvester(name).query_table_metadata()) for name in remote_collections])
        else:
            return remote_collections
227
+
228
+
229
class TableHarvesterMongoCollection(DatasetHarvesterMongoDatabase, TableHarvesterABC):
    """
    A table (CKAN DataStore) corresponds to a MongoDB collection.
    """
    _default_upload_fun = pymongo_default_df_conversion
    _default_primary_key = [mongodb_id_new_column]

    def __init__(self, params:TableParamsMongoCollection=None):
        """
        :param params: connection parameters, including the table (MongoDB collection) name
        :raises HarvesterArgumentRequiredError: if the table name is not specified
        """
        super().__init__(params)
        self.params: TableParamsMongoCollection = params
        self.mongo_collection: Union[pymongo.collection.Collection,None] = None
        self.table_metadata: Union[TableMetadata, None] = None  # TableHarvesterABC
        if self.params.file_url_attr is not None:
            # File/URL attribute has priority over CLI
            self.params.table = self.params.file_url_attr
        if self.params.table is None:
            raise HarvesterArgumentRequiredError("table", "pymongo", "This argument defines the mongo collection used")

    @staticmethod
    def init_from_options_string(options_string:str, *, base_dir:str=None, file_url_attr:str=None) -> "TableHarvesterMongoCollection":
        """Create a harvester from an options string parsed by TableParamsMongoCollection."""
        params = TableParamsMongoCollection()
        params.parse_options_string(options_string, file_url_attr=file_url_attr, base_dir=base_dir)
        return TableHarvesterMongoCollection(params)

    def copy(self, *, dest=None):
        """Copy this harvester to dest (a new TableHarvesterMongoCollection if None)."""
        if dest is None:
            dest = TableHarvesterMongoCollection()
        super().copy(dest=dest)
        return dest

    def disconnect(self) -> None:
        """Release the collection handle and close the connection."""
        if super().is_connected():
            self.mongo_collection = None
            super().disconnect()

    def _finalize_connection(self):
        """
        Bind the database and collection handles once a client is available.

        :raises ResourceNotFoundError: if the collection does not exist in the database
            (error raised through assert_or_raise)
        """
        super()._finalize_connection()
        if super().is_connected() and self.mongo_collection is None:
            mongo_database = self.mongo_database
            remote_collections = mongo_database.list_collection_names()
            collection_name = self.params.table
            assert_or_raise(collection_name in remote_collections, ResourceNotFoundError("Collection", self.params.dataset + '.' + collection_name, self.params.auth_url))
            self.mongo_collection = mongo_database[collection_name]

    def connect(self, *, cancel_if_connected:bool=True) -> Any:
        """
        Connect to the server and bind the database and collection handles.

        :param cancel_if_connected: if False, an existing connection is closed and reopened
        :return: the MongoDB client
        """
        if not (cancel_if_connected and self.is_connected()):
            # a reconnection must not keep handles bound to the previous (closed) client;
            # disconnect() does nothing if there is no connection
            self.disconnect()
            # the flag is forwarded so that a new connection is really opened when requested
            super().connect(cancel_if_connected=cancel_if_connected)
            self._finalize_connection()
        return self.mongo_client

    def check_connection(self, *, new_connection:bool=False, raise_error:bool=False) -> Union[None, ContextErrorLevelMessage]:
        """
        Test the connection to the database and check that the collection handle is bound.

        :param new_connection: force a new connection
        :param raise_error: re-raise the error instead of returning a message
        :return: None on success, an error message otherwise
        """
        # keep the message of the parent check: it carries the actual cause of the failure
        message = super().check_connection(new_connection=new_connection, raise_error=raise_error)
        if message is not None:
            return message
        if self.mongo_collection is None:
            return ContextErrorLevelMessage("Mongo Harvester", ErrorLevel.Error, f"Failed to connect to {self.params.auth_url}: <no error message>")
        return None

    def query_table_metadata(self, cancel_if_present:bool=True) -> TableMetadata:
        """
        Query the name of the collection and the fields used by its indexes.

        :param cancel_if_present: return the metadata of a previous call, if any
        """
        self.connect()
        if cancel_if_present and self.table_metadata is not None:
            return self.table_metadata
        # TODO: query at least primary key and indexes
        self.table_metadata = TableMetadata()
        index_dict = self.mongo_collection.index_information(session=self.mongo_session)
        self.table_metadata.name = self.mongo_collection.name
        # flat list of the first element of each key of each index, in the order returned
        self.table_metadata.indexes = [key[0] for index in index_dict.values() for key in index["key"]]
        return self.table_metadata

    def clean_table_metadata(self) -> TableMetadata:
        """
        Return a copy of the table metadata adapted for the upload.

        The index names are cleaned ("_id" renamed or removed, names containing a dot removed)
        but the result is then discarded: no index is specified in the returned metadata.
        """
        clean_metadata = self.query_table_metadata().copy()
        if clean_metadata.indexes is not None:
            i_rm = []
            for i, name in enumerate(clean_metadata.indexes):
                if name == "_id":
                    if mongodb_keep_id_column_trace:
                        clean_metadata.indexes[i] = mongodb_id_new_column
                    else:
                        i_rm.append(i)
                elif "." in name:
                    i_rm.append(i)
            # remove from the end so that the remaining positions stay valid
            for i in reversed(i_rm):
                clean_metadata.indexes.pop(i)
        clean_metadata.indexes = None  # finally, do not specify indexes at all
        return clean_metadata

    def get_default_primary_key(self) -> List[str]:
        """
        Return the primary key of the table metadata or, if it is not defined, the default primary key.
        """
        table_metadata = self.query_table_metadata()
        if table_metadata.primary_key is not None:
            return table_metadata.primary_key
        # return a copy: the class-level default must not be modified through the returned list
        return list(TableHarvesterMongoCollection._default_primary_key)

    def get_default_data_cleaner(self) -> Union[CkanDataCleanerABC, None]:
        """Return the default MongoDB data cleaner, configured with the dbref_expand parameter."""
        data_cleaner = pymongo_default_data_cleaner()
        data_cleaner.param_mongodb_dbref_as_one_column = not self.params.dbref_expand
        return data_cleaner

    def list_queries(self, *, new_connection:bool=False) -> List[Dict[str,Any]]:
        """
        Build the list of paginated queries used to download the collection.

        Each query is a dictionary with the keys "$match", "$skip" and, except for the last one, "$limit".
        The number of queries derives from an estimated document count: the last query has no limit
        in order to retrieve any remaining document.

        :param new_connection: force a new connection
        :return: a single query if the single_request parameter is set, the full list otherwise
        """
        self.connect(cancel_if_connected=not new_connection)
        assert(self.mongo_collection is not None)
        query = json.loads(self.params.query_string) if self.params.query_string is not None else {}
        if self.params.verbose_harvester:
            print(f"Counting documents of table {self.params.table}")
        num_rows = self.mongo_collection.estimated_document_count()
        limit = self.params.limit
        num_queries = num_rows // limit + 1
        if self.params.single_request:
            return [OrderedDict([("$match", query), ("$skip", 0 * limit), ("$limit", limit)])]
        queries_exact = [OrderedDict([("$match", query), ("$skip", i * limit), ("$limit", limit)]) for i in range(num_queries)]
        query_extra = OrderedDict([("$match", query), ("$skip", num_queries * limit)])
        return queries_exact + [query_extra]

    def query_data(self, query:Dict[str,Any]) -> List[dict]:
        """
        Execute one of the queries returned by list_queries.

        :param query: dictionary with the keys "$match", "$skip" and optionally "$limit"
            (a JSON string is also accepted)
        :return: the list of documents
        """
        assert(self.mongo_collection is not None)
        if isinstance(query, str):
            query = json.loads(query)
        if self.params.verbose_harvester:
            print(f"Pymongo request {query} on table {self.params.table}")
        cursor = self.mongo_collection.find(query["$match"], session=self.mongo_session).skip(query["$skip"])
        if "$limit" in query:
            cursor = cursor.limit(query["$limit"])
        return list(cursor)
355
+
@@ -0,0 +1,54 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Harvest from a mongo database using pymongo
5
+ """
6
+ import argparse
7
+
8
+ from ckanapi_harvesters.harvesters.harvester_params import TableParams
9
+
10
+
11
class TableParamsMongoCollection(TableParams):
    """
    Parameters of a harvester reading a MongoDB collection (a table, i.e. a CKAN DataStore).
    The collection attribute is an alias of the table attribute inherited from TableParams.
    """
    def __init__(self, source: "TableParamsMongoCollection" =None):
        """
        :param source: optional object whose values are copied into the new object
        """
        super().__init__(source)
        self.dbref_expand:bool = False
        if source is None:
            return
        source.copy(dest=self)

    # the MongoDB collection name is stored in the table attribute
    @property
    def collection(self) -> str:
        return self.table

    @collection.setter
    def collection(self, value: str):
        self.table = value

    def copy(self, *, dest=None):
        """
        Copy the parameters to dest (a new TableParamsMongoCollection if None) and return it.
        """
        target = TableParamsMongoCollection() if dest is None else dest
        super().copy(dest=target)
        target.dbref_expand = self.dbref_expand
        return target

    @staticmethod
    def setup_cli_harvester_parser(parser: argparse.ArgumentParser = None) -> argparse.ArgumentParser:
        """
        Add the MongoDB-specific arguments to the parser set up by TableParams.
        """
        cli_parser = TableParams.setup_cli_harvester_parser(parser=parser)
        # the collection is normally specified in the File/URL attribute of builder
        cli_parser.add_argument("--collection", help="MongoDB collection name", type=str)
        # not declared here: a "--table" argument hidden from the help (would conflict)
        # the following option applies to the data cleaner
        cli_parser.add_argument("--dbref-expand", action="store_true", default=False,
                                help="Option to expand DBRefs")
        return cli_parser

    def initialize_from_cli_args(self, args: argparse.Namespace, base_dir: str = None, error_not_found: bool = True,
                                 default_proxies: dict = None, proxy_headers: dict = None) -> None:
        """
        Initialize the parameters from parsed command line arguments.
        The collection argument, if given, replaces the table name.
        """
        super().initialize_from_cli_args(args, base_dir=base_dir, error_not_found=error_not_found,
                                         default_proxies=default_proxies, proxy_headers=proxy_headers)
        self.dbref_expand = args.dbref_expand
        collection_name = args.collection
        if collection_name is not None:
            self.collection = collection_name
@@ -0,0 +1,20 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Package to enforce CKAN data policies.
5
+ """
6
+
7
+ POLICY_FILE_FORMAT_VERSION = "0.0.0" # version of the data format policy file format
8
+
9
+ from . import data_format_policy_defs
10
+ from . import data_format_policy_errors
11
+ from . import data_format_policy_abc
12
+ from . import data_format_policy_lists
13
+ from . import data_format_policy_tag_groups
14
+ from . import data_format_policy_custom_fields
15
+ from . import data_format_policy
16
+
17
+ # usage shortcuts
18
+ from ckanapi_harvesters.policies.data_format_policy import CkanPackageDataFormatPolicy
19
+
20
+