ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
  103. ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,79 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Data format policy representation and enforcing
5
+ """
6
+ from typing import List, Tuple
7
+ from warnings import warn
8
+ from collections import OrderedDict
9
+
10
+ from ckanapi_harvesters.auxiliary.error_level_message import ErrorLevelMessage, ErrorLevel
11
+
12
+
13
class DataPolicyError(ErrorLevelMessage):
    """
    Message emitted when a data format policy rule is not respected.

    The full text passed to the parent class combines the context, the name of the
    error level and the policy-specific message.
    """
    def __init__(self, context:str, error_level:ErrorLevel, policy_message: str):
        """
        :param context: location where the policy was evaluated
        :param error_level: severity attached to the message
        :param policy_message: policy-specific part of the message
        """
        full_text = f"In {context} / Data format policy {error_level.name}: {policy_message}"
        super().__init__(error_level, full_text)
        self.context: str = context
        self.specific_message: str = policy_message

    def to_dict(self) -> dict:
        """
        Export the message as an ordered dictionary with keys level, context and message.
        """
        fields = OrderedDict()
        fields["level"] = str(self.error_level)
        fields["context"] = self.context
        fields["message"] = self.specific_message
        return fields
26
+
27
+
28
class UnsupportedPolicyVersionError(Exception):
    """
    Raised with a message stating that the version of a policy file is not supported.
    """
    def __init__(self, file_version):
        """
        :param file_version: version found in the policy file, inserted in the message
        """
        description = f"Version error: policy file version {file_version} is not supported"
        super().__init__(description)
31
+
32
class UrlPolicyLockedError(Exception):
    """
    Raised with a message stating that a URL is not accepted as a policy definition
    because the feature is locked.
    """
    def __init__(self, url):
        """
        :param url: rejected URL, inserted in the message
        """
        description = f"Url is not allowed a policy definition - feature locked (url: {url})"
        super().__init__(description)
35
+
36
def _policy_msg(msg:DataPolicyError, *, error_level:ErrorLevel, buffer:List[DataPolicyError], verbose:bool) -> None:
    """
    Dispatch a policy message according to its error level.

    When a buffer is given, the message is only appended to it. Otherwise:
    Information messages are printed if verbose is set, Warning messages are sent
    through warnings.warn and Error messages are raised.

    :param msg: message to dispatch
    :param error_level: level used to choose the action
    :param buffer: list collecting the messages, or None to act immediately
    :param verbose: enables the printing of Information messages
    """
    if buffer is not None:
        # collecting mode: nothing is printed, warned or raised
        buffer.append(msg)
        return
    if error_level == ErrorLevel.Information and verbose:
        print(str(msg))
        return
    if error_level == ErrorLevel.Warning:
        warn(str(msg))
        return
    if error_level == ErrorLevel.Error:
        raise msg
46
+
47
+
48
class ErrorCount:
    """
    Number of policy messages per error level (information, warning, error) for a list of messages.
    """
    def __init__(self, messages_list:List[DataPolicyError]):
        """
        :param messages_list: messages to count; the list is kept as an attribute
        """
        levels = [message.error_level for message in messages_list]
        self.messages_list:List[DataPolicyError] = messages_list
        self.information:int = sum(1 for level in levels if level == ErrorLevel.Information)
        self.warning:int = sum(1 for level in levels if level == ErrorLevel.Warning)
        self.error:int = sum(1 for level in levels if level == ErrorLevel.Error)
        self.total:int = len(messages_list)

    def error_count_message(self) -> str:
        """
        One-line summary of the counters, or "All tests passed" when there is no message at all.
        """
        if self.total != 0:
            return f"{self.error} errors, {self.warning} warnings, {self.information} messages"
        return "All tests passed"

    def __str__(self) -> str:
        summary = self.error_count_message()
        return "ErrorCount: " + summary

    def __add__(self, other):
        """
        Count over the concatenation of both message lists (a new object is returned).
        """
        merged_messages = self.messages_list + other.messages_list
        return ErrorCount(merged_messages)

    def to_tuple(self) -> Tuple[int, int, int]:
        """
        Counters in the order (errors, warnings, information).
        """
        return self.error, self.warning, self.information

    def to_dict(self) -> dict[str,int]:
        """
        Counters as an ordered dictionary with keys errors, warnings and information.
        """
        counters = OrderedDict()
        counters["errors"] = self.error
        counters["warnings"] = self.warning
        counters["information"] = self.information
        return counters
@@ -0,0 +1,234 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Data format policy representation and enforcing for lists of values such as tags
5
+ """
6
+ from typing import List, Any, Iterable, Union, Dict, Set
7
+ from abc import ABC, abstractmethod
8
+ from warnings import warn
9
+ import re
10
+
11
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element, assert_or_raise
12
+ from ckanapi_harvesters.auxiliary.ckan_defs import ckan_tags_sep
13
+ from ckanapi_harvesters.policies.data_format_policy_errors import DataPolicyError, ErrorLevel, _policy_msg
14
+ from ckanapi_harvesters.policies.data_format_policy_defs import ListChoiceMode
15
+ from ckanapi_harvesters.policies.data_format_policy_defs import StringValueSpecification
16
+ from ckanapi_harvesters.policies.data_format_policy_abc import DataPolicyElementABC
17
+
18
+
19
# Group name identifying the "extra values" group: ExtraValueListPolicy always uses it as
# its group_name and GroupedValueListPolicy compares group names against it.
extra_group_name = "extra"
20
+
21
+
22
class ValueListPolicy(DataPolicyElementABC):
    """
    Policy on a list of string values (such as tags) forming one named group.

    The policy holds the admitted values (list_specs) and a selection mode (value_select)
    constraining how many of these values may be present at the same time.
    """
    _group_type_str = "group"

    def __init__(self, list_specs:List[StringValueSpecification]=None, group_name:str=None,
                 value_select:ListChoiceMode=ListChoiceMode.Any,
                 mandatory:bool=False, error_level:ErrorLevel=ErrorLevel.Information):
        """
        :param list_specs: specifications of the admitted values (defaults to an empty list)
        :param group_name: name of the group, or None for an unnamed group
        :param value_select: constraint on the number of values selected within the group
        :param mandatory: forwarded to the parent class
        :param error_level: level of the messages emitted by this policy, forwarded to the parent class
        """
        super().__init__(mandatory=mandatory, error_level=error_level)
        if list_specs is None:
            list_specs = []
        self.list_specs:List[StringValueSpecification] = list_specs
        self.value_select: ListChoiceMode = value_select
        self.group_name: str = group_name

    def to_dict(self) -> dict:
        """
        Export the policy as a dictionary. The key group_name is only present for named groups.
        """
        d = {}
        if self.group_name:
            d["group_name"] = self.group_name
        d.update(super().to_dict())
        d.update({"values": [spec.to_dict() for spec in self.list_specs],
                  "value_select": self.value_select.name})
        return d

    def list_specs_str(self) -> List[str]:
        """
        :return: the admitted values as plain strings
        """
        return [value_spec.value for value_spec in self.list_specs]

    @staticmethod
    def from_dict(d:dict) -> "ValueListPolicy":
        """
        Build a policy from a dictionary such as the one produced by to_dict.
        """
        obj = ValueListPolicy()
        obj._load_from_dict(d)
        return obj

    def _load_from_dict(self, d:dict):
        super()._load_from_dict(d)
        # "group_name" is optional: to_dict omits it for unnamed groups, so a missing
        # or empty entry both mean "no name" instead of raising a KeyError
        self.group_name = d.get("group_name") or None
        self.list_specs = [StringValueSpecification.from_dict(value) for value in d["values"]]
        self.value_select = ListChoiceMode.from_str(d["value_select"]) if "value_select" in d else ListChoiceMode.Any

    def enforce(self, values: Union[str, List[str]], *, context:str=None,
                verbose: bool = True, buffer:List[DataPolicyError]=None) -> bool:
        """
        Check the values which belong to this group against the selection mode.

        :param values: list of values, or a single string split on ckan_tags_sep; None is treated as an empty list
        :param context: prefix of the message context; None is treated as an empty prefix
        :param verbose: forwarded to the message dispatcher
        :param buffer: when given, messages are appended to it instead of being printed/warned/raised
        :return: True if no rule was violated
        """
        if context is None:
            # the default value cannot be concatenated with the group description below
            context = ""
        if self.group_name is not None:
            context = context + " / " + self._group_type_str + " " + self.group_name
        success = True
        spec = [tag_spec.value for tag_spec in self.list_specs]
        if values is None:
            values = []
        elif isinstance(values, str):
            values = values.split(ckan_tags_sep)
        # only the values admitted by this group are considered; sorted so that messages are reproducible
        values = sorted(set(values).intersection(spec))
        msg = None
        value_context = context + " / value '" + ','.join(values) + "'"
        if self.value_select == ListChoiceMode.MaxOne and len(values) > 1:
            success = False
            msg = DataPolicyError(value_context, self.error_level, f"Too many values for value list group '{self.group_name}'. Max one value is admitted within {spec}.")
        elif self.value_select == ListChoiceMode.NoExtra and len(values) > 0:
            success = False
            msg = DataPolicyError(value_context, self.error_level, f"Too many values for value list group '{self.group_name}'. No values can be selected for this group ({spec}).")
        elif self.value_select == ListChoiceMode.MandatoryOne and len(values) != 1:
            success = False
            msg = DataPolicyError(value_context, self.error_level, f"Exactly one value must be present for value list group '{self.group_name}' ({spec}).")
        elif self.value_select == ListChoiceMode.MandatoryMulti and len(values) < 1:
            # violated when NO value of the group is present
            success = False
            msg = DataPolicyError(value_context, self.error_level, f"At least one value must be present for value list group '{self.group_name}' ({spec}).")
        if not success:
            _policy_msg(msg, error_level=self.error_level, buffer=buffer, verbose=verbose)
        if len(spec) > 0:
            # per-value check; _enforce_unit_string is not defined in this class (presumably inherited)
            for tag in values:
                success &= self._enforce_unit_string(tag, spec, context=context, verbose=verbose, buffer=buffer)
        return success
90
+
91
+
92
class ExtraValueListPolicy(ValueListPolicy):
    """
    Policy on the values which are not claimed by any other group.

    The group name is always extra_group_name. During enforcement, the values listed in
    extra_spec_rm (values admitted by the other groups) are removed before the checks.
    """
    def __init__(self, list_specs:List[StringValueSpecification]=None,
                 value_select:ListChoiceMode=ListChoiceMode.Any,
                 mandatory:bool=False, error_level:ErrorLevel=ErrorLevel.Information):
        """
        Same arguments as ValueListPolicy, except for the group name which is imposed.
        """
        super().__init__(list_specs=list_specs, group_name=extra_group_name,
                         value_select=value_select, mandatory=mandatory, error_level=error_level)

    @staticmethod
    def from_ValueListPolicy(value: ValueListPolicy) -> "ExtraValueListPolicy":
        """
        Convert a ValueListPolicy by copying its instance attributes
        (the attribute values themselves are shared, not duplicated).
        """
        obj = ExtraValueListPolicy()
        obj.__dict__.update(value.__dict__)
        return obj

    @staticmethod
    def from_dict(d:dict) -> "ExtraValueListPolicy":
        """
        Build a policy from a dictionary such as the one produced by to_dict.
        """
        obj = ExtraValueListPolicy()
        obj._load_from_dict(d)
        return obj

    def enforce(self, values: Union[str, List[str]], *, context:str=None,
                verbose: bool = True, buffer:List[DataPolicyError]=None, extra_spec_rm:Set[str]=None) -> bool:
        """
        Check the values remaining after removal of those admitted by other groups.

        :param values: list of values, or a single string split on ckan_tags_sep; None is treated as an empty list
        :param context: prefix of the message context; None is treated as an empty prefix
        :param verbose: forwarded to the message dispatcher
        :param buffer: when given, messages are appended to it instead of being printed/warned/raised
        :param extra_spec_rm: values to ignore because they belong to other groups; None means no value is ignored
        :return: True if no rule was violated
        """
        if context is None:
            # the default value cannot be concatenated with the group description below
            context = ""
        if extra_spec_rm is None:
            # the default value cannot be used in the set difference below
            extra_spec_rm = set()
        if self.group_name is not None:
            context = context + " / " + self._group_type_str + " " + self.group_name
        success = True
        spec = [tag_spec.value for tag_spec in self.list_specs]
        if values is None:
            values = []
        elif isinstance(values, str):
            values = values.split(ckan_tags_sep)
        # keep only the values unknown to the other groups; sorted so that messages are reproducible
        values = sorted(set(values) - set(extra_spec_rm))
        msg = None
        value_context = context + " / value '" + ','.join(values) + "'"
        if self.value_select == ListChoiceMode.MaxOne and len(values) > 1:
            success = False
            msg = DataPolicyError(value_context, self.error_level, f"Too many values for value list group '{self.group_name}'. Max one value is admitted within {spec}.")
        elif self.value_select == ListChoiceMode.NoExtra and len(values) > 0:
            success = False
            msg = DataPolicyError(value_context, self.error_level, f"Too many values for value list group '{self.group_name}'. No values can be selected for this group ({spec}). Admitted values from other groups: {extra_spec_rm}")
        elif self.value_select == ListChoiceMode.MandatoryOne and len(values) != 1:
            success = False
            msg = DataPolicyError(value_context, self.error_level, f"Exactly one value must be present for value list group '{self.group_name}' ({spec}).")
        elif self.value_select == ListChoiceMode.MandatoryMulti and len(values) < 1:
            # violated when NO extra value is present
            success = False
            msg = DataPolicyError(value_context, self.error_level, f"At least one value must be present for value list group '{self.group_name}' ({spec}).")
        if not success:
            _policy_msg(msg, error_level=self.error_level, buffer=buffer, verbose=verbose)
        if len(spec) > 0:
            # per-value check; _enforce_unit_string is not defined in this class (presumably inherited)
            for tag in values:
                success &= self._enforce_unit_string(tag, spec, context=context, verbose=verbose, buffer=buffer)
        return success
142
+
143
+
144
class GroupedValueListPolicy(DataPolicyElementABC):
    """
    Policy on a list of values organized in several groups, with an optional policy for
    the values which do not belong to any group (extra values).

    A group named like extra_group_name (case-insensitive) found in the list of groups is
    converted to the extra values policy and removed from the list.
    """
    def __init__(self, value_group_specs:List[ValueListPolicy]=None,
                 extra_values:ExtraValueListPolicy=None,
                 mandatory:bool=False, error_level:ErrorLevel=ErrorLevel.Information):
        """
        :param value_group_specs: policies of the groups (defaults to an empty list)
        :param extra_values: policy for the values outside of the groups, or None
        :param mandatory: forwarded to the parent class
        :param error_level: forwarded to the parent class
        """
        super().__init__(mandatory=mandatory, error_level=error_level)
        if value_group_specs is None:
            value_group_specs = []
        self.value_group_specs:List[ValueListPolicy] = value_group_specs
        self.extra_values_spec:ExtraValueListPolicy = extra_values
        self._extract_extra_values()

    @staticmethod
    def _is_extra_group(group_name) -> bool:
        # unnamed groups (group_name is None) are never the extra group
        return group_name is not None and group_name.lower() == extra_group_name.lower()

    def _extract_extra_values(self):
        """
        Move the group named like extra_group_name from the list of groups to extra_values_spec.
        At most one extra values policy is accepted (explicit argument or group in the list).
        """
        extra_values = self.extra_values_spec
        kept_groups = []
        for value_group_spec in self.value_group_specs:
            if self._is_extra_group(value_group_spec.group_name):
                assert extra_values is None, "Only one extra values policy can be defined"
                extra_values = ExtraValueListPolicy.from_ValueListPolicy(value_group_spec)
            else:
                kept_groups.append(value_group_spec)
        # in-place update, as the list object may be shared with the caller
        self.value_group_specs[:] = kept_groups
        if extra_values is not None:
            self.extra_values_spec:ExtraValueListPolicy = extra_values

    def to_dict(self) -> dict:
        """
        Export the policy as a dictionary; the extra values policy is appended to the
        list of groups under the name extra_group_name.
        """
        d = super().to_dict()
        if self.extra_values_spec is not None:
            self.extra_values_spec.group_name = extra_group_name
            extra_values_dict = [self.extra_values_spec.to_dict()]
        else:
            extra_values_dict = []
        d.update({"groups": [spec.to_dict() for spec in self.value_group_specs] + extra_values_dict})
        return d

    @staticmethod
    def from_dict(d:dict) -> "GroupedValueListPolicy":
        """
        Build a policy from a dictionary such as the one produced by to_dict.
        """
        obj = GroupedValueListPolicy()
        obj._load_from_dict(d)
        return obj

    def _load_from_dict(self, d:dict, child_cls:type=None):
        """
        :param d: dictionary holding the list of groups under the key "groups"
        :param child_cls: class used to load each group (ValueListPolicy by default)
        """
        super()._load_from_dict(d)
        if child_cls is None:
            child_cls = ValueListPolicy
        self.value_group_specs = [child_cls.from_dict(group_spec) for group_spec in d["groups"]]
        self.extra_values_spec = None
        self._extract_extra_values()

    def enforce(self, values: Union[str, List[str]], *, context:str=None, verbose: bool = True, buffer:List[DataPolicyError]=None) -> bool:
        """
        Enforce each group policy, then the extra values policy on the values which are
        not admitted by any group.

        :return: True if no rule was violated
        """
        success = True
        # values admitted by the regular groups are not considered as extra values
        extra_spec_rm = set()
        for value_group_spec in self.value_group_specs:
            if not self._is_extra_group(value_group_spec.group_name):
                extra_spec_rm.update(tag_spec.value for tag_spec in value_group_spec.list_specs)
        for value_group_spec in self.value_group_specs:
            success &= value_group_spec.enforce(values, context=context, verbose=verbose, buffer=buffer)
        if self.extra_values_spec is not None:
            self.extra_values_spec.group_name = extra_group_name
            success &= self.extra_values_spec.enforce(values, context=context, verbose=verbose, buffer=buffer, extra_spec_rm=extra_spec_rm)
        return success
204
+
205
+
206
class SingleValueListPolicy(DataPolicyElementABC):
    """
    Policy on a list of values with a single group (named "base") and an extra values policy.
    The work is delegated to an inner GroupedValueListPolicy stored in base_list.
    """
    def __init__(self, base_list:ValueListPolicy=None, extra_values:ListChoiceMode=ListChoiceMode.Any, mandatory:bool=False):
        """
        :param base_list: policy of the single group, or None for no group
        :param extra_values: selection mode applied to the extra values
        :param mandatory: forwarded to the parent class
        """
        super().__init__(mandatory=mandatory)
        extra_policy = ExtraValueListPolicy(value_select=extra_values)
        self.base_list: GroupedValueListPolicy = GroupedValueListPolicy(extra_values=extra_policy)
        self.update_base_list(base_list)

    def to_dict(self) -> dict:
        """
        Export the inner grouped policy as a dictionary.
        """
        exported = self.base_list.to_dict()
        return exported

    @staticmethod
    def from_dict(d:dict) -> "SingleValueListPolicy":
        """
        Build a policy from a dictionary such as the one produced by to_dict.
        """
        policy = SingleValueListPolicy()
        policy._load_from_dict(d)
        return policy

    def _load_from_dict(self, d:dict):
        # only the inner grouped policy is loaded; the loader of the parent class is not called
        self.base_list._load_from_dict(d)

    def update_base_list(self, base_list:ValueListPolicy):
        """
        Replace the single group. The given policy is renamed "base"; None removes the group.
        """
        if base_list is None:
            self.base_list.value_group_specs = []
            return
        base_list.group_name = "base"
        self.base_list.value_group_specs = [base_list]

    def enforce(self, values: Union[str, List[str]], *, context:str=None, verbose: bool = True, buffer:List[DataPolicyError]=None) -> bool:
        """
        Enforce the inner grouped policy and return its result.
        """
        return self.base_list.enforce(values, context=context, verbose=verbose, buffer=buffer)
@@ -0,0 +1,35 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Data format policy representation and enforcing for lists of tags grouped in vocabularies
5
+ """
6
+ from typing import List, Dict
7
+
8
+ from ckanapi_harvesters.auxiliary.error_level_message import ErrorLevel
9
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import assert_or_raise
10
+ from ckanapi_harvesters.auxiliary.ckan_errors import MandatoryAttributeError
11
+ from ckanapi_harvesters.policies.data_format_policy_lists import ValueListPolicy, GroupedValueListPolicy, ExtraValueListPolicy
12
+ from ckanapi_harvesters.policies.data_format_policy_defs import ListChoiceMode
13
+ from ckanapi_harvesters.policies.data_format_policy_defs import StringValueSpecification
14
+
15
# Regular expression matching one character which is NOT an ASCII letter, a digit,
# an underscore, a hyphen or a dot.
# NOTE(review): not used in the visible part of this module; judging by the name it is
# presumably meant for substituting characters not admitted in tags - confirm against callers.
tag_subs_re = r"[^a-zA-Z0-9_\-\.]"
16
+
17
+
18
class TagListPolicy(ValueListPolicy):
    """
    Value list policy applied to tags.
    """
    def get_tags_list_dict(self, vocabulary_id: str=None) -> List[Dict[str, str]]:
        """
        Generate the list of tag dictionaries used to initiate a vocabulary with the CKAN API.

        :param vocabulary_id: when given, it is added to each tag dictionary under the key "vocabulary_id"
        :return: one dictionary per value of the policy, with the value under the key "name"
        """
        tags_list_dict = []
        for tag_spec in self.list_specs:
            tag_entry = {"name": tag_spec.value}
            if vocabulary_id is not None:
                tag_entry["vocabulary_id"] = vocabulary_id
            tags_list_dict.append(tag_entry)
        return tags_list_dict
30
+
31
+
32
class TagGroupsListPolicy(GroupedValueListPolicy):
    """
    Grouped value list policy applied to tags; no behavior is added to the parent class.
    """
34
+
35
+
@@ -0,0 +1,11 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Package to make reports on the CKAN database.
5
+ """
6
+
7
+ from . import admin_report
8
+
9
+ # usage shortcuts
10
+
11
+
@@ -0,0 +1,292 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Detailed report on package resources: size, access rights and data format policy scores
5
+ """
6
+ from typing import List, Union, Dict
7
+ from collections import OrderedDict
8
+ import time
9
+ import datetime
10
+ import os
11
+ from warnings import warn
12
+
13
+ from ckanapi_harvesters.ckan_api import CkanApi
14
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import to_jsons_indent_lists_single_line
15
+ from ckanapi_harvesters.auxiliary.ckan_model import CkanVisibility, CkanUserInfo
16
+ from ckanapi_harvesters.policies.data_format_policy_errors import ErrorCount, DataPolicyError
17
+
18
+
19
def round_size(value_mb:float) -> float:
    """
    Round a size to two decimal places.

    :param value_mb: size to round (megabytes, according to the argument name)
    :return: rounded value
    """
    decimals = 2
    return round(value_mb, decimals)
21
+
22
+
23
+ class CkanAdminReport:
24
def __init__(self, package_list:List[str]=None, cancel_if_present:bool=True,
             package_custom_fields:List[str]=None, ckan:CkanApi=None, full_report:bool=False):
    """
    Configure the report and, if a CKAN connection is given, execute it immediately.

    :param package_list: names of the packages to report; a single string is wrapped in a list; None is kept as is
    :param cancel_if_present: stored option, passed to the CKAN requests when the report is executed
    :param package_custom_fields: custom fields of the packages to include in the report
        (e.g. an end of license date); defaults to an empty list
    :param ckan: when given, the method execute is called with it at the end of the initialization
    :param full_report: enables the policy messages and the group report
    """
    # inputs normalization
    custom_fields = [] if package_custom_fields is None else package_custom_fields
    selected_packages = [package_list] if isinstance(package_list, str) else package_list
    # options
    self.package_list: Union[List[str],None] = selected_packages
    self.cancel_if_present: bool = cancel_if_present
    self.include_package_custom_fields: List[str] = custom_fields
    self.include_resources_detail: bool = True
    self.include_policy_messages: bool = full_report
    self.include_group_report: bool = full_report
    self.date_format:Union[str,None] = '%d/%m/%Y %H:%M'
    # state filled during the execution
    self._connected_user: Union[CkanUserInfo, None] = None
    self.report_date: Union[datetime.datetime, None] = None
    self._elapsed_time_requests: Union[float,None] = None
    self._request_count: Union[int,None] = None
    self.report: Union[dict,None] = None  # report output
    if ckan is None:
        return
    self.execute(ckan)
44
+
45
+ def _date_format_str(self, date:datetime.datetime) -> str:
46
+ if self.date_format is not None:
47
+ return date.strftime(self.date_format)
48
+ else:
49
+ return date.isoformat()
50
+
51
def _perform_requests(self, ckan: CkanApi) -> None:
    """
    Query the CKAN server for the information used by the report and record the
    connected user, the report date, the elapsed time and the number of requests.

    :param ckan: CKAN connection used for the requests
    """
    reuse_present = self.cancel_if_present
    if not reuse_present:
        # start from a purged CKAN object, including its map
        ckan.purge(purge_map=True)
    time_start = time.time()
    self.report_date = datetime.datetime.now()
    counter_before = ckan.debug.ckan_request_counter
    current_user = ckan.query_current_user()
    self._connected_user = current_user
    if not current_user.sysadmin:
        warn(f"It is recommended to run the report with a user with sysadmin rights. Current user: {self._connected_user.name}")
    ckan.map_resources(self.package_list, datastore_info=True, only_missing=reuse_present)
    # the organizations are always requested again, with their users
    ckan.organization_list_all(cancel_if_present=False, include_users=True)
    ckan.license_list(cancel_if_present=reuse_present)
    ckan.map_file_resource_sizes(cancel_if_present=reuse_present)
    ckan.map_user_rights(cancel_if_present=reuse_present)
    self._elapsed_time_requests = time.time() - time_start
    self._request_count = ckan.debug.ckan_request_counter - counter_before
68
+
69
def _consolidate(self, ckan: CkanApi) -> None:
    """
    Rebuild, for each user of the CKAN map, the list of the names of the organizations
    the user is a member of.

    :param ckan: CKAN connection holding the map of users and organizations
    """
    users_map = ckan.map.users
    # reset the lists before filling them from the organizations
    for user in users_map.values():
        user.organizations = []
    for organization in ckan.map.organizations.values():
        organization_name = organization.name
        for member_id in organization.user_members.keys():
            ckan.map.users[member_id].organizations.append(organization_name)
75
+
76
+ def _create_report(self, ckan: CkanApi) -> None:
77
+ policy_messages: Dict[str, List[DataPolicyError]] = {}
78
+ ckan.policy_check(buffer=policy_messages)
79
+
80
+ report_header = OrderedDict([
81
+ ("title", "Admin report on packages and resources"),
82
+ ("date", self._date_format_str(self.report_date)),
83
+ ("ckan", ckan.url),
84
+ ("user", self._connected_user.name),
85
+ ("user_sysadmin", self._connected_user.sysadmin),
86
+ ("package_selection", self.package_list if self.package_list is not None else "All"),
87
+ ])
88
+ packages_report = {}
89
+ total_policy_errors = ErrorCount([])
90
+ total_filestore_size_mb = 0.
91
+ total_external_size_mb = 0.
92
+ total_datastore_size_mb = 0.
93
+ total_resource_count = 0
94
+ total_external_resource_count = 0
95
+ total_datastore_count = 0
96
+ total_datastore_lines = 0
97
+ global_last_modified_resources = None
98
+ global_last_modified_metadata = None
99
+ for package_id, package_info in ckan.map.packages.items():
100
+ package_name = package_info.name
101
+ package_data_format_messages = policy_messages.get(package_name, [])
102
+ data_format_policy_scores = ErrorCount(package_data_format_messages)
103
+ total_policy_errors += data_format_policy_scores
104
+ resources_report = []
105
+ last_modified_resource = None
106
+ last_modified_resource_metadata = None
107
+ package_resource_count = len(package_info.package_resources)
108
+ package_external_resource_count = 0
109
+ package_datastore_count = 0
110
+ package_filestore_size_mb = 0.
111
+ package_external_size_mb = 0.
112
+ package_datastore_size_mb = 0.
113
+ package_datastore_lines = 0
114
+ for resource_id, resource_info in package_info.package_resources.items():
115
+ resource_modified = resource_info.last_modified if resource_info.last_modified is not None else resource_info.created
116
+ internal_filestore = ckan.is_url_internal(resource_info.download_url)
117
+ resource_report = OrderedDict([
118
+ ("resource_name", resource_info.name),
119
+ ("id", resource_id),
120
+ ("state", str(resource_info.state)),
121
+ ("external_url", resource_info.download_url if resource_info.download_url and not internal_filestore else None),
122
+ ("filestore_size_mb", resource_info.download_size_mb if internal_filestore else None),
123
+ ("external_size_mb", resource_info.download_size_mb if not internal_filestore else None),
124
+ ("datastore_size_mb", 0),
125
+ ("datastore_active", resource_info.datastore_active),
126
+ ("datastore_lines", None),
127
+ ("date_modified", self._date_format_str(resource_modified) if resource_modified is not None else None),
128
+ ("metadata_modified", self._date_format_str(resource_info.metadata_modified) if resource_info.metadata_modified is not None else None),
129
+ ("datastore_aliases", None),
130
+ ])
131
+ if resource_modified is not None:
132
+ last_modified_resource = max(last_modified_resource, resource_modified) \
133
+ if last_modified_resource else resource_modified
134
+ global_last_modified_resources = max(global_last_modified_resources, resource_modified) \
135
+ if global_last_modified_resources else resource_modified
136
+ if resource_info.metadata_modified is not None:
137
+ last_modified_resource_metadata = max(last_modified_resource_metadata, resource_info.metadata_modified) \
138
+ if last_modified_resource_metadata else resource_info.metadata_modified
139
+ global_last_modified_metadata = max(global_last_modified_metadata, resource_modified) \
140
+ if global_last_modified_metadata else resource_modified
141
+ if resource_info.download_url:
142
+ if internal_filestore:
143
+ package_filestore_size_mb += resource_info.download_size_mb
144
+ else:
145
+ package_external_size_mb += resource_info.download_size_mb
146
+ package_external_resource_count += 1
147
+ if resource_info.datastore_info is not None:
148
+ datastore_size = round_size(resource_info.datastore_info.table_size_mb + resource_info.datastore_info.index_size_mb)
149
+ resource_report["datastore_aliases"] = resource_info.datastore_info.aliases
150
+ resource_report["datastore_size_mb"] = datastore_size
151
+ package_datastore_size_mb += datastore_size
152
+ resource_report["datastore_lines"] = resource_info.datastore_info.row_count
153
+ package_datastore_lines += resource_info.datastore_info.row_count
154
+ package_datastore_count += 1
155
+ resources_report.append(resource_report)
156
+ package_report = OrderedDict([
157
+ ("package_title", package_info.title),
158
+ ("state", str(package_info.state)),
159
+ ("organization", package_info.organization_info.name if package_info.organization_info else None),
160
+ ("version", package_info.version),
161
+ ("license", ckan.map.licenses[package_info.license_id].title if package_info.license_id else None),
162
+ ("license_domain", ckan.map.licenses[package_info.license_id].domain.to_dict() if package_info.license_id else None),
163
+ ("author", package_info.author),
164
+ ("maintainer", package_info.maintainer),
165
+ ("metadata_modified", self._date_format_str(package_info.metadata_modified)),
166
+ ("resources_modified", self._date_format_str(last_modified_resource) if last_modified_resource is not None else None),
167
+ ("resources_metadata_modified", self._date_format_str(last_modified_resource_metadata) if last_modified_resource_metadata is not None else None),
168
+ ("visibility", str(CkanVisibility.from_bool_is_private(package_info.private))),
169
+ ("filestore_total_size_mb", round_size(package_filestore_size_mb)),
170
+ ("external_total_size_mb", round_size(package_external_size_mb)),
171
+ ("datastore_total_size_mb", round_size(package_datastore_size_mb)),
172
+ ("datastore_total_lines", package_datastore_lines),
173
+ ("resource_count", package_resource_count),
174
+ ("among_resources_external", package_external_resource_count),
175
+ ("among_resources_datastore", package_datastore_count),
176
+ ("data_format_policy_scores", data_format_policy_scores.to_dict()),
177
+ ("tags", package_info.tags),
178
+ ])
179
+ for custom_field in self.include_package_custom_fields:
180
+ package_report[custom_field] = package_info.custom_fields.get(custom_field, None)
181
+ package_report["users"] = []
182
+ package_report["groups"] = []
183
+ if self.include_resources_detail:
184
+ package_report["resources"] = resources_report
185
+ if package_info.private:
186
+ users_dict = {ckan.map.users[user_id].name: collaboration.to_dict(ckan.map.users[user_id], ckan.map.groups, self.date_format)
187
+ for user_id, collaboration in package_info.user_access.items()}
188
+ package_report["users"] = OrderedDict(sorted(users_dict.items()))
189
+ else:
190
+ # TODO: do all users have write access if package is Public
191
+ package_report["users"] = "all (Public)"
192
+ package_report["groups"] = sorted([group_info.name for group_info in package_info.groups])
193
+ if self.include_policy_messages:
194
+ package_report["policy_messages"] = [message.to_dict() for message in package_data_format_messages]
195
+ total_filestore_size_mb += package_filestore_size_mb
196
+ total_external_size_mb += package_external_size_mb
197
+ total_datastore_size_mb += package_datastore_size_mb
198
+ total_resource_count += package_resource_count
199
+ total_external_resource_count += package_external_resource_count
200
+ total_datastore_count += package_datastore_count
201
+ total_datastore_lines += package_datastore_lines
202
+ global_last_modified_metadata = max(global_last_modified_metadata, package_info.metadata_modified) \
203
+ if global_last_modified_metadata else package_info.metadata_modified
204
+ packages_report[package_name] = package_report
205
+ packages_report = OrderedDict(sorted(packages_report.items()))
206
+ report_totals = OrderedDict([
207
+ ("total_filestore_size_mb", round_size(total_filestore_size_mb)),
208
+ ("total_datastore_size_mb", round_size(total_datastore_size_mb)),
209
+ ("total_external_size_mb", round_size(total_external_size_mb)),
210
+ ("total_datastore_lines", total_datastore_lines),
211
+ ("num_packages", len(packages_report)),
212
+ ("total_resource_count", total_resource_count),
213
+ ("among_resources_external", total_external_resource_count),
214
+ ("among_resources_datastore", total_datastore_count),
215
+ ("last_modified_data", self._date_format_str(global_last_modified_resources) if global_last_modified_resources else None),
216
+ ("last_modified_metadata", self._date_format_str(global_last_modified_metadata) if global_last_modified_metadata else None),
217
+ ("total_policy_errors", total_policy_errors.to_dict()),
218
+ ])
219
+ sysadmin_report = {user_info.name: OrderedDict([
220
+ ("fullname", user_info.fullname),
221
+ ("last_active", self._date_format_str(user_info.last_active) if user_info.last_active is not None else None),
222
+ ("organizations", user_info.organizations),
223
+ ]) for user_info in ckan.map.users.values() if user_info.sysadmin}
224
+ sysadmin_report = OrderedDict(sorted(sysadmin_report.items()))
225
+ users_report = {user_info.name: OrderedDict([
226
+ ("fullname", user_info.fullname),
227
+ ("last_active", self._date_format_str(user_info.last_active) if user_info.last_active is not None else None),
228
+ ("organizations", user_info.organizations),
229
+ ]) for user_info in ckan.map.users.values() if not user_info.sysadmin}
230
+ users_report = OrderedDict(sorted(users_report.items()))
231
+ groups_report = {group_info.name: OrderedDict([
232
+ ("group_title", group_info.title),
233
+ ("package_count", group_info.package_count),
234
+ ("users_count", len(group_info.user_members)),
235
+ ("users", OrderedDict(sorted({ckan.map.users[user_id].name: str(capacity) for user_id, capacity in group_info.user_members.items()}.items())) if group_info.user_members is not None else None),
236
+ ]) for group_info in ckan.map.groups.values()}
237
+ groups_report = OrderedDict(sorted(groups_report.items()))
238
+ report_footer = OrderedDict([
239
+ ("requests_count", self._request_count),
240
+ ("time_elapsed_seconds", self._elapsed_time_requests),
241
+ ])
242
+ report = OrderedDict([
243
+ ("header", report_header),
244
+ ("totals", report_totals),
245
+ ("packages", packages_report),
246
+ ("users", OrderedDict([
247
+ ("sysadmins", sysadmin_report),
248
+ ("other", users_report),
249
+ ])),
250
+ ])
251
+ if self.include_group_report:
252
+ report["groups"] = groups_report
253
+ report["footer"] = report_footer
254
+ self.report = report
255
+
256
def execute(self, ckan: CkanApi) -> dict:
    """
    Run the complete report pipeline and return the resulting report.

    The three stages are run in order, each receiving the same CKAN
    client: requests, consolidation, then report creation.

    :param ckan: CKAN API client handed to every stage
    :return: the report stored in ``self.report`` by the last stage
    """
    pipeline = (
        self._perform_requests,
        self._consolidate,
        self._create_report,
    )
    for stage in pipeline:
        stage(ckan)
    return self.report
261
+
262
def refresh_report(self, ckan: CkanApi) -> dict:
    """
    Rebuild the report without re-running the request or consolidation stages.

    Only ``_create_report`` is called, so the report is regenerated from
    whatever state the instance already holds.

    :param ckan: CKAN API client handed to ``_create_report``
    :return: the regenerated report, also available as ``self.report``
    """
    self._create_report(ckan)
    refreshed = self.report
    return refreshed
265
+
266
def to_jsons(self) -> str:
    """
    Serialize the current report to a JSON string.

    Formatting is delegated to ``to_jsons_indent_lists_single_line``
    (presumably indented output with lists kept on one line, judging by
    the helper's name - confirm against its definition).

    :return: JSON text for ``self.report``
    """
    report_text = to_jsons_indent_lists_single_line(self.report)
    return report_text
268
+
269
def to_json(self, file_path: str) -> None:
    """
    Write the report to a JSON file encoded as UTF-8.

    The report is serialized before the file is opened: opening in
    ``"w"`` mode truncates the target, so serializing first guarantees
    that a failure in ``to_jsons`` leaves an existing file untouched.

    :param file_path: destination path; overwritten if it already exists
    :raises OSError: if the file cannot be opened or written
    """
    report_text = self.to_jsons()
    with open(file_path, "w", encoding="utf8") as f:
        f.write(report_text)
272
+
273
+
274
if __name__ == '__main__':
    # Script entry point: build an admin report, print it, then save it as JSON.

    # Configure the CKAN client from the command line, prompting for anything missing.
    ckan = CkanApi()
    ckan.initialize_from_cli_args()
    ckan.input_missing_info(input_args_if_necessary=True, input_owner_org=True)

    # None (or omitting the argument) makes a full report.
    # To limit the report to the example package, use:
    #   package_list = ["builder-example-py"]
    package_list = None

    report = CkanAdminReport(ckan=ckan, package_list=package_list, full_report=True)
    print(report.to_jsons())

    # Resolve the output location relative to this file:
    # two directories up for the top directory, then a sibling "tests" directory above it.
    this_file = os.path.realpath(__file__)
    self_dir = os.path.abspath(os.path.dirname(this_file))
    top_dir = os.path.abspath(os.path.join(self_dir, "..", ".."))
    tests_dir = os.path.abspath(os.path.join(top_dir, "..", "tests"))

    # For a timestamped file name instead, use:
    #   out_file = os.path.join(tests_dir, f"admin_report_{report.report_date.strftime('%Y%m%dT%H%M')}.json")
    out_file = os.path.join(tests_dir, "admin_report.json")
    report.to_json(out_file)

    print(f"Done. Saved report to {out_file}")