ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ckanapi_harvesters/__init__.py +32 -10
- ckanapi_harvesters/auxiliary/__init__.py +26 -0
- ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
- ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
- ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
- ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
- ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
- ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
- ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
- ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
- ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
- ckanapi_harvesters/auxiliary/deprecated.py +82 -0
- ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
- ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
- ckanapi_harvesters/auxiliary/list_records.py +60 -0
- ckanapi_harvesters/auxiliary/login.py +163 -0
- ckanapi_harvesters/auxiliary/path.py +208 -0
- ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
- ckanapi_harvesters/auxiliary/urls.py +40 -0
- ckanapi_harvesters/builder/__init__.py +40 -0
- ckanapi_harvesters/builder/builder_aux.py +20 -0
- ckanapi_harvesters/builder/builder_ckan.py +238 -0
- ckanapi_harvesters/builder/builder_errors.py +36 -0
- ckanapi_harvesters/builder/builder_field.py +122 -0
- ckanapi_harvesters/builder/builder_package.py +9 -0
- ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
- ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
- ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
- ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
- ckanapi_harvesters/builder/builder_resource.py +589 -0
- ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
- ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
- ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
- ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
- ckanapi_harvesters/builder/builder_resource_init.py +126 -0
- ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
- ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
- ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
- ckanapi_harvesters/builder/example/__init__.py +21 -0
- ckanapi_harvesters/builder/example/builder_example.py +21 -0
- ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
- ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
- ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
- ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
- ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
- ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
- ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
- ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
- ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
- ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
- ckanapi_harvesters/builder/mapper_datastore.py +93 -0
- ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
- ckanapi_harvesters/builder/specific/__init__.py +11 -0
- ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
- ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
- ckanapi_harvesters/ckan_api/__init__.py +20 -0
- ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
- ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
- ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
- ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
- ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
- ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
- ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
- ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
- ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
- ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
- ckanapi_harvesters/harvesters/__init__.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
- ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
- ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
- ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
- ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
- ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
- ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
- ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
- ckanapi_harvesters/harvesters/harvester_init.py +30 -0
- ckanapi_harvesters/harvesters/harvester_model.py +49 -0
- ckanapi_harvesters/harvesters/harvester_params.py +323 -0
- ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
- ckanapi_harvesters/harvesters/postgre_params.py +86 -0
- ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
- ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
- ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
- ckanapi_harvesters/policies/__init__.py +20 -0
- ckanapi_harvesters/policies/data_format_policy.py +269 -0
- ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
- ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
- ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
- ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
- ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
- ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
- ckanapi_harvesters/reports/__init__.py +11 -0
- ckanapi_harvesters/reports/admin_report.py +292 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/METADATA +84 -38
- ckanapi_harvesters-0.0.3.dist-info/RECORD +105 -0
- ckanapi_harvesters/divider/__init__.py +0 -27
- ckanapi_harvesters/divider/divider.py +0 -53
- ckanapi_harvesters/divider/divider_error.py +0 -59
- ckanapi_harvesters/main.py +0 -30
- ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/WHEEL +0 -0
- {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Extensions of os.path and operations on urls
|
|
5
|
+
"""
|
|
6
|
+
from typing import Union, Set, List
|
|
7
|
+
from warnings import warn
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
disable_relative_path_constraint = False
|
|
13
|
+
|
|
14
|
+
def unlock_relative_path_constraint(value:bool=True) -> None:
    """
    Set the module-level flag disable_relative_path_constraint.

    resolve_rel_path reads this flag to decide whether an absolute path raises an error
    (flag False) or only emits a warning (flag True).

    :param value: new value of the flag. A warning is emitted when it is True.
    :return:
    """
    global disable_relative_path_constraint
    disable_relative_path_constraint = value
    if not value:
        return
    warn("Relative path constraint is disabled to your own risk.")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class BaseDirUndefError(Exception):
    """
    Raised when a relative path cannot be resolved because no base directory was given.
    """
    def __init__(self, path: str):
        message = f"Could not determine the file path because no base_dir was provided: {path}"
        super().__init__(message)
|
|
30
|
+
|
|
31
|
+
class AbsolutePathError(Exception):
    """
    Raised (or used as a warning text) when an absolute path is given where a relative one is expected.
    """
    def __init__(self, field:str, path: str):
        message = (f"A relative path is highly suggested for this field ({field}): {path}. "
                   f"To disable this error message, run unlock_relative_path_constraint().")
        super().__init__(message)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def sanitize_path(path:Union[str,None],
                  *, expand_path:bool=False, keyword_exceptions:Set[str]=None) -> Union[str,None]:
    """
    Sanitize paths from user inputs.

    Both '/' and '\\' are replaced by the path separator of the current platform.

    :param path: path string from user input (None is returned as-is)
    :param expand_path: if True, environment variables and a leading '~' are expanded
    :param keyword_exceptions: keywords returned lowercase and stripped instead of being treated as a path.
        Only evaluated when expand_path is True. None means no keyword.
    :return: sanitized path, keyword or None
    """
    if path is None:
        return None
    if keyword_exceptions is None:
        # the default value must not reach the membership test below
        keyword_exceptions = set()
    path = path.replace("\\", os.path.sep).replace("/", os.path.sep)
    if expand_path:
        path = os.path.expandvars(path)
        path_keyword = path.lower().strip()
        if path_keyword in keyword_exceptions:
            return path_keyword
        path = os.path.expanduser(path)  # cover the case where the path starts with '~'
    return path
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def path_rel_to_dir(path:Union[str,None], base_dir:str=None, *, keyword_exceptions:Set[str]=None,
                    error_base_dir_undef:bool=False, default_value:str=None,
                    only_relative:bool=False, abs_error:bool=False, field:str=None) -> Union[str,None]:
    """
    Returns the absolute path. If relative, the base directory can be specified. If not specified, the cwd is used.

    :param path: original path string
    :param base_dir: the base directory, for relative paths if provided (default = cwd)
    :param keyword_exceptions: some values are not replaced and must be treated after this function call.
    :param error_base_dir_undef: Option to raise an error if no base_dir was provided (cwd is used by default).
    :param default_value: the value to return if path is None.
    :param only_relative: If set to True, a warning or error message is raised if an absolute path is provided.
    :param abs_error: Condition to choose between a warning or an error message.
    :param field: name of the field for the error message.
    :return: absolute path or keyword
    :raises AbsolutePathError: absolute path given while only_relative and abs_error are True
    :raises BaseDirUndefError: relative path given without base_dir while error_base_dir_undef is True
    """
    if keyword_exceptions is None:
        keyword_exceptions = set()
    path_src = path  # untouched user input, used in the error message
    path = sanitize_path(path)
    if path is None:
        return default_value
    # replace environment variables mentioned in the path, if any.
    # Surrounding whitespace is removed before the '~' expansion and the absolute-path test,
    # so that every branch below works on the same string.
    path = os.path.expandvars(path).strip()
    path_keyword = path.lower()
    if path_keyword in keyword_exceptions:
        return path_keyword
    path = os.path.expanduser(path)  # cover the case where the path starts with '~'
    if os.path.isabs(path):
        if only_relative:
            msg = AbsolutePathError(field, path_src)
            if abs_error:
                raise msg
            warn(str(msg))
        return path
    if base_dir is not None:
        return os.path.join(base_dir, path)
    if error_base_dir_undef:
        raise BaseDirUndefError(path)
    return os.path.abspath(path)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def resolve_rel_path(base_dir:str, rel_path:str, *args: str, field:str, only_relative:bool=True) -> str:
    """
    Wrapper of path_rel_to_dir taking its arguments in the same order as os.path.join.

    An absolute path raises an error unless the module flag disable_relative_path_constraint is set
    (see unlock_relative_path_constraint), in which case only a warning is emitted.
    A missing base_dir always raises an error for relative paths.

    :param base_dir: base directory the relative path refers to
    :param rel_path: first element of the relative path
    :param args: further path elements, joined to rel_path with os.path.join
    :param field: name of the field for the error message.
    :param only_relative: passed to path_rel_to_dir
    :return: value returned by path_rel_to_dir
    """
    joined_path = os.path.join(rel_path, *args) if len(args) > 0 else rel_path
    raise_on_absolute = not disable_relative_path_constraint
    return path_rel_to_dir(joined_path, base_dir=base_dir, field=field,
                           only_relative=only_relative, abs_error=raise_on_absolute,
                           error_base_dir_undef=True)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# reverse function:
|
|
119
|
+
def make_path_relative(path:str, to_base_dir:str = None, *, default_value:str=None,
                       source_string:str=None, keyword_exceptions:Set[str]=None, same_destination:bool=True) -> str:
    """
    Express a path relative to a new base directory, e.g. when a document is saved to a new location.

    The path written in the original document (source_string) takes precedence in these cases:
    - same_destination is False and source_string is not absolute: source_string is returned untouched
    - source_string is one of keyword_exceptions: the keyword (lowercase, stripped) is returned
    - source_string starts with '~' or '$': the sanitized source_string is returned

    :param path: full file path (absolute, ideally output from path_rel_to_dir)
    :param to_base_dir: the new base directory, to derive the relative paths from
    :param default_value: the value to return if the path is None
    :param source_string: string representing the path in the original document, without any treatments
    :param keyword_exceptions: keywords to return as-is
    :param same_destination: see above
    :return: path relative to to_base_dir or keyword/path relative to environment variable/home directory symbol (~)
    """
    unresolved_prefixes = ('~', '$')
    if path is None:
        return default_value
    if source_string is not None:
        if not (same_destination or os.path.isabs(source_string)):
            return source_string
        exceptions = set() if keyword_exceptions is None else keyword_exceptions
        source_string = sanitize_path(source_string)
        source_keyword = source_string.lower().strip()
        if source_keyword in exceptions:
            return source_keyword
        if source_string.startswith(unresolved_prefixes):
            return source_string
    if path.startswith(unresolved_prefixes) or to_base_dir is None:
        return path
    return os.path.relpath(path, to_base_dir)
|
|
153
|
+
|
|
154
|
+
# File search
|
|
155
|
+
def list_files_scandir(path:str) -> List[str]:
    """
    List the files of a directory and of its sub-directories, without any filter (see also: glob.glob).

    :param path: directory to scan
    :return: paths of the files, as reported by os.scandir
    """
    found_files = []
    with os.scandir(path) as directory:
        for item in directory:
            if item.is_file():
                found_files.append(item.path)
                continue
            if item.is_dir():
                found_files.extend(list_files_scandir(item.path))
    return found_files
|
|
166
|
+
|
|
167
|
+
# characters treated as glob syntax and the regular expression matching a run of them
glob_chars = r"*?![]"
glob_re = "[\\*\\?\\!\\[\\]]+"

def glob_rm_glob(glob_str:str, *, default_rec_dir:str=None) -> str:
    """
    Extract directory name from a glob string (first elements of path without glob characters).

    Path elements are removed from the end with os.path.split until no glob character is left.

    :param glob_str: the glob string
    :param default_rec_dir: if the last removed element is "**" (directory recursion), the name of the directory to use instead
    :return: a path without glob characters

    Examples (written with the '/' separator; os.path.split uses the separator of the platform):
    >>> glob_rm_glob("test/*.csv")
    'test'

    >>> glob_rm_glob("**/*.csv", default_rec_dir="hello")
    'hello'
    """
    def contains_glob(text:str) -> bool:
        return re.search(glob_re, text) is not None

    remainder = glob_str
    while contains_glob(remainder):
        remainder, removed = os.path.split(remainder)
        if removed != "**" or default_rec_dir is None:
            continue
        if not contains_glob(remainder):
            return os.path.join(remainder, default_rec_dir)
    return remainder
|
|
191
|
+
|
|
192
|
+
def glob_name(glob_str:str):
    """
    Extract file name glob from a glob string (last element of path, except if it is "**")

    :param glob_str: the glob string
    :return: last path element, or an empty string if this element is "**"

    Example (written with the '/' separator; os.path.split uses the separator of the platform):
    >>> glob_name("**/*.csv")
    '*.csv'
    """
    last_element = os.path.split(glob_str)[1]
    return "" if last_element == "**" else last_element
|
|
208
|
+
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Setting the proxy from simple command line arguments
|
|
5
|
+
"""
|
|
6
|
+
import urllib.request
|
|
7
|
+
from typing import Union, Sequence, Tuple
|
|
8
|
+
import os
|
|
9
|
+
import argparse
|
|
10
|
+
from warnings import warn
|
|
11
|
+
import copy
|
|
12
|
+
import json
|
|
13
|
+
|
|
14
|
+
from ckanapi_harvesters.auxiliary.ckan_defs import environ_keyword
|
|
15
|
+
from ckanapi_harvesters.auxiliary.path import sanitize_path, path_rel_to_dir
|
|
16
|
+
from ckanapi_harvesters.auxiliary.login import Login
|
|
17
|
+
|
|
18
|
+
import requests
|
|
19
|
+
from requests.auth import AuthBase, HTTPProxyAuth, HTTPBasicAuth
|
|
20
|
+
|
|
21
|
+
PROXY_AUTH_ENVIRON = "PROXY_AUTH_FILE"
|
|
22
|
+
|
|
23
|
+
class HttpsProxyDefError(Exception):
    """
    Raised by ProxyConfig.from_cli_args when an HTTPS proxy is given without an HTTP proxy.
    """
    def __init__(self):
        message = "Only one of http_proxy or https_proxy is set"
        super().__init__(message)
|
|
26
|
+
|
|
27
|
+
def get_proxies_from_environ() -> dict:
    """
    Return the proxies found by urllib.request.getproxies.

    :return: dictionary of proxies as returned by urllib
    """
    return urllib.request.getproxies()
|
|
30
|
+
# http_proxy = os.environ.get("http_proxy")
|
|
31
|
+
# https_proxy = os.environ.get("https_proxy")
|
|
32
|
+
# no_proxy = os.environ.get("no_proxy")
|
|
33
|
+
# if http_proxy is not None and https_proxy is not None:
|
|
34
|
+
# proxies = {"http": http_proxy, "https": https_proxy}
|
|
35
|
+
# elif http_proxy is not None:
|
|
36
|
+
# proxies = {"http": http_proxy, "https": http_proxy}
|
|
37
|
+
# elif https_proxy is not None:
|
|
38
|
+
# raise HttpsProxyDefError()
|
|
39
|
+
# else:
|
|
40
|
+
# proxies = None
|
|
41
|
+
# if proxies is not None and no_proxy is not None:
|
|
42
|
+
# proxies["no"] = no_proxy
|
|
43
|
+
# return proxies
|
|
44
|
+
|
|
45
|
+
def host_port_sep(url:Union[str,None], *, default_port:int=None) -> Tuple[Union[str,None],Union[int,None]]:
    """
    Separate the port number from a host string.

    Everything which is not the port is kept in the returned host string, in its original position:
    scheme ("http://"), credentials ("user:password@") and query ("?option=value").

    :param url: host string, e.g. "host", "host:8080" or "user:password@host:8080?option=value"
    :param default_port: port returned if the string does not contain a port
    :return: host string without port, port number
    :raises ValueError: if the port is not an integer

    Examples:
    >>> host_port_sep("localhost:8080")
    ('localhost', 8080)

    >>> host_port_sep("user:pw@db.example:27017?ssl=true")
    ('user:pw@db.example?ssl=true', 27017)
    """
    if url is None:
        return None, None
    # the query is isolated first because it can contain any of the other separators
    location, query_sep, query = url.partition('?')
    host_suffix = query_sep + query
    # the last '@' ends the credentials (the password can contain ':')
    credentials, at_sep, host_port = location.rpartition('@')
    host_prefix = credentials + at_sep
    # the ':' of a scheme must not be mistaken for the port separator
    scheme, scheme_sep, host_port = host_port.rpartition('://')
    host_prefix += scheme + scheme_sep
    host, port_sep, port_str = host_port.rpartition(':')
    if not port_sep or port_str.endswith(']'):
        # no port (a ':' followed by ']' belongs to a bracketed IPv6 address)
        host, port = host_port, default_port
    else:
        port = int(port_str)
    return host_prefix + host + host_suffix, port
|
|
60
|
+
|
|
61
|
+
def _define_proxies(proxy_string:Union[str, dict], default_proxies:dict=None) -> dict:
|
|
62
|
+
if proxy_string is None:
|
|
63
|
+
proxies = None
|
|
64
|
+
elif isinstance(proxy_string, dict):
|
|
65
|
+
proxies = proxy_string
|
|
66
|
+
elif isinstance(proxy_string, str):
|
|
67
|
+
proxy_string = proxy_string.strip()
|
|
68
|
+
proxy_mode = proxy_string.lower()
|
|
69
|
+
if proxy_mode == environ_keyword:
|
|
70
|
+
proxies = get_proxies_from_environ()
|
|
71
|
+
if proxies is None:
|
|
72
|
+
proxies = default_proxies
|
|
73
|
+
elif proxy_mode == "unspecified":
|
|
74
|
+
proxies = None # do not specify the proxies - is equivalent to "environ"
|
|
75
|
+
elif proxy_mode == "noproxy":
|
|
76
|
+
proxies = {"http": "", "https": ""} # do not use any proxy
|
|
77
|
+
elif proxy_mode == "default":
|
|
78
|
+
proxies = default_proxies # default proxies, provided in argument
|
|
79
|
+
elif proxy_string.startswith('{'):
|
|
80
|
+
# proxy string is a string representation of proxy dictionary
|
|
81
|
+
proxies = json.loads(proxy_string)
|
|
82
|
+
else:
|
|
83
|
+
# suppose string contains an url to a proxy server
|
|
84
|
+
proxies = {"http": proxy_string, "https": proxy_string}
|
|
85
|
+
# if "http" not in proxy_string:
|
|
86
|
+
# # url without http
|
|
87
|
+
# proxies = {"http": f"http://{proxy_string}", "https": f"http://{proxy_string}"}
|
|
88
|
+
# else:
|
|
89
|
+
# proxies = {"http": proxy_string, "https": proxy_string}
|
|
90
|
+
else:
|
|
91
|
+
raise TypeError("proxy must be str or dict")
|
|
92
|
+
return proxies
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class ProxyConfig:
    """
    Proxy parameters: proxies dictionary, headers and authentication used to access the proxies.
    """
    def __init__(self, proxy_string:Union[str,dict]=None, default_proxies:dict=None,
                 proxy_headers:dict = None, proxy_auth:Union[AuthBase, Tuple[str,str]]=None) -> None:
        """
        :param proxy_string: string or proxies dict.
            If a string is provided, it must be an url to a proxy or one of the following values:
                - "environ": use the proxies specified in the environment variables "http_proxy" and "https_proxy"
                - "noproxy": do not use any proxies
                - "unspecified": do not specify the proxies
                - "default": use value provided by default_proxies
        :param default_proxies: proxies used if proxies="default"
        :param proxy_headers: headers used to access the proxies, generally for authentication
        :param proxy_auth: authentication object or (username, password) tuple used to access the proxies
        """
        if proxy_headers is None: proxy_headers = {}
        self._proxy_string:Union[str, dict, None] = None
        self._proxies:Union[dict,None] = None
        self._is_defined:bool = False
        self._default_proxies:Union[dict,None] = default_proxies
        self.proxy_headers: dict = proxy_headers
        self._proxy_auth: Union[AuthBase, Tuple[str,str], None] = proxy_auth
        self.proxy_auth_file: Union[str,None] = None   # file the authentication was loaded from, if any
        self.proxy_auth_from_env: bool = False   # True if the authentication file was designated by the environment variable
        self.proxy_string = proxy_string  # property: also computes self._proxies
        # self.load_proxy_auth_environ(error_not_found=False)  # recommended to base these parameters on user demand (confirm if there is a risk of leakage)

    def __str__(self):
        return str(self._proxies)

    def __copy__(self):
        return self.copy()

    def copy(self) -> "ProxyConfig":
        """
        :return: a new object with deep copies of the proxies, headers and authentication
        """
        dest = ProxyConfig()
        dest._proxies = copy.deepcopy(self._proxies)
        dest._proxy_string = copy.deepcopy(self._proxy_string)
        dest._default_proxies = copy.deepcopy(self._default_proxies)
        dest.proxy_headers = copy.deepcopy(self.proxy_headers)
        dest._proxy_auth = copy.deepcopy(self._proxy_auth)
        dest._is_defined = self._is_defined
        dest.proxy_auth_file = self.proxy_auth_file
        dest.proxy_auth_from_env = self.proxy_auth_from_env
        return dest

    @property
    def proxy_string(self) -> Union[str, dict, None]:
        return self._proxy_string
    @proxy_string.setter
    def proxy_string(self, proxy_string:Union[str, dict, None]):
        # the proxies dictionary is derived from the string each time the string is set
        self._proxy_string = proxy_string
        self._is_defined = proxy_string is not None
        self._proxies = _define_proxies(proxy_string, default_proxies=self._default_proxies)
    @property
    def proxies(self) -> dict:
        return self._proxies
    @proxies.setter
    def proxies(self, proxies:dict):
        self._proxy_string = proxies
        self._proxies = proxies
        self._is_defined = True
    @property
    def proxy_auth(self) -> Union[AuthBase, Tuple[str,str]]:
        return self._proxy_auth
    @proxy_auth.setter
    def proxy_auth(self, proxy_auth:Union[AuthBase, Tuple[str,str]]):
        # an authentication set directly does not originate from a file
        self._proxy_auth = proxy_auth
        self.proxy_auth_file = None
        self.proxy_auth_from_env = False

    def get_host_port(self) -> Tuple[Union[str,None],Union[int,None]]:
        """
        :return: host and port of the entry "http_proxy" of the proxies dictionary, (None, None) if absent
        """
        # special mode
        # NOTE(review): the dictionaries built in this file use the keys "http"/"https";
        # only a dictionary provided by the caller can contain "http_proxy" - confirm this is intended
        if self._proxies is None:
            return None, None
        elif "http_proxy" in self._proxies.keys() and self._proxies["http_proxy"] is not None:
            return host_port_sep(self._proxies["http_proxy"])
        else:
            return None, None

    def get_proxy_login(self) -> Login:
        """
        :return: username and password of the proxy authentication (empty Login if no authentication is defined)
        """
        if self._proxy_auth is None:
            return Login()
        if isinstance(self._proxy_auth, tuple):
            # (username, password) form allowed by the proxy_auth argument
            username, password = self._proxy_auth
            return Login(username, password)
        assert(isinstance(self._proxy_auth, HTTPBasicAuth))  # HTTPProxyAuth is a subclass of HTTPBasicAuth
        return Login(self._proxy_auth.username, self._proxy_auth.password)

    @staticmethod
    def from_str_or_config(proxies:Union[str,dict, "ProxyConfig"],
                           *, default_proxies:dict=None, proxy_headers:dict=None) -> "ProxyConfig":
        """
        :param proxies: proxy string, proxies dictionary, existing ProxyConfig or None
        :param default_proxies: proxies used if proxies="default" (ignored if a ProxyConfig is given)
        :param proxy_headers: headers used to access the proxies.
            If a ProxyConfig is given and proxy_headers is not None, the headers of the given object are replaced.
        :return: the given ProxyConfig object or a new one
        """
        if proxies is None:
            return ProxyConfig(None, default_proxies=default_proxies, proxy_headers=proxy_headers)
        elif isinstance(proxies, ProxyConfig):
            if proxy_headers is not None:
                proxies.proxy_headers = proxy_headers
            return proxies
        else:
            return ProxyConfig(proxies, default_proxies=default_proxies, proxy_headers=proxy_headers)

    def replace_default_proxy(self, default_proxies:dict) -> None:
        """
        Use default_proxies as proxies if the proxy string is the keyword "default".

        :param default_proxies: proxies dictionary
        """
        # the proxy string can also hold a proxies dictionary, which is never the keyword
        if isinstance(self._proxy_string, str) and self._proxy_string.strip().lower() == "default":
            self._proxies = default_proxies

    def reset(self) -> None:
        self._proxy_string = None
        self._proxies = None
        self._is_defined = False

    def is_defined(self) -> bool:
        return self._is_defined

    def load_proxy_auth_environ(self, *, error_not_found:bool=False) -> bool:
        """
        Load the proxy authentication from the file designated by the environment variable PROXY_AUTH_FILE.

        :param error_not_found: passed to load_proxy_auth_from_file
        :return: True if the authentication file was loaded
        :raises ValueError: if the environment variable contains the environ keyword
        """
        proxy_auth_file = sanitize_path(os.environ.get(PROXY_AUTH_ENVIRON))  # "PROXY_AUTH_FILE"
        if proxy_auth_file is not None:
            proxy_keyword = proxy_auth_file.strip().lower()
            if proxy_keyword == environ_keyword:
                # this value would create an infinite loop (explicit error: asserts are removed with python -O)
                raise ValueError(f"Environment variable {PROXY_AUTH_ENVIRON} must designate a file, not the keyword {environ_keyword}")
            if self.load_proxy_auth_from_file(proxy_auth_file, error_not_found=error_not_found):
                self.proxy_auth_from_env = True
                return True
        return False

    def load_proxy_auth_from_file(self, file_path:str, *, base_dir:str=None, error_not_found:bool=True) -> bool:
        """
        Load the proxy authentication from a text file with 3 lines: authentication method, username, password.

        :param file_path: path of the file or the environ keyword (see load_proxy_auth_environ)
        :param base_dir: base directory, if file_path is relative
        :param error_not_found: if False, a missing file produces a warning and False is returned
            instead of the error raised when opening the file
        :return: True if the file was loaded
        :raises KeyError: if the authentication method is not known
        """
        file_path = path_rel_to_dir(file_path, base_dir=base_dir, keyword_exceptions={environ_keyword})
        proxy_keyword = file_path.strip().lower()
        if proxy_keyword == environ_keyword:
            # this keyword is not very useful if proxy authentication file is loaded from environment anyway
            return self.load_proxy_auth_environ(error_not_found=error_not_found)
        if (not error_not_found) and (not os.path.isfile(file_path)):
            msg = f"Proxy authentication file does not exist: {file_path}"
            warn(msg)
            return False
        self.proxy_auth_file = file_path
        self.proxy_auth_from_env = False
        with open(file_path, "r") as f:
            auth_type = f.readline().strip().lower()
            username = f.readline().strip()
            password = f.readline().strip()
        if auth_type == "basic" or auth_type == "httpbasicauth":
            self._proxy_auth = requests.auth.HTTPBasicAuth(username, password)
        elif auth_type == "proxy" or auth_type == "httpproxyauth":
            self._proxy_auth = requests.auth.HTTPProxyAuth(username, password)
        elif auth_type == "digest" or auth_type == "httpdigestauth":
            self._proxy_auth = requests.auth.HTTPDigestAuth(username, password)
        elif auth_type == "none":
            self._proxy_auth = None
        else:
            raise KeyError(f"Unknown auth type {auth_type}")
        return True

    @staticmethod
    def _setup_cli_proxy_parser(parser: argparse.ArgumentParser = None) -> argparse.ArgumentParser:
        """
        Define or add CLI arguments to initialize the proxy.

        Arguments added: --proxy, --http-proxy, --https-proxy, --no-proxy, --proxy-auth-file

        :param parser: option to provide an existing parser to add the proxy arguments to
        :return: the parser
        """
        if parser is None:
            parser = argparse.ArgumentParser(description="Proxy parameters initialization")
        parser.add_argument("--proxy", type=str,
                            help="Proxy for HTTP and HTTPS")
        parser.add_argument("--http-proxy", type=str,
                            help="HTTP proxy")
        parser.add_argument("--https-proxy", type=str,
                            help="HTTPS proxy")
        parser.add_argument("--no-proxy", type=str,
                            help="Proxy exceptions")
        parser.add_argument("--proxy-auth-file", type=str,
                            help="Path to a proxy authentication file with 3 lines (authentication method, username, password)")
        return parser

    @staticmethod
    def from_cli_args(args: argparse.Namespace, *, base_dir:str=None, error_not_found:bool=True,
                      default_proxies:dict=None, proxy_headers:dict=None) -> "ProxyConfig":
        """
        Initialize the proxy parameters from the CLI arguments defined by _setup_cli_proxy_parser.

        --proxy has priority over --http-proxy/--https-proxy. The HTTP proxy is also used for HTTPS
        if no HTTPS proxy is given. --no-proxy is added to the proxies built from --http-proxy/--https-proxy.

        :param args: parsed arguments
        :param base_dir: base directory, if the path of the authentication file is relative
        :param error_not_found: passed to load_proxy_auth_from_file
        :param default_proxies: proxies used if the proxy argument is "default"
        :param proxy_headers: headers used to access the proxies
        :return: the proxy parameters or None if no proxy argument was given
        :raises HttpsProxyDefError: if an HTTPS proxy is given without an HTTP proxy
        :raises Exception: if an authentication file is given without a proxy
        """
        proxy_string, proxies = None, None
        if args.proxy is not None:
            proxy_string = args.proxy
        elif args.http_proxy is not None:
            https_proxy = args.https_proxy if args.https_proxy is not None else args.http_proxy
            proxies = {"http": args.http_proxy, "https": https_proxy}
            if args.no_proxy is not None:
                proxies["no"] = args.no_proxy
        elif args.https_proxy is not None:
            raise HttpsProxyDefError()
        if proxy_string is not None:
            proxy_config = ProxyConfig(proxy_string, default_proxies=default_proxies, proxy_headers=proxy_headers)
        elif proxies is not None:
            proxy_config = ProxyConfig(proxies, default_proxies=default_proxies, proxy_headers=proxy_headers)
        else:
            proxy_config = None
        if args.proxy_auth_file is not None:
            if proxy_config is not None:
                proxy_config.load_proxy_auth_from_file(args.proxy_auth_file, base_dir=base_dir, error_not_found=error_not_found)
            else:
                raise Exception(f"Proxy authentication file specified without proxy specification: {args.proxy_auth_file}")
        return proxy_config
|
|
298
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Operations on urls
|
|
5
|
+
"""
|
|
6
|
+
import urllib.parse
|
|
7
|
+
|
|
8
|
+
from ckanapi_harvesters.auxiliary.login import Login
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
urlsep = '/'
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def is_valid_url(url:str) -> bool:
    """
    Test if a string can be parsed as an url with a scheme, a network location and a path.

    An url without path (e.g. "https://example.com") is reported as invalid.

    :param url: string to test
    :return: True if the three components are present, False otherwise or if the url cannot be parsed
    """
    try:
        components = urllib.parse.urlparse(url)
    except ValueError:
        return False
    return bool(components.scheme and components.netloc and components.path)
|
|
20
|
+
|
|
21
|
+
def url_join(base:str, *args:str) -> str:
    """
    Append elements to an url, inserting the url separator where the url does not already end with it.

    Empty elements are ignored. The elements themselves are not modified.

    :param base: first part of the url
    :param args: elements to append
    :return: the joined url
    """
    joined = base
    for element in args:
        if len(element) == 0:
            continue
        separator = "" if joined.endswith(urlsep) else urlsep
        joined = joined + separator + element
    return joined
|
|
29
|
+
|
|
30
|
+
def url_insert_login(url:str, login:"Login"):
    """
    Insert user authentication parameters in a url

    The username and password are percent-encoded, so that characters such as '@', ':' or '/'
    do not alter the structure of the url.

    :param url: url without authentication parameters
    :param login: object with the attributes username and password. If None, the url is returned as-is.
    :return: url in the form scheme://username:password@host...
    """
    if login is None:
        return url
    parsed_url = urllib.parse.urlparse(url)
    username = urllib.parse.quote(str(login.username), safe="")
    password = urllib.parse.quote(str(login.password), safe="")
    netloc_with_auth = f"{username}:{password}@{parsed_url.netloc}"
    updated_url = parsed_url._replace(netloc=netloc_with_auth)
    return urllib.parse.urlunparse(updated_url)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Section of the package dedicated to the initialization of a CKAN package
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
BUILDER_FILE_FORMAT_VERSION = "0.0.0" # version of the Excel & JSON file format
|
|
8
|
+
|
|
9
|
+
from . import builder_aux
|
|
10
|
+
from . import builder_errors
|
|
11
|
+
from . import builder_field
|
|
12
|
+
from . import builder_resource
|
|
13
|
+
from . import builder_resource_multi_abc
|
|
14
|
+
from . import builder_resource_multi_file
|
|
15
|
+
from . import mapper_datastore
|
|
16
|
+
from . import builder_resource_datastore
|
|
17
|
+
from . import builder_resource_multi_datastore
|
|
18
|
+
from . import builder_resource_datastore_url
|
|
19
|
+
from . import builder_resource_datastore_unmanaged
|
|
20
|
+
from . import mapper_datastore_multi
|
|
21
|
+
from . import builder_resource_datastore_multi_abc
|
|
22
|
+
from . import builder_resource_datastore_multi_folder
|
|
23
|
+
from . import builder_resource_datastore_multi_harvester
|
|
24
|
+
from . import builder_resource_init
|
|
25
|
+
from . import builder_ckan
|
|
26
|
+
from . import builder_package_1_basic
|
|
27
|
+
from . import builder_package_2_harvesters
|
|
28
|
+
from . import builder_package_3_multi_threaded
|
|
29
|
+
from . import builder_package
|
|
30
|
+
|
|
31
|
+
from . import specific
|
|
32
|
+
from . import example
|
|
33
|
+
|
|
34
|
+
# usage shortcuts
|
|
35
|
+
from .builder_package import BuilderPackage
|
|
36
|
+
from .mapper_datastore_multi import RequestFileMapperIndexKeys
|
|
37
|
+
from .builder_resource_datastore_multi_abc import BuilderDataStoreMultiABC
|
|
38
|
+
from .builder_resource_datastore_multi_folder import BuilderDataStoreFolder
|
|
39
|
+
|
|
40
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Auxiliary functions
|
|
5
|
+
"""
|
|
6
|
+
from typing import List, Union
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
def positive_end_index(end_index:Union[int,None], total:int) -> int:
    """
    Convert the end index of a slice to a non-negative stop index for a loop (last index treated = result-1).

    - None: the loop runs until the end (total is returned)
    - negative value: counted from the end, e.g. -1 stops just before the last element. The result is not lower than 0.
    - other values are returned unchanged

    :param end_index: end index, following the pythonic definition for slices
    :param total: number of elements
    :return: stop index
    """
    if end_index is None:
        return total
    if end_index >= 0:
        return end_index
    counted_from_end = total + end_index
    return counted_from_end if counted_from_end > 0 else 0
|
|
20
|
+
|