ebi-eva-common-pyutils 0.6.15__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ebi_eva_common_pyutils/__init__.py +0 -0
- ebi_eva_common_pyutils/assembly/__init__.py +1 -0
- ebi_eva_common_pyutils/assembly/assembly.py +69 -0
- ebi_eva_common_pyutils/assembly_utils.py +91 -0
- ebi_eva_common_pyutils/biosamples_communicators.py +186 -0
- ebi_eva_common_pyutils/command_utils.py +54 -0
- ebi_eva_common_pyutils/common_utils.py +30 -0
- ebi_eva_common_pyutils/config.py +152 -0
- ebi_eva_common_pyutils/contig_alias/__init__.py +0 -0
- ebi_eva_common_pyutils/contig_alias/contig_alias.py +115 -0
- ebi_eva_common_pyutils/ena_utils.py +35 -0
- ebi_eva_common_pyutils/file_utils.py +31 -0
- ebi_eva_common_pyutils/logger.py +150 -0
- ebi_eva_common_pyutils/ncbi_utils.py +117 -0
- ebi_eva_common_pyutils/network_utils.py +64 -0
- ebi_eva_common_pyutils/reference/__init__.py +2 -0
- ebi_eva_common_pyutils/reference/assembly.py +247 -0
- ebi_eva_common_pyutils/reference/sequence.py +101 -0
- ebi_eva_common_pyutils/taxonomy/__init__.py +0 -0
- ebi_eva_common_pyutils/taxonomy/taxonomy.py +60 -0
- ebi_eva_common_pyutils/variation/__init__.py +0 -0
- ebi_eva_common_pyutils/variation/contig_utils.py +113 -0
- ebi_eva_common_pyutils-0.6.15.data/scripts/archive_directory.py +114 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/LICENSE +201 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/METADATA +23 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/RECORD +39 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/WHEEL +5 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/top_level.txt +2 -0
- ebi_eva_internal_pyutils/__init__.py +0 -0
- ebi_eva_internal_pyutils/archive_directory.py +114 -0
- ebi_eva_internal_pyutils/config_utils.py +188 -0
- ebi_eva_internal_pyutils/metadata_utils.py +288 -0
- ebi_eva_internal_pyutils/mongo_utils.py +71 -0
- ebi_eva_internal_pyutils/mongodb/__init__.py +3 -0
- ebi_eva_internal_pyutils/mongodb/mongo_database.py +170 -0
- ebi_eva_internal_pyutils/nextflow/__init__.py +1 -0
- ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py +195 -0
- ebi_eva_internal_pyutils/pg_utils.py +107 -0
- ebi_eva_internal_pyutils/spring_properties.py +294 -0
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from ebi_eva_common_pyutils.reference.assembly import NCBIAssembly
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Copyright 2019 EMBL - European Bioinformatics Institute
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
from functools import lru_cache
|
|
16
|
+
|
|
17
|
+
from ebi_eva_common_pyutils.logger import logging_config
|
|
18
|
+
from ebi_eva_common_pyutils.network_utils import json_request
|
|
19
|
+
from ebi_eva_common_pyutils.taxonomy.taxonomy import get_normalized_scientific_name_from_ensembl
|
|
20
|
+
|
|
21
|
+
logger = logging_config.get_logger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_supported_asm_from_ensembl(tax_id: int) -> str:
    """
    Query the Ensembl REST API for the assembly accession associated with a taxonomy.

    The taxonomy is first converted to a species name with get_normalized_scientific_name_from_ensembl, and that
    name is used to call the ``info/assembly`` endpoint.

    :param tax_id: Taxonomy identifier of the species
    :return: The ``assembly_accession`` value of the response converted to a string, or None when the response
             does not contain that attribute
    """
    logger.info(f'Query Ensembl for species name using taxonomy {tax_id}')
    species_name = get_normalized_scientific_name_from_ensembl(tax_id)
    assembly_info_url = f"http://rest.ensembl.org/info/assembly/{species_name}?content-type=application/json"
    assembly_info = json_request(assembly_info_url)
    if 'assembly_accession' not in assembly_info:
        return None
    return str(assembly_info.get('assembly_accession'))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@lru_cache(maxsize=None)
def get_taxonomy_to_assembly_mapping_from_ensembl_rapid_release():
    """
    Returns a dict mapping taxonomy ID to assembly accession built from Ensembl Rapid Release's species metadata.

    When several assemblies are listed for the same taxonomy, the one kept is chosen with this order of preference:
      1. assemblies whose strain is not 'alternate haplotype' (case-insensitive),
      2. the most recent release date,
      3. the lexicographically last accession.
    An alternate haplotype assembly is therefore only returned when the taxonomy has no other assembly, and the
    outcome does not depend on the order of the records in the metadata file.

    The function takes no argument and is wrapped in lru_cache, so the metadata is requested once per process.

    :return: dict of taxonomy ID -> assembly accession
    """
    list_data = json_request('https://ftp.ensembl.org/pub/rapid-release/species_metadata.json')
    best_per_taxonomy = {}
    for asm_data in list_data:
        tax_id = asm_data['taxonomy_id']
        asm_accession = asm_data['assembly_accession']
        strain = asm_data['strain']
        release_date = datetime.strptime(asm_data['release_date'], '%Y-%m-%d')

        is_primary = not (strain and strain.lower() == 'alternate haplotype')
        # Tuples compare element by element, which implements the preference order documented above
        candidate = (is_primary, release_date, asm_accession)
        if tax_id not in best_per_taxonomy or candidate > best_per_taxonomy[tax_id]:
            best_per_taxonomy[tax_id] = candidate

    return {tax_id: best[2] for tax_id, best in best_per_taxonomy.items()}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def get_supported_asm_from_ensembl_rapid_release(tax_id: int) -> str:
    """
    Return the assembly accession that Ensembl Rapid Release lists for a taxonomy.

    :param tax_id: Taxonomy identifier used as key in the taxonomy-to-assembly mapping
    :return: The assembly accession, or None when the taxonomy is not in the mapping
    """
    # TODO: Replace with API call once supported
    taxonomy_to_assembly = get_taxonomy_to_assembly_mapping_from_ensembl_rapid_release()
    return taxonomy_to_assembly.get(tax_id)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Copyright 2020 EMBL - European Bioinformatics Institute
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import http
|
|
16
|
+
import requests
|
|
17
|
+
|
|
18
|
+
from ebi_eva_common_pyutils.assembly import NCBIAssembly
|
|
19
|
+
from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
|
|
20
|
+
from ebi_eva_common_pyutils.logger import logging_config as log_cfg
|
|
21
|
+
|
|
22
|
+
EUTILS_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
|
|
23
|
+
ESEARCH_URL = EUTILS_URL + 'esearch.fcgi'
|
|
24
|
+
ESUMMARY_URL = EUTILS_URL + 'esummary.fcgi'
|
|
25
|
+
EFETCH_URL = EUTILS_URL + 'efetch.fcgi'
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
logger = log_cfg.get_logger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def is_patch_assembly(assembly_accession: str) -> bool:
    """
    Check if a given assembly is a patch assembly.

    The assembly's XML record is downloaded from ENA and the assembly is reported as a patch assembly when the
    record has a 'count-patches' assembly attribute whose value is greater than zero.
    Please see: https://www.ncbi.nlm.nih.gov/grc/help/patches/

    :param assembly_accession: Accession appended to the ENA browser XML url
    :return: True if the first 'count-patches' value is above zero, False if it is not or if there is none
    """
    ena_record = download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{assembly_accession}')
    patch_counts = ena_record.xpath("//ASSEMBLY_ATTRIBUTE[TAG='count-patches']/VALUE")
    return len(patch_counts) > 0 and int(patch_counts[0].text) > 0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def retrieve_genbank_assembly_accessions_from_ncbi(assembly_txt, api_key=None):
    """
    Attempt to find any assembly GenBank accession based on a free text search.

    The text is searched (quoted) in NCBI's Assembly database with esearch, then the summary of every hit is
    retrieved with esummary and the 'genbank' synonym of each summary is collected.

    :param assembly_txt: Free text to search for
    :param api_key: Optional NCBI API key; when provided it is added to both requests
    :return: List of the distinct GenBank accessions found (can be empty). A warning is logged whenever the number
             of accessions found is not exactly one.
    """
    assembly_accessions = set()

    def build_payload(**specific):
        # Both E-utilities calls share the database, the return mode and the optional API key
        payload = {'db': 'Assembly', **specific, 'retmode': 'JSON'}
        if api_key:
            payload['api_key'] = api_key
        return payload

    data = requests.get(ESEARCH_URL, params=build_payload(term='"{}"'.format(assembly_txt))).json()
    assembly_id_list = (data or {}).get('esearchresult', {}).get('idlist')
    if assembly_id_list:
        summary_list = requests.get(ESUMMARY_URL, params=build_payload(id=','.join(assembly_id_list))).json()
        result = summary_list.get('result', {})
        for assembly_id in result.get('uids', []):
            # A uid without a record, or a record without a 'synonym' section, is skipped instead of raising
            synonym = (result.get(assembly_id) or {}).get('synonym') or {}
            if 'genbank' in synonym:
                assembly_accessions.add(synonym['genbank'])
    if len(assembly_accessions) != 1:
        logger.warning('%s Genbank synonyms found for assembly %s ', len(assembly_accessions), assembly_txt)
    return list(assembly_accessions)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def retrieve_genbank_equivalent_for_GCF_accession(assembly_accession, ncbi_api_key=None):
    """
    Return the single GenBank accession that NCBI associates with the given assembly accession.

    :param assembly_accession: Accession searched with retrieve_genbank_assembly_accessions_from_ncbi
    :param ncbi_api_key: Optional NCBI API key forwarded to the search
    :return: The GenBank accession
    :raises ValueError: When the search does not return exactly one GenBank accession
    """
    genbank_synonyms = retrieve_genbank_assembly_accessions_from_ncbi(assembly_accession, api_key=ncbi_api_key)
    synonym_count = len(genbank_synonyms)
    if synonym_count == 1:
        return genbank_synonyms.pop()
    raise ValueError('%s Genbank synonyms found for assembly %s ' % (synonym_count, assembly_accession))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def resolve_assembly_name_to_GCA_accession(assembly_name):
    """
    Resolve an assembly name to its versioned accession using the ENA portal API search.

    :param assembly_name: Assembly name searched as ``assembly_name="<name>"``
    :return: ``<accession>.<version>`` built from the single record returned by ENA
    :raises ValueError: When the HTTP status is not 200, or when the search returns zero or several records
    """
    from urllib.parse import quote

    # Percent-encode the name so that characters such as '&', '#' or '+' cannot alter the query string
    encoded_name = quote(str(assembly_name), safe='')
    ena_assembly_name_query_url = "https://www.ebi.ac.uk/ena/portal/api/search" \
                                  "?result=assembly&query=assembly_name%3D%22{0}%22&format=json".format(encoded_name)
    response = requests.get(ena_assembly_name_query_url)
    if response.status_code != http.HTTPStatus.OK.value:
        raise ValueError("Could not resolve assembly name {0} to a GCA accession!".format(assembly_name))

    # Parse the body once and reuse it
    response_json = response.json()
    if len(response_json) == 0:
        raise ValueError("Could not resolve assembly name {0} to a GCA accession!".format(assembly_name))
    if len(response_json) > 1:
        raise ValueError("Assembly name {0} resolved to more than one GCA accession!".format(assembly_name))
    assembly_record = response_json[0]
    return assembly_record["accession"] + "." + assembly_record["version"]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def get_assembly_report_url(assembly_accession):
    """
    Return the ``assembly_report_url`` attribute of the NCBIAssembly built for the given accession.

    :param assembly_accession: Accession passed to the NCBIAssembly constructor
    """
    ncbi_assembly = NCBIAssembly(assembly_accession)
    return ncbi_assembly.assembly_report_url
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# Copyright 2020 EMBL - European Bioinformatics Institute
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
|
|
18
|
+
import requests
|
|
19
|
+
from functools import cached_property
|
|
20
|
+
from ebi_eva_common_pyutils.logger import AppLogger
|
|
21
|
+
from retry import retry
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class HALNotReadyError(Exception):
    """
    Exception type dedicated to HAL communication.

    NOTE(review): nothing in the visible code raises it; presumably callers use it to signal that the API is not
    ready yet - confirm against the callers.
    """
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class HALCommunicator(AppLogger):
    """
    This class helps navigate through REST API that uses the HAL standard.

    It stores the authentication url, the root url of the API and the credentials, retrieves a token on first
    use and provides ``follows``/``follows_link`` to locate a link in a JSON document and query it.
    """
    # HTTP status codes accepted by _validate_response
    acceptable_code = [200, 201]

    def __init__(self, auth_url, bsd_url, username, password):
        """
        :param auth_url: URL queried to retrieve the authentication token
        :param bsd_url: Root URL of the API, queried by the ``root`` property
        :param username: User name sent to the authentication url
        :param password: Password sent to the authentication url
        """
        self.auth_url = auth_url
        self.bsd_url = bsd_url
        self.username = username
        self.password = password

    def _validate_response(self, response):
        """
        Check that the response has an acceptable code and raise if it does not.

        :param response: Response object returned by requests
        :return: The same response when its status code is acceptable
        :raises ValueError: When the status code is not in ``acceptable_code``; the request and the response are
                            reported through self.error first
        """
        if response.status_code not in self.acceptable_code:
            self.error(response.request.method + ': ' + response.request.url + " with " + str(response.request.body))
            self.error("headers: {}".format(response.request.headers))
            self.error("<{}>: {}".format(response.status_code, response.text))
            raise ValueError('The HTTP status code ({}) is not one of the acceptable codes ({})'.format(
                str(response.status_code), str(self.acceptable_code))
            )
        return response

    @cached_property
    def token(self):
        """Retrieve the token from the AAP REST API then cache it for further querying"""
        response = requests.get(self.auth_url, auth=(self.username, self.password))
        self._validate_response(response)
        return response.text

    @retry(exceptions=(ValueError, requests.RequestException), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
    def _req(self, method, url, **kwargs):
        """
        Private method that sends a request using the specified method. It adds the headers required by bsd.

        :param method: HTTP method name passed to requests.request
        :param url: URL to query
        :param kwargs: Any other keyword argument is passed to requests.request; ``headers`` is completed with the
                       Accept header, the Authorization header (when a token is available) and the Content-Type
                       header (when a ``json`` payload is provided)
        :return: The validated response
        :raises ValueError: When the status code is not acceptable (after the retries configured in the decorator)
        """
        # Work on a copy so the dictionary provided by the caller is never modified
        headers = dict(kwargs.pop('headers', None) or {})
        headers['Accept'] = 'application/hal+json'
        if self.token is not None:
            headers['Authorization'] = 'Bearer ' + self.token
        if 'json' in kwargs:
            headers['Content-Type'] = 'application/json'
        response = requests.request(
            method=method,
            url=url,
            headers=headers,
            **kwargs
        )
        self._validate_response(response)
        return response

    def follows(self, query, json_obj=None, method='GET', url_template_values=None, join_url=None, **kwargs):
        """
        Finds a link within the json_obj using a query string or list, modify the link using the
        url_template_values dictionary then query the link using the method and any additional keyword argument.
        If the json_obj is not specified then it will use the root query defined by the base url.

        :param query: Dot separated string or sequence of keys used to drill down into json_obj
        :param json_obj: JSON document in which the link is searched (defaults to ``self.root``)
        :param method: HTTP method used to query the link
        :param url_template_values: dict of template name -> value used to fill ``{name}`` or ``{name:...}``
                                    placeholders of the link
        :param join_url: Optional path appended to the link after a '/'
        :param kwargs: ``all_pages`` (bool) requests depagination, ``text_only`` (bool) requests the raw text of
                       the response; anything else is passed to the request
        :return: The text of the response when text_only is set, the JSON response otherwise
        :raises KeyError: When an element of the query is not found in the JSON document
        :raises ValueError: When the element found is not a string
        """
        # Both options are consumed here so that they are never forwarded to requests, whatever their value
        all_pages = kwargs.pop('all_pages', False)
        text_only = kwargs.pop('text_only', False)

        if json_obj is None:
            json_obj = self.root
        # Drill down into a dict using dot notation
        _json_obj = json_obj
        query_list = query.split('.') if isinstance(query, str) else query
        for query_element in query_list:
            if query_element not in _json_obj:
                raise KeyError('{} does not exist in json object'.format(query_element))
            _json_obj = _json_obj[query_element]
        if not isinstance(_json_obj, str):
            raise ValueError('The result of the query_string must be a string to use as a url')
        url = _json_obj
        # replace the template in the url with the value provided
        if url_template_values:
            for k, v in url_template_values.items():
                # The name is escaped and the optional ':...' part stops at the first '}' so that only this
                # placeholder is replaced. A function is used as replacement to insert the value literally.
                placeholder = r'\{(' + re.escape(k) + r')(:[^}]*)?\}'
                url = re.sub(placeholder, lambda _match, value=str(v): value, url)
        if join_url:
            url += '/' + join_url
        # Now query the url
        response = self._req(method, url, **kwargs)
        if text_only:
            return response.text

        json_response = response.json()
        # Depaginate the call if requested
        if all_pages is True:
            # This depagination code will iterate over all the pages available until the pages comes back without a
            # next page. It stores the embedded elements in the initial query's json response
            content = json_response
            while 'next' in (content.get('_links') or {}):
                content = self._req(method, content.get('_links').get('next').get('href'), **kwargs).json()
                for key, elements in (content.get('_embedded') or {}).items():
                    json_response.setdefault('_embedded', {}).setdefault(key, []).extend(elements)
            # Remove the pagination information as it is not relevant to the depaginated response
            json_response.pop('page', None)
            links = json_response.get('_links') or {}
            for relation in ('first', 'last', 'next'):
                links.pop(relation, None)
        return json_response

    def follows_link(self, key, json_obj=None, method='GET', url_template_values=None, join_url=None, **kwargs):
        """
        Same function as follows but construct the query_string from a single keyword surrounded by '_links' and 'href'.
        """
        return self.follows(('_links', key, 'href'),
                            json_obj=json_obj, method=method, url_template_values=url_template_values,
                            join_url=join_url, **kwargs)

    @cached_property
    def root(self):
        """JSON document returned by a GET on the root url of the API, cached after the first access"""
        return self._req('GET', self.bsd_url).json()

    @property
    def communicator_attributes(self):
        """Must be implemented by the subclasses; the base class raises NotImplementedError"""
        raise NotImplementedError
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class AAPHALCommunicator(HALCommunicator):
    """Class to navigate BioSamples API using AAP authentication."""

    def __init__(self, auth_url, bsd_url, username, password, domain=None):
        """
        :param auth_url: see HALCommunicator
        :param bsd_url: see HALCommunicator
        :param username: see HALCommunicator
        :param password: see HALCommunicator
        :param domain: Value exposed under 'domain' in communicator_attributes
        """
        super().__init__(auth_url, bsd_url, username, password)
        self.domain = domain

    @property
    def communicator_attributes(self):
        """dict containing the domain of this communicator"""
        return dict(domain=self.domain)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class WebinHALCommunicator(HALCommunicator):
    """Class to navigate BioSamples API using Webin authentication."""

    @cached_property
    def token(self):
        """Retrieve the token from the ENA Webin REST API then cache it for further querying"""
        credentials = {"authRealms": ["ENA"], "password": self.password, "username": self.username}
        auth_response = requests.post(self.auth_url, json=credentials)
        self._validate_response(auth_response)
        return auth_response.text

    @property
    def communicator_attributes(self):
        """dict containing the user name as Webin submission account id"""
        return dict(webinSubmissionAccountId=self.username)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class NoAuthHALCommunicator(HALCommunicator):
    """Class to navigate BioSamples API without authentication."""

    def __init__(self, bsd_url):
        """
        :param bsd_url: Root URL of the API; the authentication url and the credentials are all set to None
        """
        super().__init__(None, bsd_url, None, None)

    @cached_property
    def token(self):
        """No auth token, so errors will be raised if auth is required for requests"""
        return None
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Copyright 2020 EMBL - European Bioinformatics Institute
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import logging
|
|
15
|
+
import subprocess
|
|
16
|
+
|
|
17
|
+
from ebi_eva_common_pyutils.logger import logging_config as log_cfg
|
|
18
|
+
|
|
19
|
+
logger = log_cfg.get_logger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def run_command_with_output(command_description, command, return_process_output=False,
                            log_error_stream_to_output=False, stdout_log_level=logging.INFO,
                            stderr_log_level=logging.ERROR):
    """
    Run a shell command and log its output line by line.

    :param command_description: Human readable description used in the log messages
    :param command: Command line, executed through the shell
    :param return_process_output: When True the standard output (each line right-stripped and followed by a
                                  newline) is returned
    :param log_error_stream_to_output: When True the error stream is merged into the standard output
    :param stdout_log_level: Level used to log the standard output and the progress messages
    :param stderr_log_level: Level used to log the error stream, which is logged after the standard output
    :return: The standard output if return_process_output is True, None otherwise
    :raises subprocess.CalledProcessError: When the command exits with a non-zero return code
    """
    import threading

    logger.log(stdout_log_level, "Starting process: " + command_description)
    logger.log(stdout_log_level, "Running command: " + command)

    # Some lame utilities like mongodump and mongorestore output non-error messages to error stream
    # This is a workaround for that
    stderr = subprocess.STDOUT if log_error_stream_to_output else subprocess.PIPE

    output_lines = []
    error_lines = []

    # NOTE(security): shell=True lets the shell interpret the command string; it must not contain untrusted input
    with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=stderr, bufsize=1, universal_newlines=True,
                          shell=True) as process:
        stderr_reader = None
        if not log_error_stream_to_output:
            # The error stream is drained while the standard output is read: reading one pipe to the end before
            # touching the other blocks forever as soon as the process fills the buffer of the unread pipe.
            def drain_stderr():
                for error_line in iter(process.stderr.readline, ''):
                    error_lines.append(str(error_line).rstrip())

            stderr_reader = threading.Thread(target=drain_stderr, daemon=True)
            stderr_reader.start()

        for line in iter(process.stdout.readline, ''):
            line = str(line).rstrip()
            logger.log(stdout_log_level, line)
            if return_process_output:
                output_lines.append(line)

        if stderr_reader is not None:
            stderr_reader.join()
            # Logged once the standard output is finished, which keeps the order of the log messages
            for error_line in error_lines:
                logger.log(stderr_log_level, error_line)

    # Leaving the with block waits for the process, so the return code is available here
    if process.returncode != 0:
        logger.error(command_description + " failed! Refer to the error messages for details.")
        raise subprocess.CalledProcessError(process.returncode, process.args)
    logger.log(stdout_log_level, command_description + " - completed successfully")
    if return_process_output:
        return ''.join(line + "\n" for line in output_lines)
|
|
53
|
+
|
|
54
|
+
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Copyright 2020 EMBL - European Bioinformatics Institute
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def merge_two_dicts(x, y):
    """
    Return a copy of x updated with the content of y.

    Values of y take precedence for the keys present in both. Neither x nor y is modified.
    """
    merged = x.copy()
    merged.update(y)
    return merged
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def pretty_print(header, table):
    """
    Print a table with right-aligned columns separated by ' | '.

    Every cell is converted to its string representation before the width of the columns is computed and before it
    is formatted, so any type of value (None, list, ...) can be printed and the width always matches the text.

    :param header: Sequence of column titles
    :param table: Iterable of rows, each row being a sequence with one cell per column
    """
    header_cells = [str(title) for title in header]
    # Materialised once because the rows are visited twice (widths then printing)
    rows = [[str(cell) for cell in row] for row in table]

    cell_widths = [len(title) for title in header_cells]
    for row in rows:
        for i, cell in enumerate(row):
            cell_widths[i] = max(cell_widths[i], len(cell))

    format_string = ' | '.join('{%s:>%s}' % (i, w) for i, w in enumerate(cell_widths))
    print('| ' + format_string.format(*header_cells) + ' |')
    for row in rows:
        print('| ' + format_string.format(*row) + ' |')
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import yaml
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Configuration:
    """
    Configuration class that allow to load a yaml file either at construction or later in the execution.
    It can be used like a dict but should be used as readonly.
    """
    # Class-level defaults kept for backward compatibility. Each instance receives its own ``content`` in __init__
    config_file = None
    content = {}

    def __init__(self, *search_path):
        """
        :param search_path: Candidate paths of the config file; the first one that is an existing file is loaded.
                            When no path is given nothing is loaded and the configuration is empty.
        :raises FileNotFoundError: When paths are given but none of them is an existing file
        """
        # A dict per instance: the class-level dict would otherwise be shared by every instance without a file
        self.content = {}
        if search_path:
            self.load_config_file(*search_path)

    @staticmethod
    def _find_config_file(search_path):
        """Return the first element of search_path that is an existing file, or None if there is none"""
        for p in search_path:
            if p and os.path.isfile(p):
                return p

    def load_config_file(self, *search_path):
        """
        Load the first existing file of the search path as the content of the configuration.

        :param search_path: Candidate paths of the config file
        :raises FileNotFoundError: When none of the paths is an existing file
        """
        self.config_file = self._find_config_file(search_path)
        if self.config_file:
            with open(self.config_file, 'r') as f:
                # safe_load returns None for an empty file; keep the content usable as a dict
                self.content = yaml.safe_load(f) or {}
        else:
            raise FileNotFoundError('Could not find any config file in specified search path')

    def get(self, item, ret_default=None):
        """
        Dict-style item retrieval with default
        :param item: The key to search for
        :param ret_default: What to return if the key is not present
        """
        try:
            return self[item]
        except KeyError:
            return ret_default

    def query(self, *parts, ret_default=None):
        """
        Drill down into a config, e.g. cfg.query('logging', 'handlers', 'a_handler', 'level')
        :param parts: Successive keys to follow from the top level of the config
        :param ret_default: What to return if the item does not exist
        :return: The relevant item if it exists in the config, else ret_default. An item set to None is considered
                 missing, whereas other falsy values (0, False, empty string...) are returned as they are.
                 None is returned when no part is provided.
        """
        if not parts:
            return None

        item = self.content
        for p in parts:
            # Cannot drill down into something that is not a dict
            if not isinstance(item, dict):
                return ret_default
            item = item.get(p)
            if item is None:
                return ret_default
        return item

    def report(self):
        """Return the content of the configuration dumped as a YAML string in block style"""
        return yaml.safe_dump(self.content, default_flow_style=False)

    def __getitem__(self, item):
        """Allow dict-style access, e.g. config['this'] or config['this']['that']."""
        return self.content[item]

    def __contains__(self, item):
        """Allow search in the first layer of the config with 'in' operator."""
        return self.content.__contains__(item)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
cfg = Configuration()
# Provides a singleton that can be used as a central place for configuration.
# It is created without any search path, so no config file is loaded at import time.
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class WritableConfig(Configuration):
    """Configuration object that allows writes to the config file"""

    def __init__(self, *search_path, version=None):
        """
        :param search_path: Candidate paths of the config file. When none of them exists the first one becomes the
                            file that will be written.
        :param version: Optional version stored under 'version' when an empty config starts to be filled in
        """
        super().__init__(*search_path)
        # Make sure the writes go to a dict owned by this instance and never to a dict shared through the class
        if 'content' not in vars(self):
            self.content = {}
        self.version = version

    def load_config_file(self, *search_path):
        """
        Load the first existing file of the search path. Unlike the parent class, a missing file is not an error.

        :param search_path: Candidate paths of the config file
        """
        try:
            super().load_config_file(*search_path)
        except FileNotFoundError:
            # expected if it's the first time we are creating the config file
            # In that case the first search path is set to be the config files
            self.config_file = search_path[0] if search_path else None
        # An empty YAML file is loaded as None: keep the content writable
        if self.content is None:
            self.content = {}

    def backup(self):
        """
        Rename the config file by adding a '.1' at the end. If the '.1' file exists it is moved to '.2' and so on.
        Nothing is done when there is no config file.
        """
        if self.config_file and os.path.isfile(self.config_file):
            file_name = self.config_file
            # Find the first suffix that is not used yet
            suffix = 1
            backup_name = f'{file_name}.{suffix}'
            while os.path.exists(backup_name):
                suffix += 1
                backup_name = f'{file_name}.{suffix}'

            # Shift the existing backups starting from the oldest one to free the '.1' name
            for i in range(suffix, 1, -1):
                os.rename(f'{file_name}.{i - 1}', f'{file_name}.{i}')
            os.rename(file_name, file_name + '.1')

    def write(self):
        """
        Dump the content to the config file as YAML.
        Nothing is written when there is no config file, when the content is empty or when the directory of the
        config file does not exist.
        """
        if not (self.config_file and self.content):
            return
        # abspath so that a bare file name is resolved against the current directory instead of ''
        config_dir = os.path.dirname(os.path.abspath(self.config_file))
        if os.path.isdir(config_dir):
            with open(self.config_file, 'w') as open_config:
                yaml.safe_dump(self.content, open_config)

    def set(self, *path, value):
        """
        Set a value in the config, creating the intermediate dictionaries that do not exist,
        e.g. config.set('logging', 'level', value='INFO')

        :param path: Successive keys leading to the item to set
        :param value: Value to store
        """
        self._set_version()
        top_level = self.content
        for p in path[:-1]:
            top_level = top_level.setdefault(p, {})
        top_level[path[-1]] = value

    def pop(self, *path, default=None):
        """
        Recursive dictionary pop with default

        :param path: Successive keys leading to the item to remove
        :param default: What to return when the item, or any level leading to it, does not exist
        :return: The value removed or the default
        """
        top_level = self.content
        for p in path[:-1]:
            if not isinstance(top_level, dict) or p not in top_level:
                return default
            top_level = top_level[p]
        if not isinstance(top_level, dict):
            return default
        return top_level.pop(path[-1], default)

    def is_empty(self):
        """True when the config has no content"""
        return not self.content

    def clear(self):
        """Replace the content with an empty dictionary"""
        self.content = {}

    def _set_version(self):
        # If we're starting to fill in an empty config, set the version if available
        if self.is_empty() and self.version:
            self.content['version'] = self.version

    def __contains__(self, item):
        """Allow search in the first layer of the config with 'in' operator."""
        return item in self.content

    def __setitem__(self, item, value):
        """Allow dict-style write access, e.g. config['this']='that'."""
        self._set_version()
        self.content[item] = value
|
|
File without changes
|