ebi-eva-common-pyutils 0.6.15__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ebi_eva_common_pyutils/__init__.py +0 -0
- ebi_eva_common_pyutils/assembly/__init__.py +1 -0
- ebi_eva_common_pyutils/assembly/assembly.py +69 -0
- ebi_eva_common_pyutils/assembly_utils.py +91 -0
- ebi_eva_common_pyutils/biosamples_communicators.py +186 -0
- ebi_eva_common_pyutils/command_utils.py +54 -0
- ebi_eva_common_pyutils/common_utils.py +30 -0
- ebi_eva_common_pyutils/config.py +152 -0
- ebi_eva_common_pyutils/contig_alias/__init__.py +0 -0
- ebi_eva_common_pyutils/contig_alias/contig_alias.py +115 -0
- ebi_eva_common_pyutils/ena_utils.py +35 -0
- ebi_eva_common_pyutils/file_utils.py +31 -0
- ebi_eva_common_pyutils/logger.py +150 -0
- ebi_eva_common_pyutils/ncbi_utils.py +117 -0
- ebi_eva_common_pyutils/network_utils.py +64 -0
- ebi_eva_common_pyutils/reference/__init__.py +2 -0
- ebi_eva_common_pyutils/reference/assembly.py +247 -0
- ebi_eva_common_pyutils/reference/sequence.py +101 -0
- ebi_eva_common_pyutils/taxonomy/__init__.py +0 -0
- ebi_eva_common_pyutils/taxonomy/taxonomy.py +60 -0
- ebi_eva_common_pyutils/variation/__init__.py +0 -0
- ebi_eva_common_pyutils/variation/contig_utils.py +113 -0
- ebi_eva_common_pyutils-0.6.15.data/scripts/archive_directory.py +114 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/LICENSE +201 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/METADATA +23 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/RECORD +39 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/WHEEL +5 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/top_level.txt +2 -0
- ebi_eva_internal_pyutils/__init__.py +0 -0
- ebi_eva_internal_pyutils/archive_directory.py +114 -0
- ebi_eva_internal_pyutils/config_utils.py +188 -0
- ebi_eva_internal_pyutils/metadata_utils.py +288 -0
- ebi_eva_internal_pyutils/mongo_utils.py +71 -0
- ebi_eva_internal_pyutils/mongodb/__init__.py +3 -0
- ebi_eva_internal_pyutils/mongodb/mongo_database.py +170 -0
- ebi_eva_internal_pyutils/nextflow/__init__.py +1 -0
- ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py +195 -0
- ebi_eva_internal_pyutils/pg_utils.py +107 -0
- ebi_eva_internal_pyutils/spring_properties.py +294 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Copyright 2022 EMBL - European Bioinformatics Institute
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import os
|
|
15
|
+
|
|
16
|
+
from ebi_eva_common_pyutils.logger import AppLogger
|
|
17
|
+
import requests
|
|
18
|
+
from retry import retry
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class InternalServerError(Exception):
    """Raised when the contig-alias service responds with an HTTP 500 status."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Default base URL of the EVA contig-alias web service.
# NOTE(review): the name is misspelled ("CONTING" instead of "CONTIG") but it is
# part of the module's public API and is also used as an environment-variable
# name below, so it is kept as-is.
CONTING_ALIAS_URL = 'https://www.ebi.ac.uk/eva/webservices/contig-alias'
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# TODO add the get methods
class ContigAliasClient(AppLogger):
    """
    Python client for interfacing with the contig alias service.
    Authentication is required if using admin endpoints.

    :param base_url: base URL of the service; falls back to the CONTING_ALIAS_URL
        environment variable, then to the production default.
    :param username: admin user name (only required for admin endpoints)
    :param password: admin password (only required for admin endpoints)
    :param default_page_size: page size used when depaginating GET endpoints
    """

    def __init__(self, base_url=None, username=None, password=None, default_page_size=1000):
        if base_url:
            self.base_url = base_url
        else:
            self.base_url = os.environ.get('CONTING_ALIAS_URL') or CONTING_ALIAS_URL
        # Used for get methods
        self.default_page_size = default_page_size
        # Only required for admin endpoints
        self.username = username
        self.password = password

    def check_auth(self):
        """Raise ValueError unless admin credentials were provided."""
        if self.username is None or self.password is None:
            raise ValueError('Need admin username and password for this method')

    def _admin_assembly_url(self, assembly):
        """Build the admin endpoint URL for an assembly.

        BUG FIX: the original used os.path.join, which produces backslashes on
        Windows; URL components are always joined with '/'.
        """
        return f'{self.base_url}/v1/admin/assemblies/{assembly}'

    @retry(InternalServerError, tries=3, delay=2, backoff=1.5, jitter=(1, 3))
    def insert_assembly(self, assembly):
        """Register an assembly accession with the contig-alias service (admin only).

        Retries on HTTP 500; raises for any other non-success status.
        """
        self.check_auth()
        full_url = self._admin_assembly_url(assembly)
        response = requests.put(full_url, auth=(self.username, self.password))
        if response.status_code == 200:
            self.info(f'Assembly accession {assembly} successfully added to Contig-Alias DB')
        elif response.status_code == 409:
            self.warning(f'Assembly accession {assembly} already exists in Contig-Alias DB. Response: {response.text}')
        elif response.status_code == 500:
            self.error(f'Could not save Assembly accession {assembly} to Contig-Alias DB. Error: {response.text}')
            raise InternalServerError
        else:
            self.error(f'Could not save Assembly accession {assembly} to Contig-Alias DB. Error: {response.text}')
            response.raise_for_status()

    @retry(InternalServerError, tries=3, delay=2, backoff=1.5, jitter=(1, 3))
    def delete_assembly(self, assembly):
        """Delete an assembly accession from the contig-alias service (admin only).

        Retries on HTTP 500; raises for any other non-success status.
        """
        self.check_auth()
        full_url = self._admin_assembly_url(assembly)
        response = requests.delete(full_url, auth=(self.username, self.password))
        if response.status_code == 200:
            self.info(f'Assembly accession {assembly} successfully deleted from Contig-Alias DB')
        elif response.status_code == 500:
            self.error(f'Assembly accession {assembly} could not be deleted. Response: {response.text}')
            raise InternalServerError
        else:
            self.error(f'Assembly accession {assembly} could not be deleted. Response: {response.text}')
            # BUG FIX: consistent with insert_assembly, surface unexpected
            # statuses instead of silently returning after logging.
            response.raise_for_status()

    @retry(tries=3, delay=2, backoff=1.2, jitter=(1, 3))
    def _get_page_for_contig_alias_url(self, sub_url, page=0):
        """Query the contig alias service and return one page of the provided endpoint as parsed JSON."""
        url = f'{self.base_url}/{sub_url}?page={page}&size={self.default_page_size}'
        response = requests.get(url, headers={'accept': 'application/json'})
        response.raise_for_status()
        return response.json()

    def _depaginate_iter(self, sub_url, entity_to_retrieve):
        """Generator yielding every entity across all pages of a paginated HAL endpoint.

        Deduplicates the original copy-pasted first-page/next-page loops, and
        uses .get('_links', {}) so a response without a '_links' object ends
        iteration instead of raising KeyError.
        """
        page = 0
        while True:
            response_json = self._get_page_for_contig_alias_url(sub_url, page=page)
            yield from response_json.get('_embedded', {}).get(entity_to_retrieve, [])
            if 'next' not in response_json.get('_links', {}):
                break
            page += 1

    def assembly_contig_iter(self, assembly_accession):
        """Generator that provides the contigs in the assembly requested."""
        sub_url = f'v1/assemblies/{assembly_accession}/chromosomes'
        return self._depaginate_iter(sub_url, 'chromosomeEntities')

    def assembly(self, assembly_accession):
        """Return the description of the requested assembly, or None if it is unknown.

        BUG FIX: the original indexed [0] unconditionally and raised an opaque
        IndexError when the assembly was absent from the service.
        """
        sub_url = f'v1/assemblies/{assembly_accession}'
        response_json = self._get_page_for_contig_alias_url(sub_url)
        entities = response_json.get('_embedded', {}).get('assemblyEntities', [])
        return entities[0] if entities else None

    def contig_iter(self, insdc_accession):
        """Generator that provides the contig entities associated with an INSDC accession."""
        sub_url = f'v1/chromosomes/genbank/{insdc_accession}'
        return self._depaginate_iter(sub_url, 'chromosomeEntities')
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from lxml import etree
|
|
3
|
+
from retry import retry
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@retry(tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def download_xml_from_ena(ena_url) -> etree.XML:
    """Download and parse XML from ENA.

    :param ena_url: full URL of the ENA browser XML endpoint to fetch
    :raises requests.exceptions.RequestException: on network failure or non-2xx
        status (the retry decorator re-attempts up to 3 times)
    """
    # BUG FIX: the original wrapped these two lines in a try/except that only
    # re-raised the caught exception — a no-op removed here.
    response = requests.get(ena_url)
    response.raise_for_status()
    # Re-encode to bytes because lxml refuses str input that carries an XML
    # encoding declaration.
    root = etree.XML(bytes(response.text, encoding='utf-8'))
    return root
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_assembly_name_and_taxonomy_id(assembly_accession):
    """Query the ENA browser for an assembly and return (assembly_name, taxonomy_id)."""
    xml_root = download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{assembly_accession}')
    assemblies = xml_root.xpath('/ASSEMBLY_SET/ASSEMBLY')
    if not assemblies:
        raise ValueError(f'Assembly {assembly_accession} not found in ENA')
    first_assembly = assemblies[0]
    # The assembly name is stored in the 'alias' attribute of the ASSEMBLY element
    name = first_assembly.get('alias')
    tax_id = int(first_assembly.xpath('TAXON/TAXON_ID')[0].text)
    return name, tax_id
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_scientific_name_and_common_name(taxonomy_id):
    """Query the ENA browser for a taxon and return (scientific_name, common_name).

    The common name is optional in the ENA record, so the second element may be None.
    """
    xml_root = download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_id}')
    taxon_elements = xml_root.xpath('/TAXON_SET/taxon')
    if not taxon_elements:
        raise ValueError(f'Taxonomy {taxonomy_id} not found in ENA')
    taxon = taxon_elements[0]
    return taxon.get('scientificName'), taxon.get('commonName')
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Copyright 2020 EMBL - European Bioinformatics Institute
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
from enum import Enum
|
|
17
|
+
from ebi_eva_common_pyutils.command_utils import run_command_with_output
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FileDiffOption(Enum):
    """Selects which set of lines ``file_diff`` extracts from two sorted files."""
    # Entries present in the first file but absent from the second (comm -23)
    NOT_IN = 1
    # Entries common to both files (comm -12)
    COMMON = 2
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def file_diff(file1_path: str, file2_path: str, diff_option: FileDiffOption, output_file_path: str):
    """Compare two files with the POSIX ``comm`` utility and write the selected
    line set to ``output_file_path``.

    NOTE(review): ``comm`` requires both inputs to be lexically sorted — confirm
    callers guarantee this, otherwise the output is unreliable.
    HACK: the command line is built by plain string interpolation, so paths
    containing shell metacharacters are not safe; acceptable only for trusted
    internal paths.

    :param file1_path: path to the first (sorted) input file
    :param file2_path: path to the second (sorted) input file
    :param diff_option: which line set to keep (see FileDiffOption)
    :param output_file_path: file the selected lines are redirected to
    """
    if diff_option == FileDiffOption.NOT_IN:
        # comm -23 suppresses lines unique to file2 and lines common to both,
        # leaving entries of file1 that are absent from file2.
        run_command_with_output("Finding entries in {0} not in {1}".format(file1_path, file2_path),
                                "comm -23 {0} {1} > {2}".format(file1_path, file2_path, output_file_path))
    elif diff_option == FileDiffOption.COMMON:
        # comm -12 suppresses the lines unique to each file, leaving common entries.
        run_command_with_output("Finding entries common to {0} and {1}".format(file1_path, file2_path),
                                "comm -12 {0} {1} > {2}".format(file1_path, file2_path, output_file_path))
    # Any other value silently does nothing (the enum currently has no other members).
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# Copyright 2020 EMBL - European Bioinformatics Institute
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import logging
|
|
15
|
+
import logging.config
|
|
16
|
+
import logging.handlers
|
|
17
|
+
from sys import stdout, stderr
|
|
18
|
+
from cached_property import cached_property
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class LoggingConfiguration:
    """
    This class provides an all in one management of all loggers in the stack. By default it pulls existing loggers,
    stores additional ones along with handlers and formatters.
    """

    # Default format/date format applied by default_formatter below.
    default_fmt = '[%(asctime)s][%(name)s][%(levelname)s] %(message)s'
    default_datefmt = '%Y-%b-%d %H:%M:%S'

    def __init__(self, use_existing_logger=True, log_level=logging.INFO):
        """
        :param use_existing_logger: when True, adopt every Logger already
            registered with the logging module (e.g. third-party library loggers)
        :param int log_level: level applied to handlers/loggers unless overridden
        """
        self.blank_formatter = logging.Formatter()
        self.handlers = set()
        if use_existing_logger:
            # retrieve all third party loggers
            # (loggerDict also contains PlaceHolder objects, hence the isinstance filter)
            self.loggers = dict((name, logger)
                                for name, logger in logging.root.manager.loggerDict.items()
                                if isinstance(logger, logging.Logger))
        else:
            self.loggers = {}
        self._log_level = log_level

    @cached_property
    def formatter(self):
        # Cached in the instance __dict__; set_formatter overwrites that cache entry.
        return self.default_formatter

    @cached_property
    def default_formatter(self):
        return logging.Formatter(
            fmt=self.default_fmt,
            datefmt=self.default_datefmt
        )

    def get_logger(self, name, level=logging.NOTSET):
        """
        Return a logging.Logger object with formatters and handlers added.
        :param name: Name to assign to the logger (usually __name__)
        :param int level: Log level to assign to the logger upon creation
        """
        if name in self.loggers:
            logger = self.loggers[name]
        else:
            logger = logging.getLogger(name)
            self.loggers[name] = logger

        # NOTSET (0) is falsy, so the instance-wide level is used by default.
        logger.setLevel(level or self._log_level)
        for h in self.handlers:
            logger.addHandler(h)

        return logger

    def add_handler(self, handler, level=logging.NOTSET):
        """
        Add a created handler, set its format/level if needed and register all loggers to it
        :param logging.Handler handler:
        :param int level: Log level to assign to the created handler
        """
        handler.setLevel(level or self._log_level)
        handler.setFormatter(self.formatter)
        for name in self.loggers:
            self.loggers[name].addHandler(handler)
        self.handlers.add(handler)

    def add_stdout_handler(self, level=None):
        """Attach a StreamHandler writing to stdout to every managed logger."""
        self.add_handler(logging.StreamHandler(stdout), level=level or self._log_level)

    def add_stderr_handler(self, level=None):
        """Attach a StreamHandler writing to stderr to every managed logger."""
        self.add_handler(logging.StreamHandler(stderr), level=level or self._log_level)

    def add_file_handler(self, filename, level=None):
        """Attach a FileHandler writing to *filename* to every managed logger."""
        self.add_handler(logging.FileHandler(filename=filename), level=level or self._log_level)

    def set_log_level(self, level):
        """Apply *level* to every managed handler and logger."""
        self._log_level = level
        for h in self.handlers:
            h.setLevel(self._log_level)
        for name in self.loggers:
            self.loggers[name].setLevel(self._log_level)

    def set_formatter(self, formatter):
        """
        Set all handlers to use formatter
        :param logging.Formatter formatter:
        """
        # Overwrite the cached_property cache directly: cached_property stores
        # its value in the instance __dict__, so this replaces the cached formatter.
        self.__dict__['formatter'] = formatter
        for h in self.handlers:
            h.setFormatter(self.formatter)

    def reset(self):
        """Remove all handlers of existing logger"""
        for l in self.loggers.values():
            while l.handlers:
                l.removeHandler(l.handlers[0])

        # Drop our own handler references so future loggers start clean.
        while self.handlers:
            h = self.handlers.pop()
            del h
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# A logging configuration singleton that will be the only source of logger
# (module-level instance; AppLogger and other modules share it via log_cfg)
logging_config = LoggingConfiguration()
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class AppLogger:
    """
    Mixin class for logging. An object subclassing this can log using its class name. Contains a
    logging.Logger object and exposes its log methods.
    """
    # Shared LoggingConfiguration singleton used to build the per-class logger.
    log_cfg = logging_config

    def log(self, level, msg, *args, **kwargs):
        self._logger.log(level, msg, *args, **kwargs)

    def debug(self, msg, *args, **kwargs):
        # BUG FIX: forward **kwargs (e.g. exc_info=True, stack_info=True) which
        # the original silently dropped — inconsistent with log() above.
        self._logger.debug(msg, *args, **kwargs)

    def info(self, msg, *args, **kwargs):
        self._logger.info(msg, *args, **kwargs)

    def warning(self, msg, *args, **kwargs):
        self._logger.warning(msg, *args, **kwargs)

    def error(self, msg, *args, **kwargs):
        self._logger.error(msg, *args, **kwargs)

    def critical(self, msg, *args, **kwargs):
        self._logger.critical(msg, *args, **kwargs)

    @cached_property
    def _logger(self):
        """Lazily-created logging.Logger named after the concrete subclass."""
        return self.log_cfg.get_logger(self.__class__.__name__)
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
from retry import retry
|
|
5
|
+
|
|
6
|
+
from ebi_eva_common_pyutils.logger import logging_config as log_cfg
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Module-level logger shared by all functions in this module
logger = log_cfg.get_logger(__name__)

# NCBI E-utilities endpoints (esearch/esummary/efetch)
eutils_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
esearch_url = eutils_url + 'esearch.fcgi'
esummary_url = eutils_url + 'esummary.fcgi'
efetch_url = eutils_url + 'efetch.fcgi'
# Ensembl REST endpoint for assembly information
ensembl_url = 'http://rest.ensembl.org/info/assembly'
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@retry(tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_ncbi_assembly_dicts_from_term(term, api_key=None):
    """Function to return NCBI assembly objects in the form of a list of dictionaries based on a search term.

    :param term: search term passed (quoted) to the esearch endpoint
    :param api_key: optional NCBI API key to raise rate limits
    :returns: list of assembly summary dicts (possibly empty)
    """
    payload = {'db': 'Assembly', 'term': '"{}"'.format(term), 'retmode': 'JSON'}
    if api_key:
        payload['api_key'] = api_key
    req = requests.get(esearch_url, params=payload)
    req.raise_for_status()
    data = req.json()
    assembly_dicts = []
    if data:
        assembly_id_list = data.get('esearchresult').get('idlist')
        # BUG FIX: skip the esummary call when the search returned no ids —
        # the original issued a request with an empty 'id' parameter.
        if assembly_id_list:
            payload = {'db': 'Assembly', 'id': ','.join(assembly_id_list), 'retmode': 'JSON'}
            if api_key:
                payload['api_key'] = api_key
            req = requests.get(esummary_url, params=payload)
            req.raise_for_status()
            summary_list = req.json()
            for assembly_id in summary_list.get('result', {}).get('uids', []):
                assembly_dicts.append(summary_list.get('result').get(assembly_id))
    return assembly_dicts
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@retry(tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_ncbi_taxonomy_dicts_from_term(term, api_key=None):
    """Function to return NCBI taxonomy objects in the form of a list of dictionaries based on a search term.

    :param term: search term passed (quoted) to the esearch endpoint
    :param api_key: optional NCBI API key to raise rate limits
    :returns: list of taxonomy summary dicts (possibly empty)
    """
    payload = {'db': 'Taxonomy', 'term': '"{}"'.format(term), 'retmode': 'JSON'}
    if api_key:
        payload['api_key'] = api_key
    req = requests.get(esearch_url, params=payload)
    req.raise_for_status()
    data = req.json()
    taxonomy_dicts = []
    if data:
        taxonomy_ids = data.get('esearchresult').get('idlist')
        # BUG FIX: only query summaries when ids were found, and propagate the
        # api_key (the original dropped it for the delegated esummary call).
        if taxonomy_ids:
            taxonomy_dicts = get_ncbi_taxonomy_dicts_from_ids(taxonomy_ids, api_key=api_key)
    return taxonomy_dicts
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@retry(tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_ncbi_taxonomy_dicts_from_ids(taxonomy_ids, api_key=None):
    """Function to return NCBI taxonomy objects in the form of a list of dictionaries
    based on a list of taxonomy ids.

    :param taxonomy_ids: iterable of taxonomy ids (str or int; coerced to str)
    :param api_key: optional NCBI API key to raise rate limits
    :returns: list of taxonomy summary dicts (possibly empty)
    """
    taxonomy_dicts = []
    # BUG FIX: avoid issuing an esummary request with an empty 'id' parameter.
    if not taxonomy_ids:
        return taxonomy_dicts
    # Coerce to str so integer taxonomy ids are accepted too (generalisation:
    # the original str.join crashed on non-string ids).
    payload = {'db': 'Taxonomy', 'id': ','.join(str(t) for t in taxonomy_ids), 'retmode': 'JSON'}
    if api_key:
        payload['api_key'] = api_key
    req = requests.get(esummary_url, params=payload)
    req.raise_for_status()
    summary_list = req.json()
    for taxonomy_id in summary_list.get('result', {}).get('uids', []):
        taxonomy_dicts.append(summary_list.get('result').get(taxonomy_id))
    return taxonomy_dicts
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_ncbi_assembly_name_from_term(term, api_key=None):
    """Resolve a single assembly name for a search term, or None if nothing matched.

    :param term: assembly accession or other NCBI Assembly search term
    :param api_key: optional NCBI API key to raise rate limits
    :raises ValueError: if NCBI returns several conflicting assembly names
    """
    assembly_dicts = get_ncbi_assembly_dicts_from_term(term, api_key=api_key)
    assembly_names = set([d.get('assemblyname') for d in assembly_dicts])
    if len(assembly_names) > 1:
        # Only keep the one that have the assembly accession as a synonymous and check again.
        # BUG FIX: use .get with defaults (consistent with get_species_name_from_ncbi)
        # so records missing 'synonym' or 'assemblyaccession' do not raise KeyError.
        assembly_names = set([d.get('assemblyname') for d in assembly_dicts
                              if term in d.get('synonym', {}).values() or term == d.get('assemblyaccession')])
        if len(assembly_names) != 1:
            raise ValueError(f'Cannot resolve assembly name for assembly {term} in NCBI. '
                             f'Found {",".join([str(a) for a in assembly_names])}')
    return assembly_names.pop() if assembly_names else None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def retrieve_species_scientific_name_from_tax_id_ncbi(taxid, api_key=None):
    """Return the scientific name for a taxonomy id via the NCBI efetch endpoint.

    :param taxid: taxonomy id to look up
    :param api_key: optional NCBI API key to raise rate limits
    :returns: the scientific name, or None if absent from the response
    """
    payload = {'db': 'Taxonomy', 'id': taxid}
    if api_key:
        payload['api_key'] = api_key
    r = requests.get(efetch_url, params=payload)
    # BUG FIX: fail fast on HTTP errors like every other E-utilities helper in
    # this module, instead of regex-matching an error page.
    r.raise_for_status()
    match = re.search('<Rank>(.+?)</Rank>', r.text, re.MULTILINE)
    rank = match.group(1) if match else None
    if rank not in ['species', 'subspecies']:
        logger.warning('Taxonomy id %s does not point to a species', taxid)
    match = re.search('<ScientificName>(.+?)</ScientificName>', r.text, re.MULTILINE)
    if match:
        return match.group(1)
    # Implicit None when no ScientificName element is present, made explicit.
    return None
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def get_species_name_from_ncbi(assembly_acc, api_key=None):
    """Return the normalised (lowercase, underscore-separated) species name for an assembly accession.

    :param assembly_acc: assembly accession (RefSeq or GenBank)
    :param api_key: optional NCBI API key to raise rate limits
    :raises ValueError: if the assembly resolves to zero or several taxonomy ids,
        or if no scientific name can be retrieved for the taxonomy id
    """
    # We first need to search for the species associated with the assembly
    assembly_dicts = get_ncbi_assembly_dicts_from_term(assembly_acc, api_key=api_key)
    taxids = set([assembly_dict.get('taxid')
                  for assembly_dict in assembly_dicts
                  if assembly_dict.get('assemblyaccession') == assembly_acc or
                  assembly_dict.get('synonym', {}).get('genbank') == assembly_acc])

    # This is a search so could retrieve zero or multiple results
    if len(taxids) != 1:
        # BUG FIX: the original message claimed "Multiple species found" even
        # when zero records matched.
        raise ValueError(f'Found {len(taxids)} species for {assembly_acc}. '
                         f'Cannot resolve single species for assembly {assembly_acc} in NCBI.')

    taxonomy_id = taxids.pop()

    scientific_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id, api_key=api_key)
    if scientific_name is None:
        # BUG FIX: the original crashed with AttributeError on None.
        raise ValueError(f'No scientific name found for taxonomy id {taxonomy_id}')
    return scientific_name.replace(' ', '_').lower()
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Copyright 2020 EMBL - European Bioinformatics Institute
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import time
|
|
15
|
+
|
|
16
|
+
import requests
|
|
17
|
+
import subprocess
|
|
18
|
+
from retry import retry
|
|
19
|
+
|
|
20
|
+
from ebi_eva_common_pyutils.logger import logging_config as log_cfg
|
|
21
|
+
|
|
22
|
+
# Module-level logger shared by all functions in this module
logger = log_cfg.get_logger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def is_port_in_use(port, host='localhost'):
    """Return True if a TCP connection can be established to host:port.

    :param port: TCP port to probe
    :param host: host to probe; defaults to 'localhost', preserving the
        previously hard-coded behaviour (generalised to a parameter)
    """
    import socket
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        # connect_ex returns an errno instead of raising; 0 means success
        return s.connect_ex((host, port)) == 0
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_available_local_port(try_starting_with_port):
    """Scan 20 consecutive ports starting at try_starting_with_port and return the first free one.

    :param try_starting_with_port: first local port to try
    :raises RuntimeError: if all 20 candidate ports are in use.
        BUG FIX: the original fell through and returned None after logging,
        causing obscure failures downstream.
    """
    for offset in range(0, 20):
        port_to_try = try_starting_with_port + offset
        logger.info("Attempting to forward remote mongo port to local port {0}...".format(port_to_try))
        if is_port_in_use(port_to_try):
            logger.info("Port {0} already in use...".format(port_to_try))
        else:
            return port_to_try
    logger.error("Could not forward to any local port!")
    raise RuntimeError("Could not forward to any local port!")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def forward_remote_port_to_local_port(remote_host: str, remote_port: int, local_port: int) -> int:
    """Open a background SSH tunnel forwarding remote_port on remote_host to local_port.

    :returns: the PID of the background ssh process
    :raises subprocess.CalledProcessError: if the tunnel process dies within
        the first five seconds (most likely a failed connection)
    """
    port_forward_command = 'ssh -N -L{0}:localhost:{1} {2}'.format(local_port, remote_port, remote_host)
    logger.info("Forwarding port to local port using command: " + port_forward_command)
    tunnel = subprocess.Popen(port_forward_command.split(" "))
    # Give ssh a moment to establish the tunnel (or fail trying)
    time.sleep(5)
    # Ensure that the process is still running
    if tunnel.poll() is not None:
        # The process already completed which mean it most likely crashed
        logger.error(f'Port Forwarding {remote_host}:{remote_port} -> {local_port} failed!')
        raise subprocess.CalledProcessError(tunnel.returncode, tunnel.args)
    return tunnel.pid
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@retry(exceptions=(ConnectionError, requests.RequestException), logger=logger,
       tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def json_request(url: str, payload: dict = None, method=requests.get) -> dict:
    """Makes a request of a specified type (by default GET) with the specified URL and payload, attempts to parse the
    result as a JSON string and return it as a dictionary, on failure raises an exception."""
    # Perform the request, fail on non-2xx status, then decode the JSON body.
    response = method(url, data=payload)
    response.raise_for_status()
    return response.json()
|
|
64
|
+
|