ebi-eva-common-pyutils 0.6.15__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ebi_eva_common_pyutils/__init__.py +0 -0
- ebi_eva_common_pyutils/assembly/__init__.py +1 -0
- ebi_eva_common_pyutils/assembly/assembly.py +69 -0
- ebi_eva_common_pyutils/assembly_utils.py +91 -0
- ebi_eva_common_pyutils/biosamples_communicators.py +186 -0
- ebi_eva_common_pyutils/command_utils.py +54 -0
- ebi_eva_common_pyutils/common_utils.py +30 -0
- ebi_eva_common_pyutils/config.py +152 -0
- ebi_eva_common_pyutils/contig_alias/__init__.py +0 -0
- ebi_eva_common_pyutils/contig_alias/contig_alias.py +115 -0
- ebi_eva_common_pyutils/ena_utils.py +35 -0
- ebi_eva_common_pyutils/file_utils.py +31 -0
- ebi_eva_common_pyutils/logger.py +150 -0
- ebi_eva_common_pyutils/ncbi_utils.py +117 -0
- ebi_eva_common_pyutils/network_utils.py +64 -0
- ebi_eva_common_pyutils/reference/__init__.py +2 -0
- ebi_eva_common_pyutils/reference/assembly.py +247 -0
- ebi_eva_common_pyutils/reference/sequence.py +101 -0
- ebi_eva_common_pyutils/taxonomy/__init__.py +0 -0
- ebi_eva_common_pyutils/taxonomy/taxonomy.py +60 -0
- ebi_eva_common_pyutils/variation/__init__.py +0 -0
- ebi_eva_common_pyutils/variation/contig_utils.py +113 -0
- ebi_eva_common_pyutils-0.6.15.data/scripts/archive_directory.py +114 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/LICENSE +201 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/METADATA +23 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/RECORD +39 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/WHEEL +5 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/top_level.txt +2 -0
- ebi_eva_internal_pyutils/__init__.py +0 -0
- ebi_eva_internal_pyutils/archive_directory.py +114 -0
- ebi_eva_internal_pyutils/config_utils.py +188 -0
- ebi_eva_internal_pyutils/metadata_utils.py +288 -0
- ebi_eva_internal_pyutils/mongo_utils.py +71 -0
- ebi_eva_internal_pyutils/mongodb/__init__.py +3 -0
- ebi_eva_internal_pyutils/mongodb/mongo_database.py +170 -0
- ebi_eva_internal_pyutils/nextflow/__init__.py +1 -0
- ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py +195 -0
- ebi_eva_internal_pyutils/pg_utils.py +107 -0
- ebi_eva_internal_pyutils/spring_properties.py +294 -0

ebi_eva_common_pyutils/reference/assembly.py
@@ -0,0 +1,247 @@
# Copyright 2019 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import urllib
from csv import DictReader, excel_tab
from ftplib import FTP
import re
from urllib import request

from cached_property import cached_property
from retry import retry

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.logger import AppLogger


class NCBIAssembly(AppLogger):
    """
    Class that represent an assembly that would originate from NCBI data
    It takes a GCA or GCF accession and can download the assembly report and genomics fasta.
    Using species_scientific_name and assembly_accession it create a directory structure in the provided
    reference_directory:
     - species_scientific_name1
        - assembly_accession1
        - assembly_accession2
     - species_scientific_name2
    the eutils_api_key is only used to retrieve additional contigs if required.
    """

    def __init__(self, assembly_accession, species_scientific_name, reference_directory, eutils_api_key=None):
        self.check_assembly_accession_format(assembly_accession)
        self.assembly_accession = assembly_accession
        self.species_scientific_name = species_scientific_name
        self.reference_directory = reference_directory
        self.eutils_api_key = eutils_api_key

    @staticmethod
    def is_assembly_accession_format(assembly_accession):
        if re.match(r"^GC[F|A]_\d+\.\d+$", assembly_accession) is not None:
            return True
        return False

    @staticmethod
    def check_assembly_accession_format(assembly_accession):
        if not NCBIAssembly.is_assembly_accession_format(assembly_accession):
            raise ValueError(f'Invalid assembly accession: {assembly_accession} it has to be in the form of '
                             'GCF_XXXXXXXXX.X or GCA_XXXXXXXXX.X where X is a number')

    @property
    def assembly_directory(self):
        assembly_directory = os.path.join(
            self.reference_directory, self.species_scientific_name.lower().replace(' ', '_'), self.assembly_accession
        )
        os.makedirs(assembly_directory, exist_ok=True),
        return assembly_directory

    @property
    def assembly_report_path(self):
        return os.path.join(self.assembly_directory, self.assembly_accession + '_assembly_report.txt')

    @property
    def assembly_fasta_path(self):
        return os.path.join(self.assembly_directory, self.assembly_accession + '.fa')

    @property
    def assembly_compressed_fasta_path(self):
        return os.path.join(self.assembly_directory, self.assembly_accession + '.fa.gz')

    @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
    def _download_file(self, destination_file, url):
        self.info('Download assembly file for %s to %s', self.assembly_accession, destination_file)
        request.urlretrieve(url, destination_file)
        request.urlcleanup()

    @cached_property
    def _ncbi_genome_folder_url_and_content(self):
        """
        Internal property that retrieve and store the NCBI ftp url and content of the genome folder.
        """
        ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=600)
        ftp.login('anonymous', 'anonymous')
        genome_folder = 'genomes/all/' + '/'.join([self.assembly_accession[0:3], self.assembly_accession[4:7],
                                                   self.assembly_accession[7:10],
                                                   self.assembly_accession[10:13]]) + '/'
        ftp.cwd(genome_folder)
        all_genome_subfolders = []
        ftp.retrlines('NLST', lambda line: all_genome_subfolders.append(line))

        genome_subfolders = [folder for folder in all_genome_subfolders if folder == self.assembly_accession]
        if len(genome_subfolders) != 1:
            self.debug('Cannot find good match for accession folder with "%s": %s match found', self.assembly_accession, len(genome_subfolders))
            genome_subfolders = [folder for folder in all_genome_subfolders if folder.startswith(self.assembly_accession + '_')]
        if len(genome_subfolders) != 1:
            self.debug('Cannot find good match for accession folder with "starting with %s_": %s match found', self.assembly_accession, len(genome_subfolders))
            genome_subfolders = [folder for folder in all_genome_subfolders if folder.startswith(self.assembly_accession)]
        if len(genome_subfolders) != 1:
            self.debug('Cannot find good match for accession folder with "starting with %s": %s match found', self.assembly_accession, len(genome_subfolders))
            genome_subfolders = [folder for folder in all_genome_subfolders if self.assembly_accession in folder]
        if len(genome_subfolders) != 1:
            self.debug('Cannot find good match for accession folder with "%s in name": %s match found', self.assembly_accession, len(genome_subfolders))
            raise Exception('more than one folder matches the assembly accession: ' + str(genome_subfolders))
        ftp.cwd(genome_subfolders[0])
        genome_files = []
        ftp.retrlines('NLST', lambda line: genome_files.append(line))
        url = 'ftp://' + 'ftp.ncbi.nlm.nih.gov' + '/' + genome_folder + genome_subfolders[0]
        ftp.close()
        return url, genome_files

    @cached_property
    def assembly_report_url(self):
        """
        Search on the NCBI FTP for the assembly report file and return the full url if only one found.
        Raise if not.
        """
        url, genome_files = self._ncbi_genome_folder_url_and_content
        assembly_reports = [genome_file for genome_file in genome_files if 'assembly_report.txt' in genome_file]
        if len(assembly_reports) != 1:
            raise Exception('more than one file has "assembly_report" in its name: ' + str(assembly_reports))
        return url + '/' + assembly_reports[0]

    @cached_property
    def assembly_fasta_url(self):
        """
        Search on the NCBI FTP for the assembly genomics fasta file and return the full url if only one found.
        Raise if not.
        """
        url, genome_files = self._ncbi_genome_folder_url_and_content
        assembly_fasta = [genome_file for genome_file in genome_files if genome_file.endswith('_genomic.fna.gz')]
        # Remove the entries that are from genomics dna but the whole genome
        assembly_fasta = [fasta for fasta in assembly_fasta if '_from_' not in fasta]
        if len(assembly_fasta) > 1:
            raise Exception('{} file found ending with "_genomic.fna.gz" in its name: {}'.format(len(assembly_fasta),
                                                                                                 assembly_fasta))
        return url + '/' + assembly_fasta[0]

    def get_assembly_report_rows(self):
        """Download the assembly report if it does not exist then parse it to create a generator
        that return each row as a dict."""
        self.download_assembly_report()
        with open(self.assembly_report_path) as open_file:
            headers = None
            # Parse the assembly report file to find the header then stop
            for line in open_file:
                if line.lower().startswith("# sequence-name") and "sequence-role" in line.lower():
                    headers = line.strip().split('\t')
                    break
            reader = DictReader(open_file, fieldnames=headers, dialect=excel_tab)
            for record in reader:
                yield record

    def download_assembly_report(self, overwrite=False):
        if not os.path.isfile(self.assembly_report_path) or overwrite:
            self._download_file(self.assembly_report_path, self.assembly_report_url)

    def download_assembly_fasta(self, overwrite=False):
        if not os.path.isfile(self.assembly_fasta_path) or overwrite:
            self._download_file(self.assembly_compressed_fasta_path, self.assembly_fasta_url)
            run_command_with_output(
                'Uncompress {}'.format(self.assembly_compressed_fasta_path),
                'gunzip -f {}'.format(self.assembly_compressed_fasta_path)
            )

    def construct_fasta_from_report(self, genbank_only=False):
        """
        Download the assembly report if it does not exist then create the assembly fasta from the contig.
        If the assembly already exist then it only add any missing contig.
        """
        written_contigs = self.get_written_contigs(self.assembly_fasta_path)
        contig_to_append = []
        for row in self.get_assembly_report_rows():
            genbank_accession = row['GenBank-Accn']
            refseq_accession = row['RefSeq-Accn']
            relationship = row['Relationship']
            accession = genbank_accession
            if relationship != '=' and genbank_accession == 'na' and not genbank_only:
                accession = refseq_accession
            if accession in written_contigs:
                self.debug('Accession ' + accession + ' already in the FASTA file, don\'t need to be downloaded')
                continue
            if not accession or accession == 'na':
                raise ValueError('Accession {} found in report is not valid'.format(accession))
            contig_to_append.append(self.download_contig_sequence_from_ncbi(accession))

        # Now append all the new contigs to the existing fasta
        with open(self.assembly_fasta_path, 'a+') as fasta:
            for contig_path in contig_to_append:
                with open(contig_path) as sequence:
                    for line in sequence:
                        # Check that the line is not empty
                        if line.strip():
                            fasta.write(line)
                os.remove(contig_path)

    def download_contig_sequence_from_ncbi(self, accession):
        sequence_tmp_path = os.path.join(self.assembly_directory, accession + '.fa')
        self.download_contig_from_ncbi(accession, sequence_tmp_path)
        self.info(accession + " downloaded and added to FASTA sequence")
        return sequence_tmp_path

    @staticmethod
    def get_written_contigs(fasta_path):
        written_contigs = []
        match = re.compile(r'>(.*?)\s')
        if os.path.isfile(fasta_path):
            with open(fasta_path, 'r') as file:
                for line in file:
                    written_contigs.extend(match.findall(line))
        return written_contigs

    @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
    def download_contig_from_ncbi(self, contig_accession, output_file):
        parameters = {
            'db': 'nuccore',
            'id': contig_accession,
            'rettype': 'fasta',
            'retmode': 'text',
            'tool': 'eva',
            'email': 'eva-dev@ebi.ac.uk'
        }
        if self.eutils_api_key:
            parameters['api_key'] = self.eutils_api_key

        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + urllib.parse.urlencode(parameters)
        self.info('Downloading ' + contig_accession)
        urllib.request.urlretrieve(url, output_file)

    def download_or_construct(self, genbank_only=False, overwrite=False):
        """First download the assembly report and fasta from the FTP, then append any missing contig from
        the assembly report to the assembly fasta."""
        self.download_assembly_report(overwrite)
        try:
            self.download_assembly_fasta(overwrite)
        except:
            pass
        # This will either confirm the presence of all the contig or download any one missing
        self.construct_fasta_from_report(genbank_only)
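
For orientation, a minimal usage sketch of the NCBIAssembly class added above (ebi_eva_common_pyutils/reference/assembly.py, per the file list). The reference directory path is a placeholder and the GCA accession is only an example; this snippet is not part of the package.

from ebi_eva_common_pyutils.reference.assembly import NCBIAssembly

# Placeholder directory and example accession, for illustration only.
assembly = NCBIAssembly(
    assembly_accession='GCA_000001405.15',
    species_scientific_name='Homo sapiens',
    reference_directory='/path/to/references',
    eutils_api_key=None,
)
# Downloads the assembly report and genomic FASTA from the NCBI FTP, then appends
# any contig listed in the report that is still missing from the FASTA file.
assembly.download_or_construct(genbank_only=False)
print(assembly.assembly_fasta_path)
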
ebi_eva_common_pyutils/reference/sequence.py
@@ -0,0 +1,101 @@
# Copyright 2019 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import urllib
from csv import DictReader, excel_tab
from ftplib import FTP
import re
from urllib import request

from cached_property import cached_property
from retry import retry

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.logger import AppLogger


class NCBISequence(AppLogger):
    """
    Class that represent an Sequence that would originate from NCBI data
    It takes a any Genbank or refseq accession and can download the sequence genomics fasta form.
    Using species_scientific_name and assembly_accession it create a directory structure in the provided
    reference_directory:
     - species_scientific_name1
        - assembly_accession1
        - assembly_accession2
     - species_scientific_name2
    """

    insdc_accession_formats = [
        r'[A-Z][0-9]{5}\.[0-9]+',
        r'[A-Z]{2}[0-9]{6}\.[0-9]+',
        r'[A-Z]{2}[0-9]{8}\.[0-9]+',
        r'[A-Z]{4}[0-9]{8}\.[0-9]+',
        r'[A-Z]{6}[0-9]{9}\.[0-9]+'
    ]

    def __init__(self, sequence_accession, species_scientific_name, reference_directory, eutils_api_key=None):
        self.sequence_accession = sequence_accession
        self.species_scientific_name = species_scientific_name
        self.reference_directory = reference_directory
        self.eutils_api_key = eutils_api_key

    @staticmethod
    def is_genbank_accession_format(accession):
        if any(
                re.match(insdc_accession_format, accession)
                for insdc_accession_format in NCBISequence.insdc_accession_formats
        ):
            return True
        return False

    @staticmethod
    def check_genbank_accession_format(accession):
        if not NCBISequence.is_genbank_accession_format(accession):
            raise ValueError('Invalid INSDC accession: %s' % accession)

    @property
    def sequence_directory(self):
        sequence_directory = os.path.join(
            self.reference_directory, self.species_scientific_name.lower().replace(' ', '_'), self.sequence_accession
        )
        os.makedirs(sequence_directory, exist_ok=True),
        return sequence_directory

    @property
    def sequence_fasta_path(self):
        return os.path.join(self.sequence_directory, self.sequence_accession + '.fa')

    def download_contig_sequence_from_ncbi(self, genbank_only=True):
        if genbank_only:
            self.check_genbank_accession_format(self.sequence_accession)
        self._download_contig_from_ncbi(self.sequence_accession, self.sequence_fasta_path)
        self.info(self.sequence_fasta_path + " downloaded and added to FASTA sequence")

    @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
    def _download_contig_from_ncbi(self, contig_accession, output_file):
        parameters = {
            'db': 'nuccore',
            'id': contig_accession,
            'rettype': 'fasta',
            'retmode': 'text',
            'tool': 'eva',
            'email': 'eva-dev@ebi.ac.uk'
        }
        if self.eutils_api_key:
            parameters['api_key'] = self.eutils_api_key
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + urllib.parse.urlencode(parameters)
        self.info('Downloading ' + contig_accession)
        urllib.request.urlretrieve(url, output_file)
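
Similarly, a minimal usage sketch for the NCBISequence class above (ebi_eva_common_pyutils/reference/sequence.py, per the file list); the accession, species name and directory are illustrative placeholders.

from ebi_eva_common_pyutils.reference.sequence import NCBISequence

# Placeholder values, for illustration only.
sequence = NCBISequence(
    sequence_accession='CM003032.1',
    species_scientific_name='Example species',
    reference_directory='/path/to/references',
)
# Validates the INSDC accession format, then fetches the sequence FASTA via NCBI efetch
# into <reference_directory>/example_species/CM003032.1/CM003032.1.fa.
sequence.download_contig_sequence_from_ncbi(genbank_only=True)
print(sequence.sequence_fasta_path)
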
File without changes (ebi_eva_common_pyutils/taxonomy/__init__.py)

ebi_eva_common_pyutils/taxonomy/taxonomy.py
@@ -0,0 +1,60 @@
# Copyright 2020 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from ebi_eva_common_pyutils.ncbi_utils import retrieve_species_scientific_name_from_tax_id_ncbi
from ebi_eva_common_pyutils.network_utils import json_request
from ebi_eva_common_pyutils.logger import logging_config as log_cfg


logger = log_cfg.get_logger(__name__)


def get_scientific_name_from_ensembl(taxonomy_id: int) -> str:
    ENSEMBL_REST_API_URL = "https://rest.ensembl.org/taxonomy/id/{0}?content-type=application/json".format(taxonomy_id)
    response = json_request(ENSEMBL_REST_API_URL)
    if "scientific_name" not in response:
        raise Exception("Scientific name could not be found for taxonomy {0} using the Ensembl API URL: {1}"
                        .format(taxonomy_id, ENSEMBL_REST_API_URL))
    return response["scientific_name"]


def normalise_taxon_scientific_name(taxon_name):
    """
    Match Ensembl representation
    See Clostridium sp. SS2/1 represented as clostridium_sp_ss2_1 in
    ftp://ftp.ensemblgenomes.org/pub/bacteria/release-48/fasta/bacteria_25_collection/clostridium_sp_ss2_1/
    """
    return re.sub('[^0-9a-zA-Z]+', '_', taxon_name.lower())


def get_normalized_scientific_name_from_ensembl(taxonomy_id: int) -> str:
    """Get the scientific name for that taxon"""
    return normalise_taxon_scientific_name(get_scientific_name_from_ensembl(taxonomy_id))


def get_scientific_name_from_taxonomy(taxonomy_id: int, api_key: str=None) -> str:
    """
    Search for a species scientific name based on the taxonomy id.
    Will first attempt to retrieve from Ensembl and then NCBI, if not found returns None.
    """
    try:
        species_name = get_scientific_name_from_ensembl(taxonomy_id)
    except Exception:
        logger.warning("Failed to retrieve scientific name in Ensembl for taxonomy id {0}".format(taxonomy_id))
        species_name = None
    if not species_name:
        species_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id, api_key=api_key)
    return species_name
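
A short usage sketch for the taxonomy helpers above (ebi_eva_common_pyutils/taxonomy/taxonomy.py); 9606 is the NCBI taxonomy id for Homo sapiens, and the printed values in the comments are the expected, not guaranteed, outputs.

from ebi_eva_common_pyutils.taxonomy.taxonomy import (
    get_normalized_scientific_name_from_ensembl,
    get_scientific_name_from_taxonomy,
)

# 9606 is the NCBI taxonomy id for Homo sapiens.
print(get_scientific_name_from_taxonomy(9606))             # expected: 'Homo sapiens'
print(get_normalized_scientific_name_from_ensembl(9606))   # expected: 'homo_sapiens'
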
File without changes (ebi_eva_common_pyutils/variation/__init__.py)

ebi_eva_common_pyutils/variation/contig_utils.py
@@ -0,0 +1,113 @@
# Copyright 2020 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import requests
from retry import retry


# TODO: Might be a good idea to re-visit this after a production implementation
# of the contig-alias resolution project is available
@retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3))
def resolve_contig_accession_to_chromosome_name(contig_accession, line_limit=100):
    """
    Given a Genbank contig accession, get the corresponding chromosome name from the ENA Text API
    which returns results in a EMBL Flatfile format

    :param contig_accession: Genbank contig accession (ex: CM003032.1)
    :param line_limit: number of lines to parse in the EMBL Flatfile result to find the chromosome before giving up
    :return: Chromosome name (ex: 12 when given an accession CM003032.1)
    """
    ENA_TEXT_API_URL = "https://www.ebi.ac.uk/ena/browser/api/text/{0}?lineLimit={1}&annotationOnly=true"
    response = requests.get(ENA_TEXT_API_URL.format(contig_accession, line_limit))
    response_lines = response.content.decode("utf-8").split("\n")
    num_lines = len(response_lines)

    features_section_found, source_line_found = False, False
    chosen_response = []
    line_index = 0
    # Look for the "source" feature under the "Features" section in the text response
    while line_index < num_lines:
        line = response_lines[line_index]
        if not (features_section_found or line.lower().startswith("fh key")):
            line_index += 1
            continue
        features_section_found = True
        # Based on "Data item positions" described here, http://www.insdc.org/files/feature_table.html#3.4.2
        # the sixth character represents the start of the feature key
        if not (source_line_found or line[5:].lower().startswith("source")):
            line_index += 1
            continue
        source_line_found = True
        if line[21:].startswith("/"):
            assembled_line = line.strip()
            line_index += 1
            # Assemble text spread across multiple lines until
            # we hit the next qualifier (starts with /) or the next section
            while line_index < num_lines and \
                    not (response_lines[line_index][21:].startswith("/")
                         or response_lines[line_index][5:6].strip() != ''):
                line = response_lines[line_index]
                assembled_line += " " + line[21:].strip()
                line_index += 1

            # Fall back to organelle in case of MT/Chloroplast accessions
            # and the reference notes in case of Linkage Group molecules
            chosen_response = re.findall('.*/chromosome=".+"', assembled_line) or \
                re.findall('.*/organelle=".+"', assembled_line) or \
                re.findall('.*/note=".+"', assembled_line)

            # If we have a response to give, no need to continue further
            # If the sixth character is not empty, we have reached the next feature, so no need to continue further
            if chosen_response or line[5:6].strip() != '':
                break
        else:
            line_index += 1

    if not chosen_response:
        return ""

    return str.split(chosen_response[0], '"')[1].strip()


def is_wgs_accession_format(contig_accession):
    """
    Check if a Genbank contig is part of WGS (Whole Genome Shotgun) sequence

    :param contig_accession: Genbank contig accession (ex: CM003032.1)
    :return: True if the provided contig is in the WGS format
    """
    wgs_prefix = contig_accession[:4]
    wgs_numeric_suffix = contig_accession[4:].replace(".", "")
    return str.isalpha(wgs_prefix) and str.isnumeric(wgs_numeric_suffix)


def get_chromosome_name_for_contig_accession(contig_accession):
    """
    Given a Genbank contig accession, get the corresponding chromosome name

    :param contig_accession: Genbank contig accession (ex: CM003032.1)
    :return: Chromosome name (ex: 12 when given an accession CM003032.1)
    """

    # Don't bother calling the ENA web service to get the chromosome number if the accession is a WGS accession
    # since the API will proceed to download the entire WGS dataset which can be in hundreds of MBs or even GBs
    # See https://www.ebi.ac.uk/ena/browser/api/text/AABR07050911.1?lineLimit=100&annotationOnly=true for example
    if is_wgs_accession_format(contig_accession):
        return None

    return \
        resolve_contig_accession_to_chromosome_name(contig_accession, 1000) or \
        resolve_contig_accession_to_chromosome_name(contig_accession, 10000) or \
        resolve_contig_accession_to_chromosome_name(contig_accession, 100000)
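
A usage sketch for the contig utilities above (ebi_eva_common_pyutils/variation/contig_utils.py), reusing the accessions already quoted in the docstrings; the expected outputs in the comments come from those docstrings.

from ebi_eva_common_pyutils.variation.contig_utils import (
    get_chromosome_name_for_contig_accession,
    is_wgs_accession_format,
)

# WGS-style accessions are never sent to the ENA text API (the call would pull the
# whole WGS set), so the lookup short-circuits and returns None.
print(is_wgs_accession_format('AABR07050911.1'))                    # True
print(get_chromosome_name_for_contig_accession('AABR07050911.1'))   # None
# Regular contig accessions are resolved through the ENA text API; the docstring
# example maps CM003032.1 to chromosome '12'.
print(get_chromosome_name_for_contig_accession('CM003032.1'))
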
ebi_eva_common_pyutils-0.6.15.data/scripts/archive_directory.py (a 114-line archive_directory.py also appears as ebi_eva_internal_pyutils/archive_directory.py in the file list)
@@ -0,0 +1,114 @@
#!python
import gzip
import shutil
import tarfile
import os.path
from argparse import ArgumentParser

from ebi_eva_common_pyutils.logger import logging_config
from retry import retry

logger = logging_config.get_logger(__name__)


def make_tarfile(output_filename, source_dir):
    logger.info(f'Create Final Tar file {output_filename}.')
    with tarfile.open(output_filename, "w") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))
    file_stats = os.stat(output_filename)
    logger.info(f'{output_filename} completed. File Size in Bytes is {file_stats.st_size}')


def is_compressed(file_path):
    compressed_ext = ['.gz', '.zip', '.bz', '.tbi', '.csi']
    for ext in compressed_ext:
        if file_path.lower().endswith(ext):
            return True
    return False


@retry(tries=5, delay=3, backoff=2, logger=logger)
def retriable_compress(src_file_path, dest_file_path):
    MEG = 2 ** 20
    with open(src_file_path, 'rb') as f_in:
        with gzip.open(dest_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out, length=16 * MEG)


def matches(name, patterns):
    return any((pattern for pattern in patterns if pattern in name))


@retry(tries=5, delay=3, backoff=2, logger=logger)
def retriable_remove(path):
    shutil.rmtree(path)


@retry(tries=5, delay=3, backoff=2, logger=logger)
def retryable_copy(src_file_path, dest_file_path, **kwargs):
    if os.path.exists(dest_file_path):
        os.remove(dest_file_path)
    shutil.copyfile(src_file_path, dest_file_path, **kwargs)


def archive_directory(source_dir, scratch_dir, destination_dir, filter_patterns=None):
    """
    Archive a directory by copying the data it contains to a scratch directory compressing all files that are not
    already compressed. Then it create a tar file to the destination_dir under the name of the original directory.
    """
    source_dir_name = os.path.basename(source_dir)
    logger.info(f'Archive {source_dir_name} from {source_dir}')
    parent_source_dir = os.path.dirname(source_dir)
    for base, dirs, files in os.walk(source_dir, topdown=True, followlinks=False):
        # Filter the downstream directory to
        filtered_dir = []
        for d in dirs:
            if matches(d, filter_patterns):
                logger.info(f'Ignore directory {d} because of filters: {filter_patterns}')
            else:
                filtered_dir.append(d)
        # modify dirs in place
        dirs[:] = filtered_dir
        src_basename = os.path.relpath(base, parent_source_dir)
        scratch_dest_dir = os.path.join(scratch_dir, src_basename)
        os.makedirs(scratch_dest_dir, exist_ok=True)
        for fname in files:
            src_file_path = os.path.join(base, fname)
            dest_file_path = os.path.join(scratch_dest_dir, fname)
            if matches(fname, filter_patterns):
                logger.info(f'Ignore file {src_file_path} because of filters: {filter_patterns}')
                continue
            if os.path.islink(src_file_path) or is_compressed(src_file_path):
                logger.info(f'Copy {src_file_path}')
                retryable_copy(src_file_path, dest_file_path, follow_symlinks=False)
            else:
                logger.info(f'Compress {src_file_path}')
                retriable_compress(src_file_path, dest_file_path + '.gz')
    final_tar_file = os.path.join(destination_dir, source_dir_name + '.tar')
    scratch_dir_archived = os.path.join(scratch_dir, source_dir_name)
    make_tarfile(final_tar_file, scratch_dir_archived)
    logger.info(f'Delete scratch folder {scratch_dir_archived}.')
    retriable_remove(scratch_dir_archived)
    logger.info(f'Scratch folder {scratch_dir_archived} deleted.')


def main():
    parser = ArgumentParser(description='Archive a directory by copying the data it contains to a scratch directory '
                                        'compressing all files that are not already compressed. Then it create a tar '
                                        'file to the destination_dir under the name of the original directory.')
    parser.add_argument('--source_dir', required=True, type=str,
                        help='base directory you want to archive. All sub directories will be included.')
    parser.add_argument('--destination_dir', required=True, type=str,
                        help='Directory where the archive should be placed at the end of the process.')
    parser.add_argument('--scratch_dir', required=True, type=str,
                        help='Directory where the archive will be constructed.')
    parser.add_argument('--filter_patterns', type=str, nargs='*', default=[],
                        help='keyword found in file and directory names that not be included to the archive.')
    args = parser.parse_args()

    logging_config.add_stdout_handler()
    archive_directory(args.source_dir, args.scratch_dir, args.destination_dir, args.filter_patterns)


if __name__ == '__main__':
    main()
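
Finally, a usage sketch for the archiving script above. It assumes the identically sized ebi_eva_internal_pyutils/archive_directory.py module listed earlier exposes the same archive_directory function; all paths and filter keywords below are placeholders.

from ebi_eva_internal_pyutils.archive_directory import archive_directory
from ebi_eva_common_pyutils.logger import logging_config

logging_config.add_stdout_handler()
# Placeholder paths: files are gzipped (or copied if already compressed) into the
# scratch directory, then tarred to <destination_dir>/<basename of source_dir>.tar
# and the scratch copy is removed.
archive_directory(
    source_dir='/data/project_x',
    scratch_dir='/scratch/archive_tmp',
    destination_dir='/archives',
    filter_patterns=['tmp', 'work'],
)
# Equivalent invocation of the installed script:
#   archive_directory.py --source_dir /data/project_x --scratch_dir /scratch/archive_tmp \
#       --destination_dir /archives --filter_patterns tmp work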