ebi-eva-common-pyutils 0.6.17__2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. ebi_eva_common_pyutils/__init__.py +0 -0
  2. ebi_eva_common_pyutils/assembly/__init__.py +1 -0
  3. ebi_eva_common_pyutils/assembly/assembly.py +69 -0
  4. ebi_eva_common_pyutils/assembly_utils.py +91 -0
  5. ebi_eva_common_pyutils/biosamples_communicators.py +186 -0
  6. ebi_eva_common_pyutils/command_utils.py +54 -0
  7. ebi_eva_common_pyutils/common_utils.py +30 -0
  8. ebi_eva_common_pyutils/config.py +152 -0
  9. ebi_eva_common_pyutils/contig_alias/__init__.py +0 -0
  10. ebi_eva_common_pyutils/contig_alias/contig_alias.py +115 -0
  11. ebi_eva_common_pyutils/ena_utils.py +35 -0
  12. ebi_eva_common_pyutils/file_utils.py +31 -0
  13. ebi_eva_common_pyutils/logger.py +150 -0
  14. ebi_eva_common_pyutils/ncbi_utils.py +117 -0
  15. ebi_eva_common_pyutils/network_utils.py +64 -0
  16. ebi_eva_common_pyutils/reference/__init__.py +2 -0
  17. ebi_eva_common_pyutils/reference/assembly.py +247 -0
  18. ebi_eva_common_pyutils/reference/sequence.py +101 -0
  19. ebi_eva_common_pyutils/spreadsheet/__init__.py +0 -0
  20. ebi_eva_common_pyutils/spreadsheet/metadata_xlsx_utils.py +15 -0
  21. ebi_eva_common_pyutils/taxonomy/__init__.py +0 -0
  22. ebi_eva_common_pyutils/taxonomy/taxonomy.py +60 -0
  23. ebi_eva_common_pyutils/variation/__init__.py +0 -0
  24. ebi_eva_common_pyutils/variation/contig_utils.py +113 -0
  25. ebi_eva_common_pyutils-0.6.17.data/scripts/archive_directory.py +114 -0
  26. ebi_eva_common_pyutils-0.6.17.dist-info/LICENSE +201 -0
  27. ebi_eva_common_pyutils-0.6.17.dist-info/METADATA +24 -0
  28. ebi_eva_common_pyutils-0.6.17.dist-info/RECORD +41 -0
  29. ebi_eva_common_pyutils-0.6.17.dist-info/WHEEL +5 -0
  30. ebi_eva_common_pyutils-0.6.17.dist-info/top_level.txt +2 -0
  31. ebi_eva_internal_pyutils/__init__.py +0 -0
  32. ebi_eva_internal_pyutils/archive_directory.py +114 -0
  33. ebi_eva_internal_pyutils/config_utils.py +188 -0
  34. ebi_eva_internal_pyutils/metadata_utils.py +288 -0
  35. ebi_eva_internal_pyutils/mongo_utils.py +71 -0
  36. ebi_eva_internal_pyutils/mongodb/__init__.py +3 -0
  37. ebi_eva_internal_pyutils/mongodb/mongo_database.py +170 -0
  38. ebi_eva_internal_pyutils/nextflow/__init__.py +1 -0
  39. ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py +195 -0
  40. ebi_eva_internal_pyutils/pg_utils.py +107 -0
  41. ebi_eva_internal_pyutils/spring_properties.py +294 -0
ebi_eva_common_pyutils/reference/assembly.py
@@ -0,0 +1,247 @@
+ # Copyright 2019 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import os
+ import urllib
+ from csv import DictReader, excel_tab
+ from ftplib import FTP
+ import re
+ from urllib import request
+
+ from cached_property import cached_property
+ from retry import retry
+
+ from ebi_eva_common_pyutils.command_utils import run_command_with_output
+ from ebi_eva_common_pyutils.logger import AppLogger
+
+
+ class NCBIAssembly(AppLogger):
+     """
+     Class that represents an assembly originating from NCBI data.
+     It takes a GCA or GCF accession and can download the assembly report and genomic fasta.
+     Using species_scientific_name and assembly_accession it creates a directory structure in the provided
+     reference_directory:
+      - species_scientific_name1
+         - assembly_accession1
+         - assembly_accession2
+      - species_scientific_name2
+     The eutils_api_key is only used to retrieve additional contigs if required.
+     """
+
+     def __init__(self, assembly_accession, species_scientific_name, reference_directory, eutils_api_key=None):
+         self.check_assembly_accession_format(assembly_accession)
+         self.assembly_accession = assembly_accession
+         self.species_scientific_name = species_scientific_name
+         self.reference_directory = reference_directory
+         self.eutils_api_key = eutils_api_key
+
+     @staticmethod
+     def is_assembly_accession_format(assembly_accession):
+         if re.match(r"^GC[F|A]_\d+\.\d+$", assembly_accession) is not None:
+             return True
+         return False
+
+     @staticmethod
+     def check_assembly_accession_format(assembly_accession):
+         if not NCBIAssembly.is_assembly_accession_format(assembly_accession):
+             raise ValueError(f'Invalid assembly accession: {assembly_accession} it has to be in the form of '
+                              'GCF_XXXXXXXXX.X or GCA_XXXXXXXXX.X where X is a number')
+
+     @property
+     def assembly_directory(self):
+         assembly_directory = os.path.join(
+             self.reference_directory, self.species_scientific_name.lower().replace(' ', '_'), self.assembly_accession
+         )
+         os.makedirs(assembly_directory, exist_ok=True)
+         return assembly_directory
+
+     @property
+     def assembly_report_path(self):
+         return os.path.join(self.assembly_directory, self.assembly_accession + '_assembly_report.txt')
+
+     @property
+     def assembly_fasta_path(self):
+         return os.path.join(self.assembly_directory, self.assembly_accession + '.fa')
+
+     @property
+     def assembly_compressed_fasta_path(self):
+         return os.path.join(self.assembly_directory, self.assembly_accession + '.fa.gz')
+
+     @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
+     def _download_file(self, destination_file, url):
+         self.info('Download assembly file for %s to %s', self.assembly_accession, destination_file)
+         request.urlretrieve(url, destination_file)
+         request.urlcleanup()
+
+     @cached_property
+     def _ncbi_genome_folder_url_and_content(self):
+         """
+         Internal property that retrieves and stores the NCBI FTP url and the content of the genome folder.
+         """
+         ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=600)
+         ftp.login('anonymous', 'anonymous')
+         genome_folder = 'genomes/all/' + '/'.join([self.assembly_accession[0:3], self.assembly_accession[4:7],
+                                                    self.assembly_accession[7:10],
+                                                    self.assembly_accession[10:13]]) + '/'
+         ftp.cwd(genome_folder)
+         all_genome_subfolders = []
+         ftp.retrlines('NLST', lambda line: all_genome_subfolders.append(line))
+
+         genome_subfolders = [folder for folder in all_genome_subfolders if folder == self.assembly_accession]
+         if len(genome_subfolders) != 1:
+             self.debug('Cannot find good match for accession folder with "%s": %s match found', self.assembly_accession, len(genome_subfolders))
+             genome_subfolders = [folder for folder in all_genome_subfolders if folder.startswith(self.assembly_accession + '_')]
+             if len(genome_subfolders) != 1:
+                 self.debug('Cannot find good match for accession folder with "starting with %s_": %s match found', self.assembly_accession, len(genome_subfolders))
+                 genome_subfolders = [folder for folder in all_genome_subfolders if folder.startswith(self.assembly_accession)]
+                 if len(genome_subfolders) != 1:
+                     self.debug('Cannot find good match for accession folder with "starting with %s": %s match found', self.assembly_accession, len(genome_subfolders))
+                     genome_subfolders = [folder for folder in all_genome_subfolders if self.assembly_accession in folder]
+                     if len(genome_subfolders) != 1:
+                         self.debug('Cannot find good match for accession folder with "%s in name": %s match found', self.assembly_accession, len(genome_subfolders))
+                         raise Exception('more than one folder matches the assembly accession: ' + str(genome_subfolders))
+         ftp.cwd(genome_subfolders[0])
+         genome_files = []
+         ftp.retrlines('NLST', lambda line: genome_files.append(line))
+         url = 'ftp://' + 'ftp.ncbi.nlm.nih.gov' + '/' + genome_folder + genome_subfolders[0]
+         ftp.close()
+         return url, genome_files
+
+     @cached_property
+     def assembly_report_url(self):
+         """
+         Search the NCBI FTP for the assembly report file and return the full url if exactly one is found.
+         Raise otherwise.
+         """
+         url, genome_files = self._ncbi_genome_folder_url_and_content
+         assembly_reports = [genome_file for genome_file in genome_files if 'assembly_report.txt' in genome_file]
+         if len(assembly_reports) != 1:
+             raise Exception('more than one file has "assembly_report" in its name: ' + str(assembly_reports))
+         return url + '/' + assembly_reports[0]
+
+     @cached_property
+     def assembly_fasta_url(self):
+         """
+         Search the NCBI FTP for the assembly genomic fasta file and return the full url if exactly one is found.
+         Raise otherwise.
+         """
+         url, genome_files = self._ncbi_genome_folder_url_and_content
+         assembly_fasta = [genome_file for genome_file in genome_files if genome_file.endswith('_genomic.fna.gz')]
+         # Remove the entries (e.g. cds_from_genomic, rna_from_genomic) derived from the genomic DNA that are not the whole genome
+         assembly_fasta = [fasta for fasta in assembly_fasta if '_from_' not in fasta]
+         if len(assembly_fasta) > 1:
+             raise Exception('{} file found ending with "_genomic.fna.gz" in its name: {}'.format(len(assembly_fasta),
+                                                                                                  assembly_fasta))
+         return url + '/' + assembly_fasta[0]
+
+     def get_assembly_report_rows(self):
+         """Download the assembly report if it does not exist, then parse it to create a generator
+         that returns each row as a dict."""
+         self.download_assembly_report()
+         with open(self.assembly_report_path) as open_file:
+             headers = None
+             # Parse the assembly report file to find the header then stop
+             for line in open_file:
+                 if line.lower().startswith("# sequence-name") and "sequence-role" in line.lower():
+                     headers = line.strip().split('\t')
+                     break
+             reader = DictReader(open_file, fieldnames=headers, dialect=excel_tab)
+             for record in reader:
+                 yield record
+
+     def download_assembly_report(self, overwrite=False):
+         if not os.path.isfile(self.assembly_report_path) or overwrite:
+             self._download_file(self.assembly_report_path, self.assembly_report_url)
+
+     def download_assembly_fasta(self, overwrite=False):
+         if not os.path.isfile(self.assembly_fasta_path) or overwrite:
+             self._download_file(self.assembly_compressed_fasta_path, self.assembly_fasta_url)
+             run_command_with_output(
+                 'Uncompress {}'.format(self.assembly_compressed_fasta_path),
+                 'gunzip -f {}'.format(self.assembly_compressed_fasta_path)
+             )
+
+     def construct_fasta_from_report(self, genbank_only=False):
+         """
+         Download the assembly report if it does not exist, then create the assembly fasta from the contigs.
+         If the assembly fasta already exists, only the missing contigs are added.
+         """
+         written_contigs = self.get_written_contigs(self.assembly_fasta_path)
+         contig_to_append = []
+         for row in self.get_assembly_report_rows():
+             genbank_accession = row['GenBank-Accn']
+             refseq_accession = row['RefSeq-Accn']
+             relationship = row['Relationship']
+             accession = genbank_accession
+             if relationship != '=' and genbank_accession == 'na' and not genbank_only:
+                 accession = refseq_accession
+             if accession in written_contigs:
+                 self.debug('Accession ' + accession + ' already in the FASTA file, don\'t need to be downloaded')
+                 continue
+             if not accession or accession == 'na':
+                 raise ValueError('Accession {} found in report is not valid'.format(accession))
+             contig_to_append.append(self.download_contig_sequence_from_ncbi(accession))
+
+         # Now append all the new contigs to the existing fasta
+         with open(self.assembly_fasta_path, 'a+') as fasta:
+             for contig_path in contig_to_append:
+                 with open(contig_path) as sequence:
+                     for line in sequence:
+                         # Check that the line is not empty
+                         if line.strip():
+                             fasta.write(line)
+                 os.remove(contig_path)
+
+     def download_contig_sequence_from_ncbi(self, accession):
+         sequence_tmp_path = os.path.join(self.assembly_directory, accession + '.fa')
+         self.download_contig_from_ncbi(accession, sequence_tmp_path)
+         self.info(accession + " downloaded and added to FASTA sequence")
+         return sequence_tmp_path
+
+     @staticmethod
+     def get_written_contigs(fasta_path):
+         written_contigs = []
+         match = re.compile(r'>(.*?)\s')
+         if os.path.isfile(fasta_path):
+             with open(fasta_path, 'r') as file:
+                 for line in file:
+                     written_contigs.extend(match.findall(line))
+         return written_contigs
+
+     @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
+     def download_contig_from_ncbi(self, contig_accession, output_file):
+         parameters = {
+             'db': 'nuccore',
+             'id': contig_accession,
+             'rettype': 'fasta',
+             'retmode': 'text',
+             'tool': 'eva',
+             'email': 'eva-dev@ebi.ac.uk'
+         }
+         if self.eutils_api_key:
+             parameters['api_key'] = self.eutils_api_key
+
+         url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + urllib.parse.urlencode(parameters)
+         self.info('Downloading ' + contig_accession)
+         urllib.request.urlretrieve(url, output_file)
+
+     def download_or_construct(self, genbank_only=False, overwrite=False):
+         """First download the assembly report and fasta from the FTP, then append any contig from
+         the assembly report that is missing from the assembly fasta."""
+         self.download_assembly_report(overwrite)
+         try:
+             self.download_assembly_fasta(overwrite)
+         except Exception:
+             pass
+         # This will either confirm the presence of all the contigs or download any missing ones
+         self.construct_fasta_from_report(genbank_only)
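For orientation, here is a minimal usage sketch of the NCBIAssembly class defined above; the accession, species name and reference directory are placeholder values, and only download_or_construct is typically needed end to end.

    from ebi_eva_common_pyutils.reference.assembly import NCBIAssembly

    # Hypothetical inputs: any valid GCA_/GCF_ accession and a writable reference directory would do.
    assembly = NCBIAssembly(
        assembly_accession='GCA_000001405.10',
        species_scientific_name='Homo sapiens',
        reference_directory='/path/to/references',
        eutils_api_key=None,  # optional; only used when individual contigs have to be fetched through eutils
    )
    # Fetches the assembly report and fasta from the NCBI FTP, then appends any contig
    # listed in the report that is missing from the fasta.
    assembly.download_or_construct(genbank_only=False)
    # Files end up under <reference_directory>/homo_sapiens/GCA_000001405.10/
    print(assembly.assembly_report_path, assembly.assembly_fasta_path)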
ebi_eva_common_pyutils/reference/sequence.py
@@ -0,0 +1,101 @@
+ # Copyright 2019 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import os
+ import urllib
+ from csv import DictReader, excel_tab
+ from ftplib import FTP
+ import re
+ from urllib import request
+
+ from cached_property import cached_property
+ from retry import retry
+
+ from ebi_eva_common_pyutils.command_utils import run_command_with_output
+ from ebi_eva_common_pyutils.logger import AppLogger
+
+
+ class NCBISequence(AppLogger):
+     """
+     Class that represents a sequence originating from NCBI data.
+     It takes any GenBank or RefSeq accession and can download the sequence in genomic fasta form.
+     Using species_scientific_name and sequence_accession it creates a directory structure in the provided
+     reference_directory:
+      - species_scientific_name1
+         - sequence_accession1
+         - sequence_accession2
+      - species_scientific_name2
+     """
+
+     insdc_accession_formats = [
+         r'[A-Z][0-9]{5}\.[0-9]+',
+         r'[A-Z]{2}[0-9]{6}\.[0-9]+',
+         r'[A-Z]{2}[0-9]{8}\.[0-9]+',
+         r'[A-Z]{4}[0-9]{8}\.[0-9]+',
+         r'[A-Z]{6}[0-9]{9}\.[0-9]+'
+     ]
+
+     def __init__(self, sequence_accession, species_scientific_name, reference_directory, eutils_api_key=None):
+         self.sequence_accession = sequence_accession
+         self.species_scientific_name = species_scientific_name
+         self.reference_directory = reference_directory
+         self.eutils_api_key = eutils_api_key
+
+     @staticmethod
+     def is_genbank_accession_format(accession):
+         if any(
+             re.match(insdc_accession_format, accession)
+             for insdc_accession_format in NCBISequence.insdc_accession_formats
+         ):
+             return True
+         return False
+
+     @staticmethod
+     def check_genbank_accession_format(accession):
+         if not NCBISequence.is_genbank_accession_format(accession):
+             raise ValueError('Invalid INSDC accession: %s' % accession)
+
+     @property
+     def sequence_directory(self):
+         sequence_directory = os.path.join(
+             self.reference_directory, self.species_scientific_name.lower().replace(' ', '_'), self.sequence_accession
+         )
+         os.makedirs(sequence_directory, exist_ok=True)
+         return sequence_directory
+
+     @property
+     def sequence_fasta_path(self):
+         return os.path.join(self.sequence_directory, self.sequence_accession + '.fa')
+
+     def download_contig_sequence_from_ncbi(self, genbank_only=True):
+         if genbank_only:
+             self.check_genbank_accession_format(self.sequence_accession)
+         self._download_contig_from_ncbi(self.sequence_accession, self.sequence_fasta_path)
+         self.info(self.sequence_fasta_path + " downloaded and added to FASTA sequence")
+
+     @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
+     def _download_contig_from_ncbi(self, contig_accession, output_file):
+         parameters = {
+             'db': 'nuccore',
+             'id': contig_accession,
+             'rettype': 'fasta',
+             'retmode': 'text',
+             'tool': 'eva',
+             'email': 'eva-dev@ebi.ac.uk'
+         }
+         if self.eutils_api_key:
+             parameters['api_key'] = self.eutils_api_key
+         url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + urllib.parse.urlencode(parameters)
+         self.info('Downloading ' + contig_accession)
+         urllib.request.urlretrieve(url, output_file)
+
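Similarly, a short hypothetical sketch for NCBISequence, which downloads a single INSDC sequence as fasta into the same species-based directory layout; the accession, species and directory below are placeholders.

    from ebi_eva_common_pyutils.reference.sequence import NCBISequence

    sequence = NCBISequence(
        sequence_accession='CM003032.1',        # must match one of insdc_accession_formats when genbank_only=True
        species_scientific_name='Genus species',
        reference_directory='/path/to/references',
    )
    sequence.download_contig_sequence_from_ncbi(genbank_only=True)
    print(sequence.sequence_fasta_path)  # /path/to/references/genus_species/CM003032.1/CM003032.1.fa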
File without changes
ebi_eva_common_pyutils/spreadsheet/metadata_xlsx_utils.py
@@ -0,0 +1,15 @@
+ import re
+
+ from openpyxl.reader.excel import load_workbook
+
+
+ def metadata_xlsx_version(metadata_xlsx):
+     workbook = load_workbook(metadata_xlsx)
+     try:
+         instructions_sheet = workbook['PLEASE READ FIRST']
+         xlsx_sheet_version_value = instructions_sheet[3][0].value
+         match = re.search(r'(\d+\.\d+\.\d+)', '' if xlsx_sheet_version_value is None else xlsx_sheet_version_value)
+         xlsx_version = match.group(1) if match else None
+     except (KeyError, IndexError):
+         xlsx_version = None
+     return xlsx_version
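A brief usage sketch for the helper above; the spreadsheet path is a placeholder. The function returns None when the 'PLEASE READ FIRST' sheet or its version cell is missing, or when the cell does not contain an X.Y.Z version string.

    from ebi_eva_common_pyutils.spreadsheet.metadata_xlsx_utils import metadata_xlsx_version

    version = metadata_xlsx_version('/path/to/eva_metadata_template.xlsx')  # hypothetical file
    print(version or 'version could not be determined')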
File without changes
ebi_eva_common_pyutils/taxonomy/taxonomy.py
@@ -0,0 +1,60 @@
+ # Copyright 2020 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import re
+
+ from ebi_eva_common_pyutils.ncbi_utils import retrieve_species_scientific_name_from_tax_id_ncbi
+ from ebi_eva_common_pyutils.network_utils import json_request
+ from ebi_eva_common_pyutils.logger import logging_config as log_cfg
+
+
+ logger = log_cfg.get_logger(__name__)
+
+
+ def get_scientific_name_from_ensembl(taxonomy_id: int) -> str:
+     ENSEMBL_REST_API_URL = "https://rest.ensembl.org/taxonomy/id/{0}?content-type=application/json".format(taxonomy_id)
+     response = json_request(ENSEMBL_REST_API_URL)
+     if "scientific_name" not in response:
+         raise Exception("Scientific name could not be found for taxonomy {0} using the Ensembl API URL: {1}"
+                         .format(taxonomy_id, ENSEMBL_REST_API_URL))
+     return response["scientific_name"]
+
+
+ def normalise_taxon_scientific_name(taxon_name):
+     """
+     Match the Ensembl representation.
+     See Clostridium sp. SS2/1 represented as clostridium_sp_ss2_1 in
+     ftp://ftp.ensemblgenomes.org/pub/bacteria/release-48/fasta/bacteria_25_collection/clostridium_sp_ss2_1/
+     """
+     return re.sub('[^0-9a-zA-Z]+', '_', taxon_name.lower())
+
+
+ def get_normalized_scientific_name_from_ensembl(taxonomy_id: int) -> str:
+     """Get the normalised scientific name for that taxon"""
+     return normalise_taxon_scientific_name(get_scientific_name_from_ensembl(taxonomy_id))
+
+
+ def get_scientific_name_from_taxonomy(taxonomy_id: int, api_key: str=None) -> str:
+     """
+     Search for a species scientific name based on the taxonomy id.
+     It first attempts to retrieve the name from Ensembl and then from NCBI; if not found, it returns None.
+     """
+     try:
+         species_name = get_scientific_name_from_ensembl(taxonomy_id)
+     except Exception:
+         logger.warning("Failed to retrieve scientific name in Ensembl for taxonomy id {0}".format(taxonomy_id))
+         species_name = None
+     if not species_name:
+         species_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id, api_key=api_key)
+     return species_name
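To illustrate the lookup order implemented above (Ensembl first, NCBI as a fallback), a small sketch using a well-known taxonomy id; 9606 is the NCBI taxonomy id for Homo sapiens.

    from ebi_eva_common_pyutils.taxonomy.taxonomy import (
        get_normalized_scientific_name_from_ensembl,
        get_scientific_name_from_taxonomy,
    )

    print(get_scientific_name_from_taxonomy(9606))            # expected 'Homo sapiens'
    print(get_normalized_scientific_name_from_ensembl(9606))  # expected 'homo_sapiens'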
File without changes
ebi_eva_common_pyutils/variation/contig_utils.py
@@ -0,0 +1,113 @@
+ # Copyright 2020 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import re
+ import requests
+ from retry import retry
+
+
+ # TODO: Might be a good idea to re-visit this after a production implementation
+ # of the contig-alias resolution project is available
+ @retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3))
+ def resolve_contig_accession_to_chromosome_name(contig_accession, line_limit=100):
+     """
+     Given a Genbank contig accession, get the corresponding chromosome name from the ENA Text API
+     which returns results in an EMBL Flatfile format
+
+     :param contig_accession: Genbank contig accession (ex: CM003032.1)
+     :param line_limit: number of lines to parse in the EMBL Flatfile result to find the chromosome before giving up
+     :return: Chromosome name (ex: 12 when given an accession CM003032.1)
+     """
+     ENA_TEXT_API_URL = "https://www.ebi.ac.uk/ena/browser/api/text/{0}?lineLimit={1}&annotationOnly=true"
+     response = requests.get(ENA_TEXT_API_URL.format(contig_accession, line_limit))
+     response_lines = response.content.decode("utf-8").split("\n")
+     num_lines = len(response_lines)
+
+     features_section_found, source_line_found = False, False
+     chosen_response = []
+     line_index = 0
+     # Look for the "source" feature under the "Features" section in the text response
+     while line_index < num_lines:
+         line = response_lines[line_index]
+         if not (features_section_found or line.lower().startswith("fh key")):
+             line_index += 1
+             continue
+         features_section_found = True
+         # Based on "Data item positions" described here, http://www.insdc.org/files/feature_table.html#3.4.2
+         # the sixth character represents the start of the feature key
+         if not (source_line_found or line[5:].lower().startswith("source")):
+             line_index += 1
+             continue
+         source_line_found = True
+         if line[21:].startswith("/"):
+             assembled_line = line.strip()
+             line_index += 1
+             # Assemble text spread across multiple lines until
+             # we hit the next qualifier (starts with /) or the next section
+             while line_index < num_lines and \
+                     not (response_lines[line_index][21:].startswith("/")
+                          or response_lines[line_index][5:6].strip() != ''):
+                 line = response_lines[line_index]
+                 assembled_line += " " + line[21:].strip()
+                 line_index += 1
+
+             # Fall back to organelle in case of MT/Chloroplast accessions
+             # and the reference notes in case of Linkage Group molecules
+             chosen_response = re.findall('.*/chromosome=".+"', assembled_line) or \
+                 re.findall('.*/organelle=".+"', assembled_line) or \
+                 re.findall('.*/note=".+"', assembled_line)
+
+             # If we have a response to give, no need to continue further
+             # If the sixth character is not empty, we have reached the next feature, so no need to continue further
+             if chosen_response or line[5:6].strip() != '':
+                 break
+         else:
+             line_index += 1
+
+     if not chosen_response:
+         return ""
+
+     return str.split(chosen_response[0], '"')[1].strip()
+
+
+ def is_wgs_accession_format(contig_accession):
+     """
+     Check if a Genbank contig is part of a WGS (Whole Genome Shotgun) sequence
+
+     :param contig_accession: Genbank contig accession (ex: CM003032.1)
+     :return: True if the provided contig is in the WGS format
+     """
+     wgs_prefix = contig_accession[:4]
+     wgs_numeric_suffix = contig_accession[4:].replace(".", "")
+     return str.isalpha(wgs_prefix) and str.isnumeric(wgs_numeric_suffix)
+
+
+ def get_chromosome_name_for_contig_accession(contig_accession):
+     """
+     Given a Genbank contig accession, get the corresponding chromosome name
+
+     :param contig_accession: Genbank contig accession (ex: CM003032.1)
+     :return: Chromosome name (ex: 12 when given an accession CM003032.1)
+     """
+
+     # Don't bother calling the ENA web service to get the chromosome number if the accession is a WGS accession
+     # since the API will proceed to download the entire WGS dataset which can be in hundreds of MBs or even GBs
+     # See https://www.ebi.ac.uk/ena/browser/api/text/AABR07050911.1?lineLimit=100&annotationOnly=true for example
+     if is_wgs_accession_format(contig_accession):
+         return None
+
+     return \
+         resolve_contig_accession_to_chromosome_name(contig_accession, 1000) or \
+         resolve_contig_accession_to_chromosome_name(contig_accession, 10000) or \
+         resolve_contig_accession_to_chromosome_name(contig_accession, 100000)
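A hedged example of the two entry points above, reusing the accessions already quoted in the docstrings and comments: CM003032.1 is resolved through the ENA text API, while a WGS-style accession such as AABR07050911.1 is short-circuited to None to avoid downloading the whole WGS dataset.

    from ebi_eva_common_pyutils.variation.contig_utils import (
        get_chromosome_name_for_contig_accession,
        is_wgs_accession_format,
    )

    print(is_wgs_accession_format('AABR07050911.1'))                   # True, so no ENA call is made
    print(get_chromosome_name_for_contig_accession('AABR07050911.1'))  # None
    print(get_chromosome_name_for_contig_accession('CM003032.1'))      # expected '12', per the docstring example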
ebi_eva_common_pyutils-0.6.17.data/scripts/archive_directory.py
@@ -0,0 +1,114 @@
+ #!python
+ import gzip
+ import shutil
+ import tarfile
+ import os.path
+ from argparse import ArgumentParser
+
+ from ebi_eva_common_pyutils.logger import logging_config
+ from retry import retry
+
+ logger = logging_config.get_logger(__name__)
+
+
+ def make_tarfile(output_filename, source_dir):
+     logger.info(f'Create Final Tar file {output_filename}.')
+     with tarfile.open(output_filename, "w") as tar:
+         tar.add(source_dir, arcname=os.path.basename(source_dir))
+     file_stats = os.stat(output_filename)
+     logger.info(f'{output_filename} completed. File Size in Bytes is {file_stats.st_size}')
+
+
+ def is_compressed(file_path):
+     compressed_ext = ['.gz', '.zip', '.bz', '.tbi', '.csi']
+     for ext in compressed_ext:
+         if file_path.lower().endswith(ext):
+             return True
+     return False
+
+
+ @retry(tries=5, delay=3, backoff=2, logger=logger)
+ def retriable_compress(src_file_path, dest_file_path):
+     MEG = 2 ** 20
+     with open(src_file_path, 'rb') as f_in:
+         with gzip.open(dest_file_path, 'wb') as f_out:
+             shutil.copyfileobj(f_in, f_out, length=16 * MEG)
+
+
+ def matches(name, patterns):
+     return any((pattern for pattern in patterns if pattern in name))
+
+
+ @retry(tries=5, delay=3, backoff=2, logger=logger)
+ def retriable_remove(path):
+     shutil.rmtree(path)
+
+
+ @retry(tries=5, delay=3, backoff=2, logger=logger)
+ def retryable_copy(src_file_path, dest_file_path, **kwargs):
+     if os.path.exists(dest_file_path):
+         os.remove(dest_file_path)
+     shutil.copyfile(src_file_path, dest_file_path, **kwargs)
+
+
+ def archive_directory(source_dir, scratch_dir, destination_dir, filter_patterns=None):
+     """
+     Archive a directory by copying the data it contains to a scratch directory, compressing all files that are not
+     already compressed. Then it creates a tar file in the destination_dir under the name of the original directory.
+     """
+     source_dir_name = os.path.basename(source_dir)
+     logger.info(f'Archive {source_dir_name} from {source_dir}')
+     parent_source_dir = os.path.dirname(source_dir)
+     for base, dirs, files in os.walk(source_dir, topdown=True, followlinks=False):
+         # Filter the downstream directories to exclude those matching the filter patterns
+         filtered_dir = []
+         for d in dirs:
+             if matches(d, filter_patterns):
+                 logger.info(f'Ignore directory {d} because of filters: {filter_patterns}')
+             else:
+                 filtered_dir.append(d)
+         # modify dirs in place so os.walk does not descend into the filtered directories
+         dirs[:] = filtered_dir
+         src_basename = os.path.relpath(base, parent_source_dir)
+         scratch_dest_dir = os.path.join(scratch_dir, src_basename)
+         os.makedirs(scratch_dest_dir, exist_ok=True)
+         for fname in files:
+             src_file_path = os.path.join(base, fname)
+             dest_file_path = os.path.join(scratch_dest_dir, fname)
+             if matches(fname, filter_patterns):
+                 logger.info(f'Ignore file {src_file_path} because of filters: {filter_patterns}')
+                 continue
+             if os.path.islink(src_file_path) or is_compressed(src_file_path):
+                 logger.info(f'Copy {src_file_path}')
+                 retryable_copy(src_file_path, dest_file_path, follow_symlinks=False)
+             else:
+                 logger.info(f'Compress {src_file_path}')
+                 retriable_compress(src_file_path, dest_file_path + '.gz')
+     final_tar_file = os.path.join(destination_dir, source_dir_name + '.tar')
+     scratch_dir_archived = os.path.join(scratch_dir, source_dir_name)
+     make_tarfile(final_tar_file, scratch_dir_archived)
+     logger.info(f'Delete scratch folder {scratch_dir_archived}.')
+     retriable_remove(scratch_dir_archived)
+     logger.info(f'Scratch folder {scratch_dir_archived} deleted.')
+
+
+ def main():
+     parser = ArgumentParser(description='Archive a directory by copying the data it contains to a scratch directory, '
+                                         'compressing all files that are not already compressed. Then it creates a tar '
+                                         'file in the destination_dir under the name of the original directory.')
+     parser.add_argument('--source_dir', required=True, type=str,
+                         help='Base directory you want to archive. All subdirectories will be included.')
+     parser.add_argument('--destination_dir', required=True, type=str,
+                         help='Directory where the archive should be placed at the end of the process.')
+     parser.add_argument('--scratch_dir', required=True, type=str,
+                         help='Directory where the archive will be constructed.')
+     parser.add_argument('--filter_patterns', type=str, nargs='*', default=[],
+                         help='Keywords found in file and directory names that should not be included in the archive.')
+     args = parser.parse_args()
+
+     logging_config.add_stdout_handler()
+     archive_directory(args.source_dir, args.scratch_dir, args.destination_dir, args.filter_patterns)
+
+
+ if __name__ == '__main__':
+     main()
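Finally, a sketch of how this archiving logic might be driven from Python rather than through the installed archive_directory.py script; the paths and filter keywords are placeholders, and the import assumes that the ebi_eva_internal_pyutils/archive_directory.py module listed above (also +114 lines) exposes the same archive_directory function.

    from ebi_eva_internal_pyutils.archive_directory import archive_directory

    # Every file that is not already compressed is gzipped into the scratch area,
    # the resulting tree is tarred to /archives/project_dir.tar, then the scratch copy is removed.
    archive_directory(
        source_dir='/data/project_dir',
        scratch_dir='/scratch/tmp_archive',
        destination_dir='/archives',
        filter_patterns=['tmp', '.snapshot'],  # names containing these keywords are skipped
    )

The same operation is available on the command line as archive_directory.py --source_dir /data/project_dir --scratch_dir /scratch/tmp_archive --destination_dir /archives.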