ebi-eva-common-pyutils 0.6.17__2-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- ebi_eva_common_pyutils/__init__.py +0 -0
- ebi_eva_common_pyutils/assembly/__init__.py +1 -0
- ebi_eva_common_pyutils/assembly/assembly.py +69 -0
- ebi_eva_common_pyutils/assembly_utils.py +91 -0
- ebi_eva_common_pyutils/biosamples_communicators.py +186 -0
- ebi_eva_common_pyutils/command_utils.py +54 -0
- ebi_eva_common_pyutils/common_utils.py +30 -0
- ebi_eva_common_pyutils/config.py +152 -0
- ebi_eva_common_pyutils/contig_alias/__init__.py +0 -0
- ebi_eva_common_pyutils/contig_alias/contig_alias.py +115 -0
- ebi_eva_common_pyutils/ena_utils.py +35 -0
- ebi_eva_common_pyutils/file_utils.py +31 -0
- ebi_eva_common_pyutils/logger.py +150 -0
- ebi_eva_common_pyutils/ncbi_utils.py +117 -0
- ebi_eva_common_pyutils/network_utils.py +64 -0
- ebi_eva_common_pyutils/reference/__init__.py +2 -0
- ebi_eva_common_pyutils/reference/assembly.py +247 -0
- ebi_eva_common_pyutils/reference/sequence.py +101 -0
- ebi_eva_common_pyutils/spreadsheet/__init__.py +0 -0
- ebi_eva_common_pyutils/spreadsheet/metadata_xlsx_utils.py +15 -0
- ebi_eva_common_pyutils/taxonomy/__init__.py +0 -0
- ebi_eva_common_pyutils/taxonomy/taxonomy.py +60 -0
- ebi_eva_common_pyutils/variation/__init__.py +0 -0
- ebi_eva_common_pyutils/variation/contig_utils.py +113 -0
- ebi_eva_common_pyutils-0.6.17.data/scripts/archive_directory.py +114 -0
- ebi_eva_common_pyutils-0.6.17.dist-info/LICENSE +201 -0
- ebi_eva_common_pyutils-0.6.17.dist-info/METADATA +24 -0
- ebi_eva_common_pyutils-0.6.17.dist-info/RECORD +41 -0
- ebi_eva_common_pyutils-0.6.17.dist-info/WHEEL +5 -0
- ebi_eva_common_pyutils-0.6.17.dist-info/top_level.txt +2 -0
- ebi_eva_internal_pyutils/__init__.py +0 -0
- ebi_eva_internal_pyutils/archive_directory.py +114 -0
- ebi_eva_internal_pyutils/config_utils.py +188 -0
- ebi_eva_internal_pyutils/metadata_utils.py +288 -0
- ebi_eva_internal_pyutils/mongo_utils.py +71 -0
- ebi_eva_internal_pyutils/mongodb/__init__.py +3 -0
- ebi_eva_internal_pyutils/mongodb/mongo_database.py +170 -0
- ebi_eva_internal_pyutils/nextflow/__init__.py +1 -0
- ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py +195 -0
- ebi_eva_internal_pyutils/pg_utils.py +107 -0
- ebi_eva_internal_pyutils/spring_properties.py +294 -0
@@ -0,0 +1,288 @@
+# Copyright 2020 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import datetime
+import re
+from urllib.parse import urlsplit
+
+import psycopg2
+
+from ebi_eva_common_pyutils.assembly_utils import is_patch_assembly
+from ebi_eva_internal_pyutils.config_utils import get_metadata_creds_for_profile
+from ebi_eva_common_pyutils.ena_utils import get_scientific_name_and_common_name
+from ebi_eva_common_pyutils.logger import logging_config
+from ebi_eva_common_pyutils.ncbi_utils import get_ncbi_assembly_name_from_term
+from ebi_eva_internal_pyutils.pg_utils import get_result_cursor, get_all_results_for_query, execute_query
+from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_ensembl
+
+logger = logging_config.get_logger(__name__)
+SUPPORTED_ASSEMBLY_TRACKER_TABLE = "evapro.supported_assembly_tracker"
+
+
+def get_metadata_connection_handle(profile, settings_xml_file):
+    pg_url, pg_user, pg_pass = get_metadata_creds_for_profile(profile, settings_xml_file)
+    return psycopg2.connect(urlsplit(pg_url).path, user=pg_user, password=pg_pass)
+
+
+def get_db_conn_for_species(species_db_info):
+    db_name = "dbsnp_{0}".format(species_db_info["dbsnp_build"])
+    pg_conn = psycopg2.connect("dbname='{0}' user='{1}' host='{2}' port={3}".
+                               format(db_name, "dbsnp", species_db_info["pg_host"], species_db_info["pg_port"]))
+    return pg_conn
+
+
+def get_species_info(metadata_connection_handle, dbsnp_species_name="all"):
+    get_species_info_query = "SELECT DISTINCT database_name, scientific_name, dbsnp_build, pg_host, pg_port " \
+                             "FROM dbsnp_ensembl_species.import_progress a " \
+                             "JOIN dbsnp_ensembl_species.dbsnp_build_instance b " \
+                             "ON b.dbsnp_build = a.ebi_pg_dbsnp_build "
+    if dbsnp_species_name != "all":
+        get_species_info_query += "where database_name = '{0}' ".format(dbsnp_species_name)
+    get_species_info_query += "order by database_name"
+
+    pg_cursor = get_result_cursor(metadata_connection_handle, get_species_info_query)
+    species_set = [{"database_name": result[0], "scientific_name": result[1], "dbsnp_build":result[2],
+                    "pg_host":result[3], "pg_port":result[4]}
+                   for result in pg_cursor.fetchall()]
+    pg_cursor.close()
+    return species_set
+
+
+# Get connection information for each Postgres instance of the dbSNP mirror
+def get_dbsnp_mirror_db_info(pg_metadata_dbname, pg_metadata_user, pg_metadata_host):
+    with psycopg2.connect("dbname='{0}' user='{1}' host='{2}'".format(pg_metadata_dbname, pg_metadata_user,
+                                                                      pg_metadata_host)) as pg_conn:
+        dbsnp_mirror_db_info_query = "SELECT * FROM dbsnp_ensembl_species.dbsnp_build_instance"
+        dbsnp_mirror_db_info = [{"dbsnp_build": result[0], "pg_host": result[1], "pg_port": result[2]}
+                                for result in get_all_results_for_query(pg_conn, dbsnp_mirror_db_info_query)]
+    return dbsnp_mirror_db_info
+
+
+def get_taxonomy_code_from_metadata(metadata_connection_handle, taxonomy):
+    """
+    Retrieve an existing taxonomy code registered in the metadata database.
+    """
+    query = f"SELECT DISTINCT t.taxonomy_code FROM taxonomy t WHERE t.taxonomy_id = {taxonomy}"
+    rows = get_all_results_for_query(metadata_connection_handle, query)
+    if len(rows) == 0:
+        return None
+    elif len(rows) > 1:
+        options = ', '.join(rows)
+        raise ValueError(f'More than one possible code for taxonomy {taxonomy} found: {options}')
+    return rows[0][0]
+
+
+def get_assembly_code_from_metadata(metadata_connection_handle, assembly):
+    """
+    Retrieve an existing assembly code registered in the metadata database.
+    """
+    query = f"SELECT DISTINCT assembly_code FROM assembly WHERE assembly_accession='{assembly}'"
+    rows = get_all_results_for_query(metadata_connection_handle, query)
+    if len(rows) == 0:
+        return None
+    elif len(rows) > 1:
+        options = ', '.join([row for row, in rows])
+        raise ValueError(f'More than one possible code for assembly {assembly} found: {options}')
+    return rows[0][0]
+
+
+def build_variant_warehouse_database_name(taxonomy_code, assembly_code):
+    if taxonomy_code and assembly_code:
+        return f'eva_{taxonomy_code}_{assembly_code}'
+    return None
+
+
+def resolve_existing_variant_warehouse_db_name(metadata_connection_handle, assembly, taxonomy):
+    """
+    Retrieve an existing database name by combining the taxonomy_code and assembly code registered in the metadata
+    database.
+    """
+    return build_variant_warehouse_database_name(
+        get_taxonomy_code_from_metadata(metadata_connection_handle, taxonomy),
+        get_assembly_code_from_metadata(metadata_connection_handle, assembly)
+    )
+
+
+# For backward compatibility
+get_variant_warehouse_db_name_from_assembly_and_taxonomy = resolve_existing_variant_warehouse_db_name
+
+
+def get_assembly_code(metadata_connection_handle, assembly, ncbi_api_key=None):
+    assembly_code = get_assembly_code_from_metadata(metadata_connection_handle, assembly)
+    if not assembly_code:
+        assembly_name = get_ncbi_assembly_name_from_term(assembly, api_key=ncbi_api_key)
+        # If the assembly is a patch assembly ex: GRCh37.p8, drop the trailing patch i.e., just return grch37
+        if is_patch_assembly(assembly):
+            assembly_name = re.sub('\\.p[0-9]+$', '', assembly_name.lower())
+        assembly_code = re.sub('[^0-9a-zA-Z]+', '', assembly_name.lower())
+    return assembly_code
+
+
+def build_taxonomy_code(scientific_name):
+    """Given a scientific name like "Zea mays", the corresponding taxonomy code should be zmays"""
+    return scientific_name[0].lower() + re.sub('[^0-9a-zA-Z]+', '', ''.join(scientific_name.split()[1:])).lower()
+
+
+def get_taxonomy_code(metadata_connection_handle, taxonomy):
+    taxonomy_code = get_taxonomy_code_from_metadata(metadata_connection_handle, taxonomy)
+    if not taxonomy_code:
+        scientific_name = get_scientific_name_from_ensembl(taxonomy)
+        taxonomy_code = build_taxonomy_code(scientific_name)
+    return taxonomy_code
+
+
+def resolve_variant_warehouse_db_name(metadata_connection_handle, assembly, taxonomy, ncbi_api_key=None):
+    """
+    Retrieve the database name for this taxonomy/assembly pair whether it exists or not.
+    It will use existing taxonomy code or assembly code if available in the metadata database.
+    """
+    taxonomy_code = get_taxonomy_code(metadata_connection_handle, taxonomy)
+    assembly_code = get_assembly_code(metadata_connection_handle, assembly, ncbi_api_key=ncbi_api_key)
+    return build_variant_warehouse_database_name(taxonomy_code, assembly_code)
+
+
+def insert_new_assembly_and_taxonomy(metadata_connection_handle, assembly_accession, taxonomy_id, eva_species_name=None,
+                                     in_accessioning=True, ncbi_api_key=None):
+    """
+    This script adds new assemblies and taxonomies to EVAPRO.
+    You can also add the assembly with a different taxonomy if you provide the
+    taxonomy parameters. Example taxonomy page:
+    https://www.ebi.ac.uk/ena/data/view/Taxon:9031
+
+    :param assembly_accession: Assembly accession (Example: GCA_000002315.3)
+    :param metadata_connection_handle: Metadata DB connection
+    :param taxonomy_id: Taxonomy id (Example: 9031)
+    :param eva_species_name: EVA species name (Example: chicken).
+    Not required if the taxonomy exists or ENA has a common name available.
+    :param in_accessioning: Flag that this assembly is in the accessioning data store.
+    """
+    # check if assembly is already in EVAPRO, adding it if not
+    assembly_set_id = get_assembly_set_from_metadata(metadata_connection_handle, taxonomy_id, assembly_accession)
+    if assembly_set_id is None:
+        assembly_name = get_ncbi_assembly_name_from_term(assembly_accession, api_key=ncbi_api_key)
+        ensure_taxonomy_is_in_evapro(metadata_connection_handle, taxonomy_id, eva_species_name)
+        assembly_code = get_assembly_code(metadata_connection_handle, assembly_accession)
+        insert_assembly_in_evapro(metadata_connection_handle, taxonomy_id, assembly_accession, assembly_name, assembly_code)
+
+    update_accessioning_status(metadata_connection_handle, assembly_accession, in_accessioning)
+    metadata_connection_handle.commit()
+
+
+def ensure_taxonomy_is_in_evapro(metadata_connection_handle, taxonomy, eva_species_name=None):
+    if is_taxonomy_in_evapro(metadata_connection_handle, taxonomy):
+        logger.debug('Taxonomy {} is already in the database'.format(taxonomy))
+    else:
+        logger.info("Taxonomy {} not present in EVAPRO. Adding taxonomy ...".format(taxonomy))
+        scientific_name, common_name = get_scientific_name_and_common_name(taxonomy)
+        taxonomy_code = build_taxonomy_code(scientific_name)
+        # If a common name cannot be found then we should use the scientific name
+        eva_species_name = eva_species_name or common_name or scientific_name
+        insert_taxonomy(metadata_connection_handle, taxonomy, scientific_name, common_name, taxonomy_code, eva_species_name)
+
+
+def insert_assembly_in_evapro(metadata_connection_handle, taxonomy_id, assembly_accession, assembly_name, assembly_code):
+    cur = metadata_connection_handle.cursor()
+    cur.execute('INSERT INTO evapro.assembly_set(taxonomy_id, assembly_name, assembly_code) VALUES (%s, %s, %s)',
+                (taxonomy_id, assembly_name, assembly_code))
+
+    # get the assembly_set_id that was autogenerated in the row that we just inserted in assembly_set
+    assembly_set_id = get_all_results_for_query(metadata_connection_handle,
+                                                'SELECT assembly_set_id FROM evapro.assembly_set '
+                                                'WHERE taxonomy_id={} and assembly_name=\'{}\' and assembly_code=\'{}\''
+                                                .format(taxonomy_id, assembly_name, assembly_code))[0][0]
+
+    assembly_chain = assembly_accession.split('.')[0]
+    assembly_version = assembly_accession.split('.')[1]
+    cur.execute('INSERT INTO evapro.accessioned_assembly('
+                'assembly_set_id, assembly_accession, assembly_chain, assembly_version) VALUES (%s,%s,%s,%s)',
+                (assembly_set_id, assembly_accession, assembly_chain, assembly_version))
+
+    logger.info('New assembly added with assembly_set_id: {0}'.format(assembly_set_id))
+    return assembly_set_id
+
+
+def update_accessioning_status(metadata_connection_handle, assembly_accession, in_accessioning_flag):
+    cur = metadata_connection_handle.cursor()
+    # Only insert assembly accessions which are NOT already in the assembly_accessioning_store_status table
+    assembly_accessioning_store_insert_query = "INSERT INTO evapro.assembly_accessioning_store_status " \
+                                               "SELECT * FROM (SELECT " \
+                                               "cast('{0}' as text) as assembly_accession" \
+                                               ", cast('{1}' as boolean) as loaded) temp " \
+                                               "WHERE assembly_accession NOT IN " \
+                                               "(SELECT assembly_accession FROM " \
+                                               "evapro.assembly_accessioning_store_status)" \
+                                               .format(assembly_accession, in_accessioning_flag)
+    cur.execute(assembly_accessioning_store_insert_query)
+
+
+def get_assembly_set_from_metadata(metadata_connection_handle, taxonomy, assembly_accession):
+    query = (f"SELECT acc.assembly_set_id "
+             f"FROM evapro.accessioned_assembly acc "
+             f"JOIN assembly_set asm on acc.assembly_set_id = asm.assembly_set_id "
+             f"WHERE assembly_accession='{assembly_accession}' AND taxonomy_id={taxonomy}")
+    rows = get_all_results_for_query(metadata_connection_handle, query)
+
+    if len(rows) == 1:
+        return rows[0][0]
+    elif len(rows) == 0:
+        return None
+    else:
+        raise ValueError('Inconsistent database state: several assembly_set_ids for the same taxonomy ({}) and '
+                         'assembly accession ({}): {}'.format(taxonomy, assembly_accession, rows))
+
+
+def is_taxonomy_in_evapro(metadata_connection_handle, taxonomy_id):
+    taxonomy_query = 'SELECT taxonomy_id FROM evapro.taxonomy WHERE taxonomy_id={}'.format(taxonomy_id)
+    taxonomy_ids_in_evapro = get_all_results_for_query(metadata_connection_handle, taxonomy_query)
+    return len(taxonomy_ids_in_evapro) > 0


+def insert_taxonomy(metadata_connection_handle, taxonomy_id, scientific_name, common_name, taxonomy_code, eva_species_name):
+    if taxonomy_code is None or eva_species_name is None:
+        raise ValueError('Error: taxonomy code ({}) and EVA taxonomy name ({}) are required '
+                         'for inserting a taxonomy'.format(taxonomy_code, eva_species_name))
+    cur = metadata_connection_handle.cursor()
+    cur.execute('INSERT INTO evapro.taxonomy(taxonomy_id, common_name, scientific_name, taxonomy_code, eva_name) '
+                'VALUES (%s, %s, %s, %s, %s)',
+                (taxonomy_id, common_name, scientific_name, taxonomy_code, eva_species_name))
+    logger.info('New taxonomy {} added'.format(taxonomy_id))
+
+
+def add_to_supported_assemblies(metadata_connection_handle, source_of_assembly: str, target_assembly: str,
+                                taxonomy_id: int):
+    today = datetime.date.today().strftime('%Y-%m-%d')
+    # First check if the current assembly is already target - if so don't do anything
+    current_query = (
+        f"SELECT assembly_id FROM {SUPPORTED_ASSEMBLY_TRACKER_TABLE} "
+        f"WHERE taxonomy_id={taxonomy_id} AND current=true;"
+    )
+    results = get_all_results_for_query(metadata_connection_handle, current_query)
+    if len(results) > 0 and results[0][0] == target_assembly:
+        logger.warning(f'Current assembly for taxonomy {taxonomy_id} is already {target_assembly}!')
+        return
+
+    # Deprecate the last current assembly
+    update_query = (
+        f"UPDATE {SUPPORTED_ASSEMBLY_TRACKER_TABLE} "
+        f"SET current=false, end_date='{today}' "
+        f"WHERE taxonomy_id={taxonomy_id} AND current=true;"
+    )
+    execute_query(metadata_connection_handle, update_query)
+
+    # Then insert the new assembly
+    insert_query = (
+        f"INSERT INTO {SUPPORTED_ASSEMBLY_TRACKER_TABLE} "
+        f"(taxonomy_id, source, assembly_id, current, start_date) "
+        f"VALUES({taxonomy_id}, '{source_of_assembly}', '{target_assembly}', true, '{today}');"
+    )
+    execute_query(metadata_connection_handle, insert_query)
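The hunk above (288 added lines, matching ebi_eva_internal_pyutils/metadata_utils.py in the listing by line count) exposes helpers around the EVAPRO Postgres metadata database. A minimal usage sketch follows; the 'production' profile name and the settings.xml path are placeholders, and the accession/taxonomy values are the ones quoted in the function's own docstring:

from ebi_eva_internal_pyutils.metadata_utils import (get_metadata_connection_handle,
                                                     insert_new_assembly_and_taxonomy,
                                                     resolve_variant_warehouse_db_name)

# Hypothetical profile name and credentials file; adjust to your own deployment.
metadata_conn = get_metadata_connection_handle('production', '/path/to/settings.xml')

# Chicken (taxonomy 9031) on assembly GCA_000002315.3, as in the docstring example.
# Returns a name of the form eva_<taxonomy_code>_<assembly_code>, reusing codes already registered in EVAPRO.
db_name = resolve_variant_warehouse_db_name(metadata_conn, 'GCA_000002315.3', 9031)

# Register the assembly and taxonomy in EVAPRO if they are not there yet, then commit.
insert_new_assembly_and_taxonomy(metadata_conn, 'GCA_000002315.3', 9031)
metadata_conn.close()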
@@ -0,0 +1,71 @@
+# Copyright 2020 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pymongo
+from urllib.parse import quote_plus
+
+from pymongo import ReadPreference
+
+from ebi_eva_common_pyutils.command_utils import run_command_with_output
+from ebi_eva_common_pyutils.common_utils import merge_two_dicts
+from ebi_eva_internal_pyutils.config_utils import get_mongo_uri_for_eva_profile, get_mongo_creds_for_profile
+
+
+class MongoConfig:
+    parameters = None
+
+    def __init__(self, **kwargs):
+        self.parameters = kwargs
+        if "port" not in kwargs:
+            self.parameters["port"] = 27017
+        if "host" not in kwargs:
+            self.parameters["host"] = "localhost"
+
+
+def get_mongo_connection_handle(profile: str, settings_xml_file: str,
+                                read_concern: str = "majority",
+                                read_preference: ReadPreference = ReadPreference.PRIMARY,
+                                write_concern: str = "majority") -> pymongo.MongoClient:
+    mongo_connection_uri = get_mongo_uri_for_eva_profile(profile, settings_xml_file)
+    return pymongo.MongoClient(mongo_connection_uri,
+                               readConcernLevel=read_concern,
+                               read_preference=read_preference,
+                               w=write_concern)
+
+
+def get_primary_mongo_connection_handle(profile: str, settings_xml_file: str,
+                                        read_concern: str = "majority",
+                                        read_preference: ReadPreference = ReadPreference.PRIMARY,
+                                        write_concern: str = "majority") -> pymongo.MongoClient:
+    host, username, password = get_mongo_creds_for_profile(profile, settings_xml_file)
+    mongo_connection_uri = "mongodb://{0}:{1}@{2}:{3}/{4}".format(username, quote_plus(password), host,
+                                                                  27017, "admin")
+    return pymongo.MongoClient(mongo_connection_uri,
+                               readConcernLevel=read_concern,
+                               read_preference=read_preference,
+                               w=write_concern)
+
+
+def copy_db_with_config(mongo_source_config: MongoConfig, mongo_destination_config: MongoConfig, mongodump_args: dict,
+                        mongorestore_args: dict):
+    copy_db(merge_two_dicts(mongo_source_config.parameters, mongodump_args),
+            merge_two_dicts(mongo_destination_config.parameters, mongorestore_args))
+
+
+def copy_db(mongodump_args: dict, mongorestore_args: dict):
+    mongodump_args_str = " ".join(["--{0} {1}".format(key, value) for key, value in mongodump_args.items()])
+    mongorestore_args_str = " ".join(["--{0} {1}".format(key, value) for key, value in mongorestore_args.items()])
+    run_command_with_output("Running mongodump", "mongodump " + mongodump_args_str, log_error_stream_to_output=True)
+    run_command_with_output("Running mongorestore", "mongorestore " + mongorestore_args_str,
+                            log_error_stream_to_output=True)
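The 71-line hunk above (by line count, ebi_eva_internal_pyutils/mongo_utils.py) drives mongodump and mongorestore through the command line, turning every configuration key/value pair into a "--key value" flag. A sketch of combining MongoConfig and copy_db_with_config, assuming the MongoDB tools are on the PATH; host names, user names and database names are placeholders:

from ebi_eva_internal_pyutils.mongo_utils import MongoConfig, copy_db_with_config

# Placeholder hosts and databases; every parameter ends up as a "--key value" flag on the tool command line.
source = MongoConfig(host="mongos-source.example.org", username="dump_user",
                     authenticationDatabase="admin", db="eva_fcatus_90")
destination = MongoConfig(host="mongos-dest.example.org", username="restore_user",
                          authenticationDatabase="admin")

# Password flags are omitted in this sketch; in practice the tools prompt for or receive them separately.
# mongodump writes the source database under ./dump, mongorestore loads that directory into the destination.
copy_db_with_config(source, destination,
                    mongodump_args={"out": "dump"},
                    mongorestore_args={"dir": "dump/eva_fcatus_90"})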
@@ -0,0 +1,170 @@
+# Copyright 2021 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import subprocess
+
+from cached_property import cached_property
+from pymongo import MongoClient, uri_parser
+
+from ebi_eva_common_pyutils.command_utils import run_command_with_output
+from ebi_eva_common_pyutils.logger import AppLogger
+
+
+class MongoDatabase(AppLogger):
+    def __init__(self, uri: str, secrets_file: str = None, db_name: str = "admin"):
+        self.uri = uri
+        self.secrets_file = secrets_file
+        self.db_name = db_name
+
+    @cached_property
+    def uri_with_db_name(self):
+        """
+        Return URI with the database name substituted
+        ex:
+        If the URI is mongodb://user@localhost:27017/admin and database name is eva_fcatus_90,
+        then the URI with the database name will be mongodb://user@localhost:27017/eva_fcatus_90?authSource=admin
+        """
+        if self.db_name == "admin":
+            return self.uri
+        uri_components = uri_parser.parse_uri(self.uri)
+        username_component = f"{uri_components['username']}@" if uri_components['username'] else ""
+        # Hack needed to log in to a different DB but retain authentication source
+        # See https://docs.mongodb.com/v4.0/reference/connection-string/#records-database and https://docs.mongodb.com/v4.0/reference/connection-string/#urioption.authSource
+        uri_with_db_name = f"mongodb://{username_component}" + \
+                           ",".join([node + ':' + str(port) for node, port in uri_components['nodelist']]) + \
+                           f"/{self.db_name}"
+        uri_with_db_name += f"?authSource=" \
+                            f"{uri_components['options'].get('authSource', 'admin')}" if self.secrets_file else ""
+        return uri_with_db_name
+
+    @cached_property
+    def mongo_handle(self):
+        if self.secrets_file:
+            with open(self.secrets_file) as secrets_file_handle:
+                mongo_password = secrets_file_handle.read().strip()
+            return MongoClient(self.uri, password=mongo_password)
+        else:
+            return MongoClient(self.uri)
+
+    def __del__(self):
+        self.mongo_handle.close()
+
+    def _get_optional_secrets_file_stdin(self):
+        # Mongodump and restore tools are notorious in displaying clear text passwords
+        # in commands - See https://jira.mongodb.org/browse/TOOLS-1020
+        # A secrets file can be provided to work around it
+        if self.secrets_file:
+            return f" < {self.secrets_file}"
+        return ""
+
+    def drop(self):
+        self.mongo_handle.drop_database(self.db_name)
+
+    def get_collection_names(self):
+        return self.mongo_handle[self.db_name].list_collection_names()
+
+    def get_indexes(self):
+        collection_index_map = {}
+        for collection_name in self.get_collection_names():
+            collection_index_map[collection_name] = self.mongo_handle[self.db_name][collection_name].index_information()
+        return collection_index_map
+
+    def create_index_on_collections(self, collection_index_map):
+        collection_index_map_copy = copy.deepcopy(collection_index_map)
+        for collection_name, index_info_map in collection_index_map_copy.items():
+            # Copy indexes from one collection to another See https://stackoverflow.com/a/51445278
+            for name, index_info in index_info_map.items():
+                index_keys = index_info['key']
+                del (index_info['ns'])
+                del (index_info['v'])
+                del (index_info['key'])
+                if 'background' in index_info:
+                    del (index_info['background'])
+                # Due to https://jira.mongodb.org/browse/SERVER-11064
+                # pre-v3.2 index sort indicators could allow for floats like 1.0
+                # ex: (_id, 1.0) for ascending index on _id or (_id, -1.0) for descending index on _id
+                # Since validation is stricter in database versions newer than v3.3.1, cast sort indicators to int
+                for i, _ in enumerate(index_keys):
+                    index_keys[i] = (index_keys[i][0], int(index_keys[i][1]))
+                self.mongo_handle[self.db_name][collection_name].create_index(index_keys, name=name, **index_info)
+
+    def enable_sharding(self):
+        self.mongo_handle.admin.command({"enableSharding": self.db_name})
+
+    def shard_collections(self, collections_shard_key_map, collections_to_shard):
+        for collection_name in collections_to_shard:
+            shard_key, shard_key_uniqueness_flag = collections_shard_key_map.get(collection_name, (["_id"], True))
+            # Shard key representation in the format {"key1": 1, "key2": 1}
+            shard_key_repr = "{{{0}}}".format(",".join([f'"{attribute}": 1' for attribute in shard_key]))
+            shard_collection_command = f'sh.shardCollection(' \
+                                       f'"{self.db_name}.{collection_name}", ' \
+                                       f'{shard_key_repr}, {str(shard_key_uniqueness_flag).lower()})'
+            sharding_command = f"mongosh --eval '{shard_collection_command}' {self.uri} "
+            sharding_command += self._get_optional_secrets_file_stdin()
+            run_command_with_output(f"Sharding collection {collection_name} in the database {self.uri_with_db_name} "
+                                    f"with key {shard_key_repr}...", sharding_command,
+                                    log_error_stream_to_output=True, )
+
+    def dump_data(self, dump_dir, mongodump_args=None):
+        mongodump_args = " ".join([f"--{arg} {val}"
+                                   for arg, val in mongodump_args.items()]) if mongodump_args else ""
+        mongodump_command = f"mongodump --uri {self.uri_with_db_name} --out {dump_dir} {mongodump_args}" + \
+                            self._get_optional_secrets_file_stdin()
+        try:
+            run_command_with_output("mongodump", mongodump_command, log_error_stream_to_output=True)
+        except subprocess.CalledProcessError as ex:
+            raise Exception("mongodump failed! HINT: Did you forget to provide a secrets file for authentication?")
+
+    def archive_data(self, archive_dir, archive_name="archive", mongodump_args=None):
+        mongodump_args = " ".join([f"--{arg} {val}"
+                                   for arg, val in mongodump_args.items()]) if mongodump_args else ""
+        mongodump_command = f"mongodump --uri {self.uri_with_db_name} --archive={archive_dir}/{archive_name} {mongodump_args}" + \
+                            self._get_optional_secrets_file_stdin()
+        try:
+            run_command_with_output("mongodump", mongodump_command, log_error_stream_to_output=True)
+        except subprocess.CalledProcessError as ex:
+            raise Exception("mongodump failed! HINT: Did you forget to provide a secrets file for authentication?")
+
+    def restore_data(self, dump_dir, mongorestore_args=None):
+        mongorestore_args = " ".join([f"--{arg} {val}"
+                                      for arg, val in mongorestore_args.items()]) if mongorestore_args else ""
+        mongorestore_command = f"mongorestore --uri {self.uri_with_db_name} " \
+                               f"{mongorestore_args} " \
+                               f"--dir {dump_dir} "
+        mongorestore_command += self._get_optional_secrets_file_stdin()
+        try:
+            run_command_with_output("mongorestore", mongorestore_command, log_error_stream_to_output=True)
+        except subprocess.CalledProcessError as ex:
+            raise Exception("mongorestore failed! HINT: Did you forget to provide a secrets file for authentication?")
+
+    def export_data(self, export_directory, mongoexport_args=None):
+        mongoexport_args = " ".join([f"--{arg} {val}"
+                                     for arg, val in mongoexport_args.items()]) if mongoexport_args else ""
+        mongoexport_command = f"mongoexport --uri {self.uri_with_db_name} --out {export_directory} {mongoexport_args}" + \
+                              self._get_optional_secrets_file_stdin()
+        try:
+            run_command_with_output("mongoexport", mongoexport_command, log_error_stream_to_output=True)
+        except subprocess.CalledProcessError as ex:
+            raise Exception("mongoexport failed! HINT: Did you forget to provide a secrets file for authentication?")
+
+    def import_data(self, coll_file_loc, mongoimport_args=None):
+        mongoimport_args = " ".join([f"--{arg} {val}"
+                                     for arg, val in mongoimport_args.items()]) if mongoimport_args else ""
+        mongoimport_command = f"mongoimport --uri {self.uri_with_db_name} --file {coll_file_loc} {mongoimport_args}" + \
+                              self._get_optional_secrets_file_stdin()
+        try:
+            run_command_with_output("mongoimport", mongoimport_command, log_error_stream_to_output=True)
+        except subprocess.CalledProcessError as ex:
+            raise Exception("mongoexport failed! HINT: Did you forget to provide a secrets file for authentication?")
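The 170-line hunk above (by line count, ebi_eva_internal_pyutils/mongodb/mongo_database.py) wraps a single MongoDB database behind pymongo plus the mongodump/mongorestore binaries, reading the password from a secrets file fed to the tools on stdin. A sketch of copying a database between two deployments and replaying its index definitions; the URIs, secrets file paths and the eva_fcatus_90 name (borrowed from the class docstring) are placeholders:

from ebi_eva_internal_pyutils.mongodb.mongo_database import MongoDatabase

# Placeholder URIs and secrets files (each secrets file contains only the MongoDB password).
source_db = MongoDatabase(uri="mongodb://eva_user@mongos-source.example.org:27017/admin",
                          secrets_file="/path/to/source_password.txt", db_name="eva_fcatus_90")
destination_db = MongoDatabase(uri="mongodb://eva_user@mongos-dest.example.org:27017/admin",
                               secrets_file="/path/to/dest_password.txt", db_name="eva_fcatus_90")

source_db.dump_data("/tmp/eva_dump")                        # mongodump --uri ... --out /tmp/eva_dump
destination_db.restore_data("/tmp/eva_dump/eva_fcatus_90")  # mongorestore --uri ... --dir ...
# Recreate the source index definitions on the restored collections.
destination_db.create_index_on_collections(source_db.get_indexes())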
@@ -0,0 +1 @@
+from ebi_eva_internal_pyutils.nextflow.nextflow_pipeline import LinearNextFlowPipeline, NextFlowPipeline, NextFlowProcess