ebi-eva-common-pyutils 0.6.15__1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. ebi_eva_common_pyutils/__init__.py +0 -0
  2. ebi_eva_common_pyutils/assembly/__init__.py +1 -0
  3. ebi_eva_common_pyutils/assembly/assembly.py +69 -0
  4. ebi_eva_common_pyutils/assembly_utils.py +91 -0
  5. ebi_eva_common_pyutils/biosamples_communicators.py +186 -0
  6. ebi_eva_common_pyutils/command_utils.py +54 -0
  7. ebi_eva_common_pyutils/common_utils.py +30 -0
  8. ebi_eva_common_pyutils/config.py +152 -0
  9. ebi_eva_common_pyutils/contig_alias/__init__.py +0 -0
  10. ebi_eva_common_pyutils/contig_alias/contig_alias.py +115 -0
  11. ebi_eva_common_pyutils/ena_utils.py +35 -0
  12. ebi_eva_common_pyutils/file_utils.py +31 -0
  13. ebi_eva_common_pyutils/logger.py +150 -0
  14. ebi_eva_common_pyutils/ncbi_utils.py +117 -0
  15. ebi_eva_common_pyutils/network_utils.py +64 -0
  16. ebi_eva_common_pyutils/reference/__init__.py +2 -0
  17. ebi_eva_common_pyutils/reference/assembly.py +247 -0
  18. ebi_eva_common_pyutils/reference/sequence.py +101 -0
  19. ebi_eva_common_pyutils/taxonomy/__init__.py +0 -0
  20. ebi_eva_common_pyutils/taxonomy/taxonomy.py +60 -0
  21. ebi_eva_common_pyutils/variation/__init__.py +0 -0
  22. ebi_eva_common_pyutils/variation/contig_utils.py +113 -0
  23. ebi_eva_common_pyutils-0.6.15.data/scripts/archive_directory.py +114 -0
  24. ebi_eva_common_pyutils-0.6.15.dist-info/LICENSE +201 -0
  25. ebi_eva_common_pyutils-0.6.15.dist-info/METADATA +23 -0
  26. ebi_eva_common_pyutils-0.6.15.dist-info/RECORD +39 -0
  27. ebi_eva_common_pyutils-0.6.15.dist-info/WHEEL +5 -0
  28. ebi_eva_common_pyutils-0.6.15.dist-info/top_level.txt +2 -0
  29. ebi_eva_internal_pyutils/__init__.py +0 -0
  30. ebi_eva_internal_pyutils/archive_directory.py +114 -0
  31. ebi_eva_internal_pyutils/config_utils.py +188 -0
  32. ebi_eva_internal_pyutils/metadata_utils.py +288 -0
  33. ebi_eva_internal_pyutils/mongo_utils.py +71 -0
  34. ebi_eva_internal_pyutils/mongodb/__init__.py +3 -0
  35. ebi_eva_internal_pyutils/mongodb/mongo_database.py +170 -0
  36. ebi_eva_internal_pyutils/nextflow/__init__.py +1 -0
  37. ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py +195 -0
  38. ebi_eva_internal_pyutils/pg_utils.py +107 -0
  39. ebi_eva_internal_pyutils/spring_properties.py +294 -0
@@ -0,0 +1,288 @@
1
+ # Copyright 2020 EMBL - European Bioinformatics Institute
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import datetime
15
+ import re
16
+ from urllib.parse import urlsplit
17
+
18
+ import psycopg2
19
+
20
+ from ebi_eva_common_pyutils.assembly_utils import is_patch_assembly
21
+ from ebi_eva_internal_pyutils.config_utils import get_metadata_creds_for_profile
22
+ from ebi_eva_common_pyutils.ena_utils import get_scientific_name_and_common_name
23
+ from ebi_eva_common_pyutils.logger import logging_config
24
+ from ebi_eva_common_pyutils.ncbi_utils import get_ncbi_assembly_name_from_term
25
+ from ebi_eva_internal_pyutils.pg_utils import get_result_cursor, get_all_results_for_query, execute_query
26
+ from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_ensembl
27
+
28
+ logger = logging_config.get_logger(__name__)
29
+ SUPPORTED_ASSEMBLY_TRACKER_TABLE = "evapro.supported_assembly_tracker"
30
+
31
+
32
def get_metadata_connection_handle(profile, settings_xml_file):
    """
    Open a psycopg2 connection to the metadata database of a profile.

    :param profile: Profile passed to get_metadata_creds_for_profile
    :param settings_xml_file: Settings XML file passed to get_metadata_creds_for_profile
    :return: The connection returned by psycopg2.connect
    """
    credentials = get_metadata_creds_for_profile(profile, settings_xml_file)
    metadata_url, metadata_user, metadata_password = credentials
    # Only the path component of the URL is handed to psycopg2 as the connection string
    connection_string = urlsplit(metadata_url).path
    return psycopg2.connect(connection_string, user=metadata_user, password=metadata_password)
35
+
36
+
37
def get_db_conn_for_species(species_db_info):
    """
    Open a connection to the dbSNP mirror database described by a species record.

    :param species_db_info: Mapping providing the keys "dbsnp_build", "pg_host" and "pg_port"
    :return: The connection returned by psycopg2.connect
    """
    build = species_db_info["dbsnp_build"]
    host = species_db_info["pg_host"]
    port = species_db_info["pg_port"]
    # The database is named after the build and is always accessed as the "dbsnp" user
    connection_string = f"dbname='dbsnp_{build}' user='dbsnp' host='{host}' port={port}"
    return psycopg2.connect(connection_string)
42
+
43
+
44
def get_species_info(metadata_connection_handle, dbsnp_species_name="all"):
    """
    List the species found in the dbSNP import progress table along with their build and database location.

    :param metadata_connection_handle: Metadata DB connection
    :param dbsnp_species_name: Database name to restrict the query to, or "all" (default) for no restriction
    :return: List of dicts with the keys database_name, scientific_name, dbsnp_build, pg_host and pg_port
    """
    column_names = ("database_name", "scientific_name", "dbsnp_build", "pg_host", "pg_port")
    query_fragments = [
        "SELECT DISTINCT database_name, scientific_name, dbsnp_build, pg_host, pg_port ",
        "FROM dbsnp_ensembl_species.import_progress a ",
        "JOIN dbsnp_ensembl_species.dbsnp_build_instance b ",
        "ON b.dbsnp_build = a.ebi_pg_dbsnp_build ",
    ]
    if dbsnp_species_name != "all":
        query_fragments.append("where database_name = '{0}' ".format(dbsnp_species_name))
    query_fragments.append("order by database_name")

    cursor = get_result_cursor(metadata_connection_handle, "".join(query_fragments))
    species_set = []
    for row in cursor.fetchall():
        # Columns come back in the order of the SELECT list
        species_set.append({name: row[position] for position, name in enumerate(column_names)})
    cursor.close()
    return species_set
59
+
60
+
61
def get_dbsnp_mirror_db_info(pg_metadata_dbname, pg_metadata_user, pg_metadata_host):
    """
    Get connection information for each Postgres instance of the dbSNP mirror.

    :param pg_metadata_dbname: Name of the metadata database to connect to
    :param pg_metadata_user: User to connect as
    :param pg_metadata_host: Host of the metadata database
    :return: List of dicts with the keys dbsnp_build, pg_host and pg_port (first three columns of each row
             of dbsnp_ensembl_species.dbsnp_build_instance)
    """
    pg_conn = psycopg2.connect("dbname='{0}' user='{1}' host='{2}'".format(pg_metadata_dbname, pg_metadata_user,
                                                                           pg_metadata_host))
    try:
        # Using a psycopg2 connection as a context manager only commits or rolls back the transaction,
        # it does not close the connection: hence the explicit close below.
        with pg_conn:
            dbsnp_mirror_db_info_query = "SELECT * FROM dbsnp_ensembl_species.dbsnp_build_instance"
            dbsnp_mirror_db_info = [{"dbsnp_build": result[0], "pg_host": result[1], "pg_port": result[2]}
                                    for result in get_all_results_for_query(pg_conn, dbsnp_mirror_db_info_query)]
    finally:
        pg_conn.close()
    return dbsnp_mirror_db_info
69
+
70
+
71
def get_taxonomy_code_from_metadata(metadata_connection_handle, taxonomy):
    """
    Retrieve an existing taxonomy code registered in the metadata database.

    :param metadata_connection_handle: Metadata DB connection
    :param taxonomy: Taxonomy id to look up
    :return: The taxonomy code, or None when the taxonomy is not registered
    :raises ValueError: When more than one distinct code is registered for the taxonomy
    """
    query = f"SELECT DISTINCT t.taxonomy_code FROM taxonomy t WHERE t.taxonomy_id = {taxonomy}"
    rows = get_all_results_for_query(metadata_connection_handle, query)
    if len(rows) == 0:
        return None
    elif len(rows) > 1:
        # Each row is a one-element tuple: unpack it (and stringify) so that join reports the codes
        # instead of failing with a TypeError
        options = ', '.join(str(code) for code, in rows)
        raise ValueError(f'More than one possible code for taxonomy {taxonomy} found: {options}')
    return rows[0][0]
83
+
84
+
85
def get_assembly_code_from_metadata(metadata_connection_handle, assembly):
    """
    Look up the assembly code already registered in the metadata database for an assembly accession.

    :param metadata_connection_handle: Metadata DB connection
    :param assembly: Assembly accession to look up
    :return: The assembly code, or None when the accession is not registered
    :raises ValueError: When more than one distinct code is registered for the accession
    """
    sql = f"SELECT DISTINCT assembly_code FROM assembly WHERE assembly_accession='{assembly}'"
    matches = get_all_results_for_query(metadata_connection_handle, sql)
    match_count = len(matches)
    if match_count == 0:
        return None
    if match_count > 1:
        # Rows hold a single column each
        options = ', '.join([code for (code,) in matches])
        raise ValueError(f'More than one possible code for assembly {assembly} found: {options}')
    return matches[0][0]
97
+
98
+
99
def build_variant_warehouse_database_name(taxonomy_code, assembly_code):
    """
    Combine a taxonomy code and an assembly code into a database name of the form eva_<taxonomy>_<assembly>.

    :return: The database name, or None when either code is missing or empty
    """
    if not (taxonomy_code and assembly_code):
        return None
    return 'eva_{}_{}'.format(taxonomy_code, assembly_code)
103
+
104
+
105
def resolve_existing_variant_warehouse_db_name(metadata_connection_handle, assembly, taxonomy):
    """
    Build the database name from the taxonomy code and the assembly code that are registered in the
    metadata database.

    :return: The database name, or None when either code is not registered
    """
    registered_taxonomy_code = get_taxonomy_code_from_metadata(metadata_connection_handle, taxonomy)
    registered_assembly_code = get_assembly_code_from_metadata(metadata_connection_handle, assembly)
    return build_variant_warehouse_database_name(registered_taxonomy_code, registered_assembly_code)


# Former name of the function above, kept for backward compatibility
get_variant_warehouse_db_name_from_assembly_and_taxonomy = resolve_existing_variant_warehouse_db_name
118
+
119
+
120
def get_assembly_code(metadata_connection_handle, assembly, ncbi_api_key=None):
    """
    Return the assembly code registered in the metadata database, or derive one from the NCBI assembly name.

    :param metadata_connection_handle: Metadata DB connection
    :param assembly: Assembly accession
    :param ncbi_api_key: Optional API key forwarded to the NCBI assembly name lookup
    :return: The registered code when there is one, otherwise the lower-cased NCBI assembly name stripped of
             every non-alphanumeric character
    """
    registered_code = get_assembly_code_from_metadata(metadata_connection_handle, assembly)
    if registered_code:
        return registered_code

    ncbi_name = get_ncbi_assembly_name_from_term(assembly, api_key=ncbi_api_key)
    if is_patch_assembly(assembly):
        # Patch assemblies (ex: GRCh37.p8) lose their trailing patch suffix, i.e. end up as grch37
        ncbi_name = re.sub('\\.p[0-9]+$', '', ncbi_name.lower())
    return re.sub('[^0-9a-zA-Z]+', '', ncbi_name.lower())
129
+
130
+
131
def build_taxonomy_code(scientific_name):
    """
    Given a scientific name like "Zea mays", the corresponding taxonomy code should be zmays.

    The code is the lower-cased first letter of the first word followed by the remaining words joined
    together, lower-cased and stripped of any non-alphanumeric character.

    :param scientific_name: Scientific name of the species
    :return: The taxonomy code
    :raises IndexError: When the name contains no word at all
    """
    # Work on the whitespace-separated words so that leading whitespace cannot end up in the code
    words = scientific_name.split()
    first_letter = words[0][0].lower()
    remainder = re.sub(r'[^0-9a-zA-Z]+', '', ''.join(words[1:])).lower()
    return first_letter + remainder
134
+
135
+
136
def get_taxonomy_code(metadata_connection_handle, taxonomy):
    """
    Return the taxonomy code registered in the metadata database, or build one from the scientific name
    reported by Ensembl when none is registered.

    :param metadata_connection_handle: Metadata DB connection
    :param taxonomy: Taxonomy id
    :return: The taxonomy code
    """
    registered_code = get_taxonomy_code_from_metadata(metadata_connection_handle, taxonomy)
    if registered_code:
        return registered_code
    return build_taxonomy_code(get_scientific_name_from_ensembl(taxonomy))
142
+
143
+
144
def resolve_variant_warehouse_db_name(metadata_connection_handle, assembly, taxonomy, ncbi_api_key=None):
    """
    Work out the database name of a taxonomy/assembly pair, whether or not it already exists.
    Codes registered in the metadata database are used when available.

    :param metadata_connection_handle: Metadata DB connection
    :param assembly: Assembly accession
    :param taxonomy: Taxonomy id
    :param ncbi_api_key: Optional API key forwarded to get_assembly_code
    :return: The database name, or None when either code could not be determined
    """
    codes = (
        get_taxonomy_code(metadata_connection_handle, taxonomy),
        get_assembly_code(metadata_connection_handle, assembly, ncbi_api_key=ncbi_api_key),
    )
    return build_variant_warehouse_database_name(*codes)
152
+
153
+
154
def insert_new_assembly_and_taxonomy(metadata_connection_handle, assembly_accession, taxonomy_id, eva_species_name=None,
                                     in_accessioning=True, ncbi_api_key=None):
    """
    This script adds new assemblies and taxonomies to EVAPRO.
    You can also add the assembly with a different taxonomy if you provide the
    taxonomy parameters. Example taxonomy page:
    https://www.ebi.ac.uk/ena/data/view/Taxon:9031

    :param assembly_accession: Assembly accession (Example: GCA_000002315.3)
    :param metadata_connection_handle: Metadata DB connection
    :param taxonomy_id: Taxonomy id (Example: 9031)
    :param eva_species_name: EVA species name (Example: chicken).
                             Not required if the taxonomy exists or ENA has a common name available.
    :param in_accessioning: Flag that this assembly is in the accessioning data store.
    :param ncbi_api_key: Optional API key used for every NCBI lookup made while adding the assembly.
    """
    # check if assembly is already in EVAPRO, adding it if not
    assembly_set_id = get_assembly_set_from_metadata(metadata_connection_handle, taxonomy_id, assembly_accession)
    if assembly_set_id is None:
        assembly_name = get_ncbi_assembly_name_from_term(assembly_accession, api_key=ncbi_api_key)
        ensure_taxonomy_is_in_evapro(metadata_connection_handle, taxonomy_id, eva_species_name)
        # Forward the API key: get_assembly_code may query NCBI as well when the code is not registered yet
        assembly_code = get_assembly_code(metadata_connection_handle, assembly_accession, ncbi_api_key=ncbi_api_key)
        insert_assembly_in_evapro(metadata_connection_handle, taxonomy_id, assembly_accession, assembly_name,
                                  assembly_code)

    update_accessioning_status(metadata_connection_handle, assembly_accession, in_accessioning)
    metadata_connection_handle.commit()
179
+
180
+
181
def ensure_taxonomy_is_in_evapro(metadata_connection_handle, taxonomy, eva_species_name=None):
    """
    Insert the taxonomy in EVAPRO unless it is already there.

    :param metadata_connection_handle: Metadata DB connection
    :param taxonomy: Taxonomy id
    :param eva_species_name: Optional EVA species name; when not provided the common name, then the
                             scientific name, is used instead
    """
    if is_taxonomy_in_evapro(metadata_connection_handle, taxonomy):
        logger.debug('Taxonomy {} is already in the database'.format(taxonomy))
        return

    logger.info("Taxonomy {} not present in EVAPRO. Adding taxonomy ...".format(taxonomy))
    scientific_name, common_name = get_scientific_name_and_common_name(taxonomy)
    new_taxonomy_code = build_taxonomy_code(scientific_name)
    # Fall back on the common name, then on the scientific name, when no EVA name was given
    species_label = eva_species_name or common_name or scientific_name
    insert_taxonomy(metadata_connection_handle, taxonomy, scientific_name, common_name, new_taxonomy_code,
                    species_label)
191
+
192
+
193
def insert_assembly_in_evapro(metadata_connection_handle, taxonomy_id, assembly_accession, assembly_name, assembly_code):
    """
    Insert a new assembly set and its accessioned assembly in EVAPRO. Nothing is committed here.

    :param metadata_connection_handle: Metadata DB connection
    :param taxonomy_id: Taxonomy id the assembly belongs to
    :param assembly_accession: Assembly accession made of a chain and a version (Example: GCA_000002315.3)
    :param assembly_name: Name of the assembly
    :param assembly_code: Code of the assembly
    :return: The assembly_set_id generated for the new assembly set
    :raises IndexError: When the accession has no '.' separating chain and version
    """
    # Split first so that a malformed accession fails before anything is written
    accession_parts = assembly_accession.split('.')
    assembly_chain = accession_parts[0]
    assembly_version = accession_parts[1]

    with metadata_connection_handle.cursor() as cur:
        # RETURNING hands back the id generated for this very row, which avoids re-selecting it with
        # string-formatted SQL (breaks on quotes in the name, ambiguous when a similar row already exists)
        cur.execute('INSERT INTO evapro.assembly_set(taxonomy_id, assembly_name, assembly_code) '
                    'VALUES (%s, %s, %s) RETURNING assembly_set_id',
                    (taxonomy_id, assembly_name, assembly_code))
        assembly_set_id = cur.fetchone()[0]

        cur.execute('INSERT INTO evapro.accessioned_assembly('
                    'assembly_set_id, assembly_accession, assembly_chain, assembly_version) VALUES (%s,%s,%s,%s)',
                    (assembly_set_id, assembly_accession, assembly_chain, assembly_version))

    logger.info('New assembly added with assembly_set_id: {0}'.format(assembly_set_id))
    return assembly_set_id
212
+
213
+
214
def update_accessioning_status(metadata_connection_handle, assembly_accession, in_accessioning_flag):
    """
    Record the accessioning status of an assembly unless a status is already recorded for it.
    Nothing is committed here.

    :param metadata_connection_handle: Metadata DB connection
    :param assembly_accession: Assembly accession
    :param in_accessioning_flag: Value stored in the "loaded" column; its string form is cast to boolean
                                 by the database
    """
    # Only insert assembly accessions which are NOT already in the assembly_accessioning_store_status table
    assembly_accessioning_store_insert_query = (
        "INSERT INTO evapro.assembly_accessioning_store_status "
        "SELECT * FROM (SELECT "
        "cast(%s as text) as assembly_accession"
        ", cast(%s as boolean) as loaded) temp "
        "WHERE assembly_accession NOT IN "
        "(SELECT assembly_accession FROM "
        "evapro.assembly_accessioning_store_status)"
    )
    with metadata_connection_handle.cursor() as cur:
        # Values are bound by the driver rather than formatted into the statement. The flag is sent as
        # text so that the cast sees the same input as before ('True' / 'False').
        cur.execute(assembly_accessioning_store_insert_query, (assembly_accession, str(in_accessioning_flag)))
226
+
227
+
228
def get_assembly_set_from_metadata(metadata_connection_handle, taxonomy, assembly_accession):
    """
    Find the assembly set id associated with a taxonomy and an assembly accession.

    :param metadata_connection_handle: Metadata DB connection
    :param taxonomy: Taxonomy id
    :param assembly_accession: Assembly accession
    :return: The assembly_set_id, or None when there is no match
    :raises ValueError: When several assembly sets match
    """
    select_clause = "SELECT acc.assembly_set_id FROM evapro.accessioned_assembly acc "
    join_clause = "JOIN assembly_set asm on acc.assembly_set_id = asm.assembly_set_id "
    where_clause = f"WHERE assembly_accession='{assembly_accession}' AND taxonomy_id={taxonomy}"
    matches = get_all_results_for_query(metadata_connection_handle, select_clause + join_clause + where_clause)

    match_count = len(matches)
    if match_count == 0:
        return None
    if match_count == 1:
        return matches[0][0]
    raise ValueError('Inconsistent database state: several assembly_set_ids for the same taxonomy ({}) and '
                     'assembly accession ({}): {}'.format(taxonomy, assembly_accession, matches))
242
+
243
+
244
def is_taxonomy_in_evapro(metadata_connection_handle, taxonomy_id):
    """
    Tell whether evapro.taxonomy holds at least one row for the taxonomy id.

    :param metadata_connection_handle: Metadata DB connection
    :param taxonomy_id: Taxonomy id
    :return: True when the taxonomy is present, False otherwise
    """
    matches = get_all_results_for_query(
        metadata_connection_handle,
        'SELECT taxonomy_id FROM evapro.taxonomy WHERE taxonomy_id={}'.format(taxonomy_id)
    )
    return len(matches) != 0
248
+
249
+
250
def insert_taxonomy(metadata_connection_handle, taxonomy_id, scientific_name, common_name, taxonomy_code, eva_species_name):
    """
    Insert a row in evapro.taxonomy. Nothing is committed here.

    :param metadata_connection_handle: Metadata DB connection
    :param taxonomy_id: Taxonomy id
    :param scientific_name: Scientific name of the species
    :param common_name: Common name of the species
    :param taxonomy_code: Taxonomy code (required)
    :param eva_species_name: EVA species name (required)
    :raises ValueError: When the taxonomy code or the EVA species name is None
    """
    required_value_missing = taxonomy_code is None or eva_species_name is None
    if required_value_missing:
        raise ValueError('Error: taxonomy code ({}) and EVA taxonomy name ({}) are required '
                         'for inserting a taxonomy'.format(taxonomy_code, eva_species_name))

    insert_statement = ('INSERT INTO evapro.taxonomy(taxonomy_id, common_name, scientific_name, taxonomy_code, eva_name) '
                        'VALUES (%s, %s, %s, %s, %s)')
    row_values = (taxonomy_id, common_name, scientific_name, taxonomy_code, eva_species_name)
    cursor = metadata_connection_handle.cursor()
    cursor.execute(insert_statement, row_values)
    logger.info('New taxonomy {} added'.format(taxonomy_id))
259
+
260
+
261
def add_to_supported_assemblies(metadata_connection_handle, source_of_assembly: str, target_assembly: str,
                                taxonomy_id: int):
    """
    Make target_assembly the current supported assembly of a taxonomy in the supported assembly tracker.

    Nothing is changed when the first current row of the taxonomy already points to target_assembly.
    Otherwise every current row of the taxonomy is closed with today's date and a new current row is added.

    :param metadata_connection_handle: Metadata DB connection
    :param source_of_assembly: Value stored in the "source" column of the new row
    :param target_assembly: Assembly to register as current
    :param taxonomy_id: Taxonomy id
    """
    today = datetime.date.today().strftime('%Y-%m-%d')
    tracker_table = SUPPORTED_ASSEMBLY_TRACKER_TABLE
    current_rows_filter = f"WHERE taxonomy_id={taxonomy_id} AND current=true;"

    # Nothing to do when the target is already the current assembly
    current_rows = get_all_results_for_query(
        metadata_connection_handle,
        f"SELECT assembly_id FROM {tracker_table} " + current_rows_filter
    )
    target_is_current = len(current_rows) > 0 and current_rows[0][0] == target_assembly
    if target_is_current:
        logger.warning(f'Current assembly for taxonomy {taxonomy_id} is already {target_assembly}!')
        return

    # Close the rows that are current so far
    deprecate_statement = f"UPDATE {tracker_table} " + f"SET current=false, end_date='{today}' " + current_rows_filter
    execute_query(metadata_connection_handle, deprecate_statement)

    # Register the target as the new current assembly
    new_row_statement = (
        f"INSERT INTO {tracker_table} "
        "(taxonomy_id, source, assembly_id, current, start_date) "
        f"VALUES({taxonomy_id}, '{source_of_assembly}', '{target_assembly}', true, '{today}');"
    )
    execute_query(metadata_connection_handle, new_row_statement)
@@ -0,0 +1,71 @@
1
+ # Copyright 2020 EMBL - European Bioinformatics Institute
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import pymongo
16
+ from urllib.parse import quote_plus
17
+
18
+ from pymongo import ReadPreference
19
+
20
+ from ebi_eva_common_pyutils.command_utils import run_command_with_output
21
+ from ebi_eva_common_pyutils.common_utils import merge_two_dicts
22
+ from ebi_eva_internal_pyutils.config_utils import get_mongo_uri_for_eva_profile, get_mongo_creds_for_profile
23
+
24
+
25
class MongoConfig:
    """
    Holder of MongoDB tool parameters given as keyword arguments.

    "port" falls back on 27017 and "host" on "localhost" when they are not provided.
    """
    # Set per instance by __init__
    parameters = None

    def __init__(self, **kwargs):
        self.parameters = kwargs
        for option, fallback in (("port", 27017), ("host", "localhost")):
            self.parameters.setdefault(option, fallback)
34
+
35
+
36
def get_mongo_connection_handle(profile: str, settings_xml_file: str,
                                read_concern: str = "majority",
                                read_preference: ReadPreference = ReadPreference.PRIMARY,
                                write_concern: str = "majority") -> pymongo.MongoClient:
    """
    Create a MongoClient from the URI that get_mongo_uri_for_eva_profile provides for a profile.

    :param profile: Profile passed to get_mongo_uri_for_eva_profile
    :param settings_xml_file: Settings XML file passed to get_mongo_uri_for_eva_profile
    :param read_concern: Passed to the client as readConcernLevel
    :param read_preference: Passed to the client as read_preference
    :param write_concern: Passed to the client as w
    :return: The MongoClient
    """
    client_options = {
        "readConcernLevel": read_concern,
        "read_preference": read_preference,
        "w": write_concern,
    }
    profile_uri = get_mongo_uri_for_eva_profile(profile, settings_xml_file)
    return pymongo.MongoClient(profile_uri, **client_options)
45
+
46
+
47
def get_primary_mongo_connection_handle(profile: str, settings_xml_file: str,
                                        read_concern: str = "majority",
                                        read_preference: ReadPreference = ReadPreference.PRIMARY,
                                        write_concern: str = "majority") -> pymongo.MongoClient:
    """
    Create a MongoClient on port 27017 of the host that get_mongo_creds_for_profile provides for a profile,
    authenticating against the admin database.

    :param profile: Profile passed to get_mongo_creds_for_profile
    :param settings_xml_file: Settings XML file passed to get_mongo_creds_for_profile
    :param read_concern: Passed to the client as readConcernLevel
    :param read_preference: Passed to the client as read_preference
    :param write_concern: Passed to the client as w
    :return: The MongoClient
    """
    host, username, password = get_mongo_creds_for_profile(profile, settings_xml_file)
    # Both user name and password have to be percent-escaped to be valid in a MongoDB URI
    mongo_connection_uri = "mongodb://{0}:{1}@{2}:{3}/{4}".format(quote_plus(username), quote_plus(password), host,
                                                                  27017, "admin")
    return pymongo.MongoClient(mongo_connection_uri,
                               readConcernLevel=read_concern,
                               read_preference=read_preference,
                               w=write_concern)
58
+
59
+
60
def copy_db_with_config(mongo_source_config: MongoConfig, mongo_destination_config: MongoConfig, mongodump_args: dict,
                        mongorestore_args: dict):
    """
    Copy a database with copy_db, merging the connection parameters of each configuration with the
    tool-specific arguments.

    :param mongo_source_config: Configuration whose parameters are merged with mongodump_args
    :param mongo_destination_config: Configuration whose parameters are merged with mongorestore_args
    :param mongodump_args: Extra arguments for mongodump
    :param mongorestore_args: Extra arguments for mongorestore
    """
    dump_arguments = merge_two_dicts(mongo_source_config.parameters, mongodump_args)
    restore_arguments = merge_two_dicts(mongo_destination_config.parameters, mongorestore_args)
    copy_db(dump_arguments, restore_arguments)
64
+
65
+
66
def copy_db(mongodump_args: dict, mongorestore_args: dict):
    """
    Run mongodump then mongorestore, each with its arguments rendered as "--name value" pairs.

    :param mongodump_args: Arguments of the mongodump command
    :param mongorestore_args: Arguments of the mongorestore command
    """
    def render_options(tool_args):
        return " ".join("--{0} {1}".format(name, value) for name, value in tool_args.items())

    # Both command lines are built before anything is run
    dump_command = "mongodump " + render_options(mongodump_args)
    restore_command = "mongorestore " + render_options(mongorestore_args)
    run_command_with_output("Running mongodump", dump_command, log_error_stream_to_output=True)
    run_command_with_output("Running mongorestore", restore_command, log_error_stream_to_output=True)
@@ -0,0 +1,3 @@
1
+ from ebi_eva_internal_pyutils.mongodb.mongo_database import MongoDatabase
2
+
3
+
@@ -0,0 +1,170 @@
1
+ # Copyright 2021 EMBL - European Bioinformatics Institute
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ import subprocess
17
+
18
+ from cached_property import cached_property
19
+ from pymongo import MongoClient, uri_parser
20
+
21
+ from ebi_eva_common_pyutils.command_utils import run_command_with_output
22
+ from ebi_eva_common_pyutils.logger import AppLogger
23
+
24
+
25
class MongoDatabase(AppLogger):
    """
    One database of a MongoDB instance, accessed both through a pymongo client and through the mongo
    command line tools (mongodump, mongorestore, mongoexport, mongoimport, mongosh).

    :param uri: MongoDB connection URI
    :param secrets_file: Optional file holding the password. Its content is given to the pymongo client and
                         the file is redirected to the standard input of the command line tools.
    :param db_name: Name of the database to operate on (defaults to "admin")
    """

    def __init__(self, uri: str, secrets_file: str = None, db_name: str = "admin"):
        self.uri = uri
        self.secrets_file = secrets_file
        self.db_name = db_name

    @cached_property
    def uri_with_db_name(self):
        """
        Return URI with the database name substituted
        ex:
        If the URI is mongodb://user@localhost:27017/admin and database name is eva_fcatus_90,
        then the URI with the database name will be mongodb://user@localhost:27017/eva_fcatus_90?authSource=admin
        """
        # Function-scope import so that the class does not rely on a module-level import of quote_plus
        from urllib.parse import quote_plus

        if self.db_name == "admin":
            return self.uri
        uri_components = uri_parser.parse_uri(self.uri)
        # The parsed user name is unescaped: escape it again before putting it back in a URI
        username = uri_components['username']
        username_component = f"{quote_plus(username)}@" if username else ""
        # Hack needed to log in to a different DB but retain authentication source
        # See https://docs.mongodb.com/v4.0/reference/connection-string/#records-database
        # and https://docs.mongodb.com/v4.0/reference/connection-string/#urioption.authSource
        nodes = ",".join([node + ':' + str(port) for node, port in uri_components['nodelist']])
        uri_with_db_name = f"mongodb://{username_component}{nodes}/{self.db_name}"
        if self.secrets_file:
            auth_source = uri_components['options'].get('authSource', 'admin')
            uri_with_db_name += f"?authSource={auth_source}"
        return uri_with_db_name

    @cached_property
    def mongo_handle(self):
        """MongoClient on the URI, given the stripped content of the secrets file as password when there is one."""
        if self.secrets_file:
            with open(self.secrets_file) as secrets_file_handle:
                mongo_password = secrets_file_handle.read().strip()
                return MongoClient(self.uri, password=mongo_password)
        else:
            return MongoClient(self.uri)

    def __del__(self):
        # cached_property keeps its value in the instance __dict__: look there directly so that destroying
        # an instance that never connected does not create a client (and read the secrets file) only to
        # close it, and so that a partially initialised instance does not raise during destruction.
        mongo_client = self.__dict__.get('mongo_handle')
        if mongo_client is not None:
            mongo_client.close()

    def _get_optional_secrets_file_stdin(self):
        # Mongodump and restore tools are notorious in displaying clear text passwords
        # in commands - See https://jira.mongodb.org/browse/TOOLS-1020
        # A secrets file can be provided to work around it
        if self.secrets_file:
            return f" < {self.secrets_file}"
        return ""

    @staticmethod
    def _format_tool_args(tool_args):
        """Render a dict of options as space separated "--name value" pairs ("" when there is no option)."""
        if not tool_args:
            return ""
        return " ".join([f"--{arg} {val}" for arg, val in tool_args.items()])

    def _run_mongo_tool(self, tool_name, command):
        """Run a command line tool with the optional secrets file on stdin, adding a hint to a failure."""
        command += self._get_optional_secrets_file_stdin()
        try:
            run_command_with_output(tool_name, command, log_error_stream_to_output=True)
        except subprocess.CalledProcessError as ex:
            raise Exception(f"{tool_name} failed! "
                            f"HINT: Did you forget to provide a secrets file for authentication?") from ex

    def drop(self):
        """Drop the database."""
        self.mongo_handle.drop_database(self.db_name)

    def get_collection_names(self):
        """Return the names of the collections of the database."""
        return self.mongo_handle[self.db_name].list_collection_names()

    def get_indexes(self):
        """Return a dict mapping each collection name to the index information of that collection."""
        collection_index_map = {}
        for collection_name in self.get_collection_names():
            collection_index_map[collection_name] = self.mongo_handle[self.db_name][collection_name].index_information()
        return collection_index_map

    def create_index_on_collections(self, collection_index_map):
        """
        Create indexes on the collections of the database.

        :param collection_index_map: Dict in the format returned by get_indexes (collection name to index
                                     information). It is deep-copied, the argument is left untouched.
        """
        collection_index_map_copy = copy.deepcopy(collection_index_map)
        for collection_name, index_info_map in collection_index_map_copy.items():
            # Copy indexes from one collection to another See https://stackoverflow.com/a/51445278
            for name, index_info in index_info_map.items():
                index_keys = index_info.pop('key')
                # These entries describe the source index and are not creation options. They are not
                # always present (e.g. "ns" is absent from the output of recent servers).
                for source_only_entry in ('ns', 'v', 'background'):
                    index_info.pop(source_only_entry, None)
                # Due to https://jira.mongodb.org/browse/SERVER-11064
                # pre-v3.2 index sort indicators could allow for floats like 1.0
                # ex: (_id, 1.0) for ascending index on _id or (_id, -1.0) for descending index on _id
                # Since validation is stricter in database versions newer than v3.3.1, cast sort indicators to int
                # Only floats are cast: other index types are given as strings and must be kept as they are
                index_keys = [(field, int(direction) if isinstance(direction, float) else direction)
                              for field, direction in index_keys]
                self.mongo_handle[self.db_name][collection_name].create_index(index_keys, name=name, **index_info)

    def enable_sharding(self):
        """Run the enableSharding admin command for the database."""
        self.mongo_handle.admin.command({"enableSharding": self.db_name})

    def shard_collections(self, collections_shard_key_map, collections_to_shard):
        """
        Shard collections of the database by running sh.shardCollection through mongosh.

        :param collections_shard_key_map: Dict mapping a collection name to a tuple
                                          (list of shard key attributes, uniqueness flag)
        :param collections_to_shard: Names of the collections to shard. Collections missing from the map
                                     are sharded on a unique "_id" key.
        """
        for collection_name in collections_to_shard:
            shard_key, shard_key_uniqueness_flag = collections_shard_key_map.get(collection_name, (["_id"], True))
            # Shard key representation in the format {"key1": 1, "key2": 1}
            shard_key_repr = "{{{0}}}".format(",".join([f'"{attribute}": 1' for attribute in shard_key]))
            shard_collection_command = f'sh.shardCollection(' \
                                       f'"{self.db_name}.{collection_name}", ' \
                                       f'{shard_key_repr}, {str(shard_key_uniqueness_flag).lower()})'
            sharding_command = f"mongosh --eval '{shard_collection_command}' {self.uri} "
            sharding_command += self._get_optional_secrets_file_stdin()
            run_command_with_output(f"Sharding collection {collection_name} in the database {self.uri_with_db_name} "
                                    f"with key {shard_key_repr}...", sharding_command,
                                    log_error_stream_to_output=True)

    def dump_data(self, dump_dir, mongodump_args=None):
        """
        Dump the database in a directory with mongodump.

        :param dump_dir: Value of the --out option
        :param mongodump_args: Optional dict of extra mongodump options
        :raises Exception: When mongodump fails
        """
        mongodump_args = self._format_tool_args(mongodump_args)
        self._run_mongo_tool("mongodump",
                             f"mongodump --uri {self.uri_with_db_name} --out {dump_dir} {mongodump_args}")

    def archive_data(self, archive_dir, archive_name="archive", mongodump_args=None):
        """
        Dump the database in a single archive file with mongodump.

        :param archive_dir: Directory of the archive file
        :param archive_name: Name of the archive file in that directory
        :param mongodump_args: Optional dict of extra mongodump options
        :raises Exception: When mongodump fails
        """
        mongodump_args = self._format_tool_args(mongodump_args)
        self._run_mongo_tool("mongodump",
                             f"mongodump --uri {self.uri_with_db_name} --archive={archive_dir}/{archive_name} "
                             f"{mongodump_args}")

    def restore_data(self, dump_dir, mongorestore_args=None):
        """
        Restore a dump in the database with mongorestore.

        :param dump_dir: Value of the --dir option
        :param mongorestore_args: Optional dict of extra mongorestore options
        :raises Exception: When mongorestore fails
        """
        mongorestore_args = self._format_tool_args(mongorestore_args)
        self._run_mongo_tool("mongorestore",
                             f"mongorestore --uri {self.uri_with_db_name} "
                             f"{mongorestore_args} "
                             f"--dir {dump_dir} ")

    def export_data(self, export_directory, mongoexport_args=None):
        """
        Export data of the database with mongoexport.

        :param export_directory: Value of the --out option
        :param mongoexport_args: Optional dict of extra mongoexport options
        :raises Exception: When mongoexport fails
        """
        mongoexport_args = self._format_tool_args(mongoexport_args)
        self._run_mongo_tool("mongoexport",
                             f"mongoexport --uri {self.uri_with_db_name} --out {export_directory} {mongoexport_args}")

    def import_data(self, coll_file_loc, mongoimport_args=None):
        """
        Import a file in the database with mongoimport.

        :param coll_file_loc: Value of the --file option
        :param mongoimport_args: Optional dict of extra mongoimport options
        :raises Exception: When mongoimport fails
        """
        mongoimport_args = self._format_tool_args(mongoimport_args)
        self._run_mongo_tool("mongoimport",
                             f"mongoimport --uri {self.uri_with_db_name} --file {coll_file_loc} {mongoimport_args}")
@@ -0,0 +1 @@
1
+ from ebi_eva_internal_pyutils.nextflow.nextflow_pipeline import LinearNextFlowPipeline, NextFlowPipeline, NextFlowProcess