ebi-eva-common-pyutils 0.6.15__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ebi_eva_common_pyutils/__init__.py +0 -0
- ebi_eva_common_pyutils/assembly/__init__.py +1 -0
- ebi_eva_common_pyutils/assembly/assembly.py +69 -0
- ebi_eva_common_pyutils/assembly_utils.py +91 -0
- ebi_eva_common_pyutils/biosamples_communicators.py +186 -0
- ebi_eva_common_pyutils/command_utils.py +54 -0
- ebi_eva_common_pyutils/common_utils.py +30 -0
- ebi_eva_common_pyutils/config.py +152 -0
- ebi_eva_common_pyutils/contig_alias/__init__.py +0 -0
- ebi_eva_common_pyutils/contig_alias/contig_alias.py +115 -0
- ebi_eva_common_pyutils/ena_utils.py +35 -0
- ebi_eva_common_pyutils/file_utils.py +31 -0
- ebi_eva_common_pyutils/logger.py +150 -0
- ebi_eva_common_pyutils/ncbi_utils.py +117 -0
- ebi_eva_common_pyutils/network_utils.py +64 -0
- ebi_eva_common_pyutils/reference/__init__.py +2 -0
- ebi_eva_common_pyutils/reference/assembly.py +247 -0
- ebi_eva_common_pyutils/reference/sequence.py +101 -0
- ebi_eva_common_pyutils/taxonomy/__init__.py +0 -0
- ebi_eva_common_pyutils/taxonomy/taxonomy.py +60 -0
- ebi_eva_common_pyutils/variation/__init__.py +0 -0
- ebi_eva_common_pyutils/variation/contig_utils.py +113 -0
- ebi_eva_common_pyutils-0.6.15.data/scripts/archive_directory.py +114 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/LICENSE +201 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/METADATA +23 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/RECORD +39 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/WHEEL +5 -0
- ebi_eva_common_pyutils-0.6.15.dist-info/top_level.txt +2 -0
- ebi_eva_internal_pyutils/__init__.py +0 -0
- ebi_eva_internal_pyutils/archive_directory.py +114 -0
- ebi_eva_internal_pyutils/config_utils.py +188 -0
- ebi_eva_internal_pyutils/metadata_utils.py +288 -0
- ebi_eva_internal_pyutils/mongo_utils.py +71 -0
- ebi_eva_internal_pyutils/mongodb/__init__.py +3 -0
- ebi_eva_internal_pyutils/mongodb/mongo_database.py +170 -0
- ebi_eva_internal_pyutils/nextflow/__init__.py +1 -0
- ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py +195 -0
- ebi_eva_internal_pyutils/pg_utils.py +107 -0
- ebi_eva_internal_pyutils/spring_properties.py +294 -0
ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py

@@ -0,0 +1,195 @@
# Copyright 2021 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Rationale for a Nextflow pipeline abstraction
# ---------------------------------------------
# Dynamic pipeline generation
# Abstraction to represent process dependencies
# Unit testability of individual steps without scattering logic between Python and Nextflow
# Ability to combine pipelines

import networkx as nx
import os
from typing import List, Dict, Union

from ebi_eva_common_pyutils.logger import AppLogger
from ebi_eva_common_pyutils.command_utils import run_command_with_output


class NextFlowProcess:

    def __init__(self, process_name: str, command_to_run: str, process_directives: Dict[str, str] = None) -> None:
        """
        Create a Nextflow process
        :rtype: None
        :param process_name: Name of the process - should be a valid identifier - ex: p1_merge
        :type process_name: str
        :param command_to_run: Command to be run - ex: bash -c "echo p1"
        :type command_to_run: str
        :param process_directives: Additional process directives - ex: {"memory": "4GB", "executor": "lsf"}
        :type process_directives: dict
        """
        if not process_name.isidentifier():
            raise ValueError(f"{process_name} is not a valid Nextflow process name")
        self.process_name = process_name
        self.success_flag = f"{self.process_name}_success"
        self.command_to_run = command_to_run
        self.process_directives = process_directives if process_directives else dict()


class NextFlowPipeline(AppLogger):
    def __init__(self, process_dependency_map: Dict[NextFlowProcess, List[NextFlowProcess]] = None) -> None:
        """
        Create a Nextflow pipeline with a process dependency map

        :param process_dependency_map: Map of Nextflow processes and their corresponding dependencies
        - ex: {p3: [p2], p2: [p1]} where p1, p2 and p3 are Nextflow processes that should be executed sequentially
        """
        # Modeling the dependency map as a DiGraph (directed graph) is advantageous
        # in ordering/combining flows and detecting cycles
        self.process_dependency_map = nx.ordered.DiGraph()
        if process_dependency_map:
            self.add_dependencies(process_dependency_map)

    def add_dependencies(self, process_dependency_map: Dict[NextFlowProcess, List[NextFlowProcess]]):
        for process, dependencies in process_dependency_map.items():
            if dependencies:
                for dependency in dependencies:
                    self.add_process_dependency(process, dependency)
            else:
                self.add_process_dependency(process, None)

    def add_process_dependency(self, process: NextFlowProcess, dependency: Union[NextFlowProcess, None]):
        if dependency:
            self.process_dependency_map.add_edge(process, dependency)
            if not nx.dag.is_directed_acyclic_graph(self.process_dependency_map):
                raise ValueError(f"Cycles found in pipeline when adding process {process.process_name} "
                                 f"and its dependency {dependency.process_name}")
        else:
            # If no dependency is specified, the process will just be a single node in the DAG
            self.process_dependency_map.add_node(process)

    def _write_to_pipeline_file(self, workflow_file_path: str):
        with open(workflow_file_path, "a") as pipeline_file_handle:
            pipeline_file_handle.write(self.__str__() + "\n")

    def run_pipeline(self, workflow_file_path: str, nextflow_binary_path: str = 'nextflow',
                     nextflow_config_path: str = None, working_dir: str = ".", resume: bool = False,
                     other_args: dict = None):
        # Remove pipeline file if it already exists
        if os.path.exists(workflow_file_path):
            os.remove(workflow_file_path)
        self._write_to_pipeline_file(workflow_file_path)
        workflow_command = f"cd {working_dir} && {nextflow_binary_path} run {workflow_file_path}"
        workflow_command += f" -c {nextflow_config_path}" if nextflow_config_path else ""
        workflow_command += f" -with-report {workflow_file_path}.report.html"
        workflow_command += f" -with-dag {workflow_file_path}.dag.png"
        workflow_command += " -resume" if resume else ""
        workflow_command += " ".join([f" -{arg} {val}" for arg, val in other_args.items()]) if other_args else ""
        run_command_with_output(f"Running pipeline {workflow_file_path}...", workflow_command)

    @staticmethod
    def join_pipelines(main_pipeline: 'NextFlowPipeline', dependent_pipeline: 'NextFlowPipeline',
                       with_dependencies: bool = True) -> 'NextFlowPipeline':
        """
        Join two pipelines with or without dependencies

        With Dependencies it returns a new pipeline where:
        1) root processes are those of the main pipeline.
        2) final processes are those of the dependent pipeline and
        3) every root process of the dependent pipeline depends on the final processes of the main pipeline.
        Without Dependencies it returns a new pipeline where:
        1) the two pipelines are left independent and
        2) only processes shared by both pipelines (if any) connect them,
        since composing the two dependency graphs merges shared nodes.

        """
        joined_pipeline = NextFlowPipeline()
        # Aggregate dependency maps of both pipelines
        joined_pipeline.process_dependency_map = nx.compose(main_pipeline.process_dependency_map,
                                                            dependent_pipeline.process_dependency_map)
        if with_dependencies:
            for final_process_in_main_pipeline in main_pipeline._get_final_processes():
                for root_process_in_dependent_pipeline in dependent_pipeline._get_root_processes():
                    joined_pipeline.add_process_dependency(root_process_in_dependent_pipeline,
                                                           final_process_in_main_pipeline)
        return joined_pipeline

    def _get_root_processes(self) -> List[NextFlowProcess]:
        # Root processes are those which have no dependencies
        # See https://stackoverflow.com/a/62948641
        roots = []
        for component in nx.weakly_connected_components(self.process_dependency_map):
            subgraph = self.process_dependency_map.subgraph(component)
            roots.extend([n for n, d in subgraph.out_degree() if d == 0])
        return roots

    def _get_final_processes(self) -> List[NextFlowProcess]:
        # Final processes are those which have no other processes depending on them
        # See https://stackoverflow.com/a/62948641
        roots = []
        for component in nx.weakly_connected_components(self.process_dependency_map):
            subgraph = self.process_dependency_map.subgraph(component)
            roots.extend([n for n, d in subgraph.in_degree() if d == 0])
        return roots

    @staticmethod
    def _get_process_repr(process: NextFlowProcess, dependencies: List[NextFlowProcess]) -> str:
        process_directives_str = "\n".join([f"{key}='{value}'" for key, value in process.process_directives.items()])
        input_dependencies = "val flag from true"
        if dependencies:
            input_dependencies = "\n".join([f"val {dependency.success_flag} from {dependency.success_flag}"
                                            for dependency in dependencies])
        return "\n".join(map(str.strip, f"""
        process {process.process_name} {{
        {process_directives_str}
        input:
        {input_dependencies}
        output:
        val true into {process.success_flag}
        script:
        \"\"\"
        {process.command_to_run}
        \"\"\"
        }}""".split("\n")))

    def __str__(self):
        # Order the list of nodes based on the dependency
        # See https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.depth_first_search.dfs_postorder_nodes.html?highlight=dfs_postorder_nodes#networkx.algorithms.traversal.depth_first_search.dfs_postorder_nodes
        ordered_list_of_processes_to_run = list(nx.dfs_postorder_nodes(self.process_dependency_map))
        # Get a Nextflow pipeline representation of each process and its dependencies
        return "\n\n".join([NextFlowPipeline._get_process_repr(process, list(self.process_dependency_map[process]))
                            for process in ordered_list_of_processes_to_run])


class LinearNextFlowPipeline(NextFlowPipeline):
    """
    Simple linear pipeline that supports resumption
    """
    previous_process: NextFlowProcess = None

    def __init__(self, process_list: List[NextFlowProcess] = None):
        dependency_map = {}
        if process_list:
            for index, process in enumerate(process_list):
                dependency_map[process] = [] if index == 0 else [process_list[index - 1]]
        super().__init__(dependency_map)

    def add_process(self, process_name, command_to_run):
        current_process = NextFlowProcess(process_name=process_name, command_to_run=command_to_run)
        self._add_new_process(current_process)

    def _add_new_process(self, current_process):
        super().add_process_dependency(current_process, self.previous_process)
        self.previous_process = current_process
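For orientation, here is a minimal usage sketch of the classes above (a sketch only: the process names, commands, and paths are invented; the import path is the module shown in this diff):

```python
from ebi_eva_internal_pyutils.nextflow.nextflow_pipeline import NextFlowProcess, NextFlowPipeline

# Two illustrative processes; 'load' declares a memory directive and depends on 'validate'
validate = NextFlowProcess(process_name='validate', command_to_run='bash -c "echo validate"')
load = NextFlowProcess(process_name='load', command_to_run='bash -c "echo load"',
                       process_directives={'memory': '4GB'})
pipeline = NextFlowPipeline(process_dependency_map={validate: [], load: [validate]})

# __str__ renders the DAG as Nextflow DSL1 process blocks chained through success flags,
# so 'load' receives "val validate_success from validate_success" as its input
print(pipeline)

# Writes the workflow file, then shells out to the nextflow binary with -with-report and -with-dag
# pipeline.run_pipeline(workflow_file_path='/tmp/demo_pipeline.nf', resume=True)
```

Note that the generated `from`/`into` channel syntax targets Nextflow DSL1, and that cycles are rejected up front by the acyclicity check in `add_process_dependency`.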
ebi_eva_internal_pyutils/pg_utils.py

@@ -0,0 +1,107 @@
# Copyright 2020 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import psycopg2
from ebi_eva_common_pyutils.logger import logging_config as log_cfg

logger = log_cfg.get_logger(__name__)


def get_all_results_for_query(pg_conn, query):
    with get_result_cursor(pg_conn, query) as pg_cursor:
        results = pg_cursor.fetchall()
    return results


def execute_query(pg_conn, query):
    with get_result_cursor(pg_conn, query) as _:
        pg_conn.commit()


def get_result_cursor(pg_conn, query):
    pg_cursor = pg_conn.cursor()
    pg_cursor.execute(query)
    return pg_cursor


def get_pg_connection_handle(dbname, user, host):
    return psycopg2.connect("dbname='{0}' user='{1}' host='{2}'".format(dbname, user, host))


def index_already_exists_on_table(pg_conn, schema_name, table_name, index_columns):
    index_columns_lower_case = list(map(str.lower, index_columns))
    query = """select unnest(column_names) from (
                    select
                        nmsp.nspname as schema_name,
                        t.relname as table_name,
                        i.relname as index_name,
                        array_agg(a.attname) as column_names,
                        count(*) as number_of_columns
                    from
                        pg_class t,
                        pg_class i,
                        pg_index ix,
                        pg_attribute a,
                        pg_namespace nmsp
                    where
                        t.oid = ix.indrelid
                        and i.oid = ix.indexrelid
                        and a.attrelid = t.oid
                        and a.attnum = ANY(ix.indkey)
                        and t.relkind = 'r'
                        and nmsp.oid = t.relnamespace
                        and nmsp.nspname = '{0}'
                        and t.relname = '{1}'
                        and a.attname in ({2})
                    group by
                        schema_name, table_name, index_name
                    order by
                        t.relname,
                        i.relname
               ) temp
               where number_of_columns = {3};
               """.format(schema_name, table_name,
                          ",".join(["'{0}'".format(col) for col in index_columns_lower_case]), len(index_columns))
    results = [result[0] for result in get_all_results_for_query(pg_conn, query)]
    return sorted(results) == index_columns_lower_case


def create_index_on_table(pg_conn, schema_name, table_name, index_columns):
    if index_already_exists_on_table(pg_conn, schema_name, table_name, index_columns):
        logger.info("Index on {0} column(s) on {1}.{2} already exists. Skipping..."
                    .format(",".join(list(map(str.lower, sorted(index_columns)))), schema_name, table_name))
    else:
        query = "create index on {0}.{1} ({2})".format(schema_name, table_name,
                                                       ",".join(list(map(str.lower, sorted(index_columns))))
                                                       )
        logger.info("Building index with query: " + query)
        execute_query(pg_conn, query)
        pg_conn.commit()


def vacuum_analyze_table(pg_conn, schema_name, table_name, columns=()):
    query = "vacuum analyze {0}.{1}".format(schema_name, table_name)
    if columns:
        query += "({0})".format(",".join(columns))
    isolation_level_pre_analyze = pg_conn.isolation_level
    try:
        # This is needed for vacuum analyze to work since it can't work inside transactions!
        pg_conn.set_isolation_level(0)
        logger.info("Vacuum analyze with query: " + query)
        execute_query(pg_conn, query)
    except Exception as ex:
        logger.error(ex)
    finally:
        pg_conn.set_isolation_level(isolation_level_pre_analyze)
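A brief usage sketch of these helpers (the database, schema, table, and column names are invented). Since the helpers interpolate identifiers directly into SQL strings, they are suited to trusted, internally supplied names rather than user input:

```python
from ebi_eva_internal_pyutils.pg_utils import (get_pg_connection_handle, get_all_results_for_query,
                                               create_index_on_table, vacuum_analyze_table)

# No password is embedded in the DSN; psycopg2 falls back to e.g. ~/.pgpass or peer authentication
pg_conn = get_pg_connection_handle('my_database', 'my_user', 'localhost')

rows = get_all_results_for_query(pg_conn, 'select accession, taxonomy from my_schema.my_table')
# No-op if an index covering exactly these columns already exists on the table
create_index_on_table(pg_conn, 'my_schema', 'my_table', ['accession'])
# Temporarily switches the connection to autocommit, since VACUUM cannot run inside a transaction
vacuum_analyze_table(pg_conn, 'my_schema', 'my_table', columns=('accession',))
pg_conn.close()
```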
ebi_eva_internal_pyutils/spring_properties.py

@@ -0,0 +1,294 @@
# Copyright 2022 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from urllib.parse import quote_plus

from ebi_eva_internal_pyutils.config_utils import get_mongo_creds_for_profile, get_accession_pg_creds_for_profile, \
    get_count_service_creds_for_profile, get_properties_from_xml_file, get_variant_load_job_tracker_creds_for_profile


class SpringPropertiesGenerator:
    """
    Class to generate Spring properties for various Spring Batch pipelines.
    These methods can be used to generate complete properties files entirely in Python; alternatively, certain
    properties can be left unfilled and supplied as command-line arguments (e.g. by a NextFlow process).
    """

    def __init__(self, maven_profile, private_settings_file):
        self.maven_profile = maven_profile
        self.private_settings_file = private_settings_file

    @staticmethod
    def _format(*key_value_maps):
        all_params = defaultdict(list)
        for key_value_map in key_value_maps:
            for key in key_value_map:
                if key_value_map[key] is not None:
                    all_params[key.split('.')[0]].append(f'{key}={key_value_map[key]}')
        lines = []
        for key_type in all_params:
            for line in all_params[key_type]:
                lines.append(line)
            lines.append('')

        return '\n'.join(lines)

    @staticmethod
    def _format_str(string, param):
        if param is None:
            return None
        elif not param:
            return ''
        else:
            return string.format(param)

    def _mongo_properties(self):
        mongo_host, mongo_user, mongo_pass = get_mongo_creds_for_profile(
            self.maven_profile, self.private_settings_file)
        username_with_password = (f'{quote_plus(mongo_user)}:{quote_plus(mongo_pass)}@'
                                  if mongo_user is not None and mongo_pass is not None else '')
        return {
            'spring.data.mongodb.uri': f'mongodb://{username_with_password}{mongo_host}/?retryWrites=true&authSource=admin',
        }

    def _variant_load_job_tracker_properties(self):
        variant_url, variant_user, variant_pass = get_variant_load_job_tracker_creds_for_profile(self.maven_profile,
                                                                                                 self.private_settings_file)
        return {
            'job.repository.url': variant_url,
            'job.repository.username': variant_user,
            'job.repository.password': variant_pass,
        }

    def _count_stats_properties(self):
        counts_url, counts_username, counts_password = get_count_service_creds_for_profile(
            self.maven_profile, self.private_settings_file)
        return {
            'eva.count-stats.url': counts_url,
            'eva.count-stats.username': counts_username,
            'eva.count-stats.password': counts_password
        }

    def _common_properties(self, *, read_preference='primary', chunk_size=100, max_pool_size=2):
        """Properties common to all Spring pipelines"""
        props = {
            'spring.datasource.driver-class-name': 'org.postgresql.Driver',
            'spring.datasource.tomcat.max-active': 3,
            'spring.jpa.generate-ddl': 'true',

            'mongodb.read-preference': read_preference,

            'spring.main.web-application-type': 'none',
            'spring.main.allow-bean-definition-overriding': 'true',
            'spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation': 'true',
            'spring.jpa.properties.hibernate.temp.use_jdbc_metadata_defaults': 'false',
            'spring.jpa.database-platform': 'org.hibernate.dialect.PostgreSQL9Dialect',
            'parameters.chunkSize': chunk_size,
            'spring.datasource.hikari.maximum-pool-size': max_pool_size
        }
        merge = {**self._mongo_properties(), **self._count_stats_properties(), **props}
        return merge

    def _common_accessioning_properties(self, assembly_accession, read_preference, chunk_size):
        pg_url, pg_user, pg_pass = get_accession_pg_creds_for_profile(self.maven_profile, self.private_settings_file)
        accession_db = get_properties_from_xml_file(
            self.maven_profile, self.private_settings_file)['eva.accession.mongo.database']
        props = {
            'spring.datasource.url': pg_url,
            'spring.datasource.username': pg_user,
            'spring.datasource.password': pg_pass,
            'spring.data.mongodb.database': accession_db,
            'parameters.assemblyAccession': assembly_accession,
        }

        merge = {**self._common_properties(read_preference=read_preference, chunk_size=chunk_size), **props}
        return merge

    def _common_accessioning_clustering_properties(self, *, assembly_accession, read_preference, chunk_size):
        """Properties common to accessioning and clustering pipelines."""
        props = {
            'accessioning.submitted.categoryId': 'ss',
            'accessioning.clustered.categoryId': 'rs',
            'accessioning.monotonic.ss.blockSize': 100000,
            'accessioning.monotonic.ss.blockStartValue': 5000000000,
            'accessioning.monotonic.ss.nextBlockInterval': 1000000000,
            'accessioning.monotonic.rs.blockSize': 100000,
            'accessioning.monotonic.rs.blockStartValue': 3000000000,
            'accessioning.monotonic.rs.nextBlockInterval': 1000000000,
            # This value is not used but is required to create beans in Java
            'recovery.cutoff.days': 9999999
        }
        merge = {**self._common_accessioning_properties(assembly_accession, read_preference, chunk_size), **props}
        return merge

    def get_accessioning_properties(self, *, target_assembly=None, fasta=None, assembly_report=None,
                                    project_accession=None, aggregation='BASIC', taxonomy_accession=None,
                                    vcf_file='', output_vcf='', chunk_size=100):
        """Properties for accessioning pipeline."""
        return self._format(
            self._common_accessioning_clustering_properties(assembly_accession=target_assembly,
                                                            read_preference='secondaryPreferred',
                                                            chunk_size=chunk_size),
            {
                'spring.batch.job.names': 'CREATE_SUBSNP_ACCESSION_JOB',
                'parameters.assemblyReportUrl': self._format_str('file:{0}', assembly_report),
                'parameters.contigNaming': 'NO_REPLACEMENT',
                'parameters.fasta': fasta,
                'parameters.forceRestart': 'false',
                'parameters.projectAccession': project_accession,
                'parameters.taxonomyAccession': taxonomy_accession,
                'parameters.vcfAggregation': aggregation,
                'parameters.vcf': vcf_file,
                'parameters.outputVcf': output_vcf
            },
        )

    def get_clustering_properties(self, *, read_preference='primary', job_name=None, source_assembly='',
                                  target_assembly='', rs_report_path='', rs_acc_file='', duplicate_rs_acc_file='',
                                  projects='', project_accession='', vcf=''):
        """Properties common to all clustering pipelines, though not all are always used."""
        return self._format(
            self._common_accessioning_clustering_properties(assembly_accession=target_assembly,
                                                            read_preference=read_preference, chunk_size=100),
            {
                'spring.batch.job.names': job_name,
                'parameters.remappedFrom': source_assembly,
                'parameters.projects': projects,
                'parameters.projectAccession': project_accession,
                'parameters.vcf': vcf,
                'parameters.rsReportPath': rs_report_path,
                'parameters.rsAccFile': rs_acc_file,
                'parameters.duplicateRSAccFile': duplicate_rs_acc_file,
            }
        )

    def get_remapping_extraction_properties(self, *, taxonomy=None, source_assembly=None, fasta=None,
                                            assembly_report=None,
                                            projects='', output_folder=None):
        """Properties for remapping extraction pipeline."""
        return self._format(
            self._common_accessioning_properties(assembly_accession=source_assembly,
                                                 read_preference='secondaryPreferred',
                                                 chunk_size=1000),
            {
                'spring.batch.job.names': 'EXPORT_SUBMITTED_VARIANTS_JOB',
                'parameters.taxonomy': taxonomy,
                'parameters.fasta': fasta,
                'parameters.assemblyReportUrl': self._format_str('file:{0}', assembly_report),
                'parameters.projects': projects,
                'parameters.outputFolder': output_folder
            })

    def get_remapping_ingestion_properties(self, *, source_assembly=None, target_assembly=None, vcf=None, load_to=None,
                                           remapping_version=1.0):
        """Properties for remapping ingestion pipeline."""
        return self._format(
            self._common_accessioning_properties(assembly_accession=target_assembly,
                                                 read_preference='secondaryPreferred',
                                                 chunk_size=1000),
            {
                'spring.batch.job.names': 'INGEST_REMAPPED_VARIANTS_FROM_VCF_JOB',
                'parameters.vcf': vcf,
                'parameters.remappedFrom': source_assembly,
                'parameters.loadTo': load_to,
                'parameters.remappingVersion': remapping_version,
            }
        )

    def get_release_properties(self, *, job_name=None, assembly_accession=None, taxonomy_accession=None, fasta=None,
                               assembly_report=None, contig_naming=None, output_folder=None, accessioned_vcf=None,
                               temp_mongo_db=None):
        common_props = self._common_accessioning_properties(assembly_accession=assembly_accession,
                                                            read_preference='secondaryPreferred', chunk_size=1000)
        # For release in Embassy only
        if temp_mongo_db:
            common_props['spring.data.mongodb.database'] = temp_mongo_db
            common_props['mongodb.read-preference'] = 'primaryPreferred'
            # _mongo_properties supplies a single spring.data.mongodb.uri rather than separate
            # host/port/username/password keys, so pop with a default to avoid a KeyError here
            common_props.pop('spring.data.mongodb.host', None)
            common_props.pop('spring.data.mongodb.port', None)
            common_props.pop('spring.data.mongodb.username', None)
            common_props.pop('spring.data.mongodb.password', None)
        return self._format(
            common_props,
            {
                'spring.batch.job.names': job_name,
                'parameters.taxonomyAccession': taxonomy_accession,
                'parameters.contigNaming': contig_naming,
                'parameters.fasta': fasta,
                'parameters.assemblyReportUrl': self._format_str('file:{0}', assembly_report),
                'parameters.outputFolder': output_folder,
                'parameters.accessionedVcf': '' if accessioned_vcf is None else accessioned_vcf,
                'logging.level.uk.ac.ebi.eva.accession.release': 'INFO'
            })

    def _common_eva_pipeline_properties(self, opencga_path, read_preference='secondaryPreferred'):
        files_collection = get_properties_from_xml_file(
            self.maven_profile, self.private_settings_file)['eva.mongo.collections.files']
        annotation_metadata_collection = get_properties_from_xml_file(
            self.maven_profile, self.private_settings_file)['eva.mongo.collections.annotation-metadata']
        annotation_collection = get_properties_from_xml_file(
            self.maven_profile, self.private_settings_file)['eva.mongo.collections.annotations']
        variants_collection = get_properties_from_xml_file(
            self.maven_profile, self.private_settings_file)['eva.mongo.collections.variants']
        job_tracker_properties = self._variant_load_job_tracker_properties()
        props = {
            'spring.profiles.active': 'production,mongo',
            'spring.profiles.include': 'variant-writer-mongo,variant-annotation-mongo',

            'spring.data.mongodb.authentication-mechanism': 'SCRAM-SHA-1',
            'job.repository.driverClassName': 'org.postgresql.Driver',

            'db.collections.variants.name': variants_collection,
            'db.collections.files.name': files_collection,
            'db.collections.annotation-metadata.name': annotation_metadata_collection,
            'db.collections.annotations.name': annotation_collection,

            'app.opencga.path': opencga_path,
            'config.restartability.allow': 'false',
            'config.db.read-preference': read_preference,

            'logging.level.embl.ebi.variation.eva': 'DEBUG',
            'logging.level.org.opencb.opencga': 'DEBUG',
            'logging.level.org.springframework': 'INFO',
        }

        merge = {**self._common_properties(read_preference=read_preference, chunk_size=100), **props,
                 **job_tracker_properties}
        return merge

    def get_accession_import_properties(self, opencga_path, read_preference='secondaryPreferred'):
        return self._format(self._common_eva_pipeline_properties(opencga_path, read_preference))

    def get_variant_load_properties(self, project_accession, study_name, output_dir, annotation_dir, stats_dir,
                                    vep_cache_path, opencga_path, read_preference='secondaryPreferred'):
        return self._format(
            self._common_eva_pipeline_properties(opencga_path, read_preference),
            {
                'annotation.overwrite': False,
                'app.vep.cache.path': vep_cache_path,
                'app.vep.num-forks': 4,
                'app.vep.timeout': 500,
                'config.chunk.size': 200,

                'input.study.id': project_accession,
                'input.study.name': study_name,
                'input.study.type': 'COLLECTION',

                'output.dir': str(output_dir),
                'output.dir.annotation': str(annotation_dir),
                'output.dir.statistics': str(stats_dir),

                'statistics.skip': False
            },
        )
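To close, a hedged usage sketch of the generator (the profile name, file paths, and accessions are placeholders):

```python
from ebi_eva_internal_pyutils.spring_properties import SpringPropertiesGenerator

generator = SpringPropertiesGenerator(maven_profile='production',
                                      private_settings_file='/path/to/eva-maven-settings.xml')
properties_text = generator.get_accessioning_properties(
    target_assembly='GCA_000001405.15',    # placeholder assembly accession
    fasta='/path/to/GCA_000001405.15.fa',  # placeholder paths
    assembly_report='/path/to/assembly_report.txt',
    project_accession='PRJEB00000',        # placeholder project
    taxonomy_accession=9606,
)
# _format omits any key whose value is None, so properties left unfilled here can be
# supplied later as command-line arguments, as the class docstring describes
with open('accessioning.properties', 'w') as prop_file:
    prop_file.write(properties_text)
```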