ebi-eva-common-pyutils 0.6.15__1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. ebi_eva_common_pyutils/__init__.py +0 -0
  2. ebi_eva_common_pyutils/assembly/__init__.py +1 -0
  3. ebi_eva_common_pyutils/assembly/assembly.py +69 -0
  4. ebi_eva_common_pyutils/assembly_utils.py +91 -0
  5. ebi_eva_common_pyutils/biosamples_communicators.py +186 -0
  6. ebi_eva_common_pyutils/command_utils.py +54 -0
  7. ebi_eva_common_pyutils/common_utils.py +30 -0
  8. ebi_eva_common_pyutils/config.py +152 -0
  9. ebi_eva_common_pyutils/contig_alias/__init__.py +0 -0
  10. ebi_eva_common_pyutils/contig_alias/contig_alias.py +115 -0
  11. ebi_eva_common_pyutils/ena_utils.py +35 -0
  12. ebi_eva_common_pyutils/file_utils.py +31 -0
  13. ebi_eva_common_pyutils/logger.py +150 -0
  14. ebi_eva_common_pyutils/ncbi_utils.py +117 -0
  15. ebi_eva_common_pyutils/network_utils.py +64 -0
  16. ebi_eva_common_pyutils/reference/__init__.py +2 -0
  17. ebi_eva_common_pyutils/reference/assembly.py +247 -0
  18. ebi_eva_common_pyutils/reference/sequence.py +101 -0
  19. ebi_eva_common_pyutils/taxonomy/__init__.py +0 -0
  20. ebi_eva_common_pyutils/taxonomy/taxonomy.py +60 -0
  21. ebi_eva_common_pyutils/variation/__init__.py +0 -0
  22. ebi_eva_common_pyutils/variation/contig_utils.py +113 -0
  23. ebi_eva_common_pyutils-0.6.15.data/scripts/archive_directory.py +114 -0
  24. ebi_eva_common_pyutils-0.6.15.dist-info/LICENSE +201 -0
  25. ebi_eva_common_pyutils-0.6.15.dist-info/METADATA +23 -0
  26. ebi_eva_common_pyutils-0.6.15.dist-info/RECORD +39 -0
  27. ebi_eva_common_pyutils-0.6.15.dist-info/WHEEL +5 -0
  28. ebi_eva_common_pyutils-0.6.15.dist-info/top_level.txt +2 -0
  29. ebi_eva_internal_pyutils/__init__.py +0 -0
  30. ebi_eva_internal_pyutils/archive_directory.py +114 -0
  31. ebi_eva_internal_pyutils/config_utils.py +188 -0
  32. ebi_eva_internal_pyutils/metadata_utils.py +288 -0
  33. ebi_eva_internal_pyutils/mongo_utils.py +71 -0
  34. ebi_eva_internal_pyutils/mongodb/__init__.py +3 -0
  35. ebi_eva_internal_pyutils/mongodb/mongo_database.py +170 -0
  36. ebi_eva_internal_pyutils/nextflow/__init__.py +1 -0
  37. ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py +195 -0
  38. ebi_eva_internal_pyutils/pg_utils.py +107 -0
  39. ebi_eva_internal_pyutils/spring_properties.py +294 -0
@@ -0,0 +1,195 @@
+ # Copyright 2021 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # Rationale for a Nextflow pipeline abstraction
+ # ---------------------------------------------
+ # Dynamic pipeline generation
+ # Abstraction to represent process dependencies
+ # Unit testability of individual steps without scattering logic between Python and Nextflow
+ # Ability to combine pipelines
+
+ import networkx as nx
+ import os
+ from typing import List, Dict, Union
+
+ from ebi_eva_common_pyutils.logger import AppLogger
+ from ebi_eva_common_pyutils.command_utils import run_command_with_output
+
+
+ class NextFlowProcess:
+
+     def __init__(self, process_name: str, command_to_run: str, process_directives: Dict[str, str] = None) -> None:
+         """
+         Create a Nextflow process
+         :rtype: None
+         :param process_name: Name of the process - should be a valid identifier - ex: p1_merge
+         :type process_name: str
+         :param command_to_run: Command to be run - ex: bash -c "echo p1"
+         :type command_to_run: str
+         :param process_directives: Additional process directives - ex: {"memory": "4GB", "executor": "lsf"}
+         :type process_directives: dict
+         """
+         if not process_name.isidentifier():
+             raise ValueError(f"{process_name} is not a valid Nextflow process name")
+         self.process_name = process_name
+         self.success_flag = f"{self.process_name}_success"
+         self.command_to_run = command_to_run
+         self.process_directives = process_directives if process_directives else dict()
+
+
+ class NextFlowPipeline(AppLogger):
+     def __init__(self, process_dependency_map: Dict[NextFlowProcess, List[NextFlowProcess]] = None) -> None:
+         """
+         Create a Nextflow pipeline with a process dependency map
+
+         :param process_dependency_map: Map of Nextflow processes and their corresponding dependencies
+         - ex: {p3: [p2], p2: [p1]} where p1, p2 and p3 are Nextflow processes that should be executed sequentially
+         """
+         # Modeling the dependency map as a DiGraph (directed graph) is advantageous
+         # in ordering/combining flows and detecting cycles
+         self.process_dependency_map = nx.ordered.DiGraph()
+         if process_dependency_map:
+             self.add_dependencies(process_dependency_map)
+
+     def add_dependencies(self, process_dependency_map: Dict[NextFlowProcess, List[NextFlowProcess]]):
+         for process, dependencies in process_dependency_map.items():
+             if dependencies:
+                 for dependency in dependencies:
+                     self.add_process_dependency(process, dependency)
+             else:
+                 self.add_process_dependency(process, None)
+
+     def add_process_dependency(self, process: NextFlowProcess, dependency: Union[NextFlowProcess, None]):
+         if dependency:
+             self.process_dependency_map.add_edge(process, dependency)
+             if not nx.dag.is_directed_acyclic_graph(self.process_dependency_map):
+                 raise ValueError(f"Cycles found in pipeline when adding process {process.process_name} "
+                                  f"and its dependency {dependency.process_name}")
+         else:
+             # If no dependency is specified, the process will just be a single node in the DAG
+             self.process_dependency_map.add_node(process)
+
+     def _write_to_pipeline_file(self, workflow_file_path: str):
+         with open(workflow_file_path, "a") as pipeline_file_handle:
+             pipeline_file_handle.write(self.__str__() + "\n")
+
+     def run_pipeline(self, workflow_file_path: str, nextflow_binary_path: str = 'nextflow',
+                      nextflow_config_path: str = None, working_dir: str = ".", resume: bool = False,
+                      other_args: dict = None):
+         # Remove the pipeline file if it already exists
+         if os.path.exists(workflow_file_path):
+             os.remove(workflow_file_path)
+         self._write_to_pipeline_file(workflow_file_path)
+         workflow_command = f"cd {working_dir} && {nextflow_binary_path} run {workflow_file_path}"
+         workflow_command += f" -c {nextflow_config_path}" if nextflow_config_path else ""
+         workflow_command += f" -with-report {workflow_file_path}.report.html"
+         workflow_command += f" -with-dag {workflow_file_path}.dag.png"
+         workflow_command += " -resume" if resume else ""
+         workflow_command += " ".join([f" -{arg} {val}" for arg, val in other_args.items()]) if other_args else ""
+         run_command_with_output(f"Running pipeline {workflow_file_path}...", workflow_command)
+
+     @staticmethod
+     def join_pipelines(main_pipeline: 'NextFlowPipeline', dependent_pipeline: 'NextFlowPipeline',
+                        with_dependencies: bool = True) -> 'NextFlowPipeline':
+         """
+         Join two pipelines with or without dependencies
+
+         With dependencies, it returns a new pipeline where:
+         1) root processes are those of the main pipeline,
+         2) final processes are those of the dependent pipeline, and
+         3) every root process of the dependent pipeline depends on the final processes of the main pipeline.
+         Without dependencies, it returns a new pipeline where:
+         1) the two pipelines are left independent,
+         2) only processes shared by both pipelines are merged, and
+         3) no new dependencies are introduced between the two pipelines.
+
+         """
+         joined_pipeline = NextFlowPipeline()
+         # Aggregate dependency maps of both pipelines
+         joined_pipeline.process_dependency_map = nx.compose(main_pipeline.process_dependency_map,
+                                                             dependent_pipeline.process_dependency_map)
+         if with_dependencies:
+             for final_process_in_main_pipeline in main_pipeline._get_final_processes():
+                 for root_process_in_dependent_pipeline in dependent_pipeline._get_root_processes():
+                     joined_pipeline.add_process_dependency(root_process_in_dependent_pipeline,
+                                                            final_process_in_main_pipeline)
+         return joined_pipeline
+
+     def _get_root_processes(self) -> List[NextFlowProcess]:
+         # Root processes are those which have no dependencies
+         # See https://stackoverflow.com/a/62948641
+         roots = []
+         for component in nx.weakly_connected_components(self.process_dependency_map):
+             subgraph = self.process_dependency_map.subgraph(component)
+             roots.extend([n for n, d in subgraph.out_degree() if d == 0])
+         return roots
+
+     def _get_final_processes(self) -> List[NextFlowProcess]:
+         # Final processes are those which have no other processes depending on them
+         # See https://stackoverflow.com/a/62948641
+         final_processes = []
+         for component in nx.weakly_connected_components(self.process_dependency_map):
+             subgraph = self.process_dependency_map.subgraph(component)
+             final_processes.extend([n for n, d in subgraph.in_degree() if d == 0])
+         return final_processes
+
+     @staticmethod
+     def _get_process_repr(process: NextFlowProcess, dependencies: List[NextFlowProcess]) -> str:
+         process_directives_str = "\n".join([f"{key}='{value}'" for key, value in process.process_directives.items()])
+         input_dependencies = "val flag from true"
+         if dependencies:
+             input_dependencies = "\n".join([f"val {dependency.success_flag} from {dependency.success_flag}"
+                                             for dependency in dependencies])
+         return "\n".join(map(str.strip, f"""
+         process {process.process_name} {{
+         {process_directives_str}
+         input:
+         {input_dependencies}
+         output:
+         val true into {process.success_flag}
+         script:
+         \"\"\"
+         {process.command_to_run}
+         \"\"\"
+         }}""".split("\n")))
+
+     def __str__(self):
+         # Order the list of processes based on their dependencies
+         # See https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.depth_first_search.dfs_postorder_nodes.html?highlight=dfs_postorder_nodes#networkx.algorithms.traversal.depth_first_search.dfs_postorder_nodes
+         ordered_list_of_processes_to_run = list(nx.dfs_postorder_nodes(self.process_dependency_map))
+         # Get a Nextflow pipeline representation of each process and its dependencies
+         return "\n\n".join([NextFlowPipeline._get_process_repr(process, list(self.process_dependency_map[process]))
+                             for process in ordered_list_of_processes_to_run])
+
+
+ class LinearNextFlowPipeline(NextFlowPipeline):
+     """
+     Simple linear pipeline that supports resumption
+     """
+     previous_process: NextFlowProcess = None
+
+     def __init__(self, process_list: List[NextFlowProcess] = None):
+         dependency_map = {}
+         if process_list:
+             for index, process in enumerate(process_list):
+                 dependency_map[process] = [] if index == 0 else [process_list[index - 1]]
+         super().__init__(dependency_map)
+
+     def add_process(self, process_name, command_to_run):
+         current_process = NextFlowProcess(process_name=process_name, command_to_run=command_to_run)
+         self._add_new_process(current_process)
+
+     def _add_new_process(self, current_process):
+         super().add_process_dependency(current_process, self.previous_process)
+         self.previous_process = current_process
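
For orientation, here is a minimal usage sketch of the classes above. It is illustrative only and not part of the package: the process names, commands, and file path are placeholders, and actually invoking run_pipeline() assumes a nextflow binary on the PATH.

# Illustrative sketch: build a two-step pipeline where p2_load depends on p1_extract.
from ebi_eva_internal_pyutils.nextflow.nextflow_pipeline import NextFlowPipeline, NextFlowProcess

p1 = NextFlowProcess(process_name='p1_extract', command_to_run='bash -c "echo extract"')
p2 = NextFlowProcess(process_name='p2_load', command_to_run='bash -c "echo load"',
                     process_directives={'memory': '4GB'})
pipeline = NextFlowPipeline(process_dependency_map={p2: [p1]})

# Inspect the generated Nextflow workflow without running it: p1_extract is rendered
# first, and p2_load takes p1_extract's success flag as input.
print(pipeline)

# Writes the workflow to the given file and invokes the nextflow binary:
# pipeline.run_pipeline(workflow_file_path='/tmp/demo_pipeline.nf', resume=True)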
@@ -0,0 +1,107 @@
+ # Copyright 2020 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+ import psycopg2
+ from ebi_eva_common_pyutils.logger import logging_config as log_cfg
+
+ logger = log_cfg.get_logger(__name__)
+
+
+ def get_all_results_for_query(pg_conn, query):
+     with get_result_cursor(pg_conn, query) as pg_cursor:
+         results = pg_cursor.fetchall()
+         return results
+
+
+ def execute_query(pg_conn, query):
+     with get_result_cursor(pg_conn, query) as _:
+         pg_conn.commit()
+
+
+ def get_result_cursor(pg_conn, query):
+     pg_cursor = pg_conn.cursor()
+     pg_cursor.execute(query)
+     return pg_cursor
+
+
+ def get_pg_connection_handle(dbname, user, host):
+     return psycopg2.connect("dbname='{0}' user='{1}' host='{2}'".format(dbname, user, host))
+
+
+ def index_already_exists_on_table(pg_conn, schema_name, table_name, index_columns):
+     index_columns_lower_case = list(map(str.lower, index_columns))
+     query = """select unnest(column_names) from (
+                 select
+                     nmsp.nspname as schema_name,
+                     t.relname as table_name,
+                     i.relname as index_name,
+                     array_agg(a.attname) as column_names,
+                     count(*) as number_of_columns
+                 from
+                     pg_class t,
+                     pg_class i,
+                     pg_index ix,
+                     pg_attribute a,
+                     pg_namespace nmsp
+                 where
+                     t.oid = ix.indrelid
+                     and i.oid = ix.indexrelid
+                     and a.attrelid = t.oid
+                     and a.attnum = ANY(ix.indkey)
+                     and t.relkind = 'r'
+                     and nmsp.oid = t.relnamespace
+                     and nmsp.nspname = '{0}'
+                     and t.relname = '{1}'
+                     and a.attname in ({2})
+                 group by
+                     schema_name, table_name, index_name
+                 order by
+                     t.relname,
+                     i.relname
+                 ) temp
+                 where number_of_columns = {3};
+             """.format(schema_name, table_name,
+                        ",".join(["'{0}'".format(col) for col in index_columns_lower_case]), len(index_columns))
+     results = [result[0] for result in get_all_results_for_query(pg_conn, query)]
+     return sorted(results) == index_columns_lower_case
+
+
+ def create_index_on_table(pg_conn, schema_name, table_name, index_columns):
+     if index_already_exists_on_table(pg_conn, schema_name, table_name, index_columns):
+         logger.info("Index on {0} column(s) on {1}.{2} already exists. Skipping..."
+                     .format(",".join(list(map(str.lower, sorted(index_columns)))), schema_name, table_name))
+     else:
+         query = "create index on {0}.{1} ({2})".format(schema_name, table_name,
+                                                        ",".join(list(map(str.lower, sorted(index_columns))))
+                                                        )
+         logger.info("Building index with query: " + query)
+         execute_query(pg_conn, query)
+         pg_conn.commit()
+
+
+ def vacuum_analyze_table(pg_conn, schema_name, table_name, columns=()):
+     query = "vacuum analyze {0}.{1}".format(schema_name, table_name)
+     if columns:
+         query += "({0})".format(",".join(columns))
+     isolation_level_pre_analyze = pg_conn.isolation_level
+     try:
+         # This is needed for vacuum analyze to work since it can't work inside transactions!
+         pg_conn.set_isolation_level(0)
+         logger.info("Vacuum analyze with query: " + query)
+         execute_query(pg_conn, query)
+     except Exception as ex:
+         logger.error(ex)
+     finally:
+         pg_conn.set_isolation_level(isolation_level_pre_analyze)
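
A minimal sketch of how these helpers compose, based on the signatures above. It is illustrative only: the database, user, host, schema, and table names are placeholders, and it assumes a reachable PostgreSQL server with psycopg2 installed.

# Illustrative sketch: connection details and table names are placeholders.
from ebi_eva_internal_pyutils.pg_utils import (get_pg_connection_handle, get_all_results_for_query,
                                               create_index_on_table, vacuum_analyze_table)

pg_conn = get_pg_connection_handle('eva_db', 'eva_user', 'pg-host.example.org')
try:
    # Builds the index only if one covering the same columns does not already exist
    create_index_on_table(pg_conn, 'eva_schema', 'submitted_variants', ['accession'])
    # Temporarily switches to autocommit (isolation level 0) because VACUUM cannot run in a transaction
    vacuum_analyze_table(pg_conn, 'eva_schema', 'submitted_variants')
    rows = get_all_results_for_query(pg_conn, 'select count(*) from eva_schema.submitted_variants')
finally:
    pg_conn.close()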
@@ -0,0 +1,294 @@
+ # Copyright 2022 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from collections import defaultdict
+ from urllib.parse import quote_plus
+
+ from ebi_eva_internal_pyutils.config_utils import get_mongo_creds_for_profile, get_accession_pg_creds_for_profile, \
+     get_count_service_creds_for_profile, get_properties_from_xml_file, get_variant_load_job_tracker_creds_for_profile
+
+
+ class SpringPropertiesGenerator:
+     """
+     Class to generate Spring properties for various Spring Batch pipelines.
+     These methods can be used to generate complete properties files entirely in Python; alternatively, certain
+     properties can be left unfilled and supplied as command-line arguments (e.g. by a Nextflow process).
+     """
+
+     def __init__(self, maven_profile, private_settings_file):
+         self.maven_profile = maven_profile
+         self.private_settings_file = private_settings_file
+
+     @staticmethod
+     def _format(*key_value_maps):
+         all_params = defaultdict(list)
+         for key_value_map in key_value_maps:
+             for key in key_value_map:
+                 if key_value_map[key] is not None:
+                     all_params[key.split('.')[0]].append(f'{key}={key_value_map[key]}')
+         lines = []
+         for key_type in all_params:
+             for line in all_params[key_type]:
+                 lines.append(line)
+             lines.append('')
+
+         return '\n'.join(lines)
+
+     @staticmethod
+     def _format_str(string, param):
+         if param is None:
+             return None
+         elif not param:
+             return ''
+         else:
+             return string.format(param)
+
+     def _mongo_properties(self):
+         mongo_host, mongo_user, mongo_pass = get_mongo_creds_for_profile(
+             self.maven_profile, self.private_settings_file)
+         username_with_password = (f'{quote_plus(mongo_user)}:{quote_plus(mongo_pass)}@'
+                                   if mongo_user is not None and mongo_pass is not None else '')
+         return {
+             'spring.data.mongodb.uri': f'mongodb://{username_with_password}{mongo_host}/?retryWrites=true&authSource=admin',
+         }
+
+     def _variant_load_job_tracker_properties(self):
+         variant_url, variant_user, variant_pass = get_variant_load_job_tracker_creds_for_profile(self.maven_profile,
+                                                                                                  self.private_settings_file)
+         return {
+             'job.repository.url': variant_url,
+             'job.repository.username': variant_user,
+             'job.repository.password': variant_pass,
+         }
+
+     def _count_stats_properties(self):
+         counts_url, counts_username, counts_password = get_count_service_creds_for_profile(
+             self.maven_profile, self.private_settings_file)
+         return {
+             'eva.count-stats.url': counts_url,
+             'eva.count-stats.username': counts_username,
+             'eva.count-stats.password': counts_password
+         }
+
+     def _common_properties(self, *, read_preference='primary', chunk_size=100, max_pool_size=2):
+         """Properties common to all Spring pipelines"""
+         props = {
+             'spring.datasource.driver-class-name': 'org.postgresql.Driver',
+             'spring.datasource.tomcat.max-active': 3,
+             'spring.jpa.generate-ddl': 'true',
+
+             'mongodb.read-preference': read_preference,
+
+             'spring.main.web-application-type': 'none',
+             'spring.main.allow-bean-definition-overriding': 'true',
+             'spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation': 'true',
+             'spring.jpa.properties.hibernate.temp.use_jdbc_metadata_defaults': 'false',
+             'spring.jpa.database-platform': 'org.hibernate.dialect.PostgreSQL9Dialect',
+             'parameters.chunkSize': chunk_size,
+             'spring.datasource.hikari.maximum-pool-size': max_pool_size
+         }
+         merge = {**self._mongo_properties(), **self._count_stats_properties(), **props}
+         return merge
+
+     def _common_accessioning_properties(self, assembly_accession, read_preference, chunk_size):
+         pg_url, pg_user, pg_pass = get_accession_pg_creds_for_profile(self.maven_profile, self.private_settings_file)
+         accession_db = get_properties_from_xml_file(
+             self.maven_profile, self.private_settings_file)['eva.accession.mongo.database']
+         props = {
+             'spring.datasource.url': pg_url,
+             'spring.datasource.username': pg_user,
+             'spring.datasource.password': pg_pass,
+             'spring.data.mongodb.database': accession_db,
+             'parameters.assemblyAccession': assembly_accession,
+         }
+
+         merge = {**self._common_properties(read_preference=read_preference, chunk_size=chunk_size), **props}
+         return merge
+
+     def _common_accessioning_clustering_properties(self, *, assembly_accession, read_preference, chunk_size):
+         """Properties common to accessioning and clustering pipelines."""
+         props = {
+             'accessioning.submitted.categoryId': 'ss',
+             'accessioning.clustered.categoryId': 'rs',
+             'accessioning.monotonic.ss.blockSize': 100000,
+             'accessioning.monotonic.ss.blockStartValue': 5000000000,
+             'accessioning.monotonic.ss.nextBlockInterval': 1000000000,
+             'accessioning.monotonic.rs.blockSize': 100000,
+             'accessioning.monotonic.rs.blockStartValue': 3000000000,
+             'accessioning.monotonic.rs.nextBlockInterval': 1000000000,
+             # This value is not used but is required to create beans in Java
+             'recovery.cutoff.days': 9999999
+         }
+         merge = {**self._common_accessioning_properties(assembly_accession, read_preference, chunk_size), **props}
+         return merge
+
+     def get_accessioning_properties(self, *, target_assembly=None, fasta=None, assembly_report=None,
+                                     project_accession=None, aggregation='BASIC', taxonomy_accession=None,
+                                     vcf_file='', output_vcf='', chunk_size=100):
+         """Properties for the accessioning pipeline."""
+         return self._format(
+             self._common_accessioning_clustering_properties(assembly_accession=target_assembly,
+                                                             read_preference='secondaryPreferred',
+                                                             chunk_size=chunk_size),
+             {
+                 'spring.batch.job.names': 'CREATE_SUBSNP_ACCESSION_JOB',
+                 'parameters.assemblyReportUrl': self._format_str('file:{0}', assembly_report),
+                 'parameters.contigNaming': 'NO_REPLACEMENT',
+                 'parameters.fasta': fasta,
+                 'parameters.forceRestart': 'false',
+                 'parameters.projectAccession': project_accession,
+                 'parameters.taxonomyAccession': taxonomy_accession,
+                 'parameters.vcfAggregation': aggregation,
+                 'parameters.vcf': vcf_file,
+                 'parameters.outputVcf': output_vcf
+             },
+         )
+
+     def get_clustering_properties(self, *, read_preference='primary', job_name=None, source_assembly='',
+                                   target_assembly='', rs_report_path='', rs_acc_file='', duplicate_rs_acc_file='',
+                                   projects='', project_accession='', vcf=''):
+         """Properties common to all clustering pipelines, though not all are always used."""
+         return self._format(
+             self._common_accessioning_clustering_properties(assembly_accession=target_assembly,
+                                                             read_preference=read_preference, chunk_size=100),
+             {
+                 'spring.batch.job.names': job_name,
+                 'parameters.remappedFrom': source_assembly,
+                 'parameters.projects': projects,
+                 'parameters.projectAccession': project_accession,
+                 'parameters.vcf': vcf,
+                 'parameters.rsReportPath': rs_report_path,
+                 'parameters.rsAccFile': rs_acc_file,
+                 'parameters.duplicateRSAccFile': duplicate_rs_acc_file,
+             }
+         )
+
+     def get_remapping_extraction_properties(self, *, taxonomy=None, source_assembly=None, fasta=None,
+                                             assembly_report=None,
+                                             projects='', output_folder=None):
+         """Properties for the remapping extraction pipeline."""
+         return self._format(
+             self._common_accessioning_properties(assembly_accession=source_assembly,
+                                                  read_preference='secondaryPreferred',
+                                                  chunk_size=1000),
+             {
+                 'spring.batch.job.names': 'EXPORT_SUBMITTED_VARIANTS_JOB',
+                 'parameters.taxonomy': taxonomy,
+                 'parameters.fasta': fasta,
+                 'parameters.assemblyReportUrl': self._format_str('file:{0}', assembly_report),
+                 'parameters.projects': projects,
+                 'parameters.outputFolder': output_folder
+             })
+
+     def get_remapping_ingestion_properties(self, *, source_assembly=None, target_assembly=None, vcf=None, load_to=None,
+                                            remapping_version=1.0):
+         """Properties for the remapping ingestion pipeline."""
+         return self._format(
+             self._common_accessioning_properties(assembly_accession=target_assembly,
+                                                  read_preference='secondaryPreferred',
+                                                  chunk_size=1000),
+             {
+                 'spring.batch.job.names': 'INGEST_REMAPPED_VARIANTS_FROM_VCF_JOB',
+                 'parameters.vcf': vcf,
+                 'parameters.remappedFrom': source_assembly,
+                 'parameters.loadTo': load_to,
+                 'parameters.remappingVersion': remapping_version,
+             }
+         )
+
+     def get_release_properties(self, *, job_name=None, assembly_accession=None, taxonomy_accession=None, fasta=None,
+                                assembly_report=None, contig_naming=None, output_folder=None, accessioned_vcf=None,
+                                temp_mongo_db=None):
+         common_props = self._common_accessioning_properties(assembly_accession=assembly_accession,
+                                                             read_preference='secondaryPreferred', chunk_size=1000)
+         # For release in Embassy only
+         if temp_mongo_db:
+             common_props['spring.data.mongodb.database'] = temp_mongo_db
+             common_props['mongodb.read-preference'] = 'primaryPreferred'
+             common_props.pop('spring.data.mongodb.host', None)  # may be absent; Mongo creds are carried in the URI
+             common_props.pop('spring.data.mongodb.port', None)
+             common_props.pop('spring.data.mongodb.username', None)
+             common_props.pop('spring.data.mongodb.password', None)
+         return self._format(
+             common_props,
+             {
+                 'spring.batch.job.names': job_name,
+                 'parameters.taxonomyAccession': taxonomy_accession,
+                 'parameters.contigNaming': contig_naming,
+                 'parameters.fasta': fasta,
+                 'parameters.assemblyReportUrl': self._format_str('file:{0}', assembly_report),
+                 'parameters.outputFolder': output_folder,
+                 'parameters.accessionedVcf': '' if accessioned_vcf is None else accessioned_vcf,
+                 'logging.level.uk.ac.ebi.eva.accession.release': 'INFO'
+             })
+
+     def _common_eva_pipeline_properties(self, opencga_path, read_preference='secondaryPreferred'):
+         files_collection = get_properties_from_xml_file(
+             self.maven_profile, self.private_settings_file)['eva.mongo.collections.files']
+         annotation_metadata_collection = get_properties_from_xml_file(
+             self.maven_profile, self.private_settings_file)['eva.mongo.collections.annotation-metadata']
+         annotation_collection = get_properties_from_xml_file(
+             self.maven_profile, self.private_settings_file)['eva.mongo.collections.annotations']
+         variants_collection = get_properties_from_xml_file(
+             self.maven_profile, self.private_settings_file)['eva.mongo.collections.variants']
+         job_tracker_properties = self._variant_load_job_tracker_properties()
+         props = {
+             'spring.profiles.active': 'production,mongo',
+             'spring.profiles.include': 'variant-writer-mongo,variant-annotation-mongo',
+
+             'spring.data.mongodb.authentication-mechanism': 'SCRAM-SHA-1',
+             'job.repository.driverClassName': 'org.postgresql.Driver',
+
+             'db.collections.variants.name': variants_collection,
+             'db.collections.files.name': files_collection,
+             'db.collections.annotation-metadata.name': annotation_metadata_collection,
+             'db.collections.annotations.name': annotation_collection,
+
+             'app.opencga.path': opencga_path,
+             'config.restartability.allow': 'false',
+             'config.db.read-preference': read_preference,
+
+             'logging.level.embl.ebi.variation.eva': 'DEBUG',
+             'logging.level.org.opencb.opencga': 'DEBUG',
+             'logging.level.org.springframework': 'INFO',
+         }
+
+         merge = {**self._common_properties(read_preference=read_preference, chunk_size=100), **props,
+                  **job_tracker_properties}
+         return merge
+
+     def get_accession_import_properties(self, opencga_path, read_preference='secondaryPreferred'):
+         return self._format(self._common_eva_pipeline_properties(opencga_path, read_preference))
+
+     def get_variant_load_properties(self, project_accession, study_name, output_dir, annotation_dir, stats_dir,
+                                     vep_cache_path, opencga_path, read_preference='secondaryPreferred'):
+         return self._format(
+             self._common_eva_pipeline_properties(opencga_path, read_preference),
+             {
+                 'annotation.overwrite': False,
+                 'app.vep.cache.path': vep_cache_path,
+                 'app.vep.num-forks': 4,
+                 'app.vep.timeout': 500,
+                 'config.chunk.size': 200,
+
+                 'input.study.id': project_accession,
+                 'input.study.name': study_name,
+                 'input.study.type': 'COLLECTION',
+
+                 'output.dir': str(output_dir),
+                 'output.dir.annotation': str(annotation_dir),
+                 'output.dir.statistics': str(stats_dir),
+
+                 'statistics.skip': False
+             },
+         )
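
A sketch of typical use of the generator, matching the signatures above. It is illustrative only: the Maven profile name, file paths, and accessions are placeholders, and the settings XML must contain the credentials that the config_utils helpers look up.

# Illustrative sketch: profile name and paths are placeholders.
from ebi_eva_internal_pyutils.spring_properties import SpringPropertiesGenerator

generator = SpringPropertiesGenerator(maven_profile='production', private_settings_file='/path/to/settings.xml')
properties = generator.get_accessioning_properties(
    target_assembly='GCA_000001405.15',
    fasta='/path/to/assembly.fa',
    assembly_report='/path/to/assembly_report.txt',
    project_accession='PRJEB12345',
    taxonomy_accession=9606,
)
# _format() renders the merged maps as KEY=VALUE lines grouped by key prefix,
# ready to be written out as a Spring properties file.
with open('accessioning.properties', 'w') as prop_file:
    prop_file.write(properties)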