ebi-eva-common-pyutils 0.7.4__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ebi_eva_common_pyutils/assembly_utils.py +6 -1
- {ebi_eva_common_pyutils-0.7.4.dist-info → ebi_eva_common_pyutils-0.8.0.dist-info}/METADATA +1 -1
- {ebi_eva_common_pyutils-0.7.4.dist-info → ebi_eva_common_pyutils-0.8.0.dist-info}/RECORD +10 -12
- ebi_eva_internal_pyutils/__init__.py +1 -0
- ebi_eva_internal_pyutils/mongodb/__init__.py +0 -2
- ebi_eva_internal_pyutils/mongodb/mongo_database.py +1 -0
- ebi_eva_internal_pyutils/nextflow/__init__.py +0 -1
- ebi_eva_internal_pyutils/nextflow/nextflow_pipeline.py +0 -195
- {ebi_eva_common_pyutils-0.7.4.data → ebi_eva_common_pyutils-0.8.0.data}/scripts/archive_directory.py +0 -0
- {ebi_eva_common_pyutils-0.7.4.dist-info → ebi_eva_common_pyutils-0.8.0.dist-info}/LICENSE +0 -0
- {ebi_eva_common_pyutils-0.7.4.dist-info → ebi_eva_common_pyutils-0.8.0.dist-info}/WHEEL +0 -0
- {ebi_eva_common_pyutils-0.7.4.dist-info → ebi_eva_common_pyutils-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
import http
|
|
16
16
|
import requests
|
|
17
|
+
from requests import HTTPError
|
|
17
18
|
|
|
18
19
|
from ebi_eva_common_pyutils.assembly import NCBIAssembly
|
|
19
20
|
from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
|
|
@@ -33,7 +34,11 @@ def is_patch_assembly(assembly_accession: str) -> bool:
|
|
|
33
34
|
Check if a given assembly is a patch assembly
|
|
34
35
|
Please see: https://www.ncbi.nlm.nih.gov/grc/help/patches/
|
|
35
36
|
"""
|
|
36
|
-
|
|
37
|
+
try:
|
|
38
|
+
xml_root = download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{assembly_accession}')
|
|
39
|
+
except HTTPError as e:
|
|
40
|
+
logger.warning(f'Failed to download assembly {assembly_accession} from ENA: {str(e)}')
|
|
41
|
+
return False
|
|
37
42
|
xml_assembly = xml_root.xpath("//ASSEMBLY_ATTRIBUTE[TAG='count-patches']/VALUE")
|
|
38
43
|
if len(xml_assembly) == 0:
|
|
39
44
|
return False
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
ebi_eva_common_pyutils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
ebi_eva_common_pyutils/assembly_utils.py,sha256=
|
|
2
|
+
ebi_eva_common_pyutils/assembly_utils.py,sha256=hpOxiZTxHJ-yRexuZ2yUVYUhZTkF3ee3dq4S2HJhgc8,4374
|
|
3
3
|
ebi_eva_common_pyutils/biosamples_communicators.py,sha256=ZkemchAYGrHwqbGviJN5X80nYFizDNVTwUX3c_5PZcM,7799
|
|
4
4
|
ebi_eva_common_pyutils/command_utils.py,sha256=PtelWWqcC0eOwIVesjwBw3F9KaXRzEE_uAUJhQFZ4l8,2340
|
|
5
5
|
ebi_eva_common_pyutils/common_utils.py,sha256=ty_glvfRa3VGhnpAht4qtVkNNmv-IYfVtO958mY-BaA,1192
|
|
@@ -22,20 +22,18 @@ ebi_eva_common_pyutils/taxonomy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
22
22
|
ebi_eva_common_pyutils/taxonomy/taxonomy.py,sha256=aXmRQ3NAaJotwmmOA2-u2XtcUT6iih-0_e-3QOxynoA,2578
|
|
23
23
|
ebi_eva_common_pyutils/variation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
24
|
ebi_eva_common_pyutils/variation/contig_utils.py,sha256=kMNEW_P2yPnd8Xx1tep19hy5ee7ojxz6ZOO1grTQsRQ,5230
|
|
25
|
-
ebi_eva_common_pyutils-0.
|
|
26
|
-
ebi_eva_internal_pyutils/__init__.py,sha256=
|
|
25
|
+
ebi_eva_common_pyutils-0.8.0.data/scripts/archive_directory.py,sha256=0lWJ0ju_AB2ni7lMnJXPFx6U2OdTGbe-WoQs-4BfKOM,4976
|
|
26
|
+
ebi_eva_internal_pyutils/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
27
27
|
ebi_eva_internal_pyutils/archive_directory.py,sha256=IxVEfh_gaCiT652k0Q_-58fonRusy1yzXu7BCO8yVLo,4989
|
|
28
28
|
ebi_eva_internal_pyutils/config_utils.py,sha256=EGRC5rsmU_ug7OY9-t1UW1XZXRsauSyZB9xPcBux8ts,7909
|
|
29
29
|
ebi_eva_internal_pyutils/metadata_utils.py,sha256=t9PcXZdbfjDBP04GJenC4bxm2nOLd8oI_MP9eNe9IBQ,15221
|
|
30
30
|
ebi_eva_internal_pyutils/mongo_utils.py,sha256=YxKHtb5ygDiGLOtEiiAMFCP2ow6FL9Kq0K5R0mWNdXY,3575
|
|
31
31
|
ebi_eva_internal_pyutils/pg_utils.py,sha256=FUQVwiX_7F2-4sSzoaCVX2me0zAqR8nGIj6NW5d304A,4398
|
|
32
32
|
ebi_eva_internal_pyutils/spring_properties.py,sha256=Tn207DmZehFt7oExseNsXFAnsxr7bX9yiGl4t9mpGVA,15165
|
|
33
|
-
ebi_eva_internal_pyutils/mongodb/__init__.py,sha256=
|
|
34
|
-
ebi_eva_internal_pyutils/mongodb/mongo_database.py,sha256=
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
ebi_eva_common_pyutils-0.
|
|
38
|
-
ebi_eva_common_pyutils-0.
|
|
39
|
-
ebi_eva_common_pyutils-0.
|
|
40
|
-
ebi_eva_common_pyutils-0.7.4.dist-info/top_level.txt,sha256=sXoiqiGU8vlMQpFWDlKrekxhlusk06AhkOH3kSvDT6c,48
|
|
41
|
-
ebi_eva_common_pyutils-0.7.4.dist-info/RECORD,,
|
|
33
|
+
ebi_eva_internal_pyutils/mongodb/__init__.py,sha256=cH89mspotx2u8XxvpaDjjLCaSQqE8-8cCd11s2LMvpg,74
|
|
34
|
+
ebi_eva_internal_pyutils/mongodb/mongo_database.py,sha256=P6_PR9_KICxafypM1hESxkOJI52T098ynNUML2FzJac,9668
|
|
35
|
+
ebi_eva_common_pyutils-0.8.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
36
|
+
ebi_eva_common_pyutils-0.8.0.dist-info/METADATA,sha256=HI0gRr-e_clv_BenWgJWkHVIBSR5l4uq3i-iMarOjhk,1022
|
|
37
|
+
ebi_eva_common_pyutils-0.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
38
|
+
ebi_eva_common_pyutils-0.8.0.dist-info/top_level.txt,sha256=sXoiqiGU8vlMQpFWDlKrekxhlusk06AhkOH3kSvDT6c,48
|
|
39
|
+
ebi_eva_common_pyutils-0.8.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -101,6 +101,7 @@ class MongoDatabase(AppLogger):
|
|
|
101
101
|
self.mongo_handle[self.db_name][collection_name].create_index(index_keys, name=name, **index_info)
|
|
102
102
|
|
|
103
103
|
def enable_sharding(self):
|
|
104
|
+
# From MongoDB 6.0, all databases have sharding enabled by default
|
|
104
105
|
self.mongo_handle.admin.command({"enableSharding": self.db_name})
|
|
105
106
|
|
|
106
107
|
def shard_collections(self, collections_shard_key_map, collections_to_shard):
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
from ebi_eva_internal_pyutils.nextflow.nextflow_pipeline import LinearNextFlowPipeline, NextFlowPipeline, NextFlowProcess
|
|
@@ -1,195 +0,0 @@
|
|
|
1
|
-
# Copyright 2021 EMBL - European Bioinformatics Institute
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
# Rationale for a Nextflow pipeline abstraction
|
|
16
|
-
# ---------------------------------------------
|
|
17
|
-
# Dynamic pipeline generation
|
|
18
|
-
# Abstraction to represent process dependencies
|
|
19
|
-
# Unit testability of individual steps without scattering logic between Python and Nextflow
|
|
20
|
-
# Ability to combine pipelines
|
|
21
|
-
|
|
22
|
-
import networkx as nx
|
|
23
|
-
import os
|
|
24
|
-
from typing import List, Dict, Union
|
|
25
|
-
|
|
26
|
-
from ebi_eva_common_pyutils.logger import AppLogger
|
|
27
|
-
from ebi_eva_common_pyutils.command_utils import run_command_with_output
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class NextFlowProcess:
|
|
31
|
-
|
|
32
|
-
def __init__(self, process_name: str, command_to_run: str, process_directives: Dict[str, str] = None) -> None:
|
|
33
|
-
"""
|
|
34
|
-
Create a Nextflow process
|
|
35
|
-
:rtype: None
|
|
36
|
-
:param process_name: Name of the process - should be a valid identifier - ex: p1_merge
|
|
37
|
-
:type process_name: str
|
|
38
|
-
:param command_to_run: Command to be run - ex: bash -c "echo p1"
|
|
39
|
-
:type command_to_run: str
|
|
40
|
-
:param process_directives: Additional process directives - ex: {"memory": "4GB", "executor": "lsf"}
|
|
41
|
-
:type process_directives: dict
|
|
42
|
-
"""
|
|
43
|
-
if not process_name.isidentifier():
|
|
44
|
-
raise ValueError(f"{process_name} is not a valid Nextflow process name")
|
|
45
|
-
self.process_name = process_name
|
|
46
|
-
self.success_flag = f"{self.process_name}_success"
|
|
47
|
-
self.command_to_run = command_to_run
|
|
48
|
-
self.process_directives = process_directives if process_directives else dict()
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
class NextFlowPipeline(AppLogger):
|
|
52
|
-
def __init__(self, process_dependency_map: Dict[NextFlowProcess, List[NextFlowProcess]] = None) -> None:
|
|
53
|
-
"""
|
|
54
|
-
Create a Nextflow pipeline with a process dependency map
|
|
55
|
-
|
|
56
|
-
:param process_dependency_map: Map of Nextflow processes and their corresponding dependencies
|
|
57
|
-
- ex: {p3 : [p2], p2: [p1]} where p1, p2 and p3 are Nextflow processes that should be executed sequentially
|
|
58
|
-
"""
|
|
59
|
-
# Modeling the dependency map as a DiGraph (Directed graph) is advantageous
|
|
60
|
-
# in ordering/combining flows and detecting cycles
|
|
61
|
-
self.process_dependency_map = nx.ordered.DiGraph()
|
|
62
|
-
if process_dependency_map:
|
|
63
|
-
self.add_dependencies(process_dependency_map)
|
|
64
|
-
|
|
65
|
-
def add_dependencies(self, process_dependency_map: Dict[NextFlowProcess, List[NextFlowProcess]]):
|
|
66
|
-
for process, dependencies in process_dependency_map.items():
|
|
67
|
-
if dependencies:
|
|
68
|
-
for dependency in dependencies:
|
|
69
|
-
self.add_process_dependency(process, dependency)
|
|
70
|
-
else:
|
|
71
|
-
self.add_process_dependency(process, None)
|
|
72
|
-
|
|
73
|
-
def add_process_dependency(self, process: NextFlowProcess, dependency: Union[NextFlowProcess, None]):
|
|
74
|
-
if dependency:
|
|
75
|
-
self.process_dependency_map.add_edge(process, dependency)
|
|
76
|
-
if not nx.dag.is_directed_acyclic_graph(self.process_dependency_map):
|
|
77
|
-
raise ValueError(f"Cycles found in pipeline when adding process {process.process_name} "
|
|
78
|
-
f"and its dependency {dependency.process_name}")
|
|
79
|
-
else:
|
|
80
|
-
# If no dependency is specified, the process will just be a single node in the DAG
|
|
81
|
-
self.process_dependency_map.add_node(process)
|
|
82
|
-
|
|
83
|
-
def _write_to_pipeline_file(self, workflow_file_path: str):
|
|
84
|
-
with open(workflow_file_path, "a") as pipeline_file_handle:
|
|
85
|
-
pipeline_file_handle.write(self.__str__() + "\n")
|
|
86
|
-
|
|
87
|
-
def run_pipeline(self, workflow_file_path: str, nextflow_binary_path: str = 'nextflow',
|
|
88
|
-
nextflow_config_path: str = None, working_dir: str = ".", resume: bool = False,
|
|
89
|
-
other_args: dict = None):
|
|
90
|
-
# Remove pipeline file if it already exists
|
|
91
|
-
if os.path.exists(workflow_file_path):
|
|
92
|
-
os.remove(workflow_file_path)
|
|
93
|
-
self._write_to_pipeline_file(workflow_file_path)
|
|
94
|
-
workflow_command = f"cd {working_dir} && {nextflow_binary_path} run {workflow_file_path}"
|
|
95
|
-
workflow_command += f" -c {nextflow_config_path}" if nextflow_config_path else ""
|
|
96
|
-
workflow_command += f" -with-report {workflow_file_path}.report.html"
|
|
97
|
-
workflow_command += f" -with-dag {workflow_file_path}.dag.png"
|
|
98
|
-
workflow_command += " -resume" if resume else ""
|
|
99
|
-
workflow_command += " ".join([f" -{arg} {val}" for arg, val in other_args.items()]) if other_args else ""
|
|
100
|
-
run_command_with_output(f"Running pipeline {workflow_file_path}...", workflow_command)
|
|
101
|
-
|
|
102
|
-
@staticmethod
|
|
103
|
-
def join_pipelines(main_pipeline: 'NextFlowPipeline', dependent_pipeline: 'NextFlowPipeline',
|
|
104
|
-
with_dependencies: bool = True) -> 'NextFlowPipeline':
|
|
105
|
-
"""
|
|
106
|
-
Join two pipelines with or without dependencies
|
|
107
|
-
|
|
108
|
-
With Dependencies it returns a new pipeline where:
|
|
109
|
-
1) root processes are those of the main pipeline.
|
|
110
|
-
2) final processes are those of the dependent pipeline and
|
|
111
|
-
3) every root process of the dependent pipeline depends on the final processes of the main pipeline.
|
|
112
|
-
Without Dependencies it returns a new pipeline where:
|
|
113
|
-
1) the two pipelines are left independent
|
|
114
|
-
2) Only shared dependencies
|
|
115
|
-
3) every root process of the dependent pipeline depends on the final processes of the main pipeline.
|
|
116
|
-
|
|
117
|
-
"""
|
|
118
|
-
joined_pipeline = NextFlowPipeline()
|
|
119
|
-
# Aggregate dependency maps of both pipelines
|
|
120
|
-
joined_pipeline.process_dependency_map = nx.compose(main_pipeline.process_dependency_map,
|
|
121
|
-
dependent_pipeline.process_dependency_map)
|
|
122
|
-
if with_dependencies:
|
|
123
|
-
for final_process_in_main_pipeline in main_pipeline._get_final_processes():
|
|
124
|
-
for root_process_in_dependent_pipeline in dependent_pipeline._get_root_processes():
|
|
125
|
-
joined_pipeline.add_process_dependency(root_process_in_dependent_pipeline,
|
|
126
|
-
final_process_in_main_pipeline)
|
|
127
|
-
return joined_pipeline
|
|
128
|
-
|
|
129
|
-
def _get_root_processes(self) -> List[NextFlowProcess]:
|
|
130
|
-
# Root processes are those which have no dependencies
|
|
131
|
-
# See https://stackoverflow.com/a/62948641
|
|
132
|
-
roots = []
|
|
133
|
-
for component in nx.weakly_connected_components(self.process_dependency_map):
|
|
134
|
-
subgraph = self.process_dependency_map.subgraph(component)
|
|
135
|
-
roots.extend([n for n, d in subgraph.out_degree() if d == 0])
|
|
136
|
-
return roots
|
|
137
|
-
|
|
138
|
-
def _get_final_processes(self) -> List[NextFlowProcess]:
|
|
139
|
-
# Final processes are those which have no other processes depending on them
|
|
140
|
-
# See https://stackoverflow.com/a/62948641
|
|
141
|
-
roots = []
|
|
142
|
-
for component in nx.weakly_connected_components(self.process_dependency_map):
|
|
143
|
-
subgraph = self.process_dependency_map.subgraph(component)
|
|
144
|
-
roots.extend([n for n, d in subgraph.in_degree() if d == 0])
|
|
145
|
-
return roots
|
|
146
|
-
|
|
147
|
-
@staticmethod
|
|
148
|
-
def _get_process_repr(process: NextFlowProcess, dependencies: List[NextFlowProcess]) -> str:
|
|
149
|
-
process_directives_str = "\n".join([f"{key}='{value}'" for key, value in process.process_directives.items()])
|
|
150
|
-
input_dependencies = "val flag from true"
|
|
151
|
-
if dependencies:
|
|
152
|
-
input_dependencies = "\n".join([f"val {dependency.success_flag} from {dependency.success_flag}"
|
|
153
|
-
for dependency in dependencies])
|
|
154
|
-
return "\n".join(map(str.strip, f"""
|
|
155
|
-
process {process.process_name} {{
|
|
156
|
-
{process_directives_str}
|
|
157
|
-
input:
|
|
158
|
-
{input_dependencies}
|
|
159
|
-
output:
|
|
160
|
-
val true into {process.success_flag}
|
|
161
|
-
script:
|
|
162
|
-
\"\"\"
|
|
163
|
-
{process.command_to_run}
|
|
164
|
-
\"\"\"
|
|
165
|
-
}}""".split("\n")))
|
|
166
|
-
|
|
167
|
-
def __str__(self):
|
|
168
|
-
# Order the list of nodes based on the dependency
|
|
169
|
-
# See https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.depth_first_search.dfs_postorder_nodes.html?highlight=dfs_postorder_nodes#networkx.algorithms.traversal.depth_first_search.dfs_postorder_nodes
|
|
170
|
-
ordered_list_of_processes_to_run = list(nx.dfs_postorder_nodes(self.process_dependency_map))
|
|
171
|
-
# Get a Nextflow pipeline representation of each process and its dependencies
|
|
172
|
-
return "\n\n".join([NextFlowPipeline._get_process_repr(process, list(self.process_dependency_map[process]))
|
|
173
|
-
for process in ordered_list_of_processes_to_run])
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
class LinearNextFlowPipeline(NextFlowPipeline):
|
|
177
|
-
"""
|
|
178
|
-
Simple linear pipeline that supports resumption
|
|
179
|
-
"""
|
|
180
|
-
previous_process: NextFlowProcess = None
|
|
181
|
-
|
|
182
|
-
def __init__(self, process_list: List[NextFlowProcess] = None):
|
|
183
|
-
dependency_map = {}
|
|
184
|
-
if process_list:
|
|
185
|
-
for index, process in enumerate(process_list):
|
|
186
|
-
dependency_map[process] = [] if index == 0 else [process_list[index - 1]]
|
|
187
|
-
super().__init__(dependency_map)
|
|
188
|
-
|
|
189
|
-
def add_process(self, process_name, command_to_run):
|
|
190
|
-
current_process = NextFlowProcess(process_name=process_name, command_to_run=command_to_run)
|
|
191
|
-
self._add_new_process(current_process)
|
|
192
|
-
|
|
193
|
-
def _add_new_process(self, current_process):
|
|
194
|
-
super().add_process_dependency(current_process, self.previous_process)
|
|
195
|
-
self.previous_process = current_process
|
{ebi_eva_common_pyutils-0.7.4.data → ebi_eva_common_pyutils-0.8.0.data}/scripts/archive_directory.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ebi_eva_common_pyutils-0.7.4.dist-info → ebi_eva_common_pyutils-0.8.0.dist-info}/top_level.txt
RENAMED
|
File without changes
|