genelastic 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/__init__.py +0 -13
- genelastic/api/__init__.py +0 -0
- genelastic/api/extends/__init__.py +0 -0
- genelastic/api/extends/example.py +6 -0
- genelastic/api/routes.py +221 -0
- genelastic/api/server.py +80 -0
- genelastic/api/settings.py +14 -0
- genelastic/common/__init__.py +39 -0
- genelastic/common/cli.py +63 -0
- genelastic/common/elastic.py +214 -0
- genelastic/common/exceptions.py +4 -0
- genelastic/common/types.py +25 -0
- genelastic/import_data/__init__.py +27 -0
- genelastic/{analyses.py → import_data/analyses.py} +19 -20
- genelastic/{analysis.py → import_data/analysis.py} +71 -66
- genelastic/{bi_process.py → import_data/bi_process.py} +8 -6
- genelastic/{bi_processes.py → import_data/bi_processes.py} +10 -9
- genelastic/import_data/cli_gen_data.py +116 -0
- genelastic/import_data/cli_import.py +379 -0
- genelastic/import_data/cli_info.py +256 -0
- genelastic/import_data/cli_integrity.py +384 -0
- genelastic/import_data/cli_validate.py +54 -0
- genelastic/import_data/constants.py +24 -0
- genelastic/{data_file.py → import_data/data_file.py} +26 -21
- genelastic/import_data/filename_pattern.py +57 -0
- genelastic/{import_bundle.py → import_data/import_bundle.py} +58 -48
- genelastic/import_data/import_bundle_factory.py +298 -0
- genelastic/{logger.py → import_data/logger.py} +22 -18
- genelastic/import_data/random_bundle.py +402 -0
- genelastic/{tags.py → import_data/tags.py} +48 -27
- genelastic/{wet_process.py → import_data/wet_process.py} +8 -4
- genelastic/{wet_processes.py → import_data/wet_processes.py} +15 -9
- genelastic/ui/__init__.py +0 -0
- genelastic/ui/server.py +87 -0
- genelastic/ui/settings.py +11 -0
- genelastic-0.7.0.dist-info/METADATA +105 -0
- genelastic-0.7.0.dist-info/RECORD +40 -0
- {genelastic-0.6.0.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
- genelastic-0.7.0.dist-info/entry_points.txt +6 -0
- genelastic/common.py +0 -151
- genelastic/constants.py +0 -45
- genelastic/filename_pattern.py +0 -62
- genelastic/gen_data.py +0 -193
- genelastic/import_bundle_factory.py +0 -288
- genelastic/import_data.py +0 -294
- genelastic/info.py +0 -248
- genelastic/integrity.py +0 -324
- genelastic/validate_data.py +0 -41
- genelastic-0.6.0.dist-info/METADATA +0 -36
- genelastic-0.6.0.dist-info/RECORD +0 -25
- genelastic-0.6.0.dist-info/entry_points.txt +0 -6
- {genelastic-0.6.0.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Genelastic package for importing Genomic data into Elasticsearch."""
|
|
2
|
+
|
|
3
|
+
from .analysis import Analysis
|
|
4
|
+
from .import_bundle import ImportBundle
|
|
5
|
+
from .import_bundle_factory import (
|
|
6
|
+
load_import_bundle_file,
|
|
7
|
+
make_import_bundle_from_files,
|
|
8
|
+
)
|
|
9
|
+
from .random_bundle import (
|
|
10
|
+
RandomAnalysis,
|
|
11
|
+
RandomBiProcess,
|
|
12
|
+
RandomBundle,
|
|
13
|
+
RandomWetProcess,
|
|
14
|
+
)
|
|
15
|
+
from .tags import Tags
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"Analysis",
|
|
19
|
+
"ImportBundle",
|
|
20
|
+
"RandomAnalysis",
|
|
21
|
+
"RandomBiProcess",
|
|
22
|
+
"RandomBundle",
|
|
23
|
+
"RandomWetProcess",
|
|
24
|
+
"Tags",
|
|
25
|
+
"load_import_bundle_file",
|
|
26
|
+
"make_import_bundle_from_files",
|
|
27
|
+
]
|
|
@@ -1,21 +1,22 @@
|
|
|
1
|
-
# pylint: disable=missing-module-docstring
|
|
2
1
|
import typing
|
|
2
|
+
|
|
3
|
+
from genelastic.common import BundleDict
|
|
4
|
+
|
|
3
5
|
from .analysis import Analysis
|
|
4
|
-
from .common import BundleDict
|
|
5
6
|
from .data_file import DataFile
|
|
6
7
|
|
|
8
|
+
|
|
7
9
|
class Analyses:
|
|
8
|
-
"""Class Analyses is a container of Analysis objects.
|
|
9
|
-
"""
|
|
10
|
+
"""Class Analyses is a container of Analysis objects."""
|
|
10
11
|
|
|
11
12
|
def __init__(self) -> None:
|
|
12
|
-
self._arr:
|
|
13
|
+
self._arr: list[Analysis] = []
|
|
13
14
|
self._iter_index: int = 0
|
|
14
15
|
|
|
15
16
|
def __len__(self) -> int:
|
|
16
17
|
return len(self._arr)
|
|
17
18
|
|
|
18
|
-
def __iter__(self) -> typing.
|
|
19
|
+
def __iter__(self) -> typing.Iterator[Analysis]:
|
|
19
20
|
yield from self._arr
|
|
20
21
|
|
|
21
22
|
@typing.overload
|
|
@@ -23,13 +24,13 @@ class Analyses:
|
|
|
23
24
|
pass
|
|
24
25
|
|
|
25
26
|
@typing.overload
|
|
26
|
-
def __getitem__(self, k: slice) ->
|
|
27
|
+
def __getitem__(self, k: slice) -> list[Analysis]:
|
|
27
28
|
pass
|
|
28
29
|
|
|
29
|
-
def __getitem__(self, k):
|
|
30
|
+
def __getitem__(self, k): # type: ignore[no-untyped-def]
|
|
30
31
|
if isinstance(k, int):
|
|
31
32
|
return self._arr[k]
|
|
32
|
-
return self._arr[k.start:k.stop]
|
|
33
|
+
return self._arr[k.start : k.stop]
|
|
33
34
|
|
|
34
35
|
def add(self, a: Analysis) -> None:
|
|
35
36
|
"""Add one Analysis object."""
|
|
@@ -37,20 +38,18 @@ class Analyses:
|
|
|
37
38
|
|
|
38
39
|
def get_nb_files(self, cat: str | None = None) -> int:
|
|
39
40
|
"""Get the total number of files as paths."""
|
|
40
|
-
return len(self.get_data_files(cat
|
|
41
|
+
return len(self.get_data_files(cat=cat))
|
|
41
42
|
|
|
42
|
-
def get_data_files(self, cat: str | None = None) ->
|
|
43
|
-
"""Get the total number of files as DataFile objects.
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
data_files: typing.List[DataFile] = []
|
|
43
|
+
def get_data_files(self, cat: str | None = None) -> list[DataFile]:
|
|
44
|
+
"""Get the total number of files as DataFile objects."""
|
|
45
|
+
data_files: list[DataFile] = []
|
|
47
46
|
|
|
48
47
|
for a in self._arr:
|
|
49
|
-
data_files.extend(a.get_data_files(cat
|
|
48
|
+
data_files.extend(a.get_data_files(cat=cat))
|
|
50
49
|
|
|
51
50
|
return data_files
|
|
52
51
|
|
|
53
|
-
def get_all_categories(self) ->
|
|
52
|
+
def get_all_categories(self) -> set[str]:
|
|
54
53
|
"""Return all the categories of the analyses."""
|
|
55
54
|
categories = set()
|
|
56
55
|
for a in self._arr:
|
|
@@ -58,10 +57,10 @@ class Analyses:
|
|
|
58
57
|
return categories
|
|
59
58
|
|
|
60
59
|
@classmethod
|
|
61
|
-
def from_array_of_dicts(
|
|
62
|
-
|
|
60
|
+
def from_array_of_dicts(
|
|
61
|
+
cls, arr: typing.Sequence[BundleDict]
|
|
62
|
+
) -> typing.Self:
|
|
63
63
|
"""Build an Analyses instance."""
|
|
64
|
-
|
|
65
64
|
analyses = cls()
|
|
66
65
|
|
|
67
66
|
for d in arr:
|
|
@@ -1,40 +1,41 @@
|
|
|
1
|
-
# pylint: disable=missing-module-docstring
|
|
2
1
|
import copy
|
|
3
|
-
import glob
|
|
4
2
|
import logging
|
|
5
|
-
import os
|
|
6
3
|
import re
|
|
7
4
|
import typing
|
|
8
5
|
from pathlib import Path
|
|
9
6
|
|
|
10
|
-
from .common import AnalysisMetaData
|
|
7
|
+
from genelastic.common import AnalysisMetaData
|
|
8
|
+
|
|
11
9
|
from .constants import ALLOWED_CATEGORIES
|
|
12
10
|
from .data_file import DataFile
|
|
13
11
|
from .filename_pattern import FilenamePattern
|
|
14
12
|
from .tags import Tags
|
|
15
13
|
|
|
16
|
-
logger = logging.getLogger(
|
|
14
|
+
logger = logging.getLogger("genelastic")
|
|
17
15
|
|
|
18
16
|
|
|
19
17
|
class Analysis:
|
|
20
18
|
"""Class Analysis that represents an analysis."""
|
|
21
19
|
|
|
22
|
-
#
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
20
|
+
def __init__( # noqa: PLR0913
|
|
21
|
+
self,
|
|
22
|
+
tags: Tags,
|
|
23
|
+
root_dir: str = ".",
|
|
24
|
+
bundle_file: str | None = None,
|
|
25
|
+
file_prefix: str | None = None,
|
|
26
|
+
files: typing.Sequence[str] | None = None,
|
|
27
|
+
data_path: str | None = None,
|
|
28
|
+
**metadata: str | int,
|
|
29
|
+
) -> None:
|
|
30
|
+
self._bundle_file = Path(bundle_file) if bundle_file else None
|
|
32
31
|
self._file_prefix = file_prefix
|
|
33
32
|
self._files = files
|
|
34
|
-
self._data_path = Analysis._resolve_data_path(
|
|
33
|
+
self._data_path = Analysis._resolve_data_path(
|
|
34
|
+
Path(root_dir), Path(data_path) if data_path else None
|
|
35
|
+
)
|
|
35
36
|
self._tags = tags
|
|
36
37
|
self._metadata: AnalysisMetaData = metadata
|
|
37
|
-
self._categories:
|
|
38
|
+
self._categories: set[str] = set()
|
|
38
39
|
|
|
39
40
|
@property
|
|
40
41
|
def metadata(self) -> AnalysisMetaData:
|
|
@@ -42,17 +43,15 @@ class Analysis:
|
|
|
42
43
|
return copy.deepcopy(self._metadata)
|
|
43
44
|
|
|
44
45
|
@property
|
|
45
|
-
def bundle_file(self) ->
|
|
46
|
+
def bundle_file(self) -> Path | None:
|
|
46
47
|
"""Get the bundle file."""
|
|
47
48
|
return self._bundle_file
|
|
48
49
|
|
|
49
50
|
@property
|
|
50
51
|
def filename_regex(self) -> str:
|
|
51
|
-
"""
|
|
52
|
-
Resolve placeholders in a file prefix using metadata
|
|
52
|
+
"""Resolve placeholders in a file prefix using metadata
|
|
53
53
|
and unresolved placeholders are converted to regex groups
|
|
54
54
|
"""
|
|
55
|
-
|
|
56
55
|
x: str = r"^.+\.(?P<ext>vcf|cov)(\.gz)?$"
|
|
57
56
|
|
|
58
57
|
# Use existing generic prefix
|
|
@@ -64,84 +63,87 @@ class Analysis:
|
|
|
64
63
|
regex = tag_attrs["regex"]
|
|
65
64
|
|
|
66
65
|
# Build field regex
|
|
67
|
-
field_regex = (
|
|
68
|
-
|
|
69
|
-
|
|
66
|
+
field_regex = (
|
|
67
|
+
f"(?P<{field}>{self._metadata.get(field)})"
|
|
68
|
+
if field in self._metadata
|
|
69
|
+
else f"(?P<{field}>{regex})"
|
|
70
|
+
)
|
|
70
71
|
# Replace tag with field regex
|
|
71
72
|
x = x.replace(tag_name, field_regex)
|
|
72
73
|
|
|
73
74
|
# Check for tags that were not replaced.
|
|
74
75
|
groups = re.findall(self._tags.search_regex, x)
|
|
75
76
|
for match in groups:
|
|
76
|
-
logger.warning(
|
|
77
|
-
|
|
78
|
-
|
|
77
|
+
logger.warning(
|
|
78
|
+
"String '%s' in key 'file_prefix' looks like an undefined tag. "
|
|
79
|
+
"If this string is not a tag, you can ignore this warning.",
|
|
80
|
+
match,
|
|
81
|
+
)
|
|
79
82
|
|
|
80
83
|
# Add missing start and end markers
|
|
81
84
|
if not x.startswith("^"):
|
|
82
85
|
x = "^" + x
|
|
83
86
|
if not x.endswith("$"):
|
|
84
|
-
x +=
|
|
85
|
-
+ r")(\.gz)?$")
|
|
87
|
+
x += r"\.(?P<ext>" + "|".join(ALLOWED_CATEGORIES) + r")(\.gz)?$"
|
|
86
88
|
logger.debug("File regex for %s: %s", self._bundle_file, x)
|
|
87
89
|
|
|
88
90
|
return x
|
|
89
91
|
|
|
90
92
|
def get_nb_files(self, cat: str | None = None) -> int:
|
|
91
|
-
"""Returns the total number of files.
|
|
92
|
-
"""
|
|
93
|
+
"""Returns the total number of files."""
|
|
93
94
|
return len(self.get_file_paths(cat=cat))
|
|
94
95
|
|
|
95
|
-
def get_data_files(self, cat: str | None = None) ->
|
|
96
|
-
"""Returns the list of matched files as DataFile objects.
|
|
97
|
-
"""
|
|
98
|
-
|
|
96
|
+
def get_data_files(self, cat: str | None = None) -> list[DataFile]:
|
|
97
|
+
"""Returns the list of matched files as DataFile objects."""
|
|
99
98
|
files = self.get_file_paths(cat=cat)
|
|
100
99
|
filename_pattern = FilenamePattern(self.filename_regex)
|
|
101
100
|
|
|
102
|
-
data_files:
|
|
101
|
+
data_files: list[DataFile] = []
|
|
103
102
|
|
|
104
103
|
for f in files:
|
|
105
104
|
try:
|
|
106
|
-
data_files.append(
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
105
|
+
data_files.append(
|
|
106
|
+
DataFile.make_from_bundle(
|
|
107
|
+
path=f,
|
|
108
|
+
bundle_path=self._bundle_file,
|
|
109
|
+
pattern=filename_pattern,
|
|
110
|
+
)
|
|
111
|
+
)
|
|
112
|
+
except (OSError, ValueError) as e:
|
|
110
113
|
logger.error("Error processing file %s: %s", f, str(e))
|
|
111
114
|
|
|
112
115
|
return data_files
|
|
113
116
|
|
|
114
|
-
def get_file_paths(self, cat: str | None = None) -> typing.Sequence[
|
|
115
|
-
"""Returns the list of matched files.
|
|
116
|
-
"""
|
|
117
|
+
def get_file_paths(self, cat: str | None = None) -> typing.Sequence[Path]:
|
|
118
|
+
"""Returns the list of matched files."""
|
|
117
119
|
files, _, _ = self._do_get_file_paths(cat=cat)
|
|
118
120
|
return files
|
|
119
121
|
|
|
120
|
-
def get_unmatched_file_paths(
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
"""
|
|
122
|
+
def get_unmatched_file_paths(
|
|
123
|
+
self, cat: str | None = None
|
|
124
|
+
) -> typing.Sequence[Path]:
|
|
125
|
+
"""Returns the list of unmatched files."""
|
|
124
126
|
_, files, _ = self._do_get_file_paths(cat=cat)
|
|
125
127
|
return files
|
|
126
128
|
|
|
127
|
-
def get_all_categories(self) ->
|
|
129
|
+
def get_all_categories(self) -> set[str]:
|
|
128
130
|
"""Returns all categories of the analysis."""
|
|
129
131
|
_, _, categories = self._do_get_file_paths()
|
|
130
132
|
return categories
|
|
131
133
|
|
|
132
134
|
@staticmethod
|
|
133
|
-
def _resolve_data_path(root_dir:
|
|
134
|
-
resolved_data_path =
|
|
135
|
+
def _resolve_data_path(root_dir: Path, data_path: Path | None) -> Path:
|
|
136
|
+
resolved_data_path = Path() if data_path is None else data_path
|
|
135
137
|
|
|
136
|
-
if not
|
|
137
|
-
resolved_data_path =
|
|
138
|
+
if not resolved_data_path.is_absolute():
|
|
139
|
+
resolved_data_path = (root_dir / resolved_data_path).absolute()
|
|
138
140
|
|
|
139
141
|
return resolved_data_path
|
|
140
142
|
|
|
141
|
-
def _get_files_with_allowed_categories(self) ->
|
|
143
|
+
def _get_files_with_allowed_categories(self) -> dict[Path, str]:
|
|
142
144
|
# Create a dict to store allowed files. Keys are the filepaths,
|
|
143
145
|
# and values are their corresponding category.
|
|
144
|
-
allowed_files:
|
|
146
|
+
allowed_files: dict[Path, str] = {}
|
|
145
147
|
# If files are listed explicitly in the YAML in the 'files' attribute, process them.
|
|
146
148
|
if self._files is not None:
|
|
147
149
|
abs_filepaths = [Path(self._data_path) / f for f in self._files]
|
|
@@ -150,14 +152,14 @@ class Analysis:
|
|
|
150
152
|
cat = file.suffixes[0][1:]
|
|
151
153
|
# Add each matching file and its category to the dict.
|
|
152
154
|
if cat in ALLOWED_CATEGORIES:
|
|
153
|
-
allowed_files[
|
|
155
|
+
allowed_files[file] = cat
|
|
154
156
|
# Else, look for files on disk using the YAML 'data_path' attribute.
|
|
155
157
|
else:
|
|
156
158
|
# Try to retrieve files matching allowed categories using glob.
|
|
157
159
|
for cat in ALLOWED_CATEGORIES:
|
|
158
|
-
glob_res = []
|
|
159
|
-
glob_res.extend(
|
|
160
|
-
glob_res.extend(
|
|
160
|
+
glob_res: list[Path] = []
|
|
161
|
+
glob_res.extend(self._data_path.glob(f"*.{cat}"))
|
|
162
|
+
glob_res.extend(self._data_path.glob(f"*.{cat}.gz"))
|
|
161
163
|
|
|
162
164
|
# Add each globed file and its category to the dict.
|
|
163
165
|
for g_file in glob_res:
|
|
@@ -165,12 +167,13 @@ class Analysis:
|
|
|
165
167
|
|
|
166
168
|
return allowed_files
|
|
167
169
|
|
|
168
|
-
def _do_get_file_paths(
|
|
169
|
-
|
|
170
|
-
|
|
170
|
+
def _do_get_file_paths(
|
|
171
|
+
self, cat: str | None = None
|
|
172
|
+
) -> tuple[typing.Sequence[Path], typing.Sequence[Path], set[str]]:
|
|
171
173
|
# Raise an error if the category given as a parameter is not part of the allowed categories.
|
|
172
174
|
if cat is not None and cat not in ALLOWED_CATEGORIES:
|
|
173
|
-
|
|
175
|
+
msg = f"Unknown category {cat}."
|
|
176
|
+
raise ValueError(msg)
|
|
174
177
|
|
|
175
178
|
# Obtain a dict of all files matching the allowed categories.
|
|
176
179
|
allowed_files = self._get_files_with_allowed_categories()
|
|
@@ -180,16 +183,18 @@ class Analysis:
|
|
|
180
183
|
files_to_match = allowed_files
|
|
181
184
|
else:
|
|
182
185
|
# A category was given as a parameter, so we match only this specific category.
|
|
183
|
-
files_to_match =
|
|
186
|
+
files_to_match = {
|
|
187
|
+
k: v for k, v in allowed_files.items() if v == cat
|
|
188
|
+
}
|
|
184
189
|
|
|
185
190
|
filename_pattern = FilenamePattern(self.filename_regex)
|
|
186
|
-
matching_files:
|
|
187
|
-
non_matching_files:
|
|
191
|
+
matching_files: list[Path] = []
|
|
192
|
+
non_matching_files: list[Path] = []
|
|
188
193
|
categories = set()
|
|
189
194
|
|
|
190
195
|
# We filter files by ensuring that they match the filename pattern defined in the analysis.
|
|
191
196
|
for file, category in sorted(files_to_match.items()):
|
|
192
|
-
if filename_pattern.matches_pattern(
|
|
197
|
+
if filename_pattern.matches_pattern(file.name):
|
|
193
198
|
matching_files.append(file)
|
|
194
199
|
logger.info("MATCHED file %s.", file)
|
|
195
200
|
# Add the file category to the categories set.
|
|
@@ -1,15 +1,17 @@
|
|
|
1
|
-
# pylint: disable=missing-module-docstring
|
|
2
1
|
import copy
|
|
3
|
-
import typing
|
|
4
2
|
|
|
5
|
-
from .common import BioInfoProcessData
|
|
3
|
+
from genelastic.common import BioInfoProcessData
|
|
6
4
|
|
|
7
5
|
|
|
8
6
|
class BioInfoProcess:
|
|
9
7
|
"""Class representing a bio process."""
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
8
|
+
|
|
9
|
+
def __init__(
|
|
10
|
+
self,
|
|
11
|
+
proc_id: str,
|
|
12
|
+
bundle_file: str | None = None,
|
|
13
|
+
**data: str | list[str],
|
|
14
|
+
) -> None:
|
|
13
15
|
self._proc_id = proc_id
|
|
14
16
|
self._bundle_file = bundle_file
|
|
15
17
|
self._data: BioInfoProcessData = data
|
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
# pylint: disable=missing-module-docstring
|
|
2
1
|
import logging
|
|
3
2
|
import typing
|
|
4
3
|
|
|
4
|
+
from genelastic.common import BundleDict
|
|
5
|
+
|
|
5
6
|
from .bi_process import BioInfoProcess
|
|
6
|
-
from .common import BundleDict
|
|
7
7
|
|
|
8
|
-
logger = logging.getLogger(
|
|
8
|
+
logger = logging.getLogger("genelastic")
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class BioInfoProcesses:
|
|
12
12
|
"""Class BioInfoProcesses is a container of BioInfoProcess objects."""
|
|
13
13
|
|
|
14
14
|
def __init__(self) -> None:
|
|
15
|
-
self._dict:
|
|
15
|
+
self._dict: dict[str, BioInfoProcess] = {}
|
|
16
16
|
|
|
17
17
|
def __len__(self) -> int:
|
|
18
18
|
return len(self._dict)
|
|
@@ -26,20 +26,21 @@ class BioInfoProcesses:
|
|
|
26
26
|
the program exits.
|
|
27
27
|
"""
|
|
28
28
|
if process.id in self._dict:
|
|
29
|
-
|
|
29
|
+
msg = f"A bi process with the id '{process.id}' is already present."
|
|
30
|
+
raise ValueError(msg)
|
|
30
31
|
|
|
31
32
|
# Add one WetProcess object.
|
|
32
33
|
self._dict[process.id] = process
|
|
33
34
|
|
|
34
|
-
def get_process_ids(self) ->
|
|
35
|
+
def get_process_ids(self) -> set[str]:
|
|
35
36
|
"""Get a list of the bio processes IDs."""
|
|
36
37
|
return set(self._dict.keys())
|
|
37
38
|
|
|
38
39
|
@classmethod
|
|
39
|
-
def from_array_of_dicts(
|
|
40
|
-
|
|
40
|
+
def from_array_of_dicts(
|
|
41
|
+
cls, arr: typing.Sequence[BundleDict]
|
|
42
|
+
) -> typing.Self:
|
|
41
43
|
"""Build a BioInfoProcesses instance."""
|
|
42
|
-
|
|
43
44
|
bi_processes = cls()
|
|
44
45
|
|
|
45
46
|
for d in arr:
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from genelastic.common import add_verbose_control_args
|
|
6
|
+
|
|
7
|
+
from .logger import configure_logging
|
|
8
|
+
from .random_bundle import (
|
|
9
|
+
RandomBundle,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger("genelastic")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def read_args() -> argparse.Namespace:
    """Build the gen-data argument parser and parse the command line.

    Long-option abbreviations are disabled (``allow_abbrev=False``) and
    default values are appended to each help entry by
    ``ArgumentDefaultsHelpFormatter``.

    Returns:
        The namespace produced by ``parse_args()``. It carries the
        attributes ``data_folder``, ``log_file``, ``chrom_nb``,
        ``output_file``, ``sequence_size``, ``coverage``, ``analyses`` and
        ``processes``, plus whatever ``add_verbose_control_args`` registers.
    """
    cli = argparse.ArgumentParser(
        allow_abbrev=False,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Genetics data random generator.",
    )

    # The shared helper runs first, so its options precede the ones below.
    # NOTE(review): presumably adds the verbosity flags — confirm in
    # genelastic.common.
    add_verbose_control_args(cli)

    option = cli.add_argument

    # Input / output locations.
    option(
        "-d",
        "--data-folder",
        type=Path,
        required=True,
        dest="data_folder",
        help="Data destination folder.",
    )
    option("--log-file", help="Path to a log file.", dest="log_file")

    # Size of the generated data.
    option(
        "-n",
        "--chrom-nb",
        type=int,
        default=5,
        dest="chrom_nb",
        help="Number of chromosomes to include in the generated VCF file.",
    )
    option(
        "-o",
        "--output-yaml-file",
        type=Path,
        default=None,
        dest="output_file",
        help="Output YAML file.",
    )
    # No explicit dest here: argparse derives ``sequence_size`` from the flag.
    option(
        "-s",
        "--sequence-size",
        type=int,
        default=2000,
        help="Sequence size (number of nucleotides) generated for each chromosome.",
    )
    option(
        "-c",
        "--coverage",
        action="store_true",
        help="Generate a coverage file for each analysis.",
    )

    # How many analyses / processes the bundle should declare.
    option(
        "-a",
        "--analyses",
        type=int,
        default=1,
        help="Number of analyses to generate. "
        "Each analysis is composed of a YAML bundle file declaring its wet lab and bioinformatics processes, "
        "a VCF file and optionally a coverage file.",
    )
    option(
        "-p",
        "--processes",
        type=int,
        default=1,
        help="Number of Wet Lab and Bioinformatics processes to generate.",
    )

    return cli.parse_args()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def main() -> None:
    """Run the gen-data script: validate the arguments, then emit a bundle.

    Raises:
        SystemExit: If the data folder is not an existing directory, or if
            the requested number of analyses or processes is below 1. The
            exit message describes the failed check.
    """
    args = read_args()

    # Work with the resolved path so the error message and the generator
    # both see the same location.
    target_dir = args.data_folder.resolve()
    if not target_dir.is_dir():
        raise SystemExit(
            f"ERROR: '{target_dir}' does not exist or is not a directory."
        )

    # Both counters must be strictly positive; analyses is checked first.
    minimum_checks = (
        (args.analyses, "Analyses count must be at least 1."),
        (args.processes, "Processes count must be at least 1."),
    )
    for count, failure_message in minimum_checks:
        if count < 1:
            raise SystemExit(failure_message)

    # Logging is configured only once the arguments are known to be valid.
    configure_logging(args.verbose, log_file=args.log_file)
    logger.debug("Arguments: %s", args)

    bundle = RandomBundle(
        target_dir,
        args.analyses,
        args.processes,
        args.chrom_nb,
        args.sequence_size,
        do_gen_coverage=args.coverage,
    )
    # NOTE(review): presumably writes to stdout when no output file was
    # given — confirm in RandomBundle.to_yaml.
    bundle.to_yaml(args.output_file)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
if __name__ == "__main__":
|
|
116
|
+
main()
|