genelastic 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +2 -2
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -20
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +8 -3
- genelastic/api/specification.yml +343 -181
- genelastic/common/__init__.py +0 -44
- genelastic/common/cli.py +48 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +9 -1
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/{cli_gen_data.py → cli/gen_data.py} +4 -4
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +10 -8
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +4 -2
- genelastic/ui/routes.py +289 -42
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +96 -9
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +114 -9
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +205 -2
- genelastic/ui/templates/layout.html +148 -29
- genelastic/ui/templates/version.html +19 -7
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +114 -9
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.8.0.dist-info/METADATA +0 -109
- genelastic-0.8.0.dist-info/RECORD +0 -52
- genelastic-0.8.0.dist-info/WHEEL +0 -5
- genelastic-0.8.0.dist-info/entry_points.txt +0 -8
- genelastic-0.8.0.dist-info/top_level.txt +0 -1
|
@@ -5,20 +5,145 @@ This module contains genelastic constants.
|
|
|
5
5
|
|
|
6
6
|
import typing
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
import schema
|
|
9
|
+
|
|
10
|
+
ALLOWED_EXTENSIONS: typing.Final[list[str]] = [
|
|
11
|
+
"vcf",
|
|
12
|
+
"cov",
|
|
13
|
+
"json",
|
|
14
|
+
"yml",
|
|
15
|
+
"yaml",
|
|
16
|
+
]
|
|
9
17
|
|
|
10
18
|
BUNDLE_CURRENT_VERSION = 3
|
|
11
19
|
|
|
12
|
-
DEFAULT_TAG_REGEX = "[^_
|
|
13
|
-
|
|
14
|
-
|
|
20
|
+
DEFAULT_TAG_REGEX = "[^_]+"
|
|
21
|
+
DEFAULT_TAG_DELIMITER_START = "%"
|
|
22
|
+
DEFAULT_TAG_DELIMITER_END = ""
|
|
15
23
|
|
|
16
24
|
DEFAULT_TAG2FIELD: typing.Final[dict[str, dict[str, str]]] = {
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
20
|
-
"
|
|
21
|
-
"
|
|
22
|
-
"
|
|
23
|
-
"
|
|
25
|
+
"S": {"field": "sample_name", "regex": DEFAULT_TAG_REGEX},
|
|
26
|
+
"F": {"field": "source", "regex": DEFAULT_TAG_REGEX},
|
|
27
|
+
"W": {"field": "wet_process", "regex": DEFAULT_TAG_REGEX},
|
|
28
|
+
"B": {"field": "bi_process", "regex": DEFAULT_TAG_REGEX},
|
|
29
|
+
"D": {"field": "cov_depth", "regex": DEFAULT_TAG_REGEX},
|
|
30
|
+
"A": {"field": "barcode", "regex": DEFAULT_TAG_REGEX},
|
|
31
|
+
"R": {"field": "reference_genome", "regex": DEFAULT_TAG_REGEX},
|
|
24
32
|
}
|
|
33
|
+
|
|
34
|
+
TOOLS_SUFFIX_RE = r"_(?P<tool>[a-zA-Z0-9]+)-(?P<version>\d+(?:-\d+){0,2})(?!-)"
|
|
35
|
+
"""
|
|
36
|
+
Regular expression to extract individual tool-version metadata pairs from a
|
|
37
|
+
validated ``.metrics`` suffix in filenames.
|
|
38
|
+
|
|
39
|
+
- Captures exactly one tool-version pair, where:
|
|
40
|
+
|
|
41
|
+
- ``tool`` is an alphanumeric identifier (letters and digits),
|
|
42
|
+
- ``version`` consists of 1 to 3 numeric components separated by hyphens
|
|
43
|
+
(e.g., '1', '1-0', '1-0-0'),
|
|
44
|
+
- Uses named capture groups (``tool`` and ``version``) to extract data,
|
|
45
|
+
- The negative lookahead ``(?!-)`` ensures the version does not end with a
|
|
46
|
+
hyphen,
|
|
47
|
+
- Intended for extracting all matching pairs after the ``.metrics`` prefix has
|
|
48
|
+
been validated.
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
_METRICS_SUFFIX_RE = r"(?:\.metrics(?:_[a-zA-Z0-9]+-\d+(?:-\d+){0,2}(?!-))*)?"
|
|
52
|
+
"""
|
|
53
|
+
Regular expression to match and validate the entire optional ``.metrics``
|
|
54
|
+
suffix in filenames.
|
|
55
|
+
|
|
56
|
+
- Matches zero or one occurrence of:
|
|
57
|
+
|
|
58
|
+
- A literal ``.metrics`` prefix, which must be the first suffix in the
|
|
59
|
+
filename,
|
|
60
|
+
- Followed optionally by zero or more tool-version pairs, each starting with
|
|
61
|
+
an underscore ``_`` and matching the same format as ``TOOLS_SUFFIX_RE``,
|
|
62
|
+
- Validates that the whole suffix structure is correct (including optional
|
|
63
|
+
presence),
|
|
64
|
+
- Ensures that when present, the suffix starts with ``.metrics`` and is
|
|
65
|
+
correctly formatted,
|
|
66
|
+
- Does not extract individual tool-version pairs; its role is to validate the
|
|
67
|
+
suffix as a whole.
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
_EXTENSIONS_SUFFIX_RE = rf"\.(?P<ext>{'|'.join(ALLOWED_EXTENSIONS)})(\.gz)?"
|
|
71
|
+
"""
|
|
72
|
+
Regular expression for matching allowed file extensions with optional gzip
|
|
73
|
+
compression.
|
|
74
|
+
|
|
75
|
+
This regex matches the file extension suffixes for files belonging to
|
|
76
|
+
a set of predefined allowed extensions, specified in the ``ALLOWED_EXTENSIONS``
|
|
77
|
+
list.
|
|
78
|
+
|
|
79
|
+
The pattern matches:
|
|
80
|
+
|
|
81
|
+
- a dot (``.``) followed by one of the allowed extensions,
|
|
82
|
+
- optionally, a second extension ``.gz`` indicating gzip compression.
|
|
83
|
+
|
|
84
|
+
Examples of matched suffixes: ``.vcf``, ``.cov``, ``.json``, ``.vcf.gz``,
|
|
85
|
+
``.json.gz``.
|
|
86
|
+
"""
|
|
87
|
+
|
|
88
|
+
FILE_SUFFIXES_RE = rf"{_METRICS_SUFFIX_RE}{_EXTENSIONS_SUFFIX_RE}"
|
|
89
|
+
"""Regex used to validate the suffix part of a filename.
|
|
90
|
+
|
|
91
|
+
It matches an optional metrics suffix (containing tool-version metadata),
|
|
92
|
+
immediately followed by a required allowed file extension suffix
|
|
93
|
+
(possibly compressed with .gz).
|
|
94
|
+
|
|
95
|
+
This regex is the combination of ``_METRICS_SUFFIX_RE`` and
|
|
96
|
+
``_EXTENSIONS_SUFFIX_RE``.
|
|
97
|
+
"""
|
|
98
|
+
|
|
99
|
+
QC_METRICS_SCHEMA = schema.Schema(
|
|
100
|
+
{
|
|
101
|
+
"id": str,
|
|
102
|
+
"genome_coverage_size": float,
|
|
103
|
+
"genome_coverage_percent": float,
|
|
104
|
+
"n50": int,
|
|
105
|
+
"larger_contig": int,
|
|
106
|
+
"iqr": int,
|
|
107
|
+
"outlier_percent": float,
|
|
108
|
+
"mean_depth": float,
|
|
109
|
+
"mean_duplicat_percent": float,
|
|
110
|
+
"fold_regions_percents": {
|
|
111
|
+
"5": float,
|
|
112
|
+
"10": float,
|
|
113
|
+
"20": float,
|
|
114
|
+
"30": float,
|
|
115
|
+
"40": float,
|
|
116
|
+
},
|
|
117
|
+
}
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
SV_METRICS_SCHEMA = schema.Schema(
|
|
122
|
+
{
|
|
123
|
+
"metadata_mandatory": [{str: schema.Or(str, int, float, bool)}],
|
|
124
|
+
schema.Optional("metadata_optional"): [
|
|
125
|
+
{str: schema.Or(str, int, float, bool)}
|
|
126
|
+
],
|
|
127
|
+
"regions": [
|
|
128
|
+
{
|
|
129
|
+
"name": str,
|
|
130
|
+
"bed": str,
|
|
131
|
+
"results": [
|
|
132
|
+
{
|
|
133
|
+
"svtype": str,
|
|
134
|
+
"size": str,
|
|
135
|
+
"FP_query": int,
|
|
136
|
+
"TP_truth": int,
|
|
137
|
+
"TP_query": int,
|
|
138
|
+
"FN_truth": int,
|
|
139
|
+
"total_truth": int,
|
|
140
|
+
"total_query": int,
|
|
141
|
+
"precision": schema.Or(int, float),
|
|
142
|
+
"recall": schema.Or(int, float),
|
|
143
|
+
"f1": schema.Or(int, float),
|
|
144
|
+
}
|
|
145
|
+
],
|
|
146
|
+
}
|
|
147
|
+
],
|
|
148
|
+
}
|
|
149
|
+
)
|
|
@@ -6,50 +6,87 @@ This module provides functionality for importing data bundles.
|
|
|
6
6
|
import logging
|
|
7
7
|
import sys
|
|
8
8
|
import typing
|
|
9
|
+
from pathlib import Path
|
|
9
10
|
|
|
10
|
-
from genelastic.common import
|
|
11
|
-
|
|
12
|
-
from .analyses import Analyses
|
|
13
|
-
from .
|
|
14
|
-
from .
|
|
15
|
-
from .
|
|
16
|
-
from .tags import Tags
|
|
17
|
-
from .wet_processes import WetProcesses
|
|
11
|
+
from genelastic.common.cli import log_subsection
|
|
12
|
+
from genelastic.common.types import BundleDict
|
|
13
|
+
from genelastic.import_data.models.analyses import Analyses
|
|
14
|
+
from genelastic.import_data.models.process import BioInfoProcess, WetProcess
|
|
15
|
+
from genelastic.import_data.models.processes import Processes
|
|
16
|
+
from genelastic.import_data.models.tags import Tags
|
|
18
17
|
|
|
19
18
|
logger = logging.getLogger("genelastic")
|
|
20
19
|
|
|
21
20
|
|
|
21
|
+
def resolve_data_path(bundle_file: Path, data_path: Path | None) -> Path:
|
|
22
|
+
"""Resolves the data path relative to the given bundle file if necessary.
|
|
23
|
+
|
|
24
|
+
If ``data_path`` is:
|
|
25
|
+
|
|
26
|
+
- Absolute: it is returned as-is,
|
|
27
|
+
- Relative: it is resolved relative to the parent of ``bundle_file``,
|
|
28
|
+
- None: considered as the current directory (``.``) and resolved
|
|
29
|
+
accordingly.
|
|
30
|
+
|
|
31
|
+
:param bundle_file: Path to the bundle file used for resolution context.
|
|
32
|
+
:param data_path: Optional path to the data directory or file.
|
|
33
|
+
:return: An absolute Path object pointing to the resolved data location.
|
|
34
|
+
"""
|
|
35
|
+
resolved_data_path = data_path if data_path else Path()
|
|
36
|
+
if not resolved_data_path.is_absolute():
|
|
37
|
+
resolved_data_path = Path(
|
|
38
|
+
bundle_file.parent / resolved_data_path
|
|
39
|
+
).resolve()
|
|
40
|
+
return resolved_data_path
|
|
41
|
+
|
|
42
|
+
|
|
22
43
|
class ImportBundle:
|
|
23
44
|
"""Class for handling an import bundle description."""
|
|
24
45
|
|
|
25
|
-
def __init__(
|
|
26
|
-
self,
|
|
46
|
+
def __init__(
|
|
47
|
+
self,
|
|
48
|
+
x: typing.Sequence[BundleDict],
|
|
49
|
+
*,
|
|
50
|
+
multi_match: bool = False,
|
|
51
|
+
check: bool = False,
|
|
27
52
|
) -> None:
|
|
53
|
+
self._documents = x
|
|
54
|
+
self._custom_tags_doc: (
|
|
55
|
+
dict[str, dict[str, str | dict[str, str]]] | None
|
|
56
|
+
) = None
|
|
57
|
+
|
|
28
58
|
analyses: list[BundleDict] = []
|
|
29
59
|
wet_processes: list[BundleDict] = []
|
|
30
60
|
bi_processes: list[BundleDict] = []
|
|
31
|
-
|
|
61
|
+
|
|
62
|
+
self._search_custom_tags()
|
|
63
|
+
tags = (
|
|
64
|
+
Tags.from_dict(self._custom_tags_doc)
|
|
65
|
+
if self._custom_tags_doc
|
|
66
|
+
else Tags()
|
|
67
|
+
)
|
|
32
68
|
|
|
33
69
|
# Loop on dicts
|
|
34
70
|
for d in x:
|
|
35
|
-
# Check version
|
|
36
|
-
if "version" not in d:
|
|
37
|
-
msg = "No version inside YAML document."
|
|
38
|
-
raise RuntimeError(msg)
|
|
39
|
-
if int(d["version"]) != BUNDLE_CURRENT_VERSION:
|
|
40
|
-
raise RuntimeError
|
|
41
|
-
|
|
42
71
|
# Gather all analyses
|
|
43
72
|
if "analyses" in d and d["analyses"] is not None:
|
|
44
73
|
# Copy some bundle properties into each analysis
|
|
45
74
|
for analysis in d["analyses"]:
|
|
46
|
-
|
|
47
|
-
if key in d:
|
|
48
|
-
analysis[key] = d[key]
|
|
75
|
+
bundle_file = d["bundle_file"]
|
|
49
76
|
|
|
50
|
-
|
|
77
|
+
analysis["bundle_file"] = bundle_file
|
|
51
78
|
analysis["tags"] = tags
|
|
52
|
-
|
|
79
|
+
analysis["multi_match"] = multi_match
|
|
80
|
+
|
|
81
|
+
# Resolve data path
|
|
82
|
+
data_path = (
|
|
83
|
+
Path(analysis["data_path"])
|
|
84
|
+
if "data_path" in analysis
|
|
85
|
+
else None
|
|
86
|
+
)
|
|
87
|
+
analysis["data_path"] = resolve_data_path(
|
|
88
|
+
bundle_file, data_path
|
|
89
|
+
)
|
|
53
90
|
analyses.extend(d["analyses"])
|
|
54
91
|
|
|
55
92
|
# If some wet processes are defined, copy the bundle file path into each of them.
|
|
@@ -65,18 +102,32 @@ class ImportBundle:
|
|
|
65
102
|
bi_processes.extend(d["bi_processes"])
|
|
66
103
|
|
|
67
104
|
# Instantiate all objects
|
|
68
|
-
|
|
69
|
-
|
|
105
|
+
log_subsection("Loading wet processes...")
|
|
106
|
+
self._wet_processes = Processes.from_dicts(wet_processes, WetProcess)
|
|
107
|
+
logger.info(
|
|
108
|
+
"=> %s wet process(es) loaded from bundle(s).",
|
|
109
|
+
len(self._wet_processes),
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
log_subsection("Loading bioinformatics processes...")
|
|
113
|
+
self._bi_processes = Processes.from_dicts(bi_processes, BioInfoProcess)
|
|
114
|
+
logger.info(
|
|
115
|
+
"=> %s bioinformatics process(es) loaded from bundle(s).",
|
|
116
|
+
len(self._bi_processes),
|
|
70
117
|
)
|
|
71
|
-
|
|
72
|
-
|
|
118
|
+
|
|
119
|
+
log_subsection("Loading analyses...")
|
|
120
|
+
self._analyses = Analyses.from_dicts(analyses)
|
|
121
|
+
|
|
122
|
+
logger.info(
|
|
123
|
+
"=> %s analysis(es) loaded from bundle(s).", len(self._analyses)
|
|
73
124
|
)
|
|
74
|
-
|
|
125
|
+
logger.info("")
|
|
75
126
|
|
|
76
127
|
if check:
|
|
77
|
-
self.
|
|
128
|
+
self._check_referenced_processes()
|
|
78
129
|
|
|
79
|
-
def
|
|
130
|
+
def _check_referenced_processes(self) -> None:
|
|
80
131
|
"""Check if wet and bi processes referenced inside each analysis are defined.
|
|
81
132
|
If one of the processes is not defined, the program exits.
|
|
82
133
|
"""
|
|
@@ -85,8 +136,7 @@ class ImportBundle:
|
|
|
85
136
|
|
|
86
137
|
if (
|
|
87
138
|
analysis_wet_process
|
|
88
|
-
and analysis_wet_process
|
|
89
|
-
not in self._wet_processes.get_process_ids()
|
|
139
|
+
and analysis_wet_process not in self._wet_processes
|
|
90
140
|
):
|
|
91
141
|
sys.exit(
|
|
92
142
|
f"Analysis at index {index} in file {analysis.bundle_file} "
|
|
@@ -97,48 +147,41 @@ class ImportBundle:
|
|
|
97
147
|
|
|
98
148
|
if (
|
|
99
149
|
analysis_bi_process
|
|
100
|
-
and analysis_bi_process
|
|
101
|
-
not in self._bi_processes.get_process_ids()
|
|
150
|
+
and analysis_bi_process not in self._bi_processes
|
|
102
151
|
):
|
|
103
152
|
sys.exit(
|
|
104
153
|
f"Analysis at index {index} in file {analysis.bundle_file} "
|
|
105
154
|
f"is referencing an undefined bi process: {analysis_bi_process}"
|
|
106
155
|
)
|
|
107
156
|
|
|
157
|
+
def _search_custom_tags(self) -> None:
|
|
158
|
+
docs_with_custom_tags = [d for d in self._documents if "tags" in d]
|
|
159
|
+
|
|
160
|
+
# Only one 'tags' redefinition is allowed across all the documents.
|
|
161
|
+
if len(docs_with_custom_tags) > 1:
|
|
162
|
+
bundle_files = sorted(
|
|
163
|
+
[str(d["bundle_file"]) for d in docs_with_custom_tags]
|
|
164
|
+
)
|
|
165
|
+
msg = (
|
|
166
|
+
f"Only one 'tags' key should be defined across all documents, "
|
|
167
|
+
f"but multiple were found : {', '.join(bundle_files)}"
|
|
168
|
+
)
|
|
169
|
+
raise RuntimeError(msg)
|
|
170
|
+
|
|
171
|
+
if len(docs_with_custom_tags) == 1:
|
|
172
|
+
self._custom_tags_doc = docs_with_custom_tags[0]
|
|
173
|
+
|
|
108
174
|
@property
|
|
109
175
|
def analyses(self) -> Analyses:
|
|
110
176
|
"""The analyses."""
|
|
111
177
|
return self._analyses
|
|
112
178
|
|
|
113
179
|
@property
|
|
114
|
-
def wet_processes(self) ->
|
|
180
|
+
def wet_processes(self) -> Processes:
|
|
115
181
|
"""The wet processes."""
|
|
116
182
|
return self._wet_processes
|
|
117
183
|
|
|
118
184
|
@property
|
|
119
|
-
def bi_processes(self) ->
|
|
185
|
+
def bi_processes(self) -> Processes:
|
|
120
186
|
"""The bi processes."""
|
|
121
187
|
return self._bi_processes
|
|
122
|
-
|
|
123
|
-
def get_nb_files(self, cat: str | None = None) -> int:
|
|
124
|
-
"""Get the number of files in a category."""
|
|
125
|
-
files = self.get_files(cat)
|
|
126
|
-
return len(files)
|
|
127
|
-
|
|
128
|
-
def get_files(self, cat: str | None = None) -> list[DataFile]:
|
|
129
|
-
"""Returns all files of a category."""
|
|
130
|
-
files: list[DataFile] = []
|
|
131
|
-
|
|
132
|
-
# Loop on all analyses
|
|
133
|
-
for analysis in self.analyses:
|
|
134
|
-
files += analysis.get_data_files(cat)
|
|
135
|
-
|
|
136
|
-
return files
|
|
137
|
-
|
|
138
|
-
def get_nb_matched_files(self) -> int:
|
|
139
|
-
"""Get the number of files that match the pattern."""
|
|
140
|
-
return sum(a.get_nb_files() for a in self.analyses)
|
|
141
|
-
|
|
142
|
-
def get_nb_unmatched_files(self) -> int:
|
|
143
|
-
"""Get the number of files that do not match."""
|
|
144
|
-
return sum(len(a.get_unmatched_file_paths()) for a in self.analyses)
|
|
@@ -1,52 +1,26 @@
|
|
|
1
1
|
"""ImportBundle factory module."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
import re
|
|
5
|
-
import sys
|
|
6
4
|
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
7
6
|
|
|
8
7
|
import schema
|
|
9
8
|
import yaml
|
|
10
|
-
from yaml
|
|
11
|
-
from yaml.scanner import ScannerError
|
|
9
|
+
from yaml import YAMLError
|
|
12
10
|
|
|
13
|
-
from genelastic.common import
|
|
11
|
+
from genelastic.common.exceptions import (
|
|
12
|
+
ValidationError,
|
|
13
|
+
YAMLFileReadError,
|
|
14
|
+
)
|
|
15
|
+
from genelastic.common.types import BundleDict
|
|
14
16
|
|
|
15
17
|
from .constants import BUNDLE_CURRENT_VERSION
|
|
16
18
|
from .import_bundle import ImportBundle
|
|
19
|
+
from .models.tags import Tags
|
|
17
20
|
|
|
18
21
|
logger = logging.getLogger("genelastic")
|
|
19
22
|
|
|
20
23
|
|
|
21
|
-
def validate_tag_char(s: str) -> bool:
|
|
22
|
-
"""A tag should only contain one special character, excluding the following : (, ), ?, <, >."""
|
|
23
|
-
if len(s) > 1:
|
|
24
|
-
return False
|
|
25
|
-
|
|
26
|
-
return re.match(r"^[^\w()<>?]$", s) is not None
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def validate_field_chars(s: str) -> bool:
|
|
30
|
-
"""Fields should only contain word characters.
|
|
31
|
-
A word character is a character a-z, A-Z, 0-9, including _ (underscore).
|
|
32
|
-
"""
|
|
33
|
-
return re.match(r"^\w+$", s) is not None
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
_SCHEMA_V1 = schema.Schema(
|
|
37
|
-
{"version": 1, schema.Optional("vcf_files"): schema.Or(None, [str])}
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
_SCHEMA_V2 = schema.Schema(
|
|
41
|
-
{
|
|
42
|
-
"version": 2,
|
|
43
|
-
schema.Optional("vcf"): {
|
|
44
|
-
schema.Optional("filename_pattern"): str,
|
|
45
|
-
"files": [str],
|
|
46
|
-
},
|
|
47
|
-
}
|
|
48
|
-
)
|
|
49
|
-
|
|
50
24
|
_SCHEMA_V3 = schema.Schema(
|
|
51
25
|
{
|
|
52
26
|
"version": 3,
|
|
@@ -54,8 +28,8 @@ _SCHEMA_V3 = schema.Schema(
|
|
|
54
28
|
None,
|
|
55
29
|
[
|
|
56
30
|
{
|
|
57
|
-
|
|
58
|
-
schema.Optional("
|
|
31
|
+
"file_prefix": str,
|
|
32
|
+
schema.Optional("suffix"): str,
|
|
59
33
|
schema.Optional("sample_name"): str,
|
|
60
34
|
schema.Optional("source"): str,
|
|
61
35
|
schema.Optional("barcode"): str,
|
|
@@ -113,24 +87,24 @@ _SCHEMA_V3 = schema.Schema(
|
|
|
113
87
|
],
|
|
114
88
|
),
|
|
115
89
|
schema.Optional("tags"): {
|
|
116
|
-
schema.Optional("
|
|
117
|
-
schema.Optional("
|
|
90
|
+
schema.Optional("delimiter"): {
|
|
91
|
+
schema.Optional("start"): schema.And(
|
|
118
92
|
str,
|
|
119
|
-
|
|
120
|
-
error="Key '
|
|
93
|
+
Tags.validate_tag_delimiter,
|
|
94
|
+
error="Key 'delimiter.start' should only contain one special character, "
|
|
121
95
|
"excluding the following : (, ), ?, <, >.",
|
|
122
96
|
),
|
|
123
|
-
schema.Optional("
|
|
97
|
+
schema.Optional("end"): schema.And(
|
|
124
98
|
str,
|
|
125
|
-
|
|
126
|
-
error="Key '
|
|
99
|
+
Tags.validate_tag_delimiter,
|
|
100
|
+
error="Key 'delimiter.end' should only contain one special character, "
|
|
127
101
|
"excluding the following : (, ), ?, <, >.",
|
|
128
102
|
),
|
|
129
103
|
},
|
|
130
|
-
"match": {
|
|
104
|
+
schema.Optional("match"): {
|
|
131
105
|
schema.And(
|
|
132
106
|
str,
|
|
133
|
-
|
|
107
|
+
Tags.validate_tag_name,
|
|
134
108
|
error="Tags listed under the 'match' key should only contain "
|
|
135
109
|
"word characters. A word character is a character "
|
|
136
110
|
"a-z, A-Z, 0-9, including _ (underscore).",
|
|
@@ -142,106 +116,81 @@ _SCHEMA_V3 = schema.Schema(
|
|
|
142
116
|
|
|
143
117
|
|
|
144
118
|
def make_import_bundle_from_files(
|
|
145
|
-
files: list[Path], *, check: bool = False
|
|
119
|
+
files: list[Path], *, multi_match: bool = False, check: bool = False
|
|
146
120
|
) -> ImportBundle:
|
|
147
|
-
"""Create an ImportBundle instance from a list of YAML files.
|
|
148
|
-
|
|
121
|
+
"""Create an ImportBundle instance from a list of YAML files.
|
|
122
|
+
|
|
123
|
+
:raises YAMLFileReadError: If a YAML file cannot be read.
|
|
124
|
+
:raises ValidationError: If an import bundle is invalid.
|
|
125
|
+
:return: An ImportBundle instance.
|
|
126
|
+
"""
|
|
127
|
+
all_docs = []
|
|
149
128
|
for file in files:
|
|
150
129
|
# Load documents stored in each file.
|
|
151
|
-
|
|
130
|
+
docs = load_yaml_file(file)
|
|
131
|
+
|
|
132
|
+
for doc in docs:
|
|
133
|
+
# Let schema handle structure/type/version validation.
|
|
134
|
+
validate_doc(doc)
|
|
152
135
|
|
|
153
|
-
for i, new_document in enumerate(new_documents):
|
|
154
|
-
# Upgrade each new document to the latest/current version.
|
|
155
|
-
if new_document["version"] != BUNDLE_CURRENT_VERSION:
|
|
156
|
-
new_documents[i] = upgrade_bundle_version(
|
|
157
|
-
new_document, BUNDLE_CURRENT_VERSION
|
|
158
|
-
)
|
|
159
|
-
# Set the root directory path in each new document.
|
|
160
|
-
new_documents[i]["root_dir"] = str(file.parent)
|
|
161
136
|
# Set the original bundle YAML file path in each new document.
|
|
162
|
-
|
|
137
|
+
doc["bundle_file"] = Path(file).resolve()
|
|
163
138
|
|
|
164
|
-
|
|
139
|
+
all_docs.extend(docs)
|
|
165
140
|
|
|
166
141
|
# Create bundle instance.
|
|
167
|
-
return ImportBundle(
|
|
142
|
+
return ImportBundle(all_docs, multi_match=multi_match, check=check)
|
|
168
143
|
|
|
169
144
|
|
|
170
|
-
def
|
|
171
|
-
"""
|
|
145
|
+
def validate_doc(doc: Any) -> None: # noqa: ANN401
|
|
146
|
+
"""Validate a single YAML document against its versioned bundle schema.
|
|
172
147
|
|
|
173
|
-
|
|
148
|
+
:param doc: Dictionary with a 'version' key indicating the schema to use.
|
|
149
|
+
:raises ValidationError: If validation fails.
|
|
174
150
|
"""
|
|
175
|
-
|
|
176
|
-
if len(x) == 0:
|
|
177
|
-
x["version"] = BUNDLE_CURRENT_VERSION
|
|
151
|
+
bundle_version = None
|
|
178
152
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
153
|
+
if isinstance(doc, dict):
|
|
154
|
+
# If the document is a dict but lacks a version,
|
|
155
|
+
# assume current version.
|
|
156
|
+
if "version" not in doc:
|
|
157
|
+
doc["version"] = BUNDLE_CURRENT_VERSION
|
|
184
158
|
|
|
185
|
-
|
|
186
|
-
elif "vcf_files" in x or "cov_files" in x:
|
|
187
|
-
x["version"] = 1
|
|
159
|
+
bundle_version = doc["version"]
|
|
188
160
|
|
|
189
|
-
# Version 2
|
|
190
|
-
elif "vcf" in x and "filename_pattern" in x["vcf"]:
|
|
191
|
-
x["version"] = 2
|
|
192
|
-
|
|
193
|
-
# Latest version
|
|
194
|
-
else:
|
|
195
|
-
x["version"] = BUNDLE_CURRENT_VERSION
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
def validate_doc(x: BundleDict) -> None:
|
|
199
|
-
"""Validate the dictionary using its corresponding schema."""
|
|
200
161
|
# Get schema
|
|
201
|
-
bundle_schema = globals().get("_SCHEMA_V"
|
|
202
|
-
if bundle_schema
|
|
203
|
-
|
|
204
|
-
f"
|
|
162
|
+
bundle_schema = globals().get(f"_SCHEMA_V{bundle_version}")
|
|
163
|
+
if not bundle_schema:
|
|
164
|
+
msg = (
|
|
165
|
+
f"Failed to validate import bundle. "
|
|
166
|
+
f"Reason: unsupported version found ({bundle_version})."
|
|
205
167
|
)
|
|
168
|
+
raise ValidationError(msg)
|
|
206
169
|
|
|
207
170
|
# Validate
|
|
208
|
-
|
|
171
|
+
try:
|
|
172
|
+
bundle_schema.validate(doc)
|
|
173
|
+
except schema.SchemaError as e:
|
|
174
|
+
msg = f"Failed to validate import bundle. Reason: {e}"
|
|
175
|
+
raise ValidationError(msg) from None
|
|
209
176
|
|
|
210
177
|
|
|
211
|
-
def
|
|
212
|
-
"""
|
|
213
|
-
# Load YAML
|
|
214
|
-
logger.info('Load YAML data import file "%s".', file)
|
|
215
|
-
docs: list[BundleDict] = []
|
|
178
|
+
def load_yaml_file(file_path: Path) -> list[Any]:
|
|
179
|
+
"""Load a YAML file.
|
|
216
180
|
|
|
181
|
+
:param file_path: Path to the file to load.
|
|
182
|
+
:raises YAMLFileError: If the file cannot be opened, decoded or
|
|
183
|
+
parsed as valid YAML.
|
|
184
|
+
:returns: A list of documents loaded from the YAML file.
|
|
185
|
+
"""
|
|
217
186
|
try:
|
|
218
|
-
with
|
|
219
|
-
|
|
220
|
-
except (
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
except ScannerError as e:
|
|
224
|
-
logger.error("YAML file lexical analysis failed : %s", e)
|
|
225
|
-
sys.exit(1)
|
|
226
|
-
except ParserError as e:
|
|
227
|
-
logger.error("YAML file syntactic analysis failed : %s", e)
|
|
228
|
-
sys.exit(1)
|
|
187
|
+
with file_path.open(encoding="utf-8") as f:
|
|
188
|
+
documents = list(yaml.safe_load_all(f))
|
|
189
|
+
except (OSError, YAMLError, UnicodeDecodeError) as e:
|
|
190
|
+
msg = f"Failed to read YAML file '{file_path}'. Reason: {e}"
|
|
191
|
+
raise YAMLFileReadError(msg) from None
|
|
229
192
|
|
|
230
|
-
|
|
231
|
-
if docs is None:
|
|
232
|
-
docs = [{"version": BUNDLE_CURRENT_VERSION}]
|
|
233
|
-
else:
|
|
234
|
-
for i, x in enumerate(docs):
|
|
235
|
-
if x is None:
|
|
236
|
-
docs[i] = {"version": BUNDLE_CURRENT_VERSION}
|
|
237
|
-
else:
|
|
238
|
-
set_version(x)
|
|
239
|
-
|
|
240
|
-
# Find schema and validate document
|
|
241
|
-
for x in docs:
|
|
242
|
-
validate_doc(x)
|
|
243
|
-
|
|
244
|
-
return docs
|
|
193
|
+
return documents
|
|
245
194
|
|
|
246
195
|
|
|
247
196
|
def upgrade_bundle_version(x: BundleDict, to_version: int) -> BundleDict:
|
|
@@ -268,31 +217,3 @@ def upgrade_bundle_version(x: BundleDict, to_version: int) -> BundleDict:
|
|
|
268
217
|
y = upgrade_fct(y) # type: ignore[misc]
|
|
269
218
|
|
|
270
219
|
return y
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
def _upgrade_from_v1_to_v2(x: BundleDict) -> BundleDict:
|
|
274
|
-
# Upgrade
|
|
275
|
-
y = {"version": 2, "vcf": {"files": []}}
|
|
276
|
-
if "vcf_files" in x and x["vcf_files"] is not None:
|
|
277
|
-
y["vcf"]["files"] = x["vcf_files"] # type: ignore[index]
|
|
278
|
-
|
|
279
|
-
# Validate schema
|
|
280
|
-
_SCHEMA_V2.validate(y)
|
|
281
|
-
|
|
282
|
-
return y
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
def _upgrade_from_v2_to_v3(x: BundleDict) -> BundleDict:
|
|
286
|
-
# Upgrade
|
|
287
|
-
y: BundleDict = {"version": 3, "analyses": []}
|
|
288
|
-
if "vcf" in x:
|
|
289
|
-
analysis_entry = {}
|
|
290
|
-
if "files" in x["vcf"]:
|
|
291
|
-
analysis_entry["files"] = x["vcf"]["files"]
|
|
292
|
-
if "filename_pattern" in x["vcf"]:
|
|
293
|
-
analysis_entry["file_prefix"] = x["vcf"]["filename_pattern"]
|
|
294
|
-
y["analyses"].append(analysis_entry)
|
|
295
|
-
|
|
296
|
-
_SCHEMA_V3.validate(y)
|
|
297
|
-
|
|
298
|
-
return y
|
|
File without changes
|