genelastic 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. genelastic/api/.env +4 -0
  2. genelastic/api/cli_start_api.py +2 -2
  3. genelastic/api/errors.py +52 -0
  4. genelastic/api/extends/example.py +0 -6
  5. genelastic/api/extends/example.yml +0 -20
  6. genelastic/api/routes.py +313 -181
  7. genelastic/api/server.py +8 -3
  8. genelastic/api/specification.yml +343 -181
  9. genelastic/common/__init__.py +0 -44
  10. genelastic/common/cli.py +48 -0
  11. genelastic/common/elastic.py +374 -46
  12. genelastic/common/exceptions.py +34 -2
  13. genelastic/common/server.py +9 -1
  14. genelastic/common/types.py +1 -14
  15. genelastic/import_data/__init__.py +0 -27
  16. genelastic/import_data/checker.py +99 -0
  17. genelastic/import_data/checker_observer.py +13 -0
  18. genelastic/import_data/cli/__init__.py +0 -0
  19. genelastic/import_data/cli/cli_check.py +136 -0
  20. genelastic/import_data/{cli_gen_data.py → cli/gen_data.py} +4 -4
  21. genelastic/import_data/cli/import_data.py +346 -0
  22. genelastic/import_data/cli/info.py +247 -0
  23. genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
  24. genelastic/import_data/cli/validate.py +146 -0
  25. genelastic/import_data/collect.py +185 -0
  26. genelastic/import_data/constants.py +136 -11
  27. genelastic/import_data/import_bundle.py +102 -59
  28. genelastic/import_data/import_bundle_factory.py +70 -149
  29. genelastic/import_data/importers/__init__.py +0 -0
  30. genelastic/import_data/importers/importer_base.py +131 -0
  31. genelastic/import_data/importers/importer_factory.py +85 -0
  32. genelastic/import_data/importers/importer_types.py +223 -0
  33. genelastic/import_data/logger.py +2 -1
  34. genelastic/import_data/models/__init__.py +0 -0
  35. genelastic/import_data/models/analyses.py +178 -0
  36. genelastic/import_data/models/analysis.py +144 -0
  37. genelastic/import_data/models/data_file.py +110 -0
  38. genelastic/import_data/models/process.py +45 -0
  39. genelastic/import_data/models/processes.py +84 -0
  40. genelastic/import_data/models/tags.py +170 -0
  41. genelastic/import_data/models/unique_list.py +109 -0
  42. genelastic/import_data/models/validate.py +26 -0
  43. genelastic/import_data/patterns.py +90 -0
  44. genelastic/import_data/random_bundle.py +10 -8
  45. genelastic/import_data/resolve.py +157 -0
  46. genelastic/ui/.env +1 -0
  47. genelastic/ui/cli_start_ui.py +4 -2
  48. genelastic/ui/routes.py +289 -42
  49. genelastic/ui/static/cea-cnrgh.ico +0 -0
  50. genelastic/ui/static/cea.ico +0 -0
  51. genelastic/ui/static/layout.ico +0 -0
  52. genelastic/ui/static/novaseq6000.png +0 -0
  53. genelastic/ui/static/style.css +430 -0
  54. genelastic/ui/static/ui.js +458 -0
  55. genelastic/ui/templates/analyses.html +96 -9
  56. genelastic/ui/templates/analysis_detail.html +44 -0
  57. genelastic/ui/templates/bi_process_detail.html +129 -0
  58. genelastic/ui/templates/bi_processes.html +114 -9
  59. genelastic/ui/templates/explorer.html +356 -0
  60. genelastic/ui/templates/home.html +205 -2
  61. genelastic/ui/templates/layout.html +148 -29
  62. genelastic/ui/templates/version.html +19 -7
  63. genelastic/ui/templates/wet_process_detail.html +131 -0
  64. genelastic/ui/templates/wet_processes.html +114 -9
  65. genelastic-0.9.0.dist-info/METADATA +686 -0
  66. genelastic-0.9.0.dist-info/RECORD +76 -0
  67. genelastic-0.9.0.dist-info/WHEEL +4 -0
  68. genelastic-0.9.0.dist-info/entry_points.txt +10 -0
  69. genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
  70. genelastic/import_data/analyses.py +0 -69
  71. genelastic/import_data/analysis.py +0 -205
  72. genelastic/import_data/bi_process.py +0 -27
  73. genelastic/import_data/bi_processes.py +0 -49
  74. genelastic/import_data/cli_import.py +0 -379
  75. genelastic/import_data/cli_info.py +0 -256
  76. genelastic/import_data/cli_validate.py +0 -54
  77. genelastic/import_data/data_file.py +0 -87
  78. genelastic/import_data/filename_pattern.py +0 -57
  79. genelastic/import_data/tags.py +0 -123
  80. genelastic/import_data/wet_process.py +0 -28
  81. genelastic/import_data/wet_processes.py +0 -53
  82. genelastic-0.8.0.dist-info/METADATA +0 -109
  83. genelastic-0.8.0.dist-info/RECORD +0 -52
  84. genelastic-0.8.0.dist-info/WHEEL +0 -5
  85. genelastic-0.8.0.dist-info/entry_points.txt +0 -8
  86. genelastic-0.8.0.dist-info/top_level.txt +0 -1
@@ -5,20 +5,145 @@ This module contains genelastic constants.
5
5
 
6
6
  import typing
7
7
 
8
- ALLOWED_CATEGORIES: typing.Final[list[str]] = ["vcf", "cov"]
8
+ import schema
9
+
10
+ ALLOWED_EXTENSIONS: typing.Final[list[str]] = [
11
+ "vcf",
12
+ "cov",
13
+ "json",
14
+ "yml",
15
+ "yaml",
16
+ ]
9
17
 
10
18
  BUNDLE_CURRENT_VERSION = 3
11
19
 
12
- DEFAULT_TAG_REGEX = "[^_-]+"
13
- DEFAULT_TAG_PREFIX = "%"
14
- DEFAULT_TAG_SUFFIX = ""
20
+ DEFAULT_TAG_REGEX = "[^_]+"
21
+ DEFAULT_TAG_DELIMITER_START = "%"
22
+ DEFAULT_TAG_DELIMITER_END = ""
15
23
 
16
24
  DEFAULT_TAG2FIELD: typing.Final[dict[str, dict[str, str]]] = {
17
- "%S": {"field": "sample_name", "regex": DEFAULT_TAG_REGEX},
18
- "%F": {"field": "source", "regex": DEFAULT_TAG_REGEX},
19
- "%W": {"field": "wet_process", "regex": DEFAULT_TAG_REGEX},
20
- "%B": {"field": "bi_process", "regex": DEFAULT_TAG_REGEX},
21
- "%D": {"field": "cov_depth", "regex": DEFAULT_TAG_REGEX},
22
- "%A": {"field": "barcode", "regex": DEFAULT_TAG_REGEX},
23
- "%R": {"field": "reference_genome", "regex": DEFAULT_TAG_REGEX},
25
+ "S": {"field": "sample_name", "regex": DEFAULT_TAG_REGEX},
26
+ "F": {"field": "source", "regex": DEFAULT_TAG_REGEX},
27
+ "W": {"field": "wet_process", "regex": DEFAULT_TAG_REGEX},
28
+ "B": {"field": "bi_process", "regex": DEFAULT_TAG_REGEX},
29
+ "D": {"field": "cov_depth", "regex": DEFAULT_TAG_REGEX},
30
+ "A": {"field": "barcode", "regex": DEFAULT_TAG_REGEX},
31
+ "R": {"field": "reference_genome", "regex": DEFAULT_TAG_REGEX},
24
32
  }
33
+
34
+ TOOLS_SUFFIX_RE = r"_(?P<tool>[a-zA-Z0-9]+)-(?P<version>\d+(?:-\d+){0,2})(?!-)"
35
+ """
36
+ Regular expression to extract individual tool-version metadata pairs from a
37
+ validated ``.metrics`` suffix in filenames.
38
+
39
+ - Captures exactly one tool-version pair, where:
40
+
41
+ - ``tool`` is an alphanumeric identifier (letters and digits),
42
+ - ``version`` consists of 1 to 3 numeric components separated by hyphens
43
+ (e.g., '1', '1-0', '1-0-0'),
44
+ - Uses named capture groups (``tool`` and ``version``) to extract data,
45
+ - The negative lookahead ``(?!-)`` ensures the version does not end with a
46
+ hyphen,
47
+ - Intended for extracting all matching pairs after the ``.metrics`` prefix has
48
+ been validated.
49
+ """
50
+
51
+ _METRICS_SUFFIX_RE = r"(?:\.metrics(?:_[a-zA-Z0-9]+-\d+(?:-\d+){0,2}(?!-))*)?"
52
+ """
53
+ Regular expression to match and validate the entire optional ``.metrics``
54
+ suffix in filenames.
55
+
56
+ - Matches zero or one occurrence of:
57
+
58
+ - A literal ``.metrics`` prefix, which must be the first suffix in the
59
+ filename,
60
+ - Followed optionally by zero or more tool-version pairs, each starting with
61
+ an underscore ``_`` and matching the same format as ``TOOLS_SUFFIX_RE``,
62
+ - Validates that the whole suffix structure is correct (including optional
63
+ presence),
64
+ - Ensures that when present, the suffix starts with ``.metrics`` and is
65
+ correctly formatted,
66
+ - Does not extract individual tool-version pairs; its role is to validate the
67
+ suffix as a whole.
68
+ """
69
+
70
+ _EXTENSIONS_SUFFIX_RE = rf"\.(?P<ext>{'|'.join(ALLOWED_EXTENSIONS)})(\.gz)?"
71
+ """
72
+ Regular expression for matching allowed file extensions with optional gzip
73
+ compression.
74
+
75
+ This regex matches the file extension suffixes for files belonging to
76
+ a set of predefined allowed extensions, specified in the ``ALLOWED_EXTENSIONS``
77
+ list.
78
+
79
+ The pattern matches:
80
+
81
+ - a dot (``.``) followed by one of the allowed extensions,
82
+ - optionally, a second extension ``.gz`` indicating gzip compression.
83
+
84
+ Examples of matched suffixes: ``.vcf``, ``.cov``, ``.json``, ``.vcf.gz``,
85
+ ``.json.gz``.
86
+ """
87
+
88
+ FILE_SUFFIXES_RE = rf"{_METRICS_SUFFIX_RE}{_EXTENSIONS_SUFFIX_RE}"
89
+ """Regex used to validate the suffix part of a filename.
90
+
91
+ It matches an optional metrics suffix (containing tool-version metadata),
92
+ immediately followed by a required allowed file extension suffix
93
+ (possibly compressed with .gz).
94
+
95
+ This regex is the combination of ``_METRICS_SUFFIX_RE`` and
96
+ ``_EXTENSIONS_SUFFIX_RE``.
97
+ """
98
+
99
+ QC_METRICS_SCHEMA = schema.Schema(
100
+ {
101
+ "id": str,
102
+ "genome_coverage_size": float,
103
+ "genome_coverage_percent": float,
104
+ "n50": int,
105
+ "larger_contig": int,
106
+ "iqr": int,
107
+ "outlier_percent": float,
108
+ "mean_depth": float,
109
+ "mean_duplicat_percent": float,
110
+ "fold_regions_percents": {
111
+ "5": float,
112
+ "10": float,
113
+ "20": float,
114
+ "30": float,
115
+ "40": float,
116
+ },
117
+ }
118
+ )
119
+
120
+
121
+ SV_METRICS_SCHEMA = schema.Schema(
122
+ {
123
+ "metadata_mandatory": [{str: schema.Or(str, int, float, bool)}],
124
+ schema.Optional("metadata_optional"): [
125
+ {str: schema.Or(str, int, float, bool)}
126
+ ],
127
+ "regions": [
128
+ {
129
+ "name": str,
130
+ "bed": str,
131
+ "results": [
132
+ {
133
+ "svtype": str,
134
+ "size": str,
135
+ "FP_query": int,
136
+ "TP_truth": int,
137
+ "TP_query": int,
138
+ "FN_truth": int,
139
+ "total_truth": int,
140
+ "total_query": int,
141
+ "precision": schema.Or(int, float),
142
+ "recall": schema.Or(int, float),
143
+ "f1": schema.Or(int, float),
144
+ }
145
+ ],
146
+ }
147
+ ],
148
+ }
149
+ )
@@ -6,50 +6,87 @@ This module provides functionality for importing data bundles.
6
6
  import logging
7
7
  import sys
8
8
  import typing
9
+ from pathlib import Path
9
10
 
10
- from genelastic.common import BundleDict
11
-
12
- from .analyses import Analyses
13
- from .bi_processes import BioInfoProcesses
14
- from .constants import BUNDLE_CURRENT_VERSION
15
- from .data_file import DataFile
16
- from .tags import Tags
17
- from .wet_processes import WetProcesses
11
+ from genelastic.common.cli import log_subsection
12
+ from genelastic.common.types import BundleDict
13
+ from genelastic.import_data.models.analyses import Analyses
14
+ from genelastic.import_data.models.process import BioInfoProcess, WetProcess
15
+ from genelastic.import_data.models.processes import Processes
16
+ from genelastic.import_data.models.tags import Tags
18
17
 
19
18
  logger = logging.getLogger("genelastic")
20
19
 
21
20
 
21
+ def resolve_data_path(bundle_file: Path, data_path: Path | None) -> Path:
22
+ """Resolves the data path relative to the given bundle file if necessary.
23
+
24
+ If ``data_path`` is:
25
+
26
+ - Absolute: it is returned as-is,
27
+ - Relative: it is resolved relative to the parent of ``bundle_file``,
28
+ - None: considered as the current directory (``.``) and resolved
29
+ accordingly.
30
+
31
+ :param bundle_file: Path to the bundle file used for resolution context.
32
+ :param data_path: Optional path to the data directory or file.
33
+ :return: An absolute Path object pointing to the resolved data location.
34
+ """
35
+ resolved_data_path = data_path if data_path else Path()
36
+ if not resolved_data_path.is_absolute():
37
+ resolved_data_path = Path(
38
+ bundle_file.parent / resolved_data_path
39
+ ).resolve()
40
+ return resolved_data_path
41
+
42
+
22
43
  class ImportBundle:
23
44
  """Class for handling an import bundle description."""
24
45
 
25
- def __init__( # noqa: C901
26
- self, x: typing.Sequence[BundleDict], *, check: bool = False
46
+ def __init__(
47
+ self,
48
+ x: typing.Sequence[BundleDict],
49
+ *,
50
+ multi_match: bool = False,
51
+ check: bool = False,
27
52
  ) -> None:
53
+ self._documents = x
54
+ self._custom_tags_doc: (
55
+ dict[str, dict[str, str | dict[str, str]]] | None
56
+ ) = None
57
+
28
58
  analyses: list[BundleDict] = []
29
59
  wet_processes: list[BundleDict] = []
30
60
  bi_processes: list[BundleDict] = []
31
- tags = Tags(x)
61
+
62
+ self._search_custom_tags()
63
+ tags = (
64
+ Tags.from_dict(self._custom_tags_doc)
65
+ if self._custom_tags_doc
66
+ else Tags()
67
+ )
32
68
 
33
69
  # Loop on dicts
34
70
  for d in x:
35
- # Check version
36
- if "version" not in d:
37
- msg = "No version inside YAML document."
38
- raise RuntimeError(msg)
39
- if int(d["version"]) != BUNDLE_CURRENT_VERSION:
40
- raise RuntimeError
41
-
42
71
  # Gather all analyses
43
72
  if "analyses" in d and d["analyses"] is not None:
44
73
  # Copy some bundle properties into each analysis
45
74
  for analysis in d["analyses"]:
46
- for key in ["bundle_file", "root_dir"]:
47
- if key in d:
48
- analysis[key] = d[key]
75
+ bundle_file = d["bundle_file"]
49
76
 
50
- # Add the tags to use.
77
+ analysis["bundle_file"] = bundle_file
51
78
  analysis["tags"] = tags
52
-
79
+ analysis["multi_match"] = multi_match
80
+
81
+ # Resolve data path
82
+ data_path = (
83
+ Path(analysis["data_path"])
84
+ if "data_path" in analysis
85
+ else None
86
+ )
87
+ analysis["data_path"] = resolve_data_path(
88
+ bundle_file, data_path
89
+ )
53
90
  analyses.extend(d["analyses"])
54
91
 
55
92
  # If some wet processes are defined, copy the bundle file path into each of them.
@@ -65,18 +102,32 @@ class ImportBundle:
65
102
  bi_processes.extend(d["bi_processes"])
66
103
 
67
104
  # Instantiate all objects
68
- self._wet_processes: WetProcesses = WetProcesses.from_array_of_dicts(
69
- wet_processes
105
+ log_subsection("Loading wet processes...")
106
+ self._wet_processes = Processes.from_dicts(wet_processes, WetProcess)
107
+ logger.info(
108
+ "=> %s wet process(es) loaded from bundle(s).",
109
+ len(self._wet_processes),
110
+ )
111
+
112
+ log_subsection("Loading bioinformatics processes...")
113
+ self._bi_processes = Processes.from_dicts(bi_processes, BioInfoProcess)
114
+ logger.info(
115
+ "=> %s bioinformatics process(es) loaded from bundle(s).",
116
+ len(self._bi_processes),
70
117
  )
71
- self._bi_processes: BioInfoProcesses = (
72
- BioInfoProcesses.from_array_of_dicts(bi_processes)
118
+
119
+ log_subsection("Loading analyses...")
120
+ self._analyses = Analyses.from_dicts(analyses)
121
+
122
+ logger.info(
123
+ "=> %s analysis(es) loaded from bundle(s).", len(self._analyses)
73
124
  )
74
- self._analyses: Analyses = Analyses.from_array_of_dicts(analyses)
125
+ logger.info("")
75
126
 
76
127
  if check:
77
- self.check_referenced_processes()
128
+ self._check_referenced_processes()
78
129
 
79
- def check_referenced_processes(self) -> None:
130
+ def _check_referenced_processes(self) -> None:
80
131
  """Check if wet and bi processes referenced inside each analysis are defined.
81
132
  If one of the processes is not defined, the program exits.
82
133
  """
@@ -85,8 +136,7 @@ class ImportBundle:
85
136
 
86
137
  if (
87
138
  analysis_wet_process
88
- and analysis_wet_process
89
- not in self._wet_processes.get_process_ids()
139
+ and analysis_wet_process not in self._wet_processes
90
140
  ):
91
141
  sys.exit(
92
142
  f"Analysis at index {index} in file {analysis.bundle_file} "
@@ -97,48 +147,41 @@ class ImportBundle:
97
147
 
98
148
  if (
99
149
  analysis_bi_process
100
- and analysis_bi_process
101
- not in self._bi_processes.get_process_ids()
150
+ and analysis_bi_process not in self._bi_processes
102
151
  ):
103
152
  sys.exit(
104
153
  f"Analysis at index {index} in file {analysis.bundle_file} "
105
154
  f"is referencing an undefined bi process: {analysis_bi_process}"
106
155
  )
107
156
 
157
+ def _search_custom_tags(self) -> None:
158
+ docs_with_custom_tags = [d for d in self._documents if "tags" in d]
159
+
160
+ # Only one 'tags' redefinition is allowed across all the documents.
161
+ if len(docs_with_custom_tags) > 1:
162
+ bundle_files = sorted(
163
+ [str(d["bundle_file"]) for d in docs_with_custom_tags]
164
+ )
165
+ msg = (
166
+ f"Only one 'tags' key should be defined across all documents, "
167
+ f"but multiple were found : {', '.join(bundle_files)}"
168
+ )
169
+ raise RuntimeError(msg)
170
+
171
+ if len(docs_with_custom_tags) == 1:
172
+ self._custom_tags_doc = docs_with_custom_tags[0]
173
+
108
174
  @property
109
175
  def analyses(self) -> Analyses:
110
176
  """The analyses."""
111
177
  return self._analyses
112
178
 
113
179
  @property
114
- def wet_processes(self) -> WetProcesses:
180
+ def wet_processes(self) -> Processes:
115
181
  """The wet processes."""
116
182
  return self._wet_processes
117
183
 
118
184
  @property
119
- def bi_processes(self) -> BioInfoProcesses:
185
+ def bi_processes(self) -> Processes:
120
186
  """The bi processes."""
121
187
  return self._bi_processes
122
-
123
- def get_nb_files(self, cat: str | None = None) -> int:
124
- """Get the number of files in a category."""
125
- files = self.get_files(cat)
126
- return len(files)
127
-
128
- def get_files(self, cat: str | None = None) -> list[DataFile]:
129
- """Returns all files of a category."""
130
- files: list[DataFile] = []
131
-
132
- # Loop on all analyses
133
- for analysis in self.analyses:
134
- files += analysis.get_data_files(cat)
135
-
136
- return files
137
-
138
- def get_nb_matched_files(self) -> int:
139
- """Get the number of files that match the pattern."""
140
- return sum(a.get_nb_files() for a in self.analyses)
141
-
142
- def get_nb_unmatched_files(self) -> int:
143
- """Get the number of files that do not match."""
144
- return sum(len(a.get_unmatched_file_paths()) for a in self.analyses)
@@ -1,52 +1,26 @@
1
1
  """ImportBundle factory module."""
2
2
 
3
3
  import logging
4
- import re
5
- import sys
6
4
  from pathlib import Path
5
+ from typing import Any
7
6
 
8
7
  import schema
9
8
  import yaml
10
- from yaml.parser import ParserError
11
- from yaml.scanner import ScannerError
9
+ from yaml import YAMLError
12
10
 
13
- from genelastic.common import BundleDict
11
+ from genelastic.common.exceptions import (
12
+ ValidationError,
13
+ YAMLFileReadError,
14
+ )
15
+ from genelastic.common.types import BundleDict
14
16
 
15
17
  from .constants import BUNDLE_CURRENT_VERSION
16
18
  from .import_bundle import ImportBundle
19
+ from .models.tags import Tags
17
20
 
18
21
  logger = logging.getLogger("genelastic")
19
22
 
20
23
 
21
- def validate_tag_char(s: str) -> bool:
22
- """A tag should only contain one special character, excluding the following : (, ), ?, <, >."""
23
- if len(s) > 1:
24
- return False
25
-
26
- return re.match(r"^[^\w()<>?]$", s) is not None
27
-
28
-
29
- def validate_field_chars(s: str) -> bool:
30
- """Fields should only contain word characters.
31
- A word character is a character a-z, A-Z, 0-9, including _ (underscore).
32
- """
33
- return re.match(r"^\w+$", s) is not None
34
-
35
-
36
- _SCHEMA_V1 = schema.Schema(
37
- {"version": 1, schema.Optional("vcf_files"): schema.Or(None, [str])}
38
- )
39
-
40
- _SCHEMA_V2 = schema.Schema(
41
- {
42
- "version": 2,
43
- schema.Optional("vcf"): {
44
- schema.Optional("filename_pattern"): str,
45
- "files": [str],
46
- },
47
- }
48
- )
49
-
50
24
  _SCHEMA_V3 = schema.Schema(
51
25
  {
52
26
  "version": 3,
@@ -54,8 +28,8 @@ _SCHEMA_V3 = schema.Schema(
54
28
  None,
55
29
  [
56
30
  {
57
- schema.Optional("file_prefix"): str,
58
- schema.Optional("files"): [str],
31
+ "file_prefix": str,
32
+ schema.Optional("suffix"): str,
59
33
  schema.Optional("sample_name"): str,
60
34
  schema.Optional("source"): str,
61
35
  schema.Optional("barcode"): str,
@@ -113,24 +87,24 @@ _SCHEMA_V3 = schema.Schema(
113
87
  ],
114
88
  ),
115
89
  schema.Optional("tags"): {
116
- schema.Optional("format"): {
117
- schema.Optional("prefix"): schema.And(
90
+ schema.Optional("delimiter"): {
91
+ schema.Optional("start"): schema.And(
118
92
  str,
119
- validate_tag_char,
120
- error="Key 'prefix' should only contain one special character, "
93
+ Tags.validate_tag_delimiter,
94
+ error="Key 'delimiter.start' should only contain one special character, "
121
95
  "excluding the following : (, ), ?, <, >.",
122
96
  ),
123
- schema.Optional("suffix"): schema.And(
97
+ schema.Optional("end"): schema.And(
124
98
  str,
125
- validate_tag_char,
126
- error="Key 'suffix' should only contain one special character, "
99
+ Tags.validate_tag_delimiter,
100
+ error="Key 'delimiter.end' should only contain one special character, "
127
101
  "excluding the following : (, ), ?, <, >.",
128
102
  ),
129
103
  },
130
- "match": {
104
+ schema.Optional("match"): {
131
105
  schema.And(
132
106
  str,
133
- validate_field_chars,
107
+ Tags.validate_tag_name,
134
108
  error="Tags listed under the 'match' key should only contain "
135
109
  "word characters. A word character is a character "
136
110
  "a-z, A-Z, 0-9, including _ (underscore).",
@@ -142,106 +116,81 @@ _SCHEMA_V3 = schema.Schema(
142
116
 
143
117
 
144
118
  def make_import_bundle_from_files(
145
- files: list[Path], *, check: bool = False
119
+ files: list[Path], *, multi_match: bool = False, check: bool = False
146
120
  ) -> ImportBundle:
147
- """Create an ImportBundle instance from a list of YAML files."""
148
- all_documents = []
121
+ """Create an ImportBundle instance from a list of YAML files.
122
+
123
+ :raises YAMLFileReadError: If a YAML file cannot be read.
124
+ :raises ValidationError: If an import bundle is invalid.
125
+ :return: An ImportBundle instance.
126
+ """
127
+ all_docs = []
149
128
  for file in files:
150
129
  # Load documents stored in each file.
151
- new_documents = load_import_bundle_file(file)
130
+ docs = load_yaml_file(file)
131
+
132
+ for doc in docs:
133
+ # Let schema handle structure/type/version validation.
134
+ validate_doc(doc)
152
135
 
153
- for i, new_document in enumerate(new_documents):
154
- # Upgrade each new document to the latest/current version.
155
- if new_document["version"] != BUNDLE_CURRENT_VERSION:
156
- new_documents[i] = upgrade_bundle_version(
157
- new_document, BUNDLE_CURRENT_VERSION
158
- )
159
- # Set the root directory path in each new document.
160
- new_documents[i]["root_dir"] = str(file.parent)
161
136
  # Set the original bundle YAML file path in each new document.
162
- new_documents[i]["bundle_file"] = str(file)
137
+ doc["bundle_file"] = Path(file).resolve()
163
138
 
164
- all_documents.extend(new_documents)
139
+ all_docs.extend(docs)
165
140
 
166
141
  # Create bundle instance.
167
- return ImportBundle(all_documents, check=check)
142
+ return ImportBundle(all_docs, multi_match=multi_match, check=check)
168
143
 
169
144
 
170
- def set_version(x: BundleDict) -> None:
171
- """Set version number.
145
+ def validate_doc(doc: Any) -> None: # noqa: ANN401
146
+ """Validate a single YAML document against its versioned bundle schema.
172
147
 
173
- Deduce the version number from the keys present inside the dictionary.
148
+ :param doc: Dictionary with a 'version' key indicating the schema to use.
149
+ :raises ValidationError: If validation fails.
174
150
  """
175
- # Empty doc
176
- if len(x) == 0:
177
- x["version"] = BUNDLE_CURRENT_VERSION
151
+ bundle_version = None
178
152
 
179
- # Wrong content in version field
180
- elif "version" in x:
181
- if not isinstance(x["version"], int):
182
- msg = "Version must be an integer."
183
- raise ValueError(msg)
153
+ if isinstance(doc, dict):
154
+ # If the document is a dict but lacks a version,
155
+ # assume current version.
156
+ if "version" not in doc:
157
+ doc["version"] = BUNDLE_CURRENT_VERSION
184
158
 
185
- # Version 1
186
- elif "vcf_files" in x or "cov_files" in x:
187
- x["version"] = 1
159
+ bundle_version = doc["version"]
188
160
 
189
- # Version 2
190
- elif "vcf" in x and "filename_pattern" in x["vcf"]:
191
- x["version"] = 2
192
-
193
- # Latest version
194
- else:
195
- x["version"] = BUNDLE_CURRENT_VERSION
196
-
197
-
198
- def validate_doc(x: BundleDict) -> None:
199
- """Validate the dictionary using its corresponding schema."""
200
161
  # Get schema
201
- bundle_schema = globals().get("_SCHEMA_V" + str(x["version"]))
202
- if bundle_schema is None:
203
- raise ValueError(
204
- f"Unknown version \"{x['version']}\" for import " + "bundle file."
162
+ bundle_schema = globals().get(f"_SCHEMA_V{bundle_version}")
163
+ if not bundle_schema:
164
+ msg = (
165
+ f"Failed to validate import bundle. "
166
+ f"Reason: unsupported version found ({bundle_version})."
205
167
  )
168
+ raise ValidationError(msg)
206
169
 
207
170
  # Validate
208
- bundle_schema.validate(x)
171
+ try:
172
+ bundle_schema.validate(doc)
173
+ except schema.SchemaError as e:
174
+ msg = f"Failed to validate import bundle. Reason: {e}"
175
+ raise ValidationError(msg) from None
209
176
 
210
177
 
211
- def load_import_bundle_file(file: Path) -> list[BundleDict]:
212
- """Loads a YAML import bundle file."""
213
- # Load YAML
214
- logger.info('Load YAML data import file "%s".', file)
215
- docs: list[BundleDict] = []
178
+ def load_yaml_file(file_path: Path) -> list[Any]:
179
+ """Load a YAML file.
216
180
 
181
+ :param file_path: Path to the file to load.
182
+ :raises YAMLFileReadError: If the file cannot be opened, decoded or
183
+ parsed as valid YAML.
184
+ :returns: A list of documents loaded from the YAML file.
185
+ """
217
186
  try:
218
- with file.open(encoding="utf-8") as f:
219
- docs = list(yaml.safe_load_all(f))
220
- except (IsADirectoryError, FileNotFoundError) as e:
221
- logger.error(e)
222
- sys.exit(1)
223
- except ScannerError as e:
224
- logger.error("YAML file lexical analysis failed : %s", e)
225
- sys.exit(1)
226
- except ParserError as e:
227
- logger.error("YAML file syntactic analysis failed : %s", e)
228
- sys.exit(1)
187
+ with file_path.open(encoding="utf-8") as f:
188
+ documents = list(yaml.safe_load_all(f))
189
+ except (OSError, YAMLError, UnicodeDecodeError) as e:
190
+ msg = f"Failed to read YAML file '{file_path}'. Reason: {e}"
191
+ raise YAMLFileReadError(msg) from None
229
192
 
230
- # Guess/set version
231
- if docs is None:
232
- docs = [{"version": BUNDLE_CURRENT_VERSION}]
233
- else:
234
- for i, x in enumerate(docs):
235
- if x is None:
236
- docs[i] = {"version": BUNDLE_CURRENT_VERSION}
237
- else:
238
- set_version(x)
239
-
240
- # Find schema and validate document
241
- for x in docs:
242
- validate_doc(x)
243
-
244
- return docs
193
+ return documents
245
194
 
246
195
 
247
196
  def upgrade_bundle_version(x: BundleDict, to_version: int) -> BundleDict:
@@ -268,31 +217,3 @@ def upgrade_bundle_version(x: BundleDict, to_version: int) -> BundleDict:
268
217
  y = upgrade_fct(y) # type: ignore[misc]
269
218
 
270
219
  return y
271
-
272
-
273
- def _upgrade_from_v1_to_v2(x: BundleDict) -> BundleDict:
274
- # Upgrade
275
- y = {"version": 2, "vcf": {"files": []}}
276
- if "vcf_files" in x and x["vcf_files"] is not None:
277
- y["vcf"]["files"] = x["vcf_files"] # type: ignore[index]
278
-
279
- # Validate schema
280
- _SCHEMA_V2.validate(y)
281
-
282
- return y
283
-
284
-
285
- def _upgrade_from_v2_to_v3(x: BundleDict) -> BundleDict:
286
- # Upgrade
287
- y: BundleDict = {"version": 3, "analyses": []}
288
- if "vcf" in x:
289
- analysis_entry = {}
290
- if "files" in x["vcf"]:
291
- analysis_entry["files"] = x["vcf"]["files"]
292
- if "filename_pattern" in x["vcf"]:
293
- analysis_entry["file_prefix"] = x["vcf"]["filename_pattern"]
294
- y["analyses"].append(analysis_entry)
295
-
296
- _SCHEMA_V3.validate(y)
297
-
298
- return y
File without changes