genelastic 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. genelastic/api/.env +4 -0
  2. genelastic/api/cli_start_api.py +18 -0
  3. genelastic/api/errors.py +52 -0
  4. genelastic/api/extends/example.py +0 -6
  5. genelastic/api/extends/example.yml +0 -0
  6. genelastic/api/routes.py +313 -181
  7. genelastic/api/server.py +34 -26
  8. genelastic/api/settings.py +5 -9
  9. genelastic/api/specification.yml +512 -0
  10. genelastic/common/__init__.py +0 -39
  11. genelastic/common/cli.py +100 -0
  12. genelastic/common/elastic.py +374 -46
  13. genelastic/common/exceptions.py +34 -2
  14. genelastic/common/server.py +59 -0
  15. genelastic/common/types.py +1 -14
  16. genelastic/import_data/__init__.py +0 -27
  17. genelastic/import_data/checker.py +99 -0
  18. genelastic/import_data/checker_observer.py +13 -0
  19. genelastic/import_data/cli/__init__.py +0 -0
  20. genelastic/import_data/cli/cli_check.py +136 -0
  21. genelastic/import_data/cli/gen_data.py +143 -0
  22. genelastic/import_data/cli/import_data.py +346 -0
  23. genelastic/import_data/cli/info.py +247 -0
  24. genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
  25. genelastic/import_data/cli/validate.py +146 -0
  26. genelastic/import_data/collect.py +185 -0
  27. genelastic/import_data/constants.py +136 -11
  28. genelastic/import_data/import_bundle.py +102 -59
  29. genelastic/import_data/import_bundle_factory.py +70 -149
  30. genelastic/import_data/importers/__init__.py +0 -0
  31. genelastic/import_data/importers/importer_base.py +131 -0
  32. genelastic/import_data/importers/importer_factory.py +85 -0
  33. genelastic/import_data/importers/importer_types.py +223 -0
  34. genelastic/import_data/logger.py +2 -1
  35. genelastic/import_data/models/__init__.py +0 -0
  36. genelastic/import_data/models/analyses.py +178 -0
  37. genelastic/import_data/models/analysis.py +144 -0
  38. genelastic/import_data/models/data_file.py +110 -0
  39. genelastic/import_data/models/process.py +45 -0
  40. genelastic/import_data/models/processes.py +84 -0
  41. genelastic/import_data/models/tags.py +170 -0
  42. genelastic/import_data/models/unique_list.py +109 -0
  43. genelastic/import_data/models/validate.py +26 -0
  44. genelastic/import_data/patterns.py +90 -0
  45. genelastic/import_data/random_bundle.py +79 -54
  46. genelastic/import_data/resolve.py +157 -0
  47. genelastic/ui/.env +1 -0
  48. genelastic/ui/cli_start_ui.py +20 -0
  49. genelastic/ui/routes.py +333 -0
  50. genelastic/ui/server.py +9 -82
  51. genelastic/ui/settings.py +2 -6
  52. genelastic/ui/static/cea-cnrgh.ico +0 -0
  53. genelastic/ui/static/cea.ico +0 -0
  54. genelastic/ui/static/layout.ico +0 -0
  55. genelastic/ui/static/novaseq6000.png +0 -0
  56. genelastic/ui/static/style.css +430 -0
  57. genelastic/ui/static/ui.js +458 -0
  58. genelastic/ui/templates/analyses.html +98 -0
  59. genelastic/ui/templates/analysis_detail.html +44 -0
  60. genelastic/ui/templates/bi_process_detail.html +129 -0
  61. genelastic/ui/templates/bi_processes.html +116 -0
  62. genelastic/ui/templates/explorer.html +356 -0
  63. genelastic/ui/templates/home.html +207 -0
  64. genelastic/ui/templates/layout.html +153 -0
  65. genelastic/ui/templates/version.html +21 -0
  66. genelastic/ui/templates/wet_process_detail.html +131 -0
  67. genelastic/ui/templates/wet_processes.html +116 -0
  68. genelastic-0.9.0.dist-info/METADATA +686 -0
  69. genelastic-0.9.0.dist-info/RECORD +76 -0
  70. genelastic-0.9.0.dist-info/WHEEL +4 -0
  71. genelastic-0.9.0.dist-info/entry_points.txt +10 -0
  72. genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
  73. genelastic/import_data/analyses.py +0 -69
  74. genelastic/import_data/analysis.py +0 -205
  75. genelastic/import_data/bi_process.py +0 -27
  76. genelastic/import_data/bi_processes.py +0 -49
  77. genelastic/import_data/cli_gen_data.py +0 -116
  78. genelastic/import_data/cli_import.py +0 -379
  79. genelastic/import_data/cli_info.py +0 -256
  80. genelastic/import_data/cli_validate.py +0 -54
  81. genelastic/import_data/data_file.py +0 -87
  82. genelastic/import_data/filename_pattern.py +0 -57
  83. genelastic/import_data/tags.py +0 -123
  84. genelastic/import_data/wet_process.py +0 -28
  85. genelastic/import_data/wet_processes.py +0 -53
  86. genelastic-0.7.0.dist-info/METADATA +0 -105
  87. genelastic-0.7.0.dist-info/RECORD +0 -40
  88. genelastic-0.7.0.dist-info/WHEEL +0 -5
  89. genelastic-0.7.0.dist-info/entry_points.txt +0 -6
  90. genelastic-0.7.0.dist-info/top_level.txt +0 -1
@@ -0,0 +1,146 @@
1
+ import argparse
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ from genelastic.common.cli import add_verbose_control_args, add_version_arg
6
+ from genelastic.common.exceptions import (
7
+ ValidationError,
8
+ YAMLFileReadError,
9
+ )
10
+ from genelastic.import_data.import_bundle_factory import (
11
+ load_yaml_file,
12
+ validate_doc,
13
+ )
14
+ from genelastic.import_data.logger import configure_logging
15
+ from genelastic.import_data.models.validate import ValidationIssue
16
+
17
+ logger = logging.getLogger("genelastic")
18
+
19
+
20
def read_args() -> argparse.Namespace:
    """Read and parse command-line arguments.

    :return: Parsed arguments carrying ``files`` (list of ``Path``),
        ``fail_fast`` (bool), plus whatever options
        ``add_version_arg`` / ``add_verbose_control_args`` register.
    """
    parser = argparse.ArgumentParser(
        description="Statically validates YAML bundles: "
        "ensure they comply to the bundle schema.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        allow_abbrev=False,
    )
    add_version_arg(parser)
    add_verbose_control_args(parser)
    parser.add_argument(
        "files",
        type=Path,
        nargs="+",
        # No explicit `default`: a positional with nargs="+" is always
        # required, so a default value could never be used (argparse
        # falls back to None on its own anyway).
        help="Paths to YAML files containing bundles to validate.",
    )
    parser.add_argument(
        "-x",
        "--fail-fast",
        dest="fail_fast",
        action="store_true",
        help="Stop validating files after the first error is encountered.",
    )
    return parser.parse_args()
45
+
46
+
47
def main() -> int:
    """Entry point of the validate script.

    Validates every YAML file given on the command line against the bundle
    schema, logging progress per file and per document.

    :return: 0 when every bundle is valid, 1 otherwise.
    :raises SystemExit: With code 1 on the first error when ``--fail-fast``
        is set.
    """
    args = read_args()
    configure_logging(args.verbose)

    issues: list[ValidationIssue] = []
    total = len(args.files)

    for file_no, raw_path in enumerate(args.files, start=1):
        path = raw_path.resolve()

        logger.info(
            "[%s/%s] Validating bundle(s) from file '%s'.",
            file_no,
            total,
            path,
        )
        logger.info("Loading YAML file...")

        try:
            docs = load_yaml_file(path)
        except YAMLFileReadError as err:
            logger.error(err)

            if args.fail_fast:
                raise SystemExit(1) from None

            issues.append(
                ValidationIssue(
                    exc_type=type(err).__name__,
                    file_path=path,
                    file_index=file_no,
                    file_count=total,
                )
            )
            continue

        logger.info("-> YAML file successfully loaded.")

        doc_total = len(docs)
        logger.info("Found %s document(s) in the YAML file.", doc_total)

        for doc_no, doc in enumerate(docs, start=1):
            logger.info(
                " Validating bundle format for document #%s/%s...",
                doc_no,
                doc_total,
            )

            try:
                validate_doc(doc)
            except ValidationError as err:
                logger.error(err)

                if args.fail_fast:
                    raise SystemExit(1) from None

                issues.append(
                    ValidationIssue(
                        exc_type=type(err).__name__,
                        file_path=path,
                        file_index=file_no,
                        file_count=total,
                        doc_index=doc_no,
                        doc_count=doc_total,
                    )
                )
                continue

            logger.info(" -> Bundle format is valid.")

        logger.info("")

    if issues:
        logger.error("Some files raised exceptions:")
        for issue in issues:
            logger.error(" - %s", issue)
        ret_code = 1
    else:
        logger.info("All bundles respect the genelastic YAML bundle format.")
        ret_code = 0

    # A file may contribute several issues (one per document); count each
    # failing file only once via the set of resolved paths.
    failed = len({issue.file_path for issue in issues})
    passed = total - failed

    logger.info(
        "Out of %s file(s), validation passed for %s and failed for %s.",
        total,
        passed,
        failed,
    )

    return ret_code
143
+
144
+
145
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
@@ -0,0 +1,185 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+
5
+ from genelastic.common.exceptions import DataFileCollectorError
6
+ from genelastic.common.types import Metadata
7
+ from genelastic.import_data.models.data_file import DataFile
8
+ from genelastic.import_data.models.tags import Tags
9
+ from genelastic.import_data.patterns import FilenamePattern
10
+ from genelastic.import_data.resolve import resolve_analysis_id
11
+
12
+ logger = logging.getLogger("genelastic")
13
+
14
+
15
def collect_files(data_path: Path) -> set[Path]:
    """Collect files for a given analysis.
    All files directly under ``data_path`` are returned.

    :param data_path: Directory containing the files.
    :raises DataFileCollectorError: If ``data_path`` is not an existing
        directory.
    :return: Set of absolute paths to collected files.
    """
    # The comprehension must stay inside the try: Path.iterdir() is lazy,
    # so the OSError is only raised once iteration actually starts.
    try:
        collected = {entry for entry in data_path.iterdir() if entry.is_file()}
    except OSError as e:
        msg = f"Error collecting files: data directory is invalid. {e}."
        raise DataFileCollectorError(msg) from e
    return collected
30
+
31
+
32
def match_files(
    files: set[Path],
    filename_pattern: FilenamePattern,
) -> tuple[set[Path], set[Path]]:
    """Split a set of files into those matching a filename pattern and the
    rest.

    Each file name in ``files`` is tested against ``filename_pattern``; two
    sets are returned, one per outcome.

    :param files: A set of file paths to check.
    :param filename_pattern: The filename pattern used for matching.

    :returns: A tuple whose first element is the set of matching files and
        whose second element is the set of non-matching files.
    """
    hits: set[Path] = set()
    for candidate in files:
        if filename_pattern.matches_pattern(candidate.name):
            hits.add(candidate)
    return hits, files - hits
54
+
55
+
56
def extract_analysis_metadata(
    data_path: Path,
    file_prefix: str,
    tags: Tags,
    filename_pattern: FilenamePattern,
) -> dict[str, Metadata]:
    """Map analysis IDs to the metadata extracted from matching filenames.

    Every file directly under ``data_path`` whose name matches
    ``filename_pattern`` contributes one entry: the key is the analysis ID
    resolved from ``file_prefix``, ``tags`` and the filename metadata, and
    the value is that metadata.

    :param data_path: Directory containing the files.
    :param file_prefix: Prefix passed to ``resolve_analysis_id``.
    :param tags: Tags passed to ``resolve_analysis_id``.
    :param filename_pattern: Pattern used to match names and extract metadata.
    :raises DataFileCollectorError: If ``data_path`` is not an existing
        directory.
    :return: Mapping of analysis ID to extracted filename metadata.
    """
    analyses: dict[str, Metadata] = {}

    for candidate in collect_files(data_path):
        if not filename_pattern.matches_pattern(candidate.name):
            logger.debug("File '%s' was not matched.", candidate.name)
            continue

        meta = filename_pattern.extract_metadata(candidate.name)
        # NOTE(review): if two files resolve to the same analysis ID, the
        # later entry silently overwrites the earlier one — confirm intended.
        analyses[resolve_analysis_id(file_prefix, tags, meta)] = meta

    return analyses
74
+
75
+
76
def init_data_files(
    analysis_id: str,
    files: set[Path],
    filename_pattern: FilenamePattern,
    bundle_file: Path,
) -> set[DataFile]:
    """Instantiate ``DataFile`` objects from a set of file paths associated
    with an analysis.

    :param analysis_id: ID of the analysis, shared by all created ``DataFile``
        instances.
    :param files: Set of file paths associated with the analysis.
    :param filename_pattern: Pattern used to extract metadata from filenames.
        The extracted metadata is included in each ``DataFile``.
    :param bundle_file: Path to the YAML bundle file from which the analysis is
        defined.
    :raises DataFileCollectorError: If metadata extraction or instantiation
        of a data file object fails for a given file.
    :return: A set of successfully instantiated ``DataFile`` objects.
    """
    data_files = set()
    for file in files:
        try:
            metadata = filename_pattern.extract_metadata(file.name)
            data_file = DataFile(
                analysis_id=analysis_id,
                path=file,
                bundle_file=bundle_file,
                metadata=metadata,
            )
            data_files.add(data_file)
        except RuntimeError as e:
            msg = f"Error instantiating data files: {e}"
            # Chain the original exception (`from e` instead of `from None`)
            # so its traceback is preserved: the message above only carries
            # str(e), which loses the failure location.
            raise DataFileCollectorError(msg) from e
    return data_files
111
+
112
+
113
@dataclass(frozen=True)
class DataFileCollectorResult:
    """Result of a data file collection."""

    # Files whose names matched the analysis filename pattern.
    matched_files: set[Path]
    # Files found in the data directory that did not match the pattern.
    unmatched_files: set[Path]
    # ``DataFile`` objects built from the matched files.
    data_files: set[DataFile]
120
+
121
+
122
class DataFileCollector:
    """Collect all data files belonging to an analysis."""

    def __init__(
        self,
        analysis_id: str,
        bundle_file: Path,
        data_path: Path,
        filename_pattern: FilenamePattern,
        *,
        multi_match: bool = False,
    ) -> None:
        self._analysis_id = analysis_id
        self._bundle_file = bundle_file
        self._data_path = data_path
        self._filename_pattern = filename_pattern
        # NOTE(review): _multi_match is stored but never read by run();
        # presumably consumed elsewhere — confirm before removing.
        self._multi_match = multi_match

    def run(self) -> DataFileCollectorResult:
        """Collect files from the analysis data path, match them against the
        analysis filename pattern, and build one ``DataFile`` per matched
        file.

        :raises DataFileCollectorError: If the ``data_path`` is not an
            existing directory, or if metadata extraction or instantiation
            of a data file object fails for a given file.
        :return: A ``DataFileCollectorResult`` holding the matched and
            unmatched file sets plus the instantiated ``DataFile`` objects.
        """
        collected = collect_files(self._data_path)
        logger.debug(
            " -> Collected %s file(s):",
            len(collected),
        )
        for entry in sorted(collected):
            logger.debug(" - '%s'", entry.name)

        matched, unmatched = match_files(collected, self._filename_pattern)

        logger.info(" -> Found %s matching file(s):", len(matched))
        for entry in sorted(matched):
            logger.info(" - '%s'", entry.name)

        logger.info(
            " -> Found %s non-matching file(s):",
            len(unmatched),
        )
        for entry in sorted(unmatched):
            logger.info(" - '%s'", entry.name)

        built = init_data_files(
            self._analysis_id,
            matched,
            self._filename_pattern,
            self._bundle_file,
        )

        return DataFileCollectorResult(
            matched_files=matched,
            unmatched_files=unmatched,
            data_files=built,
        )
@@ -5,20 +5,145 @@ This module contains genelastic constants.
5
5
 
6
6
  import typing
7
7
 
8
- ALLOWED_CATEGORIES: typing.Final[list[str]] = ["vcf", "cov"]
8
+ import schema
9
+
10
ALLOWED_EXTENSIONS: typing.Final[list[str]] = ["vcf", "cov", "json", "yml", "yaml"]

BUNDLE_CURRENT_VERSION = 3

DEFAULT_TAG_REGEX = "[^_]+"
DEFAULT_TAG_DELIMITER_START = "%"
DEFAULT_TAG_DELIMITER_END = ""

# Default mapping from single-letter tag to the metadata field it captures;
# every tag shares the default capture regex.
DEFAULT_TAG2FIELD: typing.Final[dict[str, dict[str, str]]] = {
    tag: {"field": field, "regex": DEFAULT_TAG_REGEX}
    for tag, field in [
        ("S", "sample_name"),
        ("F", "source"),
        ("W", "wet_process"),
        ("B", "bi_process"),
        ("D", "cov_depth"),
        ("A", "barcode"),
        ("R", "reference_genome"),
    ]
}
33
+
34
TOOLS_SUFFIX_RE = r"_(?P<tool>[a-zA-Z0-9]+)-(?P<version>\d+(?:-\d+){0,2})(?!-)"
"""
Regular expression to extract individual tool-version metadata pairs from a
validated ``.metrics`` suffix in filenames.

- Captures exactly one tool-version pair, where:

  - ``tool`` is an alphanumeric identifier (letters and digits),
  - ``version`` consists of 1 to 3 numeric components separated by hyphens
    (e.g., '1', '1-0', '1-0-0'),

- Uses named capture groups (``tool`` and ``version``) to extract data,
- The negative lookahead ``(?!-)`` ensures the version does not end with a
  hyphen,
- Intended for extracting all matching pairs after the ``.metrics`` prefix has
  been validated.
"""

_METRICS_SUFFIX_RE = r"(?:\.metrics(?:_[a-zA-Z0-9]+-\d+(?:-\d+){0,2}(?!-))*)?"
"""
Regular expression to match and validate the entire optional ``.metrics``
suffix in filenames.

- Matches zero or one occurrence of:

  - A literal ``.metrics`` prefix, which must be the first suffix in the
    filename,
  - Followed optionally by zero or more tool-version pairs, each starting with
    an underscore ``_`` and matching the same format as ``TOOLS_SUFFIX_RE``,

- Validates that the whole suffix structure is correct (including optional
  presence),
- Ensures that when present, the suffix starts with ``.metrics`` and is
  correctly formatted,
- Does not extract individual tool-version pairs; its role is to validate the
  suffix as a whole.
"""

_EXTENSIONS_SUFFIX_RE = rf"\.(?P<ext>{'|'.join(ALLOWED_EXTENSIONS)})(\.gz)?"
"""
Regular expression for matching allowed file extensions with optional gzip
compression.

This regex matches the file extension suffixes for files belonging to
a set of predefined allowed extensions, specified in the ``ALLOWED_EXTENSIONS``
list (the alternation is interpolated at import time, so changes to that list
change this pattern).

The pattern matches:

- a dot (``.``) followed by one of the allowed extensions,
- optionally, a second extension ``.gz`` indicating gzip compression.

Examples of matched suffixes: ``.vcf``, ``.cov``, ``.json``, ``.vcf.gz``,
``.json.gz``.
"""

FILE_SUFFIXES_RE = rf"{_METRICS_SUFFIX_RE}{_EXTENSIONS_SUFFIX_RE}"
"""Regex used to validate the suffix part of a filename.

It matches an optional metrics suffix (containing tool-version metadata),
immediately followed by a required allowed file extension suffix
(possibly compressed with .gz).

This regex is the combination of ``_METRICS_SUFFIX_RE`` and
``_EXTENSIONS_SUFFIX_RE``.
"""
98
+
99
# Schema of a QC metrics document: scalar quality statistics plus a mapping
# of coverage-fold thresholds to percentages.
QC_METRICS_SCHEMA = schema.Schema(
    {
        "id": str,
        "genome_coverage_size": float,
        "genome_coverage_percent": float,
        "n50": int,
        "larger_contig": int,
        "iqr": int,
        "outlier_percent": float,
        "mean_depth": float,
        # NOTE(review): "duplicat" is how the key is spelled in the data
        # format — do not "fix" it here without migrating the producers.
        "mean_duplicat_percent": float,
        # Every fold threshold maps to a float percentage.
        "fold_regions_percents": {
            depth: float for depth in ("5", "10", "20", "30", "40")
        },
    }
)
119
+
120
+
121
# One metadata entry: arbitrary string keys mapped to scalar values.
_SV_METADATA_ENTRY = {str: schema.Or(str, int, float, bool)}

# One benchmarking result for a (svtype, size) pair within a region.
_SV_RESULT_ENTRY = {
    "svtype": str,
    "size": str,
    "FP_query": int,
    "TP_truth": int,
    "TP_query": int,
    "FN_truth": int,
    "total_truth": int,
    "total_query": int,
    "precision": schema.Or(int, float),
    "recall": schema.Or(int, float),
    "f1": schema.Or(int, float),
}

# Schema of an SV metrics document: mandatory (and optional) metadata entry
# lists, plus per-region benchmarking results.
SV_METRICS_SCHEMA = schema.Schema(
    {
        "metadata_mandatory": [_SV_METADATA_ENTRY],
        schema.Optional("metadata_optional"): [_SV_METADATA_ENTRY],
        "regions": [
            {
                "name": str,
                "bed": str,
                "results": [_SV_RESULT_ENTRY],
            }
        ],
    }
)