genelastic 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. genelastic/api/.env +4 -0
  2. genelastic/api/cli_start_api.py +18 -0
  3. genelastic/api/errors.py +52 -0
  4. genelastic/api/extends/example.py +0 -6
  5. genelastic/api/extends/example.yml +0 -0
  6. genelastic/api/routes.py +313 -181
  7. genelastic/api/server.py +34 -26
  8. genelastic/api/settings.py +5 -9
  9. genelastic/api/specification.yml +512 -0
  10. genelastic/common/__init__.py +0 -39
  11. genelastic/common/cli.py +100 -0
  12. genelastic/common/elastic.py +374 -46
  13. genelastic/common/exceptions.py +34 -2
  14. genelastic/common/server.py +59 -0
  15. genelastic/common/types.py +1 -14
  16. genelastic/import_data/__init__.py +0 -27
  17. genelastic/import_data/checker.py +99 -0
  18. genelastic/import_data/checker_observer.py +13 -0
  19. genelastic/import_data/cli/__init__.py +0 -0
  20. genelastic/import_data/cli/cli_check.py +136 -0
  21. genelastic/import_data/cli/gen_data.py +143 -0
  22. genelastic/import_data/cli/import_data.py +346 -0
  23. genelastic/import_data/cli/info.py +247 -0
  24. genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
  25. genelastic/import_data/cli/validate.py +146 -0
  26. genelastic/import_data/collect.py +185 -0
  27. genelastic/import_data/constants.py +136 -11
  28. genelastic/import_data/import_bundle.py +102 -59
  29. genelastic/import_data/import_bundle_factory.py +70 -149
  30. genelastic/import_data/importers/__init__.py +0 -0
  31. genelastic/import_data/importers/importer_base.py +131 -0
  32. genelastic/import_data/importers/importer_factory.py +85 -0
  33. genelastic/import_data/importers/importer_types.py +223 -0
  34. genelastic/import_data/logger.py +2 -1
  35. genelastic/import_data/models/__init__.py +0 -0
  36. genelastic/import_data/models/analyses.py +178 -0
  37. genelastic/import_data/models/analysis.py +144 -0
  38. genelastic/import_data/models/data_file.py +110 -0
  39. genelastic/import_data/models/process.py +45 -0
  40. genelastic/import_data/models/processes.py +84 -0
  41. genelastic/import_data/models/tags.py +170 -0
  42. genelastic/import_data/models/unique_list.py +109 -0
  43. genelastic/import_data/models/validate.py +26 -0
  44. genelastic/import_data/patterns.py +90 -0
  45. genelastic/import_data/random_bundle.py +79 -54
  46. genelastic/import_data/resolve.py +157 -0
  47. genelastic/ui/.env +1 -0
  48. genelastic/ui/cli_start_ui.py +20 -0
  49. genelastic/ui/routes.py +333 -0
  50. genelastic/ui/server.py +9 -82
  51. genelastic/ui/settings.py +2 -6
  52. genelastic/ui/static/cea-cnrgh.ico +0 -0
  53. genelastic/ui/static/cea.ico +0 -0
  54. genelastic/ui/static/layout.ico +0 -0
  55. genelastic/ui/static/novaseq6000.png +0 -0
  56. genelastic/ui/static/style.css +430 -0
  57. genelastic/ui/static/ui.js +458 -0
  58. genelastic/ui/templates/analyses.html +98 -0
  59. genelastic/ui/templates/analysis_detail.html +44 -0
  60. genelastic/ui/templates/bi_process_detail.html +129 -0
  61. genelastic/ui/templates/bi_processes.html +116 -0
  62. genelastic/ui/templates/explorer.html +356 -0
  63. genelastic/ui/templates/home.html +207 -0
  64. genelastic/ui/templates/layout.html +153 -0
  65. genelastic/ui/templates/version.html +21 -0
  66. genelastic/ui/templates/wet_process_detail.html +131 -0
  67. genelastic/ui/templates/wet_processes.html +116 -0
  68. genelastic-0.9.0.dist-info/METADATA +686 -0
  69. genelastic-0.9.0.dist-info/RECORD +76 -0
  70. genelastic-0.9.0.dist-info/WHEEL +4 -0
  71. genelastic-0.9.0.dist-info/entry_points.txt +10 -0
  72. genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
  73. genelastic/import_data/analyses.py +0 -69
  74. genelastic/import_data/analysis.py +0 -205
  75. genelastic/import_data/bi_process.py +0 -27
  76. genelastic/import_data/bi_processes.py +0 -49
  77. genelastic/import_data/cli_gen_data.py +0 -116
  78. genelastic/import_data/cli_import.py +0 -379
  79. genelastic/import_data/cli_info.py +0 -256
  80. genelastic/import_data/cli_validate.py +0 -54
  81. genelastic/import_data/data_file.py +0 -87
  82. genelastic/import_data/filename_pattern.py +0 -57
  83. genelastic/import_data/tags.py +0 -123
  84. genelastic/import_data/wet_process.py +0 -28
  85. genelastic/import_data/wet_processes.py +0 -53
  86. genelastic-0.7.0.dist-info/METADATA +0 -105
  87. genelastic-0.7.0.dist-info/RECORD +0 -40
  88. genelastic-0.7.0.dist-info/WHEEL +0 -5
  89. genelastic-0.7.0.dist-info/entry_points.txt +0 -6
  90. genelastic-0.7.0.dist-info/top_level.txt +0 -1
genelastic/import_data/importers/importer_factory.py
@@ -0,0 +1,85 @@
+import logging
+import typing
+from typing import ClassVar, TypedDict
+
+from genelastic.common.elastic import ElasticImportConn
+from genelastic.import_data.importers.importer_base import (
+    BaseImporter,
+    ImporterError,
+)
+from genelastic.import_data.importers.importer_types import (
+    CoverageImporter,
+    QCImporter,
+    SmallvarImporter,
+    SVImporter,
+    VCFImporter,
+)
+from genelastic.import_data.models.data_file import DataFile
+
+logger = logging.getLogger("genelastic")
+
+
+class _ImporterConfig(TypedDict):
+    """Internal configuration mapping an importer class to its supported file
+    extensions.
+    """
+
+    cls: type[BaseImporter[typing.Any]]
+    extensions: set[str]
+
+
+class ImporterFactory:
+    """Factory to create a BaseImporter instance based on the file's
+    extension and type.
+    """
+
+    _importers: ClassVar[dict[str, _ImporterConfig]] = {
+        "vcf": _ImporterConfig(cls=VCFImporter, extensions={"vcf"}),
+        "cov": _ImporterConfig(cls=CoverageImporter, extensions={"cov"}),
+        "qc": _ImporterConfig(cls=QCImporter, extensions={"yaml", "yml"}),
+        "smallvar": _ImporterConfig(cls=SmallvarImporter, extensions={"json"}),
+        "sv": _ImporterConfig(cls=SVImporter, extensions={"json"}),
+    }
+
+    @staticmethod
+    def get_importer(
+        data_file: DataFile,
+        es_import_conn: ElasticImportConn,
+        thread_count: int = 4,
+    ) -> BaseImporter[typing.Any]:
+        """Create an appropriate BaseImporter instance based on the data
+        file's extension and type.
+
+        :param data_file: Data file to process and import.
+        :param es_import_conn: Elasticsearch import connector instance.
+        :param thread_count: Number of threads to use for parallel data file
+            import.
+        :return: An instance of the appropriate BaseImporter subclass.
+        :raises ImporterError: If the data file extension or type is invalid.
+        """
+        try:
+            importer = ImporterFactory._importers[data_file.type]
+        except KeyError:
+            supported_types = sorted(
+                [f"'{i_type}'" for i_type in ImporterFactory._importers]
+            )
+            msg = (
+                f"Data file '{data_file.path.name}': no importer for type "
+                f"'{data_file.type}'. Supported types are: "
+                f"{', '.join(supported_types)}."
+            )
+            raise ImporterError(msg) from None
+
+        if data_file.ext not in importer["extensions"]:
+            supported_exts = sorted(
+                [f"'{ext}'" for ext in importer["extensions"]]
+            )
+            msg = (
+                f"Data file '{data_file.path.name}': extension "
+                f"'{data_file.ext}' not supported by importer "
+                f"{importer['cls'].__name__}. Supported extensions are: "
+                f"{', '.join(supported_exts)}."
+            )
+            raise ImporterError(msg)
+
+        return importer["cls"](data_file, es_import_conn, thread_count)
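A minimal usage sketch (not part of the diff): routing collected data files through the factory. It assumes an already-configured ElasticImportConn instance (`es_conn`) and DataFile objects coming from the collection step shown later in this release; only `get_importer()` and the `target_index` property are taken from the code above.

from genelastic.import_data.importers.importer_base import ImporterError
from genelastic.import_data.importers.importer_factory import ImporterFactory


def pick_importers(data_files, es_conn, thread_count=4):
    """Route each DataFile to its importer; report unsupported files."""
    for data_file in data_files:
        try:
            importer = ImporterFactory.get_importer(data_file, es_conn, thread_count)
        except ImporterError as err:
            # Unknown type, or extension not allowed for that type: skip the file.
            print(f"Skipped {data_file.path.name}: {err}")
            continue
        # Each importer subclass exposes its Elasticsearch target index.
        print(f"{data_file.path.name} -> {type(importer).__name__} "
              f"(index: {importer.target_index})")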
genelastic/import_data/importers/importer_types.py
@@ -0,0 +1,223 @@
+import csv
+import logging
+from collections.abc import Iterable
+from datetime import UTC, datetime
+from typing import Any
+
+import schema
+import vcf
+from vcf.model import _Record
+
+from genelastic.import_data.constants import (
+    QC_METRICS_SCHEMA,
+    SV_METRICS_SCHEMA,
+)
+from genelastic.import_data.importers.importer_base import (
+    BaseImporter,
+    ImporterError,
+    JSONBaseImporter,
+    YAMLBaseImporter,
+)
+
+logger = logging.getLogger("genelastic")
+
+
+class CoverageImporter(BaseImporter[Iterable[list[str]]]):
+    """Importer for coverage files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.coverage_index
+
+    def _load(self) -> Iterable[list[str]]:
+        """Load a TSV formatted coverage file.
+        :raises ImporterError: If the file could not be opened or decoded.
+        """
+        try:
+            with self._data_file.path.open(newline="", encoding="utf-8") as f:
+                reader = csv.reader(f, delimiter="\t")
+                try:
+                    first_row = next(reader)
+                except StopIteration:
+                    msg = f"Coverage file '{self._data_file.path}' is empty."
+                    raise ImporterError(msg) from None
+                yield first_row
+                yield from reader
+        except (OSError, UnicodeDecodeError) as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: Iterable[list[str]]) -> Iterable[dict[str, Any]]:
+        """Transform each coverage file row into a JSON document."""
+        for row in data:
+            yield {
+                "_index": self.target_index,
+                "_source": {
+                    "analysis_id": self._data_file.analysis_id,
+                    "created_at": datetime.now(UTC).isoformat(),
+                    "row": {
+                        "chr": row[0],
+                        "pos": int(row[1]) + 1,
+                        "depth": int(row[2]),
+                    },
+                },
+            }
+
+
+class VCFImporter(BaseImporter[Iterable[_Record]]):
+    """Importer for VCF files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.vcf_variants_index
+
+    def _load(self) -> Iterable[_Record]:
+        """Load a VCF file. GZ compressed VCF files are supported.
+        :raises ImporterError: If the file could not be opened, decoded or is empty.
+        """
+        try:
+            yield from vcf.Reader(
+                filename=str(self._data_file.path), encoding="utf-8"
+            )
+        except StopIteration:
+            msg = f"VCF file '{self._data_file.path}' is empty."
+            raise ImporterError(msg) from None
+        except (OSError, UnicodeDecodeError) as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: Iterable[_Record]) -> Iterable[dict[str, Any]]:
+        """Transform each VCF file record into a JSON document."""
+        for record in data:
+            # Fix values
+            if not record.CHROM.startswith("chr"):
+                if record.CHROM.lower().startswith("chr"):
+                    record.CHROM = "chr" + record.CHROM[3:]
+                else:
+                    record.CHROM = "chr" + record.CHROM
+
+            # Build document
+            alt = [x if x is None else x.type for x in record.ALT]
+
+            yield {
+                "_index": self.target_index,
+                "_source": {
+                    "created_at": datetime.now(UTC).isoformat(),
+                    "analysis_id": self._data_file.analysis_id,
+                    "record": {
+                        "type": "vcf",
+                        "chr": record.CHROM,
+                        "pos": record.POS,
+                        "alt": alt,
+                        "info": record.INFO,
+                    },
+                },
+            }
+
+
+class QCImporter(YAMLBaseImporter):
+    """Importer for QC YAML metrics files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.qc_metrics_index
+
+    def _validate(self, data: dict[str, Any]) -> None:
+        """Validate the YAML document against the expected schema.
+
+        :raises ImporterError: If the file format is invalid.
+        """
+        try:
+            QC_METRICS_SCHEMA.validate(data)
+        except schema.SchemaError as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: dict[str, Any]) -> Iterable[dict[str, Any]]:
+        """Transform a QC YAML metrics file into a JSON document."""
+        yield {
+            "_index": self.target_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": self._data_file.analysis_id,
+                "metrics": data,
+            },
+        }
+
+
+class SmallvarImporter(JSONBaseImporter):
+    """Importer for SmallVar JSON metrics files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.smallvar_metrics_index
+
+    def _transform(self, data: dict[str, Any]) -> Iterable[dict[str, Any]]:
+        """Transform a SmallVar metrics file into JSON documents."""
+        try:
+            for metric in data["metrics"]:
+                values_count = len(metric["data"][0]["values"])
+
+                metric_id = metric["id"].replace(".", "_").lower()
+
+                for i in range(values_count):
+                    doc = {}
+                    for item in metric["data"]:
+                        # Attribute name should not use '.' as it refers
+                        # to nested objects.
+                        label = item["label"].replace(".", "_")
+                        doc[label] = item["values"][i]
+
+                    yield {
+                        "_index": self.target_index,
+                        "_source": {
+                            "created_at": datetime.now(UTC).isoformat(),
+                            "analysis_id": self._data_file.analysis_id,
+                            "metric_id": metric_id,
+                            "metrics": doc,
+                        },
+                    }
+        except KeyError as e:
+            msg = (
+                f"Smallvar metrics file '{self._data_file.path}' "
+                f"is invalid: missing key {e}."
+            )
+            raise ImporterError(msg) from None
+
+
+class SVImporter(JSONBaseImporter):
+    """Importer for SV JSON metrics files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.sv_metrics_index
+
+    def _validate(self, data: dict[str, Any]) -> None:
+        """Validate the JSON document against the expected schema.
+
+        :raises ImporterError: If the file format is invalid.
+        """
+        try:
+            SV_METRICS_SCHEMA.validate(data)
+        except schema.SchemaError as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: dict[str, Any]) -> Iterable[dict[str, Any]]:
+        """Transform an SV metrics file into a JSON document."""
+        for region in data["regions"]:
+            for result in region["results"]:
+                # Convert all values to float to avoid mapping issues.
+                result["precision"] = float(result["precision"])
+                result["recall"] = float(result["recall"])
+                result["f1"] = float(result["f1"])
+
+        yield {
+            "_index": self.target_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": self._data_file.analysis_id,
+                "metrics": data,
+            },
+        }
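All importers emit Elasticsearch bulk actions with the same envelope: an `_index` name taken from the connector and a `_source` carrying `analysis_id`, `created_at`, and the transformed payload. A standalone illustration of the action built for one coverage row, with hypothetical values (the real index name comes from `es_import_conn.coverage_index`):

from datetime import UTC, datetime

# One TSV row from a coverage file: chromosome, 0-based position, depth.
row = ["chr1", "41196311", "32"]

action = {
    "_index": "coverage",  # hypothetical; resolved from the connector in the real code
    "_source": {
        "analysis_id": "A0001",  # hypothetical analysis ID
        "created_at": datetime.now(UTC).isoformat(),
        "row": {
            "chr": row[0],
            "pos": int(row[1]) + 1,  # positions are shifted to 1-based
            "depth": int(row[2]),
        },
    },
}
print(action)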
genelastic/import_data/logger.py
@@ -56,5 +56,6 @@ def configure_logging(verbose: int, log_file: str | None = None) -> None:
         1: logging.INFO,  # default
         2: logging.DEBUG,  # verbose mode
     }
+    level = level_map.get(verbose)
     # If verbose is greater than 2, set level to TRACE.
-    root.setLevel(level_map.get(verbose, logging.TRACE))  # type: ignore[attr-defined]
+    root.setLevel(level if level else logging.TRACE)  # type: ignore[attr-defined]
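This change keeps the verbosity map but routes any count above 2 to the package's custom TRACE level; the `level if level else ...` fallback works because every mapped level is non-zero. A small sketch of the selection logic (the 0 entry of `level_map` and the numeric value of TRACE are assumptions, not shown in this hunk):

import logging

TRACE = 5  # assumption: the package registers TRACE below DEBUG (10)
level_map = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}

for verbose in (0, 1, 2, 3, 7):
    level = level_map.get(verbose)
    effective = level if level else TRACE  # any verbose > 2 falls back to TRACE
    print(verbose, logging.getLevelName(effective))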
genelastic/import_data/models/__init__.py
File without changes
genelastic/import_data/models/analyses.py
@@ -0,0 +1,178 @@
+import logging
+import typing
+from pathlib import Path
+
+from genelastic.common.types import BundleDict
+from genelastic.import_data.collect import (
+    extract_analysis_metadata,
+)
+from genelastic.import_data.models.analysis import Analysis
+from genelastic.import_data.models.data_file import DataFile
+from genelastic.import_data.models.unique_list import UniqueList
+from genelastic.import_data.resolve import (
+    resolve_filename_pattern,
+    validate_file_prefix,
+)
+
+logger = logging.getLogger("genelastic")
+
+
+class Analyses(UniqueList[Analysis]):
+    """Container of Analysis objects."""
+
+    def get_data_files(self, ext: str | None = None) -> list[DataFile]:
+        """Returns matched files as DataFile objects across all analyses.
+
+        :param ext: Filter the list of matched files by their extension
+            (case-sensitive).
+        """
+        return [df for a in self for df in a.get_data_files(ext=ext)]
+
+    @property
+    def extensions(self) -> set[str]:
+        """Returns all matched file extensions across all analyses."""
+        return {ext for a in self for ext in a.extensions}
+
+    @property
+    def matched_files(self) -> set[Path]:
+        """Returns the set of files that matched the pattern across all
+        analyses.
+        """
+        return {f for a in self for f in a.matched_files}
+
+    @property
+    def unmatched_files(self) -> set[Path]:
+        """Return the set of files that were not matched by any analysis.
+
+        The behavior differs depending on whether analyses share the same
+        ``data_path``:
+
+        - Within the same directory: a file is considered unmatched only if
+          **all** analyses in that directory failed to match it. This is
+          computed as the intersection of their respective ``unmatched_files``
+          sets.
+
+        - Across different directories: unmatched files are simply aggregated
+          (union of sets), since each directory is independent.
+
+        :return: A set of paths corresponding to unmatched files across all
+            analyses.
+        """
+        unmatched_per_dir: dict[Path, set[Path]] = {}
+
+        for a in self:
+            try:
+                unmatched_per_dir[a.data_path] = set.intersection(
+                    unmatched_per_dir[a.data_path], a.unmatched_files
+                )
+            except KeyError:
+                unmatched_per_dir[a.data_path] = a.unmatched_files
+
+        if not unmatched_per_dir.values():
+            return set()
+        return set.union(*unmatched_per_dir.values())
+
+    @classmethod
+    def from_dict(cls, bundle: BundleDict) -> typing.Self:
+        """Initialize an ``Analyses`` container from a single bundle dictionary.
+
+        Expected bundle keys:
+
+        - Mandatory: ``file_prefix``, ``tags``, ``bundle_file``, ``data_path``.
+        - Optional: ``multi_match`` (default: ``False``), ``suffix`` (default: ``None``).
+
+        :param bundle: A dictionary describing one analysis configuration.
+        :raises InvalidFilePrefixError: If the ``file_prefix`` is invalid.
+        :raises FilenamePatternResolveError: If ``multi_match=False`` and some
+            tag fields are missing from the bundle metadata.
+        :raises UniqueListDuplicateError: If two ``Analysis`` objects happen
+            to share the same ID inside the ``Analyses`` instance.
+        :raises DataFileCollectorError: If the ``data_path`` is not an existing
+            directory or if metadata extraction or instantiation of a data file
+            object fails for a given file.
+        :return: An ``Analyses`` instance containing one or several
+            ``Analysis`` objects.
+        """
+        analyses = cls()
+
+        # Validate file prefix structure.
+        logger.info("- Validating file prefix '%s'...", bundle["file_prefix"])
+        validate_file_prefix(
+            file_prefix=bundle["file_prefix"], tags=bundle["tags"]
+        )
+
+        # Resolve the filename pattern. In multi-match mode, tags without
+        # metadata values are accepted. They will be resolved later from
+        # filename-extracted metadata. In single-match mode, a
+        # FilenamePatternResolveError exception will be raised.
+        strict_mode = not bool(bundle.get("multi_match"))
+        logger.info(
+            "- Resolving filename pattern in %s mode...",
+            "strict" if strict_mode else "non-strict",
+        )
+        filename_pattern = resolve_filename_pattern(
+            file_prefix=bundle["file_prefix"],
+            tags=bundle["tags"],
+            metadata=bundle,
+            suffix=bundle.get("suffix"),
+            strict=strict_mode,
+        )
+
+        # Scan the data path to extract metadata from filenames.
+        logger.info(
+            "- Collecting files to extract metadata from using the resolved "
+            "filename pattern."
+        )
+        extracted_metadata = extract_analysis_metadata(
+            data_path=bundle["data_path"],
+            file_prefix=bundle["file_prefix"],
+            tags=bundle["tags"],
+            filename_pattern=filename_pattern,
+        )
+
+        logger.info(
+            "- Extracted metadata from %d analysis(es): %s",
+            len(extracted_metadata.keys()),
+            ", ".join(extracted_metadata.keys()),
+        )
+
+        for analysis_id, metadata in extracted_metadata.items():
+            # For each file match, merge filename-extracted metadata with the
+            # original bundle to describe one analysis.
+            full_metadata = {**bundle, **metadata}
+            full_metadata["analysis_id"] = analysis_id
+
+            # Re-resolve filename pattern in strict mode to let the analysis
+            # collect its own files (all tags should now be defined).
+            full_metadata["filename_pattern"] = resolve_filename_pattern(
+                file_prefix=full_metadata["file_prefix"],
+                tags=full_metadata["tags"],
+                metadata=full_metadata,
+                suffix=full_metadata.get("suffix"),
+                strict=True,
+            )
+
+            # Instantiate the Analysis and add it to the container.
+            analyses.append(Analysis(**full_metadata))
+            logger.info("")
+
+        return analyses
+
+    @classmethod
+    def from_dicts(cls, arr: typing.Sequence[BundleDict]) -> typing.Self:
+        """Initialize an ``Analyses`` container from multiple bundle
+        dictionaries.
+
+        This is a convenience wrapper that calls ``from_dict`` for each
+        bundle in the sequence and concatenates the results.
+
+        :param arr: A sequence of bundle dictionaries.
+        :return: An ``Analyses`` instance containing all analyses from the
+            input bundles.
+        """
+        analyses = cls()
+
+        for bundle in arr:
+            analyses.extend(analyses.from_dict(bundle))
+
+        return analyses
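For context, a hedged sketch of feeding this container from bundle metadata. The keys follow the `from_dict` docstring above; the actual values, the tag structure, and the file-prefix syntax are placeholders, since the bundle format itself is defined elsewhere in the package.

from pathlib import Path

from genelastic.import_data.models.analyses import Analyses

bundle = {
    "file_prefix": "run42",             # hypothetical prefix
    "tags": {"sample": "S1"},           # hypothetical tag structure
    "bundle_file": Path("bundle.yml"),  # bundle the metadata came from
    "data_path": Path("/data/run42"),   # directory scanned for data files
    "multi_match": False,               # optional, defaults to False
}

analyses = Analyses.from_dicts([bundle])
vcf_files = analyses.get_data_files(ext="vcf")  # filter matched files by extension
print(analyses.extensions, len(vcf_files), len(analyses.unmatched_files))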
genelastic/import_data/models/analysis.py
@@ -0,0 +1,144 @@
+import contextlib
+import copy
+import logging
+from collections import defaultdict
+from pathlib import Path
+from types import NotImplementedType
+
+from genelastic.common.types import Metadata
+from genelastic.import_data.collect import (
+    DataFileCollector,
+)
+from genelastic.import_data.constants import (
+    ALLOWED_EXTENSIONS,
+)
+from genelastic.import_data.models.data_file import DataFile
+from genelastic.import_data.patterns import FilenamePattern
+
+logger = logging.getLogger("genelastic")
+
+
+class Analysis:
+    """Class Analysis that represents an analysis."""
+
+    METADATA_INTERNAL_KEYS = frozenset(
+        ["tags", "multi_match", "ext", "file_prefix"]
+    )
+
+    def __init__(
+        self,
+        analysis_id: str,
+        bundle_file: Path,
+        data_path: Path,
+        filename_pattern: FilenamePattern,
+        **metadata: str | int,
+    ) -> None:
+        self._analysis_id = analysis_id
+        self._bundle_file = bundle_file
+        self._data_path = data_path
+        self._metadata = self._remove_internal_keys(metadata)
+        self._data_files_by_ext: dict[str, set[DataFile]] = defaultdict(set)
+
+        logger.info("")
+        logger.info("[ Analysis ID %s ]", self._analysis_id)
+
+        self._collected_files = DataFileCollector(
+            analysis_id,
+            bundle_file,
+            data_path,
+            filename_pattern,
+        ).run()
+
+        for data_file in self._collected_files.data_files:
+            self._data_files_by_ext[data_file.ext].add(data_file)
+
+        logger.info(
+            " -> Extracted %s file extension(s): %s.",
+            len(self._data_files_by_ext.keys()),
+            ", ".join(ext.upper() for ext in self._data_files_by_ext),
+        )
+
+    def __eq__(self, other: object) -> bool | NotImplementedType:
+        """Defines equality comparison for Analysis instances based on their
+        ID.
+        """
+        if isinstance(other, Analysis):
+            return self._analysis_id == other._analysis_id
+        return NotImplemented
+
+    def __lt__(self, other: object) -> bool | NotImplementedType:
+        """Defines sort order for Analysis instances based on their ID."""
+        if isinstance(other, Analysis):
+            return self._analysis_id < other._analysis_id
+        return NotImplemented
+
+    def __str__(self) -> str:
+        return (
+            f"Analysis(id='{self._analysis_id}', "
+            f"bundle_file='{self._bundle_file}', "
+            f"data_path='{self._data_path}', "
+            f"metadata={self._metadata})"
+        )
+
+    @staticmethod
+    def _remove_internal_keys(
+        metadata: Metadata,
+    ) -> Metadata:
+        updated_metadata = metadata.copy()
+
+        for key in Analysis.METADATA_INTERNAL_KEYS:
+            with contextlib.suppress(KeyError):
+                del updated_metadata[key]
+
+        return updated_metadata
+
+    @property
+    def metadata(self) -> Metadata:
+        """Get metadata."""
+        return copy.deepcopy(self._metadata)
+
+    @property
+    def bundle_file(self) -> Path:
+        """Get the bundle file."""
+        return self._bundle_file
+
+    @property
+    def data_path(self) -> Path:
+        """Get the data path specified in the bundle file."""
+        return self._data_path
+
+    @property
+    def id(self) -> str:
+        """Get the analysis ID."""
+        return self._analysis_id
+
+    @property
+    def matched_files(self) -> set[Path]:
+        """Returns the set of files that matched the filename pattern."""
+        return self._collected_files.matched_files
+
+    @property
+    def unmatched_files(self) -> set[Path]:
+        """Returns the set of files that did not match the filename pattern."""
+        return self._collected_files.unmatched_files
+
+    @property
+    def extensions(self) -> set[str]:
+        """Returns all the matched file extensions."""
+        return set(self._data_files_by_ext.keys())
+
+    def get_data_files(self, ext: str | None = None) -> set[DataFile]:
+        """Returns the set of matched files as DataFile objects.
+
+        :param ext: Filter the list of matched files by their extension
+            (case-sensitive).
+        """
+        if ext:
+            if ext not in ALLOWED_EXTENSIONS:
+                msg = f"Unsupported extension {ext}."
+                raise ValueError(msg)
+
+            if ext in self._data_files_by_ext:
+                return self._data_files_by_ext[ext]
+            return set()
+        return {f for value in self._data_files_by_ext.values() for f in value}
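A standalone illustration (not the package API) of the internal-key filtering applied to analysis metadata, mirroring `_remove_internal_keys` above; the sample metadata values are hypothetical.

import contextlib

METADATA_INTERNAL_KEYS = frozenset(["tags", "multi_match", "ext", "file_prefix"])

metadata = {
    "file_prefix": "run42",       # internal key, dropped
    "multi_match": True,          # internal key, dropped
    "sequencer": "novaseq6000",   # hypothetical user metadata, kept
    "flowcell": "FC01",           # hypothetical user metadata, kept
}

cleaned = metadata.copy()
for key in METADATA_INTERNAL_KEYS:
    with contextlib.suppress(KeyError):
        del cleaned[key]

print(cleaned)  # {'sequencer': 'novaseq6000', 'flowcell': 'FC01'}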