genelastic 0.7.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +18 -0
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -0
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +34 -26
- genelastic/api/settings.py +5 -9
- genelastic/api/specification.yml +512 -0
- genelastic/common/__init__.py +0 -39
- genelastic/common/cli.py +100 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +59 -0
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/cli/gen_data.py +143 -0
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +79 -54
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +20 -0
- genelastic/ui/routes.py +333 -0
- genelastic/ui/server.py +9 -82
- genelastic/ui/settings.py +2 -6
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +98 -0
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +116 -0
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +207 -0
- genelastic/ui/templates/layout.html +153 -0
- genelastic/ui/templates/version.html +21 -0
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +116 -0
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_gen_data.py +0 -116
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.7.0.dist-info/METADATA +0 -105
- genelastic-0.7.0.dist-info/RECORD +0 -40
- genelastic-0.7.0.dist-info/WHEEL +0 -5
- genelastic-0.7.0.dist-info/entry_points.txt +0 -6
- genelastic-0.7.0.dist-info/top_level.txt +0 -1

genelastic/import_data/importers/importer_factory.py
ADDED

@@ -0,0 +1,85 @@
+import logging
+import typing
+from typing import ClassVar, TypedDict
+
+from genelastic.common.elastic import ElasticImportConn
+from genelastic.import_data.importers.importer_base import (
+    BaseImporter,
+    ImporterError,
+)
+from genelastic.import_data.importers.importer_types import (
+    CoverageImporter,
+    QCImporter,
+    SmallvarImporter,
+    SVImporter,
+    VCFImporter,
+)
+from genelastic.import_data.models.data_file import DataFile
+
+logger = logging.getLogger("genelastic")
+
+
+class _ImporterConfig(TypedDict):
+    """Internal configuration mapping an importer class to its supported file
+    extensions.
+    """
+
+    cls: type[BaseImporter[typing.Any]]
+    extensions: set[str]
+
+
+class ImporterFactory:
+    """Factory to create a BaseImporter instance based on the file's
+    extension and type.
+    """
+
+    _importers: ClassVar[dict[str, _ImporterConfig]] = {
+        "vcf": _ImporterConfig(cls=VCFImporter, extensions={"vcf"}),
+        "cov": _ImporterConfig(cls=CoverageImporter, extensions={"cov"}),
+        "qc": _ImporterConfig(cls=QCImporter, extensions={"yaml", "yml"}),
+        "smallvar": _ImporterConfig(cls=SmallvarImporter, extensions={"json"}),
+        "sv": _ImporterConfig(cls=SVImporter, extensions={"json"}),
+    }
+
+    @staticmethod
+    def get_importer(
+        data_file: DataFile,
+        es_import_conn: ElasticImportConn,
+        thread_count: int = 4,
+    ) -> BaseImporter[typing.Any]:
+        """Create an appropriate BaseImporter instance based on the data
+        file's extension and type.
+
+        :param data_file: Data file to process and import.
+        :param es_import_conn: Elasticsearch import connector instance.
+        :param thread_count: Number of threads to use for parallel data file
+            import.
+        :return: An instance of the appropriate BaseImporter subclass.
+        :raises ImporterError: If the data file extension or type is invalid.
+        """
+        try:
+            importer = ImporterFactory._importers[data_file.type]
+        except KeyError:
+            supported_types = sorted(
+                [f"'{i_type}'" for i_type in ImporterFactory._importers]
+            )
+            msg = (
+                f"Data file '{data_file.path.name}': no importer for type "
+                f"'{data_file.type}'. Supported types are: "
+                f"{', '.join(supported_types)}."
+            )
+            raise ImporterError(msg) from None
+
+        if data_file.ext not in importer["extensions"]:
+            supported_exts = sorted(
+                [f"'{ext}'" for ext in importer["extensions"]]
+            )
+            msg = (
+                f"Data file '{data_file.path.name}': extension "
+                f"'{data_file.ext}' not supported by importer "
+                f"{importer['cls'].__name__}. Supported extensions are: "
+                f"{', '.join(supported_exts)}."
+            )
+            raise ImporterError(msg)
+
+        return importer["cls"](data_file, es_import_conn, thread_count)
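
A quick usage sketch of the dispatch above (not part of the package; the stand-in object only mimics the 'type', 'ext' and 'path' attributes that get_importer reads from a real genelastic DataFile, and a real call would pass an ElasticImportConn):

    from pathlib import Path
    from types import SimpleNamespace

    # Hypothetical stand-in for a DataFile.
    qc_file = SimpleNamespace(path=Path("sample1.qc.yml"), type="qc", ext="yml")

    # Type "qc" selects QCImporter, and "yml" is in its {"yaml", "yml"} set,
    # so this would return a QCImporter instance:
    #     ImporterFactory.get_importer(qc_file, es_conn)
    # An unknown type (e.g. "bam") raises ImporterError listing the supported
    # types; a "qc" file with extension "json" raises ImporterError naming
    # QCImporter's supported extensions.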

genelastic/import_data/importers/importer_types.py
ADDED

@@ -0,0 +1,223 @@
+import csv
+import logging
+from collections.abc import Iterable
+from datetime import UTC, datetime
+from typing import Any
+
+import schema
+import vcf
+from vcf.model import _Record
+
+from genelastic.import_data.constants import (
+    QC_METRICS_SCHEMA,
+    SV_METRICS_SCHEMA,
+)
+from genelastic.import_data.importers.importer_base import (
+    BaseImporter,
+    ImporterError,
+    JSONBaseImporter,
+    YAMLBaseImporter,
+)
+
+logger = logging.getLogger("genelastic")
+
+
+class CoverageImporter(BaseImporter[Iterable[list[str]]]):
+    """Importer for coverage files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.coverage_index
+
+    def _load(self) -> Iterable[list[str]]:
+        """Load a TSV formatted coverage file.
+        :raises ImporterError: If the file could not be opened or decoded.
+        """
+        try:
+            with self._data_file.path.open(newline="", encoding="utf-8") as f:
+                reader = csv.reader(f, delimiter="\t")
+                try:
+                    first_row = next(reader)
+                except StopIteration:
+                    msg = f"Coverage file '{self._data_file.path}' is empty."
+                    raise ImporterError(msg) from None
+                yield first_row
+                yield from reader
+        except (OSError, UnicodeDecodeError) as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: Iterable[list[str]]) -> Iterable[dict[str, Any]]:
+        """Transform each coverage file row into a JSON document."""
+        for row in data:
+            yield {
+                "_index": self.target_index,
+                "_source": {
+                    "analysis_id": self._data_file.analysis_id,
+                    "created_at": datetime.now(UTC).isoformat(),
+                    "row": {
+                        "chr": row[0],
+                        "pos": int(row[1]) + 1,
+                        "depth": int(row[2]),
+                    },
+                },
+            }
+
+
+class VCFImporter(BaseImporter[Iterable[_Record]]):
+    """Importer for VCF files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.vcf_variants_index
+
+    def _load(self) -> Iterable[_Record]:
+        """Load a VCF file. GZ compressed VCF files are supported.
+        :raises ImporterError: If the file could not be opened or decoded, or is empty.
+        """
+        try:
+            yield from vcf.Reader(
+                filename=str(self._data_file.path), encoding="utf-8"
+            )
+        except StopIteration:
+            msg = f"VCF file '{self._data_file.path}' is empty."
+            raise ImporterError(msg) from None
+        except (OSError, UnicodeDecodeError) as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: Iterable[_Record]) -> Iterable[dict[str, Any]]:
+        """Transform each VCF file record into a JSON document."""
+        for record in data:
+            # Fix values
+            if not record.CHROM.startswith("chr"):
+                if record.CHROM.lower().startswith("chr"):
+                    record.CHROM = "chr" + record.CHROM[3:]
+                else:
+                    record.CHROM = "chr" + record.CHROM
+
+            # Build document
+            alt = [x if x is None else x.type for x in record.ALT]
+
+            yield {
+                "_index": self.target_index,
+                "_source": {
+                    "created_at": datetime.now(UTC).isoformat(),
+                    "analysis_id": self._data_file.analysis_id,
+                    "record": {
+                        "type": "vcf",
+                        "chr": record.CHROM,
+                        "pos": record.POS,
+                        "alt": alt,
+                        "info": record.INFO,
+                    },
+                },
+            }
+
+
+class QCImporter(YAMLBaseImporter):
+    """Importer for QC YAML metrics files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.qc_metrics_index
+
+    def _validate(self, data: dict[str, Any]) -> None:
+        """Validate the YAML document against the expected schema.
+
+        :raises ImporterError: If the file format is invalid.
+        """
+        try:
+            QC_METRICS_SCHEMA.validate(data)
+        except schema.SchemaError as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: dict[str, Any]) -> Iterable[dict[str, Any]]:
+        """Transform a QC YAML metrics file into a JSON document."""
+        yield {
+            "_index": self.target_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": self._data_file.analysis_id,
+                "metrics": data,
+            },
+        }
+
+
+class SmallvarImporter(JSONBaseImporter):
+    """Importer for SmallVar JSON metrics files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.smallvar_metrics_index
+
+    def _transform(self, data: dict[str, Any]) -> Iterable[dict[str, Any]]:
+        """Transform a SmallVar metrics file into JSON documents."""
+        try:
+            for metric in data["metrics"]:
+                values_count = len(metric["data"][0]["values"])
+
+                metric_id = metric["id"].replace(".", "_").lower()
+
+                for i in range(values_count):
+                    doc = {}
+                    for item in metric["data"]:
+                        # Attribute name should not use '.' as it refers
+                        # to nested objects.
+                        label = item["label"].replace(".", "_")
+                        doc[label] = item["values"][i]
+
+                    yield {
+                        "_index": self.target_index,
+                        "_source": {
+                            "created_at": datetime.now(UTC).isoformat(),
+                            "analysis_id": self._data_file.analysis_id,
+                            "metric_id": metric_id,
+                            "metrics": doc,
+                        },
+                    }
+        except KeyError as e:
+            msg = (
+                f"Smallvar metrics file '{self._data_file.path}' "
+                f"is invalid: missing key {e}."
+            )
+            raise ImporterError(msg) from None
+
+
+class SVImporter(JSONBaseImporter):
+    """Importer for SV JSON metrics files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.sv_metrics_index
+
+    def _validate(self, data: dict[str, Any]) -> None:
+        """Validate the JSON document against the expected schema.
+
+        :raises ImporterError: If the file format is invalid.
+        """
+        try:
+            SV_METRICS_SCHEMA.validate(data)
+        except schema.SchemaError as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: dict[str, Any]) -> Iterable[dict[str, Any]]:
+        """Transform an SV metrics file into a JSON document."""
+        for region in data["regions"]:
+            for result in region["results"]:
+                # Convert all values to float to avoid mapping issues.
+                result["precision"] = float(result["precision"])
+                result["recall"] = float(result["recall"])
+                result["f1"] = float(result["f1"])
+
+        yield {
+            "_index": self.target_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": self._data_file.analysis_id,
+                "metrics": data,
+            },
+        }
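
SmallvarImporter._transform above pivots columnar metric data (parallel "label"/"values" lists) into one document per value index. A standalone sketch of that pivot, with an input shape inferred from the code rather than taken from a real metrics file:

    metric = {
        "id": "smallvar.counts",
        "data": [
            {"label": "region.name", "values": ["exonic", "intronic"]},
            {"label": "snv", "values": [1200, 3400]},
        ],
    }

    metric_id = metric["id"].replace(".", "_").lower()  # "smallvar_counts"
    docs = [
        # '.' is replaced in labels because Elasticsearch reads dotted field
        # names as nested objects.
        {item["label"].replace(".", "_"): item["values"][i] for item in metric["data"]}
        for i in range(len(metric["data"][0]["values"]))
    ]
    # docs == [{'region_name': 'exonic', 'snv': 1200},
    #          {'region_name': 'intronic', 'snv': 3400}]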

genelastic/import_data/logger.py
CHANGED

@@ -56,5 +56,6 @@ def configure_logging(verbose: int, log_file: str | None = None) -> None:
         1: logging.INFO,  # default
         2: logging.DEBUG,  # verbose mode
     }
+    level = level_map.get(verbose)
     # If verbose is greater than 2, set level to TRACE.
-    root.setLevel(
+    root.setLevel(level if level else logging.TRACE)  # type: ignore[attr-defined]
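
The rewritten tail of configure_logging resolves the -v count through level_map and falls back to the package's custom TRACE level for any count missing from the map. A minimal sketch of that fallback (TRACE's numeric value is not shown in this diff, so 5 is an assumption):

    import logging

    TRACE = 5  # assumed value of the custom level registered elsewhere
    level_map = {1: logging.INFO, 2: logging.DEBUG}  # 0-entry elided in the hunk

    for verbose in (1, 2, 3, 4):
        level = level_map.get(verbose)
        print(verbose, level if level else TRACE)  # 3 and 4 both yield TRACE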

genelastic/import_data/models/__init__.py
File without changes

genelastic/import_data/models/analyses.py
ADDED

@@ -0,0 +1,178 @@
+import logging
+import typing
+from pathlib import Path
+
+from genelastic.common.types import BundleDict
+from genelastic.import_data.collect import (
+    extract_analysis_metadata,
+)
+from genelastic.import_data.models.analysis import Analysis
+from genelastic.import_data.models.data_file import DataFile
+from genelastic.import_data.models.unique_list import UniqueList
+from genelastic.import_data.resolve import (
+    resolve_filename_pattern,
+    validate_file_prefix,
+)
+
+logger = logging.getLogger("genelastic")
+
+
+class Analyses(UniqueList[Analysis]):
+    """Container of Analysis objects."""
+
+    def get_data_files(self, ext: str | None = None) -> list[DataFile]:
+        """Returns matched files as DataFile objects across all analyses.
+
+        :param ext: Filter the list of matched files by their extension
+            (case-sensitive).
+        """
+        return [df for a in self for df in a.get_data_files(ext=ext)]
+
+    @property
+    def extensions(self) -> set[str]:
+        """Returns all matched files' extensions across all analyses."""
+        return {ext for a in self for ext in a.extensions}
+
+    @property
+    def matched_files(self) -> set[Path]:
+        """Returns the set of files that matched the pattern across all
+        analyses.
+        """
+        return {f for a in self for f in a.matched_files}
+
+    @property
+    def unmatched_files(self) -> set[Path]:
+        """Return the set of files that were not matched by any analysis.
+
+        The behavior differs depending on whether analyses share the same
+        ``data_path``:
+
+        - Within the same directory: a file is considered unmatched only if
+          **all** analyses in that directory failed to match it. This is
+          computed as the intersection of their respective ``unmatched_files``
+          sets.
+
+        - Across different directories: unmatched files are simply aggregated
+          (union of sets), since each directory is independent.
+
+        :return: A set of paths corresponding to unmatched files across all
+            analyses.
+        """
+        unmatched_per_dir: dict[Path, set[Path]] = {}
+
+        for a in self:
+            try:
+                unmatched_per_dir[a.data_path] = set.intersection(
+                    unmatched_per_dir[a.data_path], a.unmatched_files
+                )
+            except KeyError:
+                unmatched_per_dir[a.data_path] = a.unmatched_files
+
+        if not unmatched_per_dir.values():
+            return set()
+        return set.union(*unmatched_per_dir.values())
+
+    @classmethod
+    def from_dict(cls, bundle: BundleDict) -> typing.Self:
+        """Initialize an ``Analyses`` container from a single bundle dictionary.
+
+        Expected bundle keys:
+
+        - Mandatory: ``file_prefix``, ``tags``, ``bundle_file``, ``data_path``.
+        - Optional: ``multi_match`` (default: ``False``), ``suffix`` (default: ``None``).
+
+        :param bundle: A dictionary describing one analysis configuration.
+        :raises InvalidFilePrefixError: If the ``file_prefix`` is invalid.
+        :raises FilenamePatternResolveError: If ``multi_match=False`` and some
+            tag fields are missing from the bundle metadata.
+        :raises UniqueListDuplicateError: If two ``Analysis`` objects happen
+            to share the same ID inside the ``Analyses`` instance.
+        :raises DataFileCollectorError: If the ``data_path`` is not an existing
+            directory or if metadata extraction or instantiation of a data file
+            object fails for a given file.
+        :return: An ``Analyses`` instance containing one or several
+            ``Analysis`` objects.
+        """
+        analyses = cls()
+
+        # Validate file prefix structure.
+        logger.info("- Validating file prefix '%s'...", bundle["file_prefix"])
+        validate_file_prefix(
+            file_prefix=bundle["file_prefix"], tags=bundle["tags"]
+        )
+
+        # Resolve the filename pattern. In multi-match mode, tags without
+        # metadata values are accepted. They will be resolved later from
+        # filename-extracted metadata. In single-match mode, a
+        # FilenamePatternResolveError exception will be raised.
+        strict_mode = not bool(bundle.get("multi_match"))
+        logger.info(
+            "- Resolving filename pattern in %s mode...",
+            "strict" if strict_mode else "non-strict",
+        )
+        filename_pattern = resolve_filename_pattern(
+            file_prefix=bundle["file_prefix"],
+            tags=bundle["tags"],
+            metadata=bundle,
+            suffix=bundle.get("suffix"),
+            strict=strict_mode,
+        )
+
+        # Scan the data path to extract metadata from filenames.
+        logger.info(
+            "- Collecting files to extract metadata from using the resolved "
+            "filename pattern."
+        )
+        extracted_metadata = extract_analysis_metadata(
+            data_path=bundle["data_path"],
+            file_prefix=bundle["file_prefix"],
+            tags=bundle["tags"],
+            filename_pattern=filename_pattern,
+        )
+
+        logger.info(
+            "- Extracted metadata from %d analysis(es): %s",
+            len(extracted_metadata.keys()),
+            ", ".join(extracted_metadata.keys()),
+        )
+
+        for analysis_id, metadata in extracted_metadata.items():
+            # For each file match, merge filename-extracted metadata with the
+            # original bundle to describe one analysis.
+            full_metadata = {**bundle, **metadata}
+            full_metadata["analysis_id"] = analysis_id
+
+            # Re-resolve filename pattern in strict mode to let the analysis
+            # collect its own files (all tags should now be defined).
+            full_metadata["filename_pattern"] = resolve_filename_pattern(
+                file_prefix=full_metadata["file_prefix"],
+                tags=full_metadata["tags"],
+                metadata=full_metadata,
+                suffix=full_metadata.get("suffix"),
+                strict=True,
+            )
+
+            # Instantiate the Analysis and add it to the container.
+            analyses.append(Analysis(**full_metadata))
+        logger.info("")
+
+        return analyses
+
+    @classmethod
+    def from_dicts(cls, arr: typing.Sequence[BundleDict]) -> typing.Self:
+        """Initialize an ``Analyses`` container from multiple bundle
+        dictionaries.
+
+        This is a convenience wrapper that calls ``from_dict`` for each
+        bundle in the sequence and concatenates the results.
+
+        :param arr: A sequence of bundle dictionaries.
+        :return: An ``Analyses`` instance containing all analyses from the
+            input bundles.
+        """
+        analyses = cls()
+
+        for bundle in arr:
+            analyses.extend(analyses.from_dict(bundle))
+
+        return analyses
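
The unmatched_files property above intersects per directory and unions across directories. A standalone sketch of that set algebra with made-up paths:

    from pathlib import Path

    per_analysis = [
        (Path("/run1"), {Path("/run1/a.vcf"), Path("/run1/b.cov")}),
        (Path("/run1"), {Path("/run1/b.cov")}),   # same dir: intersect
        (Path("/run2"), {Path("/run2/c.json")}),  # other dir: union
    ]

    unmatched_per_dir: dict[Path, set[Path]] = {}
    for data_path, unmatched in per_analysis:
        if data_path in unmatched_per_dir:
            unmatched_per_dir[data_path] = set.intersection(
                unmatched_per_dir[data_path], unmatched
            )
        else:
            unmatched_per_dir[data_path] = unmatched

    result = set.union(*unmatched_per_dir.values()) if unmatched_per_dir else set()
    # result == {Path('/run1/b.cov'), Path('/run2/c.json')}: b.cov was missed
    # by both /run1 analyses, while /run2's only miss is carried over as-is.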

genelastic/import_data/models/analysis.py
ADDED

@@ -0,0 +1,144 @@
+import contextlib
+import copy
+import logging
+from collections import defaultdict
+from pathlib import Path
+from types import NotImplementedType
+
+from genelastic.common.types import Metadata
+from genelastic.import_data.collect import (
+    DataFileCollector,
+)
+from genelastic.import_data.constants import (
+    ALLOWED_EXTENSIONS,
+)
+from genelastic.import_data.models.data_file import DataFile
+from genelastic.import_data.patterns import FilenamePattern
+
+logger = logging.getLogger("genelastic")
+
+
+class Analysis:
+    """Represents a single analysis."""
+
+    METADATA_INTERNAL_KEYS = frozenset(
+        ["tags", "multi_match", "ext", "file_prefix"]
+    )
+
+    def __init__(
+        self,
+        analysis_id: str,
+        bundle_file: Path,
+        data_path: Path,
+        filename_pattern: FilenamePattern,
+        **metadata: str | int,
+    ) -> None:
+        self._analysis_id = analysis_id
+        self._bundle_file = bundle_file
+        self._data_path = data_path
+        self._metadata = self._remove_internal_keys(metadata)
+        self._data_files_by_ext: dict[str, set[DataFile]] = defaultdict(set)
+
+        logger.info("")
+        logger.info("[ Analysis ID %s ]", self._analysis_id)
+
+        self._collected_files = DataFileCollector(
+            analysis_id,
+            bundle_file,
+            data_path,
+            filename_pattern,
+        ).run()
+
+        for data_file in self._collected_files.data_files:
+            self._data_files_by_ext[data_file.ext].add(data_file)
+
+        logger.info(
+            " -> Extracted %s file extension(s): %s.",
+            len(self._data_files_by_ext.keys()),
+            ", ".join(ext.upper() for ext in self._data_files_by_ext),
+        )
+
+    def __eq__(self, other: object) -> bool | NotImplementedType:
+        """Defines equality comparison for Analysis instances based on their
+        ID.
+        """
+        if isinstance(other, Analysis):
+            return self._analysis_id == other._analysis_id
+        return NotImplemented
+
+    def __lt__(self, other: object) -> bool | NotImplementedType:
+        """Defines sort order for Analysis instances based on their ID."""
+        if isinstance(other, Analysis):
+            return self._analysis_id < other._analysis_id
+        return NotImplemented
+
+    def __str__(self) -> str:
+        return (
+            f"Analysis(id='{self._analysis_id}', "
+            f"bundle_file='{self._bundle_file}', "
+            f"data_path='{self._data_path}', "
+            f"metadata={self._metadata})"
+        )
+
+    @staticmethod
+    def _remove_internal_keys(
+        metadata: Metadata,
+    ) -> Metadata:
+        updated_metadata = metadata.copy()
+
+        for key in Analysis.METADATA_INTERNAL_KEYS:
+            with contextlib.suppress(KeyError):
+                del updated_metadata[key]
+
+        return updated_metadata
+
+    @property
+    def metadata(self) -> Metadata:
+        """Get metadata."""
+        return copy.deepcopy(self._metadata)
+
+    @property
+    def bundle_file(self) -> Path:
+        """Get the bundle file."""
+        return self._bundle_file
+
+    @property
+    def data_path(self) -> Path:
+        """Get the data path specified in the bundle file."""
+        return self._data_path
+
+    @property
+    def id(self) -> str:
+        """Get the analysis ID."""
+        return self._analysis_id
+
+    @property
+    def matched_files(self) -> set[Path]:
+        """Returns the set of files that matched the filename pattern."""
+        return self._collected_files.matched_files
+
+    @property
+    def unmatched_files(self) -> set[Path]:
+        """Returns the set of files that did not match the filename pattern."""
+        return self._collected_files.unmatched_files
+
+    @property
+    def extensions(self) -> set[str]:
+        """Returns all the matched files' extensions."""
+        return set(self._data_files_by_ext.keys())
+
+    def get_data_files(self, ext: str | None = None) -> set[DataFile]:
+        """Returns the set of matched files as DataFile objects.
+
+        :param ext: Filter the set of matched files by their extension
+            (case-sensitive).
+        """
+        if ext:
+            if ext not in ALLOWED_EXTENSIONS:
+                msg = f"Unsupported extension {ext}."
+                raise ValueError(msg)
+
+            if ext in self._data_files_by_ext:
+                return self._data_files_by_ext[ext]
+            return set()
+        return {f for value in self._data_files_by_ext.values() for f in value}
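
Analysis keeps only user-facing bundle fields as metadata; keys that merely drive file collection are stripped by _remove_internal_keys. A standalone check of that scrubbing (the key names are the real METADATA_INTERNAL_KEYS; the bundle field values are made up):

    import contextlib

    METADATA_INTERNAL_KEYS = frozenset(["tags", "multi_match", "ext", "file_prefix"])

    metadata = {
        "file_prefix": "prefix",       # internal: stripped
        "multi_match": True,           # internal: stripped
        "tags": ["sample", "run"],     # internal: stripped
        "sample": "S1",                # kept as analysis metadata
        "sequencer": "novaseq6000",    # kept as analysis metadata
    }

    cleaned = metadata.copy()
    for key in METADATA_INTERNAL_KEYS:
        with contextlib.suppress(KeyError):  # "ext" is absent here: ignored
            del cleaned[key]

    # cleaned == {'sample': 'S1', 'sequencer': 'novaseq6000'}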