genelastic-0.8.0-py3-none-any.whl → genelastic-0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +2 -2
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -20
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +8 -3
- genelastic/api/specification.yml +343 -181
- genelastic/common/__init__.py +0 -44
- genelastic/common/cli.py +48 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +9 -1
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/{cli_gen_data.py → cli/gen_data.py} +4 -4
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +10 -8
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +4 -2
- genelastic/ui/routes.py +289 -42
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +96 -9
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +114 -9
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +205 -2
- genelastic/ui/templates/layout.html +148 -29
- genelastic/ui/templates/version.html +19 -7
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +114 -9
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.8.0.dist-info/METADATA +0 -109
- genelastic-0.8.0.dist-info/RECORD +0 -52
- genelastic-0.8.0.dist-info/WHEEL +0 -5
- genelastic-0.8.0.dist-info/entry_points.txt +0 -8
- genelastic-0.8.0.dist-info/top_level.txt +0 -1
genelastic/import_data/importers/importer_base.py
ADDED

@@ -0,0 +1,131 @@
+import gzip
+import json
+import logging
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from json import JSONDecodeError
+from typing import Any, Generic, TypeVar
+
+import yaml
+
+from genelastic.common.elastic import ElasticImportConn
+from genelastic.import_data.models.data_file import DataFile
+
+logger = logging.getLogger("genelastic")
+
+T = TypeVar("T")
+
+
+class ImporterError(Exception):
+    """An error occurred while loading, validating or transforming a data file
+    into JSON documents.
+    """
+
+
+class BaseImporter(ABC, Generic[T]):
+    """Abstract base class for all importers."""
+
+    def __init__(
+        self,
+        data_file: DataFile,
+        es_import_conn: ElasticImportConn,
+        thread_count: int = 4,
+    ) -> None:
+        self._data_file = data_file
+        self._es_import_conn = es_import_conn
+        self._thread_count = thread_count
+
+        self._cls_name = self.__class__.__name__
+        self._process_file()
+
+    def _process_file(self) -> None:
+        """Process the file before import: load, validate and transform the
+        data into JSON documents.
+        :raises ImporterError: If an error occurs while processing the file.
+        """
+        logger.debug("%s: Loading data...", self._cls_name)
+        data = self._load()
+        logger.debug("%s: Validating data...", self._cls_name)
+        self._validate(data)
+        logger.debug("%s: Transforming data...", self._cls_name)
+        self._documents = self._transform(data)
+
+    def import_docs(self) -> None:
+        """Import the JSON documents into Elasticsearch."""
+        logger.debug("%s: Indexing documents...", self._cls_name)
+        self._es_import_conn.parallel_bulk_import(
+            self._documents, self._thread_count
+        )
+
+    @property
+    @abstractmethod
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+
+    @property
+    def documents(self) -> Iterable[dict[str, Any]]:
+        """Return the documents about to be indexed."""
+        return self._documents
+
+    @abstractmethod
+    def _load(self) -> T:
+        """Load and parse raw data from the file."""
+
+    def _validate(self, data: T) -> None:
+        """Validate the data structure (optional)."""
+
+    @abstractmethod
+    def _transform(self, data: T) -> Iterable[dict[str, Any]]:
+        """Transform raw data into Elasticsearch-ready documents."""
+
+
+class JSONBaseImporter(BaseImporter[Any], ABC):
+    """Base importer to load JSON and gzipped JSON data files."""
+
+    def _load(self) -> Any:  # noqa: ANN401
+        try:
+            if self._data_file.path.suffix == ".gz":
+                logger.debug(
+                    "Opening gzip-compressed file in text mode and "
+                    "reading content...",
+                )
+                with gzip.open(
+                    self._data_file.path, "rt", encoding="utf-8"
+                ) as f:
+                    content = f.read()
+            else:
+                logger.debug(
+                    "Opening uncompressed file in text mode and "
+                    "reading content...",
+                )
+                content = self._data_file.path.read_text(encoding="utf-8")
+        except (OSError, UnicodeDecodeError) as e:
+            raise ImporterError(e) from None
+
+        if not content.strip():
+            msg = f"JSON file '{self._data_file.path}' is empty."
+            raise ImporterError(msg) from None
+
+        try:
+            data = json.loads(content)
+        except JSONDecodeError as e:
+            raise ImporterError(e) from None
+
+        return data
+
+
+class YAMLBaseImporter(BaseImporter[Any], ABC):
+    """Base importer to load YAML data files."""
+
+    def _load(self) -> dict[str, Any]:
+        try:
+            with self._data_file.path.open(encoding="utf-8") as f:
+                doc: dict[str, Any] = yaml.safe_load(f)
+        except (yaml.YAMLError, OSError, UnicodeDecodeError) as e:
+            raise ImporterError(e) from None
+
+        if doc is None:
+            msg = f"YAML file '{self._data_file.path}' is empty."
+            raise ImporterError(msg) from None
+
+        return doc
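The hunk above establishes a template-method pipeline: BaseImporter.__init__ immediately runs _process_file(), which chains _load(), _validate() and _transform(), and import_docs() later streams the resulting documents through parallel_bulk_import(). For orientation, here is a minimal sketch of what a concrete subclass could look like; the class name, the index name and the document fields are illustrative assumptions, not part of this release.

# Hypothetical importer for a line-based file; names and fields are invented.
from collections.abc import Iterable
from typing import Any

from genelastic.import_data.importers.importer_base import BaseImporter, ImporterError


class StatsImporter(BaseImporter[list[str]]):
    """Toy importer: one document per non-empty line of the file."""

    @property
    def target_index(self) -> str:
        # Real importers return an index name exposed by ElasticImportConn.
        return "stats_index"  # placeholder

    def _load(self) -> list[str]:
        try:
            return self._data_file.path.read_text(encoding="utf-8").splitlines()
        except (OSError, UnicodeDecodeError) as e:
            raise ImporterError(e) from None

    def _transform(self, data: list[str]) -> Iterable[dict[str, Any]]:
        for line in data:
            if line.strip():
                yield {"_index": self.target_index, "_source": {"line": line}}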
genelastic/import_data/importers/importer_factory.py
ADDED

@@ -0,0 +1,85 @@
+import logging
+import typing
+from typing import ClassVar, TypedDict
+
+from genelastic.common.elastic import ElasticImportConn
+from genelastic.import_data.importers.importer_base import (
+    BaseImporter,
+    ImporterError,
+)
+from genelastic.import_data.importers.importer_types import (
+    CoverageImporter,
+    QCImporter,
+    SmallvarImporter,
+    SVImporter,
+    VCFImporter,
+)
+from genelastic.import_data.models.data_file import DataFile
+
+logger = logging.getLogger("genelastic")
+
+
+class _ImporterConfig(TypedDict):
+    """Internal configuration mapping an importer class to its supported file
+    extensions.
+    """
+
+    cls: type[BaseImporter[typing.Any]]
+    extensions: set[str]
+
+
+class ImporterFactory:
+    """Factory to create a BaseImporter instance based on the file's
+    extension and type.
+    """
+
+    _importers: ClassVar[dict[str, _ImporterConfig]] = {
+        "vcf": _ImporterConfig(cls=VCFImporter, extensions={"vcf"}),
+        "cov": _ImporterConfig(cls=CoverageImporter, extensions={"cov"}),
+        "qc": _ImporterConfig(cls=QCImporter, extensions={"yaml", "yml"}),
+        "smallvar": _ImporterConfig(cls=SmallvarImporter, extensions={"json"}),
+        "sv": _ImporterConfig(cls=SVImporter, extensions={"json"}),
+    }
+
+    @staticmethod
+    def get_importer(
+        data_file: DataFile,
+        es_import_conn: ElasticImportConn,
+        thread_count: int = 4,
+    ) -> BaseImporter[typing.Any]:
+        """Create an appropriate BaseImporter instance based on the data
+        file's extension and type.
+
+        :param data_file: Data file to process and import.
+        :param es_import_conn: Elasticsearch import connector instance.
+        :param thread_count: Number of threads to use for parallel data file
+            import.
+        :return: An instance of the appropriate BaseImporter subclass.
+        :raises ImporterError: If the data file extension or type is invalid.
+        """
+        try:
+            importer = ImporterFactory._importers[data_file.type]
+        except KeyError:
+            supported_types = sorted(
+                [f"'{i_type}'" for i_type in ImporterFactory._importers]
+            )
+            msg = (
+                f"Data file '{data_file.path.name}': no importer for type "
+                f"'{data_file.type}'. Supported types are: "
+                f"{', '.join(supported_types)}."
+            )
+            raise ImporterError(msg) from None
+
+        if data_file.ext not in importer["extensions"]:
+            supported_exts = sorted(
+                [f"'{ext}'" for ext in importer["extensions"]]
+            )
+            msg = (
+                f"Data file '{data_file.path.name}': extension "
+                f"'{data_file.ext}' not supported by importer "
+                f"{importer['cls'].__name__}. Supported extensions are: "
+                f"{', '.join(supported_exts)}."
+            )
+            raise ImporterError(msg)
+
+        return importer["cls"](data_file, es_import_conn, thread_count)
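Given the mapping above, a caller only needs a DataFile and an ElasticImportConn to obtain the right importer: selection keys on DataFile.type first, then checks DataFile.ext. A hedged sketch of such a call site follows; the import_one helper is hypothetical, and construction of the DataFile and ElasticImportConn arguments is left out because their constructors are not part of this diff.

from genelastic.common.elastic import ElasticImportConn
from genelastic.import_data.importers.importer_base import ImporterError
from genelastic.import_data.importers.importer_factory import ImporterFactory
from genelastic.import_data.models.data_file import DataFile


def import_one(data_file: DataFile, es_conn: ElasticImportConn) -> bool:
    """Try to import a single data file; return False if no importer matched."""
    try:
        importer = ImporterFactory.get_importer(data_file, es_conn, thread_count=4)
    except ImporterError as err:
        # Unknown type, or extension not supported by the selected importer.
        print(f"Skipping '{data_file.path.name}': {err}")
        return False
    importer.import_docs()
    return True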
genelastic/import_data/importers/importer_types.py
ADDED

@@ -0,0 +1,223 @@
+import csv
+import logging
+from collections.abc import Iterable
+from datetime import UTC, datetime
+from typing import Any
+
+import schema
+import vcf
+from vcf.model import _Record
+
+from genelastic.import_data.constants import (
+    QC_METRICS_SCHEMA,
+    SV_METRICS_SCHEMA,
+)
+from genelastic.import_data.importers.importer_base import (
+    BaseImporter,
+    ImporterError,
+    JSONBaseImporter,
+    YAMLBaseImporter,
+)
+
+logger = logging.getLogger("genelastic")
+
+
+class CoverageImporter(BaseImporter[Iterable[list[str]]]):
+    """Importer for coverage files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.coverage_index
+
+    def _load(self) -> Iterable[list[str]]:
+        """Load a TSV formatted coverage file.
+        :raises ImporterError: If the file could not be opened or decoded.
+        """
+        try:
+            with self._data_file.path.open(newline="", encoding="utf-8") as f:
+                reader = csv.reader(f, delimiter="\t")
+                try:
+                    first_row = next(reader)
+                except StopIteration:
+                    msg = f"Coverage file '{self._data_file.path}' is empty."
+                    raise ImporterError(msg) from None
+                yield first_row
+                yield from reader
+        except (OSError, UnicodeDecodeError) as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: Iterable[list[str]]) -> Iterable[dict[str, Any]]:
+        """Transform each coverage file row into a JSON document."""
+        for row in data:
+            yield {
+                "_index": self.target_index,
+                "_source": {
+                    "analysis_id": self._data_file.analysis_id,
+                    "created_at": datetime.now(UTC).isoformat(),
+                    "row": {
+                        "chr": row[0],
+                        "pos": int(row[1]) + 1,
+                        "depth": int(row[2]),
+                    },
+                },
+            }
+
+
+class VCFImporter(BaseImporter[Iterable[_Record]]):
+    """Importer for VCF files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.vcf_variants_index
+
+    def _load(self) -> Iterable[_Record]:
+        """Load a VCF file. GZ compressed VCF files are supported.
+        :raises ImporterError: If the file could not be opened, decoded or is empty.
+        """
+        try:
+            yield from vcf.Reader(
+                filename=str(self._data_file.path), encoding="utf-8"
+            )
+        except StopIteration:
+            msg = f"VCF file '{self._data_file.path}' is empty."
+            raise ImporterError(msg) from None
+        except (OSError, UnicodeDecodeError) as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: Iterable[_Record]) -> Iterable[dict[str, Any]]:
+        """Transform each VCF file record into a JSON document."""
+        for record in data:
+            # Fix values
+            if not record.CHROM.startswith("chr"):
+                if record.CHROM.lower().startswith("chr"):
+                    record.CHROM = "chr" + record.CHROM[3:]
+                else:
+                    record.CHROM = "chr" + record.CHROM
+
+            # Build document
+            alt = [x if x is None else x.type for x in record.ALT]
+
+            yield {
+                "_index": self.target_index,
+                "_source": {
+                    "created_at": datetime.now(UTC).isoformat(),
+                    "analysis_id": self._data_file.analysis_id,
+                    "record": {
+                        "type": "vcf",
+                        "chr": record.CHROM,
+                        "pos": record.POS,
+                        "alt": alt,
+                        "info": record.INFO,
+                    },
+                },
+            }
+
+
+class QCImporter(YAMLBaseImporter):
+    """Importer for QC YAML metrics files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.qc_metrics_index
+
+    def _validate(self, data: dict[str, Any]) -> None:
+        """Validate the YAML document against the expected schema.
+
+        :raises ImporterError: If the file format is invalid.
+        """
+        try:
+            QC_METRICS_SCHEMA.validate(data)
+        except schema.SchemaError as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: dict[str, Any]) -> Iterable[dict[str, Any]]:
+        """Transform a QC YAML metrics file into a JSON document."""
+        yield {
+            "_index": self.target_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": self._data_file.analysis_id,
+                "metrics": data,
+            },
+        }
+
+
+class SmallvarImporter(JSONBaseImporter):
+    """Importer for SmallVar JSON metrics files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.smallvar_metrics_index
+
+    def _transform(self, data: dict[str, Any]) -> Iterable[dict[str, Any]]:
+        """Transform a SmallVar metrics file into JSON documents."""
+        try:
+            for metric in data["metrics"]:
+                values_count = len(metric["data"][0]["values"])
+
+                metric_id = metric["id"].replace(".", "_").lower()
+
+                for i in range(values_count):
+                    doc = {}
+                    for item in metric["data"]:
+                        # Attribute name should not use '.' as it refers
+                        # to nested objects.
+                        label = item["label"].replace(".", "_")
+                        doc[label] = item["values"][i]
+
+                    yield {
+                        "_index": self.target_index,
+                        "_source": {
+                            "created_at": datetime.now(UTC).isoformat(),
+                            "analysis_id": self._data_file.analysis_id,
+                            "metric_id": metric_id,
+                            "metrics": doc,
+                        },
+                    }
+        except KeyError as e:
+            msg = (
+                f"Smallvar metrics file '{self._data_file.path}' "
+                f"is invalid: missing key {e}."
+            )
+            raise ImporterError(msg) from None
+
+
+class SVImporter(JSONBaseImporter):
+    """Importer for SV JSON metrics files."""
+
+    @property
+    def target_index(self) -> str:
+        """Returns the import target index name."""
+        return self._es_import_conn.sv_metrics_index
+
+    def _validate(self, data: dict[str, Any]) -> None:
+        """Validate the YAML document against the expected schema.
+
+        :raises ImporterError: If the file format is invalid.
+        """
+        try:
+            SV_METRICS_SCHEMA.validate(data)
+        except schema.SchemaError as e:
+            raise ImporterError(e) from None
+
+    def _transform(self, data: dict[str, Any]) -> Iterable[dict[str, Any]]:
+        """Transform a SV metrics file into a JSON document."""
+        for region in data["regions"]:
+            for result in region["results"]:
+                # Convert all values to float to avoid mapping issues.
+                result["precision"] = float(result["precision"])
+                result["recall"] = float(result["recall"])
+                result["f1"] = float(result["f1"])
+
+        yield {
+            "_index": self.target_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": self._data_file.analysis_id,
+                "metrics": data,
+            },
+        }
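As a reading aid for the _transform methods above, this is roughly the bulk action CoverageImporter would emit for a TSV row "chr1", "1000", "30": positions are shifted from 0-based to 1-based, and each action carries the analysis ID and a creation timestamp. The index name, analysis ID and timestamp shown here are placeholder values, not output captured from the package.

# Approximate shape of one action yielded by CoverageImporter._transform for
# the row ["chr1", "1000", "30"]; all concrete values below are placeholders.
action = {
    "_index": "coverage_index",                       # from target_index
    "_source": {
        "analysis_id": "example-analysis",            # from DataFile.analysis_id
        "created_at": "2024-01-01T00:00:00+00:00",    # datetime.now(UTC).isoformat()
        "row": {"chr": "chr1", "pos": 1001, "depth": 30},  # pos = 1000 + 1
    },
}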
genelastic/import_data/logger.py
CHANGED

@@ -56,5 +56,6 @@ def configure_logging(verbose: int, log_file: str | None = None) -> None:
         1: logging.INFO,  # default
         2: logging.DEBUG,  # verbose mode
     }
+    level = level_map.get(verbose)
     # If verbose is greater than 2, set level to TRACE.
-    root.setLevel(
+    root.setLevel(level if level else logging.TRACE)  # type: ignore[attr-defined]
|
|
genelastic/import_data/models/analyses.py
ADDED

@@ -0,0 +1,178 @@
+import logging
+import typing
+from pathlib import Path
+
+from genelastic.common.types import BundleDict
+from genelastic.import_data.collect import (
+    extract_analysis_metadata,
+)
+from genelastic.import_data.models.analysis import Analysis
+from genelastic.import_data.models.data_file import DataFile
+from genelastic.import_data.models.unique_list import UniqueList
+from genelastic.import_data.resolve import (
+    resolve_filename_pattern,
+    validate_file_prefix,
+)
+
+logger = logging.getLogger("genelastic")
+
+
+class Analyses(UniqueList[Analysis]):
+    """Container of Analysis objects."""
+
+    def get_data_files(self, ext: str | None = None) -> list[DataFile]:
+        """Returns matched files as DataFile objects across all analyses.
+
+        :param ext: Filter the list of matched files by their extension
+            (case-sensitive).
+        """
+        return [df for a in self for df in a.get_data_files(ext=ext)]
+
+    @property
+    def extensions(self) -> set[str]:
+        """Returns all matched files extensions across all analyses."""
+        return {ext for a in self for ext in a.extensions}
+
+    @property
+    def matched_files(self) -> set[Path]:
+        """Returns the files that matched the pattern across all
+        analyses.
+        """
+        return {f for a in self for f in a.matched_files}
+
+    @property
+    def unmatched_files(self) -> set[Path]:
+        """Return the set of files that were not matched by any analysis.
+
+        The behavior differs depending on whether analyses share the same
+        ``data_path``:
+
+        - Within the same directory: a file is considered unmatched only if
+          **all** analyses in that directory failed to match it. This is
+          computed as the intersection of their respective ``unmatched_files``
+          sets.
+
+        - Across different directories: unmatched files are simply aggregated
+          (union of sets), since each directory is independent.
+
+        :return: A set of paths corresponding to unmatched files across all
+            analyses.
+        """
+        unmatched_per_dir: dict[Path, set[Path]] = {}
+
+        for a in self:
+            try:
+                unmatched_per_dir[a.data_path] = set.intersection(
+                    unmatched_per_dir[a.data_path], a.unmatched_files
+                )
+            except KeyError:
+                unmatched_per_dir[a.data_path] = a.unmatched_files
+
+        if not unmatched_per_dir.values():
+            return set()
+        return set.union(*unmatched_per_dir.values())
+
+    @classmethod
+    def from_dict(cls, bundle: BundleDict) -> typing.Self:
+        """Initialize an ``Analyses`` container from a single bundle dictionary.
+
+        Expected bundle keys:
+
+        - Mandatory: ``file_prefix``, ``tags``, ``bundle_file``, ``data_path``.
+        - Optional: ``multi_match`` (default: ``False``), ``suffix`` (default: ``None``).
+
+        :param bundle: A dictionary describing one analysis configuration.
+        :raises InvalidFilePrefixError: If the ``file_prefix`` is invalid.
+        :raises FilenamePatternResolveError: If ``multi_match=False`` and some
+            tag fields are missing from the bundle metadata.
+        :raises UniqueListDuplicateError: If two ``Analysis`` objects happen
+            to share the same ID inside the ``Analyses`` instance.
+        :raises DataFileCollectorError: If the ``data_path`` is not an existing
+            directory or if metadata extraction or instantiation of a data file
+            object fails for a given file.
+        :return: An ``Analyses`` instance containing one or several
+            ``Analysis`` objects.
+        """
+        analyses = cls()
+
+        # Validate file prefix structure.
+        logger.info("- Validating file prefix '%s'...", bundle["file_prefix"])
+        validate_file_prefix(
+            file_prefix=bundle["file_prefix"], tags=bundle["tags"]
+        )
+
+        # Resolve the filename pattern. In multi-match mode, tags without
+        # metadata values are accepted. They will be resolved later from
+        # filename-extracted metadata. In single-match mode, a
+        # FilenamePatternResolveError exception will be raised.
+        strict_mode = not bool(bundle.get("multi_match"))
+        logger.info(
+            "- Resolving filename pattern in %s mode...",
+            "strict" if strict_mode else "non-strict",
+        )
+        filename_pattern = resolve_filename_pattern(
+            file_prefix=bundle["file_prefix"],
+            tags=bundle["tags"],
+            metadata=bundle,
+            suffix=bundle.get("suffix"),
+            strict=strict_mode,
+        )
+
+        # Scan the data path to extract metadata from filenames.
+        logger.info(
+            "- Collecting files to extract metadata from using the resolved "
+            "filename pattern."
+        )
+        extracted_metadata = extract_analysis_metadata(
+            data_path=bundle["data_path"],
+            file_prefix=bundle["file_prefix"],
+            tags=bundle["tags"],
+            filename_pattern=filename_pattern,
+        )
+
+        logger.info(
+            "- Extracted metadata from %d analysis(es): %s",
+            len(extracted_metadata.keys()),
+            ", ".join(extracted_metadata.keys()),
+        )
+
+        for analysis_id, metadata in extracted_metadata.items():
+            # For each file match, merge filename-extracted metadata with the
+            # original bundle to describe one analysis.
+            full_metadata = {**bundle, **metadata}
+            full_metadata["analysis_id"] = analysis_id
+
+            # Re-resolve filename pattern in strict mode to let the analysis
+            # collect its own files (all tags should now be defined).
+            full_metadata["filename_pattern"] = resolve_filename_pattern(
+                file_prefix=full_metadata["file_prefix"],
+                tags=full_metadata["tags"],
+                metadata=full_metadata,
+                suffix=full_metadata.get("suffix"),
+                strict=True,
+            )
+
+            # Instantiate the Analysis and add it to the container.
+            analyses.append(Analysis(**full_metadata))
+        logger.info("")
+
+        return analyses
+
+    @classmethod
+    def from_dicts(cls, arr: typing.Sequence[BundleDict]) -> typing.Self:
+        """Initialize an ``Analyses`` container from multiple bundle
+        dictionaries.
+
+        This is a convenience wrapper that calls ``from_dict`` for each
+        bundle in the sequence and concatenates the results.
+
+        :param arr: A sequence of bundle dictionaries.
+        :return: An ``Analyses`` instance containing all analyses from the
+            input bundles.
+        """
+        analyses = cls()
+
+        for bundle in arr:
+            analyses.extend(analyses.from_dict(bundle))
+
+        return analyses
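Analyses.from_dict expects a bundle dictionary with the keys listed in its docstring. The sketch below only illustrates that shape: the key names follow the docstring, but the placeholder syntax of file_prefix, the tags structure and the paths are assumptions for illustration, not values taken from this diff, and the actual tag model lives in genelastic/import_data/models/tags.py.

from genelastic.import_data.models.analyses import Analyses

# Illustrative bundle only; values and tag structure are hypothetical.
bundle = {
    "file_prefix": "sample_run",            # real prefixes embed tag placeholders
    "tags": {},                             # actual structure defined by the tags model
    "bundle_file": "/data/bundles/run1.yml",
    "data_path": "/data/run1",              # must be an existing directory
    "multi_match": True,
}

# Would scan data_path and build one Analysis per extracted analysis ID:
# analyses = Analyses.from_dicts([bundle])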