genelastic 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +2 -2
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -20
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +8 -3
- genelastic/api/specification.yml +343 -181
- genelastic/common/__init__.py +0 -44
- genelastic/common/cli.py +48 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +9 -1
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/{cli_gen_data.py → cli/gen_data.py} +4 -4
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +10 -8
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +4 -2
- genelastic/ui/routes.py +289 -42
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +96 -9
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +114 -9
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +205 -2
- genelastic/ui/templates/layout.html +148 -29
- genelastic/ui/templates/version.html +19 -7
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +114 -9
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.8.0.dist-info/METADATA +0 -109
- genelastic-0.8.0.dist-info/RECORD +0 -52
- genelastic-0.8.0.dist-info/WHEEL +0 -5
- genelastic-0.8.0.dist-info/entry_points.txt +0 -8
- genelastic-0.8.0.dist-info/top_level.txt +0 -1
genelastic/common/types.py
CHANGED
@@ -4,20 +4,7 @@ import typing
 Bucket: typing.TypeAlias = dict[str, dict[typing.Any, typing.Any]]
 BundleDict: typing.TypeAlias = dict[str, typing.Any]
 
-
-WetProcessesData: typing.TypeAlias = dict[str, str | int | float]
-BioInfoProcessData: typing.TypeAlias = dict[str, str | list[str]]
-
-AnalysisDocument: typing.TypeAlias = dict[str, str | None | AnalysisMetaData]
-MetadataDocument: typing.TypeAlias = dict[
-    str, int | str | list[typing.Any | None]
-]
-ProcessDocument: typing.TypeAlias = (
-    dict[str, str] | WetProcessesData | BioInfoProcessData
-)
-BulkItems: typing.TypeAlias = list[
-    dict[str, str | MetadataDocument | AnalysisDocument | ProcessDocument]
-]
+Metadata: typing.TypeAlias = dict[str, str | int]
 
 # Types related to random bundle generation.
 RandomBiProcessData: typing.TypeAlias = dict[str, str | list[dict[str, str]]]
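Six document aliases are collapsed into a single flat `Metadata` alias. As a quick illustration (the variable names below are ours, not the package's), the new alias admits flat string/integer documents only:

    import typing

    Metadata: typing.TypeAlias = dict[str, str | int]

    # A flat key/value document type-checks against the new alias:
    ok: Metadata = {"sample_name": "S001", "read_length": 150}

    # Nested values, as the removed MetadataDocument alias allowed, do not:
    # bad: Metadata = {"tags": ["wgs", "hg38"]}  # rejected by a type checker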
genelastic/import_data/__init__.py
DELETED
@@ -1,27 +0,0 @@
-"""Genelastic package for importing Genomic data into Elasticsearch."""
-
-from .analysis import Analysis
-from .import_bundle import ImportBundle
-from .import_bundle_factory import (
-    load_import_bundle_file,
-    make_import_bundle_from_files,
-)
-from .random_bundle import (
-    RandomAnalysis,
-    RandomBiProcess,
-    RandomBundle,
-    RandomWetProcess,
-)
-from .tags import Tags
-
-__all__ = [
-    "Analysis",
-    "ImportBundle",
-    "RandomAnalysis",
-    "RandomBiProcess",
-    "RandomBundle",
-    "RandomWetProcess",
-    "Tags",
-    "load_import_bundle_file",
-    "make_import_bundle_from_files",
-]
genelastic/import_data/checker.py
ADDED
@@ -0,0 +1,99 @@
+import logging
+
+from genelastic.common.elastic import ElasticQueryConn
+from genelastic.import_data.checker_observer import CheckerObserver
+from genelastic.import_data.models.analyses import Analyses
+from genelastic.import_data.models.processes import Processes
+
+logger = logging.getLogger("genelastic")
+
+
+class Checker:
+    """Validate coherence between YAML metadata and Elasticsearch,
+    using a project-specific observer mechanism.
+    """
+
+    def __init__(self, es: ElasticQueryConn, *, strict: bool = False) -> None:
+        """Initialize the Checker.
+
+        Args:
+            es: Elasticsearch connection instance.
+            strict: Treat ES-only entries as errors when True.
+        """
+        self.es = es
+        self.strict = strict
+        self.errors_detected = False
+        self._observers: list[CheckerObserver] = []
+
+    def attach(self, observer: CheckerObserver) -> None:
+        """Register an observer to receive Checker notifications."""
+        self._observers.append(observer)
+
+    def detach(self, observer: CheckerObserver) -> None:
+        """Unregister an observer so it no longer receives notifications."""
+        self._observers.remove(observer)
+
+    def _notify_missing(self, label: str, missing: list[str]) -> None:
+        """Notify observers about missing IDs."""
+        self.errors_detected = True
+        for obs in self._observers:
+            obs.notify_missing(label, missing)
+
+    def _notify_extra(self, label: str, extra: list[str]) -> None:
+        """Notify observers about extra IDs."""
+        self.errors_detected = True
+        for obs in self._observers:
+            obs.notify_extra(label, extra)
+
+    def _check_generic(
+        self, label: str, ids_yaml: set[str], ids_es: set[str]
+    ) -> None:
+        """Compare YAML IDs vs Elasticsearch IDs for a given entity type."""
+        logger.info("Checking %s...", label)
+
+        missing = sorted(ids_yaml - ids_es)
+        extra = sorted(ids_es - ids_yaml)
+
+        if missing:
+            logger.error("Missing %s in ES: %s", label, missing)
+            self._notify_missing(label, missing)
+
+        if extra:
+            if self.strict:
+                logger.error(
+                    "%s in ES but missing from YAML: %s",
+                    label.capitalize(),
+                    extra,
+                )
+                self._notify_extra(label, extra)
+            else:
+                logger.info("Extra %s ignored (non-strict mode).", label)
+
+        if not missing and (self.strict and not extra):
+            logger.info("OK ✓ All %s match exactly.", label)
+        elif not missing and not self.strict:
+            logger.info("OK ✓ YAML %s present (extra ignored).", label)
+
+    def check_analyses(self, analyses: Analyses) -> None:
+        """Check analysis IDs between YAML and Elasticsearch."""
+        ids_yaml = {a.id for a in analyses}
+        ids_es = set(
+            self.es.get_field_values(self.es.data_files_index, "analysis_id")
+        )
+        self._check_generic("analyses", ids_yaml, ids_es)
+
+    def check_wet_processes(self, processes: Processes) -> None:
+        """Check wet process IDs between YAML and Elasticsearch."""
+        ids_yaml = set(processes.keys())
+        ids_es = set(
+            self.es.get_field_values(self.es.wet_processes_index, "proc_id")
+        )
+        self._check_generic("wet processes", ids_yaml, ids_es)
+
+    def check_bi_processes(self, processes: Processes) -> None:
+        """Check biological process IDs between YAML and Elasticsearch."""
+        ids_yaml = set(processes.keys())
+        ids_es = set(
+            self.es.get_field_values(self.es.bi_processes_index, "proc_id")
+        )
+        self._check_generic("bi processes", ids_yaml, ids_es)
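The core of `_check_generic` above is plain set arithmetic. A standalone sketch of the classification it performs (the sample IDs are illustrative):

    ids_yaml = {"A001", "A002", "A003"}  # IDs declared in the YAML bundles
    ids_es = {"A002", "A003", "A004"}    # IDs found in Elasticsearch

    missing = sorted(ids_yaml - ids_es)  # ['A001']: always reported as an error
    extra = sorted(ids_es - ids_yaml)    # ['A004']: an error only in strict mode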
genelastic/import_data/checker_observer.py
ADDED
@@ -0,0 +1,13 @@
+from typing import Protocol
+
+
+class CheckerObserver(Protocol):
+    """Protocol for classes observing Checker events."""
+
+    def notify_missing(self, label: str, missing: list[str]) -> None:
+        """Called when expected IDs are missing in Elasticsearch."""
+        ...
+
+    def notify_extra(self, label: str, extra: list[str]) -> None:
+        """Called when unexpected IDs exist in Elasticsearch."""
+        ...
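Putting the two new files together, attaching an observer looks like the sketch below. The positional connection arguments mirror the `ElasticQueryConn` call in cli_check.py further down; every concrete value here is a placeholder:

    from genelastic.common.elastic import ElasticQueryConn
    from genelastic.import_data.checker import Checker


    class PrintObserver:
        """Structurally satisfies CheckerObserver; no inheritance needed."""

        def notify_missing(self, label: str, missing: list[str]) -> None:
            print(f"Missing {label}: {missing}")

        def notify_extra(self, label: str, extra: list[str]) -> None:
            print(f"Extra {label}: {extra}")


    es = ElasticQueryConn(
        "https://localhost:9200",          # placeholder address
        "0A:1B:2C",                        # placeholder certificate fingerprint
        "genelastic",                      # placeholder index prefix
        basic_auth=("elastic", "secret"),  # placeholder credentials
    )
    checker = Checker(es, strict=True)
    checker.attach(PrintObserver())
    # checker.check_analyses(bundle.analyses) now reports through PrintObserver
    # and flips checker.errors_detected on any mismatch.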
genelastic/import_data/cli/__init__.py
ADDED
File without changes (empty file).
genelastic/import_data/cli/cli_check.py
ADDED
@@ -0,0 +1,136 @@
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+from genelastic.common.cli import (
+    add_es_connection_args,
+    add_verbose_control_args,
+    add_version_arg,
+)
+from genelastic.common.elastic import ElasticQueryConn
+from genelastic.import_data.checker import Checker
+from genelastic.import_data.import_bundle_factory import (
+    make_import_bundle_from_files,
+)
+from genelastic.import_data.logger import configure_logging
+
+logger = logging.getLogger("genelastic")
+logging.getLogger("elastic_transport").setLevel(logging.WARNING)
+
+
+class CLICheckObserver:
+    """Observer used by the CLI to log Checker errors."""
+
+    def __init__(self) -> None:
+        self._logger = logger
+
+    def notify_missing(self, label: str, missing: list[str]) -> None:
+        """Handle missing IDs by logging an error."""
+        self._logger.error("[CHECKER] Missing %s in ES: %s", label, missing)
+
+    def notify_extra(self, label: str, extra: list[str]) -> None:
+        """Handle extra IDs by logging an error."""
+        self._logger.error("[CHECKER] Extra %s in ES: %s", label, extra)
+
+
+def read_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Check database coherency against one or more YAML bundles.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    add_version_arg(parser)
+    add_verbose_control_args(parser)
+    add_es_connection_args(parser)
+
+    parser.add_argument(
+        "files",
+        type=Path,
+        nargs="+",
+        help="Paths to YAML bundle files to validate.",
+    )
+
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help=(
+            "Enable strict mode: also report entries present in Elasticsearch "
+            "but missing from YAML bundles."
+        ),
+    )
+
+    parser.add_argument(
+        "-A",
+        "--check-analyses",
+        action="store_true",
+        help="Check only analyses coherence.",
+    )
+
+    parser.add_argument(
+        "-W",
+        "--check-wet",
+        action="store_true",
+        help="Check only wet processes coherence.",
+    )
+
+    parser.add_argument(
+        "-B",
+        "--check-bi",
+        action="store_true",
+        help="Check only biological processes coherence.",
+    )
+
+    parser.add_argument(
+        "-X",
+        "--all",
+        action="store_true",
+        help="Check all entities (analyses, wet processes and bi processes).",
+    )
+
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = read_args()
+    configure_logging(args.verbose)
+
+    logger.info(
+        "Connecting to Elasticsearch at https://%s:%s ...",
+        args.es_host,
+        args.es_port,
+    )
+
+    es = ElasticQueryConn(
+        f"https://{args.es_host}:{args.es_port}",
+        args.es_cert_fp,
+        args.es_index_prefix,
+        basic_auth=(args.es_usr, args.es_pwd),
+    )
+
+    import_bundle = make_import_bundle_from_files(args.files)
+
+    checker = Checker(es, strict=args.strict)
+    checker.attach(CLICheckObserver())
+
+    run_all = args.all or not (
+        args.check_analyses or args.check_wet or args.check_bi
+    )
+
+    if args.check_analyses or run_all:
+        checker.check_analyses(import_bundle.analyses)
+
+    if args.check_wet or run_all:
+        checker.check_wet_processes(import_bundle.wet_processes)
+
+    if args.check_bi or run_all:
+        checker.check_bi_processes(import_bundle.bi_processes)
+
+    if checker.errors_detected:
+        sys.exit(1)
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
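Note that `CLICheckObserver` never imports or inherits `CheckerObserver`; `typing.Protocol` makes the match structural. A minimal sketch of why a type checker accepts it (the `register` helper is hypothetical):

    from genelastic.import_data.checker_observer import CheckerObserver
    from genelastic.import_data.cli.cli_check import CLICheckObserver


    def register(observer: CheckerObserver) -> None:
        """Hypothetical helper: accepts any structural match of the protocol."""


    obs: CheckerObserver = CLICheckObserver()  # OK: both notify_* signatures line up
    register(obs)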
genelastic/import_data/{cli_gen_data.py → cli/gen_data.py}
RENAMED
@@ -4,10 +4,9 @@ from pathlib import Path
 
 from biophony import DEFAULT_RATE, MutSimParams
 
-from genelastic.common import add_verbose_control_args
-
-from .
-from .random_bundle import (
+from genelastic.common.cli import add_verbose_control_args, add_version_arg
+from genelastic.import_data.logger import configure_logging
+from genelastic.import_data.random_bundle import (
     RandomBundle,
 )
 
@@ -26,6 +25,7 @@ def read_args() -> argparse.Namespace:
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
         allow_abbrev=False,
     )
+    add_version_arg(parser)
     add_verbose_control_args(parser)
     parser.add_argument(
         "output_dir",
genelastic/import_data/cli/import_data.py
ADDED
@@ -0,0 +1,346 @@
+# vi: se tw=80
+
+# Elasticsearch Python API:
+# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/overview.html
+# https://elasticsearch-py.readthedocs.io/en/latest/api.html
+
+import argparse
+import logging
+import sys
+from datetime import UTC, datetime
+from pathlib import Path
+
+from genelastic.common.cli import (
+    add_es_connection_args,
+    add_verbose_control_args,
+    add_version_arg,
+    log_item,
+    log_section,
+    log_subsection,
+    positive_int,
+)
+from genelastic.common.elastic import ElasticImportConn
+from genelastic.import_data.import_bundle_factory import (
+    make_import_bundle_from_files,
+)
+from genelastic.import_data.importers.importer_base import ImporterError
+from genelastic.import_data.importers.importer_factory import ImporterFactory
+from genelastic.import_data.logger import configure_logging
+from genelastic.import_data.models.analysis import Analysis
+from genelastic.import_data.models.data_file import DataFile
+from genelastic.import_data.models.processes import Processes
+
+logger = logging.getLogger("genelastic")
+logging.getLogger("elastic_transport").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
+logging.getLogger("urllib3").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
+
+
+def read_args() -> argparse.Namespace:
+    """Read arguments from command line."""
+    parser = argparse.ArgumentParser(
+        description="Genetics data importer.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    add_version_arg(parser)
+    add_verbose_control_args(parser)
+    add_es_connection_args(parser)
+    parser.add_argument(
+        "-D",
+        "--dry-run",
+        dest="dryrun",
+        action="count",
+        default=0,
+        help=(
+            "Dry-run level. -D for data files loading (VCF, coverage, etc) "
+            "without connecting or importing to database. "
+            "-DD for metadata YAML files loading only (no loading of data files)."
+        ),
+    )
+    parser.add_argument(
+        "--log-file", dest="log_file", help="Path to a log file."
+    )
+    parser.add_argument(
+        "--no-list",
+        dest="no_list",
+        action="store_true",
+        help="Do not print list of files to be imported.",
+    )
+    parser.add_argument(
+        "--no-confirm",
+        dest="no_confirm",
+        action="store_true",
+        help="Do not ask confirmation before importing.",
+    )
+    parser.add_argument(
+        "-t",
+        "--threads",
+        dest="thread_count",
+        type=positive_int,
+        default=4,
+        help="Number of threads to use for parallel data files import.",
+    )
+    parser.add_argument(
+        "--multi-match",
+        dest="multi_match",
+        action="store_true",
+        help=(
+            "Enable grouping of files from the same 'data_path' into multiple "
+            "analyses by extracting variable metadata fields directly from "
+            "filenames using the file prefix. If some metadata fields (e.g., "
+            "sample_name, wet_process, bi_process) are not defined in the YAML "
+            "bundle, the importer detects all analyses sharing the same "
+            "defined metadata, but differing by the undefined fields. This "
+            "allows importing and filtering several analyses at once from a "
+            "single directory, based on the metadata present in filenames. "
+            "When disabled (default), only files matching the fixed filename "
+            "pattern (where all metadata fields are defined in the YAML) are "
+            "grouped into a single analysis; other files are ignored."
+        ),
+    )
+    parser.add_argument(
+        "files",
+        type=Path,
+        nargs="+",
+        default=None,
+        help="Data files that describe what to import.",
+    )
+    return parser.parse_args()
+
+
+def import_analysis(
+    es_import_conn: ElasticImportConn,
+    analysis: Analysis,
+) -> None:
+    """Import analysis into a dedicated index."""
+    logger.info(
+        " -> Importing analysis '%s' metadata into index '%s'...",
+        analysis.id,
+        es_import_conn.analyses_index,
+    )
+
+    documents = [
+        {
+            "_index": es_import_conn.analyses_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": analysis.id,
+                "bundle_file": str(analysis.bundle_file),
+                "data_path": str(analysis.data_path),
+                "metadata": analysis.metadata,
+            },
+        }
+    ]
+
+    es_import_conn.bulk_import(documents)
+
+
+def import_data_file(
+    es_import_conn: ElasticImportConn,
+    data_file: DataFile,
+) -> None:
+    """Import data files into a dedicated index."""
+    logger.info(
+        " -> Importing metadata into index '%s'...",
+        es_import_conn.data_files_index,
+    )
+
+    documents = [
+        {
+            "_index": es_import_conn.data_files_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": data_file.analysis_id,
+                "path": str(data_file.path),
+                "bundle_file": str(data_file.bundle_file),
+                "metadata": data_file.metadata,
+                "metrics": data_file.metrics,
+            },
+        }
+    ]
+
+    es_import_conn.bulk_import(documents)
+
+
+def import_data_file_content(
+    es_import_conn: ElasticImportConn,
+    data_file: DataFile,
+    thread_count: int,
+    dry_run: int,
+) -> None:
+    """Import data file content into a dedicated index,
+    based on their extension and type.
+    """
+    # -DD: no file processing, no import.
+    if dry_run > 1:
+        logger.info("[Dryrun] Data file neither processed nor imported.")
+        return
+
+    try:
+        logger.info(
+            " -> Processing file content for import...",
+        )
+        importer = ImporterFactory.get_importer(
+            data_file, es_import_conn, thread_count
+        )
+
+        # -D: only process files, no import.
+        if dry_run == 1:
+            logger.info("[Dryrun] Data file processed but not imported.")
+            return
+
+        logger.info(
+            " -> Importing file content into index '%s'...",
+            importer.target_index,
+        )
+        importer.import_docs()
+    except ImporterError as e:
+        logger.error(e)
+
+
+def import_processes(
+    es_import_conn: ElasticImportConn,
+    index: str,
+    processes: Processes,
+) -> None:
+    """Import processes into a dedicated index, based on their type."""
+    documents = [
+        {
+            "_index": index,
+            "_source": {
+                "proc_id": process.id,
+                "type": process.type,
+                "metadata": process.data,
+            },
+        }
+        for process in processes.values()
+    ]
+
+    es_import_conn.bulk_import(documents)
+
+
+def main() -> None:
+    """Entry point of the import script."""
+    # Read command line arguments
+    args = read_args()
+
+    # Configure logging
+    configure_logging(args.verbose, log_file=args.log_file)
+    logger.debug("Arguments: %s", args)
+    logger.debug("LOGGERS: %s", logging.root.manager.loggerDict)
+
+    # Open connection to ES
+    addr = f"https://{args.es_host}:{args.es_port}"
+    logger.info("Connecting to Elasticsearch at %s...", addr)
+    es_import_conn = ElasticImportConn(
+        addr,
+        args.es_cert_fp,
+        args.es_index_prefix,
+        args.dryrun,
+        basic_auth=(args.es_usr, args.es_pwd),
+    )
+
+    log_section("LOAD DATA")
+    logger.info("")
+    import_bundle = make_import_bundle_from_files(
+        args.files, multi_match=args.multi_match, check=True
+    )
+    all_bundled_files = import_bundle.analyses.get_data_files()
+
+    if not all_bundled_files:
+        logger.warning("No matching data files found from import bundle(s) !")
+
+    log_section("IMPORT DATA")
+    # List files before importing.
+    if not args.no_list:
+        logger.info("")
+        logger.info(
+            "The following %s file(s) will be imported:", len(all_bundled_files)
+        )
+
+        for data_file in all_bundled_files:
+            logger.info("- '%s'", data_file.path)
+    else:
+        logger.debug(
+            "'--no-list' argument provided: "
+            "not listing files about to be imported."
+        )
+
+    # Ask confirmation for importing
+    if not args.no_confirm:
+        answer: str = "maybe"
+        while answer not in ["", "n", "y"]:
+            answer = input("Import (y/N)? ").lower()
+        if answer != "y":
+            logger.info("Import canceled.")
+            sys.exit(0)
+    else:
+        logger.debug(
+            "'--no-confirm' argument provided: "
+            "not asking for confirmation before importing files."
+        )
+
+    # Start import.
+    log_subsection("Importing wet processes...")
+    logger.info(
+        "-> Importing %s wet process(es) into index '%s': %s.",
+        len(import_bundle.wet_processes),
+        es_import_conn.wet_processes_index,
+        ", ".join(import_bundle.wet_processes.keys()),
+    )
+    import_processes(
+        es_import_conn,
+        es_import_conn.wet_processes_index,
+        import_bundle.wet_processes,
+    )
+    log_subsection("Importing bioinformatics processes...")
+    logger.info(
+        "-> Importing %s bioinformatics process(es) into index '%s': %s.",
+        len(import_bundle.bi_processes),
+        es_import_conn.bi_processes_index,
+        ", ".join(import_bundle.bi_processes.keys()),
+    )
+    import_processes(
+        es_import_conn,
+        es_import_conn.bi_processes_index,
+        import_bundle.bi_processes,
+    )
+
+    log_subsection("Importing analysis metadata...")
+    for i, analysis in enumerate(sorted(import_bundle.analyses)):
+        log_item(
+            "Analysis",
+            i + 1,
+            len(import_bundle.analyses),
+        )
+        import_analysis(es_import_conn, analysis)
+
+    log_subsection("Importing data files...")
+    counter = 1
+    for ext in sorted(import_bundle.analyses.extensions):
+        data_files = import_bundle.analyses.get_data_files(ext)
+        logger.info("[ %s data files ]", ext.upper())
+
+        for data_file in data_files:
+            logger.info(
+                " -> Processing data file #%s/%s: '%s'...",
+                counter,
+                len(import_bundle.analyses.get_data_files()),
+                data_file.path.name,
+            )
+            import_data_file(es_import_conn, data_file)
+            import_data_file_content(
+                es_import_conn, data_file, args.thread_count, args.dryrun
+            )
+            logger.info("")
+            counter += 1
+
+    logger.info("=> Done.")
+
+
+if __name__ == "__main__":
+    main()
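The `_index`/`_source` dictionaries built throughout this file match the action format consumed by `elasticsearch.helpers.bulk`, which `bulk_import` presumably wraps (its body does not appear in this diff). A minimal standalone sketch with placeholder connection details:

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    client = Elasticsearch(
        "https://localhost:9200",          # placeholder address
        basic_auth=("elastic", "secret"),  # placeholder credentials
    )

    actions = [
        {
            "_index": "genelastic-analyses",  # placeholder index name
            "_source": {"analysis_id": "A001", "metadata": {"sample_name": "S001"}},
        }
    ]

    # helpers.bulk() streams the actions and returns (succeeded_count, errors).
    succeeded, errors = bulk(client, actions)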