genelastic 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +18 -0
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -0
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +34 -26
- genelastic/api/settings.py +5 -9
- genelastic/api/specification.yml +512 -0
- genelastic/common/__init__.py +0 -39
- genelastic/common/cli.py +100 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +59 -0
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/cli/gen_data.py +143 -0
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +79 -54
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +20 -0
- genelastic/ui/routes.py +333 -0
- genelastic/ui/server.py +9 -82
- genelastic/ui/settings.py +2 -6
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +98 -0
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +116 -0
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +207 -0
- genelastic/ui/templates/layout.html +153 -0
- genelastic/ui/templates/version.html +21 -0
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +116 -0
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_gen_data.py +0 -116
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.7.0.dist-info/METADATA +0 -105
- genelastic-0.7.0.dist-info/RECORD +0 -40
- genelastic-0.7.0.dist-info/WHEEL +0 -5
- genelastic-0.7.0.dist-info/entry_points.txt +0 -6
- genelastic-0.7.0.dist-info/top_level.txt +0 -1
genelastic/common/cli.py
CHANGED
```diff
@@ -1,6 +1,39 @@
 """Utility functions for CLI scripts."""
 
 import argparse
+import logging
+from importlib.metadata import version
+
+logger = logging.getLogger("genelastic")
+
+
+BASE_LOG_LEVEL = ["critical", "error", "warning", "info", "debug"]
+
+
+def positive_int(value: str) -> int:
+    """Argparse type: require a positive integer."""
+    try:
+        number = int(value)
+    except ValueError:
+        msg = f"expected a valid integer, got '{value}'."
+        raise argparse.ArgumentTypeError(msg) from None
+    if number <= 0:
+        msg = f"expected a positive integer, got {value}."
+        raise argparse.ArgumentTypeError(msg) from None
+    return number
+
+
+def add_version_arg(parser: argparse.ArgumentParser) -> None:
+    """Add a version argument to query the current Genelastic version.
+    Argument is added to the parser by using its reference.
+    """
+    top_level_package = __package__.split(".")[0]
+    parser.add_argument(
+        "-V",
+        "--version",
+        action="version",
+        version=f"%(prog)s {version(top_level_package)}",
+    )
 
 
 def add_verbose_control_args(parser: argparse.ArgumentParser) -> None:
@@ -61,3 +94,70 @@ def add_es_connection_args(parser: argparse.ArgumentParser) -> None:
         dest="es_index_prefix",
         help="Add the given prefix to each index created during import.",
     )
+
+
+def parse_server_launch_args(
+    parser_desc: str, default_port: int
+) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=parser_desc,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="127.0.0.1",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=default_port,
+    )
+
+    env_subparsers = parser.add_subparsers(dest="env", required=True)
+    dev_parser = env_subparsers.add_parser(
+        "dev",
+        help="Use development environment.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    dev_parser.add_argument(
+        "--log-level",
+        type=str,
+        default="info",
+        choices=[*BASE_LOG_LEVEL, "trace"],
+    )
+
+    prod_parser = env_subparsers.add_parser(
+        "prod",
+        help="Use production environment.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    prod_parser.add_argument(
+        "--log-level", type=str, default="info", choices=BASE_LOG_LEVEL
+    )
+    prod_parser.add_argument(
+        "-w", "--workers", type=int, default=1, help="Number of workers."
+    )
+
+    prod_parser.add_argument("--access-logfile", type=str, default=None)
+    prod_parser.add_argument("--log-file", type=str, default=None)
+
+    return parser.parse_args()
+
+
+def log_section(title: str) -> None:
+    msg = f">> {title} <<"
+    logger.info("*" * len(msg))
+    logger.info(msg)
+    logger.info("*" * len(msg))
+
+
+def log_subsection(title: str) -> None:
+    logger.info("")
+    logger.info("<%s>", title)
+
+
+def log_item(name: str, index: int, count: int) -> None:
+    msg = f"[ {name} #{index}/{count} ]"
+    logger.info(msg)
```
genelastic/common/elastic.py
CHANGED
```diff
@@ -1,64 +1,226 @@
 import datetime
 import logging
 import time
-import …
-from abc import ABC
+from collections.abc import Iterable
 from typing import Any
 
 import elastic_transport
 import elasticsearch.helpers
 from elasticsearch import Elasticsearch
+from elasticsearch.helpers import scan
+from tqdm import tqdm
 
 from .exceptions import DBIntegrityError
-from .types import Bucket
+from .types import Bucket
 
 logger = logging.getLogger("genelastic")
 
 
-class ElasticConn…
-    """…
-
-
-
-
-
+class ElasticConn:
+    """Base class for Elasticsearch connectors.
+
+    This class provides common functionality for managing index names and
+    establishing a connection to an Elasticsearch server. It is not
+    intended to be instantiated directly. Instead, use one of its
+    subclasses:
+
+    - ``ElasticQueryConn`` for performing search and query operations,
+    - ``ElasticImportConn`` for importing and indexing data.
+
+    :param url: URL of the Elasticsearch host.
+    :param fingerprint: SHA256 certificate fingerprint for secure HTTPS
+        connection.
+    :param index_prefix: Prefix to prepend to all index names.
+    :param dry_run: Dry run mode; 0 = execute queries, >=1 = skip queries,
+        no Elasticsearch client is created.
+    :param kwargs: Additional keyword arguments passed to the Elasticsearch
+        client.
+    :raises SystemExit: If connection or authentication to Elasticsearch
+        fails.
+    """
+
+    def __init__(
+        self,
+        url: str,
+        fingerprint: str,
+        index_prefix: str,
+        dry_run: int = 0,
+        **kwargs: Any,  # noqa: ANN401
+    ) -> None:
+        self._index_prefix = index_prefix
+        self._dry_run = dry_run
+        self._init_indices()
+        self._client = None
+
+        if self._dry_run > 0:
+            msg = (
+                f"[Dryrun] {self.__class__.__name__} "
+                f"instantiated without an Elasticsearch client."
+            )
+            logger.info(msg)
+            return
 
-    :url: URL of the Elasticsearch host.
-    :fingerprint: sha256 certificate fingerprint for a secure HTTP connection.
-    :returns: The configured elasticsearch client instance.
-    :raises SystemExit: If the connection to the Elasticsearch server failed.
-    """
         try:
-            self.…
+            self._client = Elasticsearch(
                 url,
                 ssl_assert_fingerprint=fingerprint,
                 # Verify cert only when the fingerprint is not None.
                 verify_certs=bool(fingerprint),
                 **kwargs,
             )
-            self.…
+            self._client.info()
         except (
             elastic_transport.TransportError,
             elasticsearch.AuthenticationException,
         ) as e:
             raise SystemExit(e) from e
 
+    def _init_indices(self) -> None:
+        # Core indices.
+        self._analyses_index = f"{self._index_prefix}_analyses"
+        self._data_files_index = f"{self._index_prefix}_data_files"
+        # Content indices.
+        self._vcf_variants_index = f"{self._index_prefix}_vcf_variants"
+        self._coverage_index = f"{self._index_prefix}_coverage"
+        # Metrics indices.
+        self._qc_metrics_index = f"{self._index_prefix}_qc_metrics"
+        self._sv_metrics_index = f"{self._index_prefix}_sv_metrics"
+        self._smallvar_metrics_index = f"{self._index_prefix}_smallvar_metrics"
+        # Processes indices.
+        self._bi_processes_index = f"{self._index_prefix}_bi_processes"
+        self._wet_processes_index = f"{self._index_prefix}_wet_processes"
+
+    @property
+    def client(self) -> Elasticsearch | None:
+        """Elasticsearch client."""
+        return self._client
+
+    @property
+    def analyses_index(self) -> str:
+        """Index for analyses."""
+        return self._analyses_index
+
+    @property
+    def data_files_index(self) -> str:
+        """Index for data files."""
+        return self._data_files_index
+
+    @property
+    def vcf_variants_index(self) -> str:
+        """Index for VCF variants."""
+        return self._vcf_variants_index
+
+    @property
+    def coverage_index(self) -> str:
+        """Index for coverage data."""
+        return self._coverage_index
+
+    @property
+    def qc_metrics_index(self) -> str:
+        """Index for quality control metrics."""
+        return self._qc_metrics_index
+
+    @property
+    def sv_metrics_index(self) -> str:
+        """Index for structural variant (SV) metrics."""
+        return self._sv_metrics_index
+
+    @property
+    def smallvar_metrics_index(self) -> str:
+        """Index for small variant (SNV/indel) metrics."""
+        return self._smallvar_metrics_index
+
+    @property
+    def bi_processes_index(self) -> str:
+        """Index for bioinformatics processes."""
+        return self._bi_processes_index
+
+    @property
+    def wet_processes_index(self) -> str:
+        """Index for wet lab processes."""
+        return self._wet_processes_index
+
 
 class ElasticImportConn(ElasticConn):
     """Connector to import data into an Elasticsearch database."""
 
-
-
-
-
-
-
+    @staticmethod
+    def _handle_bulk_response(response: Iterable[tuple[bool, Any]]) -> None:
+        success_count = 0
+        failure_count = 0
+        total_items = 0
+
+        start_time = time.perf_counter()
+
+        for success, info in tqdm(
+            response,
+            desc="Import progress",
+            unit=" documents",
+            unit_scale=True,  # Scale large counts for easier readability (e.g., 1200 => 1.2k).
+            leave=False,  # Hide finished bars to keep console clean.
+        ):
+            total_items += 1
+            if success:
+                success_count += 1
+                logger.trace(info)  # type: ignore[attr-defined]
+            else:
+                failure_count += 1
+                logger.error("Failed to import item: %s", info)
+
         elapsed = time.perf_counter() - start_time
         logger.info(
-            "Imported %d…
+            " - Imported %d document(s) (ok: %d, failed: %d) in %s (%f docs/s).",
             total_items,
+            success_count,
+            failure_count,
             datetime.timedelta(seconds=elapsed),
-            total_items / elapsed,
+            total_items / elapsed if elapsed > 0 else 0,
+        )
+
+    def bulk_import(self, documents: Iterable[dict[str, Any]]) -> None:
+        """Import documents in streaming mode, suitable for low to medium
+        document volumes.
+
+        :param documents: documents to index.
+        """
+        if not self.client:
+            logger.info("[Dryrun] bulk_import: no Elasticsearch client.")
+            return
+
+        self._handle_bulk_response(
+            elasticsearch.helpers.streaming_bulk(
+                self.client,
+                actions=documents,
+                raise_on_error=False,
+            )
+        )
+
+    def parallel_bulk_import(
+        self,
+        documents: Iterable[dict[str, Any]],
+        thread_count: int = 4,
+    ) -> None:
+        """Import documents in parallel mode, suitable for large document
+        volumes.
+
+        :param documents: documents to index.
+        :param thread_count: Number of threads to use for parallel bulk import.
+        """
+        if not self.client:
+            logger.info(
+                "[Dryrun] parallel_bulk_import: no Elasticsearch client."
+            )
+            return
+
+        logger.debug("parallel_bulk_import: using %s thread(s).", thread_count)
+
+        self._handle_bulk_response(
+            elasticsearch.helpers.parallel_bulk(
+                self.client,
+                actions=documents,
+                raise_on_error=False,
+                thread_count=thread_count,
+            )
         )
 
 
@@ -67,14 +229,22 @@ class ElasticQueryConn(ElasticConn):
 
     def get_indices(self) -> Any | str:  # noqa: ANN401
         """Return all indices."""
+        if not self.client:
+            logger.info("[Dryrun] get_indices: no Elasticsearch client.")
+            return []
+
         return self.client.cat.indices(format="json").body
 
     def get_document_by_id(self, index: str, document_id: str) -> Any | str:  # noqa: ANN401
         """Return a document by its ID."""
+        if not self.client:
+            logger.info("[Dryrun] get_document_by_id: no Elasticsearch client.")
+            return {}
+
         return self.client.get(index=index, id=document_id).body
 
     def run_composite_aggregation(
-        self, index: str, query: dict[str,…
+        self, index: str, query: dict[str, Any]
     ) -> list[Bucket]:
         """Executes a composite aggregation on an Elasticsearch index and
         returns all paginated results.
@@ -83,20 +253,18 @@ class ElasticQueryConn(ElasticConn):
         :param query: Aggregation query to run.
         :return: List of aggregation results.
         """
+        if not self.client:
+            logger.info(
+                "[Dryrun] run_composite_aggregation: "
+                "no Elasticsearch client."
+            )
+            return []
+
         # Extract the aggregation name from the query dict.
         agg_name = next(iter(query["aggs"]))
         all_buckets: list[Bucket] = []
 
-
-        logger.debug(
-            "Running composite aggregation query %s on index '%s'.",
-            query,
-            index,
-        )
-        response = self.client.search(index=index, body=query)
-        except elasticsearch.NotFoundError as e:
-            msg = f"Error: {e.message} for index '{index}'."
-            raise SystemExit(msg) from e
+        response = self.client.search(index=index, body=query)
 
         while True:
             # Extract buckets from the response.
@@ -109,15 +277,10 @@ class ElasticQueryConn(ElasticConn):
             if "after_key" in response["aggregations"][agg_name]:
                 after_key = response["aggregations"][agg_name]["after_key"]
                 query["aggs"][agg_name]["composite"]["after"] = after_key
-
-
-
-
-                # Fetch the next page of results.
-                response = self.client.search(index=index, body=query)
-            except elasticsearch.NotFoundError as e:
-                msg = f"Error: {e.message} for index '{index}'."
-                raise SystemExit(msg) from e
+
+                # Fetch the next page of results.
+                logger.debug("Running query %s on index '%s'.", query, index)
+                response = self.client.search(index=index, body=query)
             else:
                 break
 
@@ -125,6 +288,10 @@ class ElasticQueryConn(ElasticConn):
 
     def get_field_values(self, index: str, field_name: str) -> set[str]:
         """Return a set of values for a given field."""
+        if not self.client:
+            logger.info("[Dryrun] get_field_values: no Elasticsearch client.")
+            return set()
+
         values = set()
 
         query = {
@@ -152,8 +319,14 @@ class ElasticQueryConn(ElasticConn):
 
     def search_by_field_value(
         self, index: str, field: str, value: str
-    ) -> dict[str,…
+    ) -> dict[str, Any] | None:
         """Search a document by a value for a certain field."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] search_by_field_value: no Elasticsearch client."
+            )
+            return {}
+
         logger.info(
             "Searching for field '%s' with value '%s' inside index '%s'.",
             field,
@@ -183,6 +356,10 @@ class ElasticQueryConn(ElasticConn):
         :raises genelastic.common.DBIntegrityError:
             Some values of the given field are duplicated in the index.
         """
+        if not self.client:
+            logger.info("[Dryrun] ensure_unique: no Elasticsearch client.")
+            return
+
         logger.info(
             "Ensuring that the field '%s' in the index '%s' only contains unique values...",
             field,
@@ -212,3 +389,154 @@ class ElasticQueryConn(ElasticConn):
         logger.info(
             "All values of field '%s' in index '%s' are unique.", field, index
         )
+
+    def get_all_documents_kv(self, index: str) -> list[dict[str, Any]]:
+        """Return all key-value pairs from all documents in an index."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] get_all_documents_kv: no Elasticsearch client."
+            )
+            return []
+
+        def flatten(
+            d: dict[str, Any], parent_key: str = "", sep: str = "."
+        ) -> dict[str, Any]:
+            items = {}
+            for k, v in d.items():
+                new_key = f"{parent_key}{sep}{k}" if parent_key else k
+                if isinstance(v, dict):
+                    items.update(flatten(v, new_key, sep=sep))
+                else:
+                    items[new_key] = v
+            return items
+
+        results = []
+        for doc in scan(
+            self.client, index=index, query={"query": {"match_all": {}}}
+        ):
+            source = doc.get("_source", {})
+            flattened = flatten(source)
+            results.append(flattened)
+
+        return results
+
+    def get_all_documents_kv_count(
+        self, index: str, field: str, size: int = 10000
+    ) -> dict[str, int]:
+        """Return a dictionary with the count of each unique value for an index field."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] get_all_documents_kv_count: no Elasticsearch client."
+            )
+            return {}
+
+        query = {
+            "size": 0,
+            "aggs": {
+                "value_counts": {
+                    "terms": {
+                        "field": f"{field}.keyword",
+                        "size": size,
+                    }
+                }
+            },
+        }
+
+        response = self.client.search(index=index, body=query)
+        buckets = response["aggregations"]["value_counts"]["buckets"]
+        return {bucket["key"]: bucket["doc_count"] for bucket in buckets}
+
+    def get_process(self, index: str, proc_id: str) -> dict[str, Any] | None:
+        """Get details about a specific process."""
+        if not self.client:
+            logger.info("[Dryrun] get_process: no Elasticsearch client.")
+            return {}
+
+        query = {
+            "query": {"term": {"proc_id.keyword": {"value": proc_id}}},
+            "size": 1,
+        }
+
+        response = self.client.search(index=index, body=query)
+
+        result = response["hits"]["hits"]
+        return result[0]["_source"] if result else None
+
+    def list_analyses_by_process(self, term: str, proc_id: str) -> list[str]:
+        """Route to list analyses that contain the specified process."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] list_analyses_by_process: no Elasticsearch client."
+            )
+            return []
+
+        search_query = {
+            "query": {
+                "term": {
+                    f"metadata.{term}.keyword": proc_id,
+                }
+            }
+        }
+        response = self.client.search(
+            index=self.analyses_index, body=search_query
+        )
+        return [
+            hit["_source"]["analysis_id"] for hit in response["hits"]["hits"]
+        ]
+
+    def list_analyses_by_process_esql(
+        self,
+        term: str,
+        proc_id: str,
+    ) -> list[dict[str, str]]:
+        """ES|QL route to list analyses that contain the specified process."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] list_analyses_by_process_esql: no Elasticsearch client."
+            )
+            return []
+
+        query = (
+            f"FROM {self.analyses_index} | "
+            f"WHERE metadata.{term} == ? | "
+            f"KEEP analysis_id"
+        )
+
+        response = self.client.esql.query(
+            body={"query": query, "params": [proc_id]},
+        )
+
+        columns_name = [column["name"] for column in response["columns"]]
+        return [
+            dict(zip(columns_name, value, strict=False))
+            for value in response["values"]
+        ]
+
+    def list_analyses_by_process_sql(
+        self,
+        term: str,
+        proc_id: str,
+    ) -> list[dict[str, str]]:
+        """SQL route to list analyses that contain the specified process."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] list_analyses_by_process_sql: no Elasticsearch client."
+            )
+            return []
+
+        # ruff: noqa: S608
+        query = (
+            f"SELECT analysis_id "
+            f"FROM {self.analyses_index} "
+            f"WHERE metadata.{term} = ?"
+        )
+
+        response = self.client.sql.query(
+            body={"query": query, "params": [proc_id]}
+        )
+
+        columns_name = [column["name"] for column in response["columns"]]
+        return [
+            dict(zip(columns_name, row, strict=False))
+            for row in response["rows"]
+        ]
```
genelastic/common/exceptions.py
CHANGED
```diff
@@ -1,4 +1,36 @@
 class DBIntegrityError(Exception):
-    """…
-
+    """Exception raised when the database content does not match the expected
+    data schema.
     """
+
+
+class DataFileCollectorError(Exception):
+    """Exception raised when an error occurs while collecting analysis data
+    files.
+    """
+
+
+class InvalidFilePrefixError(Exception):
+    """Exception raised when a file prefix is invalid."""
+
+
+class FilenamePatternResolveError(Exception):
+    """Exception raised when a filename pattern could not be resolved."""
+
+
+class UniqueListDuplicateError(Exception):
+    """Exception raised when trying to add an item that already exists in the
+    list.
+    """
+
+
+class TagsDefinitionError(Exception):
+    """Exception raised when the tags definition is invalid."""
+
+
+class YAMLFileReadError(Exception):
+    """Exception raised when a YAML file cannot be opened or parsed."""
+
+
+class ValidationError(Exception):
+    """Exception raised when a YAML document fails schema validation."""
```