genelastic 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. genelastic/api/.env +4 -0
  2. genelastic/api/cli_start_api.py +18 -0
  3. genelastic/api/errors.py +52 -0
  4. genelastic/api/extends/example.py +0 -6
  5. genelastic/api/extends/example.yml +0 -0
  6. genelastic/api/routes.py +313 -181
  7. genelastic/api/server.py +34 -26
  8. genelastic/api/settings.py +5 -9
  9. genelastic/api/specification.yml +512 -0
  10. genelastic/common/__init__.py +0 -39
  11. genelastic/common/cli.py +100 -0
  12. genelastic/common/elastic.py +374 -46
  13. genelastic/common/exceptions.py +34 -2
  14. genelastic/common/server.py +59 -0
  15. genelastic/common/types.py +1 -14
  16. genelastic/import_data/__init__.py +0 -27
  17. genelastic/import_data/checker.py +99 -0
  18. genelastic/import_data/checker_observer.py +13 -0
  19. genelastic/import_data/cli/__init__.py +0 -0
  20. genelastic/import_data/cli/cli_check.py +136 -0
  21. genelastic/import_data/cli/gen_data.py +143 -0
  22. genelastic/import_data/cli/import_data.py +346 -0
  23. genelastic/import_data/cli/info.py +247 -0
  24. genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
  25. genelastic/import_data/cli/validate.py +146 -0
  26. genelastic/import_data/collect.py +185 -0
  27. genelastic/import_data/constants.py +136 -11
  28. genelastic/import_data/import_bundle.py +102 -59
  29. genelastic/import_data/import_bundle_factory.py +70 -149
  30. genelastic/import_data/importers/__init__.py +0 -0
  31. genelastic/import_data/importers/importer_base.py +131 -0
  32. genelastic/import_data/importers/importer_factory.py +85 -0
  33. genelastic/import_data/importers/importer_types.py +223 -0
  34. genelastic/import_data/logger.py +2 -1
  35. genelastic/import_data/models/__init__.py +0 -0
  36. genelastic/import_data/models/analyses.py +178 -0
  37. genelastic/import_data/models/analysis.py +144 -0
  38. genelastic/import_data/models/data_file.py +110 -0
  39. genelastic/import_data/models/process.py +45 -0
  40. genelastic/import_data/models/processes.py +84 -0
  41. genelastic/import_data/models/tags.py +170 -0
  42. genelastic/import_data/models/unique_list.py +109 -0
  43. genelastic/import_data/models/validate.py +26 -0
  44. genelastic/import_data/patterns.py +90 -0
  45. genelastic/import_data/random_bundle.py +79 -54
  46. genelastic/import_data/resolve.py +157 -0
  47. genelastic/ui/.env +1 -0
  48. genelastic/ui/cli_start_ui.py +20 -0
  49. genelastic/ui/routes.py +333 -0
  50. genelastic/ui/server.py +9 -82
  51. genelastic/ui/settings.py +2 -6
  52. genelastic/ui/static/cea-cnrgh.ico +0 -0
  53. genelastic/ui/static/cea.ico +0 -0
  54. genelastic/ui/static/layout.ico +0 -0
  55. genelastic/ui/static/novaseq6000.png +0 -0
  56. genelastic/ui/static/style.css +430 -0
  57. genelastic/ui/static/ui.js +458 -0
  58. genelastic/ui/templates/analyses.html +98 -0
  59. genelastic/ui/templates/analysis_detail.html +44 -0
  60. genelastic/ui/templates/bi_process_detail.html +129 -0
  61. genelastic/ui/templates/bi_processes.html +116 -0
  62. genelastic/ui/templates/explorer.html +356 -0
  63. genelastic/ui/templates/home.html +207 -0
  64. genelastic/ui/templates/layout.html +153 -0
  65. genelastic/ui/templates/version.html +21 -0
  66. genelastic/ui/templates/wet_process_detail.html +131 -0
  67. genelastic/ui/templates/wet_processes.html +116 -0
  68. genelastic-0.9.0.dist-info/METADATA +686 -0
  69. genelastic-0.9.0.dist-info/RECORD +76 -0
  70. genelastic-0.9.0.dist-info/WHEEL +4 -0
  71. genelastic-0.9.0.dist-info/entry_points.txt +10 -0
  72. genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
  73. genelastic/import_data/analyses.py +0 -69
  74. genelastic/import_data/analysis.py +0 -205
  75. genelastic/import_data/bi_process.py +0 -27
  76. genelastic/import_data/bi_processes.py +0 -49
  77. genelastic/import_data/cli_gen_data.py +0 -116
  78. genelastic/import_data/cli_import.py +0 -379
  79. genelastic/import_data/cli_info.py +0 -256
  80. genelastic/import_data/cli_validate.py +0 -54
  81. genelastic/import_data/data_file.py +0 -87
  82. genelastic/import_data/filename_pattern.py +0 -57
  83. genelastic/import_data/tags.py +0 -123
  84. genelastic/import_data/wet_process.py +0 -28
  85. genelastic/import_data/wet_processes.py +0 -53
  86. genelastic-0.7.0.dist-info/METADATA +0 -105
  87. genelastic-0.7.0.dist-info/RECORD +0 -40
  88. genelastic-0.7.0.dist-info/WHEEL +0 -5
  89. genelastic-0.7.0.dist-info/entry_points.txt +0 -6
  90. genelastic-0.7.0.dist-info/top_level.txt +0 -1
genelastic/common/cli.py CHANGED
@@ -1,6 +1,39 @@
 """Utility functions for CLI scripts."""
 
 import argparse
+import logging
+from importlib.metadata import version
+
+logger = logging.getLogger("genelastic")
+
+
+BASE_LOG_LEVEL = ["critical", "error", "warning", "info", "debug"]
+
+
+def positive_int(value: str) -> int:
+    """Argparse type: require a positive integer."""
+    try:
+        number = int(value)
+    except ValueError:
+        msg = f"expected a valid integer, got '{value}'."
+        raise argparse.ArgumentTypeError(msg) from None
+    if number <= 0:
+        msg = f"expected a positive integer, got {value}."
+        raise argparse.ArgumentTypeError(msg) from None
+    return number
+
+
+def add_version_arg(parser: argparse.ArgumentParser) -> None:
+    """Add a version argument to query the current Genelastic version.
+    Argument is added to the parser by using its reference.
+    """
+    top_level_package = __package__.split(".")[0]
+    parser.add_argument(
+        "-V",
+        "--version",
+        action="version",
+        version=f"%(prog)s {version(top_level_package)}",
+    )
 
 
 def add_verbose_control_args(parser: argparse.ArgumentParser) -> None:
@@ -61,3 +94,70 @@ def add_es_connection_args(parser: argparse.ArgumentParser) -> None:
         dest="es_index_prefix",
         help="Add the given prefix to each index created during import.",
     )
+
+
+def parse_server_launch_args(
+    parser_desc: str, default_port: int
+) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=parser_desc,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="127.0.0.1",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=default_port,
+    )
+
+    env_subparsers = parser.add_subparsers(dest="env", required=True)
+    dev_parser = env_subparsers.add_parser(
+        "dev",
+        help="Use development environment.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    dev_parser.add_argument(
+        "--log-level",
+        type=str,
+        default="info",
+        choices=[*BASE_LOG_LEVEL, "trace"],
+    )
+
+    prod_parser = env_subparsers.add_parser(
+        "prod",
+        help="Use production environment.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    prod_parser.add_argument(
+        "--log-level", type=str, default="info", choices=BASE_LOG_LEVEL
+    )
+    prod_parser.add_argument(
+        "-w", "--workers", type=int, default=1, help="Number of workers."
+    )
+
+    prod_parser.add_argument("--access-logfile", type=str, default=None)
+    prod_parser.add_argument("--log-file", type=str, default=None)
+
+    return parser.parse_args()
+
+
+def log_section(title: str) -> None:
+    msg = f">> {title} <<"
+    logger.info("*" * len(msg))
+    logger.info(msg)
+    logger.info("*" * len(msg))
+
+
+def log_subsection(title: str) -> None:
+    logger.info("")
+    logger.info("<%s>", title)
+
+
+def log_item(name: str, index: int, count: int) -> None:
+    msg = f"[ {name} #{index}/{count} ]"
+    logger.info(msg)
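
The helpers above compose into a conventional entry point. A minimal sketch, assuming the functions are importable from genelastic.common.cli as laid out in this wheel; the script itself and its --workers flag are hypothetical:

    # demo_cli.py -- hypothetical script exercising the new helpers.
    import argparse
    import logging

    from genelastic.common.cli import add_version_arg, log_section, positive_int

    def main() -> None:
        logging.basicConfig(level=logging.INFO)
        parser = argparse.ArgumentParser(description="Demo of the CLI helpers.")
        add_version_arg(parser)  # -V/--version prints the installed genelastic version.
        parser.add_argument(
            "--workers",
            type=positive_int,  # Rejects 0, negatives, and non-integers at parse time.
            default=1,
        )
        args = parser.parse_args()
        log_section("Import")  # Logs a ">> Import <<" banner framed by asterisks.
        logging.getLogger("genelastic").info("Running with %d worker(s).", args.workers)

    if __name__ == "__main__":
        main()
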
genelastic/common/elastic.py CHANGED
@@ -1,64 +1,226 @@
 import datetime
 import logging
 import time
-import typing
-from abc import ABC
+from collections.abc import Iterable
 from typing import Any
 
 import elastic_transport
 import elasticsearch.helpers
 from elasticsearch import Elasticsearch
+from elasticsearch.helpers import scan
+from tqdm import tqdm
 
 from .exceptions import DBIntegrityError
-from .types import Bucket, BulkItems
+from .types import Bucket
 
 logger = logging.getLogger("genelastic")
 
 
-class ElasticConn(ABC):
-    """Abstract class representing a connector for an Elasticsearch server."""
-
-    client: Elasticsearch
-
-    def __init__(self, url: str, fingerprint: str, **kwargs: Any) -> None:  # noqa: ANN401
-        """Initialize an elasticsearch client instance.
+class ElasticConn:
+    """Base class for Elasticsearch connectors.
+
+    This class provides common functionality for managing index names and
+    establishing a connection to an Elasticsearch server. It is not
+    intended to be instantiated directly. Instead, use one of its
+    subclasses:
+
+    - ``ElasticQueryConn`` for performing search and query operations,
+    - ``ElasticImportConn`` for importing and indexing data.
+
+    :param url: URL of the Elasticsearch host.
+    :param fingerprint: SHA256 certificate fingerprint for secure HTTPS
+        connection.
+    :param index_prefix: Prefix to prepend to all index names.
+    :param dry_run: Dry run mode; 0 = execute queries, >=1 = skip queries,
+        no Elasticsearch client is created.
+    :param kwargs: Additional keyword arguments passed to the Elasticsearch
+        client.
+    :raises SystemExit: If connection or authentication to Elasticsearch
+        fails.
+    """
+
+    def __init__(
+        self,
+        url: str,
+        fingerprint: str,
+        index_prefix: str,
+        dry_run: int = 0,
+        **kwargs: Any,  # noqa: ANN401
+    ) -> None:
+        self._index_prefix = index_prefix
+        self._dry_run = dry_run
+        self._init_indices()
+        self._client = None
+
+        if self._dry_run > 0:
+            msg = (
+                f"[Dryrun] {self.__class__.__name__} "
+                f"instantiated without an Elasticsearch client."
+            )
+            logger.info(msg)
+            return
 
-        :url: URL of the Elasticsearch host.
-        :fingerprint: sha256 certificate fingerprint for a secure HTTP connection.
-        :returns: The configured elasticsearch client instance.
-        :raises SystemExit: If the connection to the Elasticsearch server failed.
-        """
         try:
-            self.client = Elasticsearch(
+            self._client = Elasticsearch(
                 url,
                 ssl_assert_fingerprint=fingerprint,
                 # Verify cert only when the fingerprint is not None.
                 verify_certs=bool(fingerprint),
                 **kwargs,
             )
-            self.client.info()
+            self._client.info()
         except (
             elastic_transport.TransportError,
             elasticsearch.AuthenticationException,
         ) as e:
             raise SystemExit(e) from e
 
+    def _init_indices(self) -> None:
+        # Core indices.
+        self._analyses_index = f"{self._index_prefix}_analyses"
+        self._data_files_index = f"{self._index_prefix}_data_files"
+        # Content indices.
+        self._vcf_variants_index = f"{self._index_prefix}_vcf_variants"
+        self._coverage_index = f"{self._index_prefix}_coverage"
+        # Metrics indices.
+        self._qc_metrics_index = f"{self._index_prefix}_qc_metrics"
+        self._sv_metrics_index = f"{self._index_prefix}_sv_metrics"
+        self._smallvar_metrics_index = f"{self._index_prefix}_smallvar_metrics"
+        # Processes indices.
+        self._bi_processes_index = f"{self._index_prefix}_bi_processes"
+        self._wet_processes_index = f"{self._index_prefix}_wet_processes"
+
+    @property
+    def client(self) -> Elasticsearch | None:
+        """Elasticsearch client."""
+        return self._client
+
+    @property
+    def analyses_index(self) -> str:
+        """Index for analyses."""
+        return self._analyses_index
+
+    @property
+    def data_files_index(self) -> str:
+        """Index for data files."""
+        return self._data_files_index
+
+    @property
+    def vcf_variants_index(self) -> str:
+        """Index for VCF variants."""
+        return self._vcf_variants_index
+
+    @property
+    def coverage_index(self) -> str:
+        """Index for coverage data."""
+        return self._coverage_index
+
+    @property
+    def qc_metrics_index(self) -> str:
+        """Index for quality control metrics."""
+        return self._qc_metrics_index
+
+    @property
+    def sv_metrics_index(self) -> str:
+        """Index for structural variant (SV) metrics."""
+        return self._sv_metrics_index
+
+    @property
+    def smallvar_metrics_index(self) -> str:
+        """Index for small variant (SNV/indel) metrics."""
+        return self._smallvar_metrics_index
+
+    @property
+    def bi_processes_index(self) -> str:
+        """Index for bioinformatics processes."""
+        return self._bi_processes_index
+
+    @property
+    def wet_processes_index(self) -> str:
+        """Index for wet lab processes."""
+        return self._wet_processes_index
+
 
 class ElasticImportConn(ElasticConn):
     """Connector to import data into an Elasticsearch database."""
 
-    def import_items(
-        self, bulk_items: BulkItems, start_time: float, total_items: int
-    ) -> None:
-        """Import items to the Elasticsearch database."""
-        if len(bulk_items) > 0:
-            elasticsearch.helpers.bulk(self.client, bulk_items)
+    @staticmethod
+    def _handle_bulk_response(response: Iterable[tuple[bool, Any]]) -> None:
+        success_count = 0
+        failure_count = 0
+        total_items = 0
+
+        start_time = time.perf_counter()
+
+        for success, info in tqdm(
+            response,
+            desc="Import progress",
+            unit=" documents",
+            unit_scale=True,  # Scale large counts for easier readability (e.g., 1200 => 1.2k).
+            leave=False,  # Hide finished bars to keep console clean.
+        ):
+            total_items += 1
+            if success:
+                success_count += 1
+                logger.trace(info)  # type: ignore[attr-defined]
+            else:
+                failure_count += 1
+                logger.error("Failed to import item: %s", info)
+
         elapsed = time.perf_counter() - start_time
         logger.info(
-            "Imported %d items in %s (%f items/s).",
+            " - Imported %d document(s) (ok: %d, failed: %d) in %s (%f docs/s).",
             total_items,
+            success_count,
+            failure_count,
             datetime.timedelta(seconds=elapsed),
-            total_items / elapsed,
+            total_items / elapsed if elapsed > 0 else 0,
+        )
+
+    def bulk_import(self, documents: Iterable[dict[str, Any]]) -> None:
+        """Import documents in streaming mode, suitable for low to medium
+        document volumes.
+
+        :param documents: documents to index.
+        """
+        if not self.client:
+            logger.info("[Dryrun] bulk_import: no Elasticsearch client.")
+            return
+
+        self._handle_bulk_response(
+            elasticsearch.helpers.streaming_bulk(
+                self.client,
+                actions=documents,
+                raise_on_error=False,
+            )
+        )
+
+    def parallel_bulk_import(
+        self,
+        documents: Iterable[dict[str, Any]],
+        thread_count: int = 4,
+    ) -> None:
+        """Import documents in parallel mode, suitable for large document
+        volumes.
+
+        :param documents: documents to index.
+        :param thread_count: Number of threads to use for parallel bulk import.
+        """
+        if not self.client:
+            logger.info(
+                "[Dryrun] parallel_bulk_import: no Elasticsearch client."
+            )
+            return
+
+        logger.debug("parallel_bulk_import: using %s thread(s).", thread_count)
+
+        self._handle_bulk_response(
+            elasticsearch.helpers.parallel_bulk(
+                self.client,
+                actions=documents,
+                raise_on_error=False,
+                thread_count=thread_count,
+            )
         )
 
 
@@ -67,14 +229,22 @@ class ElasticQueryConn(ElasticConn):
 
     def get_indices(self) -> Any | str:  # noqa: ANN401
         """Return all indices."""
+        if not self.client:
+            logger.info("[Dryrun] get_indices: no Elasticsearch client.")
+            return []
+
         return self.client.cat.indices(format="json").body
 
     def get_document_by_id(self, index: str, document_id: str) -> Any | str:  # noqa: ANN401
         """Return a document by its ID."""
+        if not self.client:
+            logger.info("[Dryrun] get_document_by_id: no Elasticsearch client.")
+            return {}
+
         return self.client.get(index=index, id=document_id).body
 
     def run_composite_aggregation(
-        self, index: str, query: dict[str, typing.Any]
+        self, index: str, query: dict[str, Any]
     ) -> list[Bucket]:
         """Executes a composite aggregation on an Elasticsearch index and
         returns all paginated results.
@@ -83,20 +253,18 @@ class ElasticQueryConn(ElasticConn):
         :param query: Aggregation query to run.
         :return: List of aggregation results.
         """
+        if not self.client:
+            logger.info(
+                "[Dryrun] run_composite_aggregation: "
+                "no Elasticsearch client."
+            )
+            return []
+
         # Extract the aggregation name from the query dict.
         agg_name = next(iter(query["aggs"]))
         all_buckets: list[Bucket] = []
 
-        try:
-            logger.debug(
-                "Running composite aggregation query %s on index '%s'.",
-                query,
-                index,
-            )
-            response = self.client.search(index=index, body=query)
-        except elasticsearch.NotFoundError as e:
-            msg = f"Error: {e.message} for index '{index}'."
-            raise SystemExit(msg) from e
+        response = self.client.search(index=index, body=query)
 
         while True:
             # Extract buckets from the response.
@@ -109,15 +277,10 @@ class ElasticQueryConn(ElasticConn):
             if "after_key" in response["aggregations"][agg_name]:
                 after_key = response["aggregations"][agg_name]["after_key"]
                 query["aggs"][agg_name]["composite"]["after"] = after_key
-                try:
-                    logger.debug(
-                        "Running query %s on index '%s'.", query, index
-                    )
-                    # Fetch the next page of results.
-                    response = self.client.search(index=index, body=query)
-                except elasticsearch.NotFoundError as e:
-                    msg = f"Error: {e.message} for index '{index}'."
-                    raise SystemExit(msg) from e
+
+                # Fetch the next page of results.
+                logger.debug("Running query %s on index '%s'.", query, index)
+                response = self.client.search(index=index, body=query)
             else:
                 break
 
@@ -125,6 +288,10 @@ class ElasticQueryConn(ElasticConn):
 
     def get_field_values(self, index: str, field_name: str) -> set[str]:
         """Return a set of values for a given field."""
+        if not self.client:
+            logger.info("[Dryrun] get_field_values: no Elasticsearch client.")
+            return set()
+
         values = set()
 
         query = {
@@ -152,8 +319,14 @@ class ElasticQueryConn(ElasticConn):
 
     def search_by_field_value(
         self, index: str, field: str, value: str
-    ) -> dict[str, typing.Any] | None:
+    ) -> dict[str, Any] | None:
        """Search a document by a value for a certain field."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] search_by_field_value: no Elasticsearch client."
+            )
+            return {}
+
         logger.info(
             "Searching for field '%s' with value '%s' inside index '%s'.",
             field,
@@ -183,6 +356,10 @@ class ElasticQueryConn(ElasticConn):
         :raises genelastic.common.DBIntegrityError:
             Some values of the given field are duplicated in the index.
         """
+        if not self.client:
+            logger.info("[Dryrun] ensure_unique: no Elasticsearch client.")
+            return
+
         logger.info(
             "Ensuring that the field '%s' in the index '%s' only contains unique values...",
             field,
@@ -212,3 +389,154 @@ class ElasticQueryConn(ElasticConn):
         logger.info(
             "All values of field '%s' in index '%s' are unique.", field, index
         )
+
+    def get_all_documents_kv(self, index: str) -> list[dict[str, Any]]:
+        """Return all key-value pairs from all documents in an index."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] get_all_documents_kv: no Elasticsearch client."
+            )
+            return []
+
+        def flatten(
+            d: dict[str, Any], parent_key: str = "", sep: str = "."
+        ) -> dict[str, Any]:
+            items = {}
+            for k, v in d.items():
+                new_key = f"{parent_key}{sep}{k}" if parent_key else k
+                if isinstance(v, dict):
+                    items.update(flatten(v, new_key, sep=sep))
+                else:
+                    items[new_key] = v
+            return items
+
+        results = []
+        for doc in scan(
+            self.client, index=index, query={"query": {"match_all": {}}}
+        ):
+            source = doc.get("_source", {})
+            flattened = flatten(source)
+            results.append(flattened)
+
+        return results
+
+    def get_all_documents_kv_count(
+        self, index: str, field: str, size: int = 10000
+    ) -> dict[str, int]:
+        """Return a dictionary with the count of each unique value for an index field."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] get_all_documents_kv_count: no Elasticsearch client."
+            )
+            return {}
+
+        query = {
+            "size": 0,
+            "aggs": {
+                "value_counts": {
+                    "terms": {
+                        "field": f"{field}.keyword",
+                        "size": size,
+                    }
+                }
+            },
+        }
+
+        response = self.client.search(index=index, body=query)
+        buckets = response["aggregations"]["value_counts"]["buckets"]
+        return {bucket["key"]: bucket["doc_count"] for bucket in buckets}
+
+    def get_process(self, index: str, proc_id: str) -> dict[str, Any] | None:
+        """Get details about a specific process."""
+        if not self.client:
+            logger.info("[Dryrun] get_process: no Elasticsearch client.")
+            return {}
+
+        query = {
+            "query": {"term": {"proc_id.keyword": {"value": proc_id}}},
+            "size": 1,
+        }
+
+        response = self.client.search(index=index, body=query)
+
+        result = response["hits"]["hits"]
+        return result[0]["_source"] if result else None
+
+    def list_analyses_by_process(self, term: str, proc_id: str) -> list[str]:
+        """Route to list analyses that contain the specified process."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] list_analyses_by_process: no Elasticsearch client."
+            )
+            return []
+
+        search_query = {
+            "query": {
+                "term": {
+                    f"metadata.{term}.keyword": proc_id,
+                }
+            }
+        }
+        response = self.client.search(
+            index=self.analyses_index, body=search_query
+        )
+        return [
+            hit["_source"]["analysis_id"] for hit in response["hits"]["hits"]
+        ]
+
+    def list_analyses_by_process_esql(
+        self,
+        term: str,
+        proc_id: str,
+    ) -> list[dict[str, str]]:
+        """ES|QL route to list analyses that contain the specified process."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] list_analyses_by_process_esql: no Elasticsearch client."
+            )
+            return []
+
+        query = (
+            f"FROM {self.analyses_index} | "
+            f"WHERE metadata.{term} == ? | "
+            f"KEEP analysis_id"
+        )
+
+        response = self.client.esql.query(
+            body={"query": query, "params": [proc_id]},
+        )
+
+        columns_name = [column["name"] for column in response["columns"]]
+        return [
+            dict(zip(columns_name, value, strict=False))
+            for value in response["values"]
+        ]
+
+    def list_analyses_by_process_sql(
+        self,
+        term: str,
+        proc_id: str,
+    ) -> list[dict[str, str]]:
+        """SQL route to list analyses that contain the specified process."""
+        if not self.client:
+            logger.info(
+                "[Dryrun] list_analyses_by_process_sql: no Elasticsearch client."
+            )
+            return []
+
+        # ruff: noqa: S608
+        query = (
+            f"SELECT analysis_id "
+            f"FROM {self.analyses_index} "
+            f"WHERE metadata.{term} = ?"
+        )
+
+        response = self.client.sql.query(
+            body={"query": query, "params": [proc_id]}
+        )
+
+        columns_name = [column["name"] for column in response["columns"]]
+        return [
+            dict(zip(columns_name, row, strict=False))
+            for row in response["rows"]
+        ]
genelastic/common/exceptions.py CHANGED
@@ -1,4 +1,36 @@
 class DBIntegrityError(Exception):
-    """Represents an integrity error,
-    raised when the database content does not match the expected data schema.
+    """Exception raised when the database content does not match the expected
+    data schema.
     """
+
+
+class DataFileCollectorError(Exception):
+    """Exception raised when an error occurs while collecting analysis data
+    files.
+    """
+
+
+class InvalidFilePrefixError(Exception):
+    """Exception raised when a file prefix is invalid."""
+
+
+class FilenamePatternResolveError(Exception):
+    """Exception raised when a filename pattern could not be resolved."""
+
+
+class UniqueListDuplicateError(Exception):
+    """Exception raised when trying to add an item that already exists in the
+    list.
+    """
+
+
+class TagsDefinitionError(Exception):
+    """Exception raised when the tags definition is invalid."""
+
+
+class YAMLFileReadError(Exception):
+    """Exception raised when a YAML file cannot be opened or parsed."""
+
+
+class ValidationError(Exception):
+    """Exception raised when a YAML document fails schema validation."""