genelastic 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. genelastic/__init__.py +0 -13
  2. genelastic/api/__init__.py +0 -0
  3. genelastic/api/extends/__init__.py +0 -0
  4. genelastic/api/extends/example.py +6 -0
  5. genelastic/api/routes.py +221 -0
  6. genelastic/api/server.py +80 -0
  7. genelastic/api/settings.py +14 -0
  8. genelastic/common/__init__.py +39 -0
  9. genelastic/common/cli.py +63 -0
  10. genelastic/common/elastic.py +214 -0
  11. genelastic/common/exceptions.py +4 -0
  12. genelastic/common/types.py +25 -0
  13. genelastic/import_data/__init__.py +27 -0
  14. genelastic/{analyses.py → import_data/analyses.py} +19 -20
  15. genelastic/{analysis.py → import_data/analysis.py} +71 -66
  16. genelastic/{bi_process.py → import_data/bi_process.py} +8 -6
  17. genelastic/{bi_processes.py → import_data/bi_processes.py} +10 -9
  18. genelastic/import_data/cli_gen_data.py +116 -0
  19. genelastic/import_data/cli_import.py +379 -0
  20. genelastic/import_data/cli_info.py +256 -0
  21. genelastic/import_data/cli_integrity.py +384 -0
  22. genelastic/import_data/cli_validate.py +54 -0
  23. genelastic/import_data/constants.py +24 -0
  24. genelastic/{data_file.py → import_data/data_file.py} +26 -21
  25. genelastic/import_data/filename_pattern.py +57 -0
  26. genelastic/{import_bundle.py → import_data/import_bundle.py} +58 -48
  27. genelastic/import_data/import_bundle_factory.py +298 -0
  28. genelastic/{logger.py → import_data/logger.py} +22 -18
  29. genelastic/import_data/random_bundle.py +402 -0
  30. genelastic/{tags.py → import_data/tags.py} +48 -27
  31. genelastic/{wet_process.py → import_data/wet_process.py} +8 -4
  32. genelastic/{wet_processes.py → import_data/wet_processes.py} +15 -9
  33. genelastic/ui/__init__.py +0 -0
  34. genelastic/ui/server.py +87 -0
  35. genelastic/ui/settings.py +11 -0
  36. genelastic-0.7.0.dist-info/METADATA +105 -0
  37. genelastic-0.7.0.dist-info/RECORD +40 -0
  38. {genelastic-0.6.0.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
  39. genelastic-0.7.0.dist-info/entry_points.txt +6 -0
  40. genelastic/common.py +0 -151
  41. genelastic/constants.py +0 -45
  42. genelastic/filename_pattern.py +0 -62
  43. genelastic/gen_data.py +0 -193
  44. genelastic/import_bundle_factory.py +0 -288
  45. genelastic/import_data.py +0 -294
  46. genelastic/info.py +0 -248
  47. genelastic/integrity.py +0 -324
  48. genelastic/validate_data.py +0 -41
  49. genelastic-0.6.0.dist-info/METADATA +0 -36
  50. genelastic-0.6.0.dist-info/RECORD +0 -25
  51. genelastic-0.6.0.dist-info/entry_points.txt +0 -6
  52. {genelastic-0.6.0.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0
genelastic/__init__.py CHANGED
@@ -1,13 +0,0 @@
- """Genelastic package for importing Genomic data into Elasticsearch.
- """
-
- from .import_bundle import ImportBundle
- from .common import BundleDict
- from .constants import BUNDLE_CURRENT_VERSION
- from .import_bundle_factory import make_import_bundle_from_files, \
-     load_import_bundle_file
- from .analysis import Analysis
- from .analyses import Analyses
-
- __all__ = ['make_import_bundle_from_files', 'BUNDLE_CURRENT_VERSION',
-            'load_import_bundle_file', 'Analysis', 'ImportBundle']
genelastic/api/__init__.py ADDED
File without changes
genelastic/api/extends/__init__.py ADDED
File without changes
genelastic/api/extends/example.py ADDED
@@ -0,0 +1,6 @@
+ from flask import Response, jsonify
+
+
+ def ping_2() -> Response:
+     """Test route to verify that the server is online."""
+     return jsonify({"message": "pong_2"})
genelastic/api/routes.py ADDED
@@ -0,0 +1,221 @@
+ from importlib.metadata import version
+ from pathlib import Path
+ from typing import Any
+
+ from flask import Response, current_app, jsonify
+
+
+ def ping() -> Response:
+     """Test route to verify that the server is online."""
+     return jsonify({"message": "pong"})
+
+
+ def list_indices() -> Response:
+     """Route to list Elasticsearch indices."""
+     return current_app.elastic_query_conn.get_indices()  # type: ignore[attr-defined, no-any-return]
+
+
+ def retrieve_document(index_id: str, document_id: str) -> Response:
+     """Route to retrieve a document by its ID."""
+     document = current_app.elastic_query_conn.get_document_by_id(  # type: ignore[attr-defined]
+         index_id, document_id
+     )
+     return jsonify(document)
+
+
+ def list_wet_processes() -> Response:
+     """Route to list wet processes."""
+     wet_processes_index = (
+         f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-wet_processes"
+     )
+     result = current_app.elastic_query_conn.get_field_values(  # type: ignore[attr-defined]
+         wet_processes_index, "proc_id"
+     )
+     return jsonify(list(result))
+
+
+ def list_bi_processes() -> Response:
+     """Route to list bi processes."""
+     bi_processes_index = (
+         f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-bi_processes"
+     )
+     result = current_app.elastic_query_conn.get_field_values(  # type: ignore[attr-defined]
+         bi_processes_index, "proc_id"
+     )
+     return jsonify(list(result))
+
+
+ def list_analyses() -> Response:
+     """Route to list analyses."""
+     analyses_index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-analyses"
+     result = current_app.elastic_query_conn.get_field_values(  # type: ignore[attr-defined]
+         analyses_index, "path"
+     )
+     filenames = [Path(path).name for path in result]
+     return jsonify(filenames)
+
+
+ def list_analyses_wet_processes(proc_id: str) -> Response:
+     """Route to list the analyses of a specific wet process."""
+     analyses_index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-analyses"
+
+     search_query = {
+         "query": {
+             "term": {
+                 "metadata.wet_process.keyword": proc_id,
+             }
+         }
+     }
+     response = current_app.elastic_query_conn.client.search(  # type: ignore[attr-defined]
+         index=analyses_index, body=search_query
+     )
+     result = [hit["_source"]["path"] for hit in response["hits"]["hits"]]
+
+     return jsonify(result)
+
+
+ def list_analyses_bi_processes(proc_id: str) -> Response:
+     """Route to list the analyses of a specific bi process."""
+     analyses_index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-analyses"
+
+     search_query = {
+         "query": {
+             "term": {
+                 "metadata.bi_process.keyword": proc_id,
+             }
+         }
+     }
+     response = current_app.elastic_query_conn.client.search(  # type: ignore[attr-defined]
+         index=analyses_index, body=search_query
+     )
+     result = [hit["_source"]["path"] for hit in response["hits"]["hits"]]
+
+     return jsonify(result)
+
+
+ def list_snv_documents() -> Response:
+     """Route to list all documents containing a mutation at a single position (SNV)."""
+     index_pattern = "genelastic-file-*"
+     target_value = "SNV"
+
+     search_query = {
+         "aggs": {
+             "snv_docs": {
+                 "composite": {
+                     "sources": [
+                         {"alt_value": {"terms": {"field": "alt.keyword"}}}
+                     ],
+                     "size": 1000,
+                 }
+             }
+         },
+         "query": {"term": {"alt.keyword": target_value}},
+         "size": 0,
+     }
+
+     all_documents = []
+     buckets = current_app.elastic_query_conn.run_composite_aggregation(  # type: ignore[attr-defined]
+         index_pattern, search_query
+     )
+
+     for bucket in buckets:
+         alt_value = bucket["key"]["alt_value"]
+
+         search_query_docs = {
+             "query": {"term": {"alt.keyword": alt_value}},
+             "size": 1000,
+         }
+
+         response = current_app.elastic_query_conn.client.search(  # type: ignore[attr-defined]
+             index=index_pattern, body=search_query_docs
+         )
+
+         all_documents.extend(response["hits"]["hits"])
+
+     return jsonify(all_documents)
+
+
+ def build_snv_search_query(
+     target_alt: str, target_svtype: str
+ ) -> dict[str, Any]:
+     """Helper function to build the search query for SNV documents with specified alt and SVTYPE."""
+     return {
+         "query": {
+             "bool": {
+                 "must": [
+                     {"term": {"alt.keyword": target_alt}},
+                     {"term": {"info.SVTYPE.keyword": target_svtype}},
+                 ]
+             }
+         },
+         "size": 1000,
+     }
+
+
+ def build_snv_mutation_search_query(
+     target_svtypes: list[str],
+ ) -> dict[str, Any]:
+     """Helper function to build the search query for SNV mutations with specified SVTYPE values."""
+     return {
+         "query": {
+             "bool": {
+                 "must": [
+                     {"term": {"alt.keyword": "SNV"}},
+                     {"terms": {"info.SVTYPE.keyword": target_svtypes}},
+                 ]
+             }
+         },
+         "size": 1000,
+     }
+
+
+ def list_snv_insertion_documents() -> Response:
+     """Route to list all documents containing an insertion (INS) at a single position (SNV)."""
+     index_pattern = "genelastic-file-*"
+     search_query = build_snv_search_query(target_alt="SNV", target_svtype="INS")
+
+     response = current_app.elastic_query_conn.client.search(  # type: ignore[attr-defined]
+         index=index_pattern, body=search_query
+     )
+
+     all_documents = [hit["_source"] for hit in response["hits"]["hits"]]
+
+     return jsonify(all_documents)
+
+
+ def list_snv_deletion_documents() -> Response:
+     """Route to list all documents containing a deletion (DEL) at a single position (SNV)."""
+     index_pattern = "genelastic-file-*"
+     search_query = build_snv_search_query(target_alt="SNV", target_svtype="DEL")
+
+     response = current_app.elastic_query_conn.client.search(  # type: ignore[attr-defined]
+         index=index_pattern, body=search_query
+     )
+
+     all_documents = [hit["_source"] for hit in response["hits"]["hits"]]
+
+     return jsonify(all_documents)
+
+
+ def list_snv_mutation_documents() -> Response:
+     """Route to list all documents containing an insertion or a deletion (INS or DEL) at a single position (SNV)."""
+     index_pattern = "genelastic-file-*"
+     target_svtypes = ["INS", "DEL"]
+
+     search_query = build_snv_mutation_search_query(
+         target_svtypes=target_svtypes
+     )
+
+     response = current_app.elastic_query_conn.client.search(  # type: ignore[attr-defined]
+         index=index_pattern, body=search_query
+     )
+
+     all_documents = [hit["_source"] for hit in response["hits"]["hits"]]
+
+     return jsonify(all_documents)
+
+
+ def get_genelastic_version() -> Response:
+     """Return the version of the genelastic package."""
+     top_level_package = __package__.split(".")[0]
+     return jsonify({"version": version(top_level_package)})
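The SNV routes above delegate query construction to the two `build_*` helpers. Below is a minimal sketch of the query body they produce, assuming genelastic 0.7.0 (and its Flask dependency) is installed; no Elasticsearch server is needed just to inspect the dict:

    import json

    from genelastic.api.routes import build_snv_search_query

    # A bool query with two term clauses:
    # alt.keyword == "SNV" and info.SVTYPE.keyword == "INS".
    query = build_snv_search_query(target_alt="SNV", target_svtype="INS")
    print(json.dumps(query, indent=2))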
genelastic/api/server.py ADDED
@@ -0,0 +1,80 @@
+ from pathlib import Path
+ from typing import Any
+
+ import connexion
+ import yaml
+
+ from genelastic.common import ElasticQueryConn
+
+
+ def load_yaml(file_path: Path) -> Any:  # noqa: ANN401
+     """Load a YAML file and return its content."""
+     content = None
+     with Path.open(file_path, encoding="utf-8") as f:
+         try:
+             content = yaml.safe_load(f)
+         except yaml.YAMLError as exc:
+             raise SystemExit(exc) from exc
+     return content
+
+
+ def aggregate_openapi_specs(
+     main_spec_file: Path, additional_spec_path: Path
+ ) -> Any:  # noqa: ANN401
+     """Aggregate OpenAPI specifications from a main file and a directory
+     of additional specifications.
+     """
+     main_spec = load_yaml(main_spec_file)
+     try:
+         entries = additional_spec_path.iterdir()
+     except OSError as exc:
+         raise SystemExit(exc) from exc
+
+     if "paths" not in main_spec:
+         main_spec["paths"] = {}
+
+     for entry in entries:
+         if not entry.is_file():
+             continue
+
+         if entry.suffix not in [".yml", ".yaml"]:
+             continue
+
+         content = load_yaml(entry)
+
+         if "paths" in content:
+             main_spec["paths"].update(content["paths"])
+
+     return main_spec
+
+
+ # Initialize the Connexion application.
+ connexion_app = connexion.FlaskApp(__name__)
+ connexion_app.app.config.from_object("src.genelastic.api.settings.Config")
+
+ # Initialize the Elasticsearch client.
+ es_url = connexion_app.app.config["GENAPI_ES_URL"]
+ es_cert_fp = connexion_app.app.config["GENAPI_ES_CERT_FP"]
+ es_api_key = connexion_app.app.config["GENAPI_ES_ENCODED_API_KEY"]
+
+ connexion_app.app.elastic_query_conn = ElasticQueryConn(
+     es_url, es_cert_fp, api_key=es_api_key
+ )
+
+ connexion_app.app.logger.debug(
+     "Successfully connected to Elasticsearch server: %s",
+     connexion_app.app.elastic_query_conn.client.info(),
+ )
+
+ # Paths to the YAML specification files.
+ main_yaml_file = Path(__file__).parents[0] / "specification.yml"
+ additional_yaml_dir = Path(__file__).parents[0] / "extends"
+
+ # Load and combine the YAML files.
+ yaml_spec = aggregate_openapi_specs(main_yaml_file, additional_yaml_dir)
+
+ # Register the aggregated OpenAPI specification.
+ connexion_app.add_api(yaml_spec)
+
+ if __name__ == "__main__":
+     connexion_app.run(debug=True)
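`aggregate_openapi_specs` merges the `paths` of every `.yml`/`.yaml` file found in `extends/` into the main specification, which is how `example.py`'s `ping_2` route gets exposed. A minimal sketch under two assumptions: genelastic 0.7.0 is installed, and the `GENAPI_*` variables point at a reachable Elasticsearch server, since the module connects at import time; the file names and contents below are illustrative only:

    import tempfile
    from pathlib import Path

    import yaml

    from genelastic.api.server import aggregate_openapi_specs  # connects to ES on import

    with tempfile.TemporaryDirectory() as tmp:
        # Throwaway main spec plus one extension file.
        main_spec = Path(tmp) / "specification.yml"
        extends = Path(tmp) / "extends"
        extends.mkdir()
        main_spec.write_text(yaml.safe_dump({"openapi": "3.0.0", "paths": {"/ping": {}}}))
        (extends / "example.yml").write_text(yaml.safe_dump({"paths": {"/ping_2": {}}}))

        merged = aggregate_openapi_specs(main_spec, extends)
        print(sorted(merged["paths"]))  # ['/ping', '/ping_2']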
genelastic/api/settings.py ADDED
@@ -0,0 +1,14 @@
+ from environs import Env
+
+ env = Env()
+ env.read_env()
+
+
+ class Config:
+     """Flask config class."""
+
+     # Load all required environment variables.
+     GENAPI_ES_URL = env.url("GENAPI_ES_URL").geturl()
+     GENAPI_ES_ENCODED_API_KEY = env.str("GENAPI_ES_ENCODED_API_KEY")
+     GENAPI_ES_INDEX_PREFIX = env.str("GENAPI_ES_INDEX_PREFIX")
+     GENAPI_ES_CERT_FP = env.str("GENAPI_ES_CERT_FP")
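`Config` resolves its four `GENAPI_*` values at import time through environs (`env.read_env()` also picks them up from a `.env` file). A minimal sketch with illustrative values:

    import os

    # Illustrative values; in a real deployment these come from the
    # environment or a .env file.
    os.environ.setdefault("GENAPI_ES_URL", "https://localhost:9200")
    os.environ.setdefault("GENAPI_ES_ENCODED_API_KEY", "base64-encoded-key")
    os.environ.setdefault("GENAPI_ES_INDEX_PREFIX", "genelastic")
    os.environ.setdefault("GENAPI_ES_CERT_FP", "aa:bb:cc")

    from genelastic.api.settings import Config

    print(Config.GENAPI_ES_URL)  # https://localhost:9200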
genelastic/common/__init__.py ADDED
@@ -0,0 +1,39 @@
+ """Genelastic package for common code between API and import scripts."""
+
+ from .cli import add_es_connection_args, add_verbose_control_args
+ from .elastic import ElasticImportConn, ElasticQueryConn
+ from .exceptions import DBIntegrityError
+ from .types import (
+     AnalysisDocument,
+     AnalysisMetaData,
+     BioInfoProcessData,
+     Bucket,
+     BulkItems,
+     BundleDict,
+     MetadataDocument,
+     ProcessDocument,
+     RandomAnalysisData,
+     RandomBiProcessData,
+     RandomWetProcessData,
+     WetProcessesData,
+ )
+
+ __all__ = [
+     "AnalysisDocument",
+     "AnalysisMetaData",
+     "BioInfoProcessData",
+     "Bucket",
+     "BulkItems",
+     "BundleDict",
+     "DBIntegrityError",
+     "ElasticImportConn",
+     "ElasticQueryConn",
+     "MetadataDocument",
+     "ProcessDocument",
+     "RandomAnalysisData",
+     "RandomBiProcessData",
+     "RandomWetProcessData",
+     "WetProcessesData",
+     "add_es_connection_args",
+     "add_verbose_control_args",
+ ]
genelastic/common/cli.py ADDED
@@ -0,0 +1,63 @@
+ """Utility functions for CLI scripts."""
+
+ import argparse
+
+
+ def add_verbose_control_args(parser: argparse.ArgumentParser) -> None:
+     """Add verbose control arguments to the parser.
+     Arguments are added to the parser by using its reference.
+     """
+     parser.add_argument(
+         "-q",
+         "--quiet",
+         dest="verbose",
+         action="store_const",
+         const=0,
+         default=1,
+         help="Set verbosity to 0 (quiet mode).",
+     )
+     parser.add_argument(
+         "-v",
+         "--verbose",
+         dest="verbose",
+         action="count",
+         default=1,
+         help=(
+             "Verbose level. -v for information, -vv for debug, -vvv for trace."
+         ),
+     )
+
+
+ def add_es_connection_args(parser: argparse.ArgumentParser) -> None:
+     """Add arguments to the parser needed to gather Elasticsearch server connection parameters.
+     Arguments are added to the parser by using its reference.
+     """
+     parser.add_argument(
+         "--es-host",
+         dest="es_host",
+         default="localhost",
+         help="Address of Elasticsearch host.",
+     )
+     parser.add_argument(
+         "--es-port",
+         type=int,
+         default=9200,
+         dest="es_port",
+         help="Elasticsearch port.",
+     )
+     parser.add_argument(
+         "--es-usr", dest="es_usr", default="elastic", help="Elasticsearch user."
+     )
+     parser.add_argument(
+         "--es-pwd", dest="es_pwd", required=True, help="Elasticsearch password."
+     )
+     parser.add_argument(
+         "--es-cert-fp",
+         dest="es_cert_fp",
+         help="Elasticsearch sha256 certificate fingerprint.",
+     )
+     parser.add_argument(
+         "--es-index-prefix",
+         dest="es_index_prefix",
+         help="Add the given prefix to each index created during import.",
+     )
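Both helpers mutate the parser passed to them, so the new CLI entry points can share the same flags. A minimal sketch, assuming genelastic 0.7.0 (and its elasticsearch dependency) is installed:

    import argparse

    from genelastic.common import add_es_connection_args, add_verbose_control_args

    parser = argparse.ArgumentParser(prog="demo")
    add_verbose_control_args(parser)
    add_es_connection_args(parser)

    # -vv raises the default verbosity of 1 to 3; --es-pwd is the only
    # required flag.
    args = parser.parse_args(["-vv", "--es-pwd", "secret", "--es-port", "9201"])
    print(args.verbose, args.es_host, args.es_port)  # 3 localhost 9201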
genelastic/common/elastic.py ADDED
@@ -0,0 +1,214 @@
+ import datetime
+ import logging
+ import time
+ import typing
+ from abc import ABC
+ from typing import Any
+
+ import elastic_transport
+ import elasticsearch.helpers
+ from elasticsearch import Elasticsearch
+
+ from .exceptions import DBIntegrityError
+ from .types import Bucket, BulkItems
+
+ logger = logging.getLogger("genelastic")
+
+
+ class ElasticConn(ABC):
+     """Abstract class representing a connector for an Elasticsearch server."""
+
+     client: Elasticsearch
+
+     def __init__(self, url: str, fingerprint: str, **kwargs: Any) -> None:  # noqa: ANN401
+         """Initialize an Elasticsearch client instance.
+
+         The configured client is stored in the ``client`` attribute.
+
+         :param url: URL of the Elasticsearch host.
+         :param fingerprint: sha256 certificate fingerprint for a secure HTTP connection.
+         :raises SystemExit: If the connection to the Elasticsearch server fails.
+         """
+         try:
+             self.client = Elasticsearch(
+                 url,
+                 ssl_assert_fingerprint=fingerprint,
+                 # Verify cert only when the fingerprint is not None.
+                 verify_certs=bool(fingerprint),
+                 **kwargs,
+             )
+             self.client.info()
+         except (
+             elastic_transport.TransportError,
+             elasticsearch.AuthenticationException,
+         ) as e:
+             raise SystemExit(e) from e
+
+
+ class ElasticImportConn(ElasticConn):
+     """Connector to import data into an Elasticsearch database."""
+
+     def import_items(
+         self, bulk_items: BulkItems, start_time: float, total_items: int
+     ) -> None:
+         """Import items to the Elasticsearch database."""
+         if len(bulk_items) > 0:
+             elasticsearch.helpers.bulk(self.client, bulk_items)
+             elapsed = time.perf_counter() - start_time
+             logger.info(
+                 "Imported %d items in %s (%f items/s).",
+                 total_items,
+                 datetime.timedelta(seconds=elapsed),
+                 total_items / elapsed,
+             )
+
+
+ class ElasticQueryConn(ElasticConn):
+     """Connector to query data from an Elasticsearch database."""
+
+     def get_indices(self) -> Any | str:  # noqa: ANN401
+         """Return all indices."""
+         return self.client.cat.indices(format="json").body
+
+     def get_document_by_id(self, index: str, document_id: str) -> Any | str:  # noqa: ANN401
+         """Return a document by its ID."""
+         return self.client.get(index=index, id=document_id).body
+
+     def run_composite_aggregation(
+         self, index: str, query: dict[str, typing.Any]
+     ) -> list[Bucket]:
+         """Execute a composite aggregation on an Elasticsearch index and
+         return all paginated results.
+
+         :param index: Name of the index to query.
+         :param query: Aggregation query to run.
+         :return: List of aggregation results.
+         """
+         # Extract the aggregation name from the query dict.
+         agg_name = next(iter(query["aggs"]))
+         all_buckets: list[Bucket] = []
+
+         try:
+             logger.debug(
+                 "Running composite aggregation query %s on index '%s'.",
+                 query,
+                 index,
+             )
+             response = self.client.search(index=index, body=query)
+         except elasticsearch.NotFoundError as e:
+             msg = f"Error: {e.message} for index '{index}'."
+             raise SystemExit(msg) from e
+
+         while True:
+             # Extract buckets from the response.
+             buckets: list[Bucket] = response["aggregations"][agg_name][
+                 "buckets"
+             ]
+             all_buckets.extend(buckets)
+
+             # Check if there are more results to fetch.
+             if "after_key" in response["aggregations"][agg_name]:
+                 after_key = response["aggregations"][agg_name]["after_key"]
+                 query["aggs"][agg_name]["composite"]["after"] = after_key
+                 try:
+                     logger.debug(
+                         "Running query %s on index '%s'.", query, index
+                     )
+                     # Fetch the next page of results.
+                     response = self.client.search(index=index, body=query)
+                 except elasticsearch.NotFoundError as e:
+                     msg = f"Error: {e.message} for index '{index}'."
+                     raise SystemExit(msg) from e
+             else:
+                 break
+
+         return all_buckets
+
+     def get_field_values(self, index: str, field_name: str) -> set[str]:
+         """Return a set of values for a given field."""
+         values = set()
+
+         query = {
+             "size": 0,
+             "aggs": {
+                 "get_field_values": {
+                     "composite": {
+                         "sources": {
+                             "values": {
+                                 "terms": {"field": f"{field_name}.keyword"}
+                             }
+                         },
+                         "size": 1000,
+                     }
+                 }
+             },
+         }
+
+         buckets: list[Bucket] = self.run_composite_aggregation(index, query)
+
+         for bucket in buckets:
+             values.add(bucket["key"]["values"])
+
+         return values
+
+     def search_by_field_value(
+         self, index: str, field: str, value: str
+     ) -> dict[str, typing.Any] | None:
+         """Search a document by a value for a certain field."""
+         logger.info(
+             "Searching for field '%s' with value '%s' inside index '%s'.",
+             field,
+             value,
+             index,
+         )
+         search_query = {
+             "query": {
+                 "term": {
+                     f"{field}.keyword": value,
+                 }
+             }
+         }
+
+         response = self.client.search(index=index, body=search_query)
+
+         try:
+             return response["hits"]["hits"][0]["_source"]  # type: ignore[no-any-return]
+         except (IndexError, KeyError):
+             return None
+
+     def ensure_unique(self, index: str, field: str) -> None:
+         """Ensure that all values of a field in an index are unique.
+
+         :param index: Name of the index.
+         :param field: Field name to check for value uniqueness.
+         :raises genelastic.common.DBIntegrityError:
+             Some values of the given field are duplicated in the index.
+         """
+         logger.info(
+             "Ensuring that the field '%s' in the index '%s' only contains unique values...",
+             field,
+             index,
+         )
+         query = {
+             "size": 0,
+             "aggs": {
+                 "duplicate_proc_ids": {
+                     "terms": {
+                         "field": f"{field}.keyword",
+                         "size": 10000,
+                         "min_doc_count": 2,
+                     }
+                 }
+             },
+         }
+         buckets: list[Bucket] = self.run_composite_aggregation(index, query)
+         duplicated_processes: set[str] = {
+             str(bucket["key"]) for bucket in buckets
+         }
+
+         if len(duplicated_processes) > 0:
+             msg = f"Found non-unique values for field '{field}' in index '{index}': {', '.join(duplicated_processes)}."
+             raise DBIntegrityError(msg)
+
+         logger.info(
+             "All values of field '%s' in index '%s' are unique.", field, index
+         )
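`ElasticQueryConn` hides the pagination of composite aggregations: `get_field_values()` keeps re-issuing the query with the returned `after_key` until no page remains. A minimal usage sketch; the URL, fingerprint, credentials, and index name are illustrative, and a reachable Elasticsearch server is assumed:

    from genelastic.common import ElasticQueryConn

    conn = ElasticQueryConn(
        "https://localhost:9200",
        "aa:bb:cc",  # sha256 certificate fingerprint (illustrative)
        basic_auth=("elastic", "secret"),
    )

    # Pages through the composite aggregation under the hood.
    proc_ids = conn.get_field_values("genelastic-wet_processes", "proc_id")
    print(sorted(proc_ids))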
genelastic/common/exceptions.py ADDED
@@ -0,0 +1,4 @@
+ class DBIntegrityError(Exception):
+     """Represents an integrity error,
+     raised when the database content does not match the expected data schema.
+     """
genelastic/common/types.py ADDED
@@ -0,0 +1,25 @@
+ import typing
+
+ # Types related to Elasticsearch data import.
+ Bucket: typing.TypeAlias = dict[str, dict[typing.Any, typing.Any]]
+ BundleDict: typing.TypeAlias = dict[str, typing.Any]
+
+ AnalysisMetaData: typing.TypeAlias = dict[str, str | int]
+ WetProcessesData: typing.TypeAlias = dict[str, str | int | float]
+ BioInfoProcessData: typing.TypeAlias = dict[str, str | list[str]]
+
+ AnalysisDocument: typing.TypeAlias = dict[str, str | None | AnalysisMetaData]
+ MetadataDocument: typing.TypeAlias = dict[
+     str, int | str | list[typing.Any | None]
+ ]
+ ProcessDocument: typing.TypeAlias = (
+     dict[str, str] | WetProcessesData | BioInfoProcessData
+ )
+ BulkItems: typing.TypeAlias = list[
+     dict[str, str | MetadataDocument | AnalysisDocument | ProcessDocument]
+ ]
+
+ # Types related to random bundle generation.
+ RandomBiProcessData: typing.TypeAlias = dict[str, str | list[dict[str, str]]]
+ RandomWetProcessData: typing.TypeAlias = dict[str, str | float]
+ RandomAnalysisData: typing.TypeAlias = dict[str, str | list[int | str]]
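These aliases document the shapes passed between the import and query code; for instance, `Bucket` matches the composite-aggregation buckets that `get_field_values()` unpacks. A minimal sketch (the proc_id value is illustrative):

    from genelastic.common import Bucket

    # Shaped like one composite-aggregation bucket: {"key": {<source>: <value>}}.
    bucket: Bucket = {"key": {"values": "wet-proc-01"}}
    print(bucket["key"]["values"])  # wet-proc-01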