genelastic 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. genelastic/api/extends/example.py +2 -3
  2. genelastic/api/routes.py +160 -23
  3. genelastic/api/server.py +30 -22
  4. genelastic/api/settings.py +3 -2
  5. genelastic/common/__init__.py +36 -9
  6. genelastic/common/cli.py +51 -23
  7. genelastic/common/elastic.py +80 -49
  8. genelastic/common/exceptions.py +0 -2
  9. genelastic/common/types.py +20 -15
  10. genelastic/import_data/__init__.py +23 -5
  11. genelastic/import_data/analyses.py +17 -20
  12. genelastic/import_data/analysis.py +69 -65
  13. genelastic/import_data/bi_process.py +7 -5
  14. genelastic/import_data/bi_processes.py +8 -8
  15. genelastic/import_data/cli_gen_data.py +116 -0
  16. genelastic/import_data/cli_import.py +379 -0
  17. genelastic/import_data/{info.py → cli_info.py} +104 -75
  18. genelastic/import_data/cli_integrity.py +384 -0
  19. genelastic/import_data/cli_validate.py +54 -0
  20. genelastic/import_data/constants.py +11 -32
  21. genelastic/import_data/data_file.py +23 -20
  22. genelastic/import_data/filename_pattern.py +26 -32
  23. genelastic/import_data/import_bundle.py +56 -47
  24. genelastic/import_data/import_bundle_factory.py +166 -158
  25. genelastic/import_data/logger.py +22 -18
  26. genelastic/import_data/random_bundle.py +402 -0
  27. genelastic/import_data/tags.py +46 -26
  28. genelastic/import_data/wet_process.py +8 -4
  29. genelastic/import_data/wet_processes.py +13 -8
  30. genelastic/ui/__init__.py +0 -0
  31. genelastic/ui/server.py +87 -0
  32. genelastic/ui/settings.py +11 -0
  33. genelastic-0.7.0.dist-info/METADATA +105 -0
  34. genelastic-0.7.0.dist-info/RECORD +40 -0
  35. {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
  36. genelastic-0.7.0.dist-info/entry_points.txt +6 -0
  37. genelastic/import_data/gen_data.py +0 -194
  38. genelastic/import_data/import_data.py +0 -292
  39. genelastic/import_data/integrity.py +0 -290
  40. genelastic/import_data/validate_data.py +0 -43
  41. genelastic-0.6.1.dist-info/METADATA +0 -41
  42. genelastic-0.6.1.dist-info/RECORD +0 -36
  43. genelastic-0.6.1.dist-info/entry_points.txt +0 -6
  44. {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0
@@ -1,71 +1,100 @@
1
- # pylint: disable=missing-module-docstring
2
1
  import argparse
3
2
  import logging
4
- import typing
5
3
 
6
- from genelastic.common import (ElasticQueryConn, add_verbose_control_args,
7
- add_es_connection_args, Bucket)
4
+ from genelastic.common import (
5
+ Bucket,
6
+ ElasticQueryConn,
7
+ add_es_connection_args,
8
+ add_verbose_control_args,
9
+ )
8
10
 
9
11
  from .logger import configure_logging
10
12
 
11
- logger = logging.getLogger('genelastic')
12
- logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
13
+ logger = logging.getLogger("genelastic")
14
+ logging.getLogger("elastic_transport").setLevel(
15
+ logging.WARNING
16
+ ) # Disable excessive logging
13
17
 
14
18
 
15
19
  def read_args() -> argparse.Namespace:
16
20
  """Read arguments from command line."""
17
- parser = argparse.ArgumentParser(description='ElasticSearch database info.',
18
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
19
- allow_abbrev=False)
21
+ parser = argparse.ArgumentParser(
22
+ description="ElasticSearch database info.",
23
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
24
+ allow_abbrev=False,
25
+ )
20
26
  add_verbose_control_args(parser)
21
27
  add_es_connection_args(parser)
22
- parser.add_argument("-y", "--list-bundles", action="store_true",
23
- help="List all imported YAML bundles.")
24
- parser.add_argument("-f", "--list-data-files", action="store_true",
25
- help="List all imported data files.")
26
- parser.add_argument("-w", "--list-wet-processes", action="store_true",
27
- help="List all imported wet processes.")
28
- parser.add_argument("-b", "--list-bi-processes", action="store_true",
29
- help="List all imported bio info processes.")
30
- parser.add_argument("-Y", "--list-data-files-per-bundle", action="store_true",
31
- help="For each imported YAML bundle, "
32
- "display some info and list its data files.")
28
+ parser.add_argument(
29
+ "-y",
30
+ "--list-bundles",
31
+ action="store_true",
32
+ help="List all imported YAML bundles.",
33
+ )
34
+ parser.add_argument(
35
+ "-f",
36
+ "--list-data-files",
37
+ action="store_true",
38
+ help="List all imported data files.",
39
+ )
40
+ parser.add_argument(
41
+ "-w",
42
+ "--list-wet-processes",
43
+ action="store_true",
44
+ help="List all imported wet processes.",
45
+ )
46
+ parser.add_argument(
47
+ "-b",
48
+ "--list-bi-processes",
49
+ action="store_true",
50
+ help="List all imported bio info processes.",
51
+ )
52
+ parser.add_argument(
53
+ "-Y",
54
+ "--list-data-files-per-bundle",
55
+ action="store_true",
56
+ help="For each imported YAML bundle, "
57
+ "display some info and list its data files.",
58
+ )
33
59
  return parser.parse_args()
34
60
 
35
61
 
36
62
  def list_bundles(es_query_conn: ElasticQueryConn, index: str) -> None:
37
63
  """List all imported YAML bundles."""
38
-
39
64
  query = {
40
65
  "size": 0,
41
66
  "aggs": {
42
67
  "get_bundle_paths": {
43
68
  "composite": {
44
- "sources": {"bundle_path": {"terms": {"field": "bundle_path.keyword"}}},
69
+ "sources": {
70
+ "bundle_path": {
71
+ "terms": {"field": "bundle_path.keyword"}
72
+ }
73
+ },
45
74
  "size": 1000,
46
75
  }
47
76
  }
48
- }
77
+ },
49
78
  }
50
79
 
51
- buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
80
+ buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
81
+ index, query
82
+ )
52
83
 
53
- print("Imported YAML files")
54
- print("===================")
84
+ logger.info("Imported YAML files")
85
+ logger.info("===================")
55
86
 
56
87
  if len(buckets) == 0:
57
- print("Empty response.", end="\n")
88
+ logger.info("Empty response.")
58
89
  return
59
90
 
60
91
  for bucket in buckets:
61
- bundle_path = bucket['key']['bundle_path']
62
- print(f'- {bundle_path}')
63
- print()
92
+ bundle_path = bucket["key"]["bundle_path"]
93
+ logger.info("- %s", bundle_path)
64
94
 
65
95
 
66
96
  def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
67
97
  """List all imported data files."""
68
-
69
98
  query = {
70
99
  "size": 0,
71
100
  "aggs": {
@@ -75,22 +104,23 @@ def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
75
104
  "size": 1000,
76
105
  }
77
106
  }
78
- }
107
+ },
79
108
  }
80
109
 
81
- buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
110
+ buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
111
+ index, query
112
+ )
82
113
 
83
- print("Imported data files")
84
- print("===================")
114
+ logger.info("Imported data files")
115
+ logger.info("===================")
85
116
 
86
117
  if len(buckets) == 0:
87
- print("Empty response.", end="\n")
118
+ logger.info("Empty response.")
88
119
  return
89
120
 
90
121
  for bucket in buckets:
91
- bundle_path = bucket['key']['path']
92
- print(f'- {bundle_path}')
93
- print()
122
+ bundle_path = bucket["key"]["path"]
123
+ logger.info("- %s", bundle_path)
94
124
 
95
125
 
96
126
  def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
@@ -98,29 +128,30 @@ def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
98
128
  process_ids = es_query_conn.get_field_values(index, "proc_id")
99
129
 
100
130
  if len(process_ids) == 0:
101
- print("Empty response.", end="\n")
131
+ logger.info("Empty response.")
102
132
  return
103
133
 
104
134
  for process_id in process_ids:
105
- print(f'- {process_id}')
106
- print()
135
+ logger.info("- %s", process_id)
107
136
 
108
137
 
109
138
  def list_wet_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
110
139
  """List all wet processes."""
111
- print("Imported wet processes")
112
- print("======================")
140
+ logger.info("Imported wet processes")
141
+ logger.info("======================")
113
142
  list_processes(es_query_conn, index)
114
143
 
115
144
 
116
145
  def list_bi_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
117
146
  """List all bio info processes."""
118
- print("Imported bi processes")
119
- print("=====================")
147
+ logger.info("Imported bi processes")
148
+ logger.info("=====================")
120
149
  list_processes(es_query_conn, index)
121
150
 
122
151
 
123
- def list_data_files_per_bundle(es_query_conn: ElasticQueryConn, index: str) -> None:
152
+ def list_data_files_per_bundle(
153
+ es_query_conn: ElasticQueryConn, index: str
154
+ ) -> None:
124
155
  """For each imported YAML bundle, display some info and list its data files."""
125
156
  query = {
126
157
  "size": 0,
@@ -130,50 +161,47 @@ def list_data_files_per_bundle(es_query_conn: ElasticQueryConn, index: str) -> N
130
161
  "sources": [
131
162
  {
132
163
  "bundle_path": {
133
- "terms": {
134
- "field": "bundle_path.keyword"
135
- }
164
+ "terms": {"field": "bundle_path.keyword"}
136
165
  }
137
166
  }
138
167
  ],
139
- "size": 100
168
+ "size": 100,
140
169
  },
141
- "aggs": {
142
- "docs": {
143
- "top_hits": {
144
- "size": 100
145
- }
146
- }
147
- }
170
+ "aggs": {"docs": {"top_hits": {"size": 100}}},
148
171
  }
149
- }
172
+ },
150
173
  }
151
174
 
152
- buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
175
+ buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
176
+ index, query
177
+ )
153
178
 
154
- print("Data files per YAML bundle")
155
- print("==========================")
179
+ logger.info("Data files per YAML bundle")
180
+ logger.info("==========================")
156
181
 
157
182
  if len(buckets) == 0:
158
- print("Empty response.", end="\n")
183
+ logger.info("Empty response.")
159
184
  return
160
185
 
161
186
  for bucket in buckets:
162
-
163
187
  documents = bucket["docs"]["hits"]["hits"]
164
188
  if len(documents) == 0:
165
189
  continue
166
190
 
167
- print(f"- Bundle Path: {bucket['key']['bundle_path']}")
168
- print(f" -> Wet process: {documents[0]['_source']['metadata']['wet_process']}")
169
- print(f" -> Bio info process: {documents[0]['_source']['metadata']['bi_process']}")
170
- print(" -> Data files:")
191
+ logger.info("- Bundle Path: %s", bucket["key"]["bundle_path"])
192
+ logger.info(
193
+ " -> Wet process: %s",
194
+ documents[0]["_source"]["metadata"]["wet_process"],
195
+ )
196
+ logger.info(
197
+ " -> Bio info process: %s",
198
+ documents[0]["_source"]["metadata"]["bi_process"],
199
+ )
200
+ logger.info(" -> Data files:")
171
201
 
172
202
  for doc in documents:
173
- print(f" - Index: {doc['_source']['file_index']}")
174
- print(f" Path: {doc['_source']['path']}")
175
-
176
- print()
203
+ logger.info(" - Index: %s", doc["_source"]["file_index"])
204
+ logger.info(" Path: %s", doc["_source"]["path"])
177
205
 
178
206
 
179
207
  def main() -> None:
@@ -185,8 +213,9 @@ def main() -> None:
185
213
 
186
214
  addr = f"https://{args.es_host}:{args.es_port}"
187
215
  logger.info("Trying to connect to Elasticsearch at %s...", addr)
188
- es_query_conn = ElasticQueryConn(addr, args.es_cert_fp,
189
- basic_auth=(args.es_usr, args.es_pwd))
216
+ es_query_conn = ElasticQueryConn(
217
+ addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
218
+ )
190
219
 
191
220
  analysis_index = f"{args.es_index_prefix}-analyses"
192
221
  wet_processes_index = f"{args.es_index_prefix}-wet_processes"
@@ -223,5 +252,5 @@ def main() -> None:
223
252
  list_data_files_per_bundle(es_query_conn, analysis_index)
224
253
 
225
254
 
226
- if __name__ == '__main__':
255
+ if __name__ == "__main__":
227
256
  main()
@@ -0,0 +1,384 @@
1
+ import argparse
2
+ import logging
3
+
4
+ from elasticsearch import NotFoundError
5
+
6
+ from genelastic.common import (
7
+ Bucket,
8
+ DBIntegrityError,
9
+ ElasticQueryConn,
10
+ add_es_connection_args,
11
+ add_verbose_control_args,
12
+ )
13
+
14
+ from .logger import configure_logging
15
+
16
+ logger = logging.getLogger("genelastic")
17
+ logging.getLogger("elastic_transport").setLevel(
18
+ logging.WARNING
19
+ ) # Disable excessive logging
20
+
21
+
22
+ def read_args() -> argparse.Namespace:
23
+ """Read arguments from command line."""
24
+ parser = argparse.ArgumentParser(
25
+ description="Utility to check the integrity "
26
+ "of the genelastic ElasticSearch database.",
27
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
28
+ allow_abbrev=False,
29
+ )
30
+ add_verbose_control_args(parser)
31
+ add_es_connection_args(parser)
32
+ return parser.parse_args()
33
+
34
+
35
+ def check_for_undefined_file_indices(
36
+ es_query_conn: ElasticQueryConn, analyses_index: str
37
+ ) -> None:
38
+ """Check for potentially undefined files indices in the analyses index.
39
+
40
+ :param es_query_conn: Elasticsearch database instance.
41
+ :param analyses_index: Name of the index where analyses are stored.
42
+ :raises genelastic.common.DBIntegrityError:
43
+ Some file indices are used in the analyses index but are undefined.
44
+ """
45
+ logger.info(
46
+ "Checking for references to undefined file indices in the index '%s'...",
47
+ analyses_index,
48
+ )
49
+
50
+ undefined_indices = set()
51
+
52
+ query = {
53
+ "size": 0,
54
+ "aggs": {
55
+ "get_file_indices": {
56
+ "composite": {
57
+ "sources": {
58
+ "file_index": {"terms": {"field": "file_index.keyword"}}
59
+ },
60
+ "size": 1000,
61
+ }
62
+ }
63
+ },
64
+ }
65
+
66
+ buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
67
+ analyses_index, query
68
+ )
69
+
70
+ for bucket in buckets:
71
+ file_index = bucket["key"]["file_index"]
72
+
73
+ try:
74
+ es_query_conn.client.indices.get(index=file_index)
75
+ logger.debug(
76
+ "File index %s used in index '%s' is defined.",
77
+ file_index,
78
+ analyses_index,
79
+ )
80
+ except NotFoundError:
81
+ logger.debug(
82
+ "File index %s used in '%s' is undefined.",
83
+ file_index,
84
+ analyses_index,
85
+ )
86
+ undefined_indices.add(file_index)
87
+
88
+ if len(undefined_indices) > 0:
89
+ msg = (
90
+ f"Found the following undefined file indices defined in the index '{analyses_index}': "
91
+ f"{', '.join(undefined_indices)}"
92
+ )
93
+ raise DBIntegrityError(msg)
94
+
95
+ logger.info("All defined file indices are referenced.")
96
+
97
+
98
+ def get_undefined_processes(
99
+ es_query_conn: ElasticQueryConn,
100
+ analyses_index: str,
101
+ process_index: str,
102
+ field: str,
103
+ ) -> set[str]:
104
+ """Return a set of undefined processes IDs in an index.
105
+
106
+ :param es_query_conn: Elasticsearch database instance.
107
+ :param analyses_index: Name of the index where analyses are stored.
108
+ :param process_index: Name of the index to check for undefined processes.
109
+ :param field: Field name used to retrieve the process ID.
110
+ :returns: A set of undefined process IDs.
111
+ """
112
+ query = {
113
+ "size": 0,
114
+ "aggs": {
115
+ "get_analyses_processes": {
116
+ "composite": {
117
+ "sources": {
118
+ "process": {"terms": {"field": f"{field}.keyword"}}
119
+ },
120
+ "size": 1000,
121
+ }
122
+ }
123
+ },
124
+ }
125
+
126
+ buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
127
+ analyses_index, query
128
+ )
129
+
130
+ used_processes = {bucket["key"]["process"] for bucket in buckets}
131
+ logger.debug(
132
+ "Used values for field '%s' in index '%s': %s",
133
+ field,
134
+ analyses_index,
135
+ used_processes,
136
+ )
137
+
138
+ defined_processes = es_query_conn.get_field_values(process_index, "proc_id")
139
+ logger.debug(
140
+ "Defined values in index '%s': %s", process_index, defined_processes
141
+ )
142
+
143
+ return used_processes.difference(defined_processes)
144
+
145
+
146
+ def check_for_undefined_wet_processes(
147
+ es_query_conn: ElasticQueryConn, analyses_index: str, wet_process_index: str
148
+ ) -> None:
149
+ """Check that each wet process used in the analyses index is defined.
150
+
151
+ :param es_query_conn: Elasticsearch database instance.
152
+ :param analyses_index: Name of the index where analyses are stored.
153
+ :param wet_process_index: Name of the index where wet processes are stored.
154
+ :raises genelastic.common.DBIntegrityError:
155
+ Some wet processes used in the analyses index are undefined.
156
+ """
157
+ logger.info(
158
+ "Checking for undefined wet processes used in index '%s'...",
159
+ analyses_index,
160
+ )
161
+ undefined_wet_processes = get_undefined_processes(
162
+ es_query_conn, analyses_index, wet_process_index, "metadata.wet_process"
163
+ )
164
+
165
+ if len(undefined_wet_processes) > 0:
166
+ msg = (
167
+ f"Index '{analyses_index}' uses the following undefined wet processes: "
168
+ f"{', '.join(undefined_wet_processes)}."
169
+ )
170
+ raise DBIntegrityError(msg)
171
+
172
+ logger.info(
173
+ "All wet processes used in index '%s' are defined.", wet_process_index
174
+ )
175
+
176
+
177
+ def check_for_undefined_bi_processes(
178
+ es_query_conn: ElasticQueryConn, analyses_index: str, bi_process_index: str
179
+ ) -> None:
180
+ """Check that each bio info process used in the analyses index is defined.
181
+
182
+ :param es_query_conn: Elasticsearch database instance.
183
+ :param analyses_index: Name of the index where analyses are stored.
184
+ :param bi_process_index: Name of the index where bio info processes are stored.
185
+ :raises genelastic.common.DBIntegrityError:
186
+ Some bio info processes used in the analyses index are undefined.
187
+ """
188
+ logger.info(
189
+ "Checking for undefined bio info processes used in index '%s'...",
190
+ analyses_index,
191
+ )
192
+ undefined_bi_processes = get_undefined_processes(
193
+ es_query_conn, analyses_index, bi_process_index, "metadata.bi_process"
194
+ )
195
+
196
+ if len(undefined_bi_processes) > 0:
197
+ msg = (
198
+ f"Index '{analyses_index}' uses the following undefined bio info processes: "
199
+ f"{', '.join(undefined_bi_processes)}."
200
+ )
201
+ raise DBIntegrityError(msg)
202
+
203
+ logger.info(
204
+ "All bio info processes used in index '%s' are defined.",
205
+ bi_process_index,
206
+ )
207
+
208
+
209
+ def check_for_unused_file_indices(
210
+ es_query_conn: ElasticQueryConn, analyses_index: str, index_prefix: str
211
+ ) -> int:
212
+ """Check that each of the file indices are used in at least one analysis.
213
+
214
+ :param es_query_conn: Elasticsearch database instance.
215
+ :param analyses_index: Name of the index where analyses are stored.
216
+ :param index_prefix: Prefix given to all the indices of the ElasticSearch database.
217
+ :returns: 1 if some file indices exist but are unused in the analyses index,
218
+ and 0 otherwise.
219
+ """
220
+ json_indices = es_query_conn.client.cat.indices(
221
+ index=f"{index_prefix}-file-*", format="json"
222
+ ).body
223
+
224
+ found_file_indices = set()
225
+ for x in json_indices:
226
+ if isinstance(x, dict):
227
+ found_file_indices.add(x["index"])
228
+
229
+ query = {
230
+ "size": 0,
231
+ "aggs": {
232
+ "get_file_indices": {
233
+ "composite": {
234
+ "sources": {
235
+ "file_index": {"terms": {"field": "file_index.keyword"}}
236
+ },
237
+ "size": 1000,
238
+ }
239
+ }
240
+ },
241
+ }
242
+
243
+ buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
244
+ analyses_index, query
245
+ )
246
+
247
+ used_files_indices = {bucket["key"]["file_index"] for bucket in buckets}
248
+ unused_files_indices = found_file_indices.difference(used_files_indices)
249
+
250
+ if len(unused_files_indices) > 0:
251
+ logger.warning(
252
+ "Found the following unused files indices: %s",
253
+ ", ".join(unused_files_indices),
254
+ )
255
+ return 1
256
+
257
+ logger.info("All files indices are used.")
258
+ return 0
259
+
260
+
261
+ def check_for_unused_wet_processes(
262
+ es_query_conn: ElasticQueryConn, analyses_index: str, wet_proc_index: str
263
+ ) -> int:
264
+ """Check for defined wet processes that are not used in the analyses index.
265
+
266
+ :param es_query_conn: Elasticsearch database instance.
267
+ :param analyses_index: Name of the index where analyses are stored.
268
+ :param wet_proc_index: Name of the index where wet processes are stored.
269
+ :returns: 1 if some wet processes are defined but unused in the analyses index,
270
+ and 0 otherwise.
271
+ """
272
+ logger.info(
273
+ "Checking for unused wet processes in the index '%s'...", wet_proc_index
274
+ )
275
+
276
+ defined_wet_procs = es_query_conn.get_field_values(
277
+ wet_proc_index, "proc_id"
278
+ )
279
+ logger.debug(
280
+ "Found the following defined wet processes: %s", defined_wet_procs
281
+ )
282
+
283
+ used_wet_procs = es_query_conn.get_field_values(
284
+ analyses_index, "metadata.wet_process"
285
+ )
286
+ logger.debug(
287
+ "Following processes are used in the index '%s': %s",
288
+ analyses_index,
289
+ used_wet_procs,
290
+ )
291
+
292
+ unused_wet_procs = defined_wet_procs - used_wet_procs
293
+ if len(unused_wet_procs) > 0:
294
+ logger.warning("Found unused wet processes: %s", unused_wet_procs)
295
+ return 1
296
+
297
+ logger.info("No unused wet processes found.")
298
+ return 0
299
+
300
+
301
+ def check_for_unused_bi_processes(
302
+ es_query_conn: ElasticQueryConn, analyses_index: str, bi_proc_index: str
303
+ ) -> int:
304
+ """Check for defined bio info processes that are not used in the analyses index.
305
+
306
+ :param es_query_conn: Elasticsearch database instance.
307
+ :param analyses_index: Name of the index where analyses are stored.
308
+ :param bi_proc_index: Name of the index where bio info processes are stored.
309
+ :returns: 1 if some bio info processes are defined but unused in the analyses index,
310
+ and 0 otherwise.
311
+ """
312
+ logger.info(
313
+ "Checking for unused bio info processes in the index '%s'...",
314
+ bi_proc_index,
315
+ )
316
+
317
+ defined_bi_procs = es_query_conn.get_field_values(bi_proc_index, "proc_id")
318
+ logger.debug(
319
+ "Found the following defined bio info processes: %s", defined_bi_procs
320
+ )
321
+
322
+ used_bi_procs = es_query_conn.get_field_values(
323
+ analyses_index, "metadata.bi_process"
324
+ )
325
+ logger.debug(
326
+ "Following processes are used in the index '%s': %s",
327
+ analyses_index,
328
+ used_bi_procs,
329
+ )
330
+
331
+ unused_bi_procs = defined_bi_procs - used_bi_procs
332
+ if len(unused_bi_procs) > 0:
333
+ logger.warning("Found unused bio info processes: %s", unused_bi_procs)
334
+ return 1
335
+
336
+ logger.info("No unused bio info processes found.")
337
+ return 0
338
+
339
+
340
+ def main() -> None:
341
+ """Entry point of the integrity script."""
342
+ args = read_args()
343
+
344
+ configure_logging(args.verbose)
345
+ logger.debug("Arguments: %s", args)
346
+
347
+ analyses_index = f"{args.es_index_prefix}-analyses"
348
+ wet_processes_index = f"{args.es_index_prefix}-wet_processes"
349
+ bi_processes_index = f"{args.es_index_prefix}-bi_processes"
350
+
351
+ addr = f"https://{args.es_host}:{args.es_port}"
352
+ logger.info("Trying to connect to Elasticsearch at %s...", addr)
353
+ es_query_conn = ElasticQueryConn(
354
+ addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
355
+ )
356
+
357
+ # Fatal errors
358
+ try:
359
+ es_query_conn.ensure_unique(wet_processes_index, "proc_id")
360
+ es_query_conn.ensure_unique(bi_processes_index, "proc_id")
361
+ check_for_undefined_file_indices(es_query_conn, analyses_index)
362
+ check_for_undefined_wet_processes(
363
+ es_query_conn, analyses_index, wet_processes_index
364
+ )
365
+ check_for_undefined_bi_processes(
366
+ es_query_conn, analyses_index, bi_processes_index
367
+ )
368
+ except DBIntegrityError as e:
369
+ raise SystemExit(e) from e
370
+
371
+ # Warnings
372
+ check_for_unused_wet_processes(
373
+ es_query_conn, analyses_index, wet_processes_index
374
+ )
375
+ check_for_unused_bi_processes(
376
+ es_query_conn, analyses_index, bi_processes_index
377
+ )
378
+ check_for_unused_file_indices(
379
+ es_query_conn, analyses_index, args.es_index_prefix
380
+ )
381
+
382
+
383
+ if __name__ == "__main__":
384
+ main()