genelastic 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. genelastic/api/cli_start_api.py +18 -0
  2. genelastic/api/extends/example.py +2 -3
  3. genelastic/api/extends/example.yml +20 -0
  4. genelastic/api/routes.py +160 -23
  5. genelastic/api/server.py +42 -31
  6. genelastic/api/settings.py +5 -8
  7. genelastic/api/specification.yml +350 -0
  8. genelastic/common/__init__.py +41 -9
  9. genelastic/common/cli.py +103 -23
  10. genelastic/common/elastic.py +80 -49
  11. genelastic/common/exceptions.py +0 -2
  12. genelastic/common/server.py +51 -0
  13. genelastic/common/types.py +20 -15
  14. genelastic/import_data/__init__.py +23 -5
  15. genelastic/import_data/analyses.py +17 -20
  16. genelastic/import_data/analysis.py +69 -65
  17. genelastic/import_data/bi_process.py +7 -5
  18. genelastic/import_data/bi_processes.py +8 -8
  19. genelastic/import_data/cli_gen_data.py +143 -0
  20. genelastic/import_data/cli_import.py +379 -0
  21. genelastic/import_data/{info.py → cli_info.py} +104 -75
  22. genelastic/import_data/cli_integrity.py +384 -0
  23. genelastic/import_data/cli_validate.py +54 -0
  24. genelastic/import_data/constants.py +11 -32
  25. genelastic/import_data/data_file.py +23 -20
  26. genelastic/import_data/filename_pattern.py +26 -32
  27. genelastic/import_data/import_bundle.py +56 -47
  28. genelastic/import_data/import_bundle_factory.py +166 -158
  29. genelastic/import_data/logger.py +22 -18
  30. genelastic/import_data/random_bundle.py +425 -0
  31. genelastic/import_data/tags.py +46 -26
  32. genelastic/import_data/wet_process.py +8 -4
  33. genelastic/import_data/wet_processes.py +13 -8
  34. genelastic/ui/__init__.py +0 -0
  35. genelastic/ui/cli_start_ui.py +18 -0
  36. genelastic/ui/routes.py +86 -0
  37. genelastic/ui/server.py +14 -0
  38. genelastic/ui/settings.py +7 -0
  39. genelastic/ui/templates/analyses.html +11 -0
  40. genelastic/ui/templates/bi_processes.html +11 -0
  41. genelastic/ui/templates/home.html +4 -0
  42. genelastic/ui/templates/layout.html +34 -0
  43. genelastic/ui/templates/version.html +9 -0
  44. genelastic/ui/templates/wet_processes.html +11 -0
  45. genelastic-0.8.0.dist-info/METADATA +109 -0
  46. genelastic-0.8.0.dist-info/RECORD +52 -0
  47. {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/WHEEL +1 -1
  48. genelastic-0.8.0.dist-info/entry_points.txt +8 -0
  49. genelastic/import_data/gen_data.py +0 -194
  50. genelastic/import_data/import_data.py +0 -292
  51. genelastic/import_data/integrity.py +0 -290
  52. genelastic/import_data/validate_data.py +0 -43
  53. genelastic-0.6.1.dist-info/METADATA +0 -41
  54. genelastic-0.6.1.dist-info/RECORD +0 -36
  55. genelastic-0.6.1.dist-info/entry_points.txt +0 -6
  56. {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/top_level.txt +0 -0
genelastic/import_data/cli_import.py
@@ -0,0 +1,379 @@
+ # vi: se tw=80
+
+ # Elasticsearch Python API:
+ # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/overview.html
+ # https://elasticsearch-py.readthedocs.io/en/latest/api.html
+
+ import argparse
+ import csv
+ import datetime
+ import hashlib
+ import logging
+ import sys
+ import time
+ from pathlib import Path
+
+ import vcf
+
+ from genelastic.common import (
+     AnalysisDocument,
+     BulkItems,
+     ElasticImportConn,
+     MetadataDocument,
+     ProcessDocument,
+     add_es_connection_args,
+     add_verbose_control_args,
+ )
+
+ from .bi_processes import BioInfoProcesses
+ from .data_file import DataFile
+ from .import_bundle_factory import make_import_bundle_from_files
+ from .logger import configure_logging
+ from .wet_processes import WetProcesses
+
+ logger = logging.getLogger("genelastic")
+ logging.getLogger("elastic_transport").setLevel(
+     logging.WARNING
+ )  # Disable excessive logging
+ logging.getLogger("urllib3").setLevel(
+     logging.WARNING
+ )  # Disable excessive logging
+
+
+ def read_args() -> argparse.Namespace:
+     """Read arguments from command line."""
+     parser = argparse.ArgumentParser(
+         description="Genetics data importer.",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         allow_abbrev=False,
+     )
+     add_verbose_control_args(parser)
+     add_es_connection_args(parser)
+     parser.add_argument(
+         "-D",
+         "--dry-run",
+         dest="dryrun",
+         action="count",
+         default=0,
+         help=(
+             "Dry-run level. -D for data files loading (VCF, coverage, etc) "
+             "without connecting or importing to database. "
+             "-DD for metadata YAML files loading only (no loading of data files)."
+         ),
+     )
+     parser.add_argument(
+         "--log-file", dest="log_file", help="Path to a log file."
+     )
+     parser.add_argument(
+         "--no-list",
+         dest="no_list",
+         action="store_true",
+         help="Do not print list of files to be imported.",
+     )
+     parser.add_argument(
+         "--no-confirm",
+         dest="no_confirm",
+         action="store_true",
+         help="Do not ask confirmation before importing.",
+     )
+     parser.add_argument(
+         "files",
+         type=Path,
+         nargs="+",
+         default=None,
+         help="Data files that describe what to import.",
+     )
+     return parser.parse_args()
+
+
+ def import_cov_file(
+     es_import_conn: ElasticImportConn | None,
+     file_index: str,
+     file: Path,
+     dryrun: int = 0,
+ ) -> None:
+     """Import a coverage file to the Elasticsearch database."""
+     # Set field types
+     if dryrun == 0 and es_import_conn:
+         es_import_conn.client.indices.put_mapping(
+             index=file_index,
+             body={
+                 "properties": {
+                     "pos": {"type": "integer"},
+                     "depth": {"type": "byte"},
+                 }
+             },
+         )
+
+     # Open file
+     if dryrun > 1:
+         logger.info(
+             "Would load and import Coverage file %s " "into index %s.",
+             file,
+             file_index,
+         )
+     else:
+         logger.info("Load Coverage file %s.", file)
+         if dryrun == 1:
+             logger.info(
+                 "Would import Coverage file %s into index %s.", file, file_index
+             )
+         else:
+             logger.info(
+                 "Import Coverage file %s into index %s.", file, file_index
+             )
+         with file.open(newline="", encoding="utf-8") as f:
+             # Read file as CSV
+             reader = csv.reader(f, delimiter="\t", quotechar='"')
+
+             # Loop on al lines
+             for row in reader:
+                 # Build document
+                 # Position starts at 0 inside coverage file
+                 doc: MetadataDocument = {
+                     "type": "coverage",
+                     "chr": row[0],
+                     "pos": int(row[1]) + 1,
+                     "depth": int(row[2]),
+                 }
+
+                 # Insert document
+                 if dryrun == 0 and es_import_conn:
+                     es_import_conn.client.index(index=file_index, document=doc)
+
+
+ def import_analysis_metadata(  # noqa: PLR0913
+     es_import_conn: ElasticImportConn | None,
+     index_prefix: str,
+     file_index: str,
+     file: DataFile,
+     analysis_type: str,
+     dryrun: int = 0,
+ ) -> None:
+     """Import analysis metadata into a dedicated index."""
+     doc: AnalysisDocument = {
+         "path": str(file.path.resolve()),
+         "bundle_path": str(file.bundle_path.resolve())
+         if file.bundle_path
+         else None,
+         "metadata": file.metadata,
+         "file_index": file_index,
+         "type": analysis_type,
+     }
+
+     bulk_items: BulkItems = [
+         {"_index": f"{index_prefix}-analyses", "_source": doc}
+     ]
+
+     if dryrun == 0 and es_import_conn:
+         es_import_conn.import_items(
+             bulk_items,
+             start_time=time.perf_counter(),
+             total_items=len(bulk_items),
+         )
+
+
+ def import_vcf_file(
+     es_import_conn: ElasticImportConn | None,
+     file_index: str,
+     file: DataFile,
+     dryrun: int = 0,
+ ) -> None:
+     """Import a VCF file to the Elasticsearch database."""
+     logger.info('Import VCF file "%s".', file)
+
+     if dryrun > 1:
+         logger.info(
+             "Would load and import VCF file %s " "into index %s.",
+             file.path,
+             file_index,
+         )
+     else:
+         logger.info("Load VCF file %s.", file.path)
+         if dryrun == 1:
+             logger.info(
+                 "Would import VCF file %s into index %s.", file.path, file_index
+             )
+         else:
+             logger.info(
+                 "Importing VCF file %s into index %s...", file.path, file_index
+             )
+
+         try:
+             vcf_reader = vcf.Reader(filename=str(file.path))
+             n = 0
+             start = time.perf_counter()
+             bulk_sz = 256  # Bulk size
+             bulk_items: BulkItems = []
+             for record in vcf_reader:
+                 # Correct values
+                 if not record.CHROM.startswith("chr"):
+                     if record.CHROM.lower().startswith("chr"):
+                         record.CHROM = "chr" + record.CHROM[3:]
+                     else:
+                         record.CHROM = "chr" + record.CHROM
+
+                 # Build document
+                 alt = [x if x is None else x.type for x in record.ALT]
+                 doc: MetadataDocument = {
+                     "type": "vcf",
+                     "chr": record.CHROM,
+                     "pos": record.POS,
+                     "alt": alt,
+                     "info": record.INFO,
+                 }
+
+                 if dryrun == 0:
+                     # Append item to bulk
+                     bulk_items.append({"_index": file_index, "_source": doc})
+                     n += 1
+
+                     # Insert bulk of items
+                     if len(bulk_items) >= bulk_sz and es_import_conn:
+                         es_import_conn.import_items(
+                             bulk_items, start_time=start, total_items=n
+                         )
+                         bulk_items = []
+
+             # Insert remaining items
+             if dryrun == 0 and es_import_conn:
+                 es_import_conn.import_items(
+                     bulk_items, start_time=start, total_items=n
+                 )
+
+         except StopIteration:
+             logger.error("Skipping empty file : %s.", file.path)
+
+
+ def import_processes(
+     es_import_conn: ElasticImportConn | None,
+     index: str,
+     processes: WetProcesses | BioInfoProcesses,
+     dryrun: int = 0,
+ ) -> None:
+     """Import processes into their own index."""
+     bulk_items: BulkItems = []
+
+     for proc_id in processes.get_process_ids():
+         process = processes[proc_id]
+         process_type = process.__class__.__name__
+         doc: ProcessDocument = process.data | {
+             "proc_id": proc_id,
+             "type": process_type,
+         }
+         bulk_items.append({"_index": index, "_source": doc})
+
+     if dryrun == 0 and es_import_conn:
+         es_import_conn.import_items(
+             bulk_items,
+             start_time=time.perf_counter(),
+             total_items=len(bulk_items),
+         )
+
+
+ def generate_unique_index(index_prefix: str, filepath: Path) -> str:
+     """Generate a unique index with the following format:
+     <index_prefix>_<current_date>_<md5_hashed_filepath>
+     """
+     current_date = datetime.datetime.now(tz=datetime.UTC).strftime("%Y%m%d")
+     hashed_filepath = hashlib.md5(
+         str(filepath).encode("utf-8"), usedforsecurity=False
+     ).hexdigest()
+     return f"{index_prefix}-file-{current_date}-{hashed_filepath}"
+
+
+ def main() -> None:  # noqa: C901
+     """Entry point of the import script."""
+     # Read command line arguments
+     args = read_args()
+
+     # Configure logging
+     configure_logging(args.verbose, log_file=args.log_file)
+     logger.debug("Arguments: %s", args)
+     logger.debug("LOGGERS: %s", logging.root.manager.loggerDict)
+
+     # Open connection to ES
+     if args.dryrun == 0:
+         addr = f"https://{args.es_host}:{args.es_port}"
+         logger.info("Trying to connect to Elasticsearch at %s...", addr)
+         es_import_conn = ElasticImportConn(
+             addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
+         )
+     else:
+         es_import_conn = None
+
+     # Load YAML import bundle
+     import_bundle = make_import_bundle_from_files(args.files, check=True)
+     all_bundled_files = import_bundle.get_files()
+
+     # CHECK
+     for f in all_bundled_files:
+         if not f.exists():
+             msg = f"Path {f.path} does not point to a valid file."
+             raise RuntimeError(msg)
+
+     # LIST
+     if not args.no_list:
+         for f in all_bundled_files:
+             logger.info("Will import %s.", f.path)
+
+     # Ask confirmation for importing
+     if not args.no_confirm:
+         answer: str = "maybe"
+         while answer not in ["", "n", "y"]:
+             answer = input("Import (y/N)? ").lower()
+         if answer != "y":
+             logger.info("Import canceled.")
+             sys.exit(0)
+
+     # IMPORT
+     # Loop on file categories
+     for cat in import_bundle.analyses.get_all_categories():
+         # Import all files in this category.
+         for f in import_bundle.get_files(cat):
+             logger.info("Import %s files from %s.", cat, f.path)
+             # First, generate a unique index name for each file.
+             file_index = generate_unique_index(args.es_index_prefix, f.path)
+             # Then, import the analysis metadata into a dedicated index.
+             import_analysis_metadata(
+                 es_import_conn,
+                 args.es_index_prefix,
+                 file_index,
+                 f,
+                 cat,
+                 args.dryrun,
+             )
+             # Finally, import the file in its own index.
+             globals()[f"import_{cat}_file"](
+                 es_import_conn=es_import_conn,
+                 file_index=file_index,
+                 file=f,
+                 dryrun=args.dryrun,
+             )
+
+     # Import processes
+     logger.info("Importing wet processes.")
+     logger.info(
+         "Wet processes IDs = %s",
+         str(import_bundle.wet_processes.get_process_ids()),
+     )
+     import_processes(
+         es_import_conn,
+         f"{args.es_index_prefix}-wet_processes",
+         import_bundle.wet_processes,
+     )
+
+     logger.info("Importing bio info processes.")
+     logger.info(
+         "Bio info processes IDs = %s",
+         str(import_bundle.bi_processes.get_process_ids()),
+     )
+     import_processes(
+         es_import_conn,
+         f"{args.es_index_prefix}-bi_processes",
+         import_bundle.bi_processes,
+     )
+
+
+ if __name__ == "__main__":
+     main()
genelastic/import_data/{info.py → cli_info.py}
@@ -1,71 +1,100 @@
- # pylint: disable=missing-module-docstring
  import argparse
  import logging
- import typing

- from genelastic.common import (ElasticQueryConn, add_verbose_control_args,
-                                add_es_connection_args, Bucket)
+ from genelastic.common import (
+     Bucket,
+     ElasticQueryConn,
+     add_es_connection_args,
+     add_verbose_control_args,
+ )

  from .logger import configure_logging

- logger = logging.getLogger('genelastic')
- logging.getLogger('elastic_transport').setLevel(logging.WARNING)  # Disable excessive logging
+ logger = logging.getLogger("genelastic")
+ logging.getLogger("elastic_transport").setLevel(
+     logging.WARNING
+ )  # Disable excessive logging


  def read_args() -> argparse.Namespace:
      """Read arguments from command line."""
-     parser = argparse.ArgumentParser(description='ElasticSearch database info.',
-                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-                                      allow_abbrev=False)
+     parser = argparse.ArgumentParser(
+         description="ElasticSearch database info.",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         allow_abbrev=False,
+     )
      add_verbose_control_args(parser)
      add_es_connection_args(parser)
-     parser.add_argument("-y", "--list-bundles", action="store_true",
-                         help="List all imported YAML bundles.")
-     parser.add_argument("-f", "--list-data-files", action="store_true",
-                         help="List all imported data files.")
-     parser.add_argument("-w", "--list-wet-processes", action="store_true",
-                         help="List all imported wet processes.")
-     parser.add_argument("-b", "--list-bi-processes", action="store_true",
-                         help="List all imported bio info processes.")
-     parser.add_argument("-Y", "--list-data-files-per-bundle", action="store_true",
-                         help="For each imported YAML bundle, "
-                              "display some info and list its data files.")
+     parser.add_argument(
+         "-y",
+         "--list-bundles",
+         action="store_true",
+         help="List all imported YAML bundles.",
+     )
+     parser.add_argument(
+         "-f",
+         "--list-data-files",
+         action="store_true",
+         help="List all imported data files.",
+     )
+     parser.add_argument(
+         "-w",
+         "--list-wet-processes",
+         action="store_true",
+         help="List all imported wet processes.",
+     )
+     parser.add_argument(
+         "-b",
+         "--list-bi-processes",
+         action="store_true",
+         help="List all imported bio info processes.",
+     )
+     parser.add_argument(
+         "-Y",
+         "--list-data-files-per-bundle",
+         action="store_true",
+         help="For each imported YAML bundle, "
+         "display some info and list its data files.",
+     )
      return parser.parse_args()


  def list_bundles(es_query_conn: ElasticQueryConn, index: str) -> None:
      """List all imported YAML bundles."""
-
      query = {
          "size": 0,
          "aggs": {
              "get_bundle_paths": {
                  "composite": {
-                     "sources": {"bundle_path": {"terms": {"field": "bundle_path.keyword"}}},
+                     "sources": {
+                         "bundle_path": {
+                             "terms": {"field": "bundle_path.keyword"}
+                         }
+                     },
                      "size": 1000,
                  }
              }
-         }
+         },
      }

-     buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
+     buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
+         index, query
+     )

-     print("Imported YAML files")
-     print("===================")
+     logger.info("Imported YAML files")
+     logger.info("===================")

      if len(buckets) == 0:
-         print("Empty response.", end="\n")
+         logger.info("Empty response.")
          return

      for bucket in buckets:
-         bundle_path = bucket['key']['bundle_path']
-         print(f'- {bundle_path}')
-     print()
+         bundle_path = bucket["key"]["bundle_path"]
+         logger.info("- %s", bundle_path)


  def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
      """List all imported data files."""
-
      query = {
          "size": 0,
          "aggs": {
@@ -75,22 +104,23 @@ def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
                      "size": 1000,
                  }
              }
-         }
+         },
      }

-     buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
+     buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
+         index, query
+     )

-     print("Imported data files")
-     print("===================")
+     logger.info("Imported data files")
+     logger.info("===================")

      if len(buckets) == 0:
-         print("Empty response.", end="\n")
+         logger.info("Empty response.")
          return

      for bucket in buckets:
-         bundle_path = bucket['key']['path']
-         print(f'- {bundle_path}')
-     print()
+         bundle_path = bucket["key"]["path"]
+         logger.info("- %s", bundle_path)


  def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
@@ -98,29 +128,30 @@ def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
      process_ids = es_query_conn.get_field_values(index, "proc_id")

      if len(process_ids) == 0:
-         print("Empty response.", end="\n")
+         logger.info("Empty response.")
          return

      for process_id in process_ids:
-         print(f'- {process_id}')
-     print()
+         logger.info("- %s", process_id)


  def list_wet_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
      """List all wet processes."""
-     print("Imported wet processes")
-     print("======================")
+     logger.info("Imported wet processes")
+     logger.info("======================")
      list_processes(es_query_conn, index)


  def list_bi_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
      """List all bio info processes."""
-     print("Imported bi processes")
-     print("=====================")
+     logger.info("Imported bi processes")
+     logger.info("=====================")
      list_processes(es_query_conn, index)


- def list_data_files_per_bundle(es_query_conn: ElasticQueryConn, index: str) -> None:
+ def list_data_files_per_bundle(
+     es_query_conn: ElasticQueryConn, index: str
+ ) -> None:
      """For each imported YAML bundle, display some info and list its data files."""
      query = {
          "size": 0,
@@ -130,50 +161,47 @@ def list_data_files_per_bundle(es_query_conn: ElasticQueryConn, index: str) -> N
                  "sources": [
                      {
                          "bundle_path": {
-                             "terms": {
-                                 "field": "bundle_path.keyword"
-                             }
+                             "terms": {"field": "bundle_path.keyword"}
                          }
                      }
                  ],
-                 "size": 100
+                 "size": 100,
              },
-             "aggs": {
-                 "docs": {
-                     "top_hits": {
-                         "size": 100
-                     }
-                 }
-             }
+             "aggs": {"docs": {"top_hits": {"size": 100}}},
          }
-         }
+         },
      }

-     buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
+     buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
+         index, query
+     )

-     print("Data files per YAML bundle")
-     print("==========================")
+     logger.info("Data files per YAML bundle")
+     logger.info("==========================")

      if len(buckets) == 0:
-         print("Empty response.", end="\n")
+         logger.info("Empty response.")
          return

      for bucket in buckets:
-
          documents = bucket["docs"]["hits"]["hits"]
          if len(documents) == 0:
              continue

-         print(f"- Bundle Path: {bucket['key']['bundle_path']}")
-         print(f" -> Wet process: {documents[0]['_source']['metadata']['wet_process']}")
-         print(f" -> Bio info process: {documents[0]['_source']['metadata']['bi_process']}")
-         print(" -> Data files:")
+         logger.info("- Bundle Path: %s", bucket["key"]["bundle_path"])
+         logger.info(
+             " -> Wet process: %s",
+             documents[0]["_source"]["metadata"]["wet_process"],
+         )
+         logger.info(
+             " -> Bio info process: %s",
+             documents[0]["_source"]["metadata"]["bi_process"],
+         )
+         logger.info(" -> Data files:")

          for doc in documents:
-             print(f" - Index: {doc['_source']['file_index']}")
-             print(f" Path: {doc['_source']['path']}")
-
-         print()
+             logger.info(" - Index: %s", doc["_source"]["file_index"])
+             logger.info(" Path: %s", doc["_source"]["path"])


  def main() -> None:
@@ -185,8 +213,9 @@ def main() -> None:

      addr = f"https://{args.es_host}:{args.es_port}"
      logger.info("Trying to connect to Elasticsearch at %s...", addr)
-     es_query_conn = ElasticQueryConn(addr, args.es_cert_fp,
-                                      basic_auth=(args.es_usr, args.es_pwd))
+     es_query_conn = ElasticQueryConn(
+         addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
+     )

      analysis_index = f"{args.es_index_prefix}-analyses"
      wet_processes_index = f"{args.es_index_prefix}-wet_processes"
@@ -223,5 +252,5 @@ def main() -> None:
          list_data_files_per_bundle(es_query_conn, analysis_index)


- if __name__ == '__main__':
+ if __name__ == "__main__":
      main()