genelastic 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/cli_start_api.py +18 -0
- genelastic/api/extends/example.py +2 -3
- genelastic/api/extends/example.yml +20 -0
- genelastic/api/routes.py +160 -23
- genelastic/api/server.py +42 -31
- genelastic/api/settings.py +5 -8
- genelastic/api/specification.yml +350 -0
- genelastic/common/__init__.py +41 -9
- genelastic/common/cli.py +103 -23
- genelastic/common/elastic.py +80 -49
- genelastic/common/exceptions.py +0 -2
- genelastic/common/server.py +51 -0
- genelastic/common/types.py +20 -15
- genelastic/import_data/__init__.py +23 -5
- genelastic/import_data/analyses.py +17 -20
- genelastic/import_data/analysis.py +69 -65
- genelastic/import_data/bi_process.py +7 -5
- genelastic/import_data/bi_processes.py +8 -8
- genelastic/import_data/cli_gen_data.py +143 -0
- genelastic/import_data/cli_import.py +379 -0
- genelastic/import_data/{info.py → cli_info.py} +104 -75
- genelastic/import_data/cli_integrity.py +384 -0
- genelastic/import_data/cli_validate.py +54 -0
- genelastic/import_data/constants.py +11 -32
- genelastic/import_data/data_file.py +23 -20
- genelastic/import_data/filename_pattern.py +26 -32
- genelastic/import_data/import_bundle.py +56 -47
- genelastic/import_data/import_bundle_factory.py +166 -158
- genelastic/import_data/logger.py +22 -18
- genelastic/import_data/random_bundle.py +425 -0
- genelastic/import_data/tags.py +46 -26
- genelastic/import_data/wet_process.py +8 -4
- genelastic/import_data/wet_processes.py +13 -8
- genelastic/ui/__init__.py +0 -0
- genelastic/ui/cli_start_ui.py +18 -0
- genelastic/ui/routes.py +86 -0
- genelastic/ui/server.py +14 -0
- genelastic/ui/settings.py +7 -0
- genelastic/ui/templates/analyses.html +11 -0
- genelastic/ui/templates/bi_processes.html +11 -0
- genelastic/ui/templates/home.html +4 -0
- genelastic/ui/templates/layout.html +34 -0
- genelastic/ui/templates/version.html +9 -0
- genelastic/ui/templates/wet_processes.html +11 -0
- genelastic-0.8.0.dist-info/METADATA +109 -0
- genelastic-0.8.0.dist-info/RECORD +52 -0
- {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/WHEEL +1 -1
- genelastic-0.8.0.dist-info/entry_points.txt +8 -0
- genelastic/import_data/gen_data.py +0 -194
- genelastic/import_data/import_data.py +0 -292
- genelastic/import_data/integrity.py +0 -290
- genelastic/import_data/validate_data.py +0 -43
- genelastic-0.6.1.dist-info/METADATA +0 -41
- genelastic-0.6.1.dist-info/RECORD +0 -36
- genelastic-0.6.1.dist-info/entry_points.txt +0 -6
- {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
# vi: se tw=80
|
|
2
|
+
|
|
3
|
+
# Elasticsearch Python API:
|
|
4
|
+
# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/overview.html
|
|
5
|
+
# https://elasticsearch-py.readthedocs.io/en/latest/api.html
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import csv
|
|
9
|
+
import datetime
|
|
10
|
+
import hashlib
|
|
11
|
+
import logging
|
|
12
|
+
import sys
|
|
13
|
+
import time
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
import vcf
|
|
17
|
+
|
|
18
|
+
from genelastic.common import (
|
|
19
|
+
AnalysisDocument,
|
|
20
|
+
BulkItems,
|
|
21
|
+
ElasticImportConn,
|
|
22
|
+
MetadataDocument,
|
|
23
|
+
ProcessDocument,
|
|
24
|
+
add_es_connection_args,
|
|
25
|
+
add_verbose_control_args,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
from .bi_processes import BioInfoProcesses
|
|
29
|
+
from .data_file import DataFile
|
|
30
|
+
from .import_bundle_factory import make_import_bundle_from_files
|
|
31
|
+
from .logger import configure_logging
|
|
32
|
+
from .wet_processes import WetProcesses
|
|
33
|
+
|
|
34
|
+
# Logger used by every function of this module.
logger = logging.getLogger("genelastic")

# Disable excessive logging coming from the HTTP/transport libraries:
# only warnings and above are kept.
logging.getLogger("elastic_transport").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def read_args() -> argparse.Namespace:
    """Parse the command line of the import script.

    Returns:
        The ``argparse`` namespace. Besides the options registered by
        ``add_verbose_control_args`` and ``add_es_connection_args``, it holds
        ``dryrun`` (number of ``-D`` flags), ``log_file``, ``no_list``,
        ``no_confirm`` and ``files`` (one or more ``Path`` objects).
    """
    dry_run_help = (
        "Dry-run level. -D for data files loading (VCF, coverage, etc) "
        "without connecting or importing to database. "
        "-DD for metadata YAML files loading only (no loading of data files)."
    )

    parser = argparse.ArgumentParser(
        description="Genetics data importer.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        allow_abbrev=False,
    )

    # Options provided by the shared helpers.
    add_verbose_control_args(parser)
    add_es_connection_args(parser)

    # Options specific to the importer.
    parser.add_argument(
        "-D",
        "--dry-run",
        action="count",
        default=0,
        dest="dryrun",
        help=dry_run_help,
    )
    parser.add_argument("--log-file", dest="log_file", help="Path to a log file.")

    # Both switches are plain boolean flags; declare them from a small table.
    for flag, dest, text in (
        ("--no-list", "no_list", "Do not print list of files to be imported."),
        ("--no-confirm", "no_confirm", "Do not ask confirmation before importing."),
    ):
        parser.add_argument(flag, dest=dest, action="store_true", help=text)

    # Positional arguments: at least one file is required.
    parser.add_argument(
        "files",
        nargs="+",
        type=Path,
        default=None,
        help="Data files that describe what to import.",
    )

    return parser.parse_args()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def import_cov_file(
    es_import_conn: ElasticImportConn | None,
    file_index: str,
    file: Path,
    dryrun: int = 0,
) -> None:
    """Import a coverage file to the Elasticsearch database.

    The file is read as tab-separated values whose first three columns are
    the chromosome, a 0-based position and the depth. One document per row is
    indexed into ``file_index``.

    Args:
        es_import_conn: Connection used for importing, or ``None`` when no
            connection was opened (dry-run).
        file_index: Name of the index receiving the coverage documents.
        file: Path of the coverage file. An object exposing the path through
            a ``path`` attribute (as the bundle files handled by ``main``
            do) is accepted as well.
        dryrun: ``0`` imports for real, ``1`` only reads the file, ``2`` or
            more does not even open the file.
    """
    # ``main`` hands over bundle file objects while the annotation promises a
    # ``Path``: support both instead of failing on a missing ``open`` method.
    cov_path = getattr(file, "path", file)
    do_import = dryrun == 0 and es_import_conn

    # Set field types
    if do_import:
        es_import_conn.client.indices.put_mapping(
            index=file_index,
            body={
                "properties": {
                    "pos": {"type": "integer"},
                    # "byte" tops out at 127, which real sequencing depths
                    # routinely exceed; such rows would be rejected by
                    # Elasticsearch.
                    "depth": {"type": "integer"},
                }
            },
        )

    if dryrun > 1:
        # Highest dry-run level: announce only, never touch the data file.
        logger.info(
            "Would load and import Coverage file %s into index %s.",
            cov_path,
            file_index,
        )
        return

    logger.info("Load Coverage file %s.", cov_path)
    if dryrun == 1:
        logger.info(
            "Would import Coverage file %s into index %s.", cov_path, file_index
        )
    else:
        logger.info(
            "Import Coverage file %s into index %s.", cov_path, file_index
        )

    with cov_path.open(newline="", encoding="utf-8") as f:
        # Read file as CSV
        reader = csv.reader(f, delimiter="\t", quotechar='"')

        # Loop on all lines
        for row in reader:
            # csv.reader yields an empty list for a blank line (typically a
            # trailing one); there is nothing to import for it.
            if not row:
                continue

            # Build document
            # Position starts at 0 inside coverage file
            doc: MetadataDocument = {
                "type": "coverage",
                "chr": row[0],
                "pos": int(row[1]) + 1,
                "depth": int(row[2]),
            }

            # Insert document
            if do_import:
                es_import_conn.client.index(index=file_index, document=doc)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def import_analysis_metadata(  # noqa: PLR0913
    es_import_conn: ElasticImportConn | None,
    index_prefix: str,
    file_index: str,
    file: DataFile,
    analysis_type: str,
    dryrun: int = 0,
) -> None:
    """Import analysis metadata into a dedicated index.

    A single document describing ``file`` (resolved path, resolved bundle
    path if any, metadata, data index name and analysis type) is sent to the
    ``<index_prefix>-analyses`` index.

    Args:
        es_import_conn: Connection used for importing, or ``None``.
        index_prefix: Prefix of the analyses index name.
        file_index: Name of the index that holds the content of ``file``.
        file: Data file whose metadata is imported.
        analysis_type: Value stored in the ``type`` field of the document.
        dryrun: Nothing is sent unless this is ``0``.
    """
    resolved_path = str(file.path.resolve())
    if file.bundle_path:
        resolved_bundle_path = str(file.bundle_path.resolve())
    else:
        resolved_bundle_path = None

    doc: AnalysisDocument = {
        "path": resolved_path,
        "bundle_path": resolved_bundle_path,
        "metadata": file.metadata,
        "file_index": file_index,
        "type": analysis_type,
    }
    analyses_index = f"{index_prefix}-analyses"
    bulk_items: BulkItems = [{"_index": analyses_index, "_source": doc}]

    # Dry-run or no connection: the document is built but never sent.
    if dryrun != 0 or not es_import_conn:
        return

    es_import_conn.import_items(
        bulk_items,
        start_time=time.perf_counter(),
        total_items=len(bulk_items),
    )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def import_vcf_file(
    es_import_conn: ElasticImportConn | None,
    file_index: str,
    file: DataFile,
    dryrun: int = 0,
) -> None:
    """Import a VCF file to the Elasticsearch database.

    Records are read with ``vcf.Reader``, their chromosome name is normalised
    to a lowercase ``chr`` prefix, and the resulting documents are sent to
    ``file_index`` in batches of 256.

    Args:
        es_import_conn: Connection used for importing, or ``None``.
        file_index: Name of the index receiving the VCF documents.
        file: Data file whose ``path`` points to the VCF file.
        dryrun: ``0`` imports for real, ``1`` only reads the file, ``2`` or
            more does not even open the file.
    """
    logger.info('Import VCF file "%s".', file)
    vcf_path = file.path

    if dryrun > 1:
        logger.info(
            "Would load and import VCF file %s into index %s.",
            vcf_path,
            file_index,
        )
        return

    logger.info("Load VCF file %s.", vcf_path)
    if dryrun == 1:
        logger.info(
            "Would import VCF file %s into index %s.", vcf_path, file_index
        )
    else:
        logger.info(
            "Importing VCF file %s into index %s...", vcf_path, file_index
        )

    batch_size = 256  # Bulk size
    try:
        reader = vcf.Reader(filename=str(vcf_path))
        imported = 0
        started = time.perf_counter()
        pending: BulkItems = []

        for record in reader:
            # Correct values: force a lowercase "chr" prefix, replacing a
            # differently-cased one or adding it when absent.
            chrom = record.CHROM
            if not chrom.startswith("chr"):
                if chrom.lower().startswith("chr"):
                    chrom = chrom[3:]
                record.CHROM = "chr" + chrom

            # Build document
            alt_types = [
                None if allele is None else allele.type for allele in record.ALT
            ]
            doc: MetadataDocument = {
                "type": "vcf",
                "chr": record.CHROM,
                "pos": record.POS,
                "alt": alt_types,
                "info": record.INFO,
            }

            # In dry-run mode records are parsed but never queued.
            if dryrun != 0:
                continue

            # Append item to bulk
            pending.append({"_index": file_index, "_source": doc})
            imported += 1

            # Insert bulk of items
            if es_import_conn and len(pending) >= batch_size:
                es_import_conn.import_items(
                    pending, start_time=started, total_items=imported
                )
                pending = []

        # Insert remaining items
        if dryrun == 0 and es_import_conn:
            es_import_conn.import_items(
                pending, start_time=started, total_items=imported
            )

    except StopIteration:
        logger.error("Skipping empty file : %s.", vcf_path)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def import_processes(
    es_import_conn: ElasticImportConn | None,
    index: str,
    processes: WetProcesses | BioInfoProcesses,
    dryrun: int = 0,
) -> None:
    """Import processes into their own index.

    One document is built per process ID: the process data completed with
    its ``proc_id`` and the class name of the process as ``type``.

    Args:
        es_import_conn: Connection used for importing, or ``None``.
        index: Name of the index receiving the process documents.
        processes: Collection of processes to import.
        dryrun: Nothing is sent unless this is ``0``.
    """

    def _as_bulk_item(proc_id):  # noqa: ANN001, ANN202
        """Build the bulk entry of the process identified by ``proc_id``."""
        process = processes[proc_id]
        process_type = process.__class__.__name__
        doc: ProcessDocument = process.data | {
            "proc_id": proc_id,
            "type": process_type,
        }
        return {"_index": index, "_source": doc}

    bulk_items: BulkItems = [
        _as_bulk_item(proc_id) for proc_id in processes.get_process_ids()
    ]

    # Dry-run or no connection: documents are built but never sent.
    if dryrun != 0 or not es_import_conn:
        return

    es_import_conn.import_items(
        bulk_items,
        start_time=time.perf_counter(),
        total_items=len(bulk_items),
    )
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def generate_unique_index(index_prefix: str, filepath: Path) -> str:
|
|
275
|
+
"""Generate a unique index with the following format:
|
|
276
|
+
<index_prefix>_<current_date>_<md5_hashed_filepath>
|
|
277
|
+
"""
|
|
278
|
+
current_date = datetime.datetime.now(tz=datetime.UTC).strftime("%Y%m%d")
|
|
279
|
+
hashed_filepath = hashlib.md5(
|
|
280
|
+
str(filepath).encode("utf-8"), usedforsecurity=False
|
|
281
|
+
).hexdigest()
|
|
282
|
+
return f"{index_prefix}-file-{current_date}-{hashed_filepath}"
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def main() -> None:  # noqa: C901
    """Entry point of the import script.

    Parses the command line, configures logging, opens the Elasticsearch
    connection (skipped for any dry-run level), loads the import bundle,
    checks that every bundled file exists, optionally lists the files and
    asks for confirmation, then imports each data file followed by the wet
    and bio info processes.

    Raises:
        RuntimeError: If a bundled file does not exist, or if a file category
            has no import function.
        SystemExit: With status 0 when the user declines the confirmation.
    """
    # Read command line arguments
    args = read_args()

    # Configure logging
    configure_logging(args.verbose, log_file=args.log_file)
    # Never write the Elasticsearch password to the console or the log file.
    logged_args = dict(vars(args))
    if logged_args.get("es_pwd"):
        logged_args["es_pwd"] = "***"
    logger.debug("Arguments: %s", argparse.Namespace(**logged_args))
    logger.debug("LOGGERS: %s", logging.root.manager.loggerDict)

    # Open connection to ES
    if args.dryrun == 0:
        addr = f"https://{args.es_host}:{args.es_port}"
        logger.info("Trying to connect to Elasticsearch at %s...", addr)
        es_import_conn = ElasticImportConn(
            addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
        )
    else:
        es_import_conn = None

    # Load YAML import bundle
    import_bundle = make_import_bundle_from_files(args.files, check=True)
    all_bundled_files = import_bundle.get_files()

    # CHECK
    for f in all_bundled_files:
        if not f.exists():
            msg = f"Path {f.path} does not point to a valid file."
            raise RuntimeError(msg)

    # LIST
    if not args.no_list:
        for f in all_bundled_files:
            logger.info("Will import %s.", f.path)

    # Ask confirmation for importing
    if not args.no_confirm:
        answer: str = "maybe"
        while answer not in ["", "n", "y"]:
            answer = input("Import (y/N)? ").lower()
        if answer != "y":
            logger.info("Import canceled.")
            sys.exit(0)

    # IMPORT
    # Explicit category -> importer table: unlike a lookup by name in
    # globals(), an unsupported category is reported clearly.
    file_importers = {
        "cov": import_cov_file,
        "vcf": import_vcf_file,
    }

    # Loop on file categories
    for cat in import_bundle.analyses.get_all_categories():
        # Import all files in this category.
        for f in import_bundle.get_files(cat):
            logger.info("Import %s files from %s.", cat, f.path)

            # Resolve the importer first, so that nothing is written for a
            # file that cannot be imported.
            import_file = file_importers.get(cat)
            if import_file is None:
                msg = f"No import function available for category {cat!r}."
                raise RuntimeError(msg)

            # First, generate a unique index name for each file.
            file_index = generate_unique_index(args.es_index_prefix, f.path)
            # Then, import the analysis metadata into a dedicated index.
            import_analysis_metadata(
                es_import_conn,
                args.es_index_prefix,
                file_index,
                f,
                cat,
                args.dryrun,
            )
            # Finally, import the file in its own index.
            import_file(
                es_import_conn=es_import_conn,
                file_index=file_index,
                file=f,
                dryrun=args.dryrun,
            )

    # Import processes
    logger.info("Importing wet processes.")
    logger.info(
        "Wet processes IDs = %s",
        str(import_bundle.wet_processes.get_process_ids()),
    )
    import_processes(
        es_import_conn,
        f"{args.es_index_prefix}-wet_processes",
        import_bundle.wet_processes,
        dryrun=args.dryrun,  # same dry-run level as the file imports
    )

    logger.info("Importing bio info processes.")
    logger.info(
        "Bio info processes IDs = %s",
        str(import_bundle.bi_processes.get_process_ids()),
    )
    import_processes(
        es_import_conn,
        f"{args.es_index_prefix}-bi_processes",
        import_bundle.bi_processes,
        dryrun=args.dryrun,
    )


if __name__ == "__main__":
    main()
|
|
@@ -1,71 +1,100 @@
|
|
|
1
|
-
# pylint: disable=missing-module-docstring
|
|
2
1
|
import argparse
|
|
3
2
|
import logging
|
|
4
|
-
import typing
|
|
5
3
|
|
|
6
|
-
from genelastic.common import (
|
|
7
|
-
|
|
4
|
+
from genelastic.common import (
|
|
5
|
+
Bucket,
|
|
6
|
+
ElasticQueryConn,
|
|
7
|
+
add_es_connection_args,
|
|
8
|
+
add_verbose_control_args,
|
|
9
|
+
)
|
|
8
10
|
|
|
9
11
|
from .logger import configure_logging
|
|
10
12
|
|
|
11
|
-
logger = logging.getLogger(
|
|
12
|
-
logging.getLogger(
|
|
13
|
+
# Logger used by every function of this module.
logger = logging.getLogger("genelastic")

# Disable excessive logging coming from the transport library.
logging.getLogger("elastic_transport").setLevel(logging.WARNING)
|
|
13
17
|
|
|
14
18
|
|
|
15
19
|
def read_args() -> argparse.Namespace:
    """Parse the command line of the info script.

    Returns:
        The ``argparse`` namespace. Besides the options registered by
        ``add_verbose_control_args`` and ``add_es_connection_args``, it holds
        one boolean per listing switch (``list_bundles``,
        ``list_data_files``, ``list_wet_processes``, ``list_bi_processes``
        and ``list_data_files_per_bundle``).
    """
    # (short flag, long flag, help text) of every listing switch, in the
    # order they are shown in the usage message.
    listing_switches = (
        ("-y", "--list-bundles", "List all imported YAML bundles."),
        ("-f", "--list-data-files", "List all imported data files."),
        ("-w", "--list-wet-processes", "List all imported wet processes."),
        ("-b", "--list-bi-processes", "List all imported bio info processes."),
        (
            "-Y",
            "--list-data-files-per-bundle",
            "For each imported YAML bundle, "
            "display some info and list its data files.",
        ),
    )

    parser = argparse.ArgumentParser(
        description="ElasticSearch database info.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        allow_abbrev=False,
    )

    # Options provided by the shared helpers.
    add_verbose_control_args(parser)
    add_es_connection_args(parser)

    for short_flag, long_flag, help_text in listing_switches:
        parser.add_argument(
            short_flag, long_flag, action="store_true", help=help_text
        )

    return parser.parse_args()
|
|
34
60
|
|
|
35
61
|
|
|
36
62
|
def list_bundles(es_query_conn: ElasticQueryConn, index: str) -> None:
    """List all imported YAML bundles.

    Runs a composite aggregation on the ``bundle_path.keyword`` field of
    ``index`` and logs one line per returned bucket.

    Args:
        es_query_conn: Connection used to run the aggregation.
        index: Name of the index to query.
    """
    query = {
        "size": 0,
        "aggs": {
            "get_bundle_paths": {
                "composite": {
                    # "sources" is documented as an array of sources; this
                    # also matches the query of list_data_files_per_bundle.
                    "sources": [
                        {
                            "bundle_path": {
                                "terms": {"field": "bundle_path.keyword"}
                            }
                        }
                    ],
                    "size": 1000,
                }
            }
        },
    }

    buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
        index, query
    )

    logger.info("Imported YAML files")
    logger.info("===================")

    if not buckets:
        logger.info("Empty response.")
        return

    for bucket in buckets:
        bundle_path = bucket["key"]["bundle_path"]
        logger.info("- %s", bundle_path)
|
|
64
94
|
|
|
65
95
|
|
|
66
96
|
def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
|
|
67
97
|
"""List all imported data files."""
|
|
68
|
-
|
|
69
98
|
query = {
|
|
70
99
|
"size": 0,
|
|
71
100
|
"aggs": {
|
|
@@ -75,22 +104,23 @@ def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
|
|
|
75
104
|
"size": 1000,
|
|
76
105
|
}
|
|
77
106
|
}
|
|
78
|
-
}
|
|
107
|
+
},
|
|
79
108
|
}
|
|
80
109
|
|
|
81
|
-
buckets:
|
|
110
|
+
buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
|
|
111
|
+
index, query
|
|
112
|
+
)
|
|
82
113
|
|
|
83
|
-
|
|
84
|
-
|
|
114
|
+
logger.info("Imported data files")
|
|
115
|
+
logger.info("===================")
|
|
85
116
|
|
|
86
117
|
if len(buckets) == 0:
|
|
87
|
-
|
|
118
|
+
logger.info("Empty response.")
|
|
88
119
|
return
|
|
89
120
|
|
|
90
121
|
for bucket in buckets:
|
|
91
|
-
bundle_path = bucket[
|
|
92
|
-
|
|
93
|
-
print()
|
|
122
|
+
bundle_path = bucket["key"]["path"]
|
|
123
|
+
logger.info("- %s", bundle_path)
|
|
94
124
|
|
|
95
125
|
|
|
96
126
|
def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
|
|
@@ -98,29 +128,30 @@ def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
|
|
|
98
128
|
process_ids = es_query_conn.get_field_values(index, "proc_id")
|
|
99
129
|
|
|
100
130
|
if len(process_ids) == 0:
|
|
101
|
-
|
|
131
|
+
logger.info("Empty response.")
|
|
102
132
|
return
|
|
103
133
|
|
|
104
134
|
for process_id in process_ids:
|
|
105
|
-
|
|
106
|
-
print()
|
|
135
|
+
logger.info("- %s", process_id)
|
|
107
136
|
|
|
108
137
|
|
|
109
138
|
def list_wet_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
    """List all wet processes.

    Logs a two-line header, then delegates the listing of ``index`` to
    ``list_processes``.
    """
    for header_line in ("Imported wet processes", "======================"):
        logger.info(header_line)
    list_processes(es_query_conn, index)
|
|
114
143
|
|
|
115
144
|
|
|
116
145
|
def list_bi_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
    """List all bio info processes.

    Logs a two-line header, then delegates the listing of ``index`` to
    ``list_processes``.
    """
    for header_line in ("Imported bi processes", "====================="):
        logger.info(header_line)
    list_processes(es_query_conn, index)
|
|
121
150
|
|
|
122
151
|
|
|
123
|
-
def list_data_files_per_bundle(
|
|
152
|
+
def list_data_files_per_bundle(
|
|
153
|
+
es_query_conn: ElasticQueryConn, index: str
|
|
154
|
+
) -> None:
|
|
124
155
|
"""For each imported YAML bundle, display some info and list its data files."""
|
|
125
156
|
query = {
|
|
126
157
|
"size": 0,
|
|
@@ -130,50 +161,47 @@ def list_data_files_per_bundle(es_query_conn: ElasticQueryConn, index: str) -> N
|
|
|
130
161
|
"sources": [
|
|
131
162
|
{
|
|
132
163
|
"bundle_path": {
|
|
133
|
-
"terms": {
|
|
134
|
-
"field": "bundle_path.keyword"
|
|
135
|
-
}
|
|
164
|
+
"terms": {"field": "bundle_path.keyword"}
|
|
136
165
|
}
|
|
137
166
|
}
|
|
138
167
|
],
|
|
139
|
-
"size": 100
|
|
168
|
+
"size": 100,
|
|
140
169
|
},
|
|
141
|
-
"aggs": {
|
|
142
|
-
"docs": {
|
|
143
|
-
"top_hits": {
|
|
144
|
-
"size": 100
|
|
145
|
-
}
|
|
146
|
-
}
|
|
147
|
-
}
|
|
170
|
+
"aggs": {"docs": {"top_hits": {"size": 100}}},
|
|
148
171
|
}
|
|
149
|
-
}
|
|
172
|
+
},
|
|
150
173
|
}
|
|
151
174
|
|
|
152
|
-
buckets:
|
|
175
|
+
buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
|
|
176
|
+
index, query
|
|
177
|
+
)
|
|
153
178
|
|
|
154
|
-
|
|
155
|
-
|
|
179
|
+
logger.info("Data files per YAML bundle")
|
|
180
|
+
logger.info("==========================")
|
|
156
181
|
|
|
157
182
|
if len(buckets) == 0:
|
|
158
|
-
|
|
183
|
+
logger.info("Empty response.")
|
|
159
184
|
return
|
|
160
185
|
|
|
161
186
|
for bucket in buckets:
|
|
162
|
-
|
|
163
187
|
documents = bucket["docs"]["hits"]["hits"]
|
|
164
188
|
if len(documents) == 0:
|
|
165
189
|
continue
|
|
166
190
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
191
|
+
logger.info("- Bundle Path: %s", bucket["key"]["bundle_path"])
|
|
192
|
+
logger.info(
|
|
193
|
+
" -> Wet process: %s",
|
|
194
|
+
documents[0]["_source"]["metadata"]["wet_process"],
|
|
195
|
+
)
|
|
196
|
+
logger.info(
|
|
197
|
+
" -> Bio info process: %s",
|
|
198
|
+
documents[0]["_source"]["metadata"]["bi_process"],
|
|
199
|
+
)
|
|
200
|
+
logger.info(" -> Data files:")
|
|
171
201
|
|
|
172
202
|
for doc in documents:
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
print()
|
|
203
|
+
logger.info(" - Index: %s", doc["_source"]["file_index"])
|
|
204
|
+
logger.info(" Path: %s", doc["_source"]["path"])
|
|
177
205
|
|
|
178
206
|
|
|
179
207
|
def main() -> None:
|
|
@@ -185,8 +213,9 @@ def main() -> None:
|
|
|
185
213
|
|
|
186
214
|
addr = f"https://{args.es_host}:{args.es_port}"
|
|
187
215
|
logger.info("Trying to connect to Elasticsearch at %s...", addr)
|
|
188
|
-
es_query_conn = ElasticQueryConn(
|
|
189
|
-
|
|
216
|
+
es_query_conn = ElasticQueryConn(
|
|
217
|
+
addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
|
|
218
|
+
)
|
|
190
219
|
|
|
191
220
|
analysis_index = f"{args.es_index_prefix}-analyses"
|
|
192
221
|
wet_processes_index = f"{args.es_index_prefix}-wet_processes"
|
|
@@ -223,5 +252,5 @@ def main() -> None:
|
|
|
223
252
|
list_data_files_per_bundle(es_query_conn, analysis_index)
|
|
224
253
|
|
|
225
254
|
|
|
226
|
-
if __name__ ==
|
|
255
|
+
if __name__ == "__main__":
|
|
227
256
|
main()
|