genelastic-0.6.0-py3-none-any.whl → genelastic-0.6.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/__init__.py +0 -13
- genelastic/api/__init__.py +0 -0
- genelastic/api/extends/__init__.py +0 -0
- genelastic/api/extends/example.py +7 -0
- genelastic/api/routes.py +84 -0
- genelastic/api/server.py +72 -0
- genelastic/api/settings.py +13 -0
- genelastic/common/__init__.py +12 -0
- genelastic/common/cli.py +35 -0
- genelastic/common/elastic.py +183 -0
- genelastic/common/exceptions.py +6 -0
- genelastic/common/types.py +20 -0
- genelastic/import_data/__init__.py +9 -0
- genelastic/{analyses.py → import_data/analyses.py} +3 -1
- genelastic/{analysis.py → import_data/analysis.py} +3 -2
- genelastic/{bi_process.py → import_data/bi_process.py} +1 -1
- genelastic/{bi_processes.py → import_data/bi_processes.py} +2 -1
- genelastic/{data_file.py → import_data/data_file.py} +3 -1
- genelastic/{filename_pattern.py → import_data/filename_pattern.py} +2 -1
- genelastic/{gen_data.py → import_data/gen_data.py} +3 -2
- genelastic/{import_bundle.py → import_data/import_bundle.py} +2 -1
- genelastic/{import_bundle_factory.py → import_data/import_bundle_factory.py} +3 -1
- genelastic/{import_data.py → import_data/import_data.py} +49 -51
- genelastic/{info.py → import_data/info.py} +29 -50
- genelastic/{integrity.py → import_data/integrity.py} +53 -87
- genelastic/{tags.py → import_data/tags.py} +2 -1
- genelastic/{validate_data.py → import_data/validate_data.py} +6 -4
- genelastic/{wet_processes.py → import_data/wet_processes.py} +2 -1
- {genelastic-0.6.0.dist-info → genelastic-0.6.1.dist-info}/METADATA +7 -2
- genelastic-0.6.1.dist-info/RECORD +36 -0
- {genelastic-0.6.0.dist-info → genelastic-0.6.1.dist-info}/WHEEL +1 -1
- genelastic-0.6.1.dist-info/entry_points.txt +6 -0
- genelastic/common.py +0 -151
- genelastic-0.6.0.dist-info/RECORD +0 -25
- genelastic-0.6.0.dist-info/entry_points.txt +0 -6
- /genelastic/{constants.py → import_data/constants.py} +0 -0
- /genelastic/{logger.py → import_data/logger.py} +0 -0
- /genelastic/{wet_process.py → import_data/wet_process.py} +0 -0
- {genelastic-0.6.0.dist-info → genelastic-0.6.1.dist-info}/top_level.txt +0 -0
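Beyond the version bump, the file list shows a reorganisation of the package: the shared helpers in `genelastic/common.py` are replaced by a `genelastic/common/` subpackage (CLI helpers, Elasticsearch connection classes, shared types and exceptions), the former top-level modules move under `genelastic/import_data/`, and a new `genelastic/api/` package (routes, server, settings) is added. A minimal sketch of what this implies for import paths, using only names that appear in the hunks below (an assumption, not checked against the published wheels):

```python
# Sketch of the 0.6.0 -> 0.6.1 import-path change; all names are taken
# from the diff hunks below and describe assumed public APIs.

# 0.6.0 -- flat layout, helpers in genelastic/common.py:
# from genelastic.common import connect_to_es, run_composite_aggregation
# from genelastic.data_file import DataFile

# 0.6.1 -- subpackages, connection logic moved into classes:
from genelastic.common import ElasticImportConn, ElasticQueryConn, Bucket
from genelastic.import_data.data_file import DataFile
```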
genelastic/{import_data.py → import_data/import_data.py}

@@ -8,23 +8,22 @@
 import argparse
 import csv
 import datetime
+import hashlib
 import logging
 import os
 import sys
 import time
-import
+import vcf # type: ignore
 
-import
-
-
+from genelastic.common import (add_verbose_control_args, add_es_connection_args,
+                               ElasticImportConn, MetadataDocument, AnalysisDocument,
+                               BulkItems, ProcessDocument)
 
-from
-from
-from . import make_import_bundle_from_files
+from .import_bundle_factory import make_import_bundle_from_files
+from .bi_processes import BioInfoProcesses
 from .data_file import DataFile
 from .logger import configure_logging
-from .
-    add_verbose_control_args, add_es_connection_args, connect_to_es)
+from .wet_processes import WetProcesses
 
 logger = logging.getLogger('genelastic')
 logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
@@ -35,7 +34,8 @@ def read_args() -> argparse.Namespace:
     # pylint: disable=R0801
     """Read arguments from command line."""
     parser = argparse.ArgumentParser(description='Genetics data importer.',
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+                                     allow_abbrev=False)
     add_verbose_control_args(parser)
     add_es_connection_args(parser)
     parser.add_argument('-D', '--dry-run', dest='dryrun', action='count',
@@ -56,14 +56,14 @@ def read_args() -> argparse.Namespace:
     return args
 
 
-def import_cov_file(es: elasticsearch.Elasticsearch | None,
+def import_cov_file(es_import_conn: ElasticImportConn | None,
                     file_index: str, file: str, dryrun: int = 0) -> None:
     """Import a coverage file to the Elasticsearch database."""
     # Set field types
-    if dryrun == 0 and
-
-
-
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.client.indices.put_mapping(index=file_index,
+                                                  body={'properties': {'pos': {'type': 'integer'},
+                                                                       'depth': {'type': 'byte'}}})
 
     # Open file
     if dryrun > 1:
@@ -93,24 +93,12 @@ def import_cov_file(es: elasticsearch.Elasticsearch | None,
         }
 
         # Insert document
-        if dryrun == 0 and
-
-
+        if dryrun == 0 and es_import_conn:
+            es_import_conn.client.index(index=file_index, document=doc)
 
-def import_items(es: elasticsearch.Elasticsearch | None,
-                 bulk_items: BulkItems,
-                 start_time: float,
-                 total_items: int) -> None:
-    """Import items to the Elasticsearch database."""
-    if len(bulk_items) > 0 and es:
-        elasticsearch.helpers.bulk(es, bulk_items)
-    elapsed = time.perf_counter() - start_time
-    logger.info("Imported %d items in %s (%f items/s).", total_items,
-                datetime.timedelta(seconds=elapsed), total_items / elapsed)
 
-
-
-def import_analysis_metadata(es: elasticsearch.Elasticsearch | None,
+# pylint: disable-next=too-many-arguments, too-many-positional-arguments
+def import_analysis_metadata(es_import_conn: ElasticImportConn | None,
                              index_prefix: str,
                              file_index: str,
                              file: DataFile,
@@ -129,12 +117,14 @@ def import_analysis_metadata(es: elasticsearch.Elasticsearch | None,
         {"_index": f"{index_prefix}-analyses", "_source": doc}
     ]
 
-    if dryrun == 0:
-
-
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.import_items(bulk_items,
+                                    start_time=time.perf_counter(),
+                                    total_items=len(bulk_items)
+                                    )
 
 
-def import_vcf_file(es: elasticsearch.Elasticsearch | None,
+def import_vcf_file(es_import_conn: ElasticImportConn | None,
                     file_index: str,
                     file: DataFile,
                     dryrun: int = 0) -> None:
@@ -184,20 +174,20 @@ def import_vcf_file(es: elasticsearch.Elasticsearch | None,
             # resp = es.index(index=index, document=doc)
 
             # Insert bulk of items
-            if len(bulk_items) >= bulk_sz:
-                import_items(
-
+            if len(bulk_items) >= bulk_sz and es_import_conn:
+                es_import_conn.import_items(bulk_items, start_time=start,
+                                            total_items=n)
                 bulk_items = []
 
         # Insert remaining items
-        if dryrun == 0:
-            import_items(
+        if dryrun == 0 and es_import_conn:
+            es_import_conn.import_items(bulk_items, start_time=start, total_items=n)
 
     except StopIteration:
         logger.error('Skipping empty file : %s.', file.path)
 
 
-def import_processes(es: elasticsearch.Elasticsearch | None, index: str,
+def import_processes(es_import_conn: ElasticImportConn | None, index: str,
                      processes: WetProcesses | BioInfoProcesses, dryrun: int = 0) -> None:
     """Import processes into their own index."""
 
@@ -209,9 +199,11 @@ def import_processes(es: elasticsearch.Elasticsearch | None, index: str,
         doc: ProcessDocument = process.data | {'proc_id': proc_id, 'type': process_type}
         bulk_items.append({"_index": index, "_source": doc})
 
-    if dryrun == 0:
-
-
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.import_items(bulk_items,
+                                    start_time=time.perf_counter(),
+                                    total_items=len(bulk_items)
+                                    )
 
 
 def generate_unique_index(index_prefix: str, filepath: str) -> str:
@@ -235,10 +227,13 @@ def main() -> None:
     logger.debug("LOGGERS: %s", logging.root.manager.loggerDict) # pylint: disable=no-member
 
     # Open connection to ES
-    es = None
     if args.dryrun == 0:
-
-
+        addr = f"https://{args.es_host}:{args.es_port}"
+        logger.info("Trying to connect to Elasticsearch at %s...", addr)
+        es_import_conn = ElasticImportConn(addr, args.es_cert_fp,
+                                           basic_auth=(args.es_usr, args.es_pwd))
+    else:
+        es_import_conn = None
 
     # Create index
     # es.indices.create(index=args.es_index_prefix)
@@ -275,19 +270,22 @@ def main() -> None:
             # First, generate a unique index name for each file.
             file_index = generate_unique_index(args.es_index_prefix, f.path)
             # Then, import the analysis metadata into a dedicated index.
-            import_analysis_metadata(
+            import_analysis_metadata(es_import_conn, args.es_index_prefix,
+                                     file_index, f, cat, args.dryrun)
             # Finally, import the file in its own index.
-            globals()[f'import_{cat}_file'](
+            globals()[f'import_{cat}_file'](es_import_conn=es_import_conn,
                                             file_index=file_index, file=f, dryrun=args.dryrun)
 
     # Import processes
     logger.info("Importing wet processes.")
     logger.info("Wet processes IDs = %s", str(import_bundle.wet_processes.get_process_ids()))
-    import_processes(
+    import_processes(es_import_conn, f"{args.es_index_prefix}-wet_processes",
+                     import_bundle.wet_processes)
 
     logger.info("Importing bio info processes.")
     logger.info("Bio info processes IDs = %s", str(import_bundle.bi_processes.get_process_ids()))
-    import_processes(
+    import_processes(es_import_conn, f"{args.es_index_prefix}-bi_processes",
+                     import_bundle.bi_processes)
 
 
 if __name__ == '__main__':
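The hunks above show the importer dropping the raw `elasticsearch.Elasticsearch` handle and the module-level `import_items()` helper in favour of an `ElasticImportConn` object from `genelastic.common`, which exposes the underlying client as `.client` and owns the bulk-import logic. A rough sketch of the shape these call sites imply is given below; the constructor arguments, TLS handling and logging are assumptions, and the real class lives in `genelastic/common/elastic.py`:

```python
# Hypothetical reconstruction of ElasticImportConn, inferred only from the
# call sites in the diff above and from the removed import_items() helper;
# it is not the actual genelastic.common.elastic implementation.
import datetime
import logging
import time
import typing

import elasticsearch
import elasticsearch.helpers

logger = logging.getLogger('genelastic')

# BulkItems comes from genelastic.common in the diff; a plain alias is
# assumed here so the sketch stays self-contained.
BulkItems = typing.List[typing.Dict[str, typing.Any]]


class ElasticImportConn:
    """Owns the Elasticsearch client plus the bulk-import helper."""

    def __init__(self, addr: str, cert_fp: str,
                 basic_auth: typing.Tuple[str, str]) -> None:
        # The importer reaches the raw client through `.client`
        # (put_mapping() and index() calls in the diff above).
        self.client = elasticsearch.Elasticsearch(
            addr, ssl_assert_fingerprint=cert_fp, basic_auth=basic_auth)

    def import_items(self, bulk_items: BulkItems,
                     start_time: float, total_items: int) -> None:
        """Bulk-import items and log throughput, mirroring the removed helper."""
        if len(bulk_items) > 0:
            elasticsearch.helpers.bulk(self.client, bulk_items)
        elapsed = time.perf_counter() - start_time
        logger.info("Imported %d items in %s (%f items/s).", total_items,
                    datetime.timedelta(seconds=elapsed), total_items / elapsed)
```

Guarding each call site with `if dryrun == 0 and es_import_conn:` also lets a type checker narrow `ElasticImportConn | None` before `.client` or `.import_items()` is used.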
genelastic/{info.py → import_data/info.py}

@@ -3,16 +3,13 @@ import argparse
 import logging
 import typing
 
-import
-
+from genelastic.common import (ElasticQueryConn, add_verbose_control_args,
+                               add_es_connection_args, Bucket)
 
 from .logger import configure_logging
-from .common import (add_es_connection_args, connect_to_es, add_verbose_control_args, Bucket,
-                     run_composite_aggregation, get_process_ids)
 
 logger = logging.getLogger('genelastic')
 logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 
 def read_args() -> argparse.Namespace:
@@ -36,7 +33,7 @@ def read_args() -> argparse.Namespace:
     return parser.parse_args()
 
 
-def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_bundles(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all imported YAML bundles."""
 
     query = {
@@ -51,7 +48,7 @@ def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
         }
     }
 
-    buckets: typing.List[Bucket] = run_composite_aggregation(
+    buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
 
     print("Imported YAML files")
     print("===================")
@@ -66,7 +63,7 @@ def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
     print()
 
 
-def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all imported data files."""
 
     query = {
@@ -81,7 +78,7 @@ def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
         }
     }
 
-    buckets: typing.List[Bucket] = run_composite_aggregation(
+    buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
 
     print("Imported data files")
     print("===================")
@@ -96,9 +93,9 @@ def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
     print()
 
 
-def list_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all processes."""
-    process_ids =
+    process_ids = es_query_conn.get_field_values(index, "proc_id")
 
     if len(process_ids) == 0:
         print("Empty response.", end="\n")
@@ -109,43 +106,21 @@ def list_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
     print()
 
 
-def list_wet_processes(
+def list_wet_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all wet processes."""
     print("Imported wet processes")
     print("======================")
-    list_processes(
+    list_processes(es_query_conn, index)
 
 
-def list_bi_processes(
+def list_bi_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all bio info processes."""
     print("Imported bi processes")
     print("=====================")
-    list_processes(
-
-
-def search_doc_by_field_value(es: elasticsearch.Elasticsearch,
-                              index: str, field: str, value: str) -> (
-        typing.Dict[str, typing.Any] | None):
-    """Search a document by a value for a certain field."""
-    logger.info("Searching for field '%s' with value '%s' inside index '%s'.",
-                field, value, index)
-    search_query = {
-        "query": {
-            "term": {
-                f"{field}.keyword": value,
-            }
-        }
-    }
+    list_processes(es_query_conn, index)
 
-    response = es.search(index=index, body=search_query)
 
-
-        return response['hits']['hits'][0]['_source'] # type: ignore
-    except KeyError:
-        return None
-
-
-def list_data_files_per_bundle(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_data_files_per_bundle(es_query_conn: ElasticQueryConn, index: str) -> None:
     """For each imported YAML bundle, display some info and list its data files."""
     query = {
        "size": 0,
@@ -174,7 +149,7 @@ def list_data_files_per_bundle(es: elasticsearch.Elasticsearch, index: str) -> N
         }
     }
 
-    buckets: typing.List[Bucket] = run_composite_aggregation(
+    buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
 
     print("Data files per YAML bundle")
     print("==========================")
@@ -207,7 +182,11 @@ def main() -> None:
 
     configure_logging(args.verbose)
     logger.debug("Arguments: %s", args)
-
+
+    addr = f"https://{args.es_host}:{args.es_port}"
+    logger.info("Trying to connect to Elasticsearch at %s...", addr)
+    es_query_conn = ElasticQueryConn(addr, args.es_cert_fp,
+                                     basic_auth=(args.es_usr, args.es_pwd))
 
     analysis_index = f"{args.es_index_prefix}-analyses"
     wet_processes_index = f"{args.es_index_prefix}-wet_processes"
@@ -216,32 +195,32 @@ def main() -> None:
     list_call_count = 0
 
     if args.list_bundles:
-        list_bundles(
+        list_bundles(es_query_conn, analysis_index)
         list_call_count += 1
 
     if args.list_data_files:
-        list_data_files(
+        list_data_files(es_query_conn, analysis_index)
         list_call_count += 1
 
     if args.list_wet_processes:
-        list_wet_processes(
+        list_wet_processes(es_query_conn, wet_processes_index)
         list_call_count += 1
 
     if args.list_bi_processes:
-        list_bi_processes(
+        list_bi_processes(es_query_conn, bi_processes_index)
         list_call_count += 1
 
     if args.list_data_files_per_bundle:
-        list_data_files_per_bundle(
+        list_data_files_per_bundle(es_query_conn, analysis_index)
         list_call_count += 1
 
     if list_call_count == 0:
         logger.debug("No list option specified, listing everything.")
-        list_bundles(
-        list_data_files(
-        list_wet_processes(
-        list_bi_processes(
-        list_data_files_per_bundle(
+        list_bundles(es_query_conn, analysis_index)
+        list_data_files(es_query_conn, analysis_index)
+        list_wet_processes(es_query_conn, wet_processes_index)
+        list_bi_processes(es_query_conn, bi_processes_index)
+        list_data_files_per_bundle(es_query_conn, analysis_index)
 
 
 if __name__ == '__main__':
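Likewise, the info command now routes every query through an `ElasticQueryConn`. The call sites above pin down two methods: `run_composite_aggregation(index, query)`, returning `typing.List[Bucket]`, and `get_field_values(index, "proc_id")`, which replaces the removed `get_process_ids()` helper. A rough sketch of such a class follows, with the aggregation key and the terms-aggregation strategy as assumptions (the actual implementation lives in `genelastic/common/elastic.py`):

```python
# Hypothetical sketch of ElasticQueryConn based on the call sites above; the
# aggregation key ("agg") and the terms aggregation in get_field_values() are
# assumptions, not the actual genelastic implementation.
import typing

import elasticsearch

# Stand-in for the Bucket type exported by genelastic.common (assumption).
Bucket = typing.Dict[str, typing.Any]


class ElasticQueryConn:
    """Read-only Elasticsearch connection used by the info command."""

    def __init__(self, addr: str, cert_fp: str,
                 basic_auth: typing.Tuple[str, str]) -> None:
        self.client = elasticsearch.Elasticsearch(
            addr, ssl_assert_fingerprint=cert_fp, basic_auth=basic_auth)

    def run_composite_aggregation(self, index: str,
                                  query: typing.Dict[str, typing.Any]) -> typing.List[Bucket]:
        """Run a composite aggregation and return its buckets (single page only)."""
        response = self.client.search(index=index, body=query)
        return list(response["aggregations"]["agg"]["buckets"])

    def get_field_values(self, index: str, field: str) -> typing.List[str]:
        """Collect the distinct values of a keyword field via a terms aggregation."""
        body = {"size": 0,
                "aggs": {"values": {"terms": {"field": f"{field}.keyword",
                                              "size": 10000}}}}
        response = self.client.search(index=index, body=body)
        return [bucket["key"] for bucket in response["aggregations"]["values"]["buckets"]]
```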