genelastic 0.6.0-py3-none-any.whl → 0.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. genelastic/__init__.py +0 -13
  2. genelastic/api/__init__.py +0 -0
  3. genelastic/api/extends/__init__.py +0 -0
  4. genelastic/api/extends/example.py +7 -0
  5. genelastic/api/routes.py +84 -0
  6. genelastic/api/server.py +72 -0
  7. genelastic/api/settings.py +13 -0
  8. genelastic/common/__init__.py +12 -0
  9. genelastic/common/cli.py +35 -0
  10. genelastic/common/elastic.py +183 -0
  11. genelastic/common/exceptions.py +6 -0
  12. genelastic/common/types.py +20 -0
  13. genelastic/import_data/__init__.py +9 -0
  14. genelastic/{analyses.py → import_data/analyses.py} +3 -1
  15. genelastic/{analysis.py → import_data/analysis.py} +3 -2
  16. genelastic/{bi_process.py → import_data/bi_process.py} +1 -1
  17. genelastic/{bi_processes.py → import_data/bi_processes.py} +2 -1
  18. genelastic/{data_file.py → import_data/data_file.py} +3 -1
  19. genelastic/{filename_pattern.py → import_data/filename_pattern.py} +2 -1
  20. genelastic/{gen_data.py → import_data/gen_data.py} +3 -2
  21. genelastic/{import_bundle.py → import_data/import_bundle.py} +2 -1
  22. genelastic/{import_bundle_factory.py → import_data/import_bundle_factory.py} +3 -1
  23. genelastic/{import_data.py → import_data/import_data.py} +49 -51
  24. genelastic/{info.py → import_data/info.py} +29 -50
  25. genelastic/{integrity.py → import_data/integrity.py} +53 -87
  26. genelastic/{tags.py → import_data/tags.py} +2 -1
  27. genelastic/{validate_data.py → import_data/validate_data.py} +6 -4
  28. genelastic/{wet_processes.py → import_data/wet_processes.py} +2 -1
  29. {genelastic-0.6.0.dist-info → genelastic-0.6.1.dist-info}/METADATA +7 -2
  30. genelastic-0.6.1.dist-info/RECORD +36 -0
  31. {genelastic-0.6.0.dist-info → genelastic-0.6.1.dist-info}/WHEEL +1 -1
  32. genelastic-0.6.1.dist-info/entry_points.txt +6 -0
  33. genelastic/common.py +0 -151
  34. genelastic-0.6.0.dist-info/RECORD +0 -25
  35. genelastic-0.6.0.dist-info/entry_points.txt +0 -6
  36. genelastic/{constants.py → import_data/constants.py} +0 -0
  37. genelastic/{logger.py → import_data/logger.py} +0 -0
  38. genelastic/{wet_process.py → import_data/wet_process.py} +0 -0
  39. {genelastic-0.6.0.dist-info → genelastic-0.6.1.dist-info}/top_level.txt +0 -0
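
The bulk of this release is a package split: the old single-module genelastic/common.py (-151 lines) becomes the genelastic/common package (cli.py, elastic.py, exceptions.py, types.py), the importer modules move under genelastic/import_data/, and a new genelastic/api package (server.py, routes.py, settings.py) appears. For downstream code the visible effect is the import-path change sketched below; the exact re-exports of each new __init__.py are not shown in this diff, so the 0.6.1 lines are assumptions based on the import statements in the hunks that follow.

# 0.6.0 (flat layout):
from genelastic.common import connect_to_es            # helper removed in 0.6.1
from genelastic.bi_processes import BioInfoProcesses

# 0.6.1 (split layout):
from genelastic.common import ElasticImportConn, ElasticQueryConn
from genelastic.import_data.bi_processes import BioInfoProcesses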
genelastic/{import_data.py → import_data/import_data.py}

@@ -8,23 +8,22 @@
 import argparse
 import csv
 import datetime
+import hashlib
 import logging
 import os
 import sys
 import time
-import hashlib
+import vcf # type: ignore

-import elasticsearch
-import elasticsearch.helpers
-import vcf # type: ignore[import-untyped]
+from genelastic.common import (add_verbose_control_args, add_es_connection_args,
+                               ElasticImportConn, MetadataDocument, AnalysisDocument,
+                               BulkItems, ProcessDocument)

-from genelastic.bi_processes import BioInfoProcesses
-from genelastic.wet_processes import WetProcesses
-from . import make_import_bundle_from_files
+from .import_bundle_factory import make_import_bundle_from_files
+from .bi_processes import BioInfoProcesses
 from .data_file import DataFile
 from .logger import configure_logging
-from .common import (BulkItems, AnalysisDocument, ProcessDocument, MetadataDocument,
-                     add_verbose_control_args, add_es_connection_args, connect_to_es)
+from .wet_processes import WetProcesses

 logger = logging.getLogger('genelastic')
 logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
@@ -35,7 +34,8 @@ def read_args() -> argparse.Namespace:
     # pylint: disable=R0801
     """Read arguments from command line."""
     parser = argparse.ArgumentParser(description='Genetics data importer.',
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+                                     allow_abbrev=False)
     add_verbose_control_args(parser)
     add_es_connection_args(parser)
     parser.add_argument('-D', '--dry-run', dest='dryrun', action='count',
@@ -56,14 +56,14 @@ def read_args() -> argparse.Namespace:
     return args


-def import_cov_file(es: elasticsearch.Elasticsearch | None,
+def import_cov_file(es_import_conn: ElasticImportConn | None,
                     file_index: str, file: str, dryrun: int = 0) -> None:
     """Import a coverage file to the Elasticsearch database."""
     # Set field types
-    if dryrun == 0 and es:
-        es.indices.put_mapping(index=file_index,
-                               body={'properties': {'pos': {'type': 'integer'},
-                                                    'depth': {'type': 'byte'}}})
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.client.indices.put_mapping(index=file_index,
+                                                  body={'properties': {'pos': {'type': 'integer'},
+                                                                       'depth': {'type': 'byte'}}})

     # Open file
     if dryrun > 1:
@@ -93,24 +93,12 @@ def import_cov_file(es: elasticsearch.Elasticsearch | None,
             }

             # Insert document
-            if dryrun == 0 and es:
-                es.index(index=file_index, document=doc)
-
+            if dryrun == 0 and es_import_conn:
+                es_import_conn.client.index(index=file_index, document=doc)

-def import_items(es: elasticsearch.Elasticsearch | None,
-                 bulk_items: BulkItems,
-                 start_time: float,
-                 total_items: int) -> None:
-    """Import items to the Elasticsearch database."""
-    if len(bulk_items) > 0 and es:
-        elasticsearch.helpers.bulk(es, bulk_items)
-        elapsed = time.perf_counter() - start_time
-        logger.info("Imported %d items in %s (%f items/s).", total_items,
-                    datetime.timedelta(seconds=elapsed), total_items / elapsed)

-
-# pylint: disable-next=too-many-arguments
-def import_analysis_metadata(es: elasticsearch.Elasticsearch | None,
+# pylint: disable-next=too-many-arguments, too-many-positional-arguments
+def import_analysis_metadata(es_import_conn: ElasticImportConn | None,
                              index_prefix: str,
                              file_index: str,
                              file: DataFile,
@@ -129,12 +117,14 @@ def import_analysis_metadata(es: elasticsearch.Elasticsearch | None,
         {"_index": f"{index_prefix}-analyses", "_source": doc}
     ]

-    if dryrun == 0:
-        start = time.perf_counter()
-        import_items(es, bulk_items, start_time=start, total_items=len(bulk_items))
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.import_items(bulk_items,
+                                    start_time=time.perf_counter(),
+                                    total_items=len(bulk_items)
+                                    )


-def import_vcf_file(es: elasticsearch.Elasticsearch | None,
+def import_vcf_file(es_import_conn: ElasticImportConn | None,
                     file_index: str,
                     file: DataFile,
                     dryrun: int = 0) -> None:
@@ -184,20 +174,20 @@ def import_vcf_file(es: elasticsearch.Elasticsearch | None,
             # resp = es.index(index=index, document=doc)

             # Insert bulk of items
-            if len(bulk_items) >= bulk_sz:
-                import_items(es, bulk_items, start_time=start,
-                             total_items=n)
+            if len(bulk_items) >= bulk_sz and es_import_conn:
+                es_import_conn.import_items(bulk_items, start_time=start,
+                                            total_items=n)
                 bulk_items = []

         # Insert remaining items
-        if dryrun == 0:
-            import_items(es, bulk_items, start_time=start, total_items=n)
+        if dryrun == 0 and es_import_conn:
+            es_import_conn.import_items(bulk_items, start_time=start, total_items=n)

     except StopIteration:
         logger.error('Skipping empty file : %s.', file.path)


-def import_processes(es: elasticsearch.Elasticsearch | None, index: str,
+def import_processes(es_import_conn: ElasticImportConn | None, index: str,
                      processes: WetProcesses | BioInfoProcesses, dryrun: int = 0) -> None:
     """Import processes into their own index."""

@@ -209,9 +199,11 @@ def import_processes(es: elasticsearch.Elasticsearch | None, index: str,
         doc: ProcessDocument = process.data | {'proc_id': proc_id, 'type': process_type}
         bulk_items.append({"_index": index, "_source": doc})

-    if dryrun == 0:
-        start = time.perf_counter()
-        import_items(es, bulk_items, start_time=start, total_items=len(bulk_items))
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.import_items(bulk_items,
+                                    start_time=time.perf_counter(),
+                                    total_items=len(bulk_items)
+                                    )


 def generate_unique_index(index_prefix: str, filepath: str) -> str:
@@ -235,10 +227,13 @@ def main() -> None:
     logger.debug("LOGGERS: %s", logging.root.manager.loggerDict) # pylint: disable=no-member

     # Open connection to ES
-    es = None
     if args.dryrun == 0:
-        es = connect_to_es(host=args.es_host, port=args.es_port, usr=args.es_usr,
-                           pwd=args.es_pwd)
+        addr = f"https://{args.es_host}:{args.es_port}"
+        logger.info("Trying to connect to Elasticsearch at %s...", addr)
+        es_import_conn = ElasticImportConn(addr, args.es_cert_fp,
+                                           basic_auth=(args.es_usr, args.es_pwd))
+    else:
+        es_import_conn = None

     # Create index
     # es.indices.create(index=args.es_index_prefix)
@@ -275,19 +270,22 @@
         # First, generate a unique index name for each file.
         file_index = generate_unique_index(args.es_index_prefix, f.path)
         # Then, import the analysis metadata into a dedicated index.
-        import_analysis_metadata(es, args.es_index_prefix, file_index, f, cat, args.dryrun)
+        import_analysis_metadata(es_import_conn, args.es_index_prefix,
+                                 file_index, f, cat, args.dryrun)
         # Finally, import the file in its own index.
-        globals()[f'import_{cat}_file'](es=es,
+        globals()[f'import_{cat}_file'](es_import_conn=es_import_conn,
                                         file_index=file_index, file=f, dryrun=args.dryrun)

     # Import processes
     logger.info("Importing wet processes.")
     logger.info("Wet processes IDs = %s", str(import_bundle.wet_processes.get_process_ids()))
-    import_processes(es, f"{args.es_index_prefix}-wet_processes", import_bundle.wet_processes)
+    import_processes(es_import_conn, f"{args.es_index_prefix}-wet_processes",
+                     import_bundle.wet_processes)

     logger.info("Importing bio info processes.")
     logger.info("Bio info processes IDs = %s", str(import_bundle.bi_processes.get_process_ids()))
-    import_processes(es, f"{args.es_index_prefix}-bi_processes", import_bundle.bi_processes)
+    import_processes(es_import_conn, f"{args.es_index_prefix}-bi_processes",
+                     import_bundle.bi_processes)


 if __name__ == '__main__':
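
The new connection wrapper lives in genelastic/common/elastic.py (+183 lines), which this diff does not display. The sketch below is therefore an inference, not the actual implementation: the constructor signature mirrors the ElasticImportConn(addr, args.es_cert_fp, basic_auth=...) call site, .client is assumed to be a plain elasticsearch.Elasticsearch (the put_mapping and index calls above go through it unchanged), and import_items() reuses the body of the module-level helper deleted above. The ssl_assert_fingerprint wiring for the new args.es_cert_fp value is a guess consistent with the certificate-fingerprint naming.

# Hypothetical sketch, assuming elasticsearch-py 8.x.
import datetime
import logging
import time

import elasticsearch
import elasticsearch.helpers

logger = logging.getLogger('genelastic')


class ElasticImportConn:
    """Write-side connection wrapper used by the importer."""

    def __init__(self, addr: str, cert_fp: str,
                 basic_auth: tuple[str, str]) -> None:
        # Pinning the server certificate by fingerprint would explain why
        # info.py no longer needs to silence urllib3's InsecureRequestWarning.
        self.client = elasticsearch.Elasticsearch(
            addr, ssl_assert_fingerprint=cert_fp, basic_auth=basic_auth)

    def import_items(self, bulk_items: list[dict], start_time: float,
                     total_items: int) -> None:
        """Bulk-import items and log throughput (formerly import_items())."""
        if len(bulk_items) > 0:
            elasticsearch.helpers.bulk(self.client, bulk_items)
            elapsed = time.perf_counter() - start_time
            logger.info("Imported %d items in %s (%f items/s).", total_items,
                        datetime.timedelta(seconds=elapsed), total_items / elapsed)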
genelastic/{info.py → import_data/info.py}

@@ -3,16 +3,13 @@ import argparse
 import logging
 import typing

-import elasticsearch
-import urllib3
+from genelastic.common import (ElasticQueryConn, add_verbose_control_args,
+                               add_es_connection_args, Bucket)

 from .logger import configure_logging
-from .common import (add_es_connection_args, connect_to_es, add_verbose_control_args, Bucket,
-                     run_composite_aggregation, get_process_ids)

 logger = logging.getLogger('genelastic')
 logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


 def read_args() -> argparse.Namespace:
@@ -36,7 +33,7 @@ def read_args() -> argparse.Namespace:
     return parser.parse_args()


-def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_bundles(es_query_conn: ElasticQueryConn, index: str) -> None:
    """List all imported YAML bundles."""

    query = {
@@ -51,7 +48,7 @@ def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
        }
    }

-    buckets: typing.List[Bucket] = run_composite_aggregation(es, index, query)
+    buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
     print("Imported YAML files")
     print("===================")

@@ -66,7 +63,7 @@ def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
     print()


-def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all imported data files."""

     query = {
@@ -81,7 +78,7 @@ def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
        }
    }

-    buckets: typing.List[Bucket] = run_composite_aggregation(es, index, query)
+    buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
     print("Imported data files")
     print("===================")

@@ -96,9 +93,9 @@ def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
     print()


-def list_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all processes."""
-    process_ids = get_process_ids(es, index, "proc_id")
+    process_ids = es_query_conn.get_field_values(index, "proc_id")

     if len(process_ids) == 0:
         print("Empty response.", end="\n")
@@ -109,43 +106,21 @@ def list_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
     print()


-def list_wet_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_wet_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all wet processes."""
     print("Imported wet processes")
     print("======================")
-    list_processes(es, index)
+    list_processes(es_query_conn, index)


-def list_bi_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_bi_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all bio info processes."""
     print("Imported bi processes")
     print("=====================")
-    list_processes(es, index)
-
-
-def search_doc_by_field_value(es: elasticsearch.Elasticsearch,
-                              index: str, field: str, value: str) -> (
-        typing.Dict[str, typing.Any] | None):
-    """Search a document by a value for a certain field."""
-    logger.info("Searching for field '%s' with value '%s' inside index '%s'.",
-                field, value, index)
-    search_query = {
-        "query": {
-            "term": {
-                f"{field}.keyword": value,
-            }
-        }
-    }
+    list_processes(es_query_conn, index)

-    response = es.search(index=index, body=search_query)

-    try:
-        return response['hits']['hits'][0]['_source'] # type: ignore
-    except KeyError:
-        return None
-
-
-def list_data_files_per_bundle(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_data_files_per_bundle(es_query_conn: ElasticQueryConn, index: str) -> None:
     """For each imported YAML bundle, display some info and list its data files."""
     query = {
         "size": 0,
@@ -174,7 +149,7 @@ def list_data_files_per_bundle(es: elasticsearch.Elasticsearch, index: str) -> None:
        }
    }

-    buckets: typing.List[Bucket] = run_composite_aggregation(es, index, query)
+    buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)

     print("Data files per YAML bundle")
     print("==========================")
@@ -207,7 +182,11 @@ def main() -> None:
     configure_logging(args.verbose)
     logger.debug("Arguments: %s", args)
-    es = connect_to_es(host=args.es_host, port=args.es_port, usr=args.es_usr, pwd=args.es_pwd)
+
+    addr = f"https://{args.es_host}:{args.es_port}"
+    logger.info("Trying to connect to Elasticsearch at %s...", addr)
+    es_query_conn = ElasticQueryConn(addr, args.es_cert_fp,
+                                     basic_auth=(args.es_usr, args.es_pwd))

     analysis_index = f"{args.es_index_prefix}-analyses"
     wet_processes_index = f"{args.es_index_prefix}-wet_processes"
     bi_processes_index = f"{args.es_index_prefix}-bi_processes"
@@ -216,32 +195,32 @@
     list_call_count = 0

     if args.list_bundles:
-        list_bundles(es, analysis_index)
+        list_bundles(es_query_conn, analysis_index)
         list_call_count += 1

     if args.list_data_files:
-        list_data_files(es, analysis_index)
+        list_data_files(es_query_conn, analysis_index)
         list_call_count += 1

     if args.list_wet_processes:
-        list_wet_processes(es, wet_processes_index)
+        list_wet_processes(es_query_conn, wet_processes_index)
         list_call_count += 1

     if args.list_bi_processes:
-        list_bi_processes(es, bi_processes_index)
+        list_bi_processes(es_query_conn, bi_processes_index)
         list_call_count += 1

     if args.list_data_files_per_bundle:
-        list_data_files_per_bundle(es, analysis_index)
+        list_data_files_per_bundle(es_query_conn, analysis_index)
         list_call_count += 1

     if list_call_count == 0:
         logger.debug("No list option specified, listing everything.")
-        list_bundles(es, analysis_index)
-        list_data_files(es, analysis_index)
-        list_wet_processes(es, wet_processes_index)
-        list_bi_processes(es, bi_processes_index)
-        list_data_files_per_bundle(es, analysis_index)
+        list_bundles(es_query_conn, analysis_index)
+        list_data_files(es_query_conn, analysis_index)
+        list_wet_processes(es_query_conn, wet_processes_index)
+        list_bi_processes(es_query_conn, bi_processes_index)
+        list_data_files_per_bundle(es_query_conn, analysis_index)


 if __name__ == '__main__':
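
ElasticQueryConn presumably sits next to ElasticImportConn in the unshown genelastic/common/elastic.py. Only its method names are certain from the call sites above: run_composite_aggregation(index, query) returns a list of Bucket, and get_field_values(index, field) generalizes the removed get_process_ids() helper. The bodies below, the 'my_buckets' aggregation name, and the Bucket alias are assumptions illustrating one plausible shape.

# Hypothetical sketch, assuming elasticsearch-py 8.x and a composite
# aggregation named 'my_buckets' in the queries built by info.py.
import typing

import elasticsearch

Bucket = typing.Dict[str, typing.Any]  # assumed shape of the re-exported Bucket type


class ElasticQueryConn:
    """Read-side connection wrapper used by the info tool."""

    def __init__(self, addr: str, cert_fp: str,
                 basic_auth: tuple[str, str]) -> None:
        self.client = elasticsearch.Elasticsearch(
            addr, ssl_assert_fingerprint=cert_fp, basic_auth=basic_auth)

    def run_composite_aggregation(self, index: str,
                                  query: dict) -> typing.List[Bucket]:
        """Page through a composite aggregation until 'after_key' runs out."""
        buckets: typing.List[Bucket] = []
        while True:
            response = self.client.search(index=index, body=query)
            agg = response['aggregations']['my_buckets']
            buckets.extend(agg['buckets'])
            if 'after_key' not in agg:
                return buckets
            # Resume the next page where the previous one stopped.
            query['aggs']['my_buckets']['composite']['after'] = agg['after_key']

    def get_field_values(self, index: str, field: str) -> typing.List[str]:
        """Distinct values of a field (replaces the old get_process_ids())."""
        query = {"size": 0,
                 "aggs": {"my_buckets": {"composite": {"sources": [
                     {field: {"terms": {"field": f"{field}.keyword"}}}]}}}}
        return [b['key'][field] for b in self.run_composite_aggregation(index, query)]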