genelastic 0.6.0-py3-none-any.whl → 0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. genelastic/__init__.py +0 -13
  2. genelastic/api/__init__.py +0 -0
  3. genelastic/api/extends/__init__.py +0 -0
  4. genelastic/api/extends/example.py +6 -0
  5. genelastic/api/routes.py +221 -0
  6. genelastic/api/server.py +80 -0
  7. genelastic/api/settings.py +14 -0
  8. genelastic/common/__init__.py +39 -0
  9. genelastic/common/cli.py +63 -0
  10. genelastic/common/elastic.py +214 -0
  11. genelastic/common/exceptions.py +4 -0
  12. genelastic/common/types.py +25 -0
  13. genelastic/import_data/__init__.py +27 -0
  14. genelastic/{analyses.py → import_data/analyses.py} +19 -20
  15. genelastic/{analysis.py → import_data/analysis.py} +71 -66
  16. genelastic/{bi_process.py → import_data/bi_process.py} +8 -6
  17. genelastic/{bi_processes.py → import_data/bi_processes.py} +10 -9
  18. genelastic/import_data/cli_gen_data.py +116 -0
  19. genelastic/import_data/cli_import.py +379 -0
  20. genelastic/import_data/cli_info.py +256 -0
  21. genelastic/import_data/cli_integrity.py +384 -0
  22. genelastic/import_data/cli_validate.py +54 -0
  23. genelastic/import_data/constants.py +24 -0
  24. genelastic/{data_file.py → import_data/data_file.py} +26 -21
  25. genelastic/import_data/filename_pattern.py +57 -0
  26. genelastic/{import_bundle.py → import_data/import_bundle.py} +58 -48
  27. genelastic/import_data/import_bundle_factory.py +298 -0
  28. genelastic/{logger.py → import_data/logger.py} +22 -18
  29. genelastic/import_data/random_bundle.py +402 -0
  30. genelastic/{tags.py → import_data/tags.py} +48 -27
  31. genelastic/{wet_process.py → import_data/wet_process.py} +8 -4
  32. genelastic/{wet_processes.py → import_data/wet_processes.py} +15 -9
  33. genelastic/ui/__init__.py +0 -0
  34. genelastic/ui/server.py +87 -0
  35. genelastic/ui/settings.py +11 -0
  36. genelastic-0.7.0.dist-info/METADATA +105 -0
  37. genelastic-0.7.0.dist-info/RECORD +40 -0
  38. {genelastic-0.6.0.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
  39. genelastic-0.7.0.dist-info/entry_points.txt +6 -0
  40. genelastic/common.py +0 -151
  41. genelastic/constants.py +0 -45
  42. genelastic/filename_pattern.py +0 -62
  43. genelastic/gen_data.py +0 -193
  44. genelastic/import_bundle_factory.py +0 -288
  45. genelastic/import_data.py +0 -294
  46. genelastic/info.py +0 -248
  47. genelastic/integrity.py +0 -324
  48. genelastic/validate_data.py +0 -41
  49. genelastic-0.6.0.dist-info/METADATA +0 -36
  50. genelastic-0.6.0.dist-info/RECORD +0 -25
  51. genelastic-0.6.0.dist-info/entry_points.txt +0 -6
  52. {genelastic-0.6.0.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0
genelastic/import_data.py DELETED
@@ -1,294 +0,0 @@
-# pylint: disable=missing-module-docstring
-# vi: se tw=80
-
-# Elasticsearch Python API:
-# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/overview.html
-# https://elasticsearch-py.readthedocs.io/en/latest/api.html
-
-import argparse
-import csv
-import datetime
-import logging
-import os
-import sys
-import time
-import hashlib
-
-import elasticsearch
-import elasticsearch.helpers
-import vcf # type: ignore[import-untyped]
-
-from genelastic.bi_processes import BioInfoProcesses
-from genelastic.wet_processes import WetProcesses
-from . import make_import_bundle_from_files
-from .data_file import DataFile
-from .logger import configure_logging
-from .common import (BulkItems, AnalysisDocument, ProcessDocument, MetadataDocument,
-                     add_verbose_control_args, add_es_connection_args, connect_to_es)
-
-logger = logging.getLogger('genelastic')
-logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
-logging.getLogger('urllib3').setLevel(logging.WARNING) # Disable excessive logging
-
-
-def read_args() -> argparse.Namespace:
-    # pylint: disable=R0801
-    """Read arguments from command line."""
-    parser = argparse.ArgumentParser(description='Genetics data importer.',
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    add_verbose_control_args(parser)
-    add_es_connection_args(parser)
-    parser.add_argument('-D', '--dry-run', dest='dryrun', action='count',
-                        default=0,
-                        help=('Dry-run level. -D for data files loading (VCF, coverage, etc)' +
-                              ' without connecting or importing to database. -DD for metadata' +
-                              ' YAML files loading only (no loading of data files).'))
-    parser.add_argument('--log-file', dest='log_file', help='Path to a log file.')
-    parser.add_argument('--no-list', dest='no_list',
-                        action='store_true',
-                        help='Do not print list of files to be imported.')
-    parser.add_argument('--no-confirm', dest='no_confirm',
-                        action='store_true',
-                        help='Do not ask confirmation before importing.')
-    parser.add_argument('files', type=str, nargs="+", default=None,
-                        help="Data files that describe what to import.")
-    args = parser.parse_args()
-    return args
-
-
-def import_cov_file(es: elasticsearch.Elasticsearch | None,
-                    file_index: str, file: str, dryrun: int = 0) -> None:
-    """Import a coverage file to the Elasticsearch database."""
-    # Set field types
-    if dryrun == 0 and es:
-        es.indices.put_mapping(index=file_index,
-                               body={'properties': {'pos': {'type': 'integer'},
-                                                    'depth': {'type': 'byte'}}})
-
-    # Open file
-    if dryrun > 1:
-        logger.info('Would load and import Coverage file %s '
-                    'into index %s.', file, file_index)
-    else:
-        logger.info('Load Coverage file %s.', file)
-        if dryrun == 1:
-            logger.info('Would import Coverage file %s into index %s.', file, file_index)
-        else:
-            logger.info('Import Coverage file %s into index %s.', file, file_index)
-        with open(file, newline='', encoding="utf-8") as f:
-
-            # Read file as CSV
-            reader = csv.reader(f, delimiter='\t', quotechar='"')
-
-            # Loop on al lines
-            for row in reader:
-
-                # Build document
-                # Position starts at 0 inside coverage file
-                doc: MetadataDocument = {
-                    'type': 'coverage',
-                    'chr': row[0],
-                    'pos': int(row[1]) + 1,
-                    'depth': int(row[2])
-                }
-
-                # Insert document
-                if dryrun == 0 and es:
-                    es.index(index=file_index, document=doc)
-
-
-def import_items(es: elasticsearch.Elasticsearch | None,
-                 bulk_items: BulkItems,
-                 start_time: float,
-                 total_items: int) -> None:
-    """Import items to the Elasticsearch database."""
-    if len(bulk_items) > 0 and es:
-        elasticsearch.helpers.bulk(es, bulk_items)
-        elapsed = time.perf_counter() - start_time
-        logger.info("Imported %d items in %s (%f items/s).", total_items,
-                    datetime.timedelta(seconds=elapsed), total_items / elapsed)
-
-
-# pylint: disable-next=too-many-arguments
-def import_analysis_metadata(es: elasticsearch.Elasticsearch | None,
-                             index_prefix: str,
-                             file_index: str,
-                             file: DataFile,
-                             analysis_type: str,
-                             dryrun: int = 0) -> None:
-    """Import analysis metadata into a dedicated index."""
-    doc: AnalysisDocument = {
-        "path": os.path.abspath(file.path),
-        "bundle_path": os.path.abspath(file.bundle_path) if file.bundle_path else None,
-        "metadata": file.metadata,
-        "file_index": file_index,
-        "type": analysis_type
-    }
-
-    bulk_items: BulkItems = [
-        {"_index": f"{index_prefix}-analyses", "_source": doc}
-    ]
-
-    if dryrun == 0:
-        start = time.perf_counter()
-        import_items(es, bulk_items, start_time=start, total_items=len(bulk_items))
-
-
-def import_vcf_file(es: elasticsearch.Elasticsearch | None,
-                    file_index: str,
-                    file: DataFile,
-                    dryrun: int = 0) -> None:
-    """Import a VCF file to the Elasticsearch database."""
-    logger.info("Import VCF file \"%s\".", file)
-
-    if dryrun > 1:
-        logger.info('Would load and import VCF file %s '
-                    'into index %s.', file.path, file_index)
-    else:
-        logger.info('Load VCF file %s.', file.path)
-        if dryrun == 1:
-            logger.info('Would import VCF file %s into index %s.', file.path, file_index)
-        else:
-            logger.info('Importing VCF file %s into index %s...', file.path, file_index)
-
-        try:
-            vcf_reader = vcf.Reader(filename=file.path)
-            n = 0
-            start = time.perf_counter()
-            bulk_sz = 256 # Bulk size
-            bulk_items: BulkItems = []
-            for record in vcf_reader:
-
-                # Correct values
-                if not record.CHROM.startswith('chr'):
-                    if record.CHROM.lower().startswith('chr'):
-                        record.CHROM = 'chr' + record.CHROM[3:]
-                    else:
-                        record.CHROM = 'chr' + record.CHROM
-
-                # Build document
-                alt = [x if x is None else x.type for x in record.ALT]
-                doc: MetadataDocument = {
-                    'type': 'vcf',
-                    'chr': record.CHROM,
-                    'pos': record.POS,
-                    'alt': alt,
-                    'info': record.INFO,
-                }
-
-                if dryrun == 0:
-
-                    # Append item to bulk
-                    bulk_items.append({"_index": file_index, "_source": doc})
-                    n += 1
-                    # resp = es.index(index=index, document=doc)
-
-                    # Insert bulk of items
-                    if len(bulk_items) >= bulk_sz:
-                        import_items(es, bulk_items, start_time=start,
-                                     total_items=n)
-                        bulk_items = []
-
-            # Insert remaining items
-            if dryrun == 0:
-                import_items(es, bulk_items, start_time=start, total_items=n)
-
-        except StopIteration:
-            logger.error('Skipping empty file : %s.', file.path)
-
-
-def import_processes(es: elasticsearch.Elasticsearch | None, index: str,
-                     processes: WetProcesses | BioInfoProcesses, dryrun: int = 0) -> None:
-    """Import processes into their own index."""
-
-    bulk_items: BulkItems = []
-
-    for proc_id in processes.get_process_ids():
-        process = processes[proc_id]
-        process_type = process.__class__.__name__
-        doc: ProcessDocument = process.data | {'proc_id': proc_id, 'type': process_type}
-        bulk_items.append({"_index": index, "_source": doc})
-
-    if dryrun == 0:
-        start = time.perf_counter()
-        import_items(es, bulk_items, start_time=start, total_items=len(bulk_items))
-
-
-def generate_unique_index(index_prefix: str, filepath: str) -> str:
-    """
-    Generate a unique index with the following format:
-    <index_prefix>_<current_date>_<md5_hashed_filepath>
-    """
-    current_date = datetime.datetime.today().strftime('%Y%m%d')
-    hashed_filepath = hashlib.md5(filepath.encode('utf-8'), usedforsecurity=False).hexdigest()
-    return f"{index_prefix}-file-{current_date}-{hashed_filepath}"
-
-
-def main() -> None:
-    """Entry point of the import script."""
-    # Read command line arguments
-    args = read_args()
-
-    # Configure logging
-    configure_logging(args.verbose, log_file=args.log_file)
-    logger.debug("Arguments: %s", args)
-    logger.debug("LOGGERS: %s", logging.root.manager.loggerDict) # pylint: disable=no-member
-
-    # Open connection to ES
-    es = None
-    if args.dryrun == 0:
-        es = connect_to_es(host=args.es_host, port=args.es_port, usr=args.es_usr,
-                           pwd=args.es_pwd)
-
-    # Create index
-    # es.indices.create(index=args.es_index_prefix)
-
-    # Load YAML import bundle
-    import_bundle = make_import_bundle_from_files(args.files, check=True)
-    all_bundled_files = import_bundle.get_files()
-
-    # CHECK
-    for f in all_bundled_files:
-        if not f.exists():
-            raise RuntimeError(f"Path {f.path} does not point to a valid file.")
-
-    # LIST
-    if not args.no_list:
-        for f in all_bundled_files:
-            logger.info("Will import %s.", f.path)
-
-    # Ask confirmation for importing
-    if not args.no_confirm:
-        answer: str = "maybe"
-        while answer not in ['', 'n', 'y']:
-            answer = input("Import (y/N)? ").lower()
-        if answer != 'y':
-            logger.info("Import canceled.")
-            sys.exit(0)
-
-    # IMPORT
-    # Loop on file categories
-    for cat in import_bundle.analyses.get_all_categories():
-        # Import all files in this category.
-        for f in import_bundle.get_files(cat):
-            logger.info("Import %s files from %s.", cat, f.path)
-            # First, generate a unique index name for each file.
-            file_index = generate_unique_index(args.es_index_prefix, f.path)
-            # Then, import the analysis metadata into a dedicated index.
-            import_analysis_metadata(es, args.es_index_prefix, file_index, f, cat, args.dryrun)
-            # Finally, import the file in its own index.
-            globals()[f'import_{cat}_file'](es=es,
-                                            file_index=file_index, file=f, dryrun=args.dryrun)
-
-    # Import processes
-    logger.info("Importing wet processes.")
-    logger.info("Wet processes IDs = %s", str(import_bundle.wet_processes.get_process_ids()))
-    import_processes(es, f"{args.es_index_prefix}-wet_processes", import_bundle.wet_processes)
-
-    logger.info("Importing bio info processes.")
-    logger.info("Bio info processes IDs = %s", str(import_bundle.bi_processes.get_process_ids()))
-    import_processes(es, f"{args.es_index_prefix}-bi_processes", import_bundle.bi_processes)
-
-
-if __name__ == '__main__':
-    main()
genelastic/info.py DELETED
@@ -1,248 +0,0 @@
-# pylint: disable=missing-module-docstring
-import argparse
-import logging
-import typing
-
-import elasticsearch
-import urllib3
-
-from .logger import configure_logging
-from .common import (add_es_connection_args, connect_to_es, add_verbose_control_args, Bucket,
-                     run_composite_aggregation, get_process_ids)
-
-logger = logging.getLogger('genelastic')
-logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-
-def read_args() -> argparse.Namespace:
-    """Read arguments from command line."""
-    parser = argparse.ArgumentParser(description='ElasticSearch database info.',
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-                                     allow_abbrev=False)
-    add_verbose_control_args(parser)
-    add_es_connection_args(parser)
-    parser.add_argument("-y", "--list-bundles", action="store_true",
-                        help="List all imported YAML bundles.")
-    parser.add_argument("-f", "--list-data-files", action="store_true",
-                        help="List all imported data files.")
-    parser.add_argument("-w", "--list-wet-processes", action="store_true",
-                        help="List all imported wet processes.")
-    parser.add_argument("-b", "--list-bi-processes", action="store_true",
-                        help="List all imported bio info processes.")
-    parser.add_argument("-Y", "--list-data-files-per-bundle", action="store_true",
-                        help="For each imported YAML bundle, "
-                             "display some info and list its data files.")
-    return parser.parse_args()
-
-
-def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
-    """List all imported YAML bundles."""
-
-    query = {
-        "size": 0,
-        "aggs": {
-            "get_bundle_paths": {
-                "composite": {
-                    "sources": {"bundle_path": {"terms": {"field": "bundle_path.keyword"}}},
-                    "size": 1000,
-                }
-            }
-        }
-    }
-
-    buckets: typing.List[Bucket] = run_composite_aggregation(es, index, query)
-
-    print("Imported YAML files")
-    print("===================")
-
-    if len(buckets) == 0:
-        print("Empty response.", end="\n")
-        return
-
-    for bucket in buckets:
-        bundle_path = bucket['key']['bundle_path']
-        print(f'- {bundle_path}')
-    print()
-
-
-def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
-    """List all imported data files."""
-
-    query = {
-        "size": 0,
-        "aggs": {
-            "get_paths": {
-                "composite": {
-                    "sources": {"path": {"terms": {"field": "path.keyword"}}},
-                    "size": 1000,
-                }
-            }
-        }
-    }
-
-    buckets: typing.List[Bucket] = run_composite_aggregation(es, index, query)
-
-    print("Imported data files")
-    print("===================")
-
-    if len(buckets) == 0:
-        print("Empty response.", end="\n")
-        return
-
-    for bucket in buckets:
-        bundle_path = bucket['key']['path']
-        print(f'- {bundle_path}')
-    print()
-
-
-def list_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
-    """List all processes."""
-    process_ids = get_process_ids(es, index, "proc_id")
-
-    if len(process_ids) == 0:
-        print("Empty response.", end="\n")
-        return
-
-    for process_id in process_ids:
-        print(f'- {process_id}')
-    print()
-
-
-def list_wet_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
-    """List all wet processes."""
-    print("Imported wet processes")
-    print("======================")
-    list_processes(es, index)
-
-
-def list_bi_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
-    """List all bio info processes."""
-    print("Imported bi processes")
-    print("=====================")
-    list_processes(es, index)
-
-
-def search_doc_by_field_value(es: elasticsearch.Elasticsearch,
-                              index: str, field: str, value: str) -> (
-        typing.Dict[str, typing.Any] | None):
-    """Search a document by a value for a certain field."""
-    logger.info("Searching for field '%s' with value '%s' inside index '%s'.",
-                field, value, index)
-    search_query = {
-        "query": {
-            "term": {
-                f"{field}.keyword": value,
-            }
-        }
-    }
-
-    response = es.search(index=index, body=search_query)
-
-    try:
-        return response['hits']['hits'][0]['_source'] # type: ignore
-    except KeyError:
-        return None
-
-
-def list_data_files_per_bundle(es: elasticsearch.Elasticsearch, index: str) -> None:
-    """For each imported YAML bundle, display some info and list its data files."""
-    query = {
-        "size": 0,
-        "aggs": {
-            "data_files": {
-                "composite": {
-                    "sources": [
-                        {
-                            "bundle_path": {
-                                "terms": {
-                                    "field": "bundle_path.keyword"
-                                }
-                            }
-                        }
-                    ],
-                    "size": 100
-                },
-                "aggs": {
-                    "docs": {
-                        "top_hits": {
-                            "size": 100
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    buckets: typing.List[Bucket] = run_composite_aggregation(es, index, query)
-
-    print("Data files per YAML bundle")
-    print("==========================")
-
-    if len(buckets) == 0:
-        print("Empty response.", end="\n")
-        return
-
-    for bucket in buckets:
-
-        documents = bucket["docs"]["hits"]["hits"]
-        if len(documents) == 0:
-            continue
-
-        print(f"- Bundle Path: {bucket['key']['bundle_path']}")
-        print(f" -> Wet process: {documents[0]['_source']['metadata']['wet_process']}")
-        print(f" -> Bio info process: {documents[0]['_source']['metadata']['bi_process']}")
-        print(" -> Data files:")
-
-        for doc in documents:
-            print(f" - Index: {doc['_source']['file_index']}")
-            print(f" Path: {doc['_source']['path']}")
-
-        print()
-
-
-def main() -> None:
-    """Entry point of the info script."""
-    args = read_args()
-
-    configure_logging(args.verbose)
-    logger.debug("Arguments: %s", args)
-    es = connect_to_es(host=args.es_host, port=args.es_port, usr=args.es_usr, pwd=args.es_pwd)
-
-    analysis_index = f"{args.es_index_prefix}-analyses"
-    wet_processes_index = f"{args.es_index_prefix}-wet_processes"
-    bi_processes_index = f"{args.es_index_prefix}-bi_processes"
-
-    list_call_count = 0
-
-    if args.list_bundles:
-        list_bundles(es, analysis_index)
-        list_call_count += 1
-
-    if args.list_data_files:
-        list_data_files(es, analysis_index)
-        list_call_count += 1
-
-    if args.list_wet_processes:
-        list_wet_processes(es, wet_processes_index)
-        list_call_count += 1
-
-    if args.list_bi_processes:
-        list_bi_processes(es, bi_processes_index)
-        list_call_count += 1
-
-    if args.list_data_files_per_bundle:
-        list_data_files_per_bundle(es, analysis_index)
-        list_call_count += 1
-
-    if list_call_count == 0:
-        logger.debug("No list option specified, listing everything.")
-        list_bundles(es, analysis_index)
-        list_data_files(es, analysis_index)
-        list_wet_processes(es, wet_processes_index)
-        list_bi_processes(es, bi_processes_index)
-        list_data_files_per_bundle(es, analysis_index)
-
-
-if __name__ == '__main__':
-    main()