genelastic-0.6.1-py3-none-any.whl → genelastic-0.7.0-py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (44)
  1. genelastic/api/extends/example.py +2 -3
  2. genelastic/api/routes.py +160 -23
  3. genelastic/api/server.py +30 -22
  4. genelastic/api/settings.py +3 -2
  5. genelastic/common/__init__.py +36 -9
  6. genelastic/common/cli.py +51 -23
  7. genelastic/common/elastic.py +80 -49
  8. genelastic/common/exceptions.py +0 -2
  9. genelastic/common/types.py +20 -15
  10. genelastic/import_data/__init__.py +23 -5
  11. genelastic/import_data/analyses.py +17 -20
  12. genelastic/import_data/analysis.py +69 -65
  13. genelastic/import_data/bi_process.py +7 -5
  14. genelastic/import_data/bi_processes.py +8 -8
  15. genelastic/import_data/cli_gen_data.py +116 -0
  16. genelastic/import_data/cli_import.py +379 -0
  17. genelastic/import_data/{info.py → cli_info.py} +104 -75
  18. genelastic/import_data/cli_integrity.py +384 -0
  19. genelastic/import_data/cli_validate.py +54 -0
  20. genelastic/import_data/constants.py +11 -32
  21. genelastic/import_data/data_file.py +23 -20
  22. genelastic/import_data/filename_pattern.py +26 -32
  23. genelastic/import_data/import_bundle.py +56 -47
  24. genelastic/import_data/import_bundle_factory.py +166 -158
  25. genelastic/import_data/logger.py +22 -18
  26. genelastic/import_data/random_bundle.py +402 -0
  27. genelastic/import_data/tags.py +46 -26
  28. genelastic/import_data/wet_process.py +8 -4
  29. genelastic/import_data/wet_processes.py +13 -8
  30. genelastic/ui/__init__.py +0 -0
  31. genelastic/ui/server.py +87 -0
  32. genelastic/ui/settings.py +11 -0
  33. genelastic-0.7.0.dist-info/METADATA +105 -0
  34. genelastic-0.7.0.dist-info/RECORD +40 -0
  35. {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
  36. genelastic-0.7.0.dist-info/entry_points.txt +6 -0
  37. genelastic/import_data/gen_data.py +0 -194
  38. genelastic/import_data/import_data.py +0 -292
  39. genelastic/import_data/integrity.py +0 -290
  40. genelastic/import_data/validate_data.py +0 -43
  41. genelastic-0.6.1.dist-info/METADATA +0 -41
  42. genelastic-0.6.1.dist-info/RECORD +0 -36
  43. genelastic-0.6.1.dist-info/entry_points.txt +0 -6
  44. {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0
genelastic/import_data/import_data.py (deleted)
@@ -1,292 +0,0 @@
- # pylint: disable=missing-module-docstring
- # vi: se tw=80
-
- # Elasticsearch Python API:
- # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/overview.html
- # https://elasticsearch-py.readthedocs.io/en/latest/api.html
-
- import argparse
- import csv
- import datetime
- import hashlib
- import logging
- import os
- import sys
- import time
- import vcf # type: ignore
-
- from genelastic.common import (add_verbose_control_args, add_es_connection_args,
-                                ElasticImportConn, MetadataDocument, AnalysisDocument,
-                                BulkItems, ProcessDocument)
-
- from .import_bundle_factory import make_import_bundle_from_files
- from .bi_processes import BioInfoProcesses
- from .data_file import DataFile
- from .logger import configure_logging
- from .wet_processes import WetProcesses
-
- logger = logging.getLogger('genelastic')
- logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
- logging.getLogger('urllib3').setLevel(logging.WARNING) # Disable excessive logging
-
-
- def read_args() -> argparse.Namespace:
-     # pylint: disable=R0801
-     """Read arguments from command line."""
-     parser = argparse.ArgumentParser(description='Genetics data importer.',
-                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-                                      allow_abbrev=False)
-     add_verbose_control_args(parser)
-     add_es_connection_args(parser)
-     parser.add_argument('-D', '--dry-run', dest='dryrun', action='count',
-                         default=0,
-                         help=('Dry-run level. -D for data files loading (VCF, coverage, etc)' +
-                               ' without connecting or importing to database. -DD for metadata' +
-                               ' YAML files loading only (no loading of data files).'))
-     parser.add_argument('--log-file', dest='log_file', help='Path to a log file.')
-     parser.add_argument('--no-list', dest='no_list',
-                         action='store_true',
-                         help='Do not print list of files to be imported.')
-     parser.add_argument('--no-confirm', dest='no_confirm',
-                         action='store_true',
-                         help='Do not ask confirmation before importing.')
-     parser.add_argument('files', type=str, nargs="+", default=None,
-                         help="Data files that describe what to import.")
-     args = parser.parse_args()
-     return args
-
-
- def import_cov_file(es_import_conn: ElasticImportConn | None,
-                     file_index: str, file: str, dryrun: int = 0) -> None:
-     """Import a coverage file to the Elasticsearch database."""
-     # Set field types
-     if dryrun == 0 and es_import_conn:
-         es_import_conn.client.indices.put_mapping(index=file_index,
-                                                   body={'properties': {'pos': {'type': 'integer'},
-                                                                        'depth': {'type': 'byte'}}})
-
-     # Open file
-     if dryrun > 1:
-         logger.info('Would load and import Coverage file %s '
-                     'into index %s.', file, file_index)
-     else:
-         logger.info('Load Coverage file %s.', file)
-         if dryrun == 1:
-             logger.info('Would import Coverage file %s into index %s.', file, file_index)
-         else:
-             logger.info('Import Coverage file %s into index %s.', file, file_index)
-         with open(file, newline='', encoding="utf-8") as f:
-
-             # Read file as CSV
-             reader = csv.reader(f, delimiter='\t', quotechar='"')
-
-             # Loop on al lines
-             for row in reader:
-
-                 # Build document
-                 # Position starts at 0 inside coverage file
-                 doc: MetadataDocument = {
-                     'type': 'coverage',
-                     'chr': row[0],
-                     'pos': int(row[1]) + 1,
-                     'depth': int(row[2])
-                 }
-
-                 # Insert document
-                 if dryrun == 0 and es_import_conn:
-                     es_import_conn.client.index(index=file_index, document=doc)
-
-
- # pylint: disable-next=too-many-arguments, too-many-positional-arguments
- def import_analysis_metadata(es_import_conn: ElasticImportConn | None,
-                              index_prefix: str,
-                              file_index: str,
-                              file: DataFile,
-                              analysis_type: str,
-                              dryrun: int = 0) -> None:
-     """Import analysis metadata into a dedicated index."""
-     doc: AnalysisDocument = {
-         "path": os.path.abspath(file.path),
-         "bundle_path": os.path.abspath(file.bundle_path) if file.bundle_path else None,
-         "metadata": file.metadata,
-         "file_index": file_index,
-         "type": analysis_type
-     }
-
-     bulk_items: BulkItems = [
-         {"_index": f"{index_prefix}-analyses", "_source": doc}
-     ]
-
-     if dryrun == 0 and es_import_conn:
-         es_import_conn.import_items(bulk_items,
-                                     start_time=time.perf_counter(),
-                                     total_items=len(bulk_items)
-                                     )
-
-
- def import_vcf_file(es_import_conn: ElasticImportConn | None,
-                     file_index: str,
-                     file: DataFile,
-                     dryrun: int = 0) -> None:
-     """Import a VCF file to the Elasticsearch database."""
-     logger.info("Import VCF file \"%s\".", file)
-
-     if dryrun > 1:
-         logger.info('Would load and import VCF file %s '
-                     'into index %s.', file.path, file_index)
-     else:
-         logger.info('Load VCF file %s.', file.path)
-         if dryrun == 1:
-             logger.info('Would import VCF file %s into index %s.', file.path, file_index)
-         else:
-             logger.info('Importing VCF file %s into index %s...', file.path, file_index)
-
-         try:
-             vcf_reader = vcf.Reader(filename=file.path)
-             n = 0
-             start = time.perf_counter()
-             bulk_sz = 256 # Bulk size
-             bulk_items: BulkItems = []
-             for record in vcf_reader:
-
-                 # Correct values
-                 if not record.CHROM.startswith('chr'):
-                     if record.CHROM.lower().startswith('chr'):
-                         record.CHROM = 'chr' + record.CHROM[3:]
-                     else:
-                         record.CHROM = 'chr' + record.CHROM
-
-                 # Build document
-                 alt = [x if x is None else x.type for x in record.ALT]
-                 doc: MetadataDocument = {
-                     'type': 'vcf',
-                     'chr': record.CHROM,
-                     'pos': record.POS,
-                     'alt': alt,
-                     'info': record.INFO,
-                 }
-
-                 if dryrun == 0:
-
-                     # Append item to bulk
-                     bulk_items.append({"_index": file_index, "_source": doc})
-                     n += 1
-                     # resp = es.index(index=index, document=doc)
-
-                     # Insert bulk of items
-                     if len(bulk_items) >= bulk_sz and es_import_conn:
-                         es_import_conn.import_items(bulk_items, start_time=start,
-                                                     total_items=n)
-                         bulk_items = []
-
-             # Insert remaining items
-             if dryrun == 0 and es_import_conn:
-                 es_import_conn.import_items(bulk_items, start_time=start, total_items=n)
-
-         except StopIteration:
-             logger.error('Skipping empty file : %s.', file.path)
-
-
- def import_processes(es_import_conn: ElasticImportConn | None, index: str,
-                      processes: WetProcesses | BioInfoProcesses, dryrun: int = 0) -> None:
-     """Import processes into their own index."""
-
-     bulk_items: BulkItems = []
-
-     for proc_id in processes.get_process_ids():
-         process = processes[proc_id]
-         process_type = process.__class__.__name__
-         doc: ProcessDocument = process.data | {'proc_id': proc_id, 'type': process_type}
-         bulk_items.append({"_index": index, "_source": doc})
-
-     if dryrun == 0 and es_import_conn:
-         es_import_conn.import_items(bulk_items,
-                                     start_time=time.perf_counter(),
-                                     total_items=len(bulk_items)
-                                     )
-
-
- def generate_unique_index(index_prefix: str, filepath: str) -> str:
-     """
-     Generate a unique index with the following format:
-     <index_prefix>_<current_date>_<md5_hashed_filepath>
-     """
-     current_date = datetime.datetime.today().strftime('%Y%m%d')
-     hashed_filepath = hashlib.md5(filepath.encode('utf-8'), usedforsecurity=False).hexdigest()
-     return f"{index_prefix}-file-{current_date}-{hashed_filepath}"
-
-
- def main() -> None:
-     """Entry point of the import script."""
-     # Read command line arguments
-     args = read_args()
-
-     # Configure logging
-     configure_logging(args.verbose, log_file=args.log_file)
-     logger.debug("Arguments: %s", args)
-     logger.debug("LOGGERS: %s", logging.root.manager.loggerDict) # pylint: disable=no-member
-
-     # Open connection to ES
-     if args.dryrun == 0:
-         addr = f"https://{args.es_host}:{args.es_port}"
-         logger.info("Trying to connect to Elasticsearch at %s...", addr)
-         es_import_conn = ElasticImportConn(addr, args.es_cert_fp,
-                                            basic_auth=(args.es_usr, args.es_pwd))
-     else:
-         es_import_conn = None
-
-     # Create index
-     # es.indices.create(index=args.es_index_prefix)
-
-     # Load YAML import bundle
-     import_bundle = make_import_bundle_from_files(args.files, check=True)
-     all_bundled_files = import_bundle.get_files()
-
-     # CHECK
-     for f in all_bundled_files:
-         if not f.exists():
-             raise RuntimeError(f"Path {f.path} does not point to a valid file.")
-
-     # LIST
-     if not args.no_list:
-         for f in all_bundled_files:
-             logger.info("Will import %s.", f.path)
-
-     # Ask confirmation for importing
-     if not args.no_confirm:
-         answer: str = "maybe"
-         while answer not in ['', 'n', 'y']:
-             answer = input("Import (y/N)? ").lower()
-         if answer != 'y':
-             logger.info("Import canceled.")
-             sys.exit(0)
-
-     # IMPORT
-     # Loop on file categories
-     for cat in import_bundle.analyses.get_all_categories():
-         # Import all files in this category.
-         for f in import_bundle.get_files(cat):
-             logger.info("Import %s files from %s.", cat, f.path)
-             # First, generate a unique index name for each file.
-             file_index = generate_unique_index(args.es_index_prefix, f.path)
-             # Then, import the analysis metadata into a dedicated index.
-             import_analysis_metadata(es_import_conn, args.es_index_prefix,
-                                      file_index, f, cat, args.dryrun)
-             # Finally, import the file in its own index.
-             globals()[f'import_{cat}_file'](es_import_conn=es_import_conn,
-                                             file_index=file_index, file=f, dryrun=args.dryrun)
-
-     # Import processes
-     logger.info("Importing wet processes.")
-     logger.info("Wet processes IDs = %s", str(import_bundle.wet_processes.get_process_ids()))
-     import_processes(es_import_conn, f"{args.es_index_prefix}-wet_processes",
-                      import_bundle.wet_processes)
-
-     logger.info("Importing bio info processes.")
-     logger.info("Bio info processes IDs = %s", str(import_bundle.bi_processes.get_process_ids()))
-     import_processes(es_import_conn, f"{args.es_index_prefix}-bi_processes",
-                      import_bundle.bi_processes)
-
-
- if __name__ == '__main__':
-     main()
genelastic/import_data/integrity.py (deleted)
@@ -1,290 +0,0 @@
- # pylint: disable=missing-module-docstring
- import argparse
- import logging
- import typing
- from elasticsearch import NotFoundError
-
- from genelastic.common import (ElasticQueryConn, DBIntegrityError, Bucket,
-                                add_verbose_control_args, add_es_connection_args)
-
- from .logger import configure_logging
-
- logger = logging.getLogger('genelastic')
- logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
-
-
- def read_args() -> argparse.Namespace:
-     """Read arguments from command line."""
-     parser = argparse.ArgumentParser(description='Utility to check the integrity '
-                                      'of the genelastic ElasticSearch database.',
-                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-                                      allow_abbrev=False)
-     add_verbose_control_args(parser)
-     add_es_connection_args(parser)
-     return parser.parse_args()
-
-
- def check_for_undefined_file_indices(es_query_conn: ElasticQueryConn, analyses_index: str) -> None:
-     """
-     Check for potentially undefined files indices in the analyses index.
-
-     :param es_query_conn: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :raises genelastic.common.DBIntegrityError:
-         Some files indices are used in the analyses index but are undefined.
-     """
-     logger.info("Checking for references to undefined file indices in the index '%s'...",
-                 analyses_index)
-
-     undefined_indices = set()
-
-     query = {
-         "size": 0,
-         "aggs": {
-             "get_file_indices": {
-                 "composite": {
-                     "sources": {"file_index": {"terms": {"field": "file_index.keyword"}}},
-                     "size": 1000,
-                 }
-             }
-         }
-     }
-
-     buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(analyses_index, query)
-
-     for bucket in buckets:
-         file_index = bucket['key']['file_index']
-
-         try:
-             es_query_conn.client.indices.get(index=file_index)
-             logger.debug("File index %s used in index '%s' is defined.",
-                          file_index, analyses_index)
-         except NotFoundError:
-             logger.debug("File index %s used in '%s' is undefined.",
-                          file_index, analyses_index)
-             undefined_indices.add(file_index)
-
-     if len(undefined_indices) > 0:
-         raise DBIntegrityError(f"Found the following undefined file indices defined "
-                                f"in the index '{analyses_index}': "
-                                f"{", ".join(undefined_indices)}")
-
-     logger.info("All defined file indices are referenced.")
-
-
- def get_undefined_processes(es_query_conn: ElasticQueryConn, analyses_index: str,
-                             process_index: str, field: str) -> typing.Set[str]:
-     """
-     Return a set of undefined processes IDs in an index.
-
-     :param es_query_conn: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param process_index: Name of the index to check for undefined processes.
-     :param field: Field name used to retrieve the process ID.
-     :returns: A set of undefined processes IDs.
-     """
-     query = {
-         "size": 0,
-         "aggs": {
-             "get_analyses_processes": {
-                 "composite": {
-                     "sources": {"process": {"terms": {"field": f"{field}.keyword"}}},
-                     "size": 1000,
-                 }
-             }
-         }
-     }
-
-     buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(analyses_index, query)
-
-     used_processes = set(map(lambda bucket: bucket["key"]["process"], buckets))
-     logger.debug("Used values for field '%s' in index '%s': %s",
-                  field, analyses_index, used_processes)
-
-     defined_processes = es_query_conn.get_field_values(process_index, "proc_id")
-     logger.debug("Defined values in index '%s': %s", process_index, defined_processes)
-
-     return used_processes.difference(defined_processes)
-
-
- def check_for_undefined_wet_processes(es_query_conn: ElasticQueryConn,
-                                       analyses_index: str, wet_process_index: str) -> None:
-     """
-     Check that each wet process used in the analyses index is defined.
-
-     :param es_query_conn: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param wet_process_index: Name of the index where wet processes are stored.
-     :raises genelastic.common.DBIntegrityError:
-         Some wet processes used in the analyses index are undefined.
-     """
-     logger.info("Checking for undefined wet processes used in index '%s'...", analyses_index)
-     undefined_wet_processes = get_undefined_processes(es_query_conn,
-                                                       analyses_index, wet_process_index,
-                                                       "metadata.wet_process")
-
-     if len(undefined_wet_processes) > 0:
-         raise DBIntegrityError(f"Index '{analyses_index}' uses the following "
-                                f"undefined wet processes: {", ".join(undefined_wet_processes)}.")
-
-     logger.info("All wet processes used in index '%s' are defined.", wet_process_index)
-
-
- def check_for_undefined_bi_processes(es_query_conn: ElasticQueryConn,
-                                      analyses_index: str, bi_process_index: str) -> None:
-     """
-     Check that each bio info process used in the analyses index is defined.
-
-     :param es_query_conn: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param bi_process_index: Name of the index where bio info processes are stored.
-     :raises genelastic.common.DBIntegrityError:
-         Some bio info processes used in the analyses index are undefined.
-     """
-     logger.info("Checking for undefined bio info processes used in index '%s'...",
-                 analyses_index)
-     undefined_bi_processes = get_undefined_processes(es_query_conn, analyses_index,
-                                                      bi_process_index,
-                                                      "metadata.bi_process")
-
-     if len(undefined_bi_processes) > 0:
-         raise DBIntegrityError(f"Index '{analyses_index}' uses the following "
-                                f"undefined bio info processes: "
-                                f"{", ".join(undefined_bi_processes)}.")
-
-     logger.info("All bio info processes used in index '%s' are defined.", bi_process_index)
-
-
- def check_for_unused_file_indices(es_query_conn: ElasticQueryConn,
-                                   analyses_index: str, index_prefix: str) -> int:
-     """
-     Check that each of the file indices are used in at least one analysis.
-
-     :param es_query_conn: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param index_prefix: Prefix given to all the indices of the ElasticSearch database.
-     :returns: 1 if some file indices exists but are unused in the analyses index,
-         and 0 otherwise.
-     """
-     json_indices = (es_query_conn.client.cat.
-                     indices(index=f"{index_prefix}-file-*", format="json").body)
-     found_file_indices = set(map(lambda x: x["index"], json_indices))
-
-     query = {
-         "size": 0,
-         "aggs": {
-             "get_file_indices": {
-                 "composite": {
-                     "sources": {"file_index": {"terms": {"field": "file_index.keyword"}}},
-                     "size": 1000,
-                 }
-             }
-         }
-     }
-
-     buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(analyses_index, query)
-
-     used_files_indices = set(map(lambda bucket: bucket['key']['file_index'], buckets))
-     unused_files_indices = found_file_indices.difference(used_files_indices)
-
-     if len(unused_files_indices) > 0:
-         logger.warning("Found the following unused files indices: %s",
-                        ", ".join(unused_files_indices))
-         return 1
-
-     logger.info("All files indices are used.")
-     return 0
-
-
- def check_for_unused_wet_processes(es_query_conn: ElasticQueryConn, analyses_index: str,
-                                    wet_proc_index: str) -> int:
-     """
-     Check for defined wet processes that are not used in the analyses index.
-
-     :param es_query_conn: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param wet_proc_index: Name of the index where wet processes are stored.
-     :returns: 1 if some wet process are defined but unused in the analyses index,
-         and 0 otherwise.
-     """
-     logger.info("Checking for unused wet processes in the index '%s'...", wet_proc_index)
-
-     defined_wet_procs = es_query_conn.get_field_values(wet_proc_index, "proc_id")
-     logger.debug("Found the following defined wet processes: %s", defined_wet_procs)
-
-     used_wet_procs = es_query_conn.get_field_values(analyses_index, "metadata.wet_process")
-     logger.debug("Following processes are used in the index '%s': %s",
-                  analyses_index, used_wet_procs)
-
-     unused_wet_procs = defined_wet_procs - used_wet_procs
-     if len(unused_wet_procs) > 0:
-         logger.warning("Found unused wet processes: %s", unused_wet_procs)
-         return 1
-
-     logger.info("No unused wet processes found.")
-     return 0
-
-
- def check_for_unused_bi_processes(es_query_conn: ElasticQueryConn, analyses_index: str,
-                                   bi_proc_index: str) -> int:
-     """
-     Check for defined bio info processes that are not used in the analyses index.
-
-     :param es_query_conn: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param bi_proc_index: Name of the index where bio info processes are stored.
-     :returns: 1 if some wet process are defined but unused in the analyses index,
-         and 0 otherwise.
-     """
-     logger.info("Checking for unused bio info processes in the index '%s'...", bi_proc_index)
-
-     defined_bi_procs = es_query_conn.get_field_values(bi_proc_index, "proc_id")
-     logger.debug("Found the following defined bio info processes: %s", defined_bi_procs)
-
-     used_bi_procs = es_query_conn.get_field_values(analyses_index, "metadata.bi_process")
-     logger.debug("Following processes are used in the index '%s': %s",
-                  analyses_index, used_bi_procs)
-
-     unused_bi_procs = defined_bi_procs - used_bi_procs
-     if len(unused_bi_procs) > 0:
-         logger.warning("Found unused bio info processes: %s", unused_bi_procs)
-         return 1
-
-     logger.info("No unused bio info processes found.")
-     return 0
-
-
- def main() -> None:
-     """Entry point of the integrity script."""
-     args = read_args()
-
-     configure_logging(args.verbose)
-     logger.debug("Arguments: %s", args)
-
-     analyses_index = f"{args.es_index_prefix}-analyses"
-     wet_processes_index = f"{args.es_index_prefix}-wet_processes"
-     bi_processes_index = f"{args.es_index_prefix}-bi_processes"
-
-     addr = f"https://{args.es_host}:{args.es_port}"
-     logger.info("Trying to connect to Elasticsearch at %s...", addr)
-     es_query_conn = ElasticQueryConn(addr, args.es_cert_fp,
-                                      basic_auth=(args.es_usr, args.es_pwd))
-
-     # Fatal errors
-     try:
-         es_query_conn.ensure_unique(wet_processes_index, "proc_id")
-         es_query_conn.ensure_unique(bi_processes_index, "proc_id")
-         check_for_undefined_file_indices(es_query_conn, analyses_index)
-         check_for_undefined_wet_processes(es_query_conn, analyses_index, wet_processes_index)
-         check_for_undefined_bi_processes(es_query_conn, analyses_index, bi_processes_index)
-     except DBIntegrityError as e:
-         raise SystemExit(e) from e
-
-     # Warnings
-     check_for_unused_wet_processes(es_query_conn, analyses_index, wet_processes_index)
-     check_for_unused_bi_processes(es_query_conn, analyses_index, bi_processes_index)
-     check_for_unused_file_indices(es_query_conn, analyses_index, args.es_index_prefix)
-
-
- if __name__ == '__main__':
-     main()
genelastic/import_data/validate_data.py (deleted)
@@ -1,43 +0,0 @@
- # pylint: disable=missing-module-docstring
- import argparse
- import logging
-
- from schema import SchemaError # type: ignore
-
- from genelastic.common import add_verbose_control_args
-
- from .logger import configure_logging
- from .import_bundle_factory import make_import_bundle_from_files
-
- logger = logging.getLogger('genelastic')
-
-
- def read_args() -> argparse.Namespace:
-     """Read arguments from command line."""
-     parser = argparse.ArgumentParser(description="Ensure that YAML files "
-                                      "follow the genelastic YAML bundle schema.",
-                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-                                      allow_abbrev=False)
-     add_verbose_control_args(parser)
-     parser.add_argument('files', type=str, nargs="+", default=None,
-                         help="YAML files to validate.")
-     parser.add_argument('-c', '--check', action='store_true',
-                         help="In addition to validating the schema, "
-                              "check for undefined referenced processes.")
-     return parser.parse_args()
-
-
- def main() -> int:
-     """Entry point of the validate script."""
-     args = read_args()
-     configure_logging(args.verbose)
-
-     try:
-         make_import_bundle_from_files(args.files, check=args.check)
-     except (ValueError, RuntimeError, SchemaError) as e:
-         # Catch any exception that can be raised by 'make_import_bundle_from_files'.
-         logger.error(e)
-         return 1
-
-     logger.info("All YAML files respect the genelastic YAML bundle format.")
-     return 0
genelastic-0.6.1.dist-info/METADATA (deleted)
@@ -1,41 +0,0 @@
- Metadata-Version: 2.1
- Name: genelastic
- Version: 0.6.1
- Summary: Generate and store genetic data into an Elasticsearch database.
- Author: CNRGH
- Author-email: Pierrick ROGER <pierrick.roger@cnrgh.fr>, Maxime BLANCHON <maxime.blanchon@cnrgh.fr>
- License: CeCILL
- Keywords: CNRGH,genelastic,generation,storage,elasticsearch,database
- Classifier: Development Status :: 3 - Alpha
- Classifier: Intended Audience :: Science/Research
- Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
- Classifier: Programming Language :: Python :: 3.11
- Requires-Python: >=3.11
- Description-Content-Type: text/markdown
- Requires-Dist: elasticsearch
- Requires-Dist: PyVCF3
- Requires-Dist: schema
- Requires-Dist: PyYAML
- Requires-Dist: biophony >=1.0.1
- Requires-Dist: colorlog
- Provides-Extra: api
- Requires-Dist: flask ; extra == 'api'
- Requires-Dist: elasticsearch ; extra == 'api'
- Requires-Dist: environs ; extra == 'api'
- Requires-Dist: connexion[flask,swagger-ui,uvicorn] ; extra == 'api'
- Provides-Extra: docs
- Requires-Dist: sphinx ; extra == 'docs'
- Requires-Dist: sphinx-autoapi ; extra == 'docs'
- Requires-Dist: furo ; extra == 'docs'
- Provides-Extra: tests
- Requires-Dist: pytest ; extra == 'tests'
- Requires-Dist: mypy ; extra == 'tests'
- Requires-Dist: pylint ; extra == 'tests'
- Requires-Dist: bandit ; extra == 'tests'
- Requires-Dist: coverage ; extra == 'tests'
- Requires-Dist: yamllint ; extra == 'tests'
- Requires-Dist: types-PyYAML ; extra == 'tests'
-
- # genelastic
-
- Storing of genetics data into an Elasticsearch database.