genelastic 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. genelastic/__init__.py +0 -13
  2. genelastic/api/__init__.py +0 -0
  3. genelastic/api/extends/__init__.py +0 -0
  4. genelastic/api/extends/example.py +6 -0
  5. genelastic/api/routes.py +221 -0
  6. genelastic/api/server.py +80 -0
  7. genelastic/api/settings.py +14 -0
  8. genelastic/common/__init__.py +39 -0
  9. genelastic/common/cli.py +63 -0
  10. genelastic/common/elastic.py +214 -0
  11. genelastic/common/exceptions.py +4 -0
  12. genelastic/common/types.py +25 -0
  13. genelastic/import_data/__init__.py +27 -0
  14. genelastic/{analyses.py → import_data/analyses.py} +19 -20
  15. genelastic/{analysis.py → import_data/analysis.py} +71 -66
  16. genelastic/{bi_process.py → import_data/bi_process.py} +8 -6
  17. genelastic/{bi_processes.py → import_data/bi_processes.py} +10 -9
  18. genelastic/import_data/cli_gen_data.py +116 -0
  19. genelastic/import_data/cli_import.py +379 -0
  20. genelastic/import_data/cli_info.py +256 -0
  21. genelastic/import_data/cli_integrity.py +384 -0
  22. genelastic/import_data/cli_validate.py +54 -0
  23. genelastic/import_data/constants.py +24 -0
  24. genelastic/{data_file.py → import_data/data_file.py} +26 -21
  25. genelastic/import_data/filename_pattern.py +57 -0
  26. genelastic/{import_bundle.py → import_data/import_bundle.py} +58 -48
  27. genelastic/import_data/import_bundle_factory.py +298 -0
  28. genelastic/{logger.py → import_data/logger.py} +22 -18
  29. genelastic/import_data/random_bundle.py +402 -0
  30. genelastic/{tags.py → import_data/tags.py} +48 -27
  31. genelastic/{wet_process.py → import_data/wet_process.py} +8 -4
  32. genelastic/{wet_processes.py → import_data/wet_processes.py} +15 -9
  33. genelastic/ui/__init__.py +0 -0
  34. genelastic/ui/server.py +87 -0
  35. genelastic/ui/settings.py +11 -0
  36. genelastic-0.7.0.dist-info/METADATA +105 -0
  37. genelastic-0.7.0.dist-info/RECORD +40 -0
  38. {genelastic-0.6.0.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
  39. genelastic-0.7.0.dist-info/entry_points.txt +6 -0
  40. genelastic/common.py +0 -151
  41. genelastic/constants.py +0 -45
  42. genelastic/filename_pattern.py +0 -62
  43. genelastic/gen_data.py +0 -193
  44. genelastic/import_bundle_factory.py +0 -288
  45. genelastic/import_data.py +0 -294
  46. genelastic/info.py +0 -248
  47. genelastic/integrity.py +0 -324
  48. genelastic/validate_data.py +0 -41
  49. genelastic-0.6.0.dist-info/METADATA +0 -36
  50. genelastic-0.6.0.dist-info/RECORD +0 -25
  51. genelastic-0.6.0.dist-info/entry_points.txt +0 -6
  52. {genelastic-0.6.0.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0
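
The headline change in 0.7.0 is a repackaging rather than new behavior: the flat genelastic/ modules move into an import_data/ subpackage, shared helpers split out of common.py into a common/ subpackage, and new api/ and ui/ server packages appear. Downstream code that imported the flat 0.6.0 layout will need its import paths updated; a minimal sketch of the likely before/after, with the 0.7.0 names inferred from the file list above rather than verified:

# Import paths across the 0.6.0 -> 0.7.0 repackaging.
# The 0.7.0 paths below are inferred from the file list above and from the
# new genelastic/import_data/__init__.py (+27 lines); treat them as assumptions
# until checked against the installed package.

# 0.6.0 (flat layout):
#   from genelastic import make_import_bundle_from_files
#   from genelastic.common import connect_to_es

# 0.7.0 (subpackages), assuming the same names are re-exported:
from genelastic.common import connect_to_es
from genelastic.import_data import make_import_bundle_from_files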
genelastic/integrity.py DELETED
@@ -1,324 +0,0 @@
- # pylint: disable=missing-module-docstring
- import argparse
- import logging
- import typing
-
- import elasticsearch
- import urllib3
- from elasticsearch import NotFoundError
-
- from .common import (add_verbose_control_args, add_es_connection_args,
-                      connect_to_es, get_process_ids, Bucket, run_composite_aggregation)
- from .logger import configure_logging
-
-
- logger = logging.getLogger('genelastic')
- logging.getLogger('elastic_transport').setLevel(logging.WARNING)  # Disable excessive logging.
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
- class DBIntegrityError(Exception):
-     """Integrity error, raised when the database content
-     does not match the expected data schema.
-     """
-
- def read_args() -> argparse.Namespace:
-     """Read arguments from the command line."""
-     parser = argparse.ArgumentParser(description='Utility to check the integrity '
-                                                  'of the genelastic ElasticSearch database.',
-                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-                                      allow_abbrev=False)
-     add_verbose_control_args(parser)
-     add_es_connection_args(parser)
-     return parser.parse_args()
-
-
- def ensure_unique(es: elasticsearch.Elasticsearch, index: str, field: str) -> None:
-     """
-     Ensure that all values of a field in an index are unique.
-
-     :param es: Elasticsearch database instance.
-     :param index: Name of the index.
-     :param field: Field name to check for value uniqueness.
-     :raises DBIntegrityError: Some values of the given field are duplicated in the index.
-     """
-     logger.info("Ensuring that the field '%s' in the index '%s' only contains unique values...",
-                 field, index)
-     query = {
-         "size": 0,
-         "aggs": {
-             "duplicate_proc_ids": {
-                 "terms": {
-                     "field": f"{field}.keyword",
-                     "size": 10000,
-                     "min_doc_count": 2
-                 }
-             }
-         }
-     }
-     buckets: typing.List[Bucket] = run_composite_aggregation(es, index, query)
-     duplicated_processes: typing.Set[str] = set(map(lambda bucket: str(bucket["key"]), buckets))
-
-     if len(duplicated_processes) > 0:
-         raise DBIntegrityError(f"Found non-unique values for field '{field}' in index '{index}': "
-                                f"{', '.join(duplicated_processes)}.")
-
-     logger.info("All values of field '%s' in index '%s' are unique.",
-                 field, index)
-
-
- def check_for_undefined_file_indices(es: elasticsearch.Elasticsearch, analyses_index: str) -> None:
-     """
-     Check for references to undefined file indices in the analyses index.
-
-     :param es: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :raises DBIntegrityError: Some file indices are referenced in the analyses index
-         but are undefined.
-     """
-     logger.info("Checking for references to undefined file indices in the index '%s'...",
-                 analyses_index)
-
-     undefined_indices = set()
-
-     query = {
-         "size": 0,
-         "aggs": {
-             "get_file_indices": {
-                 "composite": {
-                     "sources": {"file_index": {"terms": {"field": "file_index.keyword"}}},
-                     "size": 1000,
-                 }
-             }
-         }
-     }
-
-     buckets: typing.List[Bucket] = run_composite_aggregation(es, analyses_index, query)
-
-     for bucket in buckets:
-         file_index = bucket['key']['file_index']
-
-         try:
-             es.indices.get(index=file_index)
-             logger.debug("File index %s used in index '%s' is defined.",
-                          file_index, analyses_index)
-         except NotFoundError:
-             logger.debug("File index %s used in '%s' is undefined.",
-                          file_index, analyses_index)
-             undefined_indices.add(file_index)
-
-     if len(undefined_indices) > 0:
-         raise DBIntegrityError(f"Found the following undefined file indices referenced "
-                                f"in the index '{analyses_index}': "
-                                f"{', '.join(undefined_indices)}")
-
-     logger.info("All file indices referenced in the analyses index are defined.")
-
-
- def get_undefined_processes(es: elasticsearch.Elasticsearch, analyses_index: str,
-                             process_index: str, field: str) -> typing.Set[str]:
-     """
-     Return the set of process IDs that are used in the analyses index
-     but undefined in the process index.
-
-     :param es: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param process_index: Name of the index to check for undefined processes.
-     :param field: Field name used to retrieve the process ID.
-     :returns: A set of undefined process IDs.
-     """
-     query = {
-         "size": 0,
-         "aggs": {
-             "get_analyses_processes": {
-                 "composite": {
-                     "sources": {"process": {"terms": {"field": f"{field}.keyword"}}},
-                     "size": 1000,
-                 }
-             }
-         }
-     }
-
-     buckets: typing.List[Bucket] = run_composite_aggregation(es, analyses_index, query)
-
-     used_processes = set(map(lambda bucket: bucket["key"]["process"], buckets))
-     logger.debug("Used values for field '%s' in index '%s': %s",
-                  field, analyses_index, used_processes)
-
-     defined_processes = get_process_ids(es, process_index, "proc_id")
-     logger.debug("Defined values in index '%s': %s", process_index, defined_processes)
-
-     return used_processes.difference(defined_processes)
-
-
- def check_for_undefined_wet_processes(es: elasticsearch.Elasticsearch,
-                                       analyses_index: str, wet_process_index: str) -> None:
-     """
-     Check that each wet process used in the analyses index is defined.
-
-     :param es: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param wet_process_index: Name of the index where wet processes are stored.
-     :raises DBIntegrityError: Some wet processes used in the analyses index are undefined.
-     """
-     logger.info("Checking for undefined wet processes used in index '%s'...", analyses_index)
-     undefined_wet_processes = get_undefined_processes(es, analyses_index, wet_process_index,
-                                                       "metadata.wet_process")
-
-     if len(undefined_wet_processes) > 0:
-         raise DBIntegrityError(f"Index '{analyses_index}' uses the following "
-                                f"undefined wet processes: {', '.join(undefined_wet_processes)}.")
-
-     logger.info("All wet processes used in index '%s' are defined.", analyses_index)
-
-
- def check_for_undefined_bi_processes(es: elasticsearch.Elasticsearch,
-                                      analyses_index: str, bi_process_index: str) -> None:
-     """
-     Check that each bio info process used in the analyses index is defined.
-
-     :param es: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param bi_process_index: Name of the index where bio info processes are stored.
-     :raises DBIntegrityError: Some bio info processes used in the analyses index are undefined.
-     """
-     logger.info("Checking for undefined bio info processes used in index '%s'...", analyses_index)
-     undefined_bi_processes = get_undefined_processes(es, analyses_index, bi_process_index,
-                                                      "metadata.bi_process")
-
-     if len(undefined_bi_processes) > 0:
-         raise DBIntegrityError(f"Index '{analyses_index}' uses the following "
-                                f"undefined bio info processes: "
-                                f"{', '.join(undefined_bi_processes)}.")
-
-     logger.info("All bio info processes used in index '%s' are defined.", analyses_index)
-
-
- def check_for_unused_file_indices(es: elasticsearch.Elasticsearch,
-                                   analyses_index: str, index_prefix: str) -> int:
-     """
-     Check that each of the file indices is used in at least one analysis.
-
-     :param es: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param index_prefix: Prefix given to all the indices of the ElasticSearch database.
-     :returns: 1 if some file indices exist but are unused in the analyses index,
-         and 0 otherwise.
-     """
-     json_indices = es.cat.indices(index=f"{index_prefix}-file-*", format="json").body
-     found_file_indices = set(map(lambda x: x["index"], json_indices))
-
-     query = {
-         "size": 0,
-         "aggs": {
-             "get_file_indices": {
-                 "composite": {
-                     "sources": {"file_index": {"terms": {"field": "file_index.keyword"}}},
-                     "size": 1000,
-                 }
-             }
-         }
-     }
-
-     buckets: typing.List[Bucket] = run_composite_aggregation(es, analyses_index, query)
-
-     used_file_indices = set(map(lambda bucket: bucket['key']['file_index'], buckets))
-     unused_file_indices = found_file_indices.difference(used_file_indices)
-
-     if len(unused_file_indices) > 0:
-         logger.warning("Found the following unused file indices: %s",
-                        ", ".join(unused_file_indices))
-         return 1
-
-     logger.info("All file indices are used.")
-     return 0
-
-
- def check_for_unused_wet_processes(es: elasticsearch.Elasticsearch, analyses_index: str,
-                                    wet_proc_index: str) -> int:
-     """
-     Check for defined wet processes that are not used in the analyses index.
-
-     :param es: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param wet_proc_index: Name of the index where wet processes are stored.
-     :returns: 1 if some wet processes are defined but unused in the analyses index,
-         and 0 otherwise.
-     """
-     logger.info("Checking for unused wet processes in the index '%s'...", wet_proc_index)
-
-     defined_wet_procs = get_process_ids(es, wet_proc_index, "proc_id")
-     logger.debug("Found the following defined wet processes: %s", defined_wet_procs)
-
-     used_wet_procs = get_process_ids(es, analyses_index, "metadata.wet_process")
-     logger.debug("The following processes are used in the index '%s': %s",
-                  analyses_index, used_wet_procs)
-
-     unused_wet_procs = defined_wet_procs - used_wet_procs
-     if len(unused_wet_procs) > 0:
-         logger.warning("Found unused wet processes: %s", unused_wet_procs)
-         return 1
-
-     logger.info("No unused wet processes found.")
-     return 0
-
-
- def check_for_unused_bi_processes(es: elasticsearch.Elasticsearch, analyses_index: str,
-                                   bi_proc_index: str) -> int:
-     """
-     Check for defined bio info processes that are not used in the analyses index.
-
-     :param es: Elasticsearch database instance.
-     :param analyses_index: Name of the index where analyses are stored.
-     :param bi_proc_index: Name of the index where bio info processes are stored.
-     :returns: 1 if some bio info processes are defined but unused in the analyses index,
-         and 0 otherwise.
-     """
-     logger.info("Checking for unused bio info processes in the index '%s'...", bi_proc_index)
-
-     defined_bi_procs = get_process_ids(es, bi_proc_index, "proc_id")
-     logger.debug("Found the following defined bio info processes: %s", defined_bi_procs)
-
-     used_bi_procs = get_process_ids(es, analyses_index, "metadata.bi_process")
-     logger.debug("The following processes are used in the index '%s': %s",
-                  analyses_index, used_bi_procs)
-
-     unused_bi_procs = defined_bi_procs - used_bi_procs
-     if len(unused_bi_procs) > 0:
-         logger.warning("Found unused bio info processes: %s", unused_bi_procs)
-         return 1
-
-     logger.info("No unused bio info processes found.")
-     return 0
-
-
- def main() -> None:
-     """Entry point of the integrity script."""
-     args = read_args()
-
-     configure_logging(args.verbose)
-     logger.debug("Arguments: %s", args)
-
-     analyses_index = f"{args.es_index_prefix}-analyses"
-     wet_processes_index = f"{args.es_index_prefix}-wet_processes"
-     bi_processes_index = f"{args.es_index_prefix}-bi_processes"
-
-     es = connect_to_es(host=args.es_host, port=args.es_port, usr=args.es_usr, pwd=args.es_pwd)
-
-     # Fatal errors
-     try:
-         ensure_unique(es, wet_processes_index, "proc_id")
-         ensure_unique(es, bi_processes_index, "proc_id")
-         check_for_undefined_file_indices(es, analyses_index)
-         check_for_undefined_wet_processes(es, analyses_index, wet_processes_index)
-         check_for_undefined_bi_processes(es, analyses_index, bi_processes_index)
-     except DBIntegrityError as e:
-         raise SystemExit(e) from e
-
-     # Warnings
-     check_for_unused_wet_processes(es, analyses_index, wet_processes_index)
-     check_for_unused_bi_processes(es, analyses_index, bi_processes_index)
-     check_for_unused_file_indices(es, analyses_index, args.es_index_prefix)
-
-
- if __name__ == '__main__':
-     main()
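
The ensure_unique() check above is just a terms aggregation with min_doc_count: 2, so the same duplicate detection can be reproduced with a few lines of elasticsearch-py outside the CLI. A minimal sketch, assuming an elasticsearch-py 8.x client and hypothetical index name, field, and credentials:

# A minimal sketch of the duplicate-value check, outside the genelastic CLI.
# Index name, field, and connection settings are hypothetical; assumes
# elasticsearch-py 8.x against a local cluster.
from elasticsearch import Elasticsearch

es = Elasticsearch("https://localhost:9200",
                   basic_auth=("elastic", "changeme"),  # hypothetical credentials
                   verify_certs=False)

# A terms aggregation with min_doc_count=2 returns only values occurring in
# two or more documents, i.e. exactly the duplicates ensure_unique() rejects.
resp = es.search(index="genelastic-wet_processes",  # hypothetical index name
                 size=0,
                 aggs={"dupes": {"terms": {"field": "proc_id.keyword",
                                           "size": 10000,
                                           "min_doc_count": 2}}})

duplicates = [b["key"] for b in resp["aggregations"]["dupes"]["buckets"]]
if duplicates:
    raise SystemExit(f"Non-unique proc_id values: {', '.join(duplicates)}")
print("All proc_id values are unique.")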
genelastic/validate_data.py DELETED
@@ -1,41 +0,0 @@
- # pylint: disable=missing-module-docstring
- import argparse
- import logging
-
- from schema import SchemaError  # type: ignore[import-untyped]
-
- from . import make_import_bundle_from_files
- from .common import add_verbose_control_args
- from .logger import configure_logging
-
- logger = logging.getLogger('genelastic')
-
-
- def read_args() -> argparse.Namespace:
-     """Read arguments from the command line."""
-     parser = argparse.ArgumentParser(description="Ensure that YAML files "
-                                                  "follow the genelastic YAML bundle schema.",
-                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-     add_verbose_control_args(parser)
-     parser.add_argument('files', type=str, nargs="+", default=None,
-                         help="YAML files to validate.")
-     parser.add_argument('-c', '--check', action='store_true',
-                         help="In addition to validating the schema, "
-                              "check for undefined referenced processes.")
-     return parser.parse_args()
-
-
- def main() -> int:
-     """Entry point of the validate script."""
-     args = read_args()
-     configure_logging(args.verbose)
-
-     try:
-         make_import_bundle_from_files(args.files, check=args.check)
-     except (ValueError, RuntimeError, SchemaError) as e:
-         # Catch any exception that can be raised by 'make_import_bundle_from_files'.
-         logger.error(e)
-         return 1
-
-     logger.info("All YAML files respect the genelastic YAML bundle format.")
-     return 0
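
Since main() only wraps make_import_bundle_from_files() in argument parsing and error handling, the same validation can be driven programmatically. A sketch assuming the 0.6.0 public names shown above:

# A sketch of driving the same validation from Python rather than the
# 'validate' console script; assumes genelastic 0.6.0 as shown above.
import logging

from schema import SchemaError  # type: ignore[import-untyped]

from genelastic import make_import_bundle_from_files

logging.basicConfig(level=logging.INFO)

def validate_bundles(paths: list[str], check: bool = False) -> bool:
    """Return True if every YAML file follows the bundle schema."""
    try:
        # check=True additionally verifies that referenced processes are defined.
        make_import_bundle_from_files(paths, check=check)
    except (ValueError, RuntimeError, SchemaError) as err:
        logging.error("Validation failed: %s", err)
        return False
    return True

if __name__ == "__main__":
    ok = validate_bundles(["bundle.yml"], check=True)  # hypothetical file name
    print("valid" if ok else "invalid")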
genelastic-0.6.0.dist-info/METADATA DELETED
@@ -1,36 +0,0 @@
- Metadata-Version: 2.1
- Name: genelastic
- Version: 0.6.0
- Summary: Generate and store genetic data into an Elasticsearch database.
- Author: CNRGH
- Author-email: Pierrick ROGER <pierrick.roger@cnrgh.fr>, Maxime BLANCHON <maxime.blanchon@cnrgh.fr>
- License: CeCILL
- Keywords: CNRGH,genelastic,generation,storage,elasticsearch,database
- Classifier: Development Status :: 3 - Alpha
- Classifier: Intended Audience :: Science/Research
- Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
- Classifier: Programming Language :: Python :: 3.11
- Requires-Python: >=3.11
- Description-Content-Type: text/markdown
- Requires-Dist: elasticsearch
- Requires-Dist: PyVCF3
- Requires-Dist: schema
- Requires-Dist: PyYAML
- Requires-Dist: biophony >=1.0.1
- Requires-Dist: colorlog
- Provides-Extra: docs
- Requires-Dist: sphinx ; extra == 'docs'
- Requires-Dist: sphinx-autoapi ; extra == 'docs'
- Requires-Dist: furo ; extra == 'docs'
- Provides-Extra: tests
- Requires-Dist: pytest ; extra == 'tests'
- Requires-Dist: mypy ; extra == 'tests'
- Requires-Dist: pylint <3.3,>=3.2 ; extra == 'tests'
- Requires-Dist: bandit ; extra == 'tests'
- Requires-Dist: coverage ; extra == 'tests'
- Requires-Dist: yamllint ; extra == 'tests'
- Requires-Dist: types-PyYAML ; extra == 'tests'
-
- # genelastic
-
- Storing of genetic data into an Elasticsearch database.
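
Once installed, the fields above are queryable at runtime through importlib.metadata, which is occasionally handy for debugging dependency resolution. A short sketch, assuming genelastic 0.6.0 is installed in the current environment:

# A sketch of inspecting the metadata above at runtime; assumes genelastic
# 0.6.0 is installed in the current environment (Python 3.11+).
from importlib.metadata import metadata, requires, version

print(version("genelastic"))               # "0.6.0"
print(metadata("genelastic")["Summary"])   # the Summary field above

# requires() returns the Requires-Dist lines, including extras markers
# such as "pytest ; extra == 'tests'".
for req in requires("genelastic") or []:
    print(req)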
genelastic-0.6.0.dist-info/RECORD DELETED
@@ -1,25 +0,0 @@
- genelastic/__init__.py,sha256=lMTq5VsAuRjNlf3eAEqGE-Yvht63IJ0nIf9z_1hwC00,486
- genelastic/analyses.py,sha256=UTvNIhZpK2zF77zg02ftyAdUNpWhTwQJeqb2scU2b_Y,1961
- genelastic/analysis.py,sha256=N8oo8uXoFbdLb7C1_67rTFEzV962G_CIqlaEE9IPjiM,7876
- genelastic/bi_process.py,sha256=CT4AFFv-pyJceKnYCHKS7SKGhVuSxOJUw5CXSbED15s,698
- genelastic/bi_processes.py,sha256=ciGQyoR4Tuxhoq4FPK6b0O92AzGLgijVGqS19QeMg6I,1405
- genelastic/common.py,sha256=22SDJJmED2bQygO7GjXFfnB-KH0UujoH867bpz2OAQ4,6065
- genelastic/constants.py,sha256=Y-3i8VlMFTcS158tsbBjvjsCvnWXVXE-Y3A2QOgcoOE,935
- genelastic/data_file.py,sha256=QzOOThuCRlWg_iiH3-6FnYZaVgDVfJI0YxZ0Eoz30kc,2709
- genelastic/filename_pattern.py,sha256=IDQ9ffXxISJ6VMineu-qxnxZgjyejhVVesWIyUhbriE,1980
- genelastic/gen_data.py,sha256=s8-wTh7O7tyuszcIQC4dP1_kVyWLFMhtQMhQLL2JlD8,6922
- genelastic/import_bundle.py,sha256=ZqiKi5BYBo4by2FWBsS5qGyDRn7xxLtSb3ks1SqySNc,4865
- genelastic/import_bundle_factory.py,sha256=nK-VlJATgCNnJSTQotOva89j9H5pdJqU58u1QtlqJkA,9080
- genelastic/import_data.py,sha256=SENK1_Khw88Jgs8EXvDwk9jhQidiETxmAVhn9ag6jNs,11489
- genelastic/info.py,sha256=3fk1fPrpfK8oRo1WnABNDSGdEpq1G6wvCW_D8meyHss,7789
- genelastic/integrity.py,sha256=ypXl9kAdnsxa7LgZ9nDgsklBqVlG9I61A5hqfGeGYgs,13090
- genelastic/logger.py,sha256=eV_LACPjkIg3G_D5g0oTcIRZL86E_iQ2UM_t0CwEkUI,1835
- genelastic/tags.py,sha256=xHCLWgnXcLUUKN3zthQXoJ7yjEhPoQi7JLvdMtB6T5c,3913
- genelastic/validate_data.py,sha256=V0f7fFTs5FkVU8NoBfDI7mQDwITzW_QXt3bj5OgsdzQ,1531
- genelastic/wet_process.py,sha256=uhsZrpDHUiP6-Y6f6_3xcsvqDl0ew_-9aY8vFr3kB3A,693
- genelastic/wet_processes.py,sha256=PtV0HFs6rGan_-3-BiXeab-VBX1JQGucktoXE4GuaAk,1355
- genelastic-0.6.0.dist-info/METADATA,sha256=Ad8wOo_mTY3l7RVy9WNdMAzVnWhTxEb2uacXue1CdUU,1335
- genelastic-0.6.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- genelastic-0.6.0.dist-info/entry_points.txt,sha256=ZYi1_Rmjl-9XRywzPdV-U7TxA7Z6yyLVt-W13fZtxsQ,204
- genelastic-0.6.0.dist-info/top_level.txt,sha256=ra4gCsuKH1d0sXygcnwD_u597ir6bYYxWTS7dkA6vdM,11
- genelastic-0.6.0.dist-info/RECORD,,
genelastic-0.6.0.dist-info/entry_points.txt DELETED
@@ -1,6 +0,0 @@
- [console_scripts]
- db_info = genelastic.info:main
- db_integrity = genelastic.integrity:main
- gen-data = genelastic.gen_data:main
- import = genelastic.import_data:main
- validate = genelastic.validate_data:main
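
Each line above maps a console command to a module:function target, which is why db_integrity resolves to genelastic.integrity:main in 0.6.0 (and presumably to the new import_data.cli_* modules in 0.7.0, though the new targets are not shown in this diff). Installed entry points can be listed at runtime; a sketch assuming genelastic is installed in the current environment:

# A sketch of listing the console scripts declared above at runtime;
# assumes genelastic is installed (Python 3.10+ selectable entry points API).
from importlib.metadata import entry_points

for ep in entry_points(group="console_scripts"):
    if ep.value.startswith("genelastic"):
        print(f"{ep.name} -> {ep.value}")  # e.g. "db_integrity -> genelastic.integrity:main"

# An entry point can also be loaded and invoked directly:
main = entry_points(group="console_scripts")["db_integrity"].load()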