genelastic 0.6.0__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {genelastic-0.6.0 → genelastic-0.6.1}/PKG-INFO +7 -2
  2. {genelastic-0.6.0 → genelastic-0.6.1}/pyproject.toml +14 -7
  3. genelastic-0.6.1/src/genelastic/__init__.py +0 -0
  4. genelastic-0.6.1/src/genelastic/api/__init__.py +0 -0
  5. genelastic-0.6.1/src/genelastic/api/extends/__init__.py +0 -0
  6. genelastic-0.6.1/src/genelastic/api/extends/example.py +7 -0
  7. genelastic-0.6.1/src/genelastic/api/routes.py +84 -0
  8. genelastic-0.6.1/src/genelastic/api/server.py +72 -0
  9. genelastic-0.6.1/src/genelastic/api/settings.py +13 -0
  10. genelastic-0.6.1/src/genelastic/common/__init__.py +12 -0
  11. genelastic-0.6.1/src/genelastic/common/cli.py +35 -0
  12. genelastic-0.6.1/src/genelastic/common/elastic.py +183 -0
  13. genelastic-0.6.1/src/genelastic/common/exceptions.py +6 -0
  14. genelastic-0.6.1/src/genelastic/common/types.py +20 -0
  15. genelastic-0.6.1/src/genelastic/import_data/__init__.py +9 -0
  16. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/analyses.py +3 -1
  17. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/analysis.py +3 -2
  18. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/bi_process.py +1 -1
  19. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/bi_processes.py +2 -1
  20. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/data_file.py +3 -1
  21. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/filename_pattern.py +2 -1
  22. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/gen_data.py +3 -2
  23. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/import_bundle.py +2 -1
  24. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/import_bundle_factory.py +3 -1
  25. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/import_data.py +49 -51
  26. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/info.py +29 -50
  27. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/integrity.py +53 -87
  28. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/tags.py +2 -1
  29. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/validate_data.py +6 -4
  30. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/wet_processes.py +2 -1
  31. {genelastic-0.6.0 → genelastic-0.6.1}/src/genelastic.egg-info/PKG-INFO +7 -2
  32. genelastic-0.6.1/src/genelastic.egg-info/SOURCES.txt +41 -0
  33. genelastic-0.6.1/src/genelastic.egg-info/entry_points.txt +6 -0
  34. {genelastic-0.6.0 → genelastic-0.6.1}/src/genelastic.egg-info/requires.txt +7 -1
  35. {genelastic-0.6.0 → genelastic-0.6.1}/tests/test_010_analyses.py +3 -4
  36. {genelastic-0.6.0 → genelastic-0.6.1}/tests/test_100_import_bundle_format.py +46 -46
  37. genelastic-0.6.0/src/genelastic/__init__.py +0 -13
  38. genelastic-0.6.0/src/genelastic/common.py +0 -151
  39. genelastic-0.6.0/src/genelastic.egg-info/SOURCES.txt +0 -30
  40. genelastic-0.6.0/src/genelastic.egg-info/entry_points.txt +0 -6
  41. {genelastic-0.6.0 → genelastic-0.6.1}/README.md +0 -0
  42. {genelastic-0.6.0 → genelastic-0.6.1}/setup.cfg +0 -0
  43. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/constants.py +0 -0
  44. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/logger.py +0 -0
  45. {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/wet_process.py +0 -0
  46. {genelastic-0.6.0 → genelastic-0.6.1}/src/genelastic.egg-info/dependency_links.txt +0 -0
  47. {genelastic-0.6.0 → genelastic-0.6.1}/src/genelastic.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: genelastic
3
- Version: 0.6.0
3
+ Version: 0.6.1
4
4
  Summary: Generate and store genetic data into an Elasticsearch database.
5
5
  Author: CNRGH
6
6
  Author-email: Pierrick ROGER <pierrick.roger@cnrgh.fr>, Maxime BLANCHON <maxime.blanchon@cnrgh.fr>
@@ -21,7 +21,7 @@ Requires-Dist: colorlog
21
21
  Provides-Extra: tests
22
22
  Requires-Dist: pytest; extra == "tests"
23
23
  Requires-Dist: mypy; extra == "tests"
24
- Requires-Dist: pylint<3.3,>=3.2; extra == "tests"
24
+ Requires-Dist: pylint; extra == "tests"
25
25
  Requires-Dist: bandit; extra == "tests"
26
26
  Requires-Dist: coverage; extra == "tests"
27
27
  Requires-Dist: yamllint; extra == "tests"
@@ -30,6 +30,11 @@ Provides-Extra: docs
30
30
  Requires-Dist: sphinx; extra == "docs"
31
31
  Requires-Dist: sphinx-autoapi; extra == "docs"
32
32
  Requires-Dist: furo; extra == "docs"
33
+ Provides-Extra: api
34
+ Requires-Dist: flask; extra == "api"
35
+ Requires-Dist: elasticsearch; extra == "api"
36
+ Requires-Dist: environs; extra == "api"
37
+ Requires-Dist: connexion[flask,swagger-ui,uvicorn]; extra == "api"
33
38
 
34
39
  # genelastic
35
40
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "genelastic"
7
- version = "0.6.0"
7
+ version = "0.6.1"
8
8
  dependencies = [
9
9
  "elasticsearch",
10
10
  "PyVCF3",
@@ -33,7 +33,7 @@ classifiers = [
33
33
  tests = [
34
34
  "pytest",
35
35
  "mypy",
36
- "pylint>=3.2,<3.3",
36
+ "pylint",
37
37
  "bandit",
38
38
  "coverage",
39
39
  "yamllint",
@@ -45,9 +45,16 @@ docs = [
45
45
  "furo"
46
46
  ]
47
47
 
48
+ api = [
49
+ "flask",
50
+ "elasticsearch",
51
+ "environs",
52
+ "connexion[flask,swagger-ui,uvicorn]"
53
+ ]
54
+
48
55
  [project.scripts]
49
- gen-data = "genelastic.gen_data:main"
50
- import = "genelastic.import_data:main"
51
- validate = "genelastic.validate_data:main"
52
- db_info = "genelastic.info:main"
53
- db_integrity = "genelastic.integrity:main"
56
+ gen-data = "genelastic.import_data.gen_data:main"
57
+ import = "genelastic.import_data.import_data:main"
58
+ validate = "genelastic.import_data.validate_data:main"
59
+ db_info = "genelastic.import_data.info:main"
60
+ db_integrity = "genelastic.import_data.integrity:main"
File without changes
File without changes
@@ -0,0 +1,7 @@
1
+ # pylint: disable=missing-module-docstring
2
+ from flask import jsonify, Response
3
+
4
+
5
def ping_2() -> Response:
    """Health-check route of the example extension; confirms the server is up."""
    payload = {'message': 'pong_2'}
    return jsonify(payload)
@@ -0,0 +1,84 @@
1
+ # pylint: disable=missing-module-docstring
2
+ from pathlib import Path
3
+ from flask import jsonify, current_app, Response
4
+
5
+
6
def ping() -> Response:
    """Health-check route; confirms that the server is online."""
    payload = {'message': 'pong'}
    return jsonify(payload)
9
+
10
+
11
def list_indices() -> Response:
    """Route returning the Elasticsearch indices known to the server."""
    conn = current_app.elastic_query_conn  # type: ignore
    return conn.get_indices()
14
+
15
+
16
def retrieve_document(index_id: str, document_id: str) -> Response:
    """Route returning a single document fetched by its ID.

    :param index_id: Name of the index holding the document.
    :param document_id: ID of the document to fetch.
    """
    conn = current_app.elastic_query_conn  # type: ignore
    document = conn.get_document_by_id(index_id, document_id)
    return jsonify(document)
21
+
22
+
23
def list_wet_processes() -> Response:
    """Route returning the 'proc_id' values of all wet processes."""
    index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-wet_processes"
    conn = current_app.elastic_query_conn  # type: ignore
    proc_ids = conn.get_field_values(index, "proc_id")
    return jsonify(list(proc_ids))
29
+
30
+
31
def list_bi_processes() -> Response:
    """Route returning the 'name' values of all bioinformatics processes."""
    index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-bi_processes"
    conn = current_app.elastic_query_conn  # type: ignore
    names = conn.get_field_values(index, "name")
    return jsonify(list(names))
37
+
38
+
39
def list_analyses() -> Response:
    """Route returning the file names of all analyses (paths reduced to basenames)."""
    index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-analyses"
    conn = current_app.elastic_query_conn  # type: ignore
    paths = conn.get_field_values(index, "path")
    return jsonify([Path(item).name for item in paths])
45
+
46
+
47
def _list_analyses_by_process(field: str, proc_id: str) -> Response:
    """Return the paths of analyses whose metadata *field* matches *proc_id*.

    Shared implementation for the wet-process and bi-process listing routes,
    which previously duplicated this query verbatim.

    :param field: Metadata sub-field to match ('wet_process' or 'bi_process').
    :param proc_id: Process identifier to look for.
    """
    analyses_index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-analyses"

    search_query = {
        "query": {
            "term": {
                # Exact (non-analyzed) match on the keyword sub-field.
                f"metadata.{field}.keyword": proc_id,
            }
        }
    }
    response = (current_app.elastic_query_conn  # type: ignore
                .client.search(index=analyses_index, body=search_query))
    result = [hit['_source']['path'] for hit in response['hits']['hits']]
    return jsonify(result)


def list_analyses_wet_processes(proc_id: str) -> Response:
    """Route to list analyses of a specific wet process."""
    return _list_analyses_by_process("wet_process", proc_id)


def list_analyses_bi_processes(proc_id: str) -> Response:
    """Route to list analyses of a specific bi process."""
    return _list_analyses_by_process("bi_process", proc_id)
@@ -0,0 +1,72 @@
1
+ # pylint: disable=missing-module-docstring
2
+ from typing import Any
3
+ from pathlib import Path
4
+ import yaml
5
+ import connexion # type: ignore
6
+ from genelastic.common import ElasticQueryConn
7
+
8
+
9
def load_yaml(file_path: Path) -> Any:
    """Parse a YAML file and return the resulting Python object.

    :param file_path: Path of the YAML file to read (UTF-8).
    :raises SystemExit: If the file is not valid YAML.
    """
    with open(file_path, encoding='utf-8') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            raise SystemExit(exc) from exc
18
+
19
+
20
def aggregate_openapi_specs(main_spec_file: Path, additional_spec_path: Path) -> Any:
    """Aggregate OpenAPI specifications from a main file and a directory
    of additional specifications.

    Every ``.yml``/``.yaml`` file found directly in *additional_spec_path*
    contributes its ``paths`` entries to the main specification.

    :param main_spec_file: Path of the main OpenAPI specification file.
    :param additional_spec_path: Directory holding extra specification files.
    :returns: The merged specification (a dict).
    :raises SystemExit: If the directory cannot be listed or a file is invalid YAML.
    """
    main_spec = load_yaml(main_spec_file)
    try:
        entries = additional_spec_path.iterdir()
    except OSError as exc:
        raise SystemExit(exc) from exc

    # BUG FIX: 'paths' is merged with dict.update() below, so it must be
    # initialized to a dict — the previous code used a list, which has no
    # .update() and would crash on the first additional spec.
    if 'paths' not in main_spec:
        main_spec['paths'] = {}

    for entry in entries:
        # Only plain YAML files contribute additional routes.
        if not entry.is_file():
            continue
        if entry.suffix not in (".yml", ".yaml"):
            continue

        content = load_yaml(entry)

        if 'paths' in content:
            main_spec['paths'].update(content['paths'])

    return main_spec
45
+
46
+
47
# Initialize the Connexion application (wraps a Flask app).
connexion_app = connexion.FlaskApp(__name__)
# BUG FIX: the settings module is importable as 'genelastic.api.settings' once
# the package is installed (see the console entry points); the former
# 'src.genelastic...' dotted path only resolved from a source checkout.
connexion_app.app.config.from_object('genelastic.api.settings.Config')

# Initialize the Elasticsearch client from the loaded configuration.
es_url = connexion_app.app.config['GENAPI_ES_URL']
es_cert_fp = connexion_app.app.config['GENAPI_ES_CERT_FP']
es_api_key = connexion_app.app.config['GENAPI_ES_ENCODED_API_KEY']

connexion_app.app.elastic_query_conn = ElasticQueryConn(es_url, es_cert_fp, api_key=es_api_key)

connexion_app.app.logger.debug("Successfully connected to Elasticsearch server: %s",
                               connexion_app.app.elastic_query_conn.client.info())

# Paths of the OpenAPI YAML files shipped next to this module.
main_yaml_file = Path(__file__).parent / 'specification.yml'
additional_yaml_dir = Path(__file__).parent / 'extends'

# Load and combine the YAML specification files.
yaml_spec = aggregate_openapi_specs(main_yaml_file, additional_yaml_dir)

# Register the aggregated OpenAPI specification on the app.
connexion_app.add_api(yaml_spec)

if __name__ == '__main__':
    connexion_app.run(debug=True)
@@ -0,0 +1,13 @@
1
+ # pylint: disable=missing-module-docstring
2
+ from environs import Env
3
+
4
# Read environment variables, including any declared in a local .env file.
env = Env()
env.read_env()

# pylint: disable=missing-class-docstring,too-few-public-methods
class Config:
    # Load every environment variable the API needs; env.url()/env.str()
    # raise at import time when a variable is missing.
    # URL of the Elasticsearch server (validated as a URL, stored as a string).
    GENAPI_ES_URL = env.url("GENAPI_ES_URL").geturl()
    # Encoded API key used to authenticate against Elasticsearch.
    GENAPI_ES_ENCODED_API_KEY = env.str("GENAPI_ES_ENCODED_API_KEY")
    # Prefix shared by all Elasticsearch indices queried by the API.
    GENAPI_ES_INDEX_PREFIX = env.str("GENAPI_ES_INDEX_PREFIX")
    # sha256 certificate fingerprint for the secure connection.
    GENAPI_ES_CERT_FP = env.str("GENAPI_ES_CERT_FP")
@@ -0,0 +1,12 @@
1
+ """Genelastic package for common code between API and import scripts."""
2
+ from .elastic import ElasticQueryConn, ElasticImportConn
3
+ from .types import (BundleDict, AnalysisMetaData, BioInfoProcessData, WetProcessesData,
4
+ MetadataDocument, AnalysisDocument, BulkItems, ProcessDocument, Bucket)
5
+ from .cli import add_verbose_control_args, add_es_connection_args
6
+ from .exceptions import DBIntegrityError
7
+
8
+ __all__ = ['ElasticQueryConn', 'ElasticImportConn', 'BundleDict', 'AnalysisMetaData',
9
+ 'BioInfoProcessData', 'WetProcessesData', 'MetadataDocument', 'AnalysisDocument',
10
+ 'BulkItems', 'ProcessDocument', 'Bucket', 'add_verbose_control_args',
11
+ 'add_es_connection_args', 'DBIntegrityError'
12
+ ]
@@ -0,0 +1,35 @@
1
+ """Utility functions for CLI scripts."""
2
+ import argparse
3
+
4
+
5
def add_verbose_control_args(parser: argparse.ArgumentParser) -> None:
    """
    Register the verbosity options on *parser*.

    The parser is modified in place through its reference; nothing is returned.
    """
    parser.add_argument(
        '-q', '--quiet', dest='verbose', action='store_const', const=0,
        default=1, help='Set verbosity to 0 (quiet mode).')
    parser.add_argument(
        '-v', '--verbose', dest='verbose', action='count', default=1,
        help=('Verbose level. -v for information, -vv for debug,' +
              ' -vvv for trace.'))
17
+
18
+
19
def add_es_connection_args(parser: argparse.ArgumentParser) -> None:
    """
    Register the ElasticSearch server connection options on *parser*.

    The parser is modified in place through its reference; nothing is returned.
    """
    # Table-driven registration: (flag, add_argument keyword arguments).
    options = [
        ('--es-host', {'dest': 'es_host', 'default': 'localhost',
                       'help': 'Address of Elasticsearch host.'}),
        ('--es-port', {'type': int, 'default': 9200, 'dest': 'es_port',
                       'help': 'Elasticsearch port.'}),
        ('--es-usr', {'dest': 'es_usr', 'default': 'elastic',
                      'help': 'Elasticsearch user.'}),
        ('--es-pwd', {'dest': 'es_pwd', 'required': True,
                      'help': 'Elasticsearch password.'}),
        ('--es-cert-fp', {'dest': 'es_cert_fp',
                          'help': 'Elasticsearch sha256 certificate fingerprint.'}),
        ('--es-index-prefix', {'dest': 'es_index_prefix',
                               'help': 'Add the given prefix to each index created during import.'}),
    ]
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
@@ -0,0 +1,183 @@
1
+ # pylint: disable=missing-module-docstring
2
+ import datetime
3
+ import logging
4
+ import time
5
+ import typing
6
+ from abc import ABC
7
+ from typing import Any
8
+
9
+ import elastic_transport
10
+ import elasticsearch.helpers
11
+ from elasticsearch import Elasticsearch
12
+
13
+ from .exceptions import DBIntegrityError
14
+ from .types import Bucket, BulkItems
15
+
16
+ logger = logging.getLogger('genelastic')
17
+
18
+
19
class ElasticConn(ABC):  # pylint: disable=too-few-public-methods
    """Abstract base class for connectors to an Elasticsearch server."""
    client: Elasticsearch

    def __init__(self, url: str, fingerprint: str, **kwargs: Any):
        """Initialize an elasticsearch client instance.

        :url: URL of the Elasticsearch host.
        :fingerprint: sha256 certificate fingerprint for a secure HTTP connection.
        :returns: The configured elasticsearch client instance.
        :raises SystemExit: If the connection to the Elasticsearch server failed.
        """
        try:
            client = Elasticsearch(
                url,
                ssl_assert_fingerprint=fingerprint,
                # Only verify the certificate when a fingerprint was provided.
                verify_certs=bool(fingerprint),
                **kwargs
            )
            # Probe the server once so connection problems surface here.
            client.info()
        except (elastic_transport.TransportError, elasticsearch.AuthenticationException) as e:
            raise SystemExit(e) from e
        self.client = client
42
+
43
+
44
class ElasticImportConn(ElasticConn):  # pylint: disable=too-few-public-methods
    """Connector to import data into an Elasticsearch database."""

    def import_items(self, bulk_items: BulkItems,
                     start_time: float,
                     total_items: int) -> None:
        """Bulk-insert *bulk_items* into the database and log the throughput.

        :bulk_items: Pre-built bulk actions to send; nothing happens when empty.
        :start_time: Timestamp (perf_counter) taken before the import started.
        :total_items: Item count reported in the log message.
        """
        if not bulk_items:
            return
        elasticsearch.helpers.bulk(self.client, bulk_items)
        elapsed = time.perf_counter() - start_time
        logger.info("Imported %d items in %s (%f items/s).", total_items,
                    datetime.timedelta(seconds=elapsed), total_items / elapsed)
55
+
56
+
57
class ElasticQueryConn(ElasticConn):
    """Connector to query data from an Elasticsearch database."""

    def get_indices(self) -> Any | str:
        """Return all indices."""
        return self.client.cat.indices(format="json").body

    def get_document_by_id(self, index: str, document_id: str) -> Any | str:
        """Return a document by its ID."""
        return self.client.get(index=index, id=document_id).body

    def run_composite_aggregation(self, index: str, query: dict[str, typing.Any]) \
            -> list[Bucket]:
        """
        Executes a composite aggregation on an Elasticsearch index and
        returns all paginated results.

        :param index: Name of the index to query.
        :param query: Aggregation query to run.
        :return: List of aggregation results.
        :raises SystemExit: If the index does not exist.
        """
        # Extract the aggregation name from the query dict.
        agg_name = next(iter(query["aggs"]))
        all_buckets: typing.List[Bucket] = []

        try:
            logger.debug("Running composite aggregation query %s on index '%s'.", query, index)
            response = self.client.search(index=index, body=query)
        except elasticsearch.NotFoundError as e:
            raise SystemExit(f"Error: {e.message} for index '{index}'.") from e

        while True:
            # Extract buckets from the response.
            buckets: typing.List[Bucket] = response['aggregations'][agg_name]['buckets']
            all_buckets.extend(buckets)

            # A composite aggregation returns 'after_key' only when another
            # page of results is available; stop when it is absent.
            if 'after_key' not in response['aggregations'][agg_name]:
                break

            after_key = response['aggregations'][agg_name]['after_key']
            query['aggs'][agg_name]['composite']['after'] = after_key
            try:
                logger.debug("Running query %s on index '%s'.", query, index)
                # Fetch the next page of results.
                response = self.client.search(index=index, body=query)
            except elasticsearch.NotFoundError as e:
                raise SystemExit(f"Error: {e.message} for index '{index}'.") from e

        return all_buckets

    def get_field_values(self, index: str, field_name: str) -> set[str]:
        """Return a set of values for a given field."""
        query = {
            "size": 0,
            "aggs": {
                "get_field_values": {
                    "composite": {
                        "sources": {"values": {"terms": {"field": f"{field_name}.keyword"}}},
                        "size": 1000,
                    }
                }
            }
        }

        buckets: typing.List[Bucket] = self.run_composite_aggregation(index, query)
        # Each bucket key stores the field value under the 'values' source name.
        return {bucket['key']['values'] for bucket in buckets}

    def search_by_field_value(self, index: str, field: str, value: str) -> (
            typing.Dict[str, typing.Any] | None):
        """Search a document by a value for a certain field.

        :returns: The source of the first matching document, or None when
            nothing matches.
        """
        logger.info("Searching for field '%s' with value '%s' inside index '%s'.",
                    field, value, index)
        search_query = {
            "query": {
                "term": {
                    f"{field}.keyword": value,
                }
            }
        }

        response = self.client.search(index=index, body=search_query)

        try:
            return response['hits']['hits'][0]['_source']  # type: ignore
        # BUG FIX: an empty hit list raises IndexError, which the previous
        # 'except KeyError' alone did not catch.
        except (KeyError, IndexError):
            return None

    def ensure_unique(self, index: str, field: str) -> None:
        """
        Ensure that all values of a field in an index are all unique.

        :param index: Name of the index.
        :param field: Field name to check for value uniqueness.
        :raises genelastic.common.DBIntegrityError:
            Some values of the given field are duplicated in the index.
        """

        logger.info("Ensuring that the field '%s' in the index '%s' only contains unique values...",
                    field, index)
        # 'min_doc_count': 2 keeps only values that occur in at least two
        # documents, i.e. exactly the duplicates.
        query = {
            "size": 0,
            "aggs": {
                "duplicate_proc_ids": {
                    "terms": {
                        "field": f"{field}.keyword",
                        "size": 10000,
                        "min_doc_count": 2
                    }
                }
            }
        }
        buckets: typing.List[Bucket] = self.run_composite_aggregation(index, query)
        duplicated_processes: typing.Set[str] = {str(bucket["key"]) for bucket in buckets}

        if duplicated_processes:
            raise DBIntegrityError(f"Found non-unique value for field {field} in index '{index}': "
                                   f"{', '.join(duplicated_processes)}.")

        logger.info("All values of field '%s' in index '%s' are unique.",
                    field, index)
@@ -0,0 +1,6 @@
1
+ # pylint: disable=missing-module-docstring
2
+
3
class DBIntegrityError(Exception):
    """Integrity error raised when the database content does not match
    the expected data schema.
    """
@@ -0,0 +1,20 @@
1
# pylint: disable=missing-module-docstring

import typing

# One aggregation bucket returned by Elasticsearch; callers read bucket['key'].
Bucket: typing.TypeAlias = dict[str, dict[typing.Any, typing.Any]]

# Metadata of one analysis: field name -> scalar value.
AnalysisMetaData: typing.TypeAlias = typing.Dict[str, str | int]
# Description of a wet process: field name -> scalar value.
WetProcessesData: typing.TypeAlias = typing.Dict[str, str | int | float]
# Description of a bioinformatics process: field name -> value or list of values.
BioInfoProcessData: typing.TypeAlias = typing.Dict[str, str | typing.List[str]]
# Raw content of an import bundle (parsed YAML mapping).
BundleDict: typing.TypeAlias = typing.Dict[str, typing.Any]

# Shapes of the documents stored in / read from the Elasticsearch indices.
AnalysisDocument: typing.TypeAlias = typing.Dict[str, str | None | AnalysisMetaData]
MetadataDocument: typing.TypeAlias = typing.Dict[str, int | str | typing.List[typing.Any | None]]
ProcessDocument: typing.TypeAlias = (typing.Dict[str, str] |
                                     WetProcessesData |
                                     BioInfoProcessData)
# Batch of action dicts passed to elasticsearch.helpers.bulk().
BulkItems: typing.TypeAlias = typing.List[typing.Dict[str, str |
                                                      MetadataDocument |
                                                      AnalysisDocument |
                                                      ProcessDocument]]
@@ -0,0 +1,9 @@
1
+ """Genelastic package for importing Genomic data into Elasticsearch."""
2
+ from .analysis import Analysis
3
+ from .import_bundle_factory import (make_import_bundle_from_files,
4
+ load_import_bundle_file)
5
+ from .tags import Tags
6
+ from .import_bundle import ImportBundle
7
+
8
+ __all__ = ['Analysis', 'Tags', 'ImportBundle', 'make_import_bundle_from_files',
9
+ 'load_import_bundle_file']
@@ -1,7 +1,9 @@
1
1
  # pylint: disable=missing-module-docstring
2
2
  import typing
3
+
4
+ from genelastic.common import BundleDict
5
+
3
6
  from .analysis import Analysis
4
- from .common import BundleDict
5
7
  from .data_file import DataFile
6
8
 
7
9
  class Analyses:
@@ -7,7 +7,8 @@ import re
7
7
  import typing
8
8
  from pathlib import Path
9
9
 
10
- from .common import AnalysisMetaData
10
+ from genelastic.common import AnalysisMetaData
11
+
11
12
  from .constants import ALLOWED_CATEGORIES
12
13
  from .data_file import DataFile
13
14
  from .filename_pattern import FilenamePattern
@@ -19,7 +20,7 @@ logger = logging.getLogger('genelastic')
19
20
  class Analysis:
20
21
  """Class Analysis that represents an analysis."""
21
22
 
22
- # pylint: disable-next=too-many-arguments
23
+ # pylint: disable-next=too-many-arguments, too-many-positional-arguments
23
24
  def __init__(self,
24
25
  tags: Tags,
25
26
  root_dir: str = '.',
@@ -2,7 +2,7 @@
2
2
  import copy
3
3
  import typing
4
4
 
5
- from .common import BioInfoProcessData
5
+ from genelastic.common import BioInfoProcessData
6
6
 
7
7
 
8
8
  class BioInfoProcess:
@@ -2,8 +2,9 @@
2
2
  import logging
3
3
  import typing
4
4
 
5
+ from genelastic.common import BundleDict
6
+
5
7
  from .bi_process import BioInfoProcess
6
- from .common import BundleDict
7
8
 
8
9
  logger = logging.getLogger('genelastic')
9
10
 
@@ -11,8 +11,10 @@ import logging
11
11
  import os
12
12
  import pathlib
13
13
  import typing
14
+
15
+ from genelastic.common import AnalysisMetaData
16
+
14
17
  from .filename_pattern import FilenamePattern
15
- from .common import AnalysisMetaData
16
18
 
17
19
  logger = logging.getLogger('genelastic')
18
20
 
@@ -5,7 +5,8 @@ and extract metadata from file names using this pattern.
5
5
 
6
6
  import logging
7
7
  import re
8
- from .common import AnalysisMetaData
8
+
9
+ from genelastic.common import AnalysisMetaData
9
10
 
10
11
  logger = logging.getLogger('genelastic')
11
12
 
@@ -8,8 +8,8 @@ import sys
8
8
  from typing import Dict, List, Sequence, Collection
9
9
 
10
10
  import yaml
11
-
12
11
  from genelastic.common import add_verbose_control_args
12
+
13
13
  from .logger import configure_logging
14
14
 
15
15
  logger = logging.getLogger('genelastic')
@@ -19,7 +19,8 @@ def read_args() -> argparse.Namespace:
19
19
  # pylint: disable=R0801
20
20
  """Read arguments from command line."""
21
21
  parser = argparse.ArgumentParser(description='Genetics data random generator.',
22
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
22
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
23
+ allow_abbrev=False)
23
24
  add_verbose_control_args(parser)
24
25
  parser.add_argument('-d', '--data-folder', dest='data_folder', required=True,
25
26
  help='Data destination folder.')
@@ -8,9 +8,10 @@ import logging
8
8
  import sys
9
9
  import typing
10
10
 
11
+ from genelastic.common import BundleDict
12
+
11
13
  from .bi_processes import BioInfoProcesses
12
14
  from .data_file import DataFile
13
- from .common import BundleDict
14
15
  from .constants import BUNDLE_CURRENT_VERSION
15
16
  from .analyses import Analyses
16
17
  from .tags import Tags
@@ -11,8 +11,10 @@ from yaml.scanner import ScannerError
11
11
 
12
12
  import schema # type: ignore[import-untyped]
13
13
  import yaml
14
+
15
+ from genelastic.common import BundleDict
16
+
14
17
  from .import_bundle import ImportBundle
15
- from .common import BundleDict
16
18
  from .constants import BUNDLE_CURRENT_VERSION
17
19
 
18
20
  logger = logging.getLogger('genelastic')