genelastic 0.6.0__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {genelastic-0.6.0 → genelastic-0.6.1}/PKG-INFO +7 -2
- {genelastic-0.6.0 → genelastic-0.6.1}/pyproject.toml +14 -7
- genelastic-0.6.1/src/genelastic/__init__.py +0 -0
- genelastic-0.6.1/src/genelastic/api/__init__.py +0 -0
- genelastic-0.6.1/src/genelastic/api/extends/__init__.py +0 -0
- genelastic-0.6.1/src/genelastic/api/extends/example.py +7 -0
- genelastic-0.6.1/src/genelastic/api/routes.py +84 -0
- genelastic-0.6.1/src/genelastic/api/server.py +72 -0
- genelastic-0.6.1/src/genelastic/api/settings.py +13 -0
- genelastic-0.6.1/src/genelastic/common/__init__.py +12 -0
- genelastic-0.6.1/src/genelastic/common/cli.py +35 -0
- genelastic-0.6.1/src/genelastic/common/elastic.py +183 -0
- genelastic-0.6.1/src/genelastic/common/exceptions.py +6 -0
- genelastic-0.6.1/src/genelastic/common/types.py +20 -0
- genelastic-0.6.1/src/genelastic/import_data/__init__.py +9 -0
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/analyses.py +3 -1
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/analysis.py +3 -2
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/bi_process.py +1 -1
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/bi_processes.py +2 -1
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/data_file.py +3 -1
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/filename_pattern.py +2 -1
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/gen_data.py +3 -2
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/import_bundle.py +2 -1
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/import_bundle_factory.py +3 -1
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/import_data.py +49 -51
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/info.py +29 -50
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/integrity.py +53 -87
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/tags.py +2 -1
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/validate_data.py +6 -4
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/wet_processes.py +2 -1
- {genelastic-0.6.0 → genelastic-0.6.1}/src/genelastic.egg-info/PKG-INFO +7 -2
- genelastic-0.6.1/src/genelastic.egg-info/SOURCES.txt +41 -0
- genelastic-0.6.1/src/genelastic.egg-info/entry_points.txt +6 -0
- {genelastic-0.6.0 → genelastic-0.6.1}/src/genelastic.egg-info/requires.txt +7 -1
- {genelastic-0.6.0 → genelastic-0.6.1}/tests/test_010_analyses.py +3 -4
- {genelastic-0.6.0 → genelastic-0.6.1}/tests/test_100_import_bundle_format.py +46 -46
- genelastic-0.6.0/src/genelastic/__init__.py +0 -13
- genelastic-0.6.0/src/genelastic/common.py +0 -151
- genelastic-0.6.0/src/genelastic.egg-info/SOURCES.txt +0 -30
- genelastic-0.6.0/src/genelastic.egg-info/entry_points.txt +0 -6
- {genelastic-0.6.0 → genelastic-0.6.1}/README.md +0 -0
- {genelastic-0.6.0 → genelastic-0.6.1}/setup.cfg +0 -0
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/constants.py +0 -0
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/logger.py +0 -0
- {genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/wet_process.py +0 -0
- {genelastic-0.6.0 → genelastic-0.6.1}/src/genelastic.egg-info/dependency_links.txt +0 -0
- {genelastic-0.6.0 → genelastic-0.6.1}/src/genelastic.egg-info/top_level.txt +0 -0
{genelastic-0.6.0 → genelastic-0.6.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: genelastic
-Version: 0.6.0
+Version: 0.6.1
 Summary: Generate and store genetic data into an Elasticsearch database.
 Author: CNRGH
 Author-email: Pierrick ROGER <pierrick.roger@cnrgh.fr>, Maxime BLANCHON <maxime.blanchon@cnrgh.fr>
@@ -21,7 +21,7 @@ Requires-Dist: colorlog
 Provides-Extra: tests
 Requires-Dist: pytest; extra == "tests"
 Requires-Dist: mypy; extra == "tests"
-Requires-Dist: pylint
+Requires-Dist: pylint; extra == "tests"
 Requires-Dist: bandit; extra == "tests"
 Requires-Dist: coverage; extra == "tests"
 Requires-Dist: yamllint; extra == "tests"
@@ -30,6 +30,11 @@ Provides-Extra: docs
 Requires-Dist: sphinx; extra == "docs"
 Requires-Dist: sphinx-autoapi; extra == "docs"
 Requires-Dist: furo; extra == "docs"
+Provides-Extra: api
+Requires-Dist: flask; extra == "api"
+Requires-Dist: elasticsearch; extra == "api"
+Requires-Dist: environs; extra == "api"
+Requires-Dist: connexion[flask,swagger-ui,uvicorn]; extra == "api"
 
 # genelastic
 
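In 0.6.0 the pylint requirement had lost its extras marker, so a plain install pulled pylint in unconditionally; 0.6.1 gates it behind the tests extra again. A minimal check of the fixed metadata, assuming genelastic 0.6.1 is installed:

    # Sketch: inspect the published requirement markers (importlib.metadata, Python 3.8+).
    from importlib.metadata import requires

    for req in requires("genelastic"):
        if req.startswith("pylint"):
            print(req)  # expected: pylint; extra == "tests"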
{genelastic-0.6.0 → genelastic-0.6.1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "genelastic"
-version = "0.6.0"
+version = "0.6.1"
 dependencies = [
     "elasticsearch",
     "PyVCF3",
@@ -33,7 +33,7 @@ classifiers = [
 tests = [
     "pytest",
     "mypy",
-    "pylint"
+    "pylint",
     "bandit",
     "coverage",
     "yamllint",
@@ -45,9 +45,16 @@ docs = [
     "furo"
 ]
 
+api = [
+    "flask",
+    "elasticsearch",
+    "environs",
+    "connexion[flask,swagger-ui,uvicorn]"
+]
+
 [project.scripts]
-gen-data = "genelastic.gen_data:main"
-import = "genelastic.import_data:main"
-validate = "genelastic.validate_data:main"
-db_info = "genelastic.info:main"
-db_integrity = "genelastic.integrity:main"
+gen-data = "genelastic.import_data.gen_data:main"
+import = "genelastic.import_data.import_data:main"
+validate = "genelastic.import_data.validate_data:main"
+db_info = "genelastic.import_data.info:main"
+db_integrity = "genelastic.import_data.integrity:main"
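All five console scripts now point into the new genelastic.import_data subpackage, so any code that imported the old top-level modules directly must follow the move. A minimal sketch of the 0.6.1 layout (main() is a console-script target, so it parses sys.argv when called with no arguments):

    # 0.6.0 layout: from genelastic.info import main
    from genelastic.import_data.info import main  # 0.6.1 layout

    main()  # equivalent to running the `db_info` console script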
genelastic-0.6.1/src/genelastic/__init__.py (file without changes)
genelastic-0.6.1/src/genelastic/api/__init__.py (file without changes)
genelastic-0.6.1/src/genelastic/api/extends/__init__.py (file without changes)
genelastic-0.6.1/src/genelastic/api/routes.py

@@ -0,0 +1,84 @@
+# pylint: disable=missing-module-docstring
+from pathlib import Path
+from flask import jsonify, current_app, Response
+
+
+def ping() -> Response:
+    """Test route to verify that the server is online."""
+    return jsonify({'message': 'pong'})
+
+
+def list_indices() -> Response:
+    """Route to list Elasticsearch indices."""
+    return current_app.elastic_query_conn.get_indices()  # type: ignore
+
+
+def retrieve_document(index_id: str, document_id: str) -> Response:
+    """Route to retrieve a document by its ID."""
+    document = (current_app.elastic_query_conn  # type: ignore
+                .get_document_by_id(index_id, document_id))
+    return jsonify(document)
+
+
+def list_wet_processes() -> Response:
+    """Route to list wet processes."""
+    wet_processes_index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-wet_processes"
+    result = (current_app.elastic_query_conn  # type: ignore
+              .get_field_values(wet_processes_index, "proc_id"))
+    return jsonify(list(result))
+
+
+def list_bi_processes() -> Response:
+    """Route to list bi processes."""
+    bi_processes_index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-bi_processes"
+    result = (current_app.elastic_query_conn  # type: ignore
+              .get_field_values(bi_processes_index, "name"))
+    return jsonify(list(result))
+
+
+def list_analyses() -> Response:
+    """Route to list analyses."""
+    analyses_index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-analyses"
+    result = current_app.elastic_query_conn.get_field_values(analyses_index, "path")  # type: ignore
+    filenames = [Path(path).name for path in result]
+    return jsonify(filenames)
+
+
+def list_analyses_wet_processes(proc_id: str) -> Response:
+    """Route to list the analyses of a specific wet process."""
+    analyses_index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-analyses"
+
+    search_query = {
+        "query": {
+            "term": {
+                "metadata.wet_process.keyword": proc_id,
+            }
+        }
+    }
+    result = []
+    response = (current_app.elastic_query_conn  # type: ignore
+                .client.search(index=analyses_index, body=search_query))
+    for hit in response['hits']['hits']:
+        result.append(hit['_source']['path'])
+
+    return jsonify(result)
+
+
+def list_analyses_bi_processes(proc_id: str) -> Response:
+    """Route to list the analyses of a specific bi process."""
+    analyses_index = f"{current_app.config['GENAPI_ES_INDEX_PREFIX']}-analyses"
+
+    search_query = {
+        "query": {
+            "term": {
+                "metadata.bi_process.keyword": proc_id,
+            }
+        }
+    }
+    result = []
+    response = (current_app.elastic_query_conn  # type: ignore
+                .client.search(index=analyses_index, body=search_query))
+    for hit in response['hits']['hits']:
+        result.append(hit['_source']['path'])
+
+    return jsonify(result)
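routes.py only defines the handler functions; the URL paths that map onto them live in specification.yml, which is not part of this diff. A hypothetical client call, assuming the spec maps ping() to GET /ping and the server listens locally on port 8000:

    import requests

    resp = requests.get("http://localhost:8000/ping", timeout=5)
    print(resp.json())  # expected: {'message': 'pong'}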
genelastic-0.6.1/src/genelastic/api/server.py

@@ -0,0 +1,72 @@
+# pylint: disable=missing-module-docstring
+from typing import Any
+from pathlib import Path
+import yaml
+import connexion  # type: ignore
+from genelastic.common import ElasticQueryConn
+
+
+def load_yaml(file_path: Path) -> Any:
+    """Load a YAML file and return its content."""
+    content = None
+    with open(file_path, encoding='utf-8') as f:
+        try:
+            content = yaml.safe_load(f)
+        except yaml.YAMLError as exc:
+            raise SystemExit(exc) from exc
+    return content
+
+
+def aggregate_openapi_specs(main_spec_file: Path, additional_spec_path: Path) -> Any:
+    """Aggregate OpenAPI specifications from a main file and a directory
+    of additional specifications."""
+    main_spec = load_yaml(main_spec_file)
+    try:
+        entries = additional_spec_path.iterdir()
+    except OSError as exc:
+        raise SystemExit(exc) from exc
+
+    if not 'paths' in main_spec:
+        main_spec['paths'] = []
+
+    for entry in entries:
+        if not entry.is_file():
+            continue
+
+        if not entry.suffix in [".yml", ".yaml"]:
+            continue
+
+        content = load_yaml(entry)
+
+        if 'paths' in content:
+            main_spec['paths'].update(content['paths'])
+
+    return main_spec
+
+
+# Initialize the Connexion application.
+connexion_app = connexion.FlaskApp(__name__)
+connexion_app.app.config.from_object('src.genelastic.api.settings.Config')
+
+# Initialize the Elasticsearch client.
+es_url = connexion_app.app.config['GENAPI_ES_URL']
+es_cert_fp = connexion_app.app.config['GENAPI_ES_CERT_FP']
+es_api_key = connexion_app.app.config['GENAPI_ES_ENCODED_API_KEY']
+
+connexion_app.app.elastic_query_conn = ElasticQueryConn(es_url, es_cert_fp, api_key=es_api_key)
+
+connexion_app.app.logger.debug("Successfully connected to Elasticsearch server: %s",
+                               connexion_app.app.elastic_query_conn.client.info())
+
+# Paths of the YAML specification files.
+main_yaml_file = Path(__file__).parents[0] / 'specification.yml'
+additional_yaml_dir = Path(__file__).parents[0] / 'extends'
+
+# Load and combine the YAML files.
+yaml_spec = aggregate_openapi_specs(main_yaml_file, additional_yaml_dir)
+
+# Register the combined OpenAPI specification.
+connexion_app.add_api(yaml_spec)
+
+if __name__ == '__main__':
+    connexion_app.run(debug=True)
genelastic-0.6.1/src/genelastic/api/settings.py

@@ -0,0 +1,13 @@
+# pylint: disable=missing-module-docstring
+from environs import Env
+
+env = Env()
+env.read_env()
+
+# pylint: disable=missing-class-docstring,too-few-public-methods
+class Config:
+    # Load all the required environment variables.
+    GENAPI_ES_URL = env.url("GENAPI_ES_URL").geturl()
+    GENAPI_ES_ENCODED_API_KEY = env.str("GENAPI_ES_ENCODED_API_KEY")
+    GENAPI_ES_INDEX_PREFIX = env.str("GENAPI_ES_INDEX_PREFIX")
+    GENAPI_ES_CERT_FP = env.str("GENAPI_ES_CERT_FP")
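Config reads its four values when the module is imported, via environs (env.read_env() also picks up a local .env file). A minimal sketch with illustrative values only:

    import os

    # Placeholders; set real values (or use a .env file) in deployment.
    os.environ.setdefault("GENAPI_ES_URL", "https://localhost:9200")
    os.environ.setdefault("GENAPI_ES_ENCODED_API_KEY", "bXktYXBpLWtleQ==")
    os.environ.setdefault("GENAPI_ES_INDEX_PREFIX", "genelastic")
    os.environ.setdefault("GENAPI_ES_CERT_FP", "AA:BB:CC...")

    from genelastic.api.settings import Config  # values are read at import time

    print(Config.GENAPI_ES_URL)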
genelastic-0.6.1/src/genelastic/common/__init__.py

@@ -0,0 +1,12 @@
+"""Genelastic package for common code between API and import scripts."""
+from .elastic import ElasticQueryConn, ElasticImportConn
+from .types import (BundleDict, AnalysisMetaData, BioInfoProcessData, WetProcessesData,
+                    MetadataDocument, AnalysisDocument, BulkItems, ProcessDocument, Bucket)
+from .cli import add_verbose_control_args, add_es_connection_args
+from .exceptions import DBIntegrityError
+
+__all__ = ['ElasticQueryConn', 'ElasticImportConn', 'BundleDict', 'AnalysisMetaData',
+           'BioInfoProcessData', 'WetProcessesData', 'MetadataDocument', 'AnalysisDocument',
+           'BulkItems', 'ProcessDocument', 'Bucket', 'add_verbose_control_args',
+           'add_es_connection_args', 'DBIntegrityError'
+           ]
genelastic-0.6.1/src/genelastic/common/cli.py

@@ -0,0 +1,35 @@
+"""Utility functions for CLI scripts."""
+import argparse
+
+
+def add_verbose_control_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Add verbose control arguments to the parser.
+    Arguments are added to the parser by using its reference.
+    """
+    parser.add_argument('-q', '--quiet', dest='verbose', action='store_const',
+                        const=0, default=1,
+                        help='Set verbosity to 0 (quiet mode).')
+    parser.add_argument('-v', '--verbose', dest='verbose', action='count',
+                        default=1,
+                        help=('Verbose level. -v for information, -vv for debug,' +
+                              ' -vvv for trace.'))
+
+
+def add_es_connection_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Add arguments to the parser needed to gather ElasticSearch server connection parameters.
+    Arguments are added to the parser by using its reference.
+    """
+    parser.add_argument('--es-host', dest='es_host', default='localhost',
+                        help='Address of Elasticsearch host.')
+    parser.add_argument('--es-port', type=int, default=9200, dest='es_port',
+                        help='Elasticsearch port.')
+    parser.add_argument('--es-usr', dest='es_usr', default='elastic',
+                        help='Elasticsearch user.')
+    parser.add_argument('--es-pwd', dest='es_pwd', required=True,
+                        help='Elasticsearch password.')
+    parser.add_argument('--es-cert-fp', dest='es_cert_fp',
+                        help='Elasticsearch sha256 certificate fingerprint.')
+    parser.add_argument('--es-index-prefix', dest='es_index_prefix',
+                        help='Add the given prefix to each index created during import.')
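Both helpers mutate the parser passed to them, so a script composes them before calling parse_args(). A small self-contained sketch using only what cli.py defines:

    import argparse

    from genelastic.common import add_verbose_control_args, add_es_connection_args

    parser = argparse.ArgumentParser(description='Example tool.')
    add_verbose_control_args(parser)
    add_es_connection_args(parser)

    # --es-pwd is the only required option; -vv bumps verbosity from the default 1 to 3.
    args = parser.parse_args(['--es-pwd', 'secret', '-vv'])
    print(args.verbose, args.es_host, args.es_port)  # 3 localhost 9200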
genelastic-0.6.1/src/genelastic/common/elastic.py

@@ -0,0 +1,183 @@
+# pylint: disable=missing-module-docstring
+import datetime
+import logging
+import time
+import typing
+from abc import ABC
+from typing import Any
+
+import elastic_transport
+import elasticsearch.helpers
+from elasticsearch import Elasticsearch
+
+from .exceptions import DBIntegrityError
+from .types import Bucket, BulkItems
+
+logger = logging.getLogger('genelastic')
+
+
+class ElasticConn(ABC):  # pylint: disable=too-few-public-methods
+    """Abstract class representing a connector for an Elasticsearch server."""
+    client: Elasticsearch
+
+    def __init__(self, url: str, fingerprint: str, **kwargs: Any):
+        """Initialize an elasticsearch client instance.
+
+        :url: URL of the Elasticsearch host.
+        :fingerprint: sha256 certificate fingerprint for a secure HTTP connection.
+        :returns: The configured elasticsearch client instance.
+        :raises SystemExit: If the connection to the Elasticsearch server failed.
+        """
+        try:
+            self.client = Elasticsearch(
+                url,
+                ssl_assert_fingerprint=fingerprint,
+                # Verify cert only when the fingerprint is not None.
+                verify_certs=bool(fingerprint),
+                **kwargs
+            )
+            self.client.info()
+        except (elastic_transport.TransportError, elasticsearch.AuthenticationException) as e:
+            raise SystemExit(e) from e
+
+
+class ElasticImportConn(ElasticConn):  # pylint: disable=too-few-public-methods
+    """Connector to import data into an Elasticsearch database."""
+    def import_items(self, bulk_items: BulkItems,
+                     start_time: float,
+                     total_items: int) -> None:
+        """Import items to the Elasticsearch database."""
+        if len(bulk_items) > 0:
+            elasticsearch.helpers.bulk(self.client, bulk_items)
+        elapsed = time.perf_counter() - start_time
+        logger.info("Imported %d items in %s (%f items/s).", total_items,
+                    datetime.timedelta(seconds=elapsed), total_items / elapsed)
+
+
+class ElasticQueryConn(ElasticConn):
+    """Connector to query data from an Elasticsearch database."""
+
+    def get_indices(self) -> Any | str:
+        """Return all indices."""
+        return self.client.cat.indices(format="json").body
+
+    def get_document_by_id(self, index: str, document_id: str) -> Any | str:
+        """Return a document by its ID."""
+        return self.client.get(index=index, id=document_id).body
+
+    def run_composite_aggregation(self, index: str, query: dict[str, typing.Any]) \
+            -> list[Bucket]:
+        """
+        Executes a composite aggregation on an Elasticsearch index and
+        returns all paginated results.
+
+        :param index: Name of the index to query.
+        :param query: Aggregation query to run.
+        :return: List of aggregation results.
+        """
+        # Extract the aggregation name from the query dict.
+        agg_name = next(iter(query["aggs"]))
+        all_buckets: typing.List[Bucket] = []
+
+        try:
+            logger.debug("Running composite aggregation query %s on index '%s'.", query, index)
+            response = self.client.search(index=index, body=query)
+        except elasticsearch.NotFoundError as e:
+            raise SystemExit(f"Error: {e.message} for index '{index}'.") from e
+
+        while True:
+            # Extract buckets from the response.
+            buckets: typing.List[Bucket] = response['aggregations'][agg_name]['buckets']
+            all_buckets.extend(buckets)
+
+            # Check if there are more results to fetch.
+            if 'after_key' in response['aggregations'][agg_name]:
+                after_key = response['aggregations'][agg_name]['after_key']
+                query['aggs'][agg_name]['composite']['after'] = after_key
+                try:
+                    logger.debug("Running query %s on index '%s'.", query, index)
+                    # Fetch the next page of results.
+                    response = self.client.search(index=index, body=query)
+                except elasticsearch.NotFoundError as e:
+                    raise SystemExit(f"Error: {e.message} for index '{index}'.") from e
+            else:
+                break
+
+        return all_buckets
+
+    def get_field_values(self, index: str, field_name: str) -> set[str]:
+        """Return a set of values for a given field."""
+        values = set()
+
+        query = {
+            "size": 0,
+            "aggs": {
+                "get_field_values": {
+                    "composite": {
+                        "sources": {"values": {"terms": {"field": f"{field_name}.keyword"}}},
+                        "size": 1000,
+                    }
+                }
+            }
+        }
+
+        buckets: typing.List[Bucket] = self.run_composite_aggregation(index, query)
+
+        for bucket in buckets:
+            values.add(bucket['key']['values'])
+
+        return values
+
+    def search_by_field_value(self, index: str, field: str, value: str) -> (
+            typing.Dict[str, typing.Any] | None):
+        """Search a document by a value for a certain field."""
+        logger.info("Searching for field '%s' with value '%s' inside index '%s'.",
+                    field, value, index)
+        search_query = {
+            "query": {
+                "term": {
+                    f"{field}.keyword": value,
+                }
+            }
+        }
+
+        response = self.client.search(index=index, body=search_query)
+
+        try:
+            return response['hits']['hits'][0]['_source']  # type: ignore
+        except KeyError:
+            return None
+
+    def ensure_unique(self, index: str, field: str) -> None:
+        """
+        Ensure that all values of a field in an index are unique.
+
+        :param index: Name of the index.
+        :param field: Field name to check for value uniqueness.
+        :raises genelastic.common.DBIntegrityError:
+            Some values of the given field are duplicated in the index.
+        """
+
+        logger.info("Ensuring that the field '%s' in the index '%s' only contains unique values...",
+                    field, index)
+        query = {
+            "size": 0,
+            "aggs": {
+                "duplicate_proc_ids": {
+                    "terms": {
+                        "field": f"{field}.keyword",
+                        "size": 10000,
+                        "min_doc_count": 2
+                    }
+                }
+            }
+        }
+        buckets: typing.List[Bucket] = self.run_composite_aggregation(index, query)
+        duplicated_processes: typing.Set[str] = set(map(lambda bucket: str(bucket["key"]), buckets))
+
+        if len(duplicated_processes) > 0:
+            raise DBIntegrityError(f"Found non-unique value for field {field} in index '{index}': "
+                                   f"{', '.join(duplicated_processes)}.")
+
+        logger.info("All values of field '%s' in index '%s' are unique.",
+                    field, index)
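ElasticQueryConn is exported through genelastic.common and hides the pagination of composite aggregations (1,000 buckets per page). A minimal sketch, with placeholder connection details and index name:

    from genelastic.common import ElasticQueryConn

    conn = ElasticQueryConn(
        "https://localhost:9200",    # placeholder URL
        "AA:BB:CC...",               # placeholder sha256 certificate fingerprint
        api_key="bXktYXBpLWtleQ==",  # forwarded to Elasticsearch(**kwargs)
    )

    # Collects every page of the composite aggregation on proc_id.keyword.
    proc_ids = conn.get_field_values("genelastic-wet_processes", "proc_id")
    print(sorted(proc_ids))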
genelastic-0.6.1/src/genelastic/common/types.py

@@ -0,0 +1,20 @@
+# pylint: disable=missing-module-docstring
+
+import typing
+
+Bucket: typing.TypeAlias = dict[str, dict[typing.Any, typing.Any]]
+
+AnalysisMetaData: typing.TypeAlias = typing.Dict[str, str | int]
+WetProcessesData: typing.TypeAlias = typing.Dict[str, str | int | float]
+BioInfoProcessData: typing.TypeAlias = typing.Dict[str, str | typing.List[str]]
+BundleDict: typing.TypeAlias = typing.Dict[str, typing.Any]
+
+AnalysisDocument: typing.TypeAlias = typing.Dict[str, str | None | AnalysisMetaData]
+MetadataDocument: typing.TypeAlias = typing.Dict[str, int | str | typing.List[typing.Any | None]]
+ProcessDocument: typing.TypeAlias = (typing.Dict[str, str] |
+                                     WetProcessesData |
+                                     BioInfoProcessData)
+BulkItems: typing.TypeAlias = typing.List[typing.Dict[str, str |
+                                                      MetadataDocument |
+                                                      AnalysisDocument |
+                                                      ProcessDocument]]
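BulkItems is the shape that ElasticImportConn.import_items() hands to elasticsearch.helpers.bulk, so each entry mixes bulk action metadata (keys starting with an underscore) with the document body. An illustrative sketch only; the index name, fields, and connection details are hypothetical, not taken from this package:

    import time

    from genelastic.common import ElasticImportConn

    items = [
        {"_index": "genelastic-analyses", "path": "/data/sample1.vcf"},
        {"_index": "genelastic-analyses", "path": "/data/sample2.vcf"},
    ]

    conn = ElasticImportConn("https://localhost:9200", "AA:BB:CC...")  # placeholders
    conn.import_items(items, start_time=time.perf_counter(), total_items=len(items))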
genelastic-0.6.1/src/genelastic/import_data/__init__.py

@@ -0,0 +1,9 @@
+"""Genelastic package for importing genomic data into Elasticsearch."""
+from .analysis import Analysis
+from .import_bundle_factory import (make_import_bundle_from_files,
+                                    load_import_bundle_file)
+from .tags import Tags
+from .import_bundle import ImportBundle
+
+__all__ = ['Analysis', 'Tags', 'ImportBundle', 'make_import_bundle_from_files',
+           'load_import_bundle_file']
{genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/analysis.py
RENAMED

@@ -7,7 +7,8 @@ import re
 import typing
 from pathlib import Path
 
-from .common import AnalysisMetaData
+from genelastic.common import AnalysisMetaData
+
 from .constants import ALLOWED_CATEGORIES
 from .data_file import DataFile
 from .filename_pattern import FilenamePattern
@@ -19,7 +20,7 @@ logger = logging.getLogger('genelastic')
 class Analysis:
     """Class Analysis that represents an analysis."""
 
-    # pylint: disable-next=too-many-arguments
+    # pylint: disable-next=too-many-arguments, too-many-positional-arguments
     def __init__(self,
                  tags: Tags,
                  root_dir: str = '.',
{genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/data_file.py
RENAMED

@@ -11,8 +11,10 @@ import logging
 import os
 import pathlib
 import typing
+
+from genelastic.common import AnalysisMetaData
+
 from .filename_pattern import FilenamePattern
-from .common import AnalysisMetaData
 
 logger = logging.getLogger('genelastic')
 
{genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/gen_data.py
RENAMED

@@ -8,8 +8,8 @@ import sys
 from typing import Dict, List, Sequence, Collection
 
 import yaml
-
 from genelastic.common import add_verbose_control_args
+
 from .logger import configure_logging
 
 logger = logging.getLogger('genelastic')
@@ -19,7 +19,8 @@ def read_args() -> argparse.Namespace:
     # pylint: disable=R0801
     """Read arguments from command line."""
     parser = argparse.ArgumentParser(description='Genetics data random generator.',
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+                                     allow_abbrev=False)
     add_verbose_control_args(parser)
     parser.add_argument('-d', '--data-folder', dest='data_folder', required=True,
                         help='Data destination folder.')
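The new allow_abbrev=False guards against argparse's prefix matching, which would otherwise accept any unambiguous abbreviation of a long option and could silently collide with options added later. A standalone illustration:

    import argparse

    p = argparse.ArgumentParser(allow_abbrev=False)
    p.add_argument('--data-folder')

    try:
        p.parse_args(['--data', 'x'])  # abbreviation of --data-folder
    except SystemExit:
        print("rejected: abbreviations are no longer accepted")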
{genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/import_bundle.py
RENAMED

@@ -8,9 +8,10 @@
 import sys
 import typing
 
+from genelastic.common import BundleDict
+
 from .bi_processes import BioInfoProcesses
 from .data_file import DataFile
-from .common import BundleDict
 from .constants import BUNDLE_CURRENT_VERSION
 from .analyses import Analyses
 from .tags import Tags
{genelastic-0.6.0/src/genelastic → genelastic-0.6.1/src/genelastic/import_data}/import_bundle_factory.py
RENAMED

@@ -11,8 +11,10 @@ from yaml.scanner import ScannerError
 
 import schema  # type: ignore[import-untyped]
 import yaml
+
+from genelastic.common import BundleDict
+
 from .import_bundle import ImportBundle
-from .common import BundleDict
 from .constants import BUNDLE_CURRENT_VERSION
 
 logger = logging.getLogger('genelastic')