genelastic-0.6.0-py3-none-any.whl → genelastic-0.6.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/__init__.py +0 -13
- genelastic/api/__init__.py +0 -0
- genelastic/api/extends/__init__.py +0 -0
- genelastic/api/extends/example.py +7 -0
- genelastic/api/routes.py +84 -0
- genelastic/api/server.py +72 -0
- genelastic/api/settings.py +13 -0
- genelastic/common/__init__.py +12 -0
- genelastic/common/cli.py +35 -0
- genelastic/common/elastic.py +183 -0
- genelastic/common/exceptions.py +6 -0
- genelastic/common/types.py +20 -0
- genelastic/import_data/__init__.py +9 -0
- genelastic/{analyses.py → import_data/analyses.py} +3 -1
- genelastic/{analysis.py → import_data/analysis.py} +3 -2
- genelastic/{bi_process.py → import_data/bi_process.py} +1 -1
- genelastic/{bi_processes.py → import_data/bi_processes.py} +2 -1
- genelastic/{data_file.py → import_data/data_file.py} +3 -1
- genelastic/{filename_pattern.py → import_data/filename_pattern.py} +2 -1
- genelastic/{gen_data.py → import_data/gen_data.py} +3 -2
- genelastic/{import_bundle.py → import_data/import_bundle.py} +2 -1
- genelastic/{import_bundle_factory.py → import_data/import_bundle_factory.py} +3 -1
- genelastic/{import_data.py → import_data/import_data.py} +49 -51
- genelastic/{info.py → import_data/info.py} +29 -50
- genelastic/{integrity.py → import_data/integrity.py} +53 -87
- genelastic/{tags.py → import_data/tags.py} +2 -1
- genelastic/{validate_data.py → import_data/validate_data.py} +6 -4
- genelastic/{wet_processes.py → import_data/wet_processes.py} +2 -1
- {genelastic-0.6.0.dist-info → genelastic-0.6.1.dist-info}/METADATA +7 -2
- genelastic-0.6.1.dist-info/RECORD +36 -0
- {genelastic-0.6.0.dist-info → genelastic-0.6.1.dist-info}/WHEEL +1 -1
- genelastic-0.6.1.dist-info/entry_points.txt +6 -0
- genelastic/common.py +0 -151
- genelastic-0.6.0.dist-info/RECORD +0 -25
- genelastic-0.6.0.dist-info/entry_points.txt +0 -6
- /genelastic/{constants.py → import_data/constants.py} +0 -0
- /genelastic/{logger.py → import_data/logger.py} +0 -0
- /genelastic/{wet_process.py → import_data/wet_process.py} +0 -0
- {genelastic-0.6.0.dist-info → genelastic-0.6.1.dist-info}/top_level.txt +0 -0
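Beyond the version bump, the file list shows a reorganisation of the package: the shared helpers in `genelastic/common.py` are replaced by a `genelastic/common/` subpackage (CLI helpers, Elasticsearch connection classes, shared types and exceptions), the former top-level modules move under `genelastic/import_data/`, and a new `genelastic/api/` package (routes, server, settings) is added. A minimal sketch of what this implies for import paths, using only names that appear in the hunks below (an assumption, not checked against the published wheels):

```python
# Sketch of the 0.6.0 -> 0.6.1 import-path change; all names are taken
# from the diff hunks below and describe assumed public APIs.

# 0.6.0 -- flat layout, helpers in genelastic/common.py:
# from genelastic.common import connect_to_es, run_composite_aggregation
# from genelastic.data_file import DataFile

# 0.6.1 -- subpackages, connection logic moved into classes:
from genelastic.common import ElasticImportConn, ElasticQueryConn, Bucket
from genelastic.import_data.data_file import DataFile
```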
genelastic/{import_data.py → import_data/import_data.py}

@@ -8,23 +8,22 @@
 import argparse
 import csv
 import datetime
+import hashlib
 import logging
 import os
 import sys
 import time
-import
+import vcf # type: ignore
 
-import
-
-
+from genelastic.common import (add_verbose_control_args, add_es_connection_args,
+                               ElasticImportConn, MetadataDocument, AnalysisDocument,
+                               BulkItems, ProcessDocument)
 
-from
-from
-from . import make_import_bundle_from_files
+from .import_bundle_factory import make_import_bundle_from_files
+from .bi_processes import BioInfoProcesses
 from .data_file import DataFile
 from .logger import configure_logging
-from .
-    add_verbose_control_args, add_es_connection_args, connect_to_es)
+from .wet_processes import WetProcesses
 
 logger = logging.getLogger('genelastic')
 logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
@@ -35,7 +34,8 @@ def read_args() -> argparse.Namespace:
     # pylint: disable=R0801
     """Read arguments from command line."""
     parser = argparse.ArgumentParser(description='Genetics data importer.',
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+                                     allow_abbrev=False)
     add_verbose_control_args(parser)
     add_es_connection_args(parser)
     parser.add_argument('-D', '--dry-run', dest='dryrun', action='count',
@@ -56,14 +56,14 @@ def read_args() -> argparse.Namespace:
     return args
 
 
-def import_cov_file(es: elasticsearch.Elasticsearch | None,
+def import_cov_file(es_import_conn: ElasticImportConn | None,
                     file_index: str, file: str, dryrun: int = 0) -> None:
     """Import a coverage file to the Elasticsearch database."""
     # Set field types
-    if dryrun == 0 and
-
-
-
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.client.indices.put_mapping(index=file_index,
+                                                  body={'properties': {'pos': {'type': 'integer'},
+                                                                       'depth': {'type': 'byte'}}})
 
     # Open file
     if dryrun > 1:
@@ -93,24 +93,12 @@ def import_cov_file(es: elasticsearch.Elasticsearch | None,
         }
 
         # Insert document
-        if dryrun == 0 and
-
-
+        if dryrun == 0 and es_import_conn:
+            es_import_conn.client.index(index=file_index, document=doc)
 
-def import_items(es: elasticsearch.Elasticsearch | None,
-                 bulk_items: BulkItems,
-                 start_time: float,
-                 total_items: int) -> None:
-    """Import items to the Elasticsearch database."""
-    if len(bulk_items) > 0 and es:
-        elasticsearch.helpers.bulk(es, bulk_items)
-    elapsed = time.perf_counter() - start_time
-    logger.info("Imported %d items in %s (%f items/s).", total_items,
-                datetime.timedelta(seconds=elapsed), total_items / elapsed)
 
-
-
-def import_analysis_metadata(es: elasticsearch.Elasticsearch | None,
+# pylint: disable-next=too-many-arguments, too-many-positional-arguments
+def import_analysis_metadata(es_import_conn: ElasticImportConn | None,
                              index_prefix: str,
                              file_index: str,
                              file: DataFile,
@@ -129,12 +117,14 @@ def import_analysis_metadata(es: elasticsearch.Elasticsearch | None,
         {"_index": f"{index_prefix}-analyses", "_source": doc}
     ]
 
-    if dryrun == 0:
-
-
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.import_items(bulk_items,
+                                    start_time=time.perf_counter(),
+                                    total_items=len(bulk_items)
+                                    )
 
 
-def import_vcf_file(es: elasticsearch.Elasticsearch | None,
+def import_vcf_file(es_import_conn: ElasticImportConn | None,
                     file_index: str,
                     file: DataFile,
                     dryrun: int = 0) -> None:
@@ -184,20 +174,20 @@ def import_vcf_file(es: elasticsearch.Elasticsearch | None,
             # resp = es.index(index=index, document=doc)
 
             # Insert bulk of items
-            if len(bulk_items) >= bulk_sz:
-                import_items(
-
+            if len(bulk_items) >= bulk_sz and es_import_conn:
+                es_import_conn.import_items(bulk_items, start_time=start,
+                                            total_items=n)
                 bulk_items = []
 
         # Insert remaining items
-        if dryrun == 0:
-            import_items(
+        if dryrun == 0 and es_import_conn:
+            es_import_conn.import_items(bulk_items, start_time=start, total_items=n)
 
     except StopIteration:
         logger.error('Skipping empty file : %s.', file.path)
 
 
-def import_processes(es: elasticsearch.Elasticsearch | None, index: str,
+def import_processes(es_import_conn: ElasticImportConn | None, index: str,
                      processes: WetProcesses | BioInfoProcesses, dryrun: int = 0) -> None:
     """Import processes into their own index."""
 
@@ -209,9 +199,11 @@ def import_processes(es: elasticsearch.Elasticsearch | None, index: str,
         doc: ProcessDocument = process.data | {'proc_id': proc_id, 'type': process_type}
         bulk_items.append({"_index": index, "_source": doc})
 
-    if dryrun == 0:
-
-
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.import_items(bulk_items,
+                                    start_time=time.perf_counter(),
+                                    total_items=len(bulk_items)
+                                    )
 
 
 def generate_unique_index(index_prefix: str, filepath: str) -> str:
@@ -235,10 +227,13 @@ def main() -> None:
     logger.debug("LOGGERS: %s", logging.root.manager.loggerDict) # pylint: disable=no-member
 
     # Open connection to ES
-    es = None
     if args.dryrun == 0:
-
-
+        addr = f"https://{args.es_host}:{args.es_port}"
+        logger.info("Trying to connect to Elasticsearch at %s...", addr)
+        es_import_conn = ElasticImportConn(addr, args.es_cert_fp,
+                                           basic_auth=(args.es_usr, args.es_pwd))
+    else:
+        es_import_conn = None
 
     # Create index
     # es.indices.create(index=args.es_index_prefix)
@@ -275,19 +270,22 @@ def main() -> None:
             # First, generate a unique index name for each file.
             file_index = generate_unique_index(args.es_index_prefix, f.path)
             # Then, import the analysis metadata into a dedicated index.
-            import_analysis_metadata(
+            import_analysis_metadata(es_import_conn, args.es_index_prefix,
+                                     file_index, f, cat, args.dryrun)
             # Finally, import the file in its own index.
-            globals()[f'import_{cat}_file'](
+            globals()[f'import_{cat}_file'](es_import_conn=es_import_conn,
                                             file_index=file_index, file=f, dryrun=args.dryrun)
 
     # Import processes
     logger.info("Importing wet processes.")
     logger.info("Wet processes IDs = %s", str(import_bundle.wet_processes.get_process_ids()))
-    import_processes(
+    import_processes(es_import_conn, f"{args.es_index_prefix}-wet_processes",
+                     import_bundle.wet_processes)
 
     logger.info("Importing bio info processes.")
     logger.info("Bio info processes IDs = %s", str(import_bundle.bi_processes.get_process_ids()))
-    import_processes(
+    import_processes(es_import_conn, f"{args.es_index_prefix}-bi_processes",
+                     import_bundle.bi_processes)
 
 
 if __name__ == '__main__':
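The hunks above show the importer dropping the raw `elasticsearch.Elasticsearch` handle and the module-level `import_items()` helper in favour of an `ElasticImportConn` object from `genelastic.common`, which exposes the underlying client as `.client` and owns the bulk-import logic. A rough sketch of the shape these call sites imply is given below; the constructor arguments, TLS handling and logging are assumptions, and the real class lives in `genelastic/common/elastic.py`:

```python
# Hypothetical reconstruction of ElasticImportConn, inferred only from the
# call sites in the diff above and from the removed import_items() helper;
# it is not the actual genelastic.common.elastic implementation.
import datetime
import logging
import time
import typing

import elasticsearch
import elasticsearch.helpers

logger = logging.getLogger('genelastic')

# BulkItems comes from genelastic.common in the diff; a plain alias is
# assumed here so the sketch stays self-contained.
BulkItems = typing.List[typing.Dict[str, typing.Any]]


class ElasticImportConn:
    """Owns the Elasticsearch client plus the bulk-import helper."""

    def __init__(self, addr: str, cert_fp: str,
                 basic_auth: typing.Tuple[str, str]) -> None:
        # The importer reaches the raw client through `.client`
        # (put_mapping() and index() calls in the diff above).
        self.client = elasticsearch.Elasticsearch(
            addr, ssl_assert_fingerprint=cert_fp, basic_auth=basic_auth)

    def import_items(self, bulk_items: BulkItems,
                     start_time: float, total_items: int) -> None:
        """Bulk-import items and log throughput, mirroring the removed helper."""
        if len(bulk_items) > 0:
            elasticsearch.helpers.bulk(self.client, bulk_items)
        elapsed = time.perf_counter() - start_time
        logger.info("Imported %d items in %s (%f items/s).", total_items,
                    datetime.timedelta(seconds=elapsed), total_items / elapsed)
```

Guarding each call site with `if dryrun == 0 and es_import_conn:` also lets a type checker narrow `ElasticImportConn | None` before `.client` or `.import_items()` is used.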
genelastic/{info.py → import_data/info.py}

@@ -3,16 +3,13 @@ import argparse
 import logging
 import typing
 
-import
-
+from genelastic.common import (ElasticQueryConn, add_verbose_control_args,
+                               add_es_connection_args, Bucket)
 
 from .logger import configure_logging
-from .common import (add_es_connection_args, connect_to_es, add_verbose_control_args, Bucket,
-                     run_composite_aggregation, get_process_ids)
 
 logger = logging.getLogger('genelastic')
 logging.getLogger('elastic_transport').setLevel(logging.WARNING) # Disable excessive logging
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 
 def read_args() -> argparse.Namespace:
@@ -36,7 +33,7 @@ def read_args() -> argparse.Namespace:
     return parser.parse_args()
 
 
-def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_bundles(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all imported YAML bundles."""
 
     query = {
@@ -51,7 +48,7 @@ def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
         }
     }
 
-    buckets: typing.List[Bucket] = run_composite_aggregation(
+    buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
 
     print("Imported YAML files")
     print("===================")
@@ -66,7 +63,7 @@ def list_bundles(es: elasticsearch.Elasticsearch, index: str) -> None:
     print()
 
 
-def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all imported data files."""
 
     query = {
@@ -81,7 +78,7 @@ def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
         }
     }
 
-    buckets: typing.List[Bucket] = run_composite_aggregation(
+    buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
 
     print("Imported data files")
     print("===================")
@@ -96,9 +93,9 @@ def list_data_files(es: elasticsearch.Elasticsearch, index: str) -> None:
     print()
 
 
-def list_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all processes."""
-    process_ids =
+    process_ids = es_query_conn.get_field_values(index, "proc_id")
 
     if len(process_ids) == 0:
         print("Empty response.", end="\n")
@@ -109,43 +106,21 @@ def list_processes(es: elasticsearch.Elasticsearch, index: str) -> None:
     print()
 
 
-def list_wet_processes(
+def list_wet_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all wet processes."""
     print("Imported wet processes")
     print("======================")
-    list_processes(
+    list_processes(es_query_conn, index)
 
 
-def list_bi_processes(
+def list_bi_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all bio info processes."""
     print("Imported bi processes")
     print("=====================")
-    list_processes(
-
-
-def search_doc_by_field_value(es: elasticsearch.Elasticsearch,
-                              index: str, field: str, value: str) -> (
-        typing.Dict[str, typing.Any] | None):
-    """Search a document by a value for a certain field."""
-    logger.info("Searching for field '%s' with value '%s' inside index '%s'.",
-                field, value, index)
-    search_query = {
-        "query": {
-            "term": {
-                f"{field}.keyword": value,
-            }
-        }
-    }
+    list_processes(es_query_conn, index)
 
-    response = es.search(index=index, body=search_query)
 
-
-        return response['hits']['hits'][0]['_source'] # type: ignore
-    except KeyError:
-        return None
-
-
-def list_data_files_per_bundle(es: elasticsearch.Elasticsearch, index: str) -> None:
+def list_data_files_per_bundle(es_query_conn: ElasticQueryConn, index: str) -> None:
     """For each imported YAML bundle, display some info and list its data files."""
     query = {
        "size": 0,
@@ -174,7 +149,7 @@ def list_data_files_per_bundle(es: elasticsearch.Elasticsearch, index: str) -> N
         }
     }
 
-    buckets: typing.List[Bucket] = run_composite_aggregation(
+    buckets: typing.List[Bucket] = es_query_conn.run_composite_aggregation(index, query)
 
     print("Data files per YAML bundle")
     print("==========================")
@@ -207,7 +182,11 @@ def main() -> None:
 
     configure_logging(args.verbose)
     logger.debug("Arguments: %s", args)
-
+
+    addr = f"https://{args.es_host}:{args.es_port}"
+    logger.info("Trying to connect to Elasticsearch at %s...", addr)
+    es_query_conn = ElasticQueryConn(addr, args.es_cert_fp,
+                                     basic_auth=(args.es_usr, args.es_pwd))
 
     analysis_index = f"{args.es_index_prefix}-analyses"
     wet_processes_index = f"{args.es_index_prefix}-wet_processes"
@@ -216,32 +195,32 @@ def main() -> None:
     list_call_count = 0
 
     if args.list_bundles:
-        list_bundles(
+        list_bundles(es_query_conn, analysis_index)
         list_call_count += 1
 
     if args.list_data_files:
-        list_data_files(
+        list_data_files(es_query_conn, analysis_index)
         list_call_count += 1
 
     if args.list_wet_processes:
-        list_wet_processes(
+        list_wet_processes(es_query_conn, wet_processes_index)
         list_call_count += 1
 
     if args.list_bi_processes:
-        list_bi_processes(
+        list_bi_processes(es_query_conn, bi_processes_index)
         list_call_count += 1
 
     if args.list_data_files_per_bundle:
-        list_data_files_per_bundle(
+        list_data_files_per_bundle(es_query_conn, analysis_index)
         list_call_count += 1
 
     if list_call_count == 0:
         logger.debug("No list option specified, listing everything.")
-        list_bundles(
-        list_data_files(
-        list_wet_processes(
-        list_bi_processes(
-        list_data_files_per_bundle(
+        list_bundles(es_query_conn, analysis_index)
+        list_data_files(es_query_conn, analysis_index)
+        list_wet_processes(es_query_conn, wet_processes_index)
+        list_bi_processes(es_query_conn, bi_processes_index)
+        list_data_files_per_bundle(es_query_conn, analysis_index)
 
 
 if __name__ == '__main__':
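Likewise, the info command now routes every query through an `ElasticQueryConn`. The call sites above pin down two methods: `run_composite_aggregation(index, query)`, returning `typing.List[Bucket]`, and `get_field_values(index, "proc_id")`, which replaces the removed `get_process_ids()` helper. A rough sketch of such a class follows, with the aggregation key and the terms-aggregation strategy as assumptions (the actual implementation lives in `genelastic/common/elastic.py`):

```python
# Hypothetical sketch of ElasticQueryConn based on the call sites above; the
# aggregation key ("agg") and the terms aggregation in get_field_values() are
# assumptions, not the actual genelastic implementation.
import typing

import elasticsearch

# Stand-in for the Bucket type exported by genelastic.common (assumption).
Bucket = typing.Dict[str, typing.Any]


class ElasticQueryConn:
    """Read-only Elasticsearch connection used by the info command."""

    def __init__(self, addr: str, cert_fp: str,
                 basic_auth: typing.Tuple[str, str]) -> None:
        self.client = elasticsearch.Elasticsearch(
            addr, ssl_assert_fingerprint=cert_fp, basic_auth=basic_auth)

    def run_composite_aggregation(self, index: str,
                                  query: typing.Dict[str, typing.Any]) -> typing.List[Bucket]:
        """Run a composite aggregation and return its buckets (single page only)."""
        response = self.client.search(index=index, body=query)
        return list(response["aggregations"]["agg"]["buckets"])

    def get_field_values(self, index: str, field: str) -> typing.List[str]:
        """Collect the distinct values of a keyword field via a terms aggregation."""
        body = {"size": 0,
                "aggs": {"values": {"terms": {"field": f"{field}.keyword",
                                              "size": 10000}}}}
        response = self.client.search(index=index, body=body)
        return [bucket["key"] for bucket in response["aggregations"]["values"]["buckets"]]
```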