genelastic 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- genelastic/api/extends/example.py +2 -3
- genelastic/api/routes.py +160 -23
- genelastic/api/server.py +30 -22
- genelastic/api/settings.py +3 -2
- genelastic/common/__init__.py +36 -9
- genelastic/common/cli.py +51 -23
- genelastic/common/elastic.py +80 -49
- genelastic/common/exceptions.py +0 -2
- genelastic/common/types.py +20 -15
- genelastic/import_data/__init__.py +23 -5
- genelastic/import_data/analyses.py +17 -20
- genelastic/import_data/analysis.py +69 -65
- genelastic/import_data/bi_process.py +7 -5
- genelastic/import_data/bi_processes.py +8 -8
- genelastic/import_data/cli_gen_data.py +116 -0
- genelastic/import_data/cli_import.py +379 -0
- genelastic/import_data/{info.py → cli_info.py} +104 -75
- genelastic/import_data/cli_integrity.py +384 -0
- genelastic/import_data/cli_validate.py +54 -0
- genelastic/import_data/constants.py +11 -32
- genelastic/import_data/data_file.py +23 -20
- genelastic/import_data/filename_pattern.py +26 -32
- genelastic/import_data/import_bundle.py +56 -47
- genelastic/import_data/import_bundle_factory.py +166 -158
- genelastic/import_data/logger.py +22 -18
- genelastic/import_data/random_bundle.py +402 -0
- genelastic/import_data/tags.py +46 -26
- genelastic/import_data/wet_process.py +8 -4
- genelastic/import_data/wet_processes.py +13 -8
- genelastic/ui/__init__.py +0 -0
- genelastic/ui/server.py +87 -0
- genelastic/ui/settings.py +11 -0
- genelastic-0.7.0.dist-info/METADATA +105 -0
- genelastic-0.7.0.dist-info/RECORD +40 -0
- {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
- genelastic-0.7.0.dist-info/entry_points.txt +6 -0
- genelastic/import_data/gen_data.py +0 -194
- genelastic/import_data/import_data.py +0 -292
- genelastic/import_data/integrity.py +0 -290
- genelastic/import_data/validate_data.py +0 -43
- genelastic-0.6.1.dist-info/METADATA +0 -41
- genelastic-0.6.1.dist-info/RECORD +0 -36
- genelastic-0.6.1.dist-info/entry_points.txt +0 -6
- {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0

genelastic/import_data/{info.py → cli_info.py}

```diff
@@ -1,71 +1,100 @@
-# pylint: disable=missing-module-docstring
 import argparse
 import logging
-import typing
 
-from genelastic.common import (
-
+from genelastic.common import (
+    Bucket,
+    ElasticQueryConn,
+    add_es_connection_args,
+    add_verbose_control_args,
+)
 
 from .logger import configure_logging
 
-logger = logging.getLogger(
-logging.getLogger(
+logger = logging.getLogger("genelastic")
+logging.getLogger("elastic_transport").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
 
 
 def read_args() -> argparse.Namespace:
     """Read arguments from command line."""
-    parser = argparse.ArgumentParser(
-
-
+    parser = argparse.ArgumentParser(
+        description="ElasticSearch database info.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
     add_verbose_control_args(parser)
     add_es_connection_args(parser)
-    parser.add_argument(
-
-
-
-
-
-    parser.add_argument(
-
-
-
-
+    parser.add_argument(
+        "-y",
+        "--list-bundles",
+        action="store_true",
+        help="List all imported YAML bundles.",
+    )
+    parser.add_argument(
+        "-f",
+        "--list-data-files",
+        action="store_true",
+        help="List all imported data files.",
+    )
+    parser.add_argument(
+        "-w",
+        "--list-wet-processes",
+        action="store_true",
+        help="List all imported wet processes.",
+    )
+    parser.add_argument(
+        "-b",
+        "--list-bi-processes",
+        action="store_true",
+        help="List all imported bio info processes.",
+    )
+    parser.add_argument(
+        "-Y",
+        "--list-data-files-per-bundle",
+        action="store_true",
+        help="For each imported YAML bundle, "
+        "display some info and list its data files.",
+    )
     return parser.parse_args()
 
 
 def list_bundles(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all imported YAML bundles."""
-
     query = {
         "size": 0,
         "aggs": {
             "get_bundle_paths": {
                 "composite": {
-                    "sources": {
+                    "sources": {
+                        "bundle_path": {
+                            "terms": {"field": "bundle_path.keyword"}
+                        }
+                    },
                     "size": 1000,
                 }
             }
-        }
+        },
     }
 
-    buckets:
+    buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
+        index, query
+    )
 
-
-
+    logger.info("Imported YAML files")
+    logger.info("===================")
 
     if len(buckets) == 0:
-
+        logger.info("Empty response.")
         return
 
     for bucket in buckets:
-        bundle_path = bucket[
-
-        print()
+        bundle_path = bucket["key"]["bundle_path"]
+        logger.info("- %s", bundle_path)
 
 
 def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all imported data files."""
-
     query = {
         "size": 0,
         "aggs": {
@@ -75,22 +104,23 @@ def list_data_files(es_query_conn: ElasticQueryConn, index: str) -> None:
                     "size": 1000,
                 }
             }
-        }
+        },
     }
 
-    buckets:
+    buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
+        index, query
+    )
 
-
-
+    logger.info("Imported data files")
+    logger.info("===================")
 
     if len(buckets) == 0:
-
+        logger.info("Empty response.")
         return
 
     for bucket in buckets:
-        bundle_path = bucket[
-
-        print()
+        bundle_path = bucket["key"]["path"]
+        logger.info("- %s", bundle_path)
 
 
 def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
@@ -98,29 +128,30 @@ def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     process_ids = es_query_conn.get_field_values(index, "proc_id")
 
     if len(process_ids) == 0:
-
+        logger.info("Empty response.")
         return
 
     for process_id in process_ids:
-
-        print()
+        logger.info("- %s", process_id)
 
 
 def list_wet_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all wet processes."""
-
-
+    logger.info("Imported wet processes")
+    logger.info("======================")
     list_processes(es_query_conn, index)
 
 
 def list_bi_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
     """List all bio info processes."""
-
-
+    logger.info("Imported bi processes")
+    logger.info("=====================")
     list_processes(es_query_conn, index)
 
 
-def list_data_files_per_bundle(
+def list_data_files_per_bundle(
+    es_query_conn: ElasticQueryConn, index: str
+) -> None:
     """For each imported YAML bundle, display some info and list its data files."""
     query = {
         "size": 0,
@@ -130,50 +161,47 @@ def list_data_files_per_bundle(es_query_conn: ElasticQueryConn, index: str) -> N
                     "sources": [
                         {
                             "bundle_path": {
-                                "terms": {
-                                    "field": "bundle_path.keyword"
-                                }
+                                "terms": {"field": "bundle_path.keyword"}
                             }
                         }
                     ],
-                    "size": 100
+                    "size": 100,
                 },
-                "aggs": {
-                    "docs": {
-                        "top_hits": {
-                            "size": 100
-                        }
-                    }
-                }
+                "aggs": {"docs": {"top_hits": {"size": 100}}},
             }
-        }
+        },
     }
 
-    buckets:
+    buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
+        index, query
+    )
 
-
-
+    logger.info("Data files per YAML bundle")
+    logger.info("==========================")
 
    if len(buckets) == 0:
-
+        logger.info("Empty response.")
        return
 
     for bucket in buckets:
-
         documents = bucket["docs"]["hits"]["hits"]
         if len(documents) == 0:
             continue
 
-
-
-
-
+        logger.info("- Bundle Path: %s", bucket["key"]["bundle_path"])
+        logger.info(
+            " -> Wet process: %s",
+            documents[0]["_source"]["metadata"]["wet_process"],
+        )
+        logger.info(
+            " -> Bio info process: %s",
+            documents[0]["_source"]["metadata"]["bi_process"],
+        )
+        logger.info(" -> Data files:")
 
         for doc in documents:
-
-
-
-            print()
+            logger.info(" - Index: %s", doc["_source"]["file_index"])
+            logger.info(" Path: %s", doc["_source"]["path"])
 
 
 def main() -> None:
@@ -185,8 +213,9 @@ def main() -> None:
 
     addr = f"https://{args.es_host}:{args.es_port}"
     logger.info("Trying to connect to Elasticsearch at %s...", addr)
-    es_query_conn = ElasticQueryConn(
-
+    es_query_conn = ElasticQueryConn(
+        addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
+    )
 
     analysis_index = f"{args.es_index_prefix}-analyses"
     wet_processes_index = f"{args.es_index_prefix}-wet_processes"
@@ -223,5 +252,5 @@ def main() -> None:
     list_data_files_per_bundle(es_query_conn, analysis_index)
 
 
-if __name__ ==
+if __name__ == "__main__":
     main()
```
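The listing commands above now go through `ElasticQueryConn.run_composite_aggregation`, a helper that lives in `genelastic/common/elastic.py` and whose body is not part of this section. For orientation only, here is a minimal sketch, under the assumption that the helper pages through composite-aggregation buckets with `after_key`; the names and signature are illustrative, not the package's actual code:

```python
"""Minimal sketch of a composite-aggregation pager (assumed, not genelastic's actual code)."""
from typing import Any

from elasticsearch import Elasticsearch

Bucket = dict[str, Any]  # stand-in for genelastic.common.Bucket


def run_composite_aggregation(
    client: Elasticsearch, index: str, query: dict[str, Any]
) -> list[Bucket]:
    """Collect every bucket of the single composite aggregation in `query`,
    following `after_key` until Elasticsearch returns an empty page."""
    agg_name = next(iter(query["aggs"]))
    buckets: list[Bucket] = []
    while True:
        # Classic request-body form of the search call.
        resp = client.search(index=index, body=query)
        agg = resp["aggregations"][agg_name]
        buckets.extend(agg["buckets"])
        if not agg["buckets"] or "after_key" not in agg:
            break
        # Resume on the next page, right after the last composite key returned.
        query["aggs"][agg_name]["composite"]["after"] = agg["after_key"]
    return buckets
```

If the real helper paginates this way, the `"size": 1000` and `"size": 100` values in the queries above only bound a single page rather than the total number of buckets the commands can list.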
genelastic/import_data/cli_integrity.py (new file)

```diff
@@ -0,0 +1,384 @@
+import argparse
+import logging
+
+from elasticsearch import NotFoundError
+
+from genelastic.common import (
+    Bucket,
+    DBIntegrityError,
+    ElasticQueryConn,
+    add_es_connection_args,
+    add_verbose_control_args,
+)
+
+from .logger import configure_logging
+
+logger = logging.getLogger("genelastic")
+logging.getLogger("elastic_transport").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
+
+
+def read_args() -> argparse.Namespace:
+    """Read arguments from command line."""
+    parser = argparse.ArgumentParser(
+        description="Utility to check the integrity "
+        "of the genelastic ElasticSearch database.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    add_verbose_control_args(parser)
+    add_es_connection_args(parser)
+    return parser.parse_args()
+
+
+def check_for_undefined_file_indices(
+    es_query_conn: ElasticQueryConn, analyses_index: str
+) -> None:
+    """Check for potentially undefined files indices in the analyses index.
+
+    :param es_query_conn: Elasticsearch database instance.
+    :param analyses_index: Name of the index where analyses are stored.
+    :raises genelastic.common.DBIntegrityError:
+        Some files indices are used in the analyses index but are undefined.
+    """
+    logger.info(
+        "Checking for references to undefined file indices in the index '%s'...",
+        analyses_index,
+    )
+
+    undefined_indices = set()
+
+    query = {
+        "size": 0,
+        "aggs": {
+            "get_file_indices": {
+                "composite": {
+                    "sources": {
+                        "file_index": {"terms": {"field": "file_index.keyword"}}
+                    },
+                    "size": 1000,
+                }
+            }
+        },
+    }
+
+    buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
+        analyses_index, query
+    )
+
+    for bucket in buckets:
+        file_index = bucket["key"]["file_index"]
+
+        try:
+            es_query_conn.client.indices.get(index=file_index)
+            logger.debug(
+                "File index %s used in index '%s' is defined.",
+                file_index,
+                analyses_index,
+            )
+        except NotFoundError:
+            logger.debug(
+                "File index %s used in '%s' is undefined.",
+                file_index,
+                analyses_index,
+            )
+            undefined_indices.add(file_index)
+
+    if len(undefined_indices) > 0:
+        msg = (
+            f"Found the following undefined file indices defined in the index '{analyses_index}': "
+            f"{', '.join(undefined_indices)}"
+        )
+        raise DBIntegrityError(msg)
+
+    logger.info("All defined file indices are referenced.")
+
+
+def get_undefined_processes(
+    es_query_conn: ElasticQueryConn,
+    analyses_index: str,
+    process_index: str,
+    field: str,
+) -> set[str]:
+    """Return a set of undefined processes IDs in an index.
+
+    :param es_query_conn: Elasticsearch database instance.
+    :param analyses_index: Name of the index where analyses are stored.
+    :param process_index: Name of the index to check for undefined processes.
+    :param field: Field name used to retrieve the process ID.
+    :returns: A set of undefined processes IDs.
+    """
+    query = {
+        "size": 0,
+        "aggs": {
+            "get_analyses_processes": {
+                "composite": {
+                    "sources": {
+                        "process": {"terms": {"field": f"{field}.keyword"}}
+                    },
+                    "size": 1000,
+                }
+            }
+        },
+    }
+
+    buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
+        analyses_index, query
+    )
+
+    used_processes = {bucket["key"]["process"] for bucket in buckets}
+    logger.debug(
+        "Used values for field '%s' in index '%s': %s",
+        field,
+        analyses_index,
+        used_processes,
+    )
+
+    defined_processes = es_query_conn.get_field_values(process_index, "proc_id")
+    logger.debug(
+        "Defined values in index '%s': %s", process_index, defined_processes
+    )
+
+    return used_processes.difference(defined_processes)
+
+
+def check_for_undefined_wet_processes(
+    es_query_conn: ElasticQueryConn, analyses_index: str, wet_process_index: str
+) -> None:
+    """Check that each wet process used in the analyses index is defined.
+
+    :param es_query_conn: Elasticsearch database instance.
+    :param analyses_index: Name of the index where analyses are stored.
+    :param wet_process_index: Name of the index where wet processes are stored.
+    :raises genelastic.common.DBIntegrityError:
+        Some wet processes used in the analyses index are undefined.
+    """
+    logger.info(
+        "Checking for undefined wet processes used in index '%s'...",
+        analyses_index,
+    )
+    undefined_wet_processes = get_undefined_processes(
+        es_query_conn, analyses_index, wet_process_index, "metadata.wet_process"
+    )
+
+    if len(undefined_wet_processes) > 0:
+        msg = (
+            f"Index '{analyses_index}' uses the following undefined wet processes: "
+            f"{', '.join(undefined_wet_processes)}."
+        )
+        raise DBIntegrityError(msg)
+
+    logger.info(
+        "All wet processes used in index '%s' are defined.", wet_process_index
+    )
+
+
+def check_for_undefined_bi_processes(
+    es_query_conn: ElasticQueryConn, analyses_index: str, bi_process_index: str
+) -> None:
+    """Check that each bio info process used in the analyses index is defined.
+
+    :param es_query_conn: Elasticsearch database instance.
+    :param analyses_index: Name of the index where analyses are stored.
+    :param bi_process_index: Name of the index where bio info processes are stored.
+    :raises genelastic.common.DBIntegrityError:
+        Some bio info processes used in the analyses index are undefined.
+    """
+    logger.info(
+        "Checking for undefined bio info processes used in index '%s'...",
+        analyses_index,
+    )
+    undefined_bi_processes = get_undefined_processes(
+        es_query_conn, analyses_index, bi_process_index, "metadata.bi_process"
+    )
+
+    if len(undefined_bi_processes) > 0:
+        msg = (
+            f"Index '{analyses_index}' uses the following undefined bio info processes: "
+            f"{', '.join(undefined_bi_processes)}."
+        )
+        raise DBIntegrityError(msg)
+
+    logger.info(
+        "All bio info processes used in index '%s' are defined.",
+        bi_process_index,
+    )
+
+
+def check_for_unused_file_indices(
+    es_query_conn: ElasticQueryConn, analyses_index: str, index_prefix: str
+) -> int:
+    """Check that each of the file indices are used in at least one analysis.
+
+    :param es_query_conn: Elasticsearch database instance.
+    :param analyses_index: Name of the index where analyses are stored.
+    :param index_prefix: Prefix given to all the indices of the ElasticSearch database.
+    :returns: 1 if some file indices exists but are unused in the analyses index,
+        and 0 otherwise.
+    """
+    json_indices = es_query_conn.client.cat.indices(
+        index=f"{index_prefix}-file-*", format="json"
+    ).body
+
+    found_file_indices = set()
+    for x in json_indices:
+        if isinstance(x, dict):
+            found_file_indices.add(x["index"])
+
+    query = {
+        "size": 0,
+        "aggs": {
+            "get_file_indices": {
+                "composite": {
+                    "sources": {
+                        "file_index": {"terms": {"field": "file_index.keyword"}}
+                    },
+                    "size": 1000,
+                }
+            }
+        },
+    }
+
+    buckets: list[Bucket] = es_query_conn.run_composite_aggregation(
+        analyses_index, query
+    )
+
+    used_files_indices = {bucket["key"]["file_index"] for bucket in buckets}
+    unused_files_indices = found_file_indices.difference(used_files_indices)
+
+    if len(unused_files_indices) > 0:
+        logger.warning(
+            "Found the following unused files indices: %s",
+            ", ".join(unused_files_indices),
+        )
+        return 1
+
+    logger.info("All files indices are used.")
+    return 0
+
+
+def check_for_unused_wet_processes(
+    es_query_conn: ElasticQueryConn, analyses_index: str, wet_proc_index: str
+) -> int:
+    """Check for defined wet processes that are not used in the analyses index.
+
+    :param es_query_conn: Elasticsearch database instance.
+    :param analyses_index: Name of the index where analyses are stored.
+    :param wet_proc_index: Name of the index where wet processes are stored.
+    :returns: 1 if some wet process are defined but unused in the analyses index,
+        and 0 otherwise.
+    """
+    logger.info(
+        "Checking for unused wet processes in the index '%s'...", wet_proc_index
+    )
+
+    defined_wet_procs = es_query_conn.get_field_values(
+        wet_proc_index, "proc_id"
+    )
+    logger.debug(
+        "Found the following defined wet processes: %s", defined_wet_procs
+    )
+
+    used_wet_procs = es_query_conn.get_field_values(
+        analyses_index, "metadata.wet_process"
+    )
+    logger.debug(
+        "Following processes are used in the index '%s': %s",
+        analyses_index,
+        used_wet_procs,
+    )
+
+    unused_wet_procs = defined_wet_procs - used_wet_procs
+    if len(unused_wet_procs) > 0:
+        logger.warning("Found unused wet processes: %s", unused_wet_procs)
+        return 1
+
+    logger.info("No unused wet processes found.")
+    return 0
+
+
+def check_for_unused_bi_processes(
+    es_query_conn: ElasticQueryConn, analyses_index: str, bi_proc_index: str
+) -> int:
+    """Check for defined bio info processes that are not used in the analyses index.
+
+    :param es_query_conn: Elasticsearch database instance.
+    :param analyses_index: Name of the index where analyses are stored.
+    :param bi_proc_index: Name of the index where bio info processes are stored.
+    :returns: 1 if some wet process are defined but unused in the analyses index,
+        and 0 otherwise.
+    """
+    logger.info(
+        "Checking for unused bio info processes in the index '%s'...",
+        bi_proc_index,
+    )
+
+    defined_bi_procs = es_query_conn.get_field_values(bi_proc_index, "proc_id")
+    logger.debug(
+        "Found the following defined bio info processes: %s", defined_bi_procs
+    )
+
+    used_bi_procs = es_query_conn.get_field_values(
+        analyses_index, "metadata.bi_process"
+    )
+    logger.debug(
+        "Following processes are used in the index '%s': %s",
+        analyses_index,
+        used_bi_procs,
+    )
+
+    unused_bi_procs = defined_bi_procs - used_bi_procs
+    if len(unused_bi_procs) > 0:
+        logger.warning("Found unused bio info processes: %s", unused_bi_procs)
+        return 1
+
+    logger.info("No unused bio info processes found.")
+    return 0
+
+
+def main() -> None:
+    """Entry point of the integrity script."""
+    args = read_args()
+
+    configure_logging(args.verbose)
+    logger.debug("Arguments: %s", args)
+
+    analyses_index = f"{args.es_index_prefix}-analyses"
+    wet_processes_index = f"{args.es_index_prefix}-wet_processes"
+    bi_processes_index = f"{args.es_index_prefix}-bi_processes"
+
+    addr = f"https://{args.es_host}:{args.es_port}"
+    logger.info("Trying to connect to Elasticsearch at %s...", addr)
+    es_query_conn = ElasticQueryConn(
+        addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
+    )
+
+    # Fatal errors
+    try:
+        es_query_conn.ensure_unique(wet_processes_index, "proc_id")
+        es_query_conn.ensure_unique(bi_processes_index, "proc_id")
+        check_for_undefined_file_indices(es_query_conn, analyses_index)
+        check_for_undefined_wet_processes(
+            es_query_conn, analyses_index, wet_processes_index
+        )
+        check_for_undefined_bi_processes(
+            es_query_conn, analyses_index, bi_processes_index
+        )
+    except DBIntegrityError as e:
+        raise SystemExit(e) from e
+
+    # Warnings
+    check_for_unused_wet_processes(
+        es_query_conn, analyses_index, wet_processes_index
+    )
+    check_for_unused_bi_processes(
+        es_query_conn, analyses_index, bi_processes_index
+    )
+    check_for_unused_file_indices(
+        es_query_conn, analyses_index, args.es_index_prefix
+    )
+
+
+if __name__ == "__main__":
+    main()
```
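cli_integrity.py also relies on two ElasticQueryConn helpers that this diff does not show, get_field_values and ensure_unique. The sketch below illustrates one way such helpers can be built on plain terms aggregations; it is an assumption for illustration, not the code shipped in genelastic/common/elastic.py, and every name in it other than DBIntegrityError is hypothetical:

```python
"""Illustrative sketch of field-value helpers (assumed, not genelastic's actual code)."""
from elasticsearch import Elasticsearch


class DBIntegrityError(Exception):
    """Raised when the database content is inconsistent (name reused from genelastic.common)."""


def get_field_values(client: Elasticsearch, index: str, field: str) -> set[str]:
    """Return the distinct values of a keyword field via a terms aggregation."""
    resp = client.search(
        index=index,
        size=0,
        aggs={"values": {"terms": {"field": f"{field}.keyword", "size": 10_000}}},
    )
    return {bucket["key"] for bucket in resp["aggregations"]["values"]["buckets"]}


def ensure_unique(client: Elasticsearch, index: str, field: str) -> None:
    """Raise DBIntegrityError if any value of `field` occurs in more than one document."""
    resp = client.search(
        index=index,
        size=0,
        aggs={
            "duplicates": {
                "terms": {"field": f"{field}.keyword", "min_doc_count": 2, "size": 10_000}
            }
        },
    )
    duplicated = [bucket["key"] for bucket in resp["aggregations"]["duplicates"]["buckets"]]
    if duplicated:
        msg = f"Duplicated '{field}' values in index '{index}': {', '.join(duplicated)}"
        raise DBIntegrityError(msg)
```

Read together with main() above, the integrity check then reduces to set arithmetic: values used in the analyses index but missing from the process indices raise DBIntegrityError and abort, while defined-but-unused entries only emit warnings.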