genelastic 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +18 -0
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -0
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +34 -26
- genelastic/api/settings.py +5 -9
- genelastic/api/specification.yml +512 -0
- genelastic/common/__init__.py +0 -39
- genelastic/common/cli.py +100 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +59 -0
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/cli/gen_data.py +143 -0
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +79 -54
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +20 -0
- genelastic/ui/routes.py +333 -0
- genelastic/ui/server.py +9 -82
- genelastic/ui/settings.py +2 -6
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +98 -0
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +116 -0
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +207 -0
- genelastic/ui/templates/layout.html +153 -0
- genelastic/ui/templates/version.html +21 -0
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +116 -0
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_gen_data.py +0 -116
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.7.0.dist-info/METADATA +0 -105
- genelastic-0.7.0.dist-info/RECORD +0 -40
- genelastic-0.7.0.dist-info/WHEEL +0 -5
- genelastic-0.7.0.dist-info/entry_points.txt +0 -6
- genelastic-0.7.0.dist-info/top_level.txt +0 -1
genelastic/import_data/cli/import_data.py
@@ -0,0 +1,346 @@
+# vi: se tw=80
+
+# Elasticsearch Python API:
+# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/overview.html
+# https://elasticsearch-py.readthedocs.io/en/latest/api.html
+
+import argparse
+import logging
+import sys
+from datetime import UTC, datetime
+from pathlib import Path
+
+from genelastic.common.cli import (
+    add_es_connection_args,
+    add_verbose_control_args,
+    add_version_arg,
+    log_item,
+    log_section,
+    log_subsection,
+    positive_int,
+)
+from genelastic.common.elastic import ElasticImportConn
+from genelastic.import_data.import_bundle_factory import (
+    make_import_bundle_from_files,
+)
+from genelastic.import_data.importers.importer_base import ImporterError
+from genelastic.import_data.importers.importer_factory import ImporterFactory
+from genelastic.import_data.logger import configure_logging
+from genelastic.import_data.models.analysis import Analysis
+from genelastic.import_data.models.data_file import DataFile
+from genelastic.import_data.models.processes import Processes
+
+logger = logging.getLogger("genelastic")
+logging.getLogger("elastic_transport").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
+logging.getLogger("urllib3").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
+
+
+def read_args() -> argparse.Namespace:
+    """Read arguments from command line."""
+    parser = argparse.ArgumentParser(
+        description="Genetics data importer.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    add_version_arg(parser)
+    add_verbose_control_args(parser)
+    add_es_connection_args(parser)
+    parser.add_argument(
+        "-D",
+        "--dry-run",
+        dest="dryrun",
+        action="count",
+        default=0,
+        help=(
+            "Dry-run level. -D for data files loading (VCF, coverage, etc) "
+            "without connecting or importing to database. "
+            "-DD for metadata YAML files loading only (no loading of data files)."
+        ),
+    )
+    parser.add_argument(
+        "--log-file", dest="log_file", help="Path to a log file."
+    )
+    parser.add_argument(
+        "--no-list",
+        dest="no_list",
+        action="store_true",
+        help="Do not print list of files to be imported.",
+    )
+    parser.add_argument(
+        "--no-confirm",
+        dest="no_confirm",
+        action="store_true",
+        help="Do not ask confirmation before importing.",
+    )
+    parser.add_argument(
+        "-t",
+        "--threads",
+        dest="thread_count",
+        type=positive_int,
+        default=4,
+        help="Number of threads to use for parallel data files import.",
+    )
+    parser.add_argument(
+        "--multi-match",
+        dest="multi_match",
+        action="store_true",
+        help=(
+            "Enable grouping of files from the same 'data_path' into multiple "
+            "analyses by extracting variable metadata fields directly from "
+            "filenames using the file prefix. If some metadata fields (e.g., "
+            "sample_name, wet_process, bi_process) are not defined in the YAML "
+            "bundle, the importer detects all analyses sharing the same "
+            "defined metadata, but differing by the undefined fields. This "
+            "allows importing and filtering several analyses at once from a "
+            "single directory, based on the metadata present in filenames. "
+            "When disabled (default), only files matching the fixed filename "
+            "pattern (where all metadata fields are defined in the YAML) are "
+            "grouped into a single analysis; other files are ignored."
+        ),
+    )
+    parser.add_argument(
+        "files",
+        type=Path,
+        nargs="+",
+        default=None,
+        help="Data files that describe what to import.",
+    )
+    return parser.parse_args()
+
+
+def import_analysis(
+    es_import_conn: ElasticImportConn,
+    analysis: Analysis,
+) -> None:
+    """Import analysis into a dedicated index."""
+    logger.info(
+        " -> Importing analysis '%s' metadata into index '%s'...",
+        analysis.id,
+        es_import_conn.analyses_index,
+    )
+
+    documents = [
+        {
+            "_index": es_import_conn.analyses_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": analysis.id,
+                "bundle_file": str(analysis.bundle_file),
+                "data_path": str(analysis.data_path),
+                "metadata": analysis.metadata,
+            },
+        }
+    ]
+
+    es_import_conn.bulk_import(documents)
+
+
+def import_data_file(
+    es_import_conn: ElasticImportConn,
+    data_file: DataFile,
+) -> None:
+    """Import data files into a dedicated index."""
+    logger.info(
+        " -> Importing metadata into index '%s'...",
+        es_import_conn.data_files_index,
+    )
+
+    documents = [
+        {
+            "_index": es_import_conn.data_files_index,
+            "_source": {
+                "created_at": datetime.now(UTC).isoformat(),
+                "analysis_id": data_file.analysis_id,
+                "path": str(data_file.path),
+                "bundle_file": str(data_file.bundle_file),
+                "metadata": data_file.metadata,
+                "metrics": data_file.metrics,
+            },
+        }
+    ]
+
+    es_import_conn.bulk_import(documents)
+
+
+def import_data_file_content(
+    es_import_conn: ElasticImportConn,
+    data_file: DataFile,
+    thread_count: int,
+    dry_run: int,
+) -> None:
+    """Import data file content into a dedicated index,
+    based on their extension and type.
+    """
+    # -DD: no file processing, no import.
+    if dry_run > 1:
+        logger.info("[Dryrun] Data file neither processed nor imported.")
+        return
+
+    try:
+        logger.info(
+            " -> Processing file content for import...",
+        )
+        importer = ImporterFactory.get_importer(
+            data_file, es_import_conn, thread_count
+        )
+
+        # -D: only process files, no import.
+        if dry_run == 1:
+            logger.info("[Dryrun] Data file processed but not imported.")
+            return
+
+        logger.info(
+            " -> Importing file content into index '%s'...",
+            importer.target_index,
+        )
+        importer.import_docs()
+    except ImporterError as e:
+        logger.error(e)
+
+
+def import_processes(
+    es_import_conn: ElasticImportConn,
+    index: str,
+    processes: Processes,
+) -> None:
+    """Import processes into a dedicated index, based on their type."""
+    documents = [
+        {
+            "_index": index,
+            "_source": {
+                "proc_id": process.id,
+                "type": process.type,
+                "metadata": process.data,
+            },
+        }
+        for process in processes.values()
+    ]
+
+    es_import_conn.bulk_import(documents)
+
+
+def main() -> None:
+    """Entry point of the import script."""
+    # Read command line arguments
+    args = read_args()
+
+    # Configure logging
+    configure_logging(args.verbose, log_file=args.log_file)
+    logger.debug("Arguments: %s", args)
+    logger.debug("LOGGERS: %s", logging.root.manager.loggerDict)
+
+    # Open connection to ES
+    addr = f"https://{args.es_host}:{args.es_port}"
+    logger.info("Connecting to Elasticsearch at %s...", addr)
+    es_import_conn = ElasticImportConn(
+        addr,
+        args.es_cert_fp,
+        args.es_index_prefix,
+        args.dryrun,
+        basic_auth=(args.es_usr, args.es_pwd),
+    )
+
+    log_section("LOAD DATA")
+    logger.info("")
+    import_bundle = make_import_bundle_from_files(
+        args.files, multi_match=args.multi_match, check=True
+    )
+    all_bundled_files = import_bundle.analyses.get_data_files()
+
+    if not all_bundled_files:
+        logger.warning("No matching data files found from import bundle(s) !")
+
+    log_section("IMPORT DATA")
+    # List files before importing.
+    if not args.no_list:
+        logger.info("")
+        logger.info(
+            "The following %s file(s) will be imported:", len(all_bundled_files)
+        )
+
+        for data_file in all_bundled_files:
+            logger.info("- '%s'", data_file.path)
+    else:
+        logger.debug(
+            "'--no-list' argument provided: "
+            "not listing files about to be imported."
+        )
+
+    # Ask confirmation for importing
+    if not args.no_confirm:
+        answer: str = "maybe"
+        while answer not in ["", "n", "y"]:
+            answer = input("Import (y/N)? ").lower()
+        if answer != "y":
+            logger.info("Import canceled.")
+            sys.exit(0)
+    else:
+        logger.debug(
+            "'--no-confirm' argument provided: "
+            "not asking for confirmation before importing files."
+        )
+
+    # Start import.
+    log_subsection("Importing wet processes...")
+    logger.info(
+        "-> Importing %s wet process(es) into index '%s': %s.",
+        len(import_bundle.wet_processes),
+        es_import_conn.wet_processes_index,
+        ", ".join(import_bundle.wet_processes.keys()),
+    )
+    import_processes(
+        es_import_conn,
+        es_import_conn.wet_processes_index,
+        import_bundle.wet_processes,
+    )
+    log_subsection("Importing bioinformatics processes...")
+    logger.info(
+        "-> Importing %s bioinformatics process(es) into index '%s': %s.",
+        len(import_bundle.bi_processes),
+        es_import_conn.bi_processes_index,
+        ", ".join(import_bundle.bi_processes.keys()),
+    )
+    import_processes(
+        es_import_conn,
+        es_import_conn.bi_processes_index,
+        import_bundle.bi_processes,
+    )
+
+    log_subsection("Importing analysis metadata...")
+    for i, analysis in enumerate(sorted(import_bundle.analyses)):
+        log_item(
+            "Analysis",
+            i + 1,
+            len(import_bundle.analyses),
+        )
+        import_analysis(es_import_conn, analysis)
+
+    log_subsection("Importing data files...")
+    counter = 1
+    for ext in sorted(import_bundle.analyses.extensions):
+        data_files = import_bundle.analyses.get_data_files(ext)
+        logger.info("[ %s data files ]", ext.upper())
+
+        for data_file in data_files:
+            logger.info(
+                " -> Processing data file #%s/%s: '%s'...",
+                counter,
+                len(import_bundle.analyses.get_data_files()),
+                data_file.path.name,
+            )
+            import_data_file(es_import_conn, data_file)
+            import_data_file_content(
+                es_import_conn, data_file, args.thread_count, args.dryrun
+            )
+            logger.info("")
+            counter += 1
+
+    logger.info("=> Done.")
+
+
+if __name__ == "__main__":
+    main()
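The `import_analysis`, `import_data_file`, and `import_processes` helpers above all hand documents shaped as `{"_index": ..., "_source": ...}` to `ElasticImportConn.bulk_import`, which lives in `genelastic/common/elastic.py` and is not shown in this excerpt. Below is a minimal sketch of what such a bulk wrapper could look like on top of elasticsearch-py's `helpers.bulk`; the function name and signature are assumptions for illustration, not the package's actual implementation.

```python
# Hypothetical sketch only: a thin wrapper around elasticsearch-py's helpers.bulk(),
# accepting documents already shaped as {"_index": ..., "_source": ...} like those
# built by import_analysis() and import_data_file() above. The real
# ElasticImportConn.bulk_import() in genelastic/common/elastic.py may differ.
import logging
from typing import Any

from elasticsearch import Elasticsearch, helpers

logger = logging.getLogger("genelastic")


def bulk_import(client: Elasticsearch, documents: list[dict[str, Any]]) -> int:
    """Index pre-shaped documents in a single bulk request; return the success count."""
    success_count, errors = helpers.bulk(client, documents, raise_on_error=False)
    for error in errors:
        # Each entry describes one document that failed to be indexed.
        logger.error("Bulk indexing error: %s", error)
    return success_count
```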
genelastic/import_data/cli/info.py
@@ -0,0 +1,247 @@
+import argparse
+import logging
+from datetime import datetime
+
+from genelastic.common.cli import (
+    add_es_connection_args,
+    add_verbose_control_args,
+    add_version_arg,
+)
+from genelastic.common.elastic import ElasticQueryConn
+from genelastic.import_data.logger import configure_logging
+
+logger = logging.getLogger("genelastic")
+logging.getLogger("elastic_transport").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
+
+
+def read_args() -> argparse.Namespace:
+    """Read arguments from the command line."""
+    parser = argparse.ArgumentParser(
+        description="ElasticSearch database info.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    add_version_arg(parser)
+    add_verbose_control_args(parser)
+    add_es_connection_args(parser)
+    parser.add_argument(
+        "-a",
+        "--list-analyses",
+        action="store_true",
+        help="List all imported analyses.",
+    )
+    parser.add_argument(
+        "-w",
+        "--list-wet-processes",
+        action="store_true",
+        help="List all imported wet processes.",
+    )
+    parser.add_argument(
+        "-b",
+        "--list-bi-processes",
+        action="store_true",
+        help="List all imported bio info processes.",
+    )
+    parser.add_argument(
+        "-B",
+        "--list-bundles",
+        action="store_true",
+        help="List YAML bundles and associated analyses.",
+    )
+
+    return parser.parse_args()
+
+
+def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
+    """List all processes."""
+    process_ids = es_query_conn.get_field_values(index, "proc_id")
+
+    if len(process_ids) == 0:
+        logger.info("Empty response.")
+        return
+
+    for process_id in process_ids:
+        logger.info("- %s", process_id)
+
+
+def list_wet_processes(es_query_conn: ElasticQueryConn) -> None:
+    """List all wet processes."""
+    logger.info("Imported wet processes")
+    logger.info("======================")
+    list_processes(es_query_conn, es_query_conn.wet_processes_index)
+
+
+def list_bi_processes(es_query_conn: ElasticQueryConn) -> None:
+    """List all bio info processes."""
+    logger.info("Imported bi processes")
+    logger.info("=====================")
+    list_processes(es_query_conn, es_query_conn.bi_processes_index)
+
+
+def list_analyses(es_query_conn: ElasticQueryConn) -> None:
+    """List all imported analyses and their associated data files."""
+    query = {
+        "size": 0,
+        "aggs": {
+            "by_analysis": {
+                "composite": {
+                    "size": 1000,
+                    "sources": [
+                        {
+                            "analysis_id": {
+                                "terms": {"field": "analysis_id.keyword"}
+                            }
+                        }
+                    ],
+                },
+                "aggs": {
+                    "data_files": {
+                        "top_hits": {
+                            "size": 100,
+                        }
+                    }
+                },
+            }
+        },
+    }
+
+    buckets = es_query_conn.run_composite_aggregation(
+        es_query_conn.data_files_index, query
+    )
+
+    if not buckets:
+        logger.info("No data files found.")
+        return
+
+    logger.info("Data files per YAML bundle")
+    logger.info("=" * 80)
+
+    for i, bucket in enumerate(buckets):
+        analysis_id = bucket["key"]["analysis_id"]
+        hits = bucket["data_files"]["hits"]["hits"]
+        doc_count = len(hits)
+
+        logger.info(
+            "[%d] Analysis ID: %s (%d file%s)",
+            i + 1,
+            analysis_id,
+            doc_count,
+            "s" if doc_count > 1 else "",
+        )
+        logger.info("-" * 80)
+
+        for j, hit in enumerate(hits):
+            source = hit["_source"]
+
+            created_at = datetime.fromisoformat(source["created_at"])
+            created_at_formatted = created_at.strftime("%Y-%m-%d")
+
+            logger.info(" File %d of %d:", j + 1, doc_count)
+            logger.info(" created_at : %s", created_at_formatted)
+            logger.info(" bundle_file : %s", source["bundle_file"])
+            logger.info(" path : %s", source["path"])
+
+
+def list_bundles(es_query_conn: ElasticQueryConn) -> None:
+    """List bundle_file → associated analysis_id (clean visual CLI output)."""
+    query = {
+        "size": 0,
+        "aggs": {
+            "by_bundle": {
+                "composite": {
+                    "size": 2000,
+                    "sources": [
+                        {
+                            "bundle_file": {
+                                "terms": {"field": "bundle_file.keyword"}
+                            }
+                        }
+                    ],
+                },
+                "aggs": {
+                    "analyses": {
+                        "terms": {
+                            "field": "analysis_id.keyword",
+                            "size": 2000,
+                        }
+                    }
+                },
+            }
+        },
+    }
+
+    buckets = es_query_conn.run_composite_aggregation(
+        es_query_conn.data_files_index, query
+    )
+
+    if not buckets:
+        logger.info("No bundles found.")
+        return
+
+    # Sort bundles by bundle_file path
+    buckets = sorted(buckets, key=lambda b: b["key"]["bundle_file"])
+
+    logger.info("========================================")
+    logger.info(" BUNDLES AND ASSOCIATED ANALYSES")
+    logger.info("========================================")
+    logger.info("")
+
+    for idx, bucket in enumerate(buckets, start=1):
+        bundle = bucket["key"]["bundle_file"]
+        analyses = bucket["analyses"]["buckets"]
+
+        logger.info("#%d %s", idx, bundle)
+        if not analyses:
+            logger.info(" (no analyses)")
+        else:
+            for a in analyses:
+                logger.info(" • %s", a["key"])
+
+        logger.info("----------------------------------------")
+
+
+def main() -> None:
+    """Entry point of the info script."""
+    args = read_args()
+
+    configure_logging(args.verbose)
+    logger.debug("Arguments: %s", args)
+
+    addr = f"https://{args.es_host}:{args.es_port}"
+    logger.info("Connecting to Elasticsearch at %s...", addr)
+    es_query_conn = ElasticQueryConn(
+        addr,
+        args.es_cert_fp,
+        args.es_index_prefix,
+        basic_auth=(args.es_usr, args.es_pwd),
+    )
+
+    list_call_count = 0
+
+    if args.list_bundles:
+        list_bundles(es_query_conn)
+        list_call_count += 1
+
+    if args.list_analyses:
+        list_analyses(es_query_conn)
+        list_call_count += 1
+
+    if args.list_wet_processes:
+        list_wet_processes(es_query_conn)
+        list_call_count += 1
+
+    if args.list_bi_processes:
+        list_bi_processes(es_query_conn)
+        list_call_count += 1
+
+    if list_call_count == 0:
+        logger.debug("No list option specified, listing everything.")
+        list_analyses(es_query_conn)
+        list_wet_processes(es_query_conn)
+        list_bi_processes(es_query_conn)
+
+
+if __name__ == "__main__":
+    main()
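`list_analyses` and `list_bundles` both delegate paging to `ElasticQueryConn.run_composite_aggregation`, also defined in `genelastic/common/elastic.py` and not part of this excerpt. As a rough sketch of how composite aggregations like the ones above are typically paged with `after_key` using a plain elasticsearch-py client; the function below is illustrative and may differ from the package's real helper.

```python
# Hypothetical sketch of composite-aggregation paging, matching the bucket shape
# consumed by list_analyses() and list_bundles() above. Paging continues while
# Elasticsearch returns an "after_key" for the single named aggregation.
from typing import Any

from elasticsearch import Elasticsearch


def run_composite_aggregation(
    client: Elasticsearch, index: str, query: dict[str, Any]
) -> list[dict[str, Any]]:
    """Collect all buckets of a composite aggregation across pages."""
    agg_name = next(iter(query["aggs"]))  # e.g. "by_analysis" or "by_bundle"
    buckets: list[dict[str, Any]] = []

    while True:
        response = client.search(index=index, body=query)
        agg = response["aggregations"][agg_name]
        buckets.extend(agg["buckets"])

        after_key = agg.get("after_key")
        if after_key is None:
            break  # Last page reached.
        # Resume the next page right after the last bucket returned.
        query["aggs"][agg_name]["composite"]["after"] = after_key

    return buckets
```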
genelastic/import_data/cli/integrity.py (renamed from genelastic/import_data/cli_integrity.py)
@@ -1,17 +1,20 @@
 import argparse
 import logging
+import typing
 
 from elasticsearch import NotFoundError
 
-from genelastic.common import (
-    Bucket,
-    DBIntegrityError,
-    ElasticQueryConn,
+from genelastic.common.cli import (
     add_es_connection_args,
     add_verbose_control_args,
+    add_version_arg,
 )
+from genelastic.common.elastic import ElasticQueryConn
+from genelastic.common.exceptions import DBIntegrityError
+from genelastic.import_data.logger import configure_logging
 
-
+if typing.TYPE_CHECKING:
+    from genelastic.common.types import Bucket
 
 logger = logging.getLogger("genelastic")
 logging.getLogger("elastic_transport").setLevel(
@@ -27,6 +30,7 @@ def read_args() -> argparse.Namespace:
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
         allow_abbrev=False,
     )
+    add_version_arg(parser)
     add_verbose_control_args(parser)
     add_es_connection_args(parser)
     return parser.parse_args()
@@ -42,6 +46,13 @@ def check_for_undefined_file_indices(
     :raises genelastic.common.DBIntegrityError:
         Some files indices are used in the analyses index but are undefined.
     """
+    if not es_query_conn.client:
+        logger.info(
+            "[Dryrun] check_for_undefined_file_indices: "
+            "no Elasticsearch client."
+        )
+        return
+
     logger.info(
         "Checking for references to undefined file indices in the index '%s'...",
         analyses_index,
@@ -217,6 +228,13 @@ def check_for_unused_file_indices(
     :returns: 1 if some file indices exists but are unused in the analyses index,
         and 0 otherwise.
     """
+    if not es_query_conn.client:
+        logger.info(
+            "[Dryrun] check_for_unused_file_indices: "
+            "no Elasticsearch client."
+        )
+        return -1
+
     json_indices = es_query_conn.client.cat.indices(
         index=f"{index_prefix}-file-*", format="json"
     ).body
@@ -349,9 +367,13 @@ def main() -> None:
     bi_processes_index = f"{args.es_index_prefix}-bi_processes"
 
     addr = f"https://{args.es_host}:{args.es_port}"
-    logger.info("
+    logger.info("Connecting to Elasticsearch at %s...", addr)
     es_query_conn = ElasticQueryConn(
-        addr,
+        addr,
+        args.es_cert_fp,
+        args.es_index_prefix,
+        args.dryrun,
+        basic_auth=(args.es_usr, args.es_pwd),
     )
 
     # Fatal errors