PyPI - genelastic - Versions diffs - 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

genelastic 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

genelastic/api/extends/example.py +2 -3
genelastic/api/routes.py +160 -23
genelastic/api/server.py +30 -22
genelastic/api/settings.py +3 -2
genelastic/common/__init__.py +36 -9
genelastic/common/cli.py +51 -23
genelastic/common/elastic.py +80 -49
genelastic/common/exceptions.py +0 -2
genelastic/common/types.py +20 -15
genelastic/import_data/__init__.py +23 -5
genelastic/import_data/analyses.py +17 -20
genelastic/import_data/analysis.py +69 -65
genelastic/import_data/bi_process.py +7 -5
genelastic/import_data/bi_processes.py +8 -8
genelastic/import_data/cli_gen_data.py +116 -0
genelastic/import_data/cli_import.py +379 -0
genelastic/import_data/{info.py → cli_info.py} +104 -75
genelastic/import_data/cli_integrity.py +384 -0
genelastic/import_data/cli_validate.py +54 -0
genelastic/import_data/constants.py +11 -32
genelastic/import_data/data_file.py +23 -20
genelastic/import_data/filename_pattern.py +26 -32
genelastic/import_data/import_bundle.py +56 -47
genelastic/import_data/import_bundle_factory.py +166 -158
genelastic/import_data/logger.py +22 -18
genelastic/import_data/random_bundle.py +402 -0
genelastic/import_data/tags.py +46 -26
genelastic/import_data/wet_process.py +8 -4
genelastic/import_data/wet_processes.py +13 -8
genelastic/ui/__init__.py +0 -0
genelastic/ui/server.py +87 -0
genelastic/ui/settings.py +11 -0
genelastic-0.7.0.dist-info/METADATA +105 -0
genelastic-0.7.0.dist-info/RECORD +40 -0
{genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
genelastic-0.7.0.dist-info/entry_points.txt +6 -0
genelastic/import_data/gen_data.py +0 -194
genelastic/import_data/import_data.py +0 -292
genelastic/import_data/integrity.py +0 -290
genelastic/import_data/validate_data.py +0 -43
genelastic-0.6.1.dist-info/METADATA +0 -41
genelastic-0.6.1.dist-info/RECORD +0 -36
genelastic-0.6.1.dist-info/entry_points.txt +0 -6
{genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0

genelastic/import_data/bi_process.py CHANGED Viewed

@@ -1,15 +1,17 @@
-# pylint: disable=missing-module-docstring
 import copy
-import typing
 from genelastic.common import BioInfoProcessData
 class BioInfoProcess:
     """Class representing a bio process."""
-    def __init__(self, proc_id: str,
-                 bundle_file: str | None = None,
-                 **data: str | typing.List[str]) -> None:
+    def __init__(
+        self,
+        proc_id: str,
+        bundle_file: str | None = None,
+        **data: str | list[str],
+    ) -> None:
         self._proc_id = proc_id
         self._bundle_file = bundle_file
         self._data: BioInfoProcessData = data

genelastic/import_data/bi_processes.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# pylint: disable=missing-module-docstring
 import logging
 import typing
@@ -6,14 +5,14 @@ from genelastic.common import BundleDict
 from .bi_process import BioInfoProcess
-logger = logging.getLogger('genelastic')
+logger = logging.getLogger("genelastic")
 class BioInfoProcesses:
     """Class BioInfoProcesses is a container of BioInfoProcess objects."""
     def __init__(self) -> None:
-        self._dict: typing.Dict[str, BioInfoProcess] = {}
+        self._dict: dict[str, BioInfoProcess] = {}
     def __len__(self) -> int:
         return len(self._dict)
@@ -27,20 +26,21 @@ class BioInfoProcesses:
         the program exits.
         """
         if process.id in self._dict:
-            raise ValueError(f"A bi process with the id '{process.id}' is already present.")
+            msg = f"A bi process with the id '{process.id}' is already present."
+            raise ValueError(msg)
         # Add one WetProcess object.
         self._dict[process.id] = process
-    def get_process_ids(self) -> typing.Set[str]:
+    def get_process_ids(self) -> set[str]:
         """Get a list of the bio processes IDs."""
         return set(self._dict.keys())
     @classmethod
-    def from_array_of_dicts(cls, arr: typing.Sequence[BundleDict]
-                            ) -> typing.Self:
+    def from_array_of_dicts(
+        cls, arr: typing.Sequence[BundleDict]
+    ) -> typing.Self:
         """Build a BioInfoProcesses instance."""
         bi_processes = cls()
         for d in arr:

genelastic/import_data/cli_gen_data.py ADDED Viewed

@@ -0,0 +1,116 @@
+import argparse
+import logging
+from pathlib import Path
+from genelastic.common import add_verbose_control_args
+from .logger import configure_logging
+from .random_bundle import (
+    RandomBundle,
+)
+logger = logging.getLogger("genelastic")
+def read_args() -> argparse.Namespace:
+    """Read arguments from command line."""
+    parser = argparse.ArgumentParser(
+        description="Genetics data random generator.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    add_verbose_control_args(parser)
+    parser.add_argument(
+        "-d",
+        "--data-folder",
+        dest="data_folder",
+        required=True,
+        help="Data destination folder.",
+        type=Path,
+    )
+    parser.add_argument(
+        "--log-file", dest="log_file", help="Path to a log file."
+    )
+    parser.add_argument(
+        "-n",
+        "--chrom-nb",
+        dest="chrom_nb",
+        type=int,
+        default=5,
+        help="Number of chromosomes to include in the generated VCF file.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-yaml-file",
+        dest="output_file",
+        default=None,
+        help="Output YAML file.",
+        type=Path,
+    )
+    parser.add_argument(
+        "-s",
+        "--sequence-size",
+        type=int,
+        default=2000,
+        help="Sequence size (number of nucleotides) generated for each chromosome.",
+    )
+    parser.add_argument(
+        "-c",
+        "--coverage",
+        action="store_true",
+        help="Generate a coverage file for each analysis.",
+    )
+    parser.add_argument(
+        "-a",
+        "--analyses",
+        help="Number of analyses to generate. "
+        "Each analysis is composed of a YAML bundle file declaring its wet lab and bioinformatics processes, "
+        "a VCF file and optionally a coverage file.",
+        default=1,
+        type=int,
+    )
+    parser.add_argument(
+        "-p",
+        "--processes",
+        help="Number of Wet Lab and Bioinformatics processes to generate.",
+        default=1,
+        type=int,
+    )
+    return parser.parse_args()
+def main() -> None:
+    """Entry point of the gen-data script."""
+    # Read command line arguments
+    args = read_args()
+    folder = args.data_folder.resolve()
+    if not folder.is_dir():
+        msg = f"ERROR: '{folder}' does not exist or is not a directory."
+        raise SystemExit(msg)
+    if args.analyses < 1:
+        msg = "Analyses count must be at least 1."
+        raise SystemExit(msg)
+    if args.processes < 1:
+        msg = "Processes count must be at least 1."
+        raise SystemExit(msg)
+    # Configure logging
+    configure_logging(args.verbose, log_file=args.log_file)
+    logger.debug("Arguments: %s", args)
+    # Write to stdout or file
+    RandomBundle(
+        folder,
+        args.analyses,
+        args.processes,
+        args.chrom_nb,
+        args.sequence_size,
+        do_gen_coverage=args.coverage,
+    ).to_yaml(args.output_file)
+if __name__ == "__main__":
+    main()

genelastic/import_data/cli_import.py ADDED Viewed

@@ -0,0 +1,379 @@
+# vi: se tw=80
+# Elasticsearch Python API:
+# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/overview.html
+# https://elasticsearch-py.readthedocs.io/en/latest/api.html
+import argparse
+import csv
+import datetime
+import hashlib
+import logging
+import sys
+import time
+from pathlib import Path
+import vcf
+from genelastic.common import (
+    AnalysisDocument,
+    BulkItems,
+    ElasticImportConn,
+    MetadataDocument,
+    ProcessDocument,
+    add_es_connection_args,
+    add_verbose_control_args,
+)
+from .bi_processes import BioInfoProcesses
+from .data_file import DataFile
+from .import_bundle_factory import make_import_bundle_from_files
+from .logger import configure_logging
+from .wet_processes import WetProcesses
+logger = logging.getLogger("genelastic")
+logging.getLogger("elastic_transport").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
+logging.getLogger("urllib3").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
+def read_args() -> argparse.Namespace:
+    """Read arguments from command line."""
+    parser = argparse.ArgumentParser(
+        description="Genetics data importer.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    add_verbose_control_args(parser)
+    add_es_connection_args(parser)
+    parser.add_argument(
+        "-D",
+        "--dry-run",
+        dest="dryrun",
+        action="count",
+        default=0,
+        help=(
+            "Dry-run level. -D for data files loading (VCF, coverage, etc) "
+            "without connecting or importing to database. "
+            "-DD for metadata YAML files loading only (no loading of data files)."
+        ),
+    )
+    parser.add_argument(
+        "--log-file", dest="log_file", help="Path to a log file."
+    )
+    parser.add_argument(
+        "--no-list",
+        dest="no_list",
+        action="store_true",
+        help="Do not print list of files to be imported.",
+    )
+    parser.add_argument(
+        "--no-confirm",
+        dest="no_confirm",
+        action="store_true",
+        help="Do not ask confirmation before importing.",
+    )
+    parser.add_argument(
+        "files",
+        type=Path,
+        nargs="+",
+        default=None,
+        help="Data files that describe what to import.",
+    )
+    return parser.parse_args()
+def import_cov_file(
+    es_import_conn: ElasticImportConn | None,
+    file_index: str,
+    file: Path,
+    dryrun: int = 0,
+) -> None:
+    """Import a coverage file to the Elasticsearch database."""
+    # Set field types
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.client.indices.put_mapping(
+            index=file_index,
+            body={
+                "properties": {
+                    "pos": {"type": "integer"},
+                    "depth": {"type": "byte"},
+                }
+            },
+        )
+    # Open file
+    if dryrun > 1:
+        logger.info(
+            "Would load and import Coverage file %s " "into index %s.",
+            file,
+            file_index,
+        )
+    else:
+        logger.info("Load Coverage file %s.", file)
+        if dryrun == 1:
+            logger.info(
+                "Would import Coverage file %s into index %s.", file, file_index
+            )
+        else:
+            logger.info(
+                "Import Coverage file %s into index %s.", file, file_index
+            )
+        with file.open(newline="", encoding="utf-8") as f:
+            # Read file as CSV
+            reader = csv.reader(f, delimiter="\t", quotechar='"')
+            # Loop on al lines
+            for row in reader:
+                # Build document
+                # Position starts at 0 inside coverage file
+                doc: MetadataDocument = {
+                    "type": "coverage",
+                    "chr": row[0],
+                    "pos": int(row[1]) + 1,
+                    "depth": int(row[2]),
+                }
+                # Insert document
+                if dryrun == 0 and es_import_conn:
+                    es_import_conn.client.index(index=file_index, document=doc)
+def import_analysis_metadata(  # noqa: PLR0913
+    es_import_conn: ElasticImportConn | None,
+    index_prefix: str,
+    file_index: str,
+    file: DataFile,
+    analysis_type: str,
+    dryrun: int = 0,
+) -> None:
+    """Import analysis metadata into a dedicated index."""
+    doc: AnalysisDocument = {
+        "path": str(file.path.resolve()),
+        "bundle_path": str(file.bundle_path.resolve())
+        if file.bundle_path
+        else None,
+        "metadata": file.metadata,
+        "file_index": file_index,
+        "type": analysis_type,
+    }
+    bulk_items: BulkItems = [
+        {"_index": f"{index_prefix}-analyses", "_source": doc}
+    ]
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.import_items(
+            bulk_items,
+            start_time=time.perf_counter(),
+            total_items=len(bulk_items),
+        )
+def import_vcf_file(
+    es_import_conn: ElasticImportConn | None,
+    file_index: str,
+    file: DataFile,
+    dryrun: int = 0,
+) -> None:
+    """Import a VCF file to the Elasticsearch database."""
+    logger.info('Import VCF file "%s".', file)
+    if dryrun > 1:
+        logger.info(
+            "Would load and import VCF file %s " "into index %s.",
+            file.path,
+            file_index,
+        )
+    else:
+        logger.info("Load VCF file %s.", file.path)
+        if dryrun == 1:
+            logger.info(
+                "Would import VCF file %s into index %s.", file.path, file_index
+            )
+        else:
+            logger.info(
+                "Importing VCF file %s into index %s...", file.path, file_index
+            )
+        try:
+            vcf_reader = vcf.Reader(filename=str(file.path))
+            n = 0
+            start = time.perf_counter()
+            bulk_sz = 256  # Bulk size
+            bulk_items: BulkItems = []
+            for record in vcf_reader:
+                # Correct values
+                if not record.CHROM.startswith("chr"):
+                    if record.CHROM.lower().startswith("chr"):
+                        record.CHROM = "chr" + record.CHROM[3:]
+                    else:
+                        record.CHROM = "chr" + record.CHROM
+                # Build document
+                alt = [x if x is None else x.type for x in record.ALT]
+                doc: MetadataDocument = {
+                    "type": "vcf",
+                    "chr": record.CHROM,
+                    "pos": record.POS,
+                    "alt": alt,
+                    "info": record.INFO,
+                }
+                if dryrun == 0:
+                    # Append item to bulk
+                    bulk_items.append({"_index": file_index, "_source": doc})
+                    n += 1
+                    # Insert bulk of items
+                    if len(bulk_items) >= bulk_sz and es_import_conn:
+                        es_import_conn.import_items(
+                            bulk_items, start_time=start, total_items=n
+                        )
+                        bulk_items = []
+            # Insert remaining items
+            if dryrun == 0 and es_import_conn:
+                es_import_conn.import_items(
+                    bulk_items, start_time=start, total_items=n
+                )
+        except StopIteration:
+            logger.error("Skipping empty file : %s.", file.path)
+def import_processes(
+    es_import_conn: ElasticImportConn | None,
+    index: str,
+    processes: WetProcesses | BioInfoProcesses,
+    dryrun: int = 0,
+) -> None:
+    """Import processes into their own index."""
+    bulk_items: BulkItems = []
+    for proc_id in processes.get_process_ids():
+        process = processes[proc_id]
+        process_type = process.__class__.__name__
+        doc: ProcessDocument = process.data | {
+            "proc_id": proc_id,
+            "type": process_type,
+        }
+        bulk_items.append({"_index": index, "_source": doc})
+    if dryrun == 0 and es_import_conn:
+        es_import_conn.import_items(
+            bulk_items,
+            start_time=time.perf_counter(),
+            total_items=len(bulk_items),
+        )
+def generate_unique_index(index_prefix: str, filepath: Path) -> str:
+    """Generate a unique index with the following format:
+    <index_prefix>_<current_date>_<md5_hashed_filepath>
+    """
+    current_date = datetime.datetime.now(tz=datetime.UTC).strftime("%Y%m%d")
+    hashed_filepath = hashlib.md5(
+        str(filepath).encode("utf-8"), usedforsecurity=False
+    ).hexdigest()
+    return f"{index_prefix}-file-{current_date}-{hashed_filepath}"
+def main() -> None:  # noqa: C901
+    """Entry point of the import script."""
+    # Read command line arguments
+    args = read_args()
+    # Configure logging
+    configure_logging(args.verbose, log_file=args.log_file)
+    logger.debug("Arguments: %s", args)
+    logger.debug("LOGGERS: %s", logging.root.manager.loggerDict)
+    # Open connection to ES
+    if args.dryrun == 0:
+        addr = f"https://{args.es_host}:{args.es_port}"
+        logger.info("Trying to connect to Elasticsearch at %s...", addr)
+        es_import_conn = ElasticImportConn(
+            addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
+        )
+    else:
+        es_import_conn = None
+    # Load YAML import bundle
+    import_bundle = make_import_bundle_from_files(args.files, check=True)
+    all_bundled_files = import_bundle.get_files()
+    # CHECK
+    for f in all_bundled_files:
+        if not f.exists():
+            msg = f"Path {f.path} does not point to a valid file."
+            raise RuntimeError(msg)
+    # LIST
+    if not args.no_list:
+        for f in all_bundled_files:
+            logger.info("Will import %s.", f.path)
+    # Ask confirmation for importing
+    if not args.no_confirm:
+        answer: str = "maybe"
+        while answer not in ["", "n", "y"]:
+            answer = input("Import (y/N)? ").lower()
+        if answer != "y":
+            logger.info("Import canceled.")
+            sys.exit(0)
+    # IMPORT
+    # Loop on file categories
+    for cat in import_bundle.analyses.get_all_categories():
+        # Import all files in this category.
+        for f in import_bundle.get_files(cat):
+            logger.info("Import %s files from %s.", cat, f.path)
+            # First, generate a unique index name for each file.
+            file_index = generate_unique_index(args.es_index_prefix, f.path)
+            # Then, import the analysis metadata into a dedicated index.
+            import_analysis_metadata(
+                es_import_conn,
+                args.es_index_prefix,
+                file_index,
+                f,
+                cat,
+                args.dryrun,
+            )
+            # Finally, import the file in its own index.
+            globals()[f"import_{cat}_file"](
+                es_import_conn=es_import_conn,
+                file_index=file_index,
+                file=f,
+                dryrun=args.dryrun,
+            )
+    # Import processes
+    logger.info("Importing wet processes.")
+    logger.info(
+        "Wet processes IDs = %s",
+        str(import_bundle.wet_processes.get_process_ids()),
+    )
+    import_processes(
+        es_import_conn,
+        f"{args.es_index_prefix}-wet_processes",
+        import_bundle.wet_processes,
+    )
+    logger.info("Importing bio info processes.")
+    logger.info(
+        "Bio info processes IDs = %s",
+        str(import_bundle.bi_processes.get_process_ids()),
+    )
+    import_processes(
+        es_import_conn,
+        f"{args.es_index_prefix}-bi_processes",
+        import_bundle.bi_processes,
+    )
+if __name__ == "__main__":
+    main()

genelastic 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

genelastic 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl