genelastic 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. genelastic/api/.env +4 -0
  2. genelastic/api/cli_start_api.py +18 -0
  3. genelastic/api/errors.py +52 -0
  4. genelastic/api/extends/example.py +0 -6
  5. genelastic/api/extends/example.yml +0 -0
  6. genelastic/api/routes.py +313 -181
  7. genelastic/api/server.py +34 -26
  8. genelastic/api/settings.py +5 -9
  9. genelastic/api/specification.yml +512 -0
  10. genelastic/common/__init__.py +0 -39
  11. genelastic/common/cli.py +100 -0
  12. genelastic/common/elastic.py +374 -46
  13. genelastic/common/exceptions.py +34 -2
  14. genelastic/common/server.py +59 -0
  15. genelastic/common/types.py +1 -14
  16. genelastic/import_data/__init__.py +0 -27
  17. genelastic/import_data/checker.py +99 -0
  18. genelastic/import_data/checker_observer.py +13 -0
  19. genelastic/import_data/cli/__init__.py +0 -0
  20. genelastic/import_data/cli/cli_check.py +136 -0
  21. genelastic/import_data/cli/gen_data.py +143 -0
  22. genelastic/import_data/cli/import_data.py +346 -0
  23. genelastic/import_data/cli/info.py +247 -0
  24. genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
  25. genelastic/import_data/cli/validate.py +146 -0
  26. genelastic/import_data/collect.py +185 -0
  27. genelastic/import_data/constants.py +136 -11
  28. genelastic/import_data/import_bundle.py +102 -59
  29. genelastic/import_data/import_bundle_factory.py +70 -149
  30. genelastic/import_data/importers/__init__.py +0 -0
  31. genelastic/import_data/importers/importer_base.py +131 -0
  32. genelastic/import_data/importers/importer_factory.py +85 -0
  33. genelastic/import_data/importers/importer_types.py +223 -0
  34. genelastic/import_data/logger.py +2 -1
  35. genelastic/import_data/models/__init__.py +0 -0
  36. genelastic/import_data/models/analyses.py +178 -0
  37. genelastic/import_data/models/analysis.py +144 -0
  38. genelastic/import_data/models/data_file.py +110 -0
  39. genelastic/import_data/models/process.py +45 -0
  40. genelastic/import_data/models/processes.py +84 -0
  41. genelastic/import_data/models/tags.py +170 -0
  42. genelastic/import_data/models/unique_list.py +109 -0
  43. genelastic/import_data/models/validate.py +26 -0
  44. genelastic/import_data/patterns.py +90 -0
  45. genelastic/import_data/random_bundle.py +79 -54
  46. genelastic/import_data/resolve.py +157 -0
  47. genelastic/ui/.env +1 -0
  48. genelastic/ui/cli_start_ui.py +20 -0
  49. genelastic/ui/routes.py +333 -0
  50. genelastic/ui/server.py +9 -82
  51. genelastic/ui/settings.py +2 -6
  52. genelastic/ui/static/cea-cnrgh.ico +0 -0
  53. genelastic/ui/static/cea.ico +0 -0
  54. genelastic/ui/static/layout.ico +0 -0
  55. genelastic/ui/static/novaseq6000.png +0 -0
  56. genelastic/ui/static/style.css +430 -0
  57. genelastic/ui/static/ui.js +458 -0
  58. genelastic/ui/templates/analyses.html +98 -0
  59. genelastic/ui/templates/analysis_detail.html +44 -0
  60. genelastic/ui/templates/bi_process_detail.html +129 -0
  61. genelastic/ui/templates/bi_processes.html +116 -0
  62. genelastic/ui/templates/explorer.html +356 -0
  63. genelastic/ui/templates/home.html +207 -0
  64. genelastic/ui/templates/layout.html +153 -0
  65. genelastic/ui/templates/version.html +21 -0
  66. genelastic/ui/templates/wet_process_detail.html +131 -0
  67. genelastic/ui/templates/wet_processes.html +116 -0
  68. genelastic-0.9.0.dist-info/METADATA +686 -0
  69. genelastic-0.9.0.dist-info/RECORD +76 -0
  70. genelastic-0.9.0.dist-info/WHEEL +4 -0
  71. genelastic-0.9.0.dist-info/entry_points.txt +10 -0
  72. genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
  73. genelastic/import_data/analyses.py +0 -69
  74. genelastic/import_data/analysis.py +0 -205
  75. genelastic/import_data/bi_process.py +0 -27
  76. genelastic/import_data/bi_processes.py +0 -49
  77. genelastic/import_data/cli_gen_data.py +0 -116
  78. genelastic/import_data/cli_import.py +0 -379
  79. genelastic/import_data/cli_info.py +0 -256
  80. genelastic/import_data/cli_validate.py +0 -54
  81. genelastic/import_data/data_file.py +0 -87
  82. genelastic/import_data/filename_pattern.py +0 -57
  83. genelastic/import_data/tags.py +0 -123
  84. genelastic/import_data/wet_process.py +0 -28
  85. genelastic/import_data/wet_processes.py +0 -53
  86. genelastic-0.7.0.dist-info/METADATA +0 -105
  87. genelastic-0.7.0.dist-info/RECORD +0 -40
  88. genelastic-0.7.0.dist-info/WHEEL +0 -5
  89. genelastic-0.7.0.dist-info/entry_points.txt +0 -6
  90. genelastic-0.7.0.dist-info/top_level.txt +0 -1
@@ -0,0 +1,146 @@
1
+ import argparse
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ from genelastic.common.cli import add_verbose_control_args, add_version_arg
6
+ from genelastic.common.exceptions import (
7
+ ValidationError,
8
+ YAMLFileReadError,
9
+ )
10
+ from genelastic.import_data.import_bundle_factory import (
11
+ load_yaml_file,
12
+ validate_doc,
13
+ )
14
+ from genelastic.import_data.logger import configure_logging
15
+ from genelastic.import_data.models.validate import ValidationIssue
16
+
17
+ logger = logging.getLogger("genelastic")
18
+
19
+
20
def read_args() -> argparse.Namespace:
    """Read and parse command-line arguments.

    :return: Parsed arguments carrying ``files`` (list of ``Path``),
        ``fail_fast`` (bool), plus whatever options
        ``add_version_arg`` / ``add_verbose_control_args`` register.
    """
    parser = argparse.ArgumentParser(
        description="Statically validates YAML bundles: "
        "ensure they comply to the bundle schema.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        allow_abbrev=False,
    )
    add_version_arg(parser)
    add_verbose_control_args(parser)
    parser.add_argument(
        "files",
        type=Path,
        nargs="+",
        # No explicit `default`: a positional with nargs="+" is always
        # required, so a default value could never be used (argparse
        # falls back to None on its own anyway).
        help="Paths to YAML files containing bundles to validate.",
    )
    parser.add_argument(
        "-x",
        "--fail-fast",
        dest="fail_fast",
        action="store_true",
        help="Stop validating files after the first error is encountered.",
    )
    return parser.parse_args()
45
+
46
+
47
def main() -> int:
    """Entry point of the validate script.

    Validates every YAML file given on the command line against the bundle
    schema, logging progress per file and per document.

    :return: 0 when every bundle is valid, 1 otherwise.
    :raises SystemExit: With code 1 on the first error when ``--fail-fast``
        is set.
    """
    args = read_args()
    configure_logging(args.verbose)

    issues: list[ValidationIssue] = []
    total = len(args.files)

    for file_no, raw_path in enumerate(args.files, start=1):
        path = raw_path.resolve()

        logger.info(
            "[%s/%s] Validating bundle(s) from file '%s'.",
            file_no,
            total,
            path,
        )
        logger.info("Loading YAML file...")

        try:
            docs = load_yaml_file(path)
        except YAMLFileReadError as err:
            logger.error(err)

            if args.fail_fast:
                raise SystemExit(1) from None

            issues.append(
                ValidationIssue(
                    exc_type=type(err).__name__,
                    file_path=path,
                    file_index=file_no,
                    file_count=total,
                )
            )
            continue

        logger.info("-> YAML file successfully loaded.")

        doc_total = len(docs)
        logger.info("Found %s document(s) in the YAML file.", doc_total)

        for doc_no, doc in enumerate(docs, start=1):
            logger.info(
                " Validating bundle format for document #%s/%s...",
                doc_no,
                doc_total,
            )

            try:
                validate_doc(doc)
            except ValidationError as err:
                logger.error(err)

                if args.fail_fast:
                    raise SystemExit(1) from None

                issues.append(
                    ValidationIssue(
                        exc_type=type(err).__name__,
                        file_path=path,
                        file_index=file_no,
                        file_count=total,
                        doc_index=doc_no,
                        doc_count=doc_total,
                    )
                )
                continue

            logger.info(" -> Bundle format is valid.")

        logger.info("")

    if issues:
        logger.error("Some files raised exceptions:")
        for issue in issues:
            logger.error(" - %s", issue)
        ret_code = 1
    else:
        logger.info("All bundles respect the genelastic YAML bundle format.")
        ret_code = 0

    # A file may contribute several issues (one per document); count each
    # failing file only once via the set of resolved paths.
    failed = len({issue.file_path for issue in issues})
    passed = total - failed

    logger.info(
        "Out of %s file(s), validation passed for %s and failed for %s.",
        total,
        passed,
        failed,
    )

    return ret_code
143
+
144
+
145
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
@@ -0,0 +1,185 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+
5
+ from genelastic.common.exceptions import DataFileCollectorError
6
+ from genelastic.common.types import Metadata
7
+ from genelastic.import_data.models.data_file import DataFile
8
+ from genelastic.import_data.models.tags import Tags
9
+ from genelastic.import_data.patterns import FilenamePattern
10
+ from genelastic.import_data.resolve import resolve_analysis_id
11
+
12
+ logger = logging.getLogger("genelastic")
13
+
14
+
15
def collect_files(data_path: Path) -> set[Path]:
    """Collect files for a given analysis.
    All files directly under ``data_path`` are returned.

    :param data_path: Directory containing the files.
    :raises DataFileCollectorError: If ``data_path`` is not an existing
        directory.
    :return: Set of absolute paths to collected files.
    """
    # The comprehension must stay inside the try: Path.iterdir() is lazy,
    # so the OSError is only raised once iteration actually starts.
    try:
        collected = {entry for entry in data_path.iterdir() if entry.is_file()}
    except OSError as e:
        msg = f"Error collecting files: data directory is invalid. {e}."
        raise DataFileCollectorError(msg) from e
    return collected
30
+
31
+
32
def match_files(
    files: set[Path],
    filename_pattern: FilenamePattern,
) -> tuple[set[Path], set[Path]]:
    """Split a set of files into those matching a filename pattern and the
    rest.

    Each file name in ``files`` is tested against ``filename_pattern``; two
    sets are returned, one per outcome.

    :param files: A set of file paths to check.
    :param filename_pattern: The filename pattern used for matching.

    :returns: A tuple whose first element is the set of matching files and
        whose second element is the set of non-matching files.
    """
    hits: set[Path] = set()
    for candidate in files:
        if filename_pattern.matches_pattern(candidate.name):
            hits.add(candidate)
    return hits, files - hits
54
+
55
+
56
def extract_analysis_metadata(
    data_path: Path,
    file_prefix: str,
    tags: Tags,
    filename_pattern: FilenamePattern,
) -> dict[str, Metadata]:
    """Map analysis IDs to the metadata extracted from matching filenames.

    Every file directly under ``data_path`` whose name matches
    ``filename_pattern`` contributes one entry: the key is the analysis ID
    resolved from ``file_prefix``, ``tags`` and the filename metadata, and
    the value is that metadata.

    :param data_path: Directory containing the files.
    :param file_prefix: Prefix passed to ``resolve_analysis_id``.
    :param tags: Tags passed to ``resolve_analysis_id``.
    :param filename_pattern: Pattern used to match names and extract metadata.
    :raises DataFileCollectorError: If ``data_path`` is not an existing
        directory.
    :return: Mapping of analysis ID to extracted filename metadata.
    """
    analyses: dict[str, Metadata] = {}

    for candidate in collect_files(data_path):
        if not filename_pattern.matches_pattern(candidate.name):
            logger.debug("File '%s' was not matched.", candidate.name)
            continue

        meta = filename_pattern.extract_metadata(candidate.name)
        # NOTE(review): if two files resolve to the same analysis ID, the
        # later entry silently overwrites the earlier one — confirm intended.
        analyses[resolve_analysis_id(file_prefix, tags, meta)] = meta

    return analyses
74
+
75
+
76
def init_data_files(
    analysis_id: str,
    files: set[Path],
    filename_pattern: FilenamePattern,
    bundle_file: Path,
) -> set[DataFile]:
    """Instantiate ``DataFile`` objects from a set of file paths associated
    with an analysis.

    :param analysis_id: ID of the analysis, shared by all created ``DataFile``
        instances.
    :param files: Set of file paths associated with the analysis.
    :param filename_pattern: Pattern used to extract metadata from filenames.
        The extracted metadata is included in each ``DataFile``.
    :param bundle_file: Path to the YAML bundle file from which the analysis is
        defined.
    :raises DataFileCollectorError: If metadata extraction or instantiation
        of a data file object fails for a given file.
    :return: A set of successfully instantiated ``DataFile`` objects.
    """
    data_files = set()
    for file in files:
        try:
            metadata = filename_pattern.extract_metadata(file.name)
            data_file = DataFile(
                analysis_id=analysis_id,
                path=file,
                bundle_file=bundle_file,
                metadata=metadata,
            )
            data_files.add(data_file)
        except RuntimeError as e:
            msg = f"Error instantiating data files: {e}"
            # Chain the original exception (`from e` instead of `from None`)
            # so its traceback is preserved: the message above only carries
            # str(e), which loses the failure location.
            raise DataFileCollectorError(msg) from e
    return data_files
111
+
112
+
113
@dataclass(frozen=True)
class DataFileCollectorResult:
    """Result of a data file collection."""

    # Files whose names matched the analysis filename pattern.
    matched_files: set[Path]
    # Files found in the data directory that did not match the pattern.
    unmatched_files: set[Path]
    # ``DataFile`` objects built from the matched files.
    data_files: set[DataFile]
120
+
121
+
122
class DataFileCollector:
    """Collect all data files belonging to an analysis."""

    def __init__(
        self,
        analysis_id: str,
        bundle_file: Path,
        data_path: Path,
        filename_pattern: FilenamePattern,
        *,
        multi_match: bool = False,
    ) -> None:
        self._analysis_id = analysis_id
        self._bundle_file = bundle_file
        self._data_path = data_path
        self._filename_pattern = filename_pattern
        # NOTE(review): _multi_match is stored but never read by run();
        # presumably consumed elsewhere — confirm before removing.
        self._multi_match = multi_match

    def run(self) -> DataFileCollectorResult:
        """Collect files from the analysis data path, match them against the
        analysis filename pattern, and build one ``DataFile`` per matched
        file.

        :raises DataFileCollectorError: If the ``data_path`` is not an
            existing directory, or if metadata extraction or instantiation
            of a data file object fails for a given file.
        :return: A ``DataFileCollectorResult`` holding the matched and
            unmatched file sets plus the instantiated ``DataFile`` objects.
        """
        collected = collect_files(self._data_path)
        logger.debug(
            " -> Collected %s file(s):",
            len(collected),
        )
        for entry in sorted(collected):
            logger.debug(" - '%s'", entry.name)

        matched, unmatched = match_files(collected, self._filename_pattern)

        logger.info(" -> Found %s matching file(s):", len(matched))
        for entry in sorted(matched):
            logger.info(" - '%s'", entry.name)

        logger.info(
            " -> Found %s non-matching file(s):",
            len(unmatched),
        )
        for entry in sorted(unmatched):
            logger.info(" - '%s'", entry.name)

        built = init_data_files(
            self._analysis_id,
            matched,
            self._filename_pattern,
            self._bundle_file,
        )

        return DataFileCollectorResult(
            matched_files=matched,
            unmatched_files=unmatched,
            data_files=built,
        )
@@ -5,20 +5,145 @@ This module contains genelastic constants.
5
5
 
6
6
  import typing
7
7
 
8
- ALLOWED_CATEGORIES: typing.Final[list[str]] = ["vcf", "cov"]
8
+ import schema
9
+
10
ALLOWED_EXTENSIONS: typing.Final[list[str]] = ["vcf", "cov", "json", "yml", "yaml"]

BUNDLE_CURRENT_VERSION = 3

DEFAULT_TAG_REGEX = "[^_]+"
DEFAULT_TAG_DELIMITER_START = "%"
DEFAULT_TAG_DELIMITER_END = ""

# Default mapping from single-letter tag to the metadata field it captures;
# every tag shares the default capture regex.
DEFAULT_TAG2FIELD: typing.Final[dict[str, dict[str, str]]] = {
    tag: {"field": field, "regex": DEFAULT_TAG_REGEX}
    for tag, field in [
        ("S", "sample_name"),
        ("F", "source"),
        ("W", "wet_process"),
        ("B", "bi_process"),
        ("D", "cov_depth"),
        ("A", "barcode"),
        ("R", "reference_genome"),
    ]
}
33
+
34
TOOLS_SUFFIX_RE = r"_(?P<tool>[a-zA-Z0-9]+)-(?P<version>\d+(?:-\d+){0,2})(?!-)"
"""
Regular expression to extract individual tool-version metadata pairs from a
validated ``.metrics`` suffix in filenames.

- Captures exactly one tool-version pair, where:

  - ``tool`` is an alphanumeric identifier (letters and digits),
  - ``version`` consists of 1 to 3 numeric components separated by hyphens
    (e.g., '1', '1-0', '1-0-0'),

- Uses named capture groups (``tool`` and ``version``) to extract data,
- The negative lookahead ``(?!-)`` ensures the version does not end with a
  hyphen,
- Intended for extracting all matching pairs after the ``.metrics`` prefix has
  been validated.
"""

_METRICS_SUFFIX_RE = r"(?:\.metrics(?:_[a-zA-Z0-9]+-\d+(?:-\d+){0,2}(?!-))*)?"
"""
Regular expression to match and validate the entire optional ``.metrics``
suffix in filenames.

- Matches zero or one occurrence of:

  - A literal ``.metrics`` prefix, which must be the first suffix in the
    filename,
  - Followed optionally by zero or more tool-version pairs, each starting with
    an underscore ``_`` and matching the same format as ``TOOLS_SUFFIX_RE``,

- Validates that the whole suffix structure is correct (including optional
  presence),
- Ensures that when present, the suffix starts with ``.metrics`` and is
  correctly formatted,
- Does not extract individual tool-version pairs; its role is to validate the
  suffix as a whole.
"""

_EXTENSIONS_SUFFIX_RE = rf"\.(?P<ext>{'|'.join(ALLOWED_EXTENSIONS)})(\.gz)?"
"""
Regular expression for matching allowed file extensions with optional gzip
compression.

This regex matches the file extension suffixes for files belonging to
a set of predefined allowed extensions, specified in the ``ALLOWED_EXTENSIONS``
list (the alternation is interpolated at import time, so changes to that list
change this pattern).

The pattern matches:

- a dot (``.``) followed by one of the allowed extensions,
- optionally, a second extension ``.gz`` indicating gzip compression.

Examples of matched suffixes: ``.vcf``, ``.cov``, ``.json``, ``.vcf.gz``,
``.json.gz``.
"""

FILE_SUFFIXES_RE = rf"{_METRICS_SUFFIX_RE}{_EXTENSIONS_SUFFIX_RE}"
"""Regex used to validate the suffix part of a filename.

It matches an optional metrics suffix (containing tool-version metadata),
immediately followed by a required allowed file extension suffix
(possibly compressed with .gz).

This regex is the combination of ``_METRICS_SUFFIX_RE`` and
``_EXTENSIONS_SUFFIX_RE``.
"""
98
+
99
# Schema of a QC metrics document: scalar quality statistics plus a mapping
# of coverage-fold thresholds to percentages.
QC_METRICS_SCHEMA = schema.Schema(
    {
        "id": str,
        "genome_coverage_size": float,
        "genome_coverage_percent": float,
        "n50": int,
        "larger_contig": int,
        "iqr": int,
        "outlier_percent": float,
        "mean_depth": float,
        # NOTE(review): "duplicat" is how the key is spelled in the data
        # format — do not "fix" it here without migrating the producers.
        "mean_duplicat_percent": float,
        # Every fold threshold maps to a float percentage.
        "fold_regions_percents": {
            depth: float for depth in ("5", "10", "20", "30", "40")
        },
    }
)
119
+
120
+
121
# One metadata entry: arbitrary string keys mapped to scalar values.
_SV_METADATA_ENTRY = {str: schema.Or(str, int, float, bool)}

# One benchmarking result for a (svtype, size) pair within a region.
_SV_RESULT_ENTRY = {
    "svtype": str,
    "size": str,
    "FP_query": int,
    "TP_truth": int,
    "TP_query": int,
    "FN_truth": int,
    "total_truth": int,
    "total_query": int,
    "precision": schema.Or(int, float),
    "recall": schema.Or(int, float),
    "f1": schema.Or(int, float),
}

# Schema of an SV metrics document: mandatory (and optional) metadata entry
# lists, plus per-region benchmarking results.
SV_METRICS_SCHEMA = schema.Schema(
    {
        "metadata_mandatory": [_SV_METADATA_ENTRY],
        schema.Optional("metadata_optional"): [_SV_METADATA_ENTRY],
        "regions": [
            {
                "name": str,
                "bed": str,
                "results": [_SV_RESULT_ENTRY],
            }
        ],
    }
)