PyPI - genelastic - Versions diffs - 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

genelastic 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

genelastic/api/cli_start_api.py +18 -0
genelastic/api/extends/example.py +2 -3
genelastic/api/extends/example.yml +20 -0
genelastic/api/routes.py +160 -23
genelastic/api/server.py +42 -31
genelastic/api/settings.py +5 -8
genelastic/api/specification.yml +350 -0
genelastic/common/__init__.py +41 -9
genelastic/common/cli.py +103 -23
genelastic/common/elastic.py +80 -49
genelastic/common/exceptions.py +0 -2
genelastic/common/server.py +51 -0
genelastic/common/types.py +20 -15
genelastic/import_data/__init__.py +23 -5
genelastic/import_data/analyses.py +17 -20
genelastic/import_data/analysis.py +69 -65
genelastic/import_data/bi_process.py +7 -5
genelastic/import_data/bi_processes.py +8 -8
genelastic/import_data/cli_gen_data.py +143 -0
genelastic/import_data/cli_import.py +379 -0
genelastic/import_data/{info.py → cli_info.py} +104 -75
genelastic/import_data/cli_integrity.py +384 -0
genelastic/import_data/cli_validate.py +54 -0
genelastic/import_data/constants.py +11 -32
genelastic/import_data/data_file.py +23 -20
genelastic/import_data/filename_pattern.py +26 -32
genelastic/import_data/import_bundle.py +56 -47
genelastic/import_data/import_bundle_factory.py +166 -158
genelastic/import_data/logger.py +22 -18
genelastic/import_data/random_bundle.py +425 -0
genelastic/import_data/tags.py +46 -26
genelastic/import_data/wet_process.py +8 -4
genelastic/import_data/wet_processes.py +13 -8
genelastic/ui/__init__.py +0 -0
genelastic/ui/cli_start_ui.py +18 -0
genelastic/ui/routes.py +86 -0
genelastic/ui/server.py +14 -0
genelastic/ui/settings.py +7 -0
genelastic/ui/templates/analyses.html +11 -0
genelastic/ui/templates/bi_processes.html +11 -0
genelastic/ui/templates/home.html +4 -0
genelastic/ui/templates/layout.html +34 -0
genelastic/ui/templates/version.html +9 -0
genelastic/ui/templates/wet_processes.html +11 -0
genelastic-0.8.0.dist-info/METADATA +109 -0
genelastic-0.8.0.dist-info/RECORD +52 -0
{genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/WHEEL +1 -1
genelastic-0.8.0.dist-info/entry_points.txt +8 -0
genelastic/import_data/gen_data.py +0 -194
genelastic/import_data/import_data.py +0 -292
genelastic/import_data/integrity.py +0 -290
genelastic/import_data/validate_data.py +0 -43
genelastic-0.6.1.dist-info/METADATA +0 -41
genelastic-0.6.1.dist-info/RECORD +0 -36
genelastic-0.6.1.dist-info/entry_points.txt +0 -6
{genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/top_level.txt +0 -0

genelastic/import_data/analysis.py CHANGED Viewed

@@ -1,8 +1,5 @@
-# pylint: disable=missing-module-docstring
 import copy
-import glob
 import logging
-import os
 import re
 import typing
 from pathlib import Path
@@ -14,28 +11,31 @@ from .data_file import DataFile
 from .filename_pattern import FilenamePattern
 from .tags import Tags
-logger = logging.getLogger('genelastic')
+logger = logging.getLogger("genelastic")
 class Analysis:
     """Class Analysis that represents an analysis."""
-    # pylint: disable-next=too-many-arguments, too-many-positional-arguments
-    def __init__(self,
-                 tags: Tags,
-                 root_dir: str = '.',
-                 bundle_file: str | None = None,
-                 file_prefix: str | None = None,
-                 files: typing.Sequence[str] | None = None,
-                 data_path: str | None = None,
-                 **metadata: str | int) -> None:
-        self._bundle_file = bundle_file
+    def __init__(  # noqa: PLR0913
+        self,
+        tags: Tags,
+        root_dir: str = ".",
+        bundle_file: str | None = None,
+        file_prefix: str | None = None,
+        files: typing.Sequence[str] | None = None,
+        data_path: str | None = None,
+        **metadata: str | int,
+    ) -> None:
+        self._bundle_file = Path(bundle_file) if bundle_file else None
         self._file_prefix = file_prefix
         self._files = files
-        self._data_path = Analysis._resolve_data_path(root_dir, data_path)
+        self._data_path = Analysis._resolve_data_path(
+            Path(root_dir), Path(data_path) if data_path else None
+        )
         self._tags = tags
         self._metadata: AnalysisMetaData = metadata
-        self._categories: typing.Set[str] = set()
+        self._categories: set[str] = set()
     @property
     def metadata(self) -> AnalysisMetaData:
@@ -43,17 +43,15 @@ class Analysis:
         return copy.deepcopy(self._metadata)
     @property
-    def bundle_file(self) -> str | None:
+    def bundle_file(self) -> Path | None:
         """Get the bundle file."""
         return self._bundle_file
     @property
     def filename_regex(self) -> str:
-        """
-        Resolve placeholders in a file prefix using metadata
+        """Resolve placeholders in a file prefix using metadata
         and unresolved placeholders are converted to regex groups
         """
         x: str = r"^.+\.(?P<ext>vcf|cov)(\.gz)?$"
         # Use existing generic prefix
@@ -65,84 +63,87 @@ class Analysis:
                 regex = tag_attrs["regex"]
                 # Build field regex
-                field_regex = (f"(?P<{field}>{self._metadata.get(field)})"
-                               if field in self._metadata else
-                               f"(?P<{field}>{regex})")
+                field_regex = (
+                    f"(?P<{field}>{self._metadata.get(field)})"
+                    if field in self._metadata
+                    else f"(?P<{field}>{regex})"
+                )
                 # Replace tag with field regex
                 x = x.replace(tag_name, field_regex)
             # Check for tags that were not replaced.
             groups = re.findall(self._tags.search_regex, x)
             for match in groups:
-                logger.warning("String '%s' in key 'file_prefix' looks like an undefined tag. "
-                               "If this string is not a tag, you can ignore this warning.",
-                               match)
+                logger.warning(
+                    "String '%s' in key 'file_prefix' looks like an undefined tag. "
+                    "If this string is not a tag, you can ignore this warning.",
+                    match,
+                )
             # Add missing start and end markers
             if not x.startswith("^"):
                 x = "^" + x
             if not x.endswith("$"):
-                x += (r"\.(?P<ext>" + '|'.join(ALLOWED_CATEGORIES)
-                      + r")(\.gz)?$")
+                x += r"\.(?P<ext>" + "|".join(ALLOWED_CATEGORIES) + r")(\.gz)?$"
             logger.debug("File regex for %s: %s", self._bundle_file, x)
         return x
     def get_nb_files(self, cat: str | None = None) -> int:
-        """Returns the total number of files.
-        """
+        """Returns the total number of files."""
         return len(self.get_file_paths(cat=cat))
-    def get_data_files(self, cat: str | None = None) -> typing.List[DataFile]:
-        """Returns the list of matched files as DataFile objects.
-        """
+    def get_data_files(self, cat: str | None = None) -> list[DataFile]:
+        """Returns the list of matched files as DataFile objects."""
         files = self.get_file_paths(cat=cat)
         filename_pattern = FilenamePattern(self.filename_regex)
-        data_files: typing.List[DataFile] = []
+        data_files: list[DataFile] = []
         for f in files:
             try:
-                data_files.append(DataFile.make_from_bundle(
-                    path=f, bundle_path=self._bundle_file,
-                    pattern=filename_pattern))
-            except (IOError, ValueError) as e:
+                data_files.append(
+                    DataFile.make_from_bundle(
+                        path=f,
+                        bundle_path=self._bundle_file,
+                        pattern=filename_pattern,
+                    )
+                )
+            except (OSError, ValueError) as e:
                 logger.error("Error processing file %s: %s", f, str(e))
         return data_files
-    def get_file_paths(self, cat: str | None = None) -> typing.Sequence[str]:
-        """Returns the list of matched files.
-        """
+    def get_file_paths(self, cat: str | None = None) -> typing.Sequence[Path]:
+        """Returns the list of matched files."""
         files, _, _ = self._do_get_file_paths(cat=cat)
         return files
-    def get_unmatched_file_paths(self, cat: str | None = None
-                                 ) -> typing.Sequence[str]:
-        """Returns the list of unmatched files.
-        """
+    def get_unmatched_file_paths(
+        self, cat: str | None = None
+    ) -> typing.Sequence[Path]:
+        """Returns the list of unmatched files."""
         _, files, _ = self._do_get_file_paths(cat=cat)
         return files
-    def get_all_categories(self) -> typing.Set[str]:
+    def get_all_categories(self) -> set[str]:
         """Returns all categories of the analysis."""
         _, _, categories = self._do_get_file_paths()
         return categories
     @staticmethod
-    def _resolve_data_path(root_dir: str, data_path: str | None) -> str:
-        resolved_data_path = '' if data_path is None else data_path
+    def _resolve_data_path(root_dir: Path, data_path: Path | None) -> Path:
+        resolved_data_path = Path() if data_path is None else data_path
-        if not os.path.isabs(resolved_data_path):
-            resolved_data_path = os.path.abspath(os.path.join(root_dir, resolved_data_path))
+        if not resolved_data_path.is_absolute():
+            resolved_data_path = (root_dir / resolved_data_path).absolute()
         return resolved_data_path
-    def _get_files_with_allowed_categories(self) -> typing.Dict[str, str]:
+    def _get_files_with_allowed_categories(self) -> dict[Path, str]:
         # Create a dict to store allowed files. Keys are the filepaths,
         # and values are their corresponding category.
-        allowed_files: typing.Dict[str, str] = {}
+        allowed_files: dict[Path, str] = {}
         # If files are listed explicitly in the YAML in the 'files' attribute, process them.
         if self._files is not None:
             abs_filepaths = [Path(self._data_path) / f for f in self._files]
@@ -151,14 +152,14 @@ class Analysis:
                 cat = file.suffixes[0][1:]
                 # Add each matching file and its category to the dict.
                 if cat in ALLOWED_CATEGORIES:
-                    allowed_files[str(file)] = cat
+                    allowed_files[file] = cat
         # Else, look for files on disk using the YAML 'data_path' attribute.
         else:
             # Try to retrieve files matching allowed categories using glob.
             for cat in ALLOWED_CATEGORIES:
-                glob_res = []
-                glob_res.extend(glob.glob(os.path.join(self._data_path, f"*.{cat}")))
-                glob_res.extend(glob.glob(os.path.join(self._data_path, f"*.{cat}.gz")))
+                glob_res: list[Path] = []
+                glob_res.extend(self._data_path.glob(f"*.{cat}"))
+                glob_res.extend(self._data_path.glob(f"*.{cat}.gz"))
                 # Add each globed file and its category to the dict.
                 for g_file in glob_res:
@@ -166,12 +167,13 @@ class Analysis:
         return allowed_files
-    def _do_get_file_paths(self, cat: str | None = None) \
-            -> tuple[typing.Sequence[str], typing.Sequence[str], typing.Set[str]]:
+    def _do_get_file_paths(
+        self, cat: str | None = None
+    ) -> tuple[typing.Sequence[Path], typing.Sequence[Path], set[str]]:
         # Raise an error if the category given as a parameter is not part of the allowed categories.
         if cat is not None and cat not in ALLOWED_CATEGORIES:
-            raise ValueError(f"Unknown category {cat}.")
+            msg = f"Unknown category {cat}."
+            raise ValueError(msg)
         # Obtain a dict of all files matching the allowed categories.
         allowed_files = self._get_files_with_allowed_categories()
@@ -181,16 +183,18 @@ class Analysis:
             files_to_match = allowed_files
         else:
             # A category was given as a parameter, so we match only this specific category.
-            files_to_match = dict((k, v) for (k, v) in allowed_files.items() if v == cat)
+            files_to_match = {
+                k: v for k, v in allowed_files.items() if v == cat
+            }
         filename_pattern = FilenamePattern(self.filename_regex)
-        matching_files: typing.List[str] = []
-        non_matching_files: typing.List[str] = []
+        matching_files: list[Path] = []
+        non_matching_files: list[Path] = []
         categories = set()
         # We filter files by ensuring that they match the filename pattern defined in the analysis.
         for file, category in sorted(files_to_match.items()):
-            if filename_pattern.matches_pattern(os.path.basename(file)):
+            if filename_pattern.matches_pattern(file.name):
                 matching_files.append(file)
                 logger.info("MATCHED file %s.", file)
                 # Add the file category to the categories set.

genelastic/import_data/bi_process.py CHANGED Viewed

@@ -1,15 +1,17 @@
-# pylint: disable=missing-module-docstring
 import copy
-import typing
 from genelastic.common import BioInfoProcessData
 class BioInfoProcess:
     """Class representing a bio process."""
-    def __init__(self, proc_id: str,
-                 bundle_file: str | None = None,
-                 **data: str | typing.List[str]) -> None:
+    def __init__(
+        self,
+        proc_id: str,
+        bundle_file: str | None = None,
+        **data: str | list[str],
+    ) -> None:
         self._proc_id = proc_id
         self._bundle_file = bundle_file
         self._data: BioInfoProcessData = data

genelastic/import_data/bi_processes.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# pylint: disable=missing-module-docstring
 import logging
 import typing
@@ -6,14 +5,14 @@ from genelastic.common import BundleDict
 from .bi_process import BioInfoProcess
-logger = logging.getLogger('genelastic')
+logger = logging.getLogger("genelastic")
 class BioInfoProcesses:
     """Class BioInfoProcesses is a container of BioInfoProcess objects."""
     def __init__(self) -> None:
-        self._dict: typing.Dict[str, BioInfoProcess] = {}
+        self._dict: dict[str, BioInfoProcess] = {}
     def __len__(self) -> int:
         return len(self._dict)
@@ -27,20 +26,21 @@ class BioInfoProcesses:
         the program exits.
         """
         if process.id in self._dict:
-            raise ValueError(f"A bi process with the id '{process.id}' is already present.")
+            msg = f"A bi process with the id '{process.id}' is already present."
+            raise ValueError(msg)
         # Add one WetProcess object.
         self._dict[process.id] = process
-    def get_process_ids(self) -> typing.Set[str]:
+    def get_process_ids(self) -> set[str]:
         """Get a list of the bio processes IDs."""
         return set(self._dict.keys())
     @classmethod
-    def from_array_of_dicts(cls, arr: typing.Sequence[BundleDict]
-                            ) -> typing.Self:
+    def from_array_of_dicts(
+        cls, arr: typing.Sequence[BundleDict]
+    ) -> typing.Self:
         """Build a BioInfoProcesses instance."""
         bi_processes = cls()
         for d in arr:

genelastic/import_data/cli_gen_data.py ADDED Viewed

@@ -0,0 +1,143 @@
+import argparse
+import logging
+from pathlib import Path
+from biophony import DEFAULT_RATE, MutSimParams
+from genelastic.common import add_verbose_control_args
+from .logger import configure_logging
+from .random_bundle import (
+    RandomBundle,
+)
+logger = logging.getLogger("genelastic")
+def read_args() -> argparse.Namespace:
+    """Read arguments from the command line."""
+    parser = argparse.ArgumentParser(
+        description="Random bundle generator. "
+        "A bundle is a YAML file format used to import genetic data into an Elasticsearch database. "
+        "It can contain one or more analyses; "
+        "each analysis including metadata, references to "
+        "a wet lab and bioinformatics process "
+        "and paths to a VCF file and optionally to a coverage file.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    add_verbose_control_args(parser)
+    parser.add_argument(
+        "output_dir",
+        help="Path where analyses VCF and coverage files will be generated.",
+        type=Path,
+    )
+    parser.add_argument("--log-file", help="Path to a log file.")
+    parser.add_argument(
+        "-n",
+        "--chrom-nb",
+        type=int,
+        default=5,
+        help="Number of chromosomes to include in the generated VCF file.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-bundle",
+        default=None,
+        help="Path where the YAML bundle file will be written. "
+        "If no path is provided, the bundle is written to stdout.",
+        type=Path,
+    )
+    parser.add_argument(
+        "-l",
+        "--sequence-length",
+        type=int,
+        default=2000,
+        help="Sequence length (number of nucleotides) generated for each chromosome.",
+    )
+    parser.add_argument(
+        "-c",
+        "--coverage",
+        action="store_true",
+        help="Generate a coverage file for each analysis.",
+    )
+    parser.add_argument(
+        "-a",
+        "--analyses",
+        help="Number of analyses to generate. "
+        "Each analysis will reference a wet lab and bioinformatics process, "
+        "a VCF file and optionally a coverage file.",
+        default=1,
+        type=int,
+    )
+    parser.add_argument(
+        "-p",
+        "--processes",
+        help="Number of wet lab and bioinformatics processes to generate.",
+        default=1,
+        type=int,
+    )
+    parser.add_argument(
+        "-s",
+        "--snp-rate",
+        help="Generated VCF SNP rate.",
+        type=float,
+        default=DEFAULT_RATE,
+    )
+    parser.add_argument(
+        "-i",
+        "--ins-rate",
+        help="Generated VCF insertion rate.",
+        type=float,
+        default=DEFAULT_RATE,
+    )
+    parser.add_argument(
+        "-d",
+        "--del-rate",
+        help="Generated VCF deletion rate.",
+        type=float,
+        default=DEFAULT_RATE,
+    )
+    return parser.parse_args()
+def main() -> None:
+    """Entry point of the gen-data script."""
+    # Read command line arguments
+    args = read_args()
+    output_dir = args.output_dir.resolve()
+    if not output_dir.is_dir():
+        msg = f"ERROR: '{output_dir}' does not exist or is not a directory."
+        raise SystemExit(msg)
+    if args.analyses < 1:
+        msg = "Analyses count must be at least 1."
+        raise SystemExit(msg)
+    if args.processes < 1:
+        msg = "Processes count must be at least 1."
+        raise SystemExit(msg)
+    # Configure logging
+    configure_logging(args.verbose, log_file=args.log_file)
+    logger.debug("Arguments: %s", args)
+    # Write to stdout or file
+    RandomBundle(
+        output_dir,
+        args.analyses,
+        args.processes,
+        args.chrom_nb,
+        args.sequence_length,
+        MutSimParams(
+            snp_rate=args.snp_rate,
+            ins_rate=args.ins_rate,
+            del_rate=args.del_rate,
+        ),
+        do_gen_coverage=args.coverage,
+    ).to_yaml(args.output_bundle)
+if __name__ == "__main__":
+    main()

genelastic 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

genelastic 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl