genelastic-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/__init__.py +13 -0
- genelastic/analyses.py +70 -0
- genelastic/analysis.py +200 -0
- genelastic/bi_process.py +25 -0
- genelastic/bi_processes.py +48 -0
- genelastic/common.py +151 -0
- genelastic/constants.py +45 -0
- genelastic/data_file.py +82 -0
- genelastic/filename_pattern.py +62 -0
- genelastic/gen_data.py +193 -0
- genelastic/import_bundle.py +134 -0
- genelastic/import_bundle_factory.py +288 -0
- genelastic/import_data.py +294 -0
- genelastic/info.py +248 -0
- genelastic/integrity.py +324 -0
- genelastic/logger.py +56 -0
- genelastic/tags.py +102 -0
- genelastic/validate_data.py +41 -0
- genelastic/wet_process.py +24 -0
- genelastic/wet_processes.py +47 -0
- genelastic-0.6.0.dist-info/METADATA +36 -0
- genelastic-0.6.0.dist-info/RECORD +25 -0
- genelastic-0.6.0.dist-info/WHEEL +5 -0
- genelastic-0.6.0.dist-info/entry_points.txt +6 -0
- genelastic-0.6.0.dist-info/top_level.txt +1 -0
genelastic/__init__.py
ADDED
@@ -0,0 +1,13 @@
+"""Genelastic package for importing Genomic data into Elasticsearch.
+"""
+
+from .import_bundle import ImportBundle
+from .common import BundleDict
+from .constants import BUNDLE_CURRENT_VERSION
+from .import_bundle_factory import make_import_bundle_from_files, \
+    load_import_bundle_file
+from .analysis import Analysis
+from .analyses import Analyses
+
+__all__ = ['make_import_bundle_from_files', 'BUNDLE_CURRENT_VERSION',
+           'load_import_bundle_file', 'Analysis', 'ImportBundle']
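As a quick orientation (not part of the diff): the package's public entry points are the names re-exported above. A minimal sketch, assuming the wheel and its dependencies are installed:

    import genelastic

    # Version of the bundle YAML format handled by this release (see constants.py below).
    print(genelastic.BUNDLE_CURRENT_VERSION)
    print(genelastic.__all__)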
genelastic/analyses.py
ADDED
@@ -0,0 +1,70 @@
+# pylint: disable=missing-module-docstring
+import typing
+from .analysis import Analysis
+from .common import BundleDict
+from .data_file import DataFile
+
+class Analyses:
+    """Class Analyses is a container of Analysis objects.
+    """
+
+    def __init__(self) -> None:
+        self._arr: typing.List[Analysis] = []
+        self._iter_index: int = 0
+
+    def __len__(self) -> int:
+        return len(self._arr)
+
+    def __iter__(self) -> typing.Generator[Analysis, typing.Any, None]:
+        yield from self._arr
+
+    @typing.overload
+    def __getitem__(self, k: int) -> Analysis:
+        pass
+
+    @typing.overload
+    def __getitem__(self, k: slice) -> typing.List[Analysis]:
+        pass
+
+    def __getitem__(self, k): # type: ignore
+        if isinstance(k, int):
+            return self._arr[k]
+        return self._arr[k.start:k.stop]
+
+    def add(self, a: Analysis) -> None:
+        """Add one Analysis object."""
+        self._arr.append(a)
+
+    def get_nb_files(self, cat: str | None = None) -> int:
+        """Get the total number of files as paths."""
+        return len(self.get_data_files(cat = cat))
+
+    def get_data_files(self, cat: str | None = None) -> typing.List[DataFile]:
+        """Get the total number of files as DataFile objects.
+        """
+
+        data_files: typing.List[DataFile] = []
+
+        for a in self._arr:
+            data_files.extend(a.get_data_files(cat = cat))
+
+        return data_files
+
+    def get_all_categories(self) -> typing.Set[str]:
+        """Return all the categories of the analyses."""
+        categories = set()
+        for a in self._arr:
+            categories.update(a.get_all_categories())
+        return categories
+
+    @classmethod
+    def from_array_of_dicts(cls, arr: typing.Sequence[BundleDict]
+                            ) -> typing.Self:
+        """Build an Analyses instance."""
+
+        analyses = cls()
+
+        for d in arr:
+            analyses.add(Analysis(**d))
+
+        return analyses
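For illustration (not part of the diff), a minimal sketch of the container API above. It operates on an empty container, since building Analysis objects requires a Tags instance from tags.py, which is not shown in this section; in practice the container is filled via Analyses.from_array_of_dicts() with parsed bundle entries.

    from genelastic.analyses import Analyses

    analyses = Analyses()                 # normally filled from parsed bundle entries
    print(len(analyses))                  # 0
    print(analyses.get_data_files())      # [] - nothing to aggregate yet
    print(analyses[0:5])                  # slice access returns a (possibly empty) list
    print(analyses.get_all_categories())  # set() until analyses are added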
genelastic/analysis.py
ADDED
@@ -0,0 +1,200 @@
+# pylint: disable=missing-module-docstring
+import copy
+import glob
+import logging
+import os
+import re
+import typing
+from pathlib import Path
+
+from .common import AnalysisMetaData
+from .constants import ALLOWED_CATEGORIES
+from .data_file import DataFile
+from .filename_pattern import FilenamePattern
+from .tags import Tags
+
+logger = logging.getLogger('genelastic')
+
+
+class Analysis:
+    """Class Analysis that represents an analysis."""
+
+    # pylint: disable-next=too-many-arguments
+    def __init__(self,
+                 tags: Tags,
+                 root_dir: str = '.',
+                 bundle_file: str | None = None,
+                 file_prefix: str | None = None,
+                 files: typing.Sequence[str] | None = None,
+                 data_path: str | None = None,
+                 **metadata: str | int) -> None:
+        self._bundle_file = bundle_file
+        self._file_prefix = file_prefix
+        self._files = files
+        self._data_path = Analysis._resolve_data_path(root_dir, data_path)
+        self._tags = tags
+        self._metadata: AnalysisMetaData = metadata
+        self._categories: typing.Set[str] = set()
+
+    @property
+    def metadata(self) -> AnalysisMetaData:
+        """Get metadata."""
+        return copy.deepcopy(self._metadata)
+
+    @property
+    def bundle_file(self) -> str | None:
+        """Get the bundle file."""
+        return self._bundle_file
+
+    @property
+    def filename_regex(self) -> str:
+        """
+        Resolve placeholders in a file prefix using metadata
+        and unresolved placeholders are converted to regex groups
+        """
+
+        x: str = r"^.+\.(?P<ext>vcf|cov)(\.gz)?$"
+
+        # Use existing generic prefix
+        if self._file_prefix:
+            x = self._file_prefix
+        # Replace %* tags
+        for tag_name, tag_attrs in self._tags.items:
+            field = tag_attrs["field"]
+            regex = tag_attrs["regex"]
+
+            # Build field regex
+            field_regex = (f"(?P<{field}>{self._metadata.get(field)})"
+                           if field in self._metadata else
+                           f"(?P<{field}>{regex})")
+            # Replace tag with field regex
+            x = x.replace(tag_name, field_regex)
+
+        # Check for tags that were not replaced.
+        groups = re.findall(self._tags.search_regex, x)
+        for match in groups:
+            logger.warning("String '%s' in key 'file_prefix' looks like an undefined tag. "
+                           "If this string is not a tag, you can ignore this warning.",
+                           match)
+
+        # Add missing start and end markers
+        if not x.startswith("^"):
+            x = "^" + x
+        if not x.endswith("$"):
+            x += (r"\.(?P<ext>" + '|'.join(ALLOWED_CATEGORIES)
+                  + r")(\.gz)?$")
+        logger.debug("File regex for %s: %s", self._bundle_file, x)
+
+        return x
+
+    def get_nb_files(self, cat: str | None = None) -> int:
+        """Returns the total number of files.
+        """
+        return len(self.get_file_paths(cat=cat))
+
+    def get_data_files(self, cat: str | None = None) -> typing.List[DataFile]:
+        """Returns the list of matched files as DataFile objects.
+        """
+
+        files = self.get_file_paths(cat=cat)
+        filename_pattern = FilenamePattern(self.filename_regex)
+
+        data_files: typing.List[DataFile] = []
+
+        for f in files:
+            try:
+                data_files.append(DataFile.make_from_bundle(
+                    path=f, bundle_path=self._bundle_file,
+                    pattern=filename_pattern))
+            except (IOError, ValueError) as e:
+                logger.error("Error processing file %s: %s", f, str(e))
+
+        return data_files
+
+    def get_file_paths(self, cat: str | None = None) -> typing.Sequence[str]:
+        """Returns the list of matched files.
+        """
+        files, _, _ = self._do_get_file_paths(cat=cat)
+        return files
+
+    def get_unmatched_file_paths(self, cat: str | None = None
+                                 ) -> typing.Sequence[str]:
+        """Returns the list of unmatched files.
+        """
+        _, files, _ = self._do_get_file_paths(cat=cat)
+        return files
+
+    def get_all_categories(self) -> typing.Set[str]:
+        """Returns all categories of the analysis."""
+        _, _, categories = self._do_get_file_paths()
+        return categories
+
+    @staticmethod
+    def _resolve_data_path(root_dir: str, data_path: str | None) -> str:
+        resolved_data_path = '' if data_path is None else data_path
+
+        if not os.path.isabs(resolved_data_path):
+            resolved_data_path = os.path.abspath(os.path.join(root_dir, resolved_data_path))
+
+        return resolved_data_path
+
+    def _get_files_with_allowed_categories(self) -> typing.Dict[str, str]:
+        # Create a dict to store allowed files. Keys are the filepaths,
+        # and values are their corresponding category.
+        allowed_files: typing.Dict[str, str] = {}
+        # If files are listed explicitly in the YAML in the 'files' attribute, process them.
+        if self._files is not None:
+            abs_filepaths = [Path(self._data_path) / f for f in self._files]
+            # Try to retrieve files matching allowed categories by checking their first suffix.
+            for file in abs_filepaths:
+                cat = file.suffixes[0][1:]
+                # Add each matching file and its category to the dict.
+                if cat in ALLOWED_CATEGORIES:
+                    allowed_files[str(file)] = cat
+        # Else, look for files on disk using the YAML 'data_path' attribute.
+        else:
+            # Try to retrieve files matching allowed categories using glob.
+            for cat in ALLOWED_CATEGORIES:
+                glob_res = []
+                glob_res.extend(glob.glob(os.path.join(self._data_path, f"*.{cat}")))
+                glob_res.extend(glob.glob(os.path.join(self._data_path, f"*.{cat}.gz")))
+
+                # Add each globed file and its category to the dict.
+                for g_file in glob_res:
+                    allowed_files[g_file] = cat
+
+        return allowed_files
+
+    def _do_get_file_paths(self, cat: str | None = None) \
+            -> tuple[typing.Sequence[str], typing.Sequence[str], typing.Set[str]]:
+
+        # Raise an error if the category given as a parameter is not part of the allowed categories.
+        if cat is not None and cat not in ALLOWED_CATEGORIES:
+            raise ValueError(f"Unknown category {cat}.")
+
+        # Obtain a dict of all files matching the allowed categories.
+        allowed_files = self._get_files_with_allowed_categories()
+
+        if cat is None:
+            # No category was given as a parameter, so we match all categories.
+            files_to_match = allowed_files
+        else:
+            # A category was given as a parameter, so we match only this specific category.
+            files_to_match = dict((k, v) for (k, v) in allowed_files.items() if v == cat)
+
+        filename_pattern = FilenamePattern(self.filename_regex)
+        matching_files: typing.List[str] = []
+        non_matching_files: typing.List[str] = []
+        categories = set()
+
+        # We filter files by ensuring that they match the filename pattern defined in the analysis.
+        for file, category in sorted(files_to_match.items()):
+            if filename_pattern.matches_pattern(os.path.basename(file)):
+                matching_files.append(file)
+                logger.info("MATCHED file %s.", file)
+                # Add the file category to the categories set.
+                categories.add(category)
+            else:
+                logger.warning("UNMATCHED file %s.", file)
+                non_matching_files.append(file)
+        return matching_files, non_matching_files, categories
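To make the filename_regex logic above concrete (not part of the diff): a standalone sketch of how a hypothetical file_prefix such as "%S_%D" is turned into a named-group regex using the default tag table from constants.py, and then matched against a file name.

    import re
    from genelastic.constants import DEFAULT_TAG2FIELD

    prefix = "%S_%D"   # hypothetical 'file_prefix' value from a bundle YAML
    for tag, attrs in DEFAULT_TAG2FIELD.items():
        prefix = prefix.replace(tag, f"(?P<{attrs['field']}>{attrs['regex']})")
    pattern = "^" + prefix + r"\.(?P<ext>vcf|cov)(\.gz)?$"

    m = re.match(pattern, "sampleA_30.vcf.gz")
    print(m.groupdict())   # {'sample_name': 'sampleA', 'cov_depth': '30', 'ext': 'vcf'}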
genelastic/bi_process.py
ADDED
@@ -0,0 +1,25 @@
+# pylint: disable=missing-module-docstring
+import copy
+import typing
+
+from .common import BioInfoProcessData
+
+
+class BioInfoProcess:
+    """Class representing a bio process."""
+    def __init__(self, proc_id: str,
+                 bundle_file: str | None = None,
+                 **data: str | typing.List[str]) -> None:
+        self._proc_id = proc_id
+        self._bundle_file = bundle_file
+        self._data: BioInfoProcessData = data
+
+    @property
+    def id(self) -> str:
+        """Get the bio process ID."""
+        return self._proc_id
+
+    @property
+    def data(self) -> BioInfoProcessData:
+        """Get data associated to the bio process."""
+        return copy.deepcopy(self._data)
genelastic/bi_processes.py
ADDED
@@ -0,0 +1,48 @@
+# pylint: disable=missing-module-docstring
+import logging
+import typing
+
+from .bi_process import BioInfoProcess
+from .common import BundleDict
+
+logger = logging.getLogger('genelastic')
+
+
+class BioInfoProcesses:
+    """Class BioInfoProcesses is a container of BioInfoProcess objects."""
+
+    def __init__(self) -> None:
+        self._dict: typing.Dict[str, BioInfoProcess] = {}
+
+    def __len__(self) -> int:
+        return len(self._dict)
+
+    def __getitem__(self, key: str) -> BioInfoProcess:
+        return self._dict[key]
+
+    def add(self, process: BioInfoProcess) -> None:
+        """Add one BioInfoProcess object.
+        If a BioInfoProcess object with the same ID already exists in the container,
+        the program exits.
+        """
+        if process.id in self._dict:
+            raise ValueError(f"A bi process with the id '{process.id}' is already present.")
+
+        # Add one WetProcess object.
+        self._dict[process.id] = process
+
+    def get_process_ids(self) -> typing.Set[str]:
+        """Get a list of the bio processes IDs."""
+        return set(self._dict.keys())
+
+    @classmethod
+    def from_array_of_dicts(cls, arr: typing.Sequence[BundleDict]
+                            ) -> typing.Self:
+        """Build a BioInfoProcesses instance."""
+
+        bi_processes = cls()
+
+        for d in arr:
+            bi_processes.add(BioInfoProcess(**d))
+
+        return bi_processes
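A short usage sketch for the two classes above (not part of the diff); the process IDs and extra keyword data are hypothetical examples of what a bundle might declare.

    from genelastic.bi_process import BioInfoProcess
    from genelastic.bi_processes import BioInfoProcesses

    entries = [
        {"proc_id": "align-call-v1", "aligner": "bwa-mem2", "steps": ["align", "call"]},
        {"proc_id": "align-call-v2", "aligner": "minimap2", "steps": ["align", "call"]},
    ]

    procs = BioInfoProcesses.from_array_of_dicts(entries)
    print(len(procs), procs.get_process_ids())
    print(procs["align-call-v1"].data)   # deep copy of the extra keyword data

    try:
        procs.add(BioInfoProcess("align-call-v1"))   # duplicate ID
    except ValueError as err:
        print(err)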
genelastic/common.py
ADDED
@@ -0,0 +1,151 @@
+"""
+Module: common
+
+This module contains custom types and functions shared by multiple genelastic scripts.
+"""
+import argparse
+import sys
+import typing
+import logging
+
+import elastic_transport
+import elasticsearch
+
+logger = logging.getLogger('genelastic')
+
+AnalysisMetaData: typing.TypeAlias = typing.Dict[str, str | int]
+WetProcessesData: typing.TypeAlias = typing.Dict[str, str | int | float]
+BioInfoProcessData: typing.TypeAlias = typing.Dict[str, str | typing.List[str]]
+BundleDict: typing.TypeAlias = typing.Dict[str, typing.Any]
+
+AnalysisDocument: typing.TypeAlias = typing.Dict[str, str | None | AnalysisMetaData]
+MetadataDocument: typing.TypeAlias = typing.Dict[str, int | str | typing.List[typing.Any | None]]
+ProcessDocument: typing.TypeAlias = (typing.Dict[str, str] |
+                                     WetProcessesData |
+                                     BioInfoProcessData)
+BulkItems: typing.TypeAlias = typing.List[typing.Dict[str, str |
+                                                      MetadataDocument |
+                                                      AnalysisDocument |
+                                                      ProcessDocument]]
+Bucket: typing.TypeAlias = typing.Dict[str, typing.Dict[typing.Any, typing.Any]]
+
+
+def connect_to_es(host: str, port: int, usr: str, pwd: str) -> elasticsearch.Elasticsearch:
+    """Connect to a remote Elasticsearch database."""
+    addr = f"https://{host}:{port}"
+    logger.info("Trying to connect to Elasticsearch at %s.", addr)
+
+    try:
+        es = elasticsearch.Elasticsearch(
+            addr,
+            # ssl_assert_fingerprint=args.es_cert_fp,
+            # ca_certs=args.es_cert,
+            verify_certs=False,
+            basic_auth=(usr, pwd)
+        )
+        logger.info(es.info())
+    except elastic_transport.TransportError as e:
+        logger.error(e.message)
+        sys.exit(1)
+    return es
+
+
+def run_composite_aggregation(es: elasticsearch.Elasticsearch,
+                              index: str, query: typing.Dict[str, typing.Any]) \
+        -> typing.List[Bucket]:
+    """
+    Executes a composite aggregation on an Elasticsearch index and returns all paginated results.
+
+    :param es: Elasticsearch client instance.
+    :param index: Name of the index to query.
+    :param query: Aggregation query to run.
+    :return: List of aggregation results.
+    """
+    # Extract the aggregation name from the query dict.
+    agg_name = next(iter(query["aggs"]))
+    all_buckets: typing.List[Bucket] = []
+
+    try:
+        logger.debug("Running composite aggregation query %s on index '%s'.", query, index)
+        response = es.search(index=index, body=query)
+    except elasticsearch.NotFoundError as e:
+        raise SystemExit(f"Error: {e.message} for index '{index}'.") from e
+
+    while True:
+        # Extract buckets from the response.
+        buckets: typing.List[Bucket] = response['aggregations'][agg_name]['buckets']
+        all_buckets.extend(buckets)
+
+        # Check if there are more results to fetch.
+        if 'after_key' in response['aggregations'][agg_name]:
+            after_key = response['aggregations'][agg_name]['after_key']
+            query['aggs'][agg_name]['composite']['after'] = after_key
+            try:
+                logger.debug("Running query %s on index '%s'.", query, index)
+                response = es.search(index=index, body=query) # Fetch the next page of results.
+            except elasticsearch.NotFoundError as e:
+                raise SystemExit(f"Error: {e.message} for index '{index}'.") from e
+        else:
+            break
+
+    return all_buckets
+
+
+def get_process_ids(es: elasticsearch.Elasticsearch, index: str, proc_field_name: str) \
+        -> typing.Set[str]:
+    """Return a set of process IDs."""
+    process_ids = set()
+
+    query = {
+        "size": 0,
+        "aggs": {
+            "get_proc_ids": {
+                "composite": {
+                    "sources": {"proc_id": {"terms": {"field": f"{proc_field_name}.keyword"}}},
+                    "size": 1000,
+                }
+            }
+        }
+    }
+
+    buckets: typing.List[Bucket] = run_composite_aggregation(es, index, query)
+
+    for bucket in buckets:
+        process_ids.add(bucket['key']['proc_id'])
+
+    return process_ids
+
+
+def add_verbose_control_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Add verbose control arguments to the parser.
+    Arguments are added to the parser by using its reference.
+    """
+    parser.add_argument('-q', '--quiet', dest='verbose', action='store_const',
+                        const=0, default=1,
+                        help='Set verbosity to 0 (quiet mode).')
+    parser.add_argument('-v', '--verbose', dest='verbose', action='count',
+                        default=1,
+                        help=('Verbose level. -v for information, -vv for debug,' +
+                              ' -vvv for trace.'))
+
+
+def add_es_connection_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Add arguments to the parser needed to gather ElasticSearch server connection parameters.
+    Arguments are added to the parser by using its reference.
+    """
+    parser.add_argument('--es-host', dest='es_host', default='localhost',
+                        help='Address of Elasticsearch host.')
+    parser.add_argument('--es-port', type=int, default=9200, dest='es_port',
+                        help='Elasticsearch port.')
+    parser.add_argument('--es-usr', dest='es_usr', default='elastic',
+                        help='Elasticsearch user.')
+    parser.add_argument('--es-pwd', dest='es_pwd', required=True,
+                        help='Elasticsearch password.')
+    parser.add_argument('--es-cert', dest='es_cert',
+                        help='Elasticsearch certificate file.')
+    parser.add_argument('--es-cert-fp', dest='es_cert_fp',
                        help='Elasticsearch certificate fingerprint.')
+    parser.add_argument('--es-index-prefix', dest='es_index_prefix',
+                        help='Add the given prefix to each index created during import.')
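As a usage sketch for the helpers above (not part of the diff): wiring the argument helpers into an argparse CLI and reading back the parsed values. The connect_to_es() call is left commented out because it exits the process on connection failure; host and password values are placeholders.

    import argparse
    from genelastic.common import (add_verbose_control_args,
                                   add_es_connection_args, connect_to_es)

    parser = argparse.ArgumentParser(description="Example CLI wiring.")
    add_verbose_control_args(parser)
    add_es_connection_args(parser)

    args = parser.parse_args(["-vv", "--es-host", "es.example.org", "--es-pwd", "secret"])
    print(args.verbose, args.es_host, args.es_port)   # 3 es.example.org 9200

    # es = connect_to_es(args.es_host, args.es_port, args.es_usr, args.es_pwd)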
genelastic/constants.py
ADDED
@@ -0,0 +1,45 @@
+"""
+Module: constants
+
+This module contains genelastic constants.
+"""
+import typing
+
+ALLOWED_CATEGORIES: typing.Final[typing.List[str]] = ['vcf', 'cov']
+
+BUNDLE_CURRENT_VERSION = 3
+
+DEFAULT_TAG_REGEX = "[^_-]+"
+DEFAULT_TAG_PREFIX = "%"
+DEFAULT_TAG_SUFFIX = ""
+
+DEFAULT_TAG2FIELD: typing.Final[typing.Dict[str, typing.Dict[str, str]]] = {
+    '%S': {
+        "field": 'sample_name',
+        "regex": DEFAULT_TAG_REGEX
+    },
+    '%F': {
+        "field": 'source',
+        "regex": DEFAULT_TAG_REGEX
+    },
+    '%W': {
+        "field": 'wet_process',
+        "regex": DEFAULT_TAG_REGEX
+    },
+    '%B': {
+        "field": 'bi_process',
+        "regex": DEFAULT_TAG_REGEX
+    },
+    '%D': {
+        "field": 'cov_depth',
+        "regex": DEFAULT_TAG_REGEX
+    },
+    '%A': {
+        "field": 'barcode',
+        "regex": DEFAULT_TAG_REGEX
+    },
+    '%R': {
+        "field": 'reference_genome',
+        "regex": DEFAULT_TAG_REGEX
+    }
+}
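For reference (not part of the diff), a tiny sketch showing how the allowed categories drive the extension part of the filename patterns built in analysis.py above; the file names are hypothetical.

    import re
    from genelastic.constants import ALLOWED_CATEGORIES

    ext_regex = r"\.(?P<ext>" + "|".join(ALLOWED_CATEGORIES) + r")(\.gz)?$"
    for name in ("sampleA.cov.gz", "sampleA.vcf", "readme.md"):
        m = re.search(ext_regex, name)
        print(name, "->", m.group("ext") if m else "no allowed category")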
genelastic/data_file.py
ADDED
@@ -0,0 +1,82 @@
+"""
+This module defines the DataFile class, which handles the representation,
+management, and extraction of metadata for a data file within a data bundle.
+
+It includes functionality to construct DataFile instances from paths and
+optional filename patterns, retrieve file paths and metadata, and support
+for extracting metadata from filenames using specified patterns.
+"""
+
+import logging
+import os
+import pathlib
+import typing
+from .filename_pattern import FilenamePattern
+from .common import AnalysisMetaData
+
+logger = logging.getLogger('genelastic')
+
+
+class DataFile:
+    """Class for handling a data file and its metadata."""
+
+    # Initializer
+    def __init__(self, path: str, bundle_path: str | None = None,
+                 metadata: typing.Optional[AnalysisMetaData] = None) -> None:
+        self._path = path
+        self._bundle_path = bundle_path # The bundle YAML file in which this
+        # file was listed.
+        self._metadata = {} if metadata is None else metadata
+
+    def __repr__(self) -> str:
+        return (f"File {self._path}, from bundle {self._bundle_path}"
+                + f", with metadata {self._metadata}")
+
+    # Get path
+    @property
+    def path(self) -> str:
+        """Retrieve the data file path."""
+        return self._path
+
+    def exists(self) -> bool:
+        """Tests if the associated file exists on disk."""
+        return os.path.isfile(self._path)
+
+    # Get bundle path
+    @property
+    def bundle_path(self) -> str | None:
+        """Retrieve the path to the associated data bundle file."""
+        return self._bundle_path
+
+    # Get metadata
+    @property
+    def metadata(self) -> AnalysisMetaData:
+        """Retrieve a copy of the metadata associated with the data file."""
+        return self._metadata.copy()
+
+    # Factory
+    @classmethod
+    def make_from_bundle(
+            cls,
+            path: str,
+            bundle_path: str | None,
+            pattern: typing.Optional[FilenamePattern] = None) -> 'DataFile':
+        """Construct a DataFile instance from a bundle path, file path,
+        and optional filename pattern."""
+        # Make absolute path
+        if not os.path.isabs(path) and not bundle_path is None:
+            path = os.path.join(os.path.dirname(bundle_path), path)
+
+        # Extract filename metadata
+        metadata = None
+        if pattern is not None:
+            metadata = pattern.extract_metadata(os.path.basename(path))
+
+        if metadata:
+            if "ext" not in metadata:
+                metadata["ext"] = pathlib.Path(path).suffixes[0][1:]
+
+            if "cov_depth" in metadata:
+                metadata["cov_depth"] = int(metadata["cov_depth"])
+
+        return cls(path, bundle_path, metadata)
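Finally, a small sketch of the DataFile API above (not part of the diff); the paths and metadata values are hypothetical, and no FilenamePattern is passed, so make_from_bundle() extracts no metadata and only resolves the path relative to the bundle's directory.

    from genelastic.data_file import DataFile

    df = DataFile("/data/run1/sampleA_30.vcf.gz",
                  bundle_path="/data/bundles/run1.yml",
                  metadata={"sample_name": "sampleA", "cov_depth": 30})
    print(df.path, df.exists())   # exists() is False unless the file really is on disk
    print(df.metadata)            # a copy; mutating it does not affect the DataFile

    df2 = DataFile.make_from_bundle(path="sampleA_30.vcf.gz",
                                    bundle_path="/data/bundles/run1.yml")
    print(df2.path)               # e.g. /data/bundles/sampleA_30.vcf.gz on POSIX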