sai-pg 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sai/__init__.py +18 -0
- sai/__main__.py +73 -0
- sai/parsers/__init__.py +18 -0
- sai/parsers/argument_validation.py +169 -0
- sai/parsers/outlier_parser.py +76 -0
- sai/parsers/plot_parser.py +152 -0
- sai/parsers/score_parser.py +241 -0
- sai/sai.py +315 -0
- sai/stats/__init__.py +18 -0
- sai/stats/features.py +302 -0
- sai/utils/__init__.py +22 -0
- sai/utils/generators/__init__.py +23 -0
- sai/utils/generators/chunk_generator.py +148 -0
- sai/utils/generators/data_generator.py +49 -0
- sai/utils/generators/window_generator.py +250 -0
- sai/utils/genomic_dataclasses.py +46 -0
- sai/utils/multiprocessing/__init__.py +22 -0
- sai/utils/multiprocessing/mp_manager.py +251 -0
- sai/utils/multiprocessing/mp_pool.py +73 -0
- sai/utils/preprocessors/__init__.py +23 -0
- sai/utils/preprocessors/chunk_preprocessor.py +152 -0
- sai/utils/preprocessors/data_preprocessor.py +94 -0
- sai/utils/preprocessors/feature_preprocessor.py +211 -0
- sai/utils/utils.py +689 -0
- sai_pg-1.0.0.dist-info/METADATA +44 -0
- sai_pg-1.0.0.dist-info/RECORD +30 -0
- sai_pg-1.0.0.dist-info/WHEEL +5 -0
- sai_pg-1.0.0.dist-info/entry_points.txt +2 -0
- sai_pg-1.0.0.dist-info/licenses/LICENSE +674 -0
- sai_pg-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,250 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from itertools import combinations, product
|
22
|
+
from typing import Iterator, Any
|
23
|
+
from sai.utils import read_data, split_genome
|
24
|
+
from sai.utils.generators import DataGenerator
|
25
|
+
|
26
|
+
|
27
|
+
class WindowGenerator(DataGenerator):
    """
    Generates genomic data for each specified window from VCF and other related files,
    allowing the user to select the number of source populations.
    """

    def __init__(
        self,
        vcf_file: str,
        chr_name: str,
        ref_ind_file: str,
        tgt_ind_file: str,
        src_ind_file: str,
        win_len: int,
        win_step: int,
        start: int = None,
        end: int = None,
        anc_allele_file: str = None,
        num_src: int = 1,
    ):
        """
        Initializes a new instance of WindowGenerator.

        Parameters
        ----------
        vcf_file : str
            The path to the VCF file containing variant data.
        chr_name : str
            The chromosome name to read from the VCF file.
        ref_ind_file : str
            The path to the file containing identifiers for reference populations.
        tgt_ind_file : str
            The path to the file containing identifiers for target populations.
        src_ind_file : str
            The path to the file containing identifiers for source populations.
        win_len : int
            The length of each window in base pairs.
        win_step : int
            The step size between windows in base pairs.
        start : int, optional
            The starting position (1-based, inclusive) on the chromosome. Default: None.
        end : int, optional
            The ending position (1-based, inclusive) on the chromosome. Default: None.
        anc_allele_file : str, optional
            Path to the file containing ancestral allele information. Default: None.
        num_src : int, optional
            The number of source populations to include in each combination. Default: 1.

        Raises
        ------
        ValueError
            If `win_len` is less than or equal to 0, if `win_step` is negative,
            or if `num_src` is less than 1.
        """
        if win_len <= 0:
            raise ValueError("`win_len` must be greater than 0.")
        if win_step < 0:
            raise ValueError("`win_step` must be non-negative.")
        if num_src < 1:
            raise ValueError("`num_src` must be at least 1.")

        self.win_len = win_len
        self.win_step = win_step
        self.num_src = num_src
        self.chr_name = chr_name

        # Load genotype data for all three population groups in one pass.
        # Data is read unphased and with every filter_* flag disabled.
        (
            self.ref_data,
            self.ref_samples,
            self.tgt_data,
            self.tgt_samples,
            self.src_data,
            self.src_samples,
            self.ploidy,
        ) = read_data(
            vcf_file=vcf_file,
            chr_name=self.chr_name,
            start=start,
            end=end,
            ref_ind_file=ref_ind_file,
            tgt_ind_file=tgt_ind_file,
            src_ind_file=src_ind_file,
            anc_allele_file=anc_allele_file,
            is_phased=False,
            filter_ref=False,
            filter_tgt=False,
            filter_src=False,
        )

        # All size-`num_src` combinations of source population names.
        self.src_combinations = list(
            combinations(self.src_samples.keys(), self.num_src)
        )
        # Per-target-population window list: windows come from the observed
        # variant positions when no region is given, otherwise from the
        # explicit [start, end] region.
        # NOTE(review): when exactly one of `start`/`end` is None, the else
        # branch evaluates `end - win_len + win_step` with a None operand and
        # raises TypeError -- confirm callers always pass both or neither.
        self.tgt_windows = {
            tgt_pop: split_genome(
                pos=(
                    self.tgt_data[tgt_pop].POS
                    if (start is None) and (end is None)
                    else [start, end - win_len + win_step]
                ),
                window_size=self.win_len,
                step_size=self.win_step,
            )
            for tgt_pop in self.tgt_samples
        }
        # One yielded task per (window, reference population, source
        # combination); targets are covered by the per-population window lists.
        self.total_windows = sum(
            len(windows) * len(self.ref_samples) * len(self.src_combinations)
            for windows in self.tgt_windows.values()
        )

    def _window_generator(self) -> Iterator[dict[str, Any]]:
        """
        Generator function that yields genomic data for each window for each
        population combination, including specified source population combinations.

        Yields
        ------
        dict
            A dictionary containing population names, start and end positions,
            ploidy and phase information, reference, target, and source genotypes,
            and positions for each window.
        """
        for ref_pop, tgt_pop, src_comb in product(
            self.ref_samples, self.tgt_samples, self.src_combinations
        ):
            tgt_pos = self.tgt_data[tgt_pop].POS
            for start, end in self.tgt_windows[tgt_pop]:
                # Windows are half-open: a position p is in the window when
                # start <= p < end.
                ref_gts = self.ref_data[ref_pop].GT[
                    (self.ref_data[ref_pop].POS >= start)
                    & (self.ref_data[ref_pop].POS < end)
                ]
                tgt_gts = self.tgt_data[tgt_pop].GT[
                    (self.tgt_data[tgt_pop].POS >= start)
                    & (self.tgt_data[tgt_pop].POS < end)
                ]
                src_gts_list = [
                    self.src_data[src_pop].GT[
                        (self.src_data[src_pop].POS >= start)
                        & (self.src_data[src_pop].POS < end)
                    ]
                    for src_pop in src_comb
                ]

                # Target-population variant positions falling in this window.
                sub_pos = tgt_pos[(tgt_pos >= start) & (tgt_pos < end)]

                yield {
                    "chr_name": self.chr_name,
                    "ref_pop": ref_pop,
                    "tgt_pop": tgt_pop,
                    "src_pop_list": src_comb,  # List of source populations in this combination
                    "start": start,
                    "end": end,
                    "pos": sub_pos,
                    "ref_gts": ref_gts,
                    "tgt_gts": tgt_gts,
                    "src_gts_list": src_gts_list,  # List of genotypes for each source population in src_comb
                    "ploidy": self.ploidy,
                }

    def _none_window_generator(self) -> Iterator[dict[str, Any]]:
        """
        Generates empty window data when reference, target, or source data is missing.

        Yields
        ------
        dict[str, Any]
            A dictionary containing the following keys:
            - "chr_name" (str): The chromosome name.
            - "ref_pop" (str): Reference population name.
            - "tgt_pop" (str): Target population name.
            - "src_pop_list" (list[str]): List of source populations in this combination.
            - "start" (int): Start position of the window.
            - "end" (int): End position of the window.
            - "pos" (list[int]): Empty list, since no positions are available.
            - "ref_gts" (None): Placeholder for missing reference genotypes.
            - "tgt_gts" (None): Placeholder for missing target genotypes.
            - "src_gts_list" (None): Placeholder for missing source genotypes.
            - "ploidy" (None): Placeholder for missing ploidy information.
        """
        # Same iteration order as _window_generator, so downstream consumers
        # see an identical sequence of (window, population) combinations.
        for ref_pop, tgt_pop, src_comb in product(
            self.ref_samples, self.tgt_samples, self.src_combinations
        ):
            for start, end in self.tgt_windows[tgt_pop]:
                yield {
                    "chr_name": self.chr_name,
                    "ref_pop": ref_pop,
                    "tgt_pop": tgt_pop,
                    "src_pop_list": src_comb,
                    "start": start,
                    "end": end,
                    "pos": [],
                    "ref_gts": None,
                    "tgt_gts": None,
                    "src_gts_list": None,
                    "ploidy": None,
                }

    def get(self) -> Iterator[dict[str, Any]]:
        """
        Returns the generator for window data.

        Returns
        -------
        generator
            A generator yielding genomic data for each window; falls back to
            placeholder (empty) windows when any population's data is missing.
        """
        if (
            (self.ref_data is None)
            or (self.tgt_data is None)
            or (self.src_data is None)
        ):
            return self._none_window_generator()
        else:
            return self._window_generator()

    def __len__(self) -> int:
        """
        Returns the precomputed total number of windows across all population combinations.

        Returns
        -------
        int
            Total number of windows.
        """
        return self.total_windows
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import allel
|
22
|
+
from dataclasses import dataclass
|
23
|
+
|
24
|
+
|
25
|
+
@dataclass
class ChromosomeData:
    """
    A data structure for storing chromosome-specific genotype information.

    Attributes
    ----------
    REF : list[str]
        Reference allele for each variant position.
    ALT : list[str]
        Alternate allele for each variant position.
    POS : list[int]
        Genomic position of each variant.
    GT : list[allel.GenotypeVector]
        Genotype vectors, one per variant position.

    Notes
    -----
    NOTE(review): consumers elsewhere in the package apply vectorized
    comparisons such as ``data.POS >= start`` and boolean-mask indexing on
    ``GT``, which suggests these fields actually hold NumPy/scikit-allel
    array types at runtime rather than plain lists -- confirm before
    relying on list semantics.
    """

    # Per-variant reference alleles.
    REF: list[str]
    # Per-variant alternate alleles.
    ALT: list[str]
    # Per-variant genomic coordinates.
    POS: list[int]
    # Per-variant genotype vectors.
    GT: list[allel.GenotypeVector]
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Copyright 2024 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from .mp_manager import mp_manager
|
22
|
+
from .mp_pool import mp_pool
|
@@ -0,0 +1,251 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import multiprocessing
|
22
|
+
import queue
|
23
|
+
import time
|
24
|
+
from multiprocessing import current_process
|
25
|
+
from multiprocessing import Manager
|
26
|
+
from multiprocessing import Process
|
27
|
+
from threading import Thread
|
28
|
+
from sai.utils.generators import DataGenerator
|
29
|
+
from sai.utils.preprocessors import DataPreprocessor
|
30
|
+
|
31
|
+
|
32
|
+
def monitor(shared_dict: dict, workers: list[multiprocessing.Process]) -> None:
    """
    Monitors worker processes and initiates a shutdown if any worker fails.

    Polls the shared status dictionary once per second. Returns as soon as
    every worker has reported 'Completed'. If any worker process is no longer
    alive without having reported 'Completed', all workers are terminated via
    `terminate_all_workers` and the function returns.

    Parameters
    ----------
    shared_dict : dict
        A shared dictionary managed by a multiprocessing Manager. Worker
        processes use this dictionary to update their status. Keys are worker
        process names; values are status strings ('Completed', 'Failed', etc.).
    workers : list[multiprocessing.Process]
        The worker processes to be monitored.

    Notes
    -----
    - Assumes workers update their status in `shared_dict` upon completion
      or failure.
    - A 1-second polling interval balances responsiveness with efficiency.
    """
    while True:
        # Success: every worker has reported 'Completed' in the shared dict.
        if all(shared_dict.get(worker.name) == "Completed" for worker in workers):
            return

        # Failure: a worker died without reporting 'Completed' -- shut down
        # all remaining workers so the manager does not hang forever.
        for worker in workers:
            if not worker.is_alive() and shared_dict.get(worker.name) != "Completed":
                print(
                    f"{worker.name} did not complete successfully. Initiating shutdown."
                )
                terminate_all_workers(workers)
                print("All workers are terminated.")
                return

        time.sleep(1)  # Check periodically
|
76
|
+
|
77
|
+
|
78
|
+
def terminate_all_workers(workers: list[multiprocessing.Process]) -> None:
    """
    Terminate every worker process and wait for each one to exit.

    A terminate signal is sent to all workers first, so they shut down
    concurrently; each process is then joined, guaranteeing that none is
    left running when this function returns.

    Parameters
    ----------
    workers : list[multiprocessing.Process]
        The worker processes to be terminated.

    Notes
    -----
    - Typically called for a clean shutdown after an error, or once all
      work has finished.
    """
    # Signal every process before joining any of them -- terminating first
    # lets the shutdowns overlap instead of happening one at a time.
    for proc in workers:
        proc.terminate()
    for proc in workers:
        proc.join()  # Reap the process; blocks until it has exited.
|
101
|
+
|
102
|
+
|
103
|
+
def mp_manager(
    data_processor: DataPreprocessor,
    data_generator: DataGenerator,
    nprocess: int,
    **kwargs,
) -> None:
    """
    Manages the distribution of tasks across multiple processes for parallel execution.

    Each parameter dictionary yielded by `data_generator.get()` is queued as one
    task. A pool of `nprocess` worker processes (running `mp_worker`) consumes
    the queue, calling `data_processor.run(**params, **kwargs)` per task; the
    collected results are finally handed to `data_processor.process_items()`.

    Parameters
    ----------
    data_processor : DataPreprocessor
        An instance of `DataPreprocessor` whose `run` method executes each task
        and whose `process_items` method consumes the combined results.
    data_generator : DataGenerator
        An instance of a `DataGenerator` subclass that yields dictionaries with
        parameters for each task. The `run` method of `data_processor` must be
        compatible with the parameter dictionaries returned by the generator's
        `get` method. `len(data_generator)` must equal the number of tasks.
    nprocess : int
        The number of worker processes to use for executing the tasks in
        parallel.
    **kwargs : dict
        Additional keyword arguments forwarded as-is to every `run` invocation.

    Notes
    -----
    - A multiprocessing Manager provides the shared queues and status
      dictionary used for task distribution and worker monitoring.
    - A monitoring thread shuts all workers down if any of them dies without
      reporting completion; `cleanup_on_sigterm` (when pytest-cov is
      installed) makes that termination coverage-safe.
    """
    try:
        from pytest_cov.embed import cleanup_on_sigterm
    except ImportError:
        pass
    else:
        # Under pytest-cov, flush coverage data when a worker receives
        # SIGTERM (workers are terminated in the finally block below).
        cleanup_on_sigterm()

    with Manager() as manager:
        in_queue, out_queue = manager.Queue(), manager.Queue()
        shared_dict = manager.dict()  # worker process name -> status string
        workers = [
            Process(
                target=mp_worker, args=(in_queue, out_queue, shared_dict), kwargs=kwargs
            )
            for i in range(nprocess)
        ]

        # Enqueue every task before starting the workers, so an idle worker's
        # 5-second queue timeout (see mp_worker) cannot fire before the queue
        # has been filled.
        for params in data_generator.get():
            in_queue.put((data_processor, params))

        try:
            for w in workers:
                w.start()

            # Watchdog: terminates all workers if any one of them dies
            # without reporting 'Completed' in shared_dict.
            monitor_thread = Thread(target=monitor, args=(shared_dict, workers))
            monitor_thread.start()

            results = []

            # Expect one result item per task; len(data_generator) is the
            # task count.
            for i in range(len(data_generator)):
                items = out_queue.get()
                if items is None:
                    continue
                # Workers report failures as ("error", message) tuples; stop
                # collecting on the first failure.
                if isinstance(items, tuple) and "error" in items:
                    break

                results.extend(items)

            if results:
                data_processor.process_items(results)

            for w in workers:
                w.join()
        finally:
            # Best-effort cleanup; terminating an already-exited process is
            # a no-op.
            # NOTE(review): if w.start() itself raises, `monitor_thread` was
            # never assigned and this block raises NameError, masking the
            # original error -- confirm whether that path can occur.
            for w in workers:
                w.terminate()
            monitor_thread.join()
|
193
|
+
|
194
|
+
|
195
|
+
def mp_worker(
    in_queue: queue.Queue, out_queue: queue.Queue, shared_dict: dict, **kwargs
) -> None:
    """
    Consume tasks from an input queue, run them, and report results and status.

    Each task is a `(data_processor, params)` pair fetched from `in_queue`;
    the worker calls `data_processor.run(**params, **kwargs)` and places the
    result on `out_queue`. When the input queue stays empty for 5 seconds the
    worker records 'Completed' in `shared_dict` and exits. On any other
    exception it records 'Failed', posts an ``("error", message)`` tuple to
    `out_queue`, and re-raises.

    Parameters
    ----------
    in_queue : multiprocessing.managers.SyncManager.Queue
        The input queue from which the worker fetches tasks.
    out_queue : multiprocessing.managers.SyncManager.Queue
        The output queue where the worker posts results of processed tasks.
    shared_dict : dict
        A shared dictionary keyed by the worker's process name, holding its
        status ('Started', 'Completed', 'Failed').
    **kwargs : dict
        Additional keyword arguments forwarded to each task's `run` call.

    Raises
    ------
    Exception
        Re-raised wrapper for any error encountered while processing a task,
        after status and error message have been published.

    Notes
    -----
    - The 5-second `get` timeout doubles as the shutdown signal: an empty
      queue means there is no more work.
    """
    worker_name = current_process().name
    shared_dict[worker_name] = "Started"

    while True:
        try:
            try:
                # A task is a (processor, params) pair; an empty queue for
                # 5 seconds means all work has been handed out.
                processor, task_params = in_queue.get(timeout=5)
                result = processor.run(**task_params, **kwargs)
            except queue.Empty:
                shared_dict[worker_name] = "Completed"
                return  # Normal shutdown: no work left.
            out_queue.put(result)
        except Exception as exc:
            # Publish the failure before dying so the manager and monitor
            # can react (collection loop breaks, watchdog shuts pool down).
            shared_dict[worker_name] = "Failed"
            out_queue.put(("error", str(exc)))
            raise Exception(f"Worker {worker_name} encountered an exception: {exc}")
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from multiprocessing import Pool
|
22
|
+
from typing import Any
|
23
|
+
from sai.utils.generators import DataGenerator
|
24
|
+
from sai.utils.preprocessors import DataPreprocessor
|
25
|
+
|
26
|
+
|
27
|
+
def mp_worker(params: tuple[DataPreprocessor, dict]) -> Any:
    """
    Unpack a task and execute its preprocessor's `run` method.

    Parameters
    ----------
    params : tuple of (DataPreprocessor, dict)
        An instance of `DataPreprocessor` paired with the keyword-argument
        dictionary for its `run` method.

    Returns
    -------
    Any
        Whatever `run` returns for the given parameters.
    """
    # Tasks arrive as a single picklable tuple because Pool.map passes
    # exactly one argument per task.
    processor, run_kwargs = params
    return processor.run(**run_kwargs)
|
43
|
+
|
44
|
+
|
45
|
+
def mp_pool(
    data_processor: DataPreprocessor,
    data_generator: DataGenerator,
    nprocess: int,
) -> None:
    """
    Fan data processing tasks out over a multiprocessing pool.

    Every parameter dictionary yielded by `data_generator.get()` becomes one
    task, executed by `mp_worker` in a pool of `nprocess` processes; the
    combined results are passed to `data_processor.process_items()`.

    Parameters
    ----------
    data_processor : DataPreprocessor
        An instance of `DataPreprocessor` responsible for processing data.
    data_generator : DataGenerator
        A generator that yields parameter dictionaries for processing.
    nprocess : int
        The number of worker processes to use.

    Returns
    -------
    None
        The processed results are handled by `data_processor.process_items()`.
    """
    # Materialize all tasks up front; Pool.map needs a sized iterable and
    # each task must carry its own (picklable) processor reference.
    work_items: list[tuple[DataPreprocessor, dict]] = [
        (data_processor, task_params) for task_params in data_generator.get()
    ]
    with Pool(processes=nprocess) as worker_pool:
        outputs = worker_pool.map(mp_worker, work_items)

    data_processor.process_items(outputs)
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from .data_preprocessor import DataPreprocessor
|
22
|
+
from .chunk_preprocessor import ChunkPreprocessor
|
23
|
+
from .feature_preprocessor import FeaturePreprocessor
|