PyPI - omnigenome - Versions diffs - 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl - Mend

omnigenome 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66)

omnigenome/__init__.py +16 -8
omnigenome/auto/auto_bench/__init__.py +0 -1
omnigenome/auto/auto_bench/auto_bench.py +24 -14
omnigenome/auto/auto_train/__init__.py +0 -1
omnigenome/auto/auto_train/auto_train.py +11 -12
omnigenome/auto/bench_hub/__init__.py +0 -1
omnigenome/auto/bench_hub/bench_hub.py +1 -1
omnigenome/cli/__init__.py +0 -1
omnigenome/cli/commands/__init__.py +0 -1
omnigenome/cli/commands/base.py +10 -10
omnigenome/cli/commands/bench/__init__.py +0 -1
omnigenome/cli/commands/bench/bench_cli.py +10 -10
omnigenome/cli/commands/rna/__init__.py +0 -1
omnigenome/cli/commands/rna/rna_design.py +10 -11
omnigenome/src/__init__.py +0 -1
omnigenome/src/abc/__init__.py +0 -1
omnigenome/src/abc/abstract_dataset.py +38 -19
omnigenome/src/abc/abstract_metric.py +7 -7
omnigenome/src/abc/abstract_model.py +15 -14
omnigenome/src/abc/abstract_tokenizer.py +9 -7
omnigenome/src/dataset/omni_dataset.py +16 -14
omnigenome/src/lora/__init__.py +0 -1
omnigenome/src/lora/lora_model.py +47 -41
omnigenome/src/metric/classification_metric.py +11 -11
omnigenome/src/metric/metric.py +19 -19
omnigenome/src/metric/ranking_metric.py +15 -15
omnigenome/src/metric/regression_metric.py +18 -18
omnigenome/src/misc/utils.py +40 -36
omnigenome/src/model/augmentation/__init__.py +0 -1
omnigenome/src/model/augmentation/model.py +17 -17
omnigenome/src/model/classification/__init__.py +0 -1
omnigenome/src/model/classification/model.py +28 -32
omnigenome/src/model/embedding/__init__.py +0 -1
omnigenome/src/model/embedding/model.py +35 -35
omnigenome/src/model/mlm/__init__.py +0 -1
omnigenome/src/model/mlm/model.py +13 -13
omnigenome/src/model/module_utils.py +17 -17
omnigenome/src/model/regression/__init__.py +0 -1
omnigenome/src/model/regression/model.py +72 -77
omnigenome/src/model/regression/resnet.py +32 -32
omnigenome/src/model/rna_design/__init__.py +0 -1
omnigenome/src/model/rna_design/model.py +65 -58
omnigenome/src/model/seq2seq/__init__.py +0 -1
omnigenome/src/model/seq2seq/model.py +4 -4
omnigenome/src/tokenizer/bpe_tokenizer.py +27 -27
omnigenome/src/tokenizer/kmers_tokenizer.py +22 -22
omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +11 -11
omnigenome/src/trainer/accelerate_trainer.py +40 -32
omnigenome/src/trainer/hf_trainer.py +8 -8
omnigenome/src/trainer/trainer.py +37 -25
omnigenome/utility/dataset_hub/__init__.py +0 -1
omnigenome/utility/dataset_hub/dataset_hub.py +13 -13
omnigenome/utility/ensemble.py +26 -26
omnigenome/utility/hub_utils.py +8 -8
omnigenome/utility/model_hub/__init__.py +0 -1
omnigenome/utility/model_hub/model_hub.py +26 -25
omnigenome/utility/pipeline_hub/__init__.py +0 -1
omnigenome/utility/pipeline_hub/pipeline.py +49 -49
omnigenome/utility/pipeline_hub/pipeline_hub.py +17 -17
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/METADATA +2 -2
omnigenome-0.3.1a0.dist-info/RECORD +78 -0
omnigenome-0.3.0a1.dist-info/RECORD +0 -78
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/WHEEL +0 -0
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/entry_points.txt +0 -0
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/licenses/LICENSE +0 -0
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/top_level.txt +0 -0

omnigenome/src/trainer/trainer.py CHANGED Viewed

@@ -29,14 +29,14 @@ from torch.cuda.amp import GradScaler
 def _infer_optimization_direction(metrics, prev_metrics):
     """
     Infer the optimization direction based on metric names and trends.
     This function determines whether larger or smaller values are better for
     the given metrics by analyzing metric names and their trends over time.
     Args:
         metrics (dict): Current metric values
         prev_metrics (list): Previous metric values from multiple epochs
     Returns:
         str: Either "larger_is_better" or "smaller_is_better"
     """
@@ -98,11 +98,11 @@ def _infer_optimization_direction(metrics, prev_metrics):
 class Trainer:
     """
     Comprehensive trainer for OmniGenome models.
     This trainer provides a complete training framework with automatic mixed precision,
     early stopping, metric tracking, and model checkpointing. It supports various
     training configurations and can handle different types of genomic sequence tasks.
     Attributes:
         model: The model to be trained
         train_loader: DataLoader for training data
@@ -118,7 +118,7 @@ class Trainer:
         metrics: Dictionary to store training metrics
         predictions: Dictionary to store model predictions
     """
     def __init__(
         self,
         model,
@@ -139,7 +139,7 @@ class Trainer:
     ):
         """
         Initialize the trainer.
         Args:
             model: The model to be trained
             train_dataset: Training dataset
@@ -191,7 +191,9 @@ class Trainer:
         )
         self.seed = seed
         self.device = device if device else autocuda.auto_cuda()
-        self.device = torch.device(self.device) if isinstance(self.device, str) else self.device
+        self.device = (
+            torch.device(self.device) if isinstance(self.device, str) else self.device
+        )
         self.fast_dtype = {
             "float32": torch.float32,
@@ -218,11 +220,11 @@ class Trainer:
     def _is_metric_better(self, metrics, stage="valid"):
         """
         Check if the current metrics are better than the best metrics so far.
         Args:
             metrics (dict): Current metric values
             stage (str): Stage name ("valid" or "test")
         Returns:
             bool: True if current metrics are better than best metrics
         """
@@ -268,11 +270,11 @@ class Trainer:
     def train(self, path_to_save=None, **kwargs):
         """
         Train the model.
         Args:
             path_to_save (str, optional): Path to save the best model
             **kwargs: Additional keyword arguments
         Returns:
             dict: Training metrics and results
         """
@@ -300,19 +302,29 @@ class Trainer:
                     self.optimizer.zero_grad()
                 if self.fast_dtype:
-                    with torch.autocast(device_type=self.device.type, dtype=self.fast_dtype):
+                    with torch.autocast(
+                        device_type=self.device.type, dtype=self.fast_dtype
+                    ):
                         outputs = self.model(**batch)
                 else:
                     outputs = self.model(**batch)
                 if "loss" not in outputs:
                     # Generally, the model should return a loss in the outputs via OmniGenBench
                     # For the Lora models, the loss is computed separately
-                    if hasattr(self.model, "loss_function") and callable(self.model.loss_function):
-                        loss = self.model.loss_function(outputs['logits'], outputs["labels"])
-                    elif (hasattr(self.model, "model")
-                          and hasattr(self.model.model, "loss_function")
-                          and callable(self.model.model.loss_function)):
-                        loss = self.model.model.loss_function(outputs['logits'], outputs["labels"])
+                    if hasattr(self.model, "loss_function") and callable(
+                        self.model.loss_function
+                    ):
+                        loss = self.model.loss_function(
+                            outputs["logits"], outputs["labels"]
+                        )
+                    elif (
+                        hasattr(self.model, "model")
+                        and hasattr(self.model.model, "loss_function")
+                        and callable(self.model.model.loss_function)
+                    ):
+                        loss = self.model.model.loss_function(
+                            outputs["logits"], outputs["labels"]
+                        )
                     else:
                         raise ValueError(
                             "The model does not have a loss function defined. "
@@ -480,10 +492,10 @@ class Trainer:
     def get_model(self, **kwargs):
         """
         Get the trained model.
         Args:
             **kwargs: Additional keyword arguments
         Returns:
             The trained model
         """
@@ -492,7 +504,7 @@ class Trainer:
     def compute_metrics(self):
         """
         Get the metric computation functions.
         Returns:
             list: List of metric computation functions
         """
@@ -501,10 +513,10 @@ class Trainer:
     def unwrap_model(self, model=None):
         """
         Unwrap the model from any distributed training wrappers.
         Args:
             model: Model to unwrap (default: None, uses self.model)
         Returns:
             The unwrapped model
         """
@@ -538,7 +550,7 @@ class Trainer:
         """
         if os.path.exists(self._model_state_dict_path):
             self.unwrap_model().load_state_dict(
-                torch.load(self._model_state_dict_path, map_location='cpu')
+                torch.load(self._model_state_dict_path, map_location="cpu")
             )
             self.unwrap_model().to(self.device)

omnigenome/utility/dataset_hub/__init__.py CHANGED Viewed

@@ -10,4 +10,3 @@
 """
 This package contains modules for the dataset hub.
 """

omnigenome/utility/dataset_hub/dataset_hub.py CHANGED Viewed

@@ -32,11 +32,11 @@ def load_benchmark_datasets(
 ):
     """
     Load benchmark datasets from the OmniGenome hub.
     This function automatically downloads benchmark datasets if they don't exist locally,
     loads their configurations, and initializes train/validation/test datasets with
     the specified tokenizer.
     Args:
         benchmark (str): Name or path of the benchmark to load. If the benchmark
             doesn't exist locally, it will be downloaded from the hub.
@@ -46,17 +46,17 @@ def load_benchmark_datasets(
             be loaded from the benchmark configuration.
         **kwargs: Additional keyword arguments to override benchmark configuration.
             These will be passed to the dataset classes and tokenizer initialization.
     Returns:
         dict: Dictionary containing datasets for each benchmark task, with keys
             being benchmark names and values being dictionaries with 'train',
             'valid', and 'test' datasets.
     Raises:
         FileNotFoundError: If the benchmark cannot be found or downloaded.
         ValueError: If the benchmark configuration is invalid.
         ImportError: If required dependencies are not available.
     Example:
         >>> from omnigenome import OmniSingleNucleotideTokenizer
         >>> tokenizer = OmniSingleNucleotideTokenizer.from_pretrained("model_name")
@@ -64,7 +64,7 @@ def load_benchmark_datasets(
         >>> print(f"Loaded {len(datasets)} benchmark tasks")
         >>> for task_name, task_datasets in datasets.items():
         ...     print(f"{task_name}: {len(task_datasets['train'])} train samples")
     Note:
         - The function automatically handles U/T conversion and other preprocessing
           based on the benchmark configuration.
@@ -80,7 +80,7 @@ def load_benchmark_datasets(
             "does not exist. Search online for available benchmarks.",
         )
         benchmark = download_benchmark(benchmark)
     # Import benchmark list
     bench_metadata = load_module_from_path(
         f"bench_metadata", f"{benchmark}/metadata.py"
@@ -107,9 +107,7 @@ def load_benchmark_datasets(
         for key, value in _kwargs.items():
             if key in bench_config:
-                fprint(
-                    "Override", key, "with", value, "according to the input kwargs"
-                )
+                fprint("Override", key, "with", value, "according to the input kwargs")
                 bench_config.update({key: value})
             else:
@@ -170,9 +168,11 @@ def load_benchmark_datasets(
             "valid": valid_set,
         }
-        fprint(f"Loaded dataset for {bench} with {len(train_set)} train samples, "
-              f"{len(test_set)} test samples and {len(valid_set)} valid samples.")
+        fprint(
+            f"Loaded dataset for {bench} with {len(train_set)} train samples, "
+            f"{len(test_set)} test samples and {len(valid_set)} valid samples."
+        )
         datasets[bench] = dataset
-    return datasets
+    return datasets

omnigenome/utility/ensemble.py CHANGED Viewed

@@ -14,11 +14,11 @@ import numpy as np
 class VoteEnsemblePredictor:
     """
     An ensemble predictor that combines predictions from multiple models using voting.
     This class implements ensemble methods for combining predictions from multiple
     models or checkpoints. It supports both weighted and unweighted voting, and
     provides various aggregation methods for different data types (numeric and string).
     Attributes:
         checkpoints: List of checkpoint names
         predictors: Dictionary of initialized predictors
@@ -27,7 +27,7 @@ class VoteEnsemblePredictor:
         str_agg: Function for aggregating string predictions
         numeric_agg_methods: Dictionary of available numeric aggregation methods
         str_agg_methods: Dictionary of available string aggregation methods
     Example:
         >>> from omnigenome.utility import VoteEnsemblePredictor
         >>> predictors = {
@@ -51,14 +51,14 @@ class VoteEnsemblePredictor:
     ):
         """
         Initialize the VoteEnsemblePredictor.
         Args:
             predictors (List or dict): A list of checkpoints, or a dictionary of initialized predictors
             weights (List or dict, optional): A list of weights for each predictor, or a dictionary of weights for each predictor
             numeric_agg (str, optional): The aggregation method for numeric data. Options are 'average', 'mean', 'max', 'min',
                                         'median', 'mode', and 'sum'. Defaults to 'average'
             str_agg (str, optional): The aggregation method for string data. Options are 'max_vote', 'min_vote', 'vote', and 'mode'. Defaults to 'max_vote'
         Raises:
             AssertionError: If predictors and weights have different lengths or types
             AssertionError: If predictors list is empty
@@ -113,13 +113,13 @@ class VoteEnsemblePredictor:
     def numeric_agg(self, result: list):
         """
         Aggregate a list of numeric values.
         Args:
             result (list): A list of numeric values to aggregate
         Returns:
             The aggregated value using the specified numeric aggregation method
         Example:
             >>> ensemble = VoteEnsemblePredictor(predictors, numeric_agg="average")
             >>> result = ensemble.numeric_agg([0.8, 0.9, 0.7])
@@ -132,13 +132,13 @@ class VoteEnsemblePredictor:
     def __ensemble(self, result: dict):
         """
         Aggregate prediction results by calling the appropriate aggregation method.
         This method determines the type of result and calls the appropriate
         aggregation method (numeric or string).
         Args:
             result (dict): A dictionary containing the prediction results
         Returns:
             The aggregated prediction result
         """
@@ -152,13 +152,13 @@ class VoteEnsemblePredictor:
     def __dict_aggregate(self, result: dict):
         """
         Recursively aggregate a dictionary of prediction results.
         This method recursively processes nested dictionaries and applies
         appropriate aggregation methods to each level.
         Args:
             result (dict): A dictionary containing the prediction results
         Returns:
             dict: The aggregated prediction result
         """
@@ -175,16 +175,16 @@ class VoteEnsemblePredictor:
     def __list_aggregate(self, result: list):
         """
         Aggregate a list of prediction results.
         This method handles different types of list elements and applies
         appropriate aggregation methods based on the data type.
         Args:
             result (list): A list of prediction results to aggregate
         Returns:
             The aggregated result
         Raises:
             AssertionError: If all elements in the list are not of the same type
         """
@@ -227,18 +227,18 @@ class VoteEnsemblePredictor:
     def predict(self, text, ignore_error=False, print_result=False):
         """
         Predicts on a single text and returns the ensemble result.
         This method combines predictions from all predictors in the ensemble
         using the specified weights and aggregation methods.
         Args:
             text (str): The text to perform prediction on
             ignore_error (bool, optional): Whether to ignore any errors that occur during prediction. Defaults to False
             print_result (bool, optional): Whether to print the prediction result. Defaults to False
         Returns:
             dict: The ensemble prediction result
         Example:
             >>> result = ensemble.predict("ACGUAGGUAUCGUAGA", ignore_error=True)
             >>> print(result)
@@ -267,18 +267,18 @@ class VoteEnsemblePredictor:
     def batch_predict(self, texts, ignore_error=False, print_result=False):
         """
         Predicts on a batch of texts using the ensemble of predictors.
         This method processes multiple texts efficiently by combining predictions
         from all predictors in the ensemble for each text in the batch.
         Args:
             texts (list): A list of strings to predict on
             ignore_error (bool, optional): Boolean indicating whether to ignore errors or raise exceptions when prediction fails. Defaults to False
             print_result (bool, optional): Boolean indicating whether to print the raw results for each predictor. Defaults to False
         Returns:
             list: A list of dictionaries, each dictionary containing the aggregated results of the corresponding text in the input list
         Example:
             >>> texts = ["ACGUAGGUAUCGUAGA", "GGCTAGCTA", "TATCGCTA"]
             >>> results = ensemble.batch_predict(texts, ignore_error=True)

omnigenome/utility/hub_utils.py CHANGED Viewed

@@ -24,7 +24,7 @@ from omnigenome.src.misc.utils import fprint, default_omnigenome_repo
 def unzip_checkpoint(checkpoint_path):
     """
     Unzips a checkpoint file.
     This function extracts a zipped checkpoint file to a directory,
     making it ready for use by the model loading functions.
@@ -51,7 +51,7 @@ def query_models_info(
 ) -> Dict[str, Any]:
     """
     Queries information about available models from the hub.
     This function retrieves model information from the OmniGenome hub,
     either from a remote repository or from a local cache. It supports
     filtering by keywords to find specific models.
@@ -69,7 +69,7 @@ def query_models_info(
         >>> # Query all models
         >>> models = query_models_info("")
         >>> print(len(models))  # Number of available models
         >>> # Query specific models
         >>> models = query_models_info("DNA")
         >>> print(models.keys())  # Models containing "DNA"
@@ -108,7 +108,7 @@ def query_pipelines_info(
 ) -> Dict[str, Any]:
     """
     Queries information about available pipelines from the hub.
     This function retrieves pipeline information from the OmniGenome hub,
     either from a remote repository or from a local cache. It supports
     filtering by keywords to find specific pipelines.
@@ -126,7 +126,7 @@ def query_pipelines_info(
         >>> # Query all pipelines
         >>> pipelines = query_pipelines_info("")
         >>> print(len(pipelines))  # Number of available pipelines
         >>> # Query specific pipelines
         >>> pipelines = query_pipelines_info("classification")
         >>> print(pipelines.keys())  # Pipelines containing "classification"
@@ -165,7 +165,7 @@ def query_benchmarks_info(
 ) -> Dict[str, Any]:
     """
     Queries information about available benchmarks from the hub.
     This function retrieves benchmark information from the OmniGenome hub,
     either from a remote repository or from a local cache. It supports
     filtering by keywords to find specific benchmarks.
@@ -183,7 +183,7 @@ def query_benchmarks_info(
         >>> # Query all benchmarks
         >>> benchmarks = query_benchmarks_info("")
         >>> print(len(benchmarks))  # Number of available benchmarks
         >>> # Query specific benchmarks
         >>> benchmarks = query_benchmarks_info("RGB")
         >>> print(benchmarks.keys())  # Benchmarks containing "RGB"
@@ -468,7 +468,7 @@ def download_benchmark(
 def check_version(repo: str = None) -> None:
     """
     Checks the version compatibility between local and remote OmniGenome.
     This function compares the local OmniGenome version with the version
     available in the remote repository to ensure compatibility.

omnigenome/utility/model_hub/__init__.py CHANGED Viewed

@@ -9,4 +9,3 @@
 """
 This package contains modules for the model hub.
 """

omnigenome/utility/model_hub/model_hub.py CHANGED Viewed

@@ -21,33 +21,33 @@ from ...src.misc.utils import env_meta_info, fprint
 class ModelHub:
     """
     A hub for loading and managing pre-trained genomic models.
     This class provides a unified interface for loading pre-trained models
     from the OmniGenome hub or local paths. It handles model downloading,
     tokenizer loading, and device placement automatically.
     The ModelHub supports various model types and can automatically
     download models from the hub if they're not available locally.
     Attributes:
         metadata (dict): Environment metadata information
     Example:
         >>> from omnigenome import ModelHub
         >>> hub = ModelHub()
         >>> # Load a model from the hub
         >>> model, tokenizer = ModelHub.load_model_and_tokenizer("model_name")
         >>> # Check available models
         >>> models = hub.available_models()
         >>> print(list(models.keys()))
     """
     def __init__(self, *args, **kwargs):
         """
         Initialize the ModelHub instance.
         Args:
             *args: Additional positional arguments
             **kwargs: Additional keyword arguments
@@ -66,21 +66,21 @@ class ModelHub:
     ):
         """
         Load a model and its tokenizer from the hub or local path.
         This method loads both the model and tokenizer, places them on the
         specified device, and returns them as a tuple. It handles automatic
         device selection if none is specified.
         Args:
             model_name_or_path (str): Name or path of the model to load
             local_only (bool, optional): Whether to use only local cache. Defaults to False
             device (str, optional): Device to load the model on. If None, uses auto-detection
             dtype (torch.dtype, optional): Data type for the model. Defaults to torch.float16
             **kwargs: Additional keyword arguments passed to the model loading functions
         Returns:
             tuple: A tuple containing (model, tokenizer)
         Example:
             >>> model, tokenizer = ModelHub.load_model_and_tokenizer("yangheng/OmniGenome-186M")
             >>> print(f"Model loaded on device: {next(model.parameters()).device}")
@@ -108,24 +108,24 @@ class ModelHub:
     ):
         """
         Load a model from the hub or local path.
         This method handles model loading from various sources including
         local paths and the OmniGenome hub. It automatically downloads
         models if they're not available locally.
         Args:
             model_name_or_path (str): Name or path of the model to load
             local_only (bool, optional): Whether to use only local cache. Defaults to False
             device (str, optional): Device to load the model on. If None, uses auto-detection
             dtype (torch.dtype, optional): Data type for the model. Defaults to torch.float16
             **kwargs: Additional keyword arguments passed to the model loading functions
         Returns:
             torch.nn.Module: The loaded model
         Raises:
             ValueError: If model_name_or_path is not a string
         Example:
             >>> model = ModelHub.load("yangheng/OmniGenome-186M")
             >>> print(f"Model type: {type(model)}")
@@ -152,6 +152,7 @@ class ModelHub:
             tokenizer = tokenizer_cls.from_pretrained(path, **kwargs)
         else:
             from multimolecule import RnaTokenizer
             tokenizer = RnaTokenizer.from_pretrained(path, **kwargs)
         config.metadata = metadata
@@ -187,25 +188,25 @@ class ModelHub:
     ):
         """
         Get information about available models in the hub.
         This method queries the OmniGenome hub to retrieve information about
         available models. It can filter models by name and supports both
         local and remote queries.
         Args:
             model_name_or_path (str, optional): Filter models by name. Defaults to None
             local_only (bool, optional): Whether to use only local cache. Defaults to False
             repo (str, optional): Repository URL to query. Defaults to ""
             **kwargs: Additional keyword arguments
         Returns:
             dict: Dictionary containing information about available models
         Example:
             >>> hub = ModelHub()
             >>> models = hub.available_models()
             >>> print(f"Available models: {len(models)}")
             >>> # Filter models by name
             >>> dna_models = hub.available_models("DNA")
             >>> print(f"DNA models: {list(dna_models.keys())}")
@@ -218,13 +219,13 @@ class ModelHub:
     def push(self, model, **kwargs):
         """
         Push a model to the hub.
         This method is not yet implemented and will raise a NotImplementedError.
         Args:
             model: The model to push to the hub
             **kwargs: Additional keyword arguments
         Raises:
             NotImplementedError: This method has not been implemented yet
         """

omnigenome/utility/pipeline_hub/__init__.py CHANGED Viewed

@@ -9,4 +9,3 @@
 """
 This package contains modules for the pipeline hub.
 """

omnigenome 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl

omnigenome 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl