omnigenome 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- omnigenome/__init__.py +16 -8
- omnigenome/auto/auto_bench/__init__.py +0 -1
- omnigenome/auto/auto_bench/auto_bench.py +24 -14
- omnigenome/auto/auto_train/__init__.py +0 -1
- omnigenome/auto/auto_train/auto_train.py +11 -12
- omnigenome/auto/bench_hub/__init__.py +0 -1
- omnigenome/auto/bench_hub/bench_hub.py +1 -1
- omnigenome/cli/__init__.py +0 -1
- omnigenome/cli/commands/__init__.py +0 -1
- omnigenome/cli/commands/base.py +10 -10
- omnigenome/cli/commands/bench/__init__.py +0 -1
- omnigenome/cli/commands/bench/bench_cli.py +10 -10
- omnigenome/cli/commands/rna/__init__.py +0 -1
- omnigenome/cli/commands/rna/rna_design.py +10 -11
- omnigenome/src/__init__.py +0 -1
- omnigenome/src/abc/__init__.py +0 -1
- omnigenome/src/abc/abstract_dataset.py +38 -19
- omnigenome/src/abc/abstract_metric.py +7 -7
- omnigenome/src/abc/abstract_model.py +15 -14
- omnigenome/src/abc/abstract_tokenizer.py +9 -7
- omnigenome/src/dataset/omni_dataset.py +16 -14
- omnigenome/src/lora/__init__.py +0 -1
- omnigenome/src/lora/lora_model.py +47 -41
- omnigenome/src/metric/classification_metric.py +11 -11
- omnigenome/src/metric/metric.py +19 -19
- omnigenome/src/metric/ranking_metric.py +15 -15
- omnigenome/src/metric/regression_metric.py +18 -18
- omnigenome/src/misc/utils.py +40 -36
- omnigenome/src/model/augmentation/__init__.py +0 -1
- omnigenome/src/model/augmentation/model.py +17 -17
- omnigenome/src/model/classification/__init__.py +0 -1
- omnigenome/src/model/classification/model.py +28 -32
- omnigenome/src/model/embedding/__init__.py +0 -1
- omnigenome/src/model/embedding/model.py +35 -35
- omnigenome/src/model/mlm/__init__.py +0 -1
- omnigenome/src/model/mlm/model.py +13 -13
- omnigenome/src/model/module_utils.py +17 -17
- omnigenome/src/model/regression/__init__.py +0 -1
- omnigenome/src/model/regression/model.py +72 -77
- omnigenome/src/model/regression/resnet.py +32 -32
- omnigenome/src/model/rna_design/__init__.py +0 -1
- omnigenome/src/model/rna_design/model.py +65 -58
- omnigenome/src/model/seq2seq/__init__.py +0 -1
- omnigenome/src/model/seq2seq/model.py +4 -4
- omnigenome/src/tokenizer/bpe_tokenizer.py +27 -27
- omnigenome/src/tokenizer/kmers_tokenizer.py +22 -22
- omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +11 -11
- omnigenome/src/trainer/accelerate_trainer.py +40 -32
- omnigenome/src/trainer/hf_trainer.py +8 -8
- omnigenome/src/trainer/trainer.py +37 -25
- omnigenome/utility/dataset_hub/__init__.py +0 -1
- omnigenome/utility/dataset_hub/dataset_hub.py +13 -13
- omnigenome/utility/ensemble.py +26 -26
- omnigenome/utility/hub_utils.py +8 -8
- omnigenome/utility/model_hub/__init__.py +0 -1
- omnigenome/utility/model_hub/model_hub.py +26 -25
- omnigenome/utility/pipeline_hub/__init__.py +0 -1
- omnigenome/utility/pipeline_hub/pipeline.py +49 -49
- omnigenome/utility/pipeline_hub/pipeline_hub.py +17 -17
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/METADATA +2 -2
- omnigenome-0.3.1a0.dist-info/RECORD +78 -0
- omnigenome-0.3.0a1.dist-info/RECORD +0 -78
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/WHEEL +0 -0
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/entry_points.txt +0 -0
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/licenses/LICENSE +0 -0
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/top_level.txt +0 -0
omnigenome/src/tokenizer/bpe_tokenizer.py (+27 -27)

@@ -17,17 +17,17 @@ warnings.filterwarnings("once")
  Context: the docstring of is_bpe_tokenization(tokens, threshold=0.1), which checks whether a tokenization is BPE-based by analyzing token length distributions and special-token patterns (Args: tokens (list), threshold (float, optional, defaults to 0.1); Returns: bool; Example: tokens = ["▁hello", "▁world", "▁how", "▁are", "▁you"]; is_bpe = is_bpe_tokenization(tokens)).
  Change: the four blank lines inside this docstring lose their trailing spaces (whitespace-only).
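The hunk above only touches whitespace, but the docstring it sits in describes a concrete heuristic: deciding from token length distributions and marker patterns whether a tokenizer is BPE-based. A minimal sketch of such a heuristic is shown below; the function name looks_like_bpe, the marker test, and the threshold semantics are illustrative assumptions, not the logic of omnigenome's is_bpe_tokenization.

```python
# Illustrative sketch only -- not the omnigenome implementation.
# Heuristic: BPE/SentencePiece vocabularies typically mark segment starts with "▁"
# (or "##" continuations for WordPiece) and produce tokens of varying length.

def looks_like_bpe(tokens, threshold=0.1):
    """Return True if the share of BPE-style marker tokens exceeds `threshold`."""
    if not tokens:
        return False
    marked = sum(1 for t in tokens if t.startswith("▁") or t.startswith("##"))
    return marked / len(tokens) >= threshold


if __name__ == "__main__":
    print(looks_like_bpe(["▁hello", "▁world", "▁how", "▁are", "▁you"]))  # True
    print(looks_like_bpe(["A", "C", "G", "U"]))                          # False
```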
@@ -52,15 +52,15 @@ def is_bpe_tokenization(tokens, threshold=0.1):
@@ -75,7 +75,7 @@ class OmniBPETokenizer(OmniTokenizer):
@@ -86,21 +86,21 @@ class OmniBPETokenizer(OmniTokenizer):
@@ -136,14 +136,14 @@ class OmniBPETokenizer(OmniTokenizer):
@@ -159,14 +159,14 @@ class OmniBPETokenizer(OmniTokenizer):
@@ -178,17 +178,17 @@ class OmniBPETokenizer(OmniTokenizer):
@@ -203,17 +203,17 @@ class OmniBPETokenizer(OmniTokenizer):
  Context: the docstrings of the OmniBPETokenizer class and of its __init__, __call__, from_pretrained, tokenize, encode, and decode methods. They describe a BPE tokenizer for genomic sequences that wraps a base BPE tokenizer, validates that the base tokenizer is BPE-based, handles preprocessing (U/T conversion, whitespace addition), and returns dicts with 'input_ids' and 'attention_mask'; the examples include OmniBPETokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D") and tokenizer("ACGUAGGUAUCGUAGA").
  Change: every modified line in these hunks is a blank docstring line whose trailing spaces are removed (whitespace-only; together with the hunk above this accounts for all +27/-27 lines in the file).
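Pieced together from the docstring examples that appear as context in these hunks, typical OmniBPETokenizer usage looks roughly like the following; treat it as a sketch, since only docstring fragments are visible in the diff and the exact signatures in 0.3.1a0 are not re-verified here.

```python
# Usage sketch assembled from docstring fragments shown in this diff;
# not verified against the installed package.
from omnigenome.src.tokenizer import OmniBPETokenizer

tokenizer = OmniBPETokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D")

sequence = "ACGUAGGUAUCGUAGA"
tokenized = tokenizer(sequence, max_length=512)  # dict with 'input_ids' and 'attention_mask'
token_ids = tokenizer.encode(sequence)           # list of token IDs
decoded = tokenizer.decode(token_ids)            # back to a sequence string
print(list(tokenized.keys()), decoded)
```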
omnigenome/src/tokenizer/kmers_tokenizer.py (+22 -22)

@@ -16,18 +16,18 @@ warnings.filterwarnings("once")
@@ -42,7 +42,7 @@ class OmniKmersTokenizer(OmniTokenizer):
@@ -59,18 +59,18 @@ class OmniKmersTokenizer(OmniTokenizer):
@@ -126,14 +126,14 @@ class OmniKmersTokenizer(OmniTokenizer):
@@ -149,17 +149,17 @@ class OmniKmersTokenizer(OmniTokenizer):
@@ -184,11 +184,11 @@ class OmniKmersTokenizer(OmniTokenizer):
@@ -197,11 +197,11 @@ class OmniKmersTokenizer(OmniTokenizer):
@@ -210,13 +210,13 @@ class OmniKmersTokenizer(OmniTokenizer):
  Context: the docstrings of the OmniKmersTokenizer class and of __init__(base_tokenizer=None, k=3, overlap=0, max_length=512, **kwargs), __call__, from_pretrained, tokenize, encode, decode, and encode_plus (the last still raises NotImplementedError). They describe breaking genomic sequences into overlapping k-mers, converting the k-mers to token IDs with a base tokenizer, handling U/T conversion, adding special tokens, and returning dicts with 'input_ids' and 'attention_mask'.
  Change: all 22 modified lines are blank docstring lines whose trailing spaces are removed (whitespace-only).
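The OmniKmersTokenizer docstrings describe splitting a sequence into k-mers of size k with a configurable overlap before the base tokenizer maps them to IDs. Below is a self-contained sketch of that splitting step, assuming a stride of k - overlap; the exact stride convention used by omnigenome is not visible in this diff.

```python
# Illustrative sketch of overlapping k-mer splitting; the stride convention
# (k - overlap) is an assumption, not taken from the omnigenome source.
def to_kmers(sequence: str, k: int = 3, overlap: int = 0) -> list[str]:
    stride = max(1, k - overlap)
    return [sequence[i:i + k] for i in range(0, len(sequence) - k + 1, stride)]


print(to_kmers("ACGUAGGU", k=3, overlap=0))  # ['ACG', 'UAG']
print(to_kmers("ACGUAGGU", k=3, overlap=1))  # ['ACG', 'GUA', 'AGG']
```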
omnigenome/src/tokenizer/single_nucleotide_tokenizer.py (+11 -11)

@@ -19,16 +19,16 @@ warnings.filterwarnings("once")
@@ -54,7 +54,7 @@ class OmniSingleNucleotideTokenizer(OmniTokenizer):
@@ -76,7 +76,7 @@ class OmniSingleNucleotideTokenizer(OmniTokenizer):
@@ -134,7 +134,7 @@ class OmniSingleNucleotideTokenizer(OmniTokenizer):
@@ -156,7 +156,7 @@ class OmniSingleNucleotideTokenizer(OmniTokenizer):
@@ -172,7 +172,7 @@ class OmniSingleNucleotideTokenizer(OmniTokenizer):
@@ -191,7 +191,7 @@ class OmniSingleNucleotideTokenizer(OmniTokenizer):
@@ -211,7 +211,7 @@ class OmniSingleNucleotideTokenizer(OmniTokenizer):
@@ -231,7 +231,7 @@ class OmniSingleNucleotideTokenizer(OmniTokenizer):
  Context: the docstrings of the OmniSingleNucleotideTokenizer class and of __call__, from_pretrained, tokenize, encode, decode, and encode_plus. They describe tokenizing each nucleotide (A, T, C, G, U) as a separate token, with optional U/T conversion (u2t, t2u), whitespace insertion, and BOS/EOS handling; the examples show tokenizer("ATCGATCG") returning input_ids of shape [1, seq_len] and tokenize("ATCGATCG") returning [['A', 'T', 'C', 'G', 'A', 'T', 'C', 'G']].
  Change: all 11 modified lines are blank docstring lines whose trailing spaces are removed (whitespace-only).
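For reference, the usage shown in the OmniSingleNucleotideTokenizer docstrings follows this shape; the import path is assumed by analogy with the other tokenizers, the checkpoint path is a placeholder, and the printed shapes are copied from the docstring comments rather than re-run.

```python
# Usage sketch assembled from docstring fragments; "path/to/pretrained_model" is a
# placeholder and the shape comments are taken from the docstrings as-is.
from omnigenome.src.tokenizer import OmniSingleNucleotideTokenizer  # import path assumed

tokenizer = OmniSingleNucleotideTokenizer.from_pretrained("path/to/pretrained_model")

inputs = tokenizer("ATCGATCG")                # single sequence
print(inputs["input_ids"].shape)              # torch.Size([1, seq_len])

inputs = tokenizer(["ATCGATCG", "GCTAGCTA"])  # batch of sequences
print(inputs["input_ids"].shape)              # torch.Size([2, seq_len])

tokens = tokenizer.tokenize("ATCGATCG")
print(tokens)                                 # [['A', 'T', 'C', 'G', 'A', 'T', 'C', 'G']]
```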
omnigenome/src/trainer/accelerate_trainer.py (+40 -32)

@@ -21,15 +21,15 @@ from ..misc.utils import env_meta_info, fprint, seed_everything
@@ -91,11 +91,11 @@ def _infer_optimization_direction(metrics, prev_metrics):
@@ -110,7 +110,7 @@ class AccelerateTrainer:
@@ -143,7 +143,7 @@ class AccelerateTrainer:
@@ -293,14 +293,14 @@ class AccelerateTrainer:
@@ -364,14 +364,14 @@ class AccelerateTrainer:
@@ -431,18 +431,18 @@ class AccelerateTrainer:
  Context: the docstrings of _infer_optimization_direction(metrics, prev_metrics), which looks at the trend of metric values to decide whether larger values are better (e.g. accuracy) or smaller values are better (e.g. loss) and returns 'larger_is_better' or 'smaller_is_better', and of the AccelerateTrainer class and its __init__, evaluate, test, and train methods. AccelerateTrainer provides distributed training via HuggingFace Accelerate with automatic mixed precision, gradient accumulation, early stopping, checkpointing, and gathering of evaluation and test results across processes.
  Change: all modified lines in these hunks are blank lines in and around these docstrings whose trailing spaces are removed (whitespace-only).
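The _infer_optimization_direction docstring above says the direction is inferred from the trend of metric values. A toy illustration of trend-based inference is given below; the actual rule in omnigenome, and its handling of the (metrics, prev_metrics) arguments, may differ, so this is an assumption for illustration only.

```python
# Illustrative sketch of trend-based direction inference; not the omnigenome rule.
def infer_optimization_direction(prev_metric_values):
    """Guess whether the tracked metric improves upward or downward."""
    if len(prev_metric_values) < 2:
        return "larger_is_better"  # default when there is no trend yet
    diffs = [b - a for a, b in zip(prev_metric_values, prev_metric_values[1:])]
    rising = sum(d > 0 for d in diffs)
    falling = sum(d < 0 for d in diffs)
    return "larger_is_better" if rising >= falling else "smaller_is_better"


print(infer_optimization_direction([0.61, 0.68, 0.72]))  # larger_is_better
print(infer_optimization_direction([1.90, 1.42, 1.10]))  # smaller_is_better
```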
@@ -489,12 +489,20 @@ class AccelerateTrainer:
  This is the only functional change visible in the diff: when a model's outputs contain no "loss", the fallback now also checks a LoRA-wrapped inner model. The old six-line fallback (lines 492-497, which began `if hasattr(self.model, "loss_function") and callable(`; its continuation lines are not legible in this rendering) is replaced by:

     if "loss" not in outputs:
         # Generally, the model should return a loss in the outputs via OmniGenBench
         # For the Lora models, the loss is computed separately
+        if hasattr(self.model, "loss_function") and callable(
+            self.model.loss_function
+        ):
+            loss = self.model.loss_function(
+                outputs["logits"], outputs["labels"]
+            )
+        elif (
+            hasattr(self.model, "model")
+            and hasattr(self.model.model, "loss_function")
+            and callable(self.model.model.loss_function)
+        ):
+            loss = self.model.model.loss_function(
+                outputs["logits"], outputs["labels"]
+            )
         else:
             raise ValueError(
                 "The model does not have a loss function defined. "

@@ -585,11 +593,11 @@ class AccelerateTrainer:
@@ -643,10 +651,10 @@ class AccelerateTrainer:
@@ -655,10 +663,10 @@ class AccelerateTrainer:
@@ -667,10 +675,10 @@ class AccelerateTrainer:
@@ -682,7 +690,7 @@ class AccelerateTrainer:
  Context: the docstrings of _is_metric_better(metrics, stage="valid"), predict(data_loader), get_model, compute_metrics (raises NotImplementedError when a subclass does not implement it), and save_model(path, overwrite=False, **kwargs).
  Change: all modified lines in these hunks are blank docstring lines whose trailing spaces are removed (whitespace-only).
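The new fallback relies on duck typing: a model (or the inner model of a LoRA wrapper) that returns logits and labels but no loss must expose loss_function(logits, labels). Below is a minimal, self-contained example of a model that satisfies this contract; the class name and dimensions are made up for illustration and are not part of omnigenome.

```python
# Minimal sketch of the duck typing the trainer's fallback relies on: a model whose
# forward() does not return "loss" but which exposes loss_function(logits, labels).
import torch


class ToyClassifier(torch.nn.Module):
    def __init__(self, hidden=8, num_labels=2):
        super().__init__()
        self.head = torch.nn.Linear(hidden, num_labels)

    def forward(self, hidden_states, labels=None):
        logits = self.head(hidden_states)
        # No "loss" key returned: a trainer with this fallback would call loss_function().
        return {"logits": logits, "labels": labels}

    def loss_function(self, logits, labels):
        return torch.nn.functional.cross_entropy(logits, labels)


model = ToyClassifier()
outputs = model(torch.randn(4, 8), labels=torch.tensor([0, 1, 1, 0]))
loss = model.loss_function(outputs["logits"], outputs["labels"])
print(loss.item())
```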
omnigenome/src/trainer/hf_trainer.py (+8 -8)

@@ -24,19 +24,19 @@ from ... import __version__ as omnigenome_version
@@ -51,19 +51,19 @@ class HFTrainingArguments(TrainingArguments):
  Context: the docstrings of HFTrainer(Trainer) and HFTrainingArguments(TrainingArguments) and of their __init__(*args, **kwargs) methods. Both classes wrap their HuggingFace counterparts, attach a metadata dictionary with OmniGenome library information, and otherwise stay fully compatible with the HuggingFace training ecosystem.
  Change: all 8 modified lines are blank lines in and around these docstrings whose trailing spaces are removed (whitespace-only).