PyPI - omnigenome - Versions diffs - 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl - Mend

omnigenome 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66)

omnigenome/__init__.py +16 -8
omnigenome/auto/auto_bench/__init__.py +0 -1
omnigenome/auto/auto_bench/auto_bench.py +24 -14
omnigenome/auto/auto_train/__init__.py +0 -1
omnigenome/auto/auto_train/auto_train.py +11 -12
omnigenome/auto/bench_hub/__init__.py +0 -1
omnigenome/auto/bench_hub/bench_hub.py +1 -1
omnigenome/cli/__init__.py +0 -1
omnigenome/cli/commands/__init__.py +0 -1
omnigenome/cli/commands/base.py +10 -10
omnigenome/cli/commands/bench/__init__.py +0 -1
omnigenome/cli/commands/bench/bench_cli.py +10 -10
omnigenome/cli/commands/rna/__init__.py +0 -1
omnigenome/cli/commands/rna/rna_design.py +10 -11
omnigenome/src/__init__.py +0 -1
omnigenome/src/abc/__init__.py +0 -1
omnigenome/src/abc/abstract_dataset.py +38 -19
omnigenome/src/abc/abstract_metric.py +7 -7
omnigenome/src/abc/abstract_model.py +15 -14
omnigenome/src/abc/abstract_tokenizer.py +9 -7
omnigenome/src/dataset/omni_dataset.py +16 -14
omnigenome/src/lora/__init__.py +0 -1
omnigenome/src/lora/lora_model.py +47 -41
omnigenome/src/metric/classification_metric.py +11 -11
omnigenome/src/metric/metric.py +19 -19
omnigenome/src/metric/ranking_metric.py +15 -15
omnigenome/src/metric/regression_metric.py +18 -18
omnigenome/src/misc/utils.py +40 -36
omnigenome/src/model/augmentation/__init__.py +0 -1
omnigenome/src/model/augmentation/model.py +17 -17
omnigenome/src/model/classification/__init__.py +0 -1
omnigenome/src/model/classification/model.py +28 -32
omnigenome/src/model/embedding/__init__.py +0 -1
omnigenome/src/model/embedding/model.py +35 -35
omnigenome/src/model/mlm/__init__.py +0 -1
omnigenome/src/model/mlm/model.py +13 -13
omnigenome/src/model/module_utils.py +17 -17
omnigenome/src/model/regression/__init__.py +0 -1
omnigenome/src/model/regression/model.py +72 -77
omnigenome/src/model/regression/resnet.py +32 -32
omnigenome/src/model/rna_design/__init__.py +0 -1
omnigenome/src/model/rna_design/model.py +65 -58
omnigenome/src/model/seq2seq/__init__.py +0 -1
omnigenome/src/model/seq2seq/model.py +4 -4
omnigenome/src/tokenizer/bpe_tokenizer.py +27 -27
omnigenome/src/tokenizer/kmers_tokenizer.py +22 -22
omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +11 -11
omnigenome/src/trainer/accelerate_trainer.py +40 -32
omnigenome/src/trainer/hf_trainer.py +8 -8
omnigenome/src/trainer/trainer.py +37 -25
omnigenome/utility/dataset_hub/__init__.py +0 -1
omnigenome/utility/dataset_hub/dataset_hub.py +13 -13
omnigenome/utility/ensemble.py +26 -26
omnigenome/utility/hub_utils.py +8 -8
omnigenome/utility/model_hub/__init__.py +0 -1
omnigenome/utility/model_hub/model_hub.py +26 -25
omnigenome/utility/pipeline_hub/__init__.py +0 -1
omnigenome/utility/pipeline_hub/pipeline.py +49 -49
omnigenome/utility/pipeline_hub/pipeline_hub.py +17 -17
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/METADATA +2 -2
omnigenome-0.3.1a0.dist-info/RECORD +78 -0
omnigenome-0.3.0a1.dist-info/RECORD +0 -78
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/WHEEL +0 -0
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/entry_points.txt +0 -0
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/licenses/LICENSE +0 -0
{omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/top_level.txt +0 -0

omnigenome/src/model/regression/resnet.py CHANGED Viewed

@@ -23,14 +23,14 @@ from typing import Type, Callable, Union, List, Optional
 def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
     """
     3x3 convolution with padding.
     Args:
         in_planes (int): Number of input channels
         out_planes (int): Number of output channels
         stride (int): Stride for the convolution (default: 1)
         groups (int): Number of groups for grouped convolution (default: 1)
         dilation (int): Dilation factor for the convolution (default: 1)
     Returns:
         nn.Conv2d: 3x3 convolution layer
     """
@@ -49,12 +49,12 @@ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
 def conv1x1(in_planes, out_planes, stride=1):
     """
     1x1 convolution.
     Args:
         in_planes (int): Number of input channels
         out_planes (int): Number of output channels
         stride (int): Stride for the convolution (default: 1)
     Returns:
         nn.Conv2d: 1x1 convolution layer
     """
@@ -64,14 +64,14 @@ def conv1x1(in_planes, out_planes, stride=1):
 def conv5x5(in_planes, out_planes, stride=1, groups=1, dilation=1):
     """
     5x5 convolution with padding.
     Args:
         in_planes (int): Number of input channels
         out_planes (int): Number of output channels
         stride (int): Stride for the convolution (default: 1)
         groups (int): Number of groups for grouped convolution (default: 1)
         dilation (int): Dilation factor for the convolution (default: 1)
     Returns:
         nn.Conv2d: 5x5 convolution layer
     """
@@ -90,10 +90,10 @@ def conv5x5(in_planes, out_planes, stride=1, groups=1, dilation=1):
 class BasicBlock(nn.Module):
     """
     Basic ResNet block for genomic sequence processing.
     This block implements a basic residual connection with two convolutions
     and is optimized for processing genomic sequence data with layer normalization.
     Attributes:
         expansion (int): Expansion factor for the block (default: 1)
         conv1: First 3x3 convolution layer
@@ -105,7 +105,7 @@ class BasicBlock(nn.Module):
         downsample: Downsampling layer for residual connection
         stride: Stride for the convolutions
     """
     expansion: int = 1
     def __init__(
@@ -121,7 +121,7 @@ class BasicBlock(nn.Module):
     ) -> None:
         """
         Initialize the BasicBlock.
         Args:
             inplanes (int): Number of input channels
             planes (int): Number of output channels
@@ -130,7 +130,7 @@ class BasicBlock(nn.Module):
             groups (int): Number of groups for grouped convolution (default: 1)
             dilation (int): Dilation factor for convolutions (default: 1)
             norm_layer: Normalization layer type (default: None, uses LayerNorm)
         Raises:
             NotImplementedError: If dilation > 1 is specified
         """
@@ -154,10 +154,10 @@ class BasicBlock(nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         """
         Forward pass through the BasicBlock.
         Args:
             x (Tensor): Input tensor [batch_size, channels, height, width]
         Returns:
             Tensor: Output tensor with same shape as input
         """
@@ -188,11 +188,11 @@ class BasicBlock(nn.Module):
 class Bottleneck(nn.Module):
     """
     Bottleneck ResNet block for genomic sequence processing.
     This block implements a bottleneck residual connection with three convolutions
     (1x1, 3x3, 1x1) and is designed for deeper networks. It's adapted from
     the original ResNet V1.5 implementation.
     Attributes:
         expansion (int): Expansion factor for the block (default: 4)
         conv1: First 1x1 convolution layer
@@ -205,7 +205,7 @@ class Bottleneck(nn.Module):
         downsample: Downsampling layer for residual connection
         stride: Stride for the convolutions
     """
     # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
     # while original implementation places the stride at the first 1x1 convolution(self.conv1)
     # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
@@ -227,7 +227,7 @@ class Bottleneck(nn.Module):
     ) -> None:
         """
         Initialize the Bottleneck block.
         Args:
             inplanes (int): Number of input channels
             planes (int): Number of output channels
@@ -256,10 +256,10 @@ class Bottleneck(nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         """
         Forward pass through the Bottleneck block.
         Args:
             x (Tensor): Input tensor [batch_size, channels, height, width]
         Returns:
             Tensor: Output tensor with same shape as input
         """
@@ -288,11 +288,11 @@ class Bottleneck(nn.Module):
 class ResNet(nn.Module):
     """
     ResNet architecture adapted for genomic sequence analysis.
     This ResNet implementation is specifically designed for processing genomic
     sequences and their structural representations. It uses layer normalization
     instead of batch normalization and is optimized for genomic data characteristics.
     Attributes:
         _norm_layer: Normalization layer type
         inplanes: Number of input channels for the first layer
@@ -319,7 +319,7 @@ class ResNet(nn.Module):
     ) -> None:
         """
         Initialize the ResNet architecture.
         Args:
             channels (int): Number of input channels
             block: Type of ResNet block (BasicBlock or Bottleneck)
@@ -329,7 +329,7 @@ class ResNet(nn.Module):
             width_per_group (int): Width per group for bottleneck blocks (default: 1)
             replace_stride_with_dilation: Whether to replace stride with dilation (default: None)
             norm_layer: Normalization layer type (default: None, uses LayerNorm)
         Raises:
             ValueError: If replace_stride_with_dilation is not None or a 3-element tuple
         """
@@ -379,14 +379,14 @@ class ResNet(nn.Module):
     ) -> nn.Sequential:
         """
         Create a layer of ResNet blocks.
         Args:
             block: Type of ResNet block to use
             planes (int): Number of output channels for the layer
             blocks (int): Number of blocks in the layer
             stride (int): Stride for the first block (default: 1)
             dilate (bool): Whether to use dilation (default: False)
         Returns:
             nn.Sequential: Sequential container of ResNet blocks
         """
@@ -433,10 +433,10 @@ class ResNet(nn.Module):
     def _forward_impl(self, x: Tensor) -> Tensor:
         """
         Forward pass implementation.
         Args:
             x (Tensor): Input tensor [batch_size, channels, height, width]
         Returns:
             Tensor: Output tensor after processing through ResNet
         """
@@ -456,10 +456,10 @@ class ResNet(nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         """
         Forward pass through the ResNet.
         Args:
             x (Tensor): Input tensor [batch_size, channels, height, width]
         Returns:
             Tensor: Output tensor after processing through ResNet
         """
@@ -469,14 +469,14 @@ class ResNet(nn.Module):
 def resnet_b16(channels=128, bbn=16):
     """
     Create a ResNet-B16 model for genomic sequence analysis.
     This function creates a ResNet model with 16 basic blocks, optimized
     for processing genomic sequences and their structural representations.
     Args:
         channels (int): Number of input channels (default: 128)
         bbn (int): Number of basic blocks (default: 16)
     Returns:
         ResNet: Configured ResNet model
     """

omnigenome/src/model/rna_design/__init__.py CHANGED Viewed

@@ -9,4 +9,3 @@
 """
 This package contains modules for RNA design models.
 """

omnigenome/src/model/rna_design/model.py CHANGED Viewed

@@ -30,19 +30,19 @@ from omnigenome.src.misc.utils import fprint
 class OmniModelForRNADesign(torch.nn.Module):
     """
     RNA design model using masked language modeling and evolutionary algorithms.
     This model combines a pre-trained masked language model with evolutionary
     algorithms to design RNA sequences that fold into specific target structures.
     It uses a multi-objective optimization approach to balance structure similarity
     and thermodynamic stability.
     Attributes:
         device: Device to run the model on (CPU or GPU)
         parallel: Whether to use parallel processing for structure prediction
         tokenizer: Tokenizer for processing RNA sequences
         model: Pre-trained masked language model
     """
     def __init__(
         self,
         model="yangheng/OmniGenome-186M",
@@ -53,7 +53,7 @@ class OmniModelForRNADesign(torch.nn.Module):
     ):
         """
         Initialize the RNA design model.
         Args:
             model (str): Model name or path for the pre-trained MLM model
             device: Device to run the model on (default: None, auto-detect)
@@ -72,10 +72,10 @@ class OmniModelForRNADesign(torch.nn.Module):
     def _random_bp_span(bp_span=None):
         """
         Generate a random base pair span.
         Args:
             bp_span (int, optional): Fixed base pair span. If None, generates random.
         Returns:
             int: Base pair span value
         """
@@ -87,16 +87,16 @@ class OmniModelForRNADesign(torch.nn.Module):
     def _longest_bp_span(structure):
         """
         Find the longest base pair span in the structure.
         Args:
             structure (str): RNA structure in dot-bracket notation
         Returns:
             int: Length of the longest base pair span
         """
         max_span = 0
         current_span = 0
         for char in structure:
             if char == "(":
                 current_span += 1
@@ -105,18 +105,18 @@ class OmniModelForRNADesign(torch.nn.Module):
                 current_span = max(0, current_span - 1)
             else:
                 current_span = 0
         return max_span
     @staticmethod
     def _predict_structure_single(sequence, bp_span=-1):
         """
         Predict structure for a single sequence (worker function for multiprocessing).
         Args:
             sequence (str): RNA sequence to fold
             bp_span (int): Base pair span parameter
         Returns:
             tuple: (structure, mfe) tuple
         """
@@ -129,30 +129,30 @@ class OmniModelForRNADesign(torch.nn.Module):
     def _predict_structure(self, sequences, bp_span=-1):
         """
         Predict structures for multiple sequences.
         Args:
             sequences (list): List of RNA sequences
             bp_span (int): Base pair span parameter
         Returns:
             list: List of (structure, mfe) tuples
         """
         if not self.parallel or len(sequences) <= 1:
             # Sequential processing
             return [self._predict_structure_single(seq, bp_span) for seq in sequences]
         # Parallel processing with improved error handling
         try:
             # Determine number of workers
             max_workers = min(os.cpu_count(), len(sequences), 8)  # Limit to 8 workers
             with ProcessPoolExecutor(max_workers=max_workers) as executor:
                 # Submit all tasks
                 future_to_seq = {
-                    executor.submit(self._predict_structure_single, seq, bp_span): seq
+                    executor.submit(self._predict_structure_single, seq, bp_span): seq
                     for seq in sequences
                 }
                 # Collect results
                 results = []
                 for future in as_completed(future_to_seq):
@@ -164,112 +164,119 @@ class OmniModelForRNADesign(torch.nn.Module):
                         warnings.warn(f"Failed to process sequence {seq}: {e}")
                         # Fallback to dot structure
                         results.append(("." * len(seq), 0.0))
                 return results
         except Exception as e:
-            warnings.warn(f"Parallel processing failed, falling back to sequential: {e}")
+            warnings.warn(
+                f"Parallel processing failed, falling back to sequential: {e}"
+            )
             # Fallback to sequential processing
             return [self._predict_structure_single(seq, bp_span) for seq in sequences]
     def _init_population(self, structure, num_population):
         """
         Initialize the population with random sequences.
         Args:
             structure (str): Target RNA structure
             num_population (int): Population size
         Returns:
             list: List of (sequence, bp_span) tuples
         """
         population = []
         bp_span = self._longest_bp_span(structure)
         for _ in range(num_population):
             # Generate random sequence
             sequence = "".join(random.choice("ACGU") for _ in range(len(structure)))
             population.append((sequence, bp_span))
         return population
     def _mlm_mutate(self, population, structure, mutation_ratio):
         """
         Mutate population using masked language modeling.
         Args:
             population (list): Current population
             structure (str): Target RNA structure
             mutation_ratio (float): Ratio of tokens to mutate
         Returns:
             list: Mutated population
         """
         def mutate(sequence, mutation_rate):
             # Create masked sequence
             masked_sequence = list(sequence)
             num_mutations = int(len(sequence) * mutation_rate)
             mutation_positions = random.sample(range(len(sequence)), num_mutations)
             for pos in mutation_positions:
                 masked_sequence[pos] = self.tokenizer.mask_token
             return "".join(masked_sequence)
         # Prepare inputs for MLM
         mlm_inputs = []
         for sequence, bp_span in population:
             masked_seq = mutate(sequence, mutation_ratio)
             mlm_inputs.append(masked_seq)
         # Get predictions from MLM
         predicted_tokens = self._mlm_predict(mlm_inputs, structure)
         # Convert predictions back to sequences
         mutated_population = []
         for i, (sequence, bp_span) in enumerate(population):
             # Convert token IDs back to nucleotides
-            new_sequence = self.tokenizer.decode(predicted_tokens[i], skip_special_tokens=True)
+            new_sequence = self.tokenizer.decode(
+                predicted_tokens[i], skip_special_tokens=True
+            )
             # Ensure the sequence has the correct length
             if len(new_sequence) != len(structure):
-                new_sequence = new_sequence[:len(structure)].ljust(len(structure), "A")
+                new_sequence = new_sequence[: len(structure)].ljust(len(structure), "A")
             mutated_population.append((new_sequence, bp_span))
         return mutated_population
     def _crossover(self, population, num_points=3):
         """
         Perform crossover operation on the population.
         Args:
             population (list): Current population
             num_points (int): Number of crossover points
         Returns:
             list: Population after crossover
         """
         if len(population) < 2:
             return population
         # Create crossover masks
         num_sequences = len(population)
         masks = np.zeros((num_sequences, len(population[0][0])), dtype=bool)
         # Generate random crossover points
-        crossover_points = np.random.randint(0, len(population[0][0]), (num_sequences, num_points))
+        crossover_points = np.random.randint(
+            0, len(population[0][0]), (num_sequences, num_points)
+        )
         # Create parent indices
         parent_indices = np.random.randint(0, num_sequences, (num_sequences, 2))
         # Generate crossover masks
         for i in range(num_sequences):
             for j in range(num_points):
                 if j == 0:
-                    masks[i, :crossover_points[i, j]] = True
+                    masks[i, : crossover_points[i, j]] = True
                 else:
-                    last_point = crossover_points[i, j-1]
-                    masks[i, last_point:crossover_points[i, j]] = j % 2 == 0
+                    last_point = crossover_points[i, j - 1]
+                    masks[i, last_point : crossover_points[i, j]] = j % 2 == 0
             # Handle the last segment
             last_point = crossover_points[i, -1]
             masks[i, last_point:] = num_points % 2 == 0
@@ -298,17 +305,17 @@ class OmniModelForRNADesign(torch.nn.Module):
     def _evaluate_structure_fitness(self, sequences, structure):
         """
         Evaluate the fitness of the RNA structure by comparing with the target structure.
         Args:
             sequences (list): List of (sequence, bp_span) tuples to evaluate
             structure (str): Target RNA structure
         Returns:
             list: Sorted population with fitness scores and MFE values
         """
         # Get sequences for structure prediction
         seq_list = [seq for seq, _ in sequences]
         # Predict structures (with improved multiprocessing)
         structures_mfe = self._predict_structure(seq_list)
@@ -326,11 +333,11 @@ class OmniModelForRNADesign(torch.nn.Module):
     def _non_dominated_sorting(scores, mfe_values):
         """
         Perform non-dominated sorting for multi-objective optimization.
         Args:
             scores (list): Structure similarity scores
             mfe_values (list): Minimum free energy values
         Returns:
             list: List of fronts (Pareto fronts)
         """
@@ -369,11 +376,11 @@ class OmniModelForRNADesign(torch.nn.Module):
     def _select_next_generation(next_generation, fronts):
         """
         Select the next generation based on Pareto fronts.
         Args:
             next_generation (list): Current population with fitness scores
             fronts (list): Pareto fronts
         Returns:
             list: Selected population for the next generation
         """
@@ -389,11 +396,11 @@ class OmniModelForRNADesign(torch.nn.Module):
     def _mlm_predict(self, mlm_inputs, structure):
         """
         Perform masked language model prediction.
         Args:
             mlm_inputs (list): List of masked input sequences
             structure (str): Target RNA structure
         Returns:
             list: Predicted token IDs for each input
         """
@@ -403,7 +410,7 @@ class OmniModelForRNADesign(torch.nn.Module):
         with torch.no_grad():
             for i in range(0, len(mlm_inputs), batch_size):
                 inputs = self.tokenizer(
-                    mlm_inputs[i: i + batch_size],
+                    mlm_inputs[i : i + batch_size],
                     padding=False,
                     max_length=1024,
                     truncation=True,
@@ -422,13 +429,13 @@ class OmniModelForRNADesign(torch.nn.Module):
     ):
         """
         Design RNA sequences for a target structure using evolutionary algorithms.
         Args:
             structure (str): Target RNA structure in dot-bracket notation
             mutation_ratio (float): Ratio of tokens to mutate (default: 0.5)
             num_population (int): Population size (default: 100)
             num_generation (int): Number of generations (default: 100)
         Returns:
             list: List of designed RNA sequences with their fitness scores
         """

omnigenome/src/model/seq2seq/__init__.py CHANGED Viewed

@@ -9,4 +9,3 @@
 """
 This package contains modules for sequence-to-sequence models.
 """

omnigenome/src/model/seq2seq/model.py CHANGED Viewed

@@ -21,20 +21,20 @@ from ...abc.abstract_model import OmniModel
 class OmniModelForSeq2Seq(OmniModel):
     """
     Sequence-to-sequence model for genomic sequences.
     This model implements a sequence-to-sequence architecture for genomic
     sequences, where the input is one sequence and the output is another
     sequence. It's useful for tasks like sequence translation, structure
     prediction, or sequence transformation.
     The model can be extended to implement specific seq2seq tasks by
     overriding the forward, predict, and inference methods.
     """
     def __init__(self, config_or_model, tokenizer, *args, **kwargs):
         """
         Initialize the sequence-to-sequence model.
         Args:
             config_or_model: Model configuration or pre-trained model
             tokenizer: Tokenizer for processing input sequences

omnigenome 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl

omnigenome 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl