PyPI - omnigenome - Versions diffs - 0.3.0a0__py3-none-any.whl → 0.3.1a0__py3-none-any.whl - Mend

omnigenome 0.3.0a0__py3-none-any.whl → 0.3.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73)

omnigenome/__init__.py +29 -44
omnigenome/auto/auto_bench/__init__.py +0 -1
omnigenome/auto/auto_bench/auto_bench.py +24 -14
omnigenome/auto/auto_train/__init__.py +0 -1
omnigenome/auto/auto_train/auto_train.py +11 -12
omnigenome/auto/bench_hub/__init__.py +0 -1
omnigenome/auto/bench_hub/bench_hub.py +1 -1
omnigenome/cli/__init__.py +0 -1
omnigenome/cli/commands/__init__.py +0 -1
omnigenome/cli/commands/base.py +10 -10
omnigenome/cli/commands/bench/__init__.py +0 -1
omnigenome/cli/commands/bench/bench_cli.py +10 -10
omnigenome/cli/commands/rna/__init__.py +0 -1
omnigenome/cli/commands/rna/rna_design.py +10 -11
omnigenome/src/__init__.py +0 -1
omnigenome/src/abc/__init__.py +0 -1
omnigenome/src/abc/abstract_dataset.py +38 -19
omnigenome/src/abc/abstract_metric.py +7 -7
omnigenome/src/abc/abstract_model.py +15 -14
omnigenome/src/abc/abstract_tokenizer.py +9 -7
omnigenome/src/dataset/omni_dataset.py +16 -14
omnigenome/src/lora/__init__.py +0 -1
omnigenome/src/lora/lora_model.py +47 -41
omnigenome/src/metric/classification_metric.py +11 -11
omnigenome/src/metric/metric.py +19 -19
omnigenome/src/metric/ranking_metric.py +15 -15
omnigenome/src/metric/regression_metric.py +18 -18
omnigenome/src/misc/utils.py +214 -150
omnigenome/src/model/augmentation/__init__.py +0 -1
omnigenome/src/model/augmentation/model.py +17 -17
omnigenome/src/model/classification/__init__.py +0 -1
omnigenome/src/model/classification/model.py +28 -32
omnigenome/src/model/embedding/__init__.py +0 -1
omnigenome/src/model/embedding/model.py +35 -35
omnigenome/src/model/mlm/__init__.py +0 -1
omnigenome/src/model/mlm/model.py +13 -13
omnigenome/src/model/module_utils.py +17 -17
omnigenome/src/model/regression/__init__.py +0 -1
omnigenome/src/model/regression/model.py +72 -77
omnigenome/src/model/regression/resnet.py +32 -32
omnigenome/src/model/rna_design/__init__.py +0 -1
omnigenome/src/model/rna_design/model.py +168 -118
omnigenome/src/model/seq2seq/__init__.py +0 -1
omnigenome/src/model/seq2seq/model.py +4 -4
omnigenome/src/tokenizer/bpe_tokenizer.py +27 -27
omnigenome/src/tokenizer/kmers_tokenizer.py +22 -22
omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +11 -11
omnigenome/src/trainer/accelerate_trainer.py +40 -32
omnigenome/src/trainer/hf_trainer.py +8 -8
omnigenome/src/trainer/trainer.py +37 -25
omnigenome/utility/dataset_hub/__init__.py +0 -1
omnigenome/utility/dataset_hub/dataset_hub.py +13 -13
omnigenome/utility/ensemble.py +26 -26
omnigenome/utility/hub_utils.py +8 -8
omnigenome/utility/model_hub/__init__.py +0 -1
omnigenome/utility/model_hub/model_hub.py +26 -25
omnigenome/utility/pipeline_hub/__init__.py +0 -1
omnigenome/utility/pipeline_hub/pipeline.py +49 -49
omnigenome/utility/pipeline_hub/pipeline_hub.py +17 -17
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/METADATA +3 -3
omnigenome-0.3.1a0.dist-info/RECORD +78 -0
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/top_level.txt +0 -1
omnigenome-0.3.0a0.dist-info/RECORD +0 -85
tests/__init__.py +0 -9
tests/conftest.py +0 -160
tests/test_dataset_patterns.py +0 -291
tests/test_examples_syntax.py +0 -83
tests/test_model_loading.py +0 -183
tests/test_rna_functions.py +0 -255
tests/test_training_patterns.py +0 -302
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/WHEEL +0 -0
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/entry_points.txt +0 -0
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/licenses/LICENSE +0 -0

omnigenome/src/abc/abstract_dataset.py CHANGED Viewed

@@ -56,7 +56,7 @@ def covert_input_to_tensor(data):
 class OmniGenomeDict(dict):
     """
     A dictionary subclass that allows moving all tensor values to a specified device.
     This class extends the standard Python dictionary to provide a convenient
     method for moving all tensor values to a specific device (CPU/GPU).
     """
@@ -87,14 +87,14 @@ class OmniGenomeDict(dict):
 class OmniDataset(torch.utils.data.Dataset):
     """
     Abstract base class for all datasets in OmniGenome.
     This class provides a unified interface for genomic datasets in the OmniGenome
     framework. It handles data loading, preprocessing, tokenization, and provides
     a PyTorch-compatible dataset interface.
     The class supports various data formats and can handle different types of
     genomic tasks including classification, regression, and token-level tasks.
     Attributes:
         tokenizer: The tokenizer to use for processing sequences.
         max_length (int): The maximum sequence length for tokenization.
@@ -118,17 +118,17 @@ class OmniDataset(torch.utils.data.Dataset):
             **kwargs: Additional keyword arguments.
                 - label2id (dict): A mapping from labels to integer IDs.
                 - shuffle (bool): Whether to shuffle the data. Defaults to True.
-                - structure_in (bool): Whether to include secondary structure
+                - structure_in (bool): Whether to include secondary structure
                   information. Defaults to False.
-                - drop_long_seq (bool): Whether to drop sequences longer than
+                - drop_long_seq (bool): Whether to drop sequences longer than
                   max_length. Defaults to False.
         Example:
             >>> # Initialize with a single data file
             >>> dataset = OmniDataset("data.json", tokenizer, max_length=512)
             >>> # Initialize with label mapping
-            >>> dataset = OmniDataset("data.json", tokenizer,
+            >>> dataset = OmniDataset("data.json", tokenizer,
             ...                      label2id={"A": 0, "B": 1})
         """
         super(OmniDataset, self).__init__()
@@ -158,9 +158,7 @@ class OmniDataset(torch.utils.data.Dataset):
             )
             self.max_length = self.tokenizer.max_length
         else:
-            fprint(
-                f"No max_length detected, using default max_length=512."
-            )
+            fprint(f"No max_length detected, using default max_length=512.")
             self.max_length = 512
         self.tokenizer.max_length = self.max_length
@@ -417,23 +415,44 @@ class OmniDataset(torch.utils.data.Dataset):
                     lines = f.readlines()
                 for line in lines:
                     examples.append({"text": line.strip()})
-            elif data_source.endswith(('.fasta', '.fa', '.fna', '.ffn', '.faa', '.frn')):
+            elif data_source.endswith(
+                (".fasta", ".fa", ".fna", ".ffn", ".faa", ".frn")
+            ):
                 try:
                     from Bio import SeqIO
                 except ImportError:
-                    raise ImportError("Biopython is required for FASTA parsing. Please install with 'pip install biopython'.")
+                    raise ImportError(
+                        "Biopython is required for FASTA parsing. Please install with 'pip install biopython'."
+                    )
                 for record in SeqIO.parse(data_source, "fasta"):
-                    examples.append({"id": record.id, "sequence": str(record.seq), "description": record.description})
-            elif data_source.endswith(('.fastq', '.fq')):
+                    examples.append(
+                        {
+                            "id": record.id,
+                            "sequence": str(record.seq),
+                            "description": record.description,
+                        }
+                    )
+            elif data_source.endswith((".fastq", ".fq")):
                 try:
                     from Bio import SeqIO
                 except ImportError:
-                    raise ImportError("Biopython is required for FASTQ parsing. Please install with 'pip install biopython'.")
+                    raise ImportError(
+                        "Biopython is required for FASTQ parsing. Please install with 'pip install biopython'."
+                    )
                 for record in SeqIO.parse(data_source, "fastq"):
-                    examples.append({"id": record.id, "sequence": str(record.seq), "quality": record.letter_annotations.get("phred_quality", [])})
-            elif data_source.endswith('.bed'):
+                    examples.append(
+                        {
+                            "id": record.id,
+                            "sequence": str(record.seq),
+                            "quality": record.letter_annotations.get(
+                                "phred_quality", []
+                            ),
+                        }
+                    )
+            elif data_source.endswith(".bed"):
                 import pandas as pd
-                df = pd.read_csv(data_source, sep='\t', comment='#')
+                df = pd.read_csv(data_source, sep="\t", comment="#")
                 # Assign column names for standard BED fields
                 for _, row in df.iterrows():
                     examples.append(row.to_dict())

omnigenome/src/abc/abstract_metric.py CHANGED Viewed

@@ -15,17 +15,17 @@ from ..misc.utils import env_meta_info
 class OmniMetric:
     """
     Abstract base class for all metrics in OmniGenome, based on scikit-learn.
     This class provides a unified interface for evaluation metrics in the OmniGenome
     framework. It integrates with scikit-learn's metric functions and provides
     additional functionality for handling genomic data evaluation.
     The class automatically exposes all scikit-learn metrics as attributes,
     making them easily accessible for evaluation tasks.
     Attributes:
         metric_func (callable): A callable metric function from `sklearn.metrics`.
-        ignore_y (any): A value in the ground truth labels to be ignored during
+        ignore_y (any): A value in the ground truth labels to be ignored during
                        metric computation.
         metadata (dict): Metadata about the metric including version info.
     """
@@ -35,10 +35,10 @@ class OmniMetric:
         Initializes the metric.
         Args:
-            metric_func (callable, optional): A callable metric function from
+            metric_func (callable, optional): A callable metric function from
                                             `sklearn.metrics`. If None, subclasses
                                             should implement their own compute method.
-            ignore_y (any, optional): A value in the ground truth labels to be
+            ignore_y (any, optional): A value in the ground truth labels to be
                                     ignored during metric computation.
             *args: Additional positional arguments.
             **kwargs: Additional keyword arguments.
@@ -46,7 +46,7 @@ class OmniMetric:
         Example:
             >>> # Initialize with a specific metric function
             >>> metric = OmniMetric(metrics.accuracy_score)
             >>> # Initialize with ignore value
             >>> metric = OmniMetric(ignore_y=-100)
         """

omnigenome/src/abc/abstract_model.py CHANGED Viewed

@@ -47,14 +47,14 @@ def count_parameters(model):
 class OmniModel(torch.nn.Module):
     """
     Abstract base class for all models in OmniGenome.
     This class provides a unified interface for all genomic models in the OmniGenome
     framework. It handles model initialization, forward passes, loss computation,
     prediction, inference, and model persistence.
     The class is designed to work with various types of genomic data and tasks,
     including sequence classification, token classification, regression, and more.
     Attributes:
         model (torch.nn.Module): The underlying PyTorch model.
         config: The model configuration.
@@ -76,16 +76,16 @@ class OmniModel(torch.nn.Module):
         - From a configuration object
         Args:
-            config_or_model: A model configuration, a pre-trained model path (str),
+            config_or_model: A model configuration, a pre-trained model path (str),
                            or a `torch.nn.Module` instance.
             tokenizer: The tokenizer associated with the model.
             *args: Additional positional arguments.
             **kwargs: Additional keyword arguments.
                 - label2id (dict): Mapping from class labels to IDs.
                 - num_labels (int): The number of labels.
-                - trust_remote_code (bool): Whether to trust remote code when loading
+                - trust_remote_code (bool): Whether to trust remote code when loading
                   from Hugging Face Hub. Defaults to True.
-                - ignore_mismatched_sizes (bool): Whether to ignore size mismatches
+                - ignore_mismatched_sizes (bool): Whether to ignore size mismatches
                   when loading pre-trained weights. Defaults to False.
                 - dropout (float): Dropout rate. Defaults to 0.0.
@@ -97,7 +97,7 @@ class OmniModel(torch.nn.Module):
         Example:
             >>> # Initialize from a pre-trained model
             >>> model = OmniModelForSequenceClassification("model_path", tokenizer)
             >>> # Initialize from a configuration
             >>> config = AutoConfig.from_pretrained("model_path")
             >>> model = OmniModelForSequenceClassification(config, tokenizer)
@@ -202,7 +202,9 @@ class OmniModel(torch.nn.Module):
             )
             self.config.num_labels = len(self.config.id2label)
-        assert len(self.config.label2id) == num_labels, f"Expected {num_labels} labels, but got {len(self.config.label2id)} in label2id dictionary."
+        assert (
+            len(self.config.label2id) == num_labels
+        ), f"Expected {num_labels} labels, but got {len(self.config.label2id)} in label2id dictionary."
         # The metadata of the model
         self.metadata = env_meta_info()
@@ -240,7 +242,7 @@ class OmniModel(torch.nn.Module):
         model architectures by mapping input parameters appropriately.
         Args:
-            **inputs: The inputs to the model, compatible with the base model's
+            **inputs: The inputs to the model, compatible with the base model's
                      forward method. Typically includes 'input_ids', 'attention_mask',
                      and other model-specific parameters.
@@ -386,7 +388,7 @@ class OmniModel(torch.nn.Module):
         predictions for further processing.
         Args:
-            sequence_or_inputs: A sequence (str), list of sequences, or
+            sequence_or_inputs: A sequence (str), list of sequences, or
                                tokenized inputs (dict/tuple).
             **kwargs: Additional arguments for tokenization and inference.
@@ -398,7 +400,7 @@ class OmniModel(torch.nn.Module):
         Example:
             >>> # Predict on a single sequence
             >>> outputs = model.predict("ATCGATCG")
             >>> # Predict on multiple sequences
             >>> outputs = model.predict(["ATCGATCG", "GCTAGCTA"])
         """
@@ -416,7 +418,7 @@ class OmniModel(torch.nn.Module):
         to class labels or probabilities.
         Args:
-            sequence_or_inputs: A sequence (str), list of sequences, or
+            sequence_or_inputs: A sequence (str), list of sequences, or
                                tokenized inputs (dict/tuple).
             **kwargs: Additional arguments for tokenization and inference.
@@ -429,7 +431,7 @@ class OmniModel(torch.nn.Module):
             >>> # Inference on a single sequence
             >>> results = model.inference("ATCGATCG")
             >>> print(results['predictions'])  # Class labels
             >>> # Inference on multiple sequences
             >>> results = model.inference(["ATCGATCG", "GCTAGCTA"])
         """
@@ -686,4 +688,3 @@ class OmniModel(torch.nn.Module):
         info += f"Model Config: {self.config}\n"
         fprint(info)
         return info

omnigenome/src/abc/abstract_tokenizer.py CHANGED Viewed

@@ -16,15 +16,15 @@ from ..misc.utils import env_meta_info, load_module_from_path
 class OmniTokenizer:
     """
     A wrapper class for tokenizers to provide a consistent interface within OmniGenome.
     This class provides a unified interface for tokenizers in the OmniGenome framework.
     It wraps underlying tokenizers (typically from Hugging Face) and provides
     additional functionality for genomic sequence processing.
     The class handles various tokenization strategies and provides compatibility
     with different model architectures. It also supports custom tokenizer wrappers
     for specialized genomic tasks.
     Attributes:
         base_tokenizer: The underlying tokenizer instance (e.g., from Hugging Face).
         max_length (int): The default maximum sequence length.
@@ -52,7 +52,7 @@ class OmniTokenizer:
             >>> from transformers import AutoTokenizer
             >>> base_tokenizer = AutoTokenizer.from_pretrained("model_name")
             >>> tokenizer = OmniTokenizer(base_tokenizer, max_length=512)
             >>> # Initialize with sequence conversion
             >>> tokenizer = OmniTokenizer(base_tokenizer, u2t=True)
         """
@@ -87,9 +87,9 @@ class OmniTokenizer:
         Example:
             >>> # Load from a pre-trained model
             >>> tokenizer = OmniTokenizer.from_pretrained("model_name")
             >>> # Load with custom parameters
-            >>> tokenizer = OmniTokenizer.from_pretrained("model_name",
+            >>> tokenizer = OmniTokenizer.from_pretrained("model_name",
             ...                                          trust_remote_code=True)
         """
         wrapper_path = f"{model_name_or_path.rstrip('/')}/omnigenome_wrapper.py"
@@ -104,7 +104,9 @@ class OmniTokenizer:
             warnings.warn(
                 f"No tokenizer wrapper found in {wrapper_path} -> Exception: {e}"
             )
-            kwargs.pop("num_labels", None) # Remove num_labels if it exists, as it may not be applicable
+            kwargs.pop(
+                "num_labels", None
+            )  # Remove num_labels if it exists, as it may not be applicable
             tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)

omnigenome/src/dataset/omni_dataset.py CHANGED Viewed

@@ -27,11 +27,11 @@ from ... import __name__, __version__
 class OmniDatasetForTokenClassification(OmniDataset):
     """
     Dataset class specifically designed for token classification tasks in genomics.
     This class extends `OmniDataset` to provide functionalities for preparing input sequences
     and their corresponding token-level labels. It's designed for tasks where each token
     in a sequence needs to be classified independently.
     Attributes:
         metadata: Dictionary containing dataset metadata including library information
         label2id: Mapping from label strings to integer IDs
@@ -68,7 +68,7 @@ class OmniDatasetForTokenClassification(OmniDataset):
     def prepare_input(self, instance, **kwargs):
         """
         Prepare a single data instance for token classification.
         This method handles both string sequences and dictionary instances
         containing sequence and label information. It tokenizes the input
         sequence and prepares token-level labels for classification.
@@ -138,11 +138,11 @@ class OmniDatasetForTokenClassification(OmniDataset):
 class OmniDatasetForSequenceClassification(OmniDataset):
     """
     Dataset class for sequence classification tasks in genomics.
     This class extends `OmniDataset` to prepare input sequences and their corresponding
     sequence-level labels. It's designed for tasks where the entire sequence needs
     to be classified into one of several categories.
     Attributes:
         metadata: Dictionary containing dataset metadata including library information
         label2id: Mapping from label strings to integer IDs
@@ -179,7 +179,7 @@ class OmniDatasetForSequenceClassification(OmniDataset):
     def prepare_input(self, instance, **kwargs):
         """
         Prepare a single data instance for sequence classification.
         This method handles both string sequences and dictionary instances
         containing sequence and label information. It tokenizes the input
         sequence and prepares sequence-level labels for classification.
@@ -238,11 +238,11 @@ class OmniDatasetForSequenceClassification(OmniDataset):
 class OmniDatasetForTokenRegression(OmniDataset):
     """
     Dataset class for token regression tasks in genomics.
     This class extends `OmniDataset` to prepare input sequences and their corresponding
     token-level regression targets. It's designed for tasks where each token in a
     sequence needs to be assigned a continuous value.
     Attributes:
         metadata: Dictionary containing dataset metadata including library information
     """
@@ -278,7 +278,7 @@ class OmniDatasetForTokenRegression(OmniDataset):
     def prepare_input(self, instance, **kwargs):
         """
         Prepare a single data instance for token regression.
         This method handles both string sequences and dictionary instances
         containing sequence and regression target information. It tokenizes
         the input sequence and prepares token-level regression targets.
@@ -330,7 +330,9 @@ class OmniDatasetForTokenRegression(OmniDataset):
             # Handle token-level regression labels
             if isinstance(labels, (list, tuple)):
                 # Ensure labels match sequence length
-                labels = list(labels)[:self.max_length - 2]  # Account for special tokens
+                labels = list(labels)[
+                    : self.max_length - 2
+                ]  # Account for special tokens
                 labels = [-100] + labels + [-100]  # Add padding for special tokens
             else:
                 # Single value for the entire sequence
@@ -343,11 +345,11 @@ class OmniDatasetForTokenRegression(OmniDataset):
 class OmniDatasetForSequenceRegression(OmniDataset):
     """
     Dataset class for sequence regression tasks in genomics.
     This class extends `OmniDataset` to prepare input sequences and their corresponding
     sequence-level regression targets. It's designed for tasks where the entire
     sequence needs to be assigned a continuous value.
     Attributes:
         metadata: Dictionary containing dataset metadata including library information
     """
@@ -383,7 +385,7 @@ class OmniDatasetForSequenceRegression(OmniDataset):
     def prepare_input(self, instance, **kwargs):
         """
         Prepare a single data instance for sequence regression.
         This method handles both string sequences and dictionary instances
         containing sequence and regression target information. It tokenizes
         the input sequence and prepares sequence-level regression targets.
@@ -432,4 +434,4 @@ class OmniDatasetForSequenceRegression(OmniDataset):
             labels = float(labels)
         tokenized_inputs["labels"] = torch.tensor(labels, dtype=torch.float32)
-        return tokenized_inputs
+        return tokenized_inputs

omnigenome/src/lora/__init__.py CHANGED Viewed

@@ -10,4 +10,3 @@
 """
 This package contains modules for LoRA (Low-Rank Adaptation) fine-tuning.
 """

omnigenome 0.3.0a0__py3-none-any.whl → 0.3.1a0__py3-none-any.whl

omnigenome 0.3.0a0__py3-none-any.whl → 0.3.1a0__py3-none-any.whl