omnigenome 0.3.1a0__py3-none-any.whl → 1.0.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omnigenome/__init__.py +26 -266
- {omnigenome-0.3.1a0.dist-info → omnigenome-1.0.0b0.dist-info}/METADATA +8 -9
- omnigenome-1.0.0b0.dist-info/RECORD +6 -0
- omnigenome/auto/__init__.py +0 -3
- omnigenome/auto/auto_bench/__init__.py +0 -11
- omnigenome/auto/auto_bench/auto_bench.py +0 -494
- omnigenome/auto/auto_bench/auto_bench_cli.py +0 -230
- omnigenome/auto/auto_bench/auto_bench_config.py +0 -216
- omnigenome/auto/auto_bench/config_check.py +0 -34
- omnigenome/auto/auto_train/__init__.py +0 -12
- omnigenome/auto/auto_train/auto_train.py +0 -429
- omnigenome/auto/auto_train/auto_train_cli.py +0 -222
- omnigenome/auto/bench_hub/__init__.py +0 -11
- omnigenome/auto/bench_hub/bench_hub.py +0 -25
- omnigenome/cli/__init__.py +0 -12
- omnigenome/cli/commands/__init__.py +0 -12
- omnigenome/cli/commands/base.py +0 -83
- omnigenome/cli/commands/bench/__init__.py +0 -12
- omnigenome/cli/commands/bench/bench_cli.py +0 -202
- omnigenome/cli/commands/rna/__init__.py +0 -12
- omnigenome/cli/commands/rna/rna_design.py +0 -177
- omnigenome/cli/omnigenome_cli.py +0 -128
- omnigenome/src/__init__.py +0 -11
- omnigenome/src/abc/__init__.py +0 -11
- omnigenome/src/abc/abstract_dataset.py +0 -641
- omnigenome/src/abc/abstract_metric.py +0 -114
- omnigenome/src/abc/abstract_model.py +0 -690
- omnigenome/src/abc/abstract_tokenizer.py +0 -269
- omnigenome/src/dataset/__init__.py +0 -16
- omnigenome/src/dataset/omni_dataset.py +0 -437
- omnigenome/src/lora/__init__.py +0 -12
- omnigenome/src/lora/lora_model.py +0 -300
- omnigenome/src/metric/__init__.py +0 -15
- omnigenome/src/metric/classification_metric.py +0 -184
- omnigenome/src/metric/metric.py +0 -199
- omnigenome/src/metric/ranking_metric.py +0 -142
- omnigenome/src/metric/regression_metric.py +0 -191
- omnigenome/src/misc/__init__.py +0 -3
- omnigenome/src/misc/utils.py +0 -503
- omnigenome/src/model/__init__.py +0 -19
- omnigenome/src/model/augmentation/__init__.py +0 -11
- omnigenome/src/model/augmentation/model.py +0 -219
- omnigenome/src/model/classification/__init__.py +0 -11
- omnigenome/src/model/classification/model.py +0 -638
- omnigenome/src/model/embedding/__init__.py +0 -11
- omnigenome/src/model/embedding/model.py +0 -263
- omnigenome/src/model/mlm/__init__.py +0 -11
- omnigenome/src/model/mlm/model.py +0 -177
- omnigenome/src/model/module_utils.py +0 -232
- omnigenome/src/model/regression/__init__.py +0 -11
- omnigenome/src/model/regression/model.py +0 -781
- omnigenome/src/model/regression/resnet.py +0 -483
- omnigenome/src/model/rna_design/__init__.py +0 -11
- omnigenome/src/model/rna_design/model.py +0 -476
- omnigenome/src/model/seq2seq/__init__.py +0 -11
- omnigenome/src/model/seq2seq/model.py +0 -44
- omnigenome/src/tokenizer/__init__.py +0 -16
- omnigenome/src/tokenizer/bpe_tokenizer.py +0 -226
- omnigenome/src/tokenizer/kmers_tokenizer.py +0 -247
- omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +0 -249
- omnigenome/src/trainer/__init__.py +0 -14
- omnigenome/src/trainer/accelerate_trainer.py +0 -747
- omnigenome/src/trainer/hf_trainer.py +0 -75
- omnigenome/src/trainer/trainer.py +0 -591
- omnigenome/utility/__init__.py +0 -3
- omnigenome/utility/dataset_hub/__init__.py +0 -12
- omnigenome/utility/dataset_hub/dataset_hub.py +0 -178
- omnigenome/utility/ensemble.py +0 -324
- omnigenome/utility/hub_utils.py +0 -517
- omnigenome/utility/model_hub/__init__.py +0 -11
- omnigenome/utility/model_hub/model_hub.py +0 -232
- omnigenome/utility/pipeline_hub/__init__.py +0 -11
- omnigenome/utility/pipeline_hub/pipeline.py +0 -483
- omnigenome/utility/pipeline_hub/pipeline_hub.py +0 -129
- omnigenome-0.3.1a0.dist-info/RECORD +0 -78
- omnigenome-0.3.1a0.dist-info/entry_points.txt +0 -3
- {omnigenome-0.3.1a0.dist-info → omnigenome-1.0.0b0.dist-info}/WHEEL +0 -0
- {omnigenome-0.3.1a0.dist-info → omnigenome-1.0.0b0.dist-info}/licenses/LICENSE +0 -0
- {omnigenome-0.3.1a0.dist-info → omnigenome-1.0.0b0.dist-info}/top_level.txt +0 -0
|
@@ -1,219 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
# file: model.py
|
|
3
|
-
# time: 18:37 22/09/2024
|
|
4
|
-
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
|
|
5
|
-
# github: https://github.com/yangheng95
|
|
6
|
-
# huggingface: https://huggingface.co/yangheng
|
|
7
|
-
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
|
|
8
|
-
# Copyright (C) 2019-2024. All Rights Reserved.
|
|
9
|
-
"""
|
|
10
|
-
Data augmentation model for genomic sequences.
|
|
11
|
-
|
|
12
|
-
This module provides a data augmentation model that uses masked language modeling
|
|
13
|
-
to generate augmented versions of genomic sequences. It's useful for expanding
|
|
14
|
-
training datasets and improving model robustness.
|
|
15
|
-
"""
|
|
16
|
-
import torch
|
|
17
|
-
import random
|
|
18
|
-
import json
|
|
19
|
-
import tqdm
|
|
20
|
-
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
|
21
|
-
import autocuda
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class OmniModelForAugmentation(torch.nn.Module):
    """
    Data augmentation model for genomic sequences using masked language modeling.

    A sequence is tokenized, a share of its tokens is replaced by the
    tokenizer's mask token, and a pre-trained masked language model predicts a
    replacement for every masked position. Repeating this yields several
    augmented variants of the same input sequence.

    Attributes:
        tokenizer: Tokenizer loaded from ``model_name_or_path``
        model: Masked language model loaded from ``model_name_or_path``
        device: Device returned by ``autocuda.auto_cuda()``; the model is moved to it
        noise_ratio: Proportion of tokens to mask for augmentation
        max_length: Maximum sequence length passed to the tokenizer
        k: Number of augmented instances to generate per sequence
    """

    def __init__(
        self,
        model_name_or_path=None,
        noise_ratio=0.15,
        max_length=1026,
        instance_num=1,
        *args,
        **kwargs
    ):
        """
        Initialize the augmentation model.

        Args:
            model_name_or_path (str): Path or model name for loading the pre-trained model
            noise_ratio (float): The proportion of tokens to mask in each sequence (default: 0.15)
            max_length (int): The maximum sequence length for tokenization (default: 1026)
            instance_num (int): Number of augmented instances to generate per sequence (default: 1)
            *args: Accepted for call compatibility; not used
            **kwargs: Accepted for call compatibility; not used

        Raises:
            Exception: Whatever ``AutoTokenizer.from_pretrained`` raised, unless the
                error message mentions ``RnaTokenizer`` (that case falls back to
                ``multimolecule.RnaTokenizer``).
        """
        super().__init__()
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        except Exception as e:
            # Only the "RnaTokenizer" failure has a known fallback. Any other
            # error must surface here instead of leaving `self.tokenizer`
            # undefined and failing later with an unrelated AttributeError.
            if "RnaTokenizer" not in str(e):
                raise
            from multimolecule import RnaTokenizer

            self.tokenizer = RnaTokenizer.from_pretrained(model_name_or_path)

        self.model = AutoModelForMaskedLM.from_pretrained(
            model_name_or_path, trust_remote_code=True
        )
        self.device = autocuda.auto_cuda()
        self.model.to(self.device)

        # Hyperparameters for augmentation
        self.noise_ratio = noise_ratio
        self.max_length = max_length
        self.k = instance_num

    def load_sequences_from_file(self, input_file):
        """
        Load sequences from a JSON-lines file.

        Every non-blank line must be a JSON object with a ``"seq"`` key.
        Blank lines (for example a trailing empty line) are skipped.

        Args:
            input_file (str): Path to the input file containing sequences

        Returns:
            list: List of sequences loaded from the file
        """
        with open(input_file, "r", encoding="utf-8") as f:
            # Iterate the file lazily; whitespace-only lines are not JSON.
            return [json.loads(line)["seq"] for line in f if line.strip()]

    def apply_noise_to_sequence(self, seq):
        """
        Apply noise to a single sequence by randomly masking tokens.

        ``int(len(tokens) * noise_ratio)`` distinct positions are replaced by
        the tokenizer's mask token. The count is clamped to ``[0, len(tokens)]``.

        Args:
            seq (str): Input genomic sequence

        Returns:
            str: The tokens joined back together, with the chosen positions masked
        """
        tokens = self.tokenizer.tokenize(seq)
        num_to_mask = max(0, min(len(tokens), int(len(tokens) * self.noise_ratio)))
        # Sample WITHOUT replacement so the requested share of tokens is really
        # masked; drawing indices independently can hit the same position twice.
        for idx in random.sample(range(len(tokens)), num_to_mask):
            tokens[idx] = self.tokenizer.mask_token
        return "".join(tokens)

    def augment_sequence(self, seq):
        """
        Perform augmentation on a single sequence by predicting masked tokens.

        Args:
            seq (str): Input genomic sequence with masked tokens

        Returns:
            str: Decoded sequence in which every mask token id has been replaced
                by the model's highest-scoring token for that position
        """
        tokenized_inputs = self.tokenizer(
            seq,
            padding="do_not_pad",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        with torch.no_grad():
            logits = self.model(**tokenized_inputs.to(self.device))["logits"]
            predicted_tokens = logits.argmax(dim=-1).cpu()

        # Replace masked tokens with predicted tokens; all other ids are kept.
        input_ids = tokenized_inputs["input_ids"][0].cpu()
        is_masked = input_ids == self.tokenizer.mask_token_id
        input_ids[is_masked] = predicted_tokens[0][is_masked]

        return self.tokenizer.decode(input_ids, skip_special_tokens=True)

    def augment(self, seq, k=None):
        """
        Generate multiple augmented instances for a single sequence.

        Args:
            seq (str): Input genomic sequence
            k (int, optional): Number of augmented instances to generate (default: None, uses self.k)

        Returns:
            list: List of augmented sequences
        """
        num_instances = self.k if k is None else k
        return [
            self.augment_sequence(self.apply_noise_to_sequence(seq))
            for _ in range(num_instances)
        ]

    def augment_sequences(self, sequences):
        """
        Augment a list of sequences by applying noise and performing MLM-based predictions.

        Args:
            sequences (list): List of genomic sequences to augment

        Returns:
            list: Flat list with the augmented instances of every input sequence, in input order
        """
        all_augmented_sequences = []
        for seq in tqdm.tqdm(sequences, desc="Augmenting Sequences"):
            all_augmented_sequences.extend(self.augment(seq))
        return all_augmented_sequences

    def save_augmented_sequences(self, augmented_sequences, output_file):
        """
        Save augmented sequences to a JSON-lines file.

        Each sequence is written on its own line as ``{"aug_seq": <sequence>}``.

        Args:
            augmented_sequences (list): List of augmented sequences to save
            output_file (str): Path to the output file
        """
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps({"aug_seq": seq}) + "\n" for seq in augmented_sequences
            )

    def augment_from_file(self, input_file, output_file):
        """
        Run the augmentation process from a file input to a file output.

        Loads sequences from ``input_file``, augments them with the MLM model,
        and writes the augmented sequences to ``output_file``.

        Args:
            input_file (str): Path to the input file containing sequences
            output_file (str): Path to the output file where augmented sequences will be saved
        """
        sequences = self.load_sequences_from_file(input_file)
        augmented_sequences = self.augment_sequences(sequences)
        self.save_augmented_sequences(augmented_sequences, output_file)
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
# Example usage
|
|
208
|
-
if __name__ == "__main__":
    # Demo: build the augmenter, augment one literal sequence, then process
    # a whole JSON-lines file.
    model = OmniModelForAugmentation(
        model_name_or_path="anonymous8/OmniGenome-186M",
        # Share of tokens masked per sequence
        noise_ratio=0.2,
        # Maximum token length handed to the tokenizer
        max_length=1026,
        # Augmented instances produced for each input sequence
        instance_num=3,
    )

    # Single-sequence call; the result is kept in `aug` for inspection.
    aug = model.augment_sequence("ATCTTGCATTGAAG")

    # File-to-file augmentation over the toy dataset.
    input_file = "toy_datasets/test.json"
    output_file = "toy_datasets/augmented_sequences.json"
    model.augment_from_file(input_file, output_file)
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
# file: __init__.py
|
|
3
|
-
# time: 21:10 08/04/2024
|
|
4
|
-
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
|
|
5
|
-
# github: https://github.com/yangheng95
|
|
6
|
-
# huggingface: https://huggingface.co/yangheng
|
|
7
|
-
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
|
|
8
|
-
# Copyright (C) 2019-2024. All Rights Reserved.
|
|
9
|
-
"""
|
|
10
|
-
This package contains modules for classification models.
|
|
11
|
-
"""
|