omnigenome 0.3.1a0__py3-none-any.whl → 0.3.3a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of omnigenome might be problematic.

Files changed (79)
  1. omnigenome/__init__.py +252 -266
  2. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/METADATA +9 -9
  3. omnigenome-0.3.3a0.dist-info/RECORD +7 -0
  4. omnigenome/auto/__init__.py +0 -3
  5. omnigenome/auto/auto_bench/__init__.py +0 -11
  6. omnigenome/auto/auto_bench/auto_bench.py +0 -494
  7. omnigenome/auto/auto_bench/auto_bench_cli.py +0 -230
  8. omnigenome/auto/auto_bench/auto_bench_config.py +0 -216
  9. omnigenome/auto/auto_bench/config_check.py +0 -34
  10. omnigenome/auto/auto_train/__init__.py +0 -12
  11. omnigenome/auto/auto_train/auto_train.py +0 -429
  12. omnigenome/auto/auto_train/auto_train_cli.py +0 -222
  13. omnigenome/auto/bench_hub/__init__.py +0 -11
  14. omnigenome/auto/bench_hub/bench_hub.py +0 -25
  15. omnigenome/cli/__init__.py +0 -12
  16. omnigenome/cli/commands/__init__.py +0 -12
  17. omnigenome/cli/commands/base.py +0 -83
  18. omnigenome/cli/commands/bench/__init__.py +0 -12
  19. omnigenome/cli/commands/bench/bench_cli.py +0 -202
  20. omnigenome/cli/commands/rna/__init__.py +0 -12
  21. omnigenome/cli/commands/rna/rna_design.py +0 -177
  22. omnigenome/cli/omnigenome_cli.py +0 -128
  23. omnigenome/src/__init__.py +0 -11
  24. omnigenome/src/abc/__init__.py +0 -11
  25. omnigenome/src/abc/abstract_dataset.py +0 -641
  26. omnigenome/src/abc/abstract_metric.py +0 -114
  27. omnigenome/src/abc/abstract_model.py +0 -690
  28. omnigenome/src/abc/abstract_tokenizer.py +0 -269
  29. omnigenome/src/dataset/__init__.py +0 -16
  30. omnigenome/src/dataset/omni_dataset.py +0 -437
  31. omnigenome/src/lora/__init__.py +0 -12
  32. omnigenome/src/lora/lora_model.py +0 -300
  33. omnigenome/src/metric/__init__.py +0 -15
  34. omnigenome/src/metric/classification_metric.py +0 -184
  35. omnigenome/src/metric/metric.py +0 -199
  36. omnigenome/src/metric/ranking_metric.py +0 -142
  37. omnigenome/src/metric/regression_metric.py +0 -191
  38. omnigenome/src/misc/__init__.py +0 -3
  39. omnigenome/src/misc/utils.py +0 -503
  40. omnigenome/src/model/__init__.py +0 -19
  41. omnigenome/src/model/augmentation/__init__.py +0 -11
  42. omnigenome/src/model/augmentation/model.py +0 -219
  43. omnigenome/src/model/classification/__init__.py +0 -11
  44. omnigenome/src/model/classification/model.py +0 -638
  45. omnigenome/src/model/embedding/__init__.py +0 -11
  46. omnigenome/src/model/embedding/model.py +0 -263
  47. omnigenome/src/model/mlm/__init__.py +0 -11
  48. omnigenome/src/model/mlm/model.py +0 -177
  49. omnigenome/src/model/module_utils.py +0 -232
  50. omnigenome/src/model/regression/__init__.py +0 -11
  51. omnigenome/src/model/regression/model.py +0 -781
  52. omnigenome/src/model/regression/resnet.py +0 -483
  53. omnigenome/src/model/rna_design/__init__.py +0 -11
  54. omnigenome/src/model/rna_design/model.py +0 -476
  55. omnigenome/src/model/seq2seq/__init__.py +0 -11
  56. omnigenome/src/model/seq2seq/model.py +0 -44
  57. omnigenome/src/tokenizer/__init__.py +0 -16
  58. omnigenome/src/tokenizer/bpe_tokenizer.py +0 -226
  59. omnigenome/src/tokenizer/kmers_tokenizer.py +0 -247
  60. omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +0 -249
  61. omnigenome/src/trainer/__init__.py +0 -14
  62. omnigenome/src/trainer/accelerate_trainer.py +0 -747
  63. omnigenome/src/trainer/hf_trainer.py +0 -75
  64. omnigenome/src/trainer/trainer.py +0 -591
  65. omnigenome/utility/__init__.py +0 -3
  66. omnigenome/utility/dataset_hub/__init__.py +0 -12
  67. omnigenome/utility/dataset_hub/dataset_hub.py +0 -178
  68. omnigenome/utility/ensemble.py +0 -324
  69. omnigenome/utility/hub_utils.py +0 -517
  70. omnigenome/utility/model_hub/__init__.py +0 -11
  71. omnigenome/utility/model_hub/model_hub.py +0 -232
  72. omnigenome/utility/pipeline_hub/__init__.py +0 -11
  73. omnigenome/utility/pipeline_hub/pipeline.py +0 -483
  74. omnigenome/utility/pipeline_hub/pipeline_hub.py +0 -129
  75. omnigenome-0.3.1a0.dist-info/RECORD +0 -78
  76. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/WHEEL +0 -0
  77. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/entry_points.txt +0 -0
  78. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/licenses/LICENSE +0 -0
  79. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/top_level.txt +0 -0
@@ -1,219 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # file: model.py
3
- # time: 18:37 22/09/2024
4
- # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
5
- # github: https://github.com/yangheng95
6
- # huggingface: https://huggingface.co/yangheng
7
- # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
8
- # Copyright (C) 2019-2024. All Rights Reserved.
9
- """
10
- Data augmentation model for genomic sequences.
11
-
12
- This module provides a data augmentation model that uses masked language modeling
13
- to generate augmented versions of genomic sequences. It's useful for expanding
14
- training datasets and improving model robustness.
15
- """
16
- import torch
17
- import random
18
- import json
19
- import tqdm
20
- from transformers import AutoModelForMaskedLM, AutoTokenizer
21
- import autocuda
22
-
23
-
24
- class OmniModelForAugmentation(torch.nn.Module):
25
- """
26
- Data augmentation model for genomic sequences using masked language modeling.
27
-
28
- This model uses a pre-trained masked language model to generate augmented
29
- versions of genomic sequences by randomly masking tokens and predicting
30
- replacements. It's useful for expanding training datasets and improving
31
- model generalization.
32
-
33
- Attributes:
34
- tokenizer: Tokenizer for processing genomic sequences
35
- model: Pre-trained masked language model
36
- device: Device to run the model on (CPU or GPU)
37
- noise_ratio: Proportion of tokens to mask for augmentation
38
- max_length: Maximum sequence length for tokenization
39
- k: Number of augmented instances to generate per sequence
40
- """
41
-
42
- def __init__(
43
- self,
44
- model_name_or_path=None,
45
- noise_ratio=0.15,
46
- max_length=1026,
47
- instance_num=1,
48
- *args,
49
- **kwargs
50
- ):
51
- """
52
- Initialize the augmentation model.
53
-
54
- Args:
55
- model_name_or_path (str): Path or model name for loading the pre-trained model
56
- noise_ratio (float): The proportion of tokens to mask in each sequence for augmentation (default: 0.15)
57
- max_length (int): The maximum sequence length for tokenization (default: 1026)
58
- instance_num (int): Number of augmented instances to generate per sequence (default: 1)
59
- *args: Additional positional arguments
60
- **kwargs: Additional keyword arguments
61
- """
62
- super().__init__()
63
- try:
64
- self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
65
- except Exception as e:
66
- if "RnaTokenizer" in str(e):
67
- from multimolecule import RnaTokenizer
68
-
69
- self.tokenizer = RnaTokenizer.from_pretrained(model_name_or_path)
70
-
71
- self.model = AutoModelForMaskedLM.from_pretrained(
72
- model_name_or_path, trust_remote_code=True
73
- )
74
- self.device = autocuda.auto_cuda()
75
- self.model.to(self.device)
76
-
77
- # Hyperparameters for augmentation
78
- self.noise_ratio = noise_ratio
79
- self.max_length = max_length
80
- self.k = instance_num
81
-
82
- def load_sequences_from_file(self, input_file):
83
- """
84
- Load sequences from a JSON file.
85
-
86
- Args:
87
- input_file (str): Path to the input JSON file containing sequences
88
-
89
- Returns:
90
- list: List of sequences loaded from the file
91
- """
92
- sequences = []
93
- with open(input_file, "r") as f:
94
- for line in f.readlines():
95
- sequences.append(json.loads(line)["seq"])
96
- return sequences
97
-
98
- def apply_noise_to_sequence(self, seq):
99
- """
100
- Apply noise to a single sequence by randomly masking tokens.
101
-
102
- Args:
103
- seq (str): Input genomic sequence
104
-
105
- Returns:
106
- str: Sequence with randomly masked tokens
107
- """
108
- seq_list = self.tokenizer.tokenize(seq)
109
- for _ in range(int(len(seq_list) * self.noise_ratio)):
110
- random_idx = random.randint(0, len(seq_list) - 1)
111
- seq_list[random_idx] = self.tokenizer.mask_token
112
- return "".join(seq_list)
113
-
114
- def augment_sequence(self, seq):
115
- """
116
- Perform augmentation on a single sequence by predicting masked tokens.
117
-
118
- Args:
119
- seq (str): Input genomic sequence with masked tokens
120
-
121
- Returns:
122
- str: Augmented sequence with predicted tokens replacing masked tokens
123
- """
124
- tokenized_inputs = self.tokenizer(
125
- seq,
126
- padding="do_not_pad",
127
- truncation=True,
128
- max_length=self.max_length,
129
- return_tensors="pt",
130
- )
131
-
132
- with torch.no_grad():
133
- predictions = self.model(**tokenized_inputs.to(self.device))["logits"]
134
- predicted_tokens = predictions.argmax(dim=-1).cpu()
135
-
136
- # Replace masked tokens with predicted tokens
137
- input_ids = tokenized_inputs["input_ids"][0].cpu()
138
- input_ids[input_ids == self.tokenizer.mask_token_id] = predicted_tokens[0][
139
- input_ids == self.tokenizer.mask_token_id
140
- ]
141
-
142
- augmented_sequence = self.tokenizer.decode(input_ids, skip_special_tokens=True)
143
- return augmented_sequence
144
-
145
- def augment(self, seq, k=None):
146
- """
147
- Generate multiple augmented instances for a single sequence.
148
-
149
- Args:
150
- seq (str): Input genomic sequence
151
- k (int, optional): Number of augmented instances to generate (default: None, uses self.k)
152
-
153
- Returns:
154
- list: List of augmented sequences
155
- """
156
- augmented_sequences = []
157
- for _ in range(self.k if k is None else k):
158
- noised_seq = self.apply_noise_to_sequence(seq)
159
- augmented_seq = self.augment_sequence(noised_seq)
160
- augmented_sequences.append(augmented_seq)
161
- return augmented_sequences
162
-
163
- def augment_sequences(self, sequences):
164
- """
165
- Augment a list of sequences by applying noise and performing MLM-based predictions.
166
-
167
- Args:
168
- sequences (list): List of genomic sequences to augment
169
-
170
- Returns:
171
- list: List of all augmented sequences
172
- """
173
- all_augmented_sequences = []
174
- for seq in tqdm.tqdm(sequences, desc="Augmenting Sequences"):
175
- augmented_instances = self.augment(seq)
176
- all_augmented_sequences.extend(augmented_instances)
177
- return all_augmented_sequences
178
-
179
- def save_augmented_sequences(self, augmented_sequences, output_file):
180
- """
181
- Save augmented sequences to a JSON file.
182
-
183
- Args:
184
- augmented_sequences (list): List of augmented sequences to save
185
- output_file (str): Path to the output JSON file
186
- """
187
- with open(output_file, "w") as f:
188
- for seq in augmented_sequences:
189
- f.write(json.dumps({"aug_seq": seq}) + "\n")
190
-
191
- def augment_from_file(self, input_file, output_file):
192
- """
193
- Main function to handle the augmentation process from a file input to a file output.
194
-
195
- This method loads sequences from an input file, augments them using the MLM model,
196
- and saves the augmented sequences to an output file.
197
-
198
- Args:
199
- input_file (str): Path to the input file containing sequences
200
- output_file (str): Path to the output file where augmented sequences will be saved
201
- """
202
- sequences = self.load_sequences_from_file(input_file)
203
- augmented_sequences = self.augment_sequences(sequences)
204
- self.save_augmented_sequences(augmented_sequences, output_file)
205
-
206
-
207
- # Example usage
208
- if __name__ == "__main__":
209
- model = OmniModelForAugmentation(
210
- model_name_or_path="anonymous8/OmniGenome-186M",
211
- noise_ratio=0.2, # Example noise ratio
212
- max_length=1026, # Maximum token length
213
- instance_num=3, # Number of augmented instances per sequence
214
- )
215
- aug = model.augment_sequence("ATCTTGCATTGAAG")
216
- input_file = "toy_datasets/test.json"
217
- output_file = "toy_datasets/augmented_sequences.json"
218
-
219
- model.augment_from_file(input_file, output_file)
@@ -1,11 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # file: __init__.py
3
- # time: 21:10 08/04/2024
4
- # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
5
- # github: https://github.com/yangheng95
6
- # huggingface: https://huggingface.co/yangheng
7
- # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
8
- # Copyright (C) 2019-2024. All Rights Reserved.
9
- """
10
- This package contains modules for classification models.
11
- """