omnigenome 0.3.0a1__py3-none-any.whl → 0.3.3a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of omnigenome might be problematic.
- omnigenome/__init__.py +252 -258
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/METADATA +10 -10
- omnigenome-0.3.3a0.dist-info/RECORD +7 -0
- omnigenome/auto/__init__.py +0 -3
- omnigenome/auto/auto_bench/__init__.py +0 -12
- omnigenome/auto/auto_bench/auto_bench.py +0 -484
- omnigenome/auto/auto_bench/auto_bench_cli.py +0 -230
- omnigenome/auto/auto_bench/auto_bench_config.py +0 -216
- omnigenome/auto/auto_bench/config_check.py +0 -34
- omnigenome/auto/auto_train/__init__.py +0 -13
- omnigenome/auto/auto_train/auto_train.py +0 -430
- omnigenome/auto/auto_train/auto_train_cli.py +0 -222
- omnigenome/auto/bench_hub/__init__.py +0 -12
- omnigenome/auto/bench_hub/bench_hub.py +0 -25
- omnigenome/cli/__init__.py +0 -13
- omnigenome/cli/commands/__init__.py +0 -13
- omnigenome/cli/commands/base.py +0 -83
- omnigenome/cli/commands/bench/__init__.py +0 -13
- omnigenome/cli/commands/bench/bench_cli.py +0 -202
- omnigenome/cli/commands/rna/__init__.py +0 -13
- omnigenome/cli/commands/rna/rna_design.py +0 -178
- omnigenome/cli/omnigenome_cli.py +0 -128
- omnigenome/src/__init__.py +0 -12
- omnigenome/src/abc/__init__.py +0 -12
- omnigenome/src/abc/abstract_dataset.py +0 -622
- omnigenome/src/abc/abstract_metric.py +0 -114
- omnigenome/src/abc/abstract_model.py +0 -689
- omnigenome/src/abc/abstract_tokenizer.py +0 -267
- omnigenome/src/dataset/__init__.py +0 -16
- omnigenome/src/dataset/omni_dataset.py +0 -435
- omnigenome/src/lora/__init__.py +0 -13
- omnigenome/src/lora/lora_model.py +0 -294
- omnigenome/src/metric/__init__.py +0 -15
- omnigenome/src/metric/classification_metric.py +0 -184
- omnigenome/src/metric/metric.py +0 -199
- omnigenome/src/metric/ranking_metric.py +0 -142
- omnigenome/src/metric/regression_metric.py +0 -191
- omnigenome/src/misc/__init__.py +0 -3
- omnigenome/src/misc/utils.py +0 -499
- omnigenome/src/model/__init__.py +0 -19
- omnigenome/src/model/augmentation/__init__.py +0 -12
- omnigenome/src/model/augmentation/model.py +0 -219
- omnigenome/src/model/classification/__init__.py +0 -12
- omnigenome/src/model/classification/model.py +0 -642
- omnigenome/src/model/embedding/__init__.py +0 -12
- omnigenome/src/model/embedding/model.py +0 -263
- omnigenome/src/model/mlm/__init__.py +0 -12
- omnigenome/src/model/mlm/model.py +0 -177
- omnigenome/src/model/module_utils.py +0 -232
- omnigenome/src/model/regression/__init__.py +0 -12
- omnigenome/src/model/regression/model.py +0 -786
- omnigenome/src/model/regression/resnet.py +0 -483
- omnigenome/src/model/rna_design/__init__.py +0 -12
- omnigenome/src/model/rna_design/model.py +0 -469
- omnigenome/src/model/seq2seq/__init__.py +0 -12
- omnigenome/src/model/seq2seq/model.py +0 -44
- omnigenome/src/tokenizer/__init__.py +0 -16
- omnigenome/src/tokenizer/bpe_tokenizer.py +0 -226
- omnigenome/src/tokenizer/kmers_tokenizer.py +0 -247
- omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +0 -249
- omnigenome/src/trainer/__init__.py +0 -14
- omnigenome/src/trainer/accelerate_trainer.py +0 -739
- omnigenome/src/trainer/hf_trainer.py +0 -75
- omnigenome/src/trainer/trainer.py +0 -579
- omnigenome/utility/__init__.py +0 -3
- omnigenome/utility/dataset_hub/__init__.py +0 -13
- omnigenome/utility/dataset_hub/dataset_hub.py +0 -178
- omnigenome/utility/ensemble.py +0 -324
- omnigenome/utility/hub_utils.py +0 -517
- omnigenome/utility/model_hub/__init__.py +0 -12
- omnigenome/utility/model_hub/model_hub.py +0 -231
- omnigenome/utility/pipeline_hub/__init__.py +0 -12
- omnigenome/utility/pipeline_hub/pipeline.py +0 -483
- omnigenome/utility/pipeline_hub/pipeline_hub.py +0 -129
- omnigenome-0.3.0a1.dist-info/RECORD +0 -78
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/WHEEL +0 -0
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/entry_points.txt +0 -0
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/licenses/LICENSE +0 -0
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/top_level.txt +0 -0
omnigenome/src/tokenizer/single_nucleotide_tokenizer.py
@@ -1,249 +0,0 @@
-# -*- coding: utf-8 -*-
-# file: single_nucleotide_tokenizer.py
-# time: 18:05 08/04/2024
-# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
-# github: https://github.com/yangheng95
-# huggingface: https://huggingface.co/yangheng
-# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
-# Copyright (C) 2019-2024. All Rights Reserved.
-
-import warnings
-
-from transformers import AutoTokenizer
-
-from ..abc.abstract_tokenizer import OmniTokenizer
-
-warnings.filterwarnings("once")
-
-
-class OmniSingleNucleotideTokenizer(OmniTokenizer):
-    """
-    Tokenizer for single nucleotide tokenization in genomics.
-
-    This tokenizer converts genomic sequences into individual nucleotide tokens,
-    where each nucleotide (A, T, C, G, U) becomes a separate token. It's designed
-    for genomic sequence processing where fine-grained nucleotide-level analysis
-    is required.
-
-    The tokenizer supports various preprocessing options including U/T conversion
-    and whitespace addition between nucleotides. It also handles special tokens
-    like BOS (beginning of sequence) and EOS (end of sequence) tokens.
-
-    Attributes:
-        u2t (bool): Whether to convert 'U' to 'T'.
-        t2u (bool): Whether to convert 'T' to 'U'.
-        add_whitespace (bool): Whether to add whitespace between nucleotides.
-    """
-
-    def __init__(self, base_tokenizer=None, **kwargs):
-        """
-        Initializes the single nucleotide tokenizer.
-
-        Args:
-            base_tokenizer: The underlying Hugging Face tokenizer.
-            **kwargs: Additional keyword arguments passed to the parent class.
-
-        Example:
-            >>> from transformers import AutoTokenizer
-            >>> base_tokenizer = AutoTokenizer.from_pretrained("model_name")
-            >>> tokenizer = OmniSingleNucleotideTokenizer(base_tokenizer)
-        """
-        super(OmniSingleNucleotideTokenizer, self).__init__(base_tokenizer, **kwargs)
-        self.metadata["tokenizer_name"] = self.__class__.__name__
-
-    def __call__(self, sequence, **kwargs):
-        """
-        Tokenizes sequences using single nucleotide tokenization.
-
-        This method converts genomic sequences into tokenized inputs suitable
-        for model training and inference. It handles sequence preprocessing,
-        tokenization, and padding/truncation.
-
-        Args:
-            sequence (str or list): A single sequence or list of sequences to tokenize.
-            **kwargs: Additional arguments for tokenization:
-                - max_length (int): Maximum sequence length.
-                - padding (str): Padding strategy.
-                - truncation (bool): Whether to truncate sequences.
-                - warnings (bool): Whether to show warnings for unknown tokens.
-
-        Returns:
-            dict: A dictionary containing tokenized inputs:
-                - input_ids: Token IDs for the sequences
-                - attention_mask: Attention mask for the sequences
-
-        Example:
-            >>> # Tokenize a single sequence
-            >>> inputs = tokenizer("ATCGATCG")
-            >>> print(inputs['input_ids'].shape)  # torch.Size([1, seq_len])
-
-            >>> # Tokenize multiple sequences
-            >>> inputs = tokenizer(["ATCGATCG", "GCTAGCTA"])
-            >>> print(inputs['input_ids'].shape)  # torch.Size([2, seq_len])
-        """
-        if self.u2t:
-            sequence = "".join([seq.replace("U", "T").upper() for seq in sequence])
-        if self.t2u:
-            sequence = "".join([seq.replace("T", "U").upper() for seq in sequence])
-        if self.add_whitespace:
-            sequence = " ".join(list(sequence))
-        sequence_tokens = self.tokenize(sequence)[
-            : kwargs.get("max_length", self.max_length) - 2
-        ]
-        tokenized_inputs = {
-            "input_ids": [],
-            "attention_mask": [],
-        }
-        bos_id = (
-            self.base_tokenizer.bos_token_id
-            if self.base_tokenizer.bos_token_id is not None
-            else self.base_tokenizer.cls_token_id
-        )
-        eos_id = (
-            self.base_tokenizer.eos_token_id
-            if self.base_tokenizer.eos_token_id is not None
-            else self.base_tokenizer.sep_token_id
-        )
-        for tokens in sequence_tokens:
-            tokenized_inputs["input_ids"].append(
-                [bos_id] + self.base_tokenizer.convert_tokens_to_ids(tokens) + [eos_id]
-            )
-            tokenized_inputs["attention_mask"].append(
-                [1] * len(tokenized_inputs["input_ids"][-1])
-            )
-
-        if kwargs.get("warnings", True):
-            for i, ids in enumerate(tokenized_inputs["input_ids"]):
-                if ids.count(self.base_tokenizer.unk_token_id) / len(ids) > 0.1:
-                    warnings.warn(
-                        f"Unknown tokens are more than "
-                        f"{ids.count(self.base_tokenizer.unk_token_id) / len(ids)}% in the {i}-th sequence, "
-                        f"please check the tokenization process."
-                    )
-        max_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
-        tokenized_inputs = self.base_tokenizer.pad(
-            tokenized_inputs,
-            padding=kwargs.get("padding", "max_length"),
-            max_length=min(max_length, kwargs.get("max_length", 512)),
-            return_attention_mask=kwargs.get("return_attention_mask", True),
-            return_tensors="pt",
-        )
-        return tokenized_inputs
-
-    @staticmethod
-    def from_pretrained(model_name_or_path, **kwargs):
-        """
-        Loads a single nucleotide tokenizer from a pre-trained model.
-
-        This method creates a single nucleotide tokenizer wrapper around
-        a Hugging Face tokenizer loaded from a pre-trained model.
-
-        Args:
-            model_name_or_path (str): The name or path of the pre-trained model.
-            **kwargs: Additional arguments for the tokenizer.
-
-        Returns:
-            OmniSingleNucleotideTokenizer: An instance of the tokenizer.
-
-        Example:
-            >>> tokenizer = OmniSingleNucleotideTokenizer.from_pretrained("model_name")
-        """
-        self = OmniSingleNucleotideTokenizer(
-            AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
-        )
-        return self
-
-    def tokenize(self, sequence, **kwargs):
-        """
-        Converts a sequence into a list of individual nucleotide tokens.
-
-        This method tokenizes genomic sequences by treating each nucleotide
-        as a separate token. It handles both single sequences and lists of sequences.
-
-        Args:
-            sequence (str or list): A single sequence or list of sequences to tokenize.
-            **kwargs: Additional arguments (not used in this implementation).
-
-        Returns:
-            list: A list of token lists, where each inner list contains
-                individual nucleotide tokens.
-
-        Example:
-            >>> # Tokenize a single sequence
-            >>> tokens = tokenizer.tokenize("ATCGATCG")
-            >>> print(tokens)  # [['A', 'T', 'C', 'G', 'A', 'T', 'C', 'G']]
-
-            >>> # Tokenize multiple sequences
-            >>> tokens = tokenizer.tokenize(["ATCGATCG", "GCTAGCTA"])
-            >>> print(tokens)  # [['A', 'T', 'C', 'G', ...], ['G', 'C', 'T', 'A', ...]]
-        """
-        if isinstance(sequence, str):
-            sequences = [sequence]
-        else:
-            sequences = sequence
-
-        sequence_tokens = []
-        for i in range(len(sequences)):
-            sequence_tokens.append(list(sequences[i]))
-
-        return sequence_tokens
-
-    def encode(self, sequence, **kwargs):
-        """
-        Converts a sequence into a list of token IDs.
-
-        This method encodes genomic sequences into token IDs using the
-        underlying base tokenizer.
-
-        Args:
-            sequence (str): The input sequence to encode.
-            **kwargs: Additional arguments for encoding.
-
-        Returns:
-            list: A list of token IDs.
-
-        Example:
-            >>> token_ids = tokenizer.encode("ATCGATCG")
-            >>> print(token_ids)  # [1, 2, 3, 4, 1, 2, 3, 4]
-        """
-        return self.base_tokenizer.encode(sequence, **kwargs)
-
-    def decode(self, sequence, **kwargs):
-        """
-        Converts a list of token IDs back into a sequence.
-
-        This method decodes token IDs back into genomic sequences using
-        the underlying base tokenizer.
-
-        Args:
-            sequence (list): A list of token IDs.
-            **kwargs: Additional arguments for decoding.
-
-        Returns:
-            str: The decoded sequence.
-
-        Example:
-            >>> sequence = tokenizer.decode([1, 2, 3, 4])
-            >>> print(sequence)  # "ATCG"
-        """
-        return self.base_tokenizer.decode(sequence, **kwargs)
-
-    def encode_plus(self, sequence, **kwargs):
-        """
-        Encodes a sequence with additional information.
-
-        This method provides enhanced encoding with additional information
-        like attention masks and token type IDs.
-
-        Args:
-            sequence (str): The input sequence to encode.
-            **kwargs: Additional arguments for encoding.
-
-        Returns:
-            dict: A dictionary containing encoded information.
-
-        Example:
-            >>> encoded = tokenizer.encode_plus("ATCGATCG")
-            >>> print(encoded.keys())  # dict_keys(['input_ids', 'attention_mask'])
-        """
-        return self.base_tokenizer.encode_plus(sequence, **kwargs)
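The hunk above removes the wheel's single nucleotide tokenizer module. The following is a minimal usage sketch reconstructed only from the docstring examples in the removed file; the import path mirrors the removed module's location, "model_name" is a placeholder checkpoint as in those docstrings, and none of this reflects the 0.3.3a0 layout.

# Sketch of the removed 0.3.0a1 API (assumed from the docstrings above):
# wrap a Hugging Face tokenizer so each nucleotide becomes one token.
from transformers import AutoTokenizer
from omnigenome.src.tokenizer.single_nucleotide_tokenizer import OmniSingleNucleotideTokenizer

tokenizer = OmniSingleNucleotideTokenizer(AutoTokenizer.from_pretrained("model_name"))
inputs = tokenizer("ATCGATCG", max_length=512, padding="max_length")
print(inputs["input_ids"].shape)   # torch.Size([1, seq_len]) per the docstring
print(tokenizer.tokenize("ATCG"))  # [['A', 'T', 'C', 'G']]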
omnigenome/src/trainer/__init__.py
@@ -1,14 +0,0 @@
-# -*- coding: utf-8 -*-
-# file: __init__.py
-# time: 11:45 14/04/2024
-# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
-# github: https://github.com/yangheng95
-# huggingface: https://huggingface.co/yangheng
-# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
-# Copyright (C) 2019-2024. All Rights Reserved.
-"""
-This package contains trainer implementations.
-"""
-
-from .hf_trainer import HFTrainer
-from .trainer import Trainer
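This final hunk removes the trainer package initializer, which re-exported the two trainer classes. As an illustration only (0.3.0a1 layout, not the 0.3.3a0 API), the removed re-exports made imports such as the following possible:

# Imports enabled by the removed __init__.py in 0.3.0a1 (illustrative only).
from omnigenome.src.trainer import HFTrainer, Trainer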