omnigenome 0.3.0a1__py3-none-any.whl → 0.3.3a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of omnigenome might be problematic.

Files changed (79)
  1. omnigenome/__init__.py +252 -258
  2. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/METADATA +10 -10
  3. omnigenome-0.3.3a0.dist-info/RECORD +7 -0
  4. omnigenome/auto/__init__.py +0 -3
  5. omnigenome/auto/auto_bench/__init__.py +0 -12
  6. omnigenome/auto/auto_bench/auto_bench.py +0 -484
  7. omnigenome/auto/auto_bench/auto_bench_cli.py +0 -230
  8. omnigenome/auto/auto_bench/auto_bench_config.py +0 -216
  9. omnigenome/auto/auto_bench/config_check.py +0 -34
  10. omnigenome/auto/auto_train/__init__.py +0 -13
  11. omnigenome/auto/auto_train/auto_train.py +0 -430
  12. omnigenome/auto/auto_train/auto_train_cli.py +0 -222
  13. omnigenome/auto/bench_hub/__init__.py +0 -12
  14. omnigenome/auto/bench_hub/bench_hub.py +0 -25
  15. omnigenome/cli/__init__.py +0 -13
  16. omnigenome/cli/commands/__init__.py +0 -13
  17. omnigenome/cli/commands/base.py +0 -83
  18. omnigenome/cli/commands/bench/__init__.py +0 -13
  19. omnigenome/cli/commands/bench/bench_cli.py +0 -202
  20. omnigenome/cli/commands/rna/__init__.py +0 -13
  21. omnigenome/cli/commands/rna/rna_design.py +0 -178
  22. omnigenome/cli/omnigenome_cli.py +0 -128
  23. omnigenome/src/__init__.py +0 -12
  24. omnigenome/src/abc/__init__.py +0 -12
  25. omnigenome/src/abc/abstract_dataset.py +0 -622
  26. omnigenome/src/abc/abstract_metric.py +0 -114
  27. omnigenome/src/abc/abstract_model.py +0 -689
  28. omnigenome/src/abc/abstract_tokenizer.py +0 -267
  29. omnigenome/src/dataset/__init__.py +0 -16
  30. omnigenome/src/dataset/omni_dataset.py +0 -435
  31. omnigenome/src/lora/__init__.py +0 -13
  32. omnigenome/src/lora/lora_model.py +0 -294
  33. omnigenome/src/metric/__init__.py +0 -15
  34. omnigenome/src/metric/classification_metric.py +0 -184
  35. omnigenome/src/metric/metric.py +0 -199
  36. omnigenome/src/metric/ranking_metric.py +0 -142
  37. omnigenome/src/metric/regression_metric.py +0 -191
  38. omnigenome/src/misc/__init__.py +0 -3
  39. omnigenome/src/misc/utils.py +0 -499
  40. omnigenome/src/model/__init__.py +0 -19
  41. omnigenome/src/model/augmentation/__init__.py +0 -12
  42. omnigenome/src/model/augmentation/model.py +0 -219
  43. omnigenome/src/model/classification/__init__.py +0 -12
  44. omnigenome/src/model/classification/model.py +0 -642
  45. omnigenome/src/model/embedding/__init__.py +0 -12
  46. omnigenome/src/model/embedding/model.py +0 -263
  47. omnigenome/src/model/mlm/__init__.py +0 -12
  48. omnigenome/src/model/mlm/model.py +0 -177
  49. omnigenome/src/model/module_utils.py +0 -232
  50. omnigenome/src/model/regression/__init__.py +0 -12
  51. omnigenome/src/model/regression/model.py +0 -786
  52. omnigenome/src/model/regression/resnet.py +0 -483
  53. omnigenome/src/model/rna_design/__init__.py +0 -12
  54. omnigenome/src/model/rna_design/model.py +0 -469
  55. omnigenome/src/model/seq2seq/__init__.py +0 -12
  56. omnigenome/src/model/seq2seq/model.py +0 -44
  57. omnigenome/src/tokenizer/__init__.py +0 -16
  58. omnigenome/src/tokenizer/bpe_tokenizer.py +0 -226
  59. omnigenome/src/tokenizer/kmers_tokenizer.py +0 -247
  60. omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +0 -249
  61. omnigenome/src/trainer/__init__.py +0 -14
  62. omnigenome/src/trainer/accelerate_trainer.py +0 -739
  63. omnigenome/src/trainer/hf_trainer.py +0 -75
  64. omnigenome/src/trainer/trainer.py +0 -579
  65. omnigenome/utility/__init__.py +0 -3
  66. omnigenome/utility/dataset_hub/__init__.py +0 -13
  67. omnigenome/utility/dataset_hub/dataset_hub.py +0 -178
  68. omnigenome/utility/ensemble.py +0 -324
  69. omnigenome/utility/hub_utils.py +0 -517
  70. omnigenome/utility/model_hub/__init__.py +0 -12
  71. omnigenome/utility/model_hub/model_hub.py +0 -231
  72. omnigenome/utility/pipeline_hub/__init__.py +0 -12
  73. omnigenome/utility/pipeline_hub/pipeline.py +0 -483
  74. omnigenome/utility/pipeline_hub/pipeline_hub.py +0 -129
  75. omnigenome-0.3.0a1.dist-info/RECORD +0 -78
  76. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/WHEEL +0 -0
  77. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/entry_points.txt +0 -0
  78. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/licenses/LICENSE +0 -0
  79. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/top_level.txt +0 -0
omnigenome/src/tokenizer/single_nucleotide_tokenizer.py
@@ -1,249 +0,0 @@
- # -*- coding: utf-8 -*-
- # file: single_nucleotide_tokenizer.py
- # time: 18:05 08/04/2024
- # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
- # github: https://github.com/yangheng95
- # huggingface: https://huggingface.co/yangheng
- # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
- # Copyright (C) 2019-2024. All Rights Reserved.
-
- import warnings
-
- from transformers import AutoTokenizer
-
- from ..abc.abstract_tokenizer import OmniTokenizer
-
- warnings.filterwarnings("once")
-
-
- class OmniSingleNucleotideTokenizer(OmniTokenizer):
-     """
-     Tokenizer for single nucleotide tokenization in genomics.
-
-     This tokenizer converts genomic sequences into individual nucleotide tokens,
-     where each nucleotide (A, T, C, G, U) becomes a separate token. It's designed
-     for genomic sequence processing where fine-grained nucleotide-level analysis
-     is required.
-
-     The tokenizer supports various preprocessing options including U/T conversion
-     and whitespace addition between nucleotides. It also handles special tokens
-     like BOS (beginning of sequence) and EOS (end of sequence) tokens.
-
-     Attributes:
-         u2t (bool): Whether to convert 'U' to 'T'.
-         t2u (bool): Whether to convert 'T' to 'U'.
-         add_whitespace (bool): Whether to add whitespace between nucleotides.
-     """
-
-     def __init__(self, base_tokenizer=None, **kwargs):
-         """
-         Initializes the single nucleotide tokenizer.
-
-         Args:
-             base_tokenizer: The underlying Hugging Face tokenizer.
-             **kwargs: Additional keyword arguments passed to the parent class.
-
-         Example:
-             >>> from transformers import AutoTokenizer
-             >>> base_tokenizer = AutoTokenizer.from_pretrained("model_name")
-             >>> tokenizer = OmniSingleNucleotideTokenizer(base_tokenizer)
-         """
-         super(OmniSingleNucleotideTokenizer, self).__init__(base_tokenizer, **kwargs)
-         self.metadata["tokenizer_name"] = self.__class__.__name__
-
-     def __call__(self, sequence, **kwargs):
-         """
-         Tokenizes sequences using single nucleotide tokenization.
-
-         This method converts genomic sequences into tokenized inputs suitable
-         for model training and inference. It handles sequence preprocessing,
-         tokenization, and padding/truncation.
-
-         Args:
-             sequence (str or list): A single sequence or list of sequences to tokenize.
-             **kwargs: Additional arguments for tokenization:
-                 - max_length (int): Maximum sequence length.
-                 - padding (str): Padding strategy.
-                 - truncation (bool): Whether to truncate sequences.
-                 - warnings (bool): Whether to show warnings for unknown tokens.
-
-         Returns:
-             dict: A dictionary containing tokenized inputs:
-                 - input_ids: Token IDs for the sequences
-                 - attention_mask: Attention mask for the sequences
-
-         Example:
-             >>> # Tokenize a single sequence
-             >>> inputs = tokenizer("ATCGATCG")
-             >>> print(inputs['input_ids'].shape) # torch.Size([1, seq_len])
-
-             >>> # Tokenize multiple sequences
-             >>> inputs = tokenizer(["ATCGATCG", "GCTAGCTA"])
-             >>> print(inputs['input_ids'].shape) # torch.Size([2, seq_len])
-         """
-         if self.u2t:
-             sequence = "".join([seq.replace("U", "T").upper() for seq in sequence])
-         if self.t2u:
-             sequence = "".join([seq.replace("T", "U").upper() for seq in sequence])
-         if self.add_whitespace:
-             sequence = " ".join(list(sequence))
-         sequence_tokens = self.tokenize(sequence)[
-             : kwargs.get("max_length", self.max_length) - 2
-         ]
-         tokenized_inputs = {
-             "input_ids": [],
-             "attention_mask": [],
-         }
-         bos_id = (
-             self.base_tokenizer.bos_token_id
-             if self.base_tokenizer.bos_token_id is not None
-             else self.base_tokenizer.cls_token_id
-         )
-         eos_id = (
-             self.base_tokenizer.eos_token_id
-             if self.base_tokenizer.eos_token_id is not None
-             else self.base_tokenizer.sep_token_id
-         )
-         for tokens in sequence_tokens:
-             tokenized_inputs["input_ids"].append(
-                 [bos_id] + self.base_tokenizer.convert_tokens_to_ids(tokens) + [eos_id]
-             )
-             tokenized_inputs["attention_mask"].append(
-                 [1] * len(tokenized_inputs["input_ids"][-1])
-             )
-
-         if kwargs.get("warnings", True):
-             for i, ids in enumerate(tokenized_inputs["input_ids"]):
-                 if ids.count(self.base_tokenizer.unk_token_id) / len(ids) > 0.1:
-                     warnings.warn(
-                         f"Unknown tokens are more than "
-                         f"{ids.count(self.base_tokenizer.unk_token_id) / len(ids)}% in the {i}-th sequence, "
-                         f"please check the tokenization process."
-                     )
-         max_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
-         tokenized_inputs = self.base_tokenizer.pad(
-             tokenized_inputs,
-             padding=kwargs.get("padding", "max_length"),
-             max_length=min(max_length, kwargs.get("max_length", 512)),
-             return_attention_mask=kwargs.get("return_attention_mask", True),
-             return_tensors="pt",
-         )
-         return tokenized_inputs
-
-     @staticmethod
-     def from_pretrained(model_name_or_path, **kwargs):
-         """
-         Loads a single nucleotide tokenizer from a pre-trained model.
-
-         This method creates a single nucleotide tokenizer wrapper around
-         a Hugging Face tokenizer loaded from a pre-trained model.
-
-         Args:
-             model_name_or_path (str): The name or path of the pre-trained model.
-             **kwargs: Additional arguments for the tokenizer.
-
-         Returns:
-             OmniSingleNucleotideTokenizer: An instance of the tokenizer.
-
-         Example:
-             >>> tokenizer = OmniSingleNucleotideTokenizer.from_pretrained("model_name")
-         """
-         self = OmniSingleNucleotideTokenizer(
-             AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
-         )
-         return self
-
-     def tokenize(self, sequence, **kwargs):
-         """
-         Converts a sequence into a list of individual nucleotide tokens.
-
-         This method tokenizes genomic sequences by treating each nucleotide
-         as a separate token. It handles both single sequences and lists of sequences.
-
-         Args:
-             sequence (str or list): A single sequence or list of sequences to tokenize.
-             **kwargs: Additional arguments (not used in this implementation).
-
-         Returns:
-             list: A list of token lists, where each inner list contains
-                 individual nucleotide tokens.
-
-         Example:
-             >>> # Tokenize a single sequence
-             >>> tokens = tokenizer.tokenize("ATCGATCG")
-             >>> print(tokens) # [['A', 'T', 'C', 'G', 'A', 'T', 'C', 'G']]
-
-             >>> # Tokenize multiple sequences
-             >>> tokens = tokenizer.tokenize(["ATCGATCG", "GCTAGCTA"])
-             >>> print(tokens) # [['A', 'T', 'C', 'G', ...], ['G', 'C', 'T', 'A', ...]]
-         """
-         if isinstance(sequence, str):
-             sequences = [sequence]
-         else:
-             sequences = sequence
-
-         sequence_tokens = []
-         for i in range(len(sequences)):
-             sequence_tokens.append(list(sequences[i]))
-
-         return sequence_tokens
-
-     def encode(self, sequence, **kwargs):
-         """
-         Converts a sequence into a list of token IDs.
-
-         This method encodes genomic sequences into token IDs using the
-         underlying base tokenizer.
-
-         Args:
-             sequence (str): The input sequence to encode.
-             **kwargs: Additional arguments for encoding.
-
-         Returns:
-             list: A list of token IDs.
-
-         Example:
-             >>> token_ids = tokenizer.encode("ATCGATCG")
-             >>> print(token_ids) # [1, 2, 3, 4, 1, 2, 3, 4]
-         """
-         return self.base_tokenizer.encode(sequence, **kwargs)
-
-     def decode(self, sequence, **kwargs):
-         """
-         Converts a list of token IDs back into a sequence.
-
-         This method decodes token IDs back into genomic sequences using
-         the underlying base tokenizer.
-
-         Args:
-             sequence (list): A list of token IDs.
-             **kwargs: Additional arguments for decoding.
-
-         Returns:
-             str: The decoded sequence.
-
-         Example:
-             >>> sequence = tokenizer.decode([1, 2, 3, 4])
-             >>> print(sequence) # "ATCG"
-         """
-         return self.base_tokenizer.decode(sequence, **kwargs)
-
-     def encode_plus(self, sequence, **kwargs):
-         """
-         Encodes a sequence with additional information.
-
-         This method provides enhanced encoding with additional information
-         like attention masks and token type IDs.
-
-         Args:
-             sequence (str): The input sequence to encode.
-             **kwargs: Additional arguments for encoding.
-
-         Returns:
-             dict: A dictionary containing encoded information.
-
-         Example:
-             >>> encoded = tokenizer.encode_plus("ATCGATCG")
-             >>> print(encoded.keys()) # dict_keys(['input_ids', 'attention_mask'])
-         """
-         return self.base_tokenizer.encode_plus(sequence, **kwargs)
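
For readers skimming the diff, the deleted tokenizer's behaviour is easy to sketch outside the package. The snippet below is a minimal, self-contained illustration of the scheme the removed docstrings describe (one token per base, optional U-to-T conversion, BOS/EOS framing, attention mask of ones). It is not the omnigenome API: the vocabulary and special-token IDs are invented placeholders for the example.

```python
# Minimal sketch of the deleted single-nucleotide scheme, NOT the omnigenome API.
# The vocabulary and special-token IDs below are hypothetical placeholders.
BOS_ID, EOS_ID, UNK_ID = 0, 1, 2
VOCAB = {"A": 3, "T": 4, "C": 5, "G": 6, "U": 7}  # one ID per nucleotide

def tokenize(seq: str) -> list:
    # One token per base, mirroring the deleted tokenize() method.
    return list(seq.upper())

def encode(seq: str, u2t: bool = False) -> dict:
    # Optional U->T conversion, then [BOS] + per-base IDs + [EOS] with an
    # attention mask of ones, mirroring the framing in the deleted __call__().
    if u2t:
        seq = seq.upper().replace("U", "T")
    ids = [BOS_ID] + [VOCAB.get(base, UNK_ID) for base in tokenize(seq)] + [EOS_ID]
    return {"input_ids": ids, "attention_mask": [1] * len(ids)}

print(encode("AUCG", u2t=True))
# {'input_ids': [0, 3, 4, 5, 6, 1], 'attention_mask': [1, 1, 1, 1, 1, 1]}
```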
omnigenome/src/trainer/__init__.py
@@ -1,14 +0,0 @@
- # -*- coding: utf-8 -*-
- # file: __init__.py
- # time: 11:45 14/04/2024
- # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
- # github: https://github.com/yangheng95
- # huggingface: https://huggingface.co/yangheng
- # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
- # Copyright (C) 2019-2024. All Rights Reserved.
- """
- This package contains trainer implementations.
- """
-
- from .hf_trainer import HFTrainer
- from .trainer import Trainer
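
A practical consequence of this hunk, sketched below under the assumption that nothing in the 0.3.3a0 wheel re-adds these modules under the same path: imports that resolved against the deleted trainer package in 0.3.0a1 no longer have a target.

```python
# Worked in omnigenome 0.3.0a1 (the deleted __init__.py re-exported both classes):
from omnigenome.src.trainer import HFTrainer, Trainer

# In 0.3.3a0 the file list above shows the whole omnigenome/src tree removed,
# so this import is expected to raise ModuleNotFoundError (assumption: no
# replacement module ships under the same path in the new wheel).
```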