omnigenome-0.3.0a0-py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of omnigenome might be problematic.

Files changed (85)
  1. omnigenome/__init__.py +281 -0
  2. omnigenome/auto/__init__.py +3 -0
  3. omnigenome/auto/auto_bench/__init__.py +12 -0
  4. omnigenome/auto/auto_bench/auto_bench.py +484 -0
  5. omnigenome/auto/auto_bench/auto_bench_cli.py +230 -0
  6. omnigenome/auto/auto_bench/auto_bench_config.py +216 -0
  7. omnigenome/auto/auto_bench/config_check.py +34 -0
  8. omnigenome/auto/auto_train/__init__.py +13 -0
  9. omnigenome/auto/auto_train/auto_train.py +430 -0
  10. omnigenome/auto/auto_train/auto_train_cli.py +222 -0
  11. omnigenome/auto/bench_hub/__init__.py +12 -0
  12. omnigenome/auto/bench_hub/bench_hub.py +25 -0
  13. omnigenome/cli/__init__.py +13 -0
  14. omnigenome/cli/commands/__init__.py +13 -0
  15. omnigenome/cli/commands/base.py +83 -0
  16. omnigenome/cli/commands/bench/__init__.py +13 -0
  17. omnigenome/cli/commands/bench/bench_cli.py +202 -0
  18. omnigenome/cli/commands/rna/__init__.py +13 -0
  19. omnigenome/cli/commands/rna/rna_design.py +178 -0
  20. omnigenome/cli/omnigenome_cli.py +128 -0
  21. omnigenome/src/__init__.py +12 -0
  22. omnigenome/src/abc/__init__.py +12 -0
  23. omnigenome/src/abc/abstract_dataset.py +622 -0
  24. omnigenome/src/abc/abstract_metric.py +114 -0
  25. omnigenome/src/abc/abstract_model.py +689 -0
  26. omnigenome/src/abc/abstract_tokenizer.py +267 -0
  27. omnigenome/src/dataset/__init__.py +16 -0
  28. omnigenome/src/dataset/omni_dataset.py +435 -0
  29. omnigenome/src/lora/__init__.py +13 -0
  30. omnigenome/src/lora/lora_model.py +294 -0
  31. omnigenome/src/metric/__init__.py +15 -0
  32. omnigenome/src/metric/classification_metric.py +184 -0
  33. omnigenome/src/metric/metric.py +199 -0
  34. omnigenome/src/metric/ranking_metric.py +142 -0
  35. omnigenome/src/metric/regression_metric.py +191 -0
  36. omnigenome/src/misc/__init__.py +3 -0
  37. omnigenome/src/misc/utils.py +439 -0
  38. omnigenome/src/model/__init__.py +19 -0
  39. omnigenome/src/model/augmentation/__init__.py +12 -0
  40. omnigenome/src/model/augmentation/model.py +219 -0
  41. omnigenome/src/model/classification/__init__.py +12 -0
  42. omnigenome/src/model/classification/model.py +642 -0
  43. omnigenome/src/model/embedding/__init__.py +12 -0
  44. omnigenome/src/model/embedding/model.py +263 -0
  45. omnigenome/src/model/mlm/__init__.py +12 -0
  46. omnigenome/src/model/mlm/model.py +177 -0
  47. omnigenome/src/model/module_utils.py +232 -0
  48. omnigenome/src/model/regression/__init__.py +12 -0
  49. omnigenome/src/model/regression/model.py +786 -0
  50. omnigenome/src/model/regression/resnet.py +483 -0
  51. omnigenome/src/model/rna_design/__init__.py +12 -0
  52. omnigenome/src/model/rna_design/model.py +426 -0
  53. omnigenome/src/model/seq2seq/__init__.py +12 -0
  54. omnigenome/src/model/seq2seq/model.py +44 -0
  55. omnigenome/src/tokenizer/__init__.py +16 -0
  56. omnigenome/src/tokenizer/bpe_tokenizer.py +226 -0
  57. omnigenome/src/tokenizer/kmers_tokenizer.py +247 -0
  58. omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +249 -0
  59. omnigenome/src/trainer/__init__.py +14 -0
  60. omnigenome/src/trainer/accelerate_trainer.py +739 -0
  61. omnigenome/src/trainer/hf_trainer.py +75 -0
  62. omnigenome/src/trainer/trainer.py +579 -0
  63. omnigenome/utility/__init__.py +3 -0
  64. omnigenome/utility/dataset_hub/__init__.py +13 -0
  65. omnigenome/utility/dataset_hub/dataset_hub.py +178 -0
  66. omnigenome/utility/ensemble.py +324 -0
  67. omnigenome/utility/hub_utils.py +517 -0
  68. omnigenome/utility/model_hub/__init__.py +12 -0
  69. omnigenome/utility/model_hub/model_hub.py +231 -0
  70. omnigenome/utility/pipeline_hub/__init__.py +12 -0
  71. omnigenome/utility/pipeline_hub/pipeline.py +483 -0
  72. omnigenome/utility/pipeline_hub/pipeline_hub.py +129 -0
  73. omnigenome-0.3.0a0.dist-info/METADATA +224 -0
  74. omnigenome-0.3.0a0.dist-info/RECORD +85 -0
  75. omnigenome-0.3.0a0.dist-info/WHEEL +5 -0
  76. omnigenome-0.3.0a0.dist-info/entry_points.txt +3 -0
  77. omnigenome-0.3.0a0.dist-info/licenses/LICENSE +201 -0
  78. omnigenome-0.3.0a0.dist-info/top_level.txt +2 -0
  79. tests/__init__.py +9 -0
  80. tests/conftest.py +160 -0
  81. tests/test_dataset_patterns.py +291 -0
  82. tests/test_examples_syntax.py +83 -0
  83. tests/test_model_loading.py +183 -0
  84. tests/test_rna_functions.py +255 -0
  85. tests/test_training_patterns.py +302 -0
omnigenome/src/abc/abstract_tokenizer.py
@@ -0,0 +1,267 @@
+ # -*- coding: utf-8 -*-
+ # file: omnigenome_wrapper.py
+ # time: 18:37 06/04/2024
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+ # github: https://github.com/yangheng95
+ # huggingface: https://huggingface.co/yangheng
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+ # Copyright (C) 2019-2024. All Rights Reserved.
+ import warnings
+
+ from transformers import AutoTokenizer
+
+ from ..misc.utils import env_meta_info, load_module_from_path
+
+
+ class OmniTokenizer:
+     """
+     A wrapper that gives tokenizers a consistent interface within OmniGenome.
+
+     It wraps an underlying tokenizer (typically from Hugging Face) and adds
+     functionality for genomic sequence processing. The class accommodates
+     different tokenization strategies and model architectures, and supports
+     custom tokenizer wrappers for specialized genomic tasks.
+
+     Attributes:
+         base_tokenizer: The underlying tokenizer instance (e.g., from Hugging Face).
+         max_length (int): The default maximum sequence length.
+         metadata (dict): Metadata about the tokenizer, including version info.
+         u2t (bool): Whether to convert 'U' to 'T'.
+         t2u (bool): Whether to convert 'T' to 'U'.
+         add_whitespace (bool): Whether to add whitespace between characters.
+     """
+
+     def __init__(self, base_tokenizer=None, max_length=512, **kwargs):
+         """
+         Initializes the tokenizer wrapper.
+
+         Args:
+             base_tokenizer: The underlying tokenizer instance (e.g., from Hugging Face).
+             max_length (int): The default maximum sequence length. Defaults to 512.
+             **kwargs: Additional keyword arguments.
+                 - u2t (bool): Whether to convert 'U' to 'T'. Defaults to False.
+                 - t2u (bool): Whether to convert 'T' to 'U'. Defaults to False.
+                 - add_whitespace (bool): Whether to add whitespace between
+                   characters. Defaults to False.
+
+         Example:
+             >>> # Initialize with a Hugging Face tokenizer
+             >>> from transformers import AutoTokenizer
+             >>> base_tokenizer = AutoTokenizer.from_pretrained("model_name")
+             >>> tokenizer = OmniTokenizer(base_tokenizer, max_length=512)
+
+             >>> # Initialize with sequence conversion
+             >>> tokenizer = OmniTokenizer(base_tokenizer, u2t=True)
+         """
+         self.metadata = env_meta_info()
+
+         self.base_tokenizer = base_tokenizer
+         self.max_length = max_length
+
+         for key, value in kwargs.items():
+             self.metadata[key] = value
+
+         self.u2t = kwargs.get("u2t", False)
+         self.t2u = kwargs.get("t2u", False)
+         self.add_whitespace = kwargs.get("add_whitespace", False)
+
+     @staticmethod
+     def from_pretrained(model_name_or_path, **kwargs):
+         """
+         Loads a tokenizer from a pre-trained model path.
+
+         It attempts to load a custom tokenizer wrapper if `omnigenome_wrapper.py`
+         is present in the model directory. Otherwise, it falls back to
+         `transformers.AutoTokenizer`.
+
+         Args:
+             model_name_or_path (str): The name or path of the pre-trained model.
+             **kwargs: Additional arguments for the tokenizer.
+
+         Returns:
+             The wrapped tokenizer if a custom wrapper was found; otherwise the
+             `transformers.AutoTokenizer` instance loaded as a fallback.
+
+         Example:
+             >>> # Load from a pre-trained model
+             >>> tokenizer = OmniTokenizer.from_pretrained("model_name")
+
+             >>> # Load with custom parameters
+             >>> tokenizer = OmniTokenizer.from_pretrained("model_name",
+             ...                                           trust_remote_code=True)
+         """
+         wrapper_path = f"{model_name_or_path.rstrip('/')}/omnigenome_wrapper.py"
+         try:
+             tokenizer_cls = load_module_from_path(
+                 "OmniTokenizerWrapper", wrapper_path
+             ).Tokenizer
+             tokenizer = tokenizer_cls(
+                 AutoTokenizer.from_pretrained(model_name_or_path, **kwargs), **kwargs
+             )
+         except Exception as e:
+             warnings.warn(
+                 f"No tokenizer wrapper found in {wrapper_path} -> Exception: {e}"
+             )
+             # num_labels is a model argument, not a tokenizer argument; drop it
+             # before calling AutoTokenizer.
+             kwargs.pop("num_labels", None)
+
+             tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
+
+         return tokenizer
+
+     def save_pretrained(self, save_directory):
+         """
+         Saves the base tokenizer to a directory.
+
+         Args:
+             save_directory (str): The directory to save the tokenizer to.
+
+         Example:
+             >>> tokenizer.save_pretrained("./saved_tokenizer")
+         """
+         self.base_tokenizer.save_pretrained(save_directory)
+
+     def __call__(self, *args, **kwargs):
+         """
+         Tokenizes inputs using the base tokenizer.
+
+         This method provides a convenient interface for tokenization with
+         sensible defaults for padding, truncation, and tensor conversion.
+
+         Args:
+             *args: Positional arguments for the base tokenizer.
+             **kwargs: Keyword arguments for the base tokenizer.
+
+         Returns:
+             dict: The output from the base tokenizer, typically containing
+                 'input_ids' and 'attention_mask'.
+
+         Example:
+             >>> # Tokenize a sequence
+             >>> inputs = tokenizer("ATCGATCG")
+             >>> print(inputs['input_ids'].shape)
+         """
+         padding = kwargs.pop("padding", True)
+         truncation = kwargs.pop("truncation", True)
+         max_length = kwargs.pop(
+             "max_length", self.max_length if self.max_length else 512
+         )
+         return_tensors = kwargs.pop("return_tensors", "pt")
+         return self.base_tokenizer(
+             *args,
+             padding=padding,
+             truncation=truncation,
+             max_length=max_length,
+             return_tensors=return_tensors,
+             **kwargs,
+         )
+
+     def tokenize(self, sequence, **kwargs):
+         """
+         Converts a sequence into a list of tokens. Must be implemented by subclasses.
+
+         Concrete tokenizer classes implement this method to define how
+         sequences are tokenized for their specific use case.
+
+         Args:
+             sequence (str): The input sequence.
+             **kwargs: Additional arguments.
+
+         Returns:
+             list: A list of tokens.
+
+         Raises:
+             NotImplementedError: If the method is not implemented by the subclass.
+
+         Example:
+             >>> # In a nucleotide tokenizer
+             >>> tokens = tokenizer.tokenize("ATCGATCG")
+             >>> print(tokens)  # ['A', 'T', 'C', 'G', 'A', 'T', 'C', 'G']
+         """
+         raise NotImplementedError(
+             "The tokenize() function should be adapted for different models;"
+             " please implement it for your model."
+         )
+
+     def encode(self, sequence, **kwargs):
+         """
+         Converts a sequence into a list of token IDs. Must be implemented by subclasses.
+
+         Concrete tokenizer classes implement this method to define how
+         sequences are encoded into token IDs.
+
+         Args:
+             sequence (str): The input sequence.
+             **kwargs: Additional arguments.
+
+         Returns:
+             list: A list of token IDs.
+
+         Raises:
+             NotImplementedError: If the method is not implemented by the subclass.
+
+         Example:
+             >>> # In a nucleotide tokenizer
+             >>> token_ids = tokenizer.encode("ATCGATCG")
+             >>> print(token_ids)  # [1, 2, 3, 4, 1, 2, 3, 4]
+         """
+         raise NotImplementedError(
+             "The encode() function should be adapted for different models;"
+             " please implement it for your model."
+         )
+
+     def decode(self, sequence, **kwargs):
+         """
+         Converts a list of token IDs back into a sequence. Must be implemented by subclasses.
+
+         Concrete tokenizer classes implement this method to define how token
+         IDs are decoded back into sequences.
+
+         Args:
+             sequence (list): A list of token IDs.
+             **kwargs: Additional arguments.
+
+         Returns:
+             str: The decoded sequence.
+
+         Raises:
+             NotImplementedError: If the method is not implemented by the subclass.
+
+         Example:
+             >>> # In a nucleotide tokenizer
+             >>> sequence = tokenizer.decode([1, 2, 3, 4])
+             >>> print(sequence)  # "ATCG"
+         """
+         raise NotImplementedError(
+             "The decode() function should be adapted for different models;"
+             " please implement it for your model."
+         )
+
+     def __getattribute__(self, item):
+         """
+         Custom attribute getter that falls back to the base tokenizer if an
+         attribute is not found on the wrapper.
+
+         This provides transparent access to the base tokenizer's attributes,
+         allowing the wrapper to be used as a drop-in replacement for it.
+
+         Args:
+             item (str): The attribute name to get.
+
+         Returns:
+             The attribute value from either the wrapper or the base tokenizer.
+
+         Raises:
+             AttributeError: If the attribute is not found on either the wrapper
+                 or the base tokenizer.
+         """
+         try:
+             return super().__getattribute__(item)
+         except AttributeError:
+             try:
+                 return self.base_tokenizer.__getattribute__(item)
+             except (AttributeError, RecursionError) as e:
+                 raise AttributeError(
+                     f"'{self.__class__.__name__}' object has no attribute '{item}'"
+                 ) from e
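
For orientation, here is a minimal usage sketch of the wrapper above. It is not the package's documented API: the import path simply mirrors the file layout in the list at the top, "model_name_or_path" is a placeholder rather than a known checkpoint, and the toy subclass is illustrative only (unrelated to the package's own single_nucleotide_tokenizer.py).

    from transformers import AutoTokenizer

    from omnigenome.src.abc.abstract_tokenizer import OmniTokenizer


    class ToyNucleotideTokenizer(OmniTokenizer):
        """Toy subclass satisfying the abstract tokenize/encode/decode contract."""

        def tokenize(self, sequence, **kwargs):
            # One token per nucleotide: "ATCG" -> ['A', 'T', 'C', 'G'].
            return list(sequence)

        def encode(self, sequence, **kwargs):
            return self.base_tokenizer.convert_tokens_to_ids(self.tokenize(sequence))

        def decode(self, sequence, **kwargs):
            return "".join(self.base_tokenizer.convert_ids_to_tokens(sequence))


    base = AutoTokenizer.from_pretrained("model_name_or_path")
    tokenizer = ToyNucleotideTokenizer(base, max_length=512)

    # __call__ forwards to the wrapped tokenizer with padding/truncation
    # defaults and returns PyTorch tensors ("pt").
    inputs = tokenizer("ATCGATCG")
    print(inputs["input_ids"].shape)

    # Unknown attributes fall through to the wrapped tokenizer.
    print(tokenizer.vocab_size)

Note that from_pretrained() first looks for omnigenome_wrapper.py next to the checkpoint and only falls back to a plain transformers.AutoTokenizer (with a warning) when loading the wrapper fails, so the object returned depends on what the model directory ships.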
omnigenome/src/dataset/__init__.py
@@ -0,0 +1,16 @@
+ # -*- coding: utf-8 -*-
+ # file: __init__.py
+ # time: 22:33 08/04/2024
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+ # github: https://github.com/yangheng95
+ # huggingface: https://huggingface.co/yangheng
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+ # Copyright (C) 2019-2024. All Rights Reserved.
+ """
+ This package contains dataset-related modules.
+ """
+
+ from .omni_dataset import OmniDatasetForSequenceClassification
+ from .omni_dataset import OmniDatasetForSequenceRegression
+ from .omni_dataset import OmniDatasetForTokenClassification
+ from .omni_dataset import OmniDatasetForTokenRegression
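
These re-exports make the four task-specific dataset classes importable from the subpackage directly. A hedged sketch follows; the commented-out constructor arguments are assumptions for illustration only, since omni_dataset.py's signatures are not shown in this diff:

    from omnigenome.src.dataset import (
        OmniDatasetForSequenceClassification,
        OmniDatasetForTokenRegression,
    )

    # Hypothetical instantiation; the argument names below are assumed, check
    # omnigenome/src/dataset/omni_dataset.py for the real signature.
    # dataset = OmniDatasetForSequenceClassification(
    #     data_source="train.json", tokenizer=tokenizer, max_length=512
    # )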