omnigenome-0.3.1a0-py3-none-any.whl → omnigenome-0.4.0a0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. omnigenome/__init__.py +304 -266
  2. omnigenome-0.4.0a0.dist-info/METADATA +354 -0
  3. omnigenome-0.4.0a0.dist-info/RECORD +7 -0
  4. omnigenome/auto/__init__.py +0 -3
  5. omnigenome/auto/auto_bench/__init__.py +0 -11
  6. omnigenome/auto/auto_bench/auto_bench.py +0 -494
  7. omnigenome/auto/auto_bench/auto_bench_cli.py +0 -230
  8. omnigenome/auto/auto_bench/auto_bench_config.py +0 -216
  9. omnigenome/auto/auto_bench/config_check.py +0 -34
  10. omnigenome/auto/auto_train/__init__.py +0 -12
  11. omnigenome/auto/auto_train/auto_train.py +0 -429
  12. omnigenome/auto/auto_train/auto_train_cli.py +0 -222
  13. omnigenome/auto/bench_hub/__init__.py +0 -11
  14. omnigenome/auto/bench_hub/bench_hub.py +0 -25
  15. omnigenome/cli/__init__.py +0 -12
  16. omnigenome/cli/commands/__init__.py +0 -12
  17. omnigenome/cli/commands/base.py +0 -83
  18. omnigenome/cli/commands/bench/__init__.py +0 -12
  19. omnigenome/cli/commands/bench/bench_cli.py +0 -202
  20. omnigenome/cli/commands/rna/__init__.py +0 -12
  21. omnigenome/cli/commands/rna/rna_design.py +0 -177
  22. omnigenome/cli/omnigenome_cli.py +0 -128
  23. omnigenome/src/__init__.py +0 -11
  24. omnigenome/src/abc/__init__.py +0 -11
  25. omnigenome/src/abc/abstract_dataset.py +0 -641
  26. omnigenome/src/abc/abstract_metric.py +0 -114
  27. omnigenome/src/abc/abstract_model.py +0 -690
  28. omnigenome/src/abc/abstract_tokenizer.py +0 -269
  29. omnigenome/src/dataset/__init__.py +0 -16
  30. omnigenome/src/dataset/omni_dataset.py +0 -437
  31. omnigenome/src/lora/__init__.py +0 -12
  32. omnigenome/src/lora/lora_model.py +0 -300
  33. omnigenome/src/metric/__init__.py +0 -15
  34. omnigenome/src/metric/classification_metric.py +0 -184
  35. omnigenome/src/metric/metric.py +0 -199
  36. omnigenome/src/metric/ranking_metric.py +0 -142
  37. omnigenome/src/metric/regression_metric.py +0 -191
  38. omnigenome/src/misc/__init__.py +0 -3
  39. omnigenome/src/misc/utils.py +0 -503
  40. omnigenome/src/model/__init__.py +0 -19
  41. omnigenome/src/model/augmentation/__init__.py +0 -11
  42. omnigenome/src/model/augmentation/model.py +0 -219
  43. omnigenome/src/model/classification/__init__.py +0 -11
  44. omnigenome/src/model/classification/model.py +0 -638
  45. omnigenome/src/model/embedding/__init__.py +0 -11
  46. omnigenome/src/model/embedding/model.py +0 -263
  47. omnigenome/src/model/mlm/__init__.py +0 -11
  48. omnigenome/src/model/mlm/model.py +0 -177
  49. omnigenome/src/model/module_utils.py +0 -232
  50. omnigenome/src/model/regression/__init__.py +0 -11
  51. omnigenome/src/model/regression/model.py +0 -781
  52. omnigenome/src/model/regression/resnet.py +0 -483
  53. omnigenome/src/model/rna_design/__init__.py +0 -11
  54. omnigenome/src/model/rna_design/model.py +0 -476
  55. omnigenome/src/model/seq2seq/__init__.py +0 -11
  56. omnigenome/src/model/seq2seq/model.py +0 -44
  57. omnigenome/src/tokenizer/__init__.py +0 -16
  58. omnigenome/src/tokenizer/bpe_tokenizer.py +0 -226
  59. omnigenome/src/tokenizer/kmers_tokenizer.py +0 -247
  60. omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +0 -249
  61. omnigenome/src/trainer/__init__.py +0 -14
  62. omnigenome/src/trainer/accelerate_trainer.py +0 -747
  63. omnigenome/src/trainer/hf_trainer.py +0 -75
  64. omnigenome/src/trainer/trainer.py +0 -591
  65. omnigenome/utility/__init__.py +0 -3
  66. omnigenome/utility/dataset_hub/__init__.py +0 -12
  67. omnigenome/utility/dataset_hub/dataset_hub.py +0 -178
  68. omnigenome/utility/ensemble.py +0 -324
  69. omnigenome/utility/hub_utils.py +0 -517
  70. omnigenome/utility/model_hub/__init__.py +0 -11
  71. omnigenome/utility/model_hub/model_hub.py +0 -232
  72. omnigenome/utility/pipeline_hub/__init__.py +0 -11
  73. omnigenome/utility/pipeline_hub/pipeline.py +0 -483
  74. omnigenome/utility/pipeline_hub/pipeline_hub.py +0 -129
  75. omnigenome-0.3.1a0.dist-info/METADATA +0 -224
  76. omnigenome-0.3.1a0.dist-info/RECORD +0 -78
  77. {omnigenome-0.3.1a0.dist-info → omnigenome-0.4.0a0.dist-info}/WHEEL +0 -0
  78. {omnigenome-0.3.1a0.dist-info → omnigenome-0.4.0a0.dist-info}/entry_points.txt +0 -0
  79. {omnigenome-0.3.1a0.dist-info → omnigenome-0.4.0a0.dist-info}/licenses/LICENSE +0 -0
  80. {omnigenome-0.3.1a0.dist-info → omnigenome-0.4.0a0.dist-info}/top_level.txt +0 -0
@@ -1,269 +0,0 @@
- # -*- coding: utf-8 -*-
- # file: omnigenome_wrapper.py
- # time: 18:37 06/04/2024
- # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
- # github: https://github.com/yangheng95
- # huggingface: https://huggingface.co/yangheng
- # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
- # Copyright (C) 2019-2024. All Rights Reserved.
- import warnings
-
- from transformers import AutoTokenizer
-
- from ..misc.utils import env_meta_info, load_module_from_path
-
-
- class OmniTokenizer:
-     """
-     A wrapper class for tokenizers to provide a consistent interface within OmniGenome.
-
-     This class provides a unified interface for tokenizers in the OmniGenome framework.
-     It wraps underlying tokenizers (typically from Hugging Face) and provides
-     additional functionality for genomic sequence processing.
-
-     The class handles various tokenization strategies and provides compatibility
-     with different model architectures. It also supports custom tokenizer wrappers
-     for specialized genomic tasks.
-
-     Attributes:
-         base_tokenizer: The underlying tokenizer instance (e.g., from Hugging Face).
-         max_length (int): The default maximum sequence length.
-         metadata (dict): Metadata about the tokenizer including version info.
-         u2t (bool): Whether to convert 'U' to 'T'.
-         t2u (bool): Whether to convert 'T' to 'U'.
-         add_whitespace (bool): Whether to add whitespace between characters.
-     """
-
-     def __init__(self, base_tokenizer=None, max_length=512, **kwargs):
-         """
-         Initializes the tokenizer wrapper.
-
-         Args:
-             base_tokenizer: The underlying tokenizer instance (e.g., from Hugging Face).
-             max_length (int): The default maximum sequence length. Defaults to 512.
-             **kwargs: Additional keyword arguments.
-                 - u2t (bool): Whether to convert 'U' to 'T'. Defaults to False.
-                 - t2u (bool): Whether to convert 'T' to 'U'. Defaults to False.
-                 - add_whitespace (bool): Whether to add whitespace between characters.
-                   Defaults to False.
-
-         Example:
-             >>> # Initialize with a Hugging Face tokenizer
-             >>> from transformers import AutoTokenizer
-             >>> base_tokenizer = AutoTokenizer.from_pretrained("model_name")
-             >>> tokenizer = OmniTokenizer(base_tokenizer, max_length=512)
-
-             >>> # Initialize with sequence conversion
-             >>> tokenizer = OmniTokenizer(base_tokenizer, u2t=True)
-         """
-         self.metadata = env_meta_info()
-
-         self.base_tokenizer = base_tokenizer
-         self.max_length = max_length
-
-         for key, value in kwargs.items():
-             self.metadata[key] = value
-
-         self.u2t = kwargs.get("u2t", False)
-         self.t2u = kwargs.get("t2u", False)
-         self.add_whitespace = kwargs.get("add_whitespace", False)
-
-     @staticmethod
-     def from_pretrained(model_name_or_path, **kwargs):
-         """
-         Loads a tokenizer from a pre-trained model path.
-
-         It attempts to load a custom tokenizer wrapper if `omnigenome_wrapper.py`
-         is present in the model directory. Otherwise, it falls back to
-         `transformers.AutoTokenizer`.
-
-         Args:
-             model_name_or_path (str): The name or path of the pre-trained model.
-             **kwargs: Additional arguments for the tokenizer.
-
-         Returns:
-             OmniTokenizer: An instance of a tokenizer.
-
-         Example:
-             >>> # Load from a pre-trained model
-             >>> tokenizer = OmniTokenizer.from_pretrained("model_name")
-
-             >>> # Load with custom parameters
-             >>> tokenizer = OmniTokenizer.from_pretrained("model_name",
-             ...                                           trust_remote_code=True)
-         """
-         wrapper_path = f"{model_name_or_path.rstrip('/')}/omnigenome_wrapper.py"
-         try:
-             tokenizer_cls = load_module_from_path(
-                 "OmniTokenizerWrapper", wrapper_path
-             ).Tokenizer
-             tokenizer = tokenizer_cls(
-                 AutoTokenizer.from_pretrained(model_name_or_path, **kwargs), **kwargs
-             )
-         except Exception as e:
-             warnings.warn(
-                 f"No tokenizer wrapper found in {wrapper_path} -> Exception: {e}"
-             )
-             kwargs.pop(
-                 "num_labels", None
-             ) # Remove num_labels if it exists, as it may not be applicable
-
-             tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
-
-         return tokenizer
-
-     def save_pretrained(self, save_directory):
-         """
-         Saves the base tokenizer to a directory.
-
-         Args:
-             save_directory (str): The directory to save the tokenizer to.
-
-         Example:
-             >>> tokenizer.save_pretrained("./saved_tokenizer")
-         """
-         self.base_tokenizer.save_pretrained(save_directory)
-
-     def __call__(self, *args, **kwargs):
-         """
-         Tokenizes inputs using the base tokenizer.
-
-         This method provides a convenient interface for tokenization with
-         sensible defaults for padding, truncation, and tensor conversion.
-
-         Args:
-             *args: Positional arguments for the base tokenizer.
-             **kwargs: Keyword arguments for the base tokenizer.
-
-         Returns:
-             dict: The output from the base tokenizer, typically containing
-                 'input_ids' and 'attention_mask'.
-
-         Example:
-             >>> # Tokenize a sequence
-             >>> inputs = tokenizer("ATCGATCG")
-             >>> print(inputs['input_ids'].shape)
-         """
-         padding = kwargs.pop("padding", True)
-         truncation = kwargs.pop("truncation", True)
-         max_length = kwargs.pop(
-             "max_length", self.max_length if self.max_length else 512
-         )
-         return_tensor = kwargs.pop("return_tensors", "pt")
-         return self.base_tokenizer(
-             padding=padding,
-             truncation=truncation,
-             max_length=max_length,
-             return_tensors=return_tensor,
-             *args,
-             **kwargs,
-         )
-
-     def tokenize(self, sequence, **kwargs):
-         """
-         Converts a sequence into a list of tokens. Must be implemented by subclasses.
-
-         This method should be implemented by concrete tokenizer classes to define
-         how sequences are tokenized for their specific use case.
-
-         Args:
-             sequence (str): The input sequence.
-             **kwargs: Additional arguments.
-
-         Returns:
-             list: A list of tokens.
-
-         Raises:
-             NotImplementedError: If the method is not implemented by the subclass.
-
-         Example:
-             >>> # In a nucleotide tokenizer
-             >>> tokens = tokenizer.tokenize("ATCGATCG")
-             >>> print(tokens) # ['A', 'T', 'C', 'G', 'A', 'T', 'C', 'G']
-         """
-         raise NotImplementedError(
-             "The tokenize() function should be adapted for different models,"
-             " please implement it for your model."
-         )
-
-     def encode(self, sequence, **kwargs):
-         """
-         Converts a sequence into a list of token IDs. Must be implemented by subclasses.
-
-         This method should be implemented by concrete tokenizer classes to define
-         how sequences are encoded into token IDs.
-
-         Args:
-             sequence (str): The input sequence.
-             **kwargs: Additional arguments.
-
-         Returns:
-             list: A list of token IDs.
-
-         Raises:
-             NotImplementedError: If the method is not implemented by the subclass.
-
-         Example:
-             >>> # In a nucleotide tokenizer
-             >>> token_ids = tokenizer.encode("ATCGATCG")
-             >>> print(token_ids) # [1, 2, 3, 4, 1, 2, 3, 4]
-         """
-         raise NotImplementedError(
-             "The encode() function should be adapted for different models,"
-             " please implement it for your model."
-         )
-
-     def decode(self, sequence, **kwargs):
-         """
-         Converts a list of token IDs back into a sequence. Must be implemented by subclasses.
-
-         This method should be implemented by concrete tokenizer classes to define
-         how token IDs are decoded back into sequences.
-
-         Args:
-             sequence (list): A list of token IDs.
-             **kwargs: Additional arguments.
-
-         Returns:
-             str: The decoded sequence.
-
-         Raises:
-             NotImplementedError: If the method is not implemented by the subclass.
-
-         Example:
-             >>> # In a nucleotide tokenizer
-             >>> sequence = tokenizer.decode([1, 2, 3, 4])
-             >>> print(sequence) # "ATCG"
-         """
-         raise NotImplementedError(
-             "The decode() function should be adapted for different models,"
-             " please implement it for your model."
-         )
-
-     def __getattribute__(self, item):
-         """
-         Custom attribute getter that falls back to the base tokenizer if an
-         attribute is not found on the wrapper.
-
-         This method provides transparent access to the base tokenizer's attributes,
-         allowing the wrapper to be used as a drop-in replacement for the base tokenizer.
-
-         Args:
-             item (str): The attribute name to get.
-
-         Returns:
-             The attribute value from either the wrapper or the base tokenizer.
-
-         Raises:
-             AttributeError: If the attribute is not found on either the wrapper
-                 or the base tokenizer.
-         """
-         try:
-             return super().__getattribute__(item)
-         except AttributeError:
-             try:
-                 return self.base_tokenizer.__getattribute__(item)
-             except (AttributeError, RecursionError) as e:
-                 raise AttributeError(
-                     f"'{self.__class__.__name__}' object has no attribute '{item}'"
-                 ) from e
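
The deleted `__getattribute__` above is the core of the wrapper design: any attribute not found on `OmniTokenizer` itself is transparently resolved on `base_tokenizer`, which is what lets the wrapper act as a drop-in replacement for the underlying tokenizer. A minimal standalone sketch of that delegation pattern (the class names below are illustrative, not part of the omnigenome API):

# Sketch of the attribute-delegation pattern used by the removed
# OmniTokenizer.__getattribute__. Names here are illustrative only.
class Wrapper:
    def __init__(self, base):
        self.base = base

    def __getattribute__(self, item):
        try:
            # First try the wrapper's own attributes (including `base`).
            return super().__getattribute__(item)
        except AttributeError:
            try:
                # Fall back to the wrapped object, mirroring the fallback
                # to `base_tokenizer` in the deleted code.
                return super().__getattribute__("base").__getattribute__(item)
            except (AttributeError, RecursionError) as e:
                raise AttributeError(
                    f"'{self.__class__.__name__}' object has no attribute '{item}'"
                ) from e


class FakeTokenizer:
    vocab_size = 4


wrapped = Wrapper(FakeTokenizer())
print(wrapped.vocab_size)  # 4 -- resolved on the wrapped FakeTokenizer

The `RecursionError` guard in the original is not decorative: the fallback branch reads `self.base_tokenizer`, which re-enters `__getattribute__`, so if `base_tokenizer` were ever unset the lookup would recurse until it hit the recursion limit rather than raise a clean `AttributeError`.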
@@ -1,16 +0,0 @@
- # -*- coding: utf-8 -*-
- # file: __init__.py
- # time: 22:33 08/04/2024
- # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
- # github: https://github.com/yangheng95
- # huggingface: https://huggingface.co/yangheng
- # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
- # Copyright (C) 2019-2024. All Rights Reserved.
- """
- This package contains dataset-related modules.
- """
-
- from .omni_dataset import OmniDatasetForSequenceClassification
- from .omni_dataset import OmniDatasetForSequenceRegression
- from .omni_dataset import OmniDatasetForTokenClassification
- from .omni_dataset import OmniDatasetForTokenRegression
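
These re-exports let 0.3.1a0 callers import the four dataset classes from the subpackage rather than from `omni_dataset` directly; with the file deleted in 0.4.0a0, those import paths fail. A hedged sketch of how downstream code might cope (assumption: the diff does not show a replacement path in 0.4.0a0):

# Worked in 0.3.1a0 via the re-exports in the deleted __init__.py;
# guard the import so the removal fails loudly but controllably.
try:
    from omnigenome.src.dataset import OmniDatasetForSequenceClassification
except ImportError:
    OmniDatasetForSequenceClassification = None  # path removed in 0.4.0a0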