omnigenome 0.3.0a1__py3-none-any.whl → 0.3.3a0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.


Files changed (79)
  1. omnigenome/__init__.py +252 -258
  2. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/METADATA +10 -10
  3. omnigenome-0.3.3a0.dist-info/RECORD +7 -0
  4. omnigenome/auto/__init__.py +0 -3
  5. omnigenome/auto/auto_bench/__init__.py +0 -12
  6. omnigenome/auto/auto_bench/auto_bench.py +0 -484
  7. omnigenome/auto/auto_bench/auto_bench_cli.py +0 -230
  8. omnigenome/auto/auto_bench/auto_bench_config.py +0 -216
  9. omnigenome/auto/auto_bench/config_check.py +0 -34
  10. omnigenome/auto/auto_train/__init__.py +0 -13
  11. omnigenome/auto/auto_train/auto_train.py +0 -430
  12. omnigenome/auto/auto_train/auto_train_cli.py +0 -222
  13. omnigenome/auto/bench_hub/__init__.py +0 -12
  14. omnigenome/auto/bench_hub/bench_hub.py +0 -25
  15. omnigenome/cli/__init__.py +0 -13
  16. omnigenome/cli/commands/__init__.py +0 -13
  17. omnigenome/cli/commands/base.py +0 -83
  18. omnigenome/cli/commands/bench/__init__.py +0 -13
  19. omnigenome/cli/commands/bench/bench_cli.py +0 -202
  20. omnigenome/cli/commands/rna/__init__.py +0 -13
  21. omnigenome/cli/commands/rna/rna_design.py +0 -178
  22. omnigenome/cli/omnigenome_cli.py +0 -128
  23. omnigenome/src/__init__.py +0 -12
  24. omnigenome/src/abc/__init__.py +0 -12
  25. omnigenome/src/abc/abstract_dataset.py +0 -622
  26. omnigenome/src/abc/abstract_metric.py +0 -114
  27. omnigenome/src/abc/abstract_model.py +0 -689
  28. omnigenome/src/abc/abstract_tokenizer.py +0 -267
  29. omnigenome/src/dataset/__init__.py +0 -16
  30. omnigenome/src/dataset/omni_dataset.py +0 -435
  31. omnigenome/src/lora/__init__.py +0 -13
  32. omnigenome/src/lora/lora_model.py +0 -294
  33. omnigenome/src/metric/__init__.py +0 -15
  34. omnigenome/src/metric/classification_metric.py +0 -184
  35. omnigenome/src/metric/metric.py +0 -199
  36. omnigenome/src/metric/ranking_metric.py +0 -142
  37. omnigenome/src/metric/regression_metric.py +0 -191
  38. omnigenome/src/misc/__init__.py +0 -3
  39. omnigenome/src/misc/utils.py +0 -499
  40. omnigenome/src/model/__init__.py +0 -19
  41. omnigenome/src/model/augmentation/__init__.py +0 -12
  42. omnigenome/src/model/augmentation/model.py +0 -219
  43. omnigenome/src/model/classification/__init__.py +0 -12
  44. omnigenome/src/model/classification/model.py +0 -642
  45. omnigenome/src/model/embedding/__init__.py +0 -12
  46. omnigenome/src/model/embedding/model.py +0 -263
  47. omnigenome/src/model/mlm/__init__.py +0 -12
  48. omnigenome/src/model/mlm/model.py +0 -177
  49. omnigenome/src/model/module_utils.py +0 -232
  50. omnigenome/src/model/regression/__init__.py +0 -12
  51. omnigenome/src/model/regression/model.py +0 -786
  52. omnigenome/src/model/regression/resnet.py +0 -483
  53. omnigenome/src/model/rna_design/__init__.py +0 -12
  54. omnigenome/src/model/rna_design/model.py +0 -469
  55. omnigenome/src/model/seq2seq/__init__.py +0 -12
  56. omnigenome/src/model/seq2seq/model.py +0 -44
  57. omnigenome/src/tokenizer/__init__.py +0 -16
  58. omnigenome/src/tokenizer/bpe_tokenizer.py +0 -226
  59. omnigenome/src/tokenizer/kmers_tokenizer.py +0 -247
  60. omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +0 -249
  61. omnigenome/src/trainer/__init__.py +0 -14
  62. omnigenome/src/trainer/accelerate_trainer.py +0 -739
  63. omnigenome/src/trainer/hf_trainer.py +0 -75
  64. omnigenome/src/trainer/trainer.py +0 -579
  65. omnigenome/utility/__init__.py +0 -3
  66. omnigenome/utility/dataset_hub/__init__.py +0 -13
  67. omnigenome/utility/dataset_hub/dataset_hub.py +0 -178
  68. omnigenome/utility/ensemble.py +0 -324
  69. omnigenome/utility/hub_utils.py +0 -517
  70. omnigenome/utility/model_hub/__init__.py +0 -12
  71. omnigenome/utility/model_hub/model_hub.py +0 -231
  72. omnigenome/utility/pipeline_hub/__init__.py +0 -12
  73. omnigenome/utility/pipeline_hub/pipeline.py +0 -483
  74. omnigenome/utility/pipeline_hub/pipeline_hub.py +0 -129
  75. omnigenome-0.3.0a1.dist-info/RECORD +0 -78
  76. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/WHEEL +0 -0
  77. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/entry_points.txt +0 -0
  78. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/licenses/LICENSE +0 -0
  79. {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.3a0.dist-info}/top_level.txt +0 -0
omnigenome/src/dataset/omni_dataset.py
@@ -1,435 +0,0 @@
- # -*- coding: utf-8 -*-
- # file: abstract_dataset.py
- # time: 14:13 06/04/2024
- # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
- # github: https://github.com/yangheng95
- # huggingface: https://huggingface.co/yangheng
- # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
- # Copyright (C) 2019-2024. All Rights Reserved.
- """
- Specialized dataset classes for OmniGenome framework.
-
- This module provides specialized dataset classes for various genomic tasks,
- inheriting from the abstract `OmniDataset`. These classes handle data preparation
- for token classification, sequence classification, token regression, and sequence regression,
- integrating with tokenizers and managing metadata.
- """
- import json
-
- import numpy as np
- import torch
-
- from ..abc.abstract_dataset import OmniDataset
- from ..misc.utils import fprint
- from ... import __name__, __version__
-
-
- class OmniDatasetForTokenClassification(OmniDataset):
-     """
-     Dataset class specifically designed for token classification tasks in genomics.
-
-     This class extends `OmniDataset` to provide functionalities for preparing input sequences
-     and their corresponding token-level labels. It's designed for tasks where each token
-     in a sequence needs to be classified independently.
-
-     Attributes:
-         metadata: Dictionary containing dataset metadata including library information
-         label2id: Mapping from label strings to integer IDs
-     """
-
-     def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
-         """
-         Initialize the dataset for token classification.
-
-         Args:
-             data_source: Path to the data file or a list of paths.
-                 Supported formats depend on the `OmniDataset` implementation.
-             tokenizer: The tokenizer instance to use for converting sequences into
-                 tokenized inputs.
-             max_length: The maximum sequence length for tokenization. Sequences longer
-                 than this will be truncated. If None, a default or tokenizer's
-                 max length will be used.
-             **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
-         """
-         super(OmniDatasetForTokenClassification, self).__init__(
-             data_source, tokenizer, max_length, **kwargs
-         )
-         self.metadata.update(
-             {
-                 "library_name": __name__,
-                 "omnigenome_version": __version__,
-                 "task": "genome_token_classification",
-             }
-         )
-
-         for key, value in kwargs.items():
-             self.metadata[key] = value
-
-     def prepare_input(self, instance, **kwargs):
-         """
-         Prepare a single data instance for token classification.
-
-         This method handles both string sequences and dictionary instances
-         containing sequence and label information. It tokenizes the input
-         sequence and prepares token-level labels for classification.
-
-         Args:
-             instance: A single data instance. Can be a string representing the sequence
-                 or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
-             **kwargs: Additional keyword arguments for tokenization, such as 'padding'
-                 and 'truncation'.
-
-         Returns:
-             dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
-                 and 'labels' (tensor of token-level labels).
-
-         Raises:
-             Exception: If the input instance format is unknown or if a dictionary
-                 instance does not contain a 'seq' or 'sequence' key.
-         """
-         labels = -100
-         if isinstance(instance, str):
-             sequence = instance
-         elif isinstance(instance, dict):
-             sequence = (
-                 instance.get("seq", None)
-                 if "seq" in instance
-                 else instance.get("sequence", None)
-             )
-             label = instance.get("label", None)
-             labels = instance.get("labels", None)
-             labels = labels if labels is not None else label
-             if not sequence:
-                 raise Exception(
-                     "The input instance must contain a 'seq' or 'sequence' key."
-                 )
-         else:
-             raise Exception("Unknown instance format.")
-
-         tokenized_inputs = self.tokenizer(
-             sequence,
-             padding=kwargs.get("padding", "do_not_pad"),
-             truncation=kwargs.get("truncation", True),
-             max_length=self.max_length,
-             return_tensors="pt",
-         )
-         for col in tokenized_inputs:
-             tokenized_inputs[col] = tokenized_inputs[col].squeeze()
-
-         if labels is not None:
-             if len(set(self.label2id.keys()) | set([str(l) for l in labels])) != len(
-                 set(self.label2id.keys())
-             ):
-                 fprint(
-                     f"Warning: The labels <{labels}> in the input instance do not match the label2id mapping."
-                 )
-             labels = (
-                 [-100]
-                 + [self.label2id.get(str(l), -100) for l in labels][
-                     : self.max_length - 2
-                 ]
-                 + [-100]
-             )
-
-         tokenized_inputs["labels"] = torch.tensor(labels)
-         return tokenized_inputs
-
-
- class OmniDatasetForSequenceClassification(OmniDataset):
-     """
-     Dataset class for sequence classification tasks in genomics.
-
-     This class extends `OmniDataset` to prepare input sequences and their corresponding
-     sequence-level labels. It's designed for tasks where the entire sequence needs
-     to be classified into one of several categories.
-
-     Attributes:
-         metadata: Dictionary containing dataset metadata including library information
-         label2id: Mapping from label strings to integer IDs
-     """
-
-     def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
-         """
-         Initialize the dataset for sequence classification.
-
-         Args:
-             data_source: Path to the data file or a list of paths.
-                 Supported formats depend on the `OmniDataset` implementation.
-             tokenizer: The tokenizer instance to use for converting sequences into
-                 tokenized inputs.
-             max_length: The maximum sequence length for tokenization. Sequences longer
-                 than this will be truncated. If None, a default or tokenizer's
-                 max length will be used.
-             **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
-         """
-         super(OmniDatasetForSequenceClassification, self).__init__(
-             data_source, tokenizer, max_length, **kwargs
-         )
-
-         self.metadata.update(
-             {
-                 "library_name": __name__,
-                 "omnigenome_version": __version__,
-                 "task": "genome_sequence_classification",
-             }
-         )
-         for key, value in kwargs.items():
-             self.metadata[key] = value
-
-     def prepare_input(self, instance, **kwargs):
-         """
-         Prepare a single data instance for sequence classification.
-
-         This method handles both string sequences and dictionary instances
-         containing sequence and label information. It tokenizes the input
-         sequence and prepares sequence-level labels for classification.
-
-         Args:
-             instance: A single data instance. Can be a string representing the sequence
-                 or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
-             **kwargs: Additional keyword arguments for tokenization, such as 'padding'
-                 and 'truncation'.
-
-         Returns:
-             dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
-                 and 'labels' (tensor of sequence-level labels).
-
-         Raises:
-             Exception: If the input instance format is unknown or if a dictionary
-                 instance does not contain a 'label' or 'labels' key, or if
-                 the label is not an integer.
-         """
-         labels = -100
-         if isinstance(instance, str):
-             sequence = instance
-         elif isinstance(instance, dict):
-             sequence = (
-                 instance.get("seq", None)
-                 if "seq" in instance
-                 else instance.get("sequence", None)
-             )
-             label = instance.get("label", None)
-             labels = instance.get("labels", None)
-             labels = labels if labels is not None else label
-         else:
-             raise Exception("Unknown instance format.")
-
-         tokenized_inputs = self.tokenizer(
-             sequence,
-             padding=kwargs.get("padding", "do_not_pad"),
-             truncation=kwargs.get("truncation", True),
-             max_length=self.max_length,
-             return_tensors="pt",
-         )
-         for col in tokenized_inputs:
-             tokenized_inputs[col] = tokenized_inputs[col].squeeze()
-
-         if labels is not None:
-             if not isinstance(labels, int):
-                 raise Exception(
-                     "The label must be an integer for sequence classification."
-                 )
-             labels = self.label2id.get(str(labels), -100)
-
-         tokenized_inputs["labels"] = torch.tensor(labels)
-         return tokenized_inputs
-
-
- class OmniDatasetForTokenRegression(OmniDataset):
-     """
-     Dataset class for token regression tasks in genomics.
-
-     This class extends `OmniDataset` to prepare input sequences and their corresponding
-     token-level regression targets. It's designed for tasks where each token in a
-     sequence needs to be assigned a continuous value.
-
-     Attributes:
-         metadata: Dictionary containing dataset metadata including library information
-     """
-
-     def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
-         """
-         Initialize the dataset for token regression.
-
-         Args:
-             data_source: Path to the data file or a list of paths.
-                 Supported formats depend on the `OmniDataset` implementation.
-             tokenizer: The tokenizer instance to use for converting sequences into
-                 tokenized inputs.
-             max_length: The maximum sequence length for tokenization. Sequences longer
-                 than this will be truncated. If None, a default or tokenizer's
-                 max length will be used.
-             **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
-         """
-         super(OmniDatasetForTokenRegression, self).__init__(
-             data_source, tokenizer, max_length, **kwargs
-         )
-
-         self.metadata.update(
-             {
-                 "library_name": __name__,
-                 "omnigenome_version": __version__,
-                 "task": "genome_token_regression",
-             }
-         )
-         for key, value in kwargs.items():
-             self.metadata[key] = value
-
-     def prepare_input(self, instance, **kwargs):
-         """
-         Prepare a single data instance for token regression.
-
-         This method handles both string sequences and dictionary instances
-         containing sequence and regression target information. It tokenizes
-         the input sequence and prepares token-level regression targets.
-
-         Args:
-             instance: A single data instance. Can be a string representing the sequence
-                 or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
-             **kwargs: Additional keyword arguments for tokenization, such as 'padding'
-                 and 'truncation'.
-
-         Returns:
-             dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
-                 and 'labels' (tensor of token-level regression targets).
-
-         Raises:
-             Exception: If the input instance format is unknown or if a dictionary
-                 instance does not contain a 'seq' or 'sequence' key.
-         """
-         labels = -100
-         if isinstance(instance, str):
-             sequence = instance
-         elif isinstance(instance, dict):
-             sequence = (
-                 instance.get("seq", None)
-                 if "seq" in instance
-                 else instance.get("sequence", None)
-             )
-             label = instance.get("label", None)
-             labels = instance.get("labels", None)
-             labels = labels if labels is not None else label
-             if not sequence:
-                 raise Exception(
-                     "The input instance must contain a 'seq' or 'sequence' key."
-                 )
-         else:
-             raise Exception("Unknown instance format.")
-
-         tokenized_inputs = self.tokenizer(
-             sequence,
-             padding=kwargs.get("padding", "do_not_pad"),
-             truncation=kwargs.get("truncation", True),
-             max_length=self.max_length,
-             return_tensors="pt",
-         )
-         for col in tokenized_inputs:
-             tokenized_inputs[col] = tokenized_inputs[col].squeeze()
-
-         if labels is not None:
-             # Handle token-level regression labels
-             if isinstance(labels, (list, tuple)):
-                 # Ensure labels match sequence length
-                 labels = list(labels)[:self.max_length - 2]  # Account for special tokens
-                 labels = [-100] + labels + [-100]  # Add padding for special tokens
-             else:
-                 # Single value for the entire sequence
-                 labels = [-100] + [float(labels)] * (self.max_length - 2) + [-100]
-
-         tokenized_inputs["labels"] = torch.tensor(labels, dtype=torch.float32)
-         return tokenized_inputs
-
-
- class OmniDatasetForSequenceRegression(OmniDataset):
-     """
-     Dataset class for sequence regression tasks in genomics.
-
-     This class extends `OmniDataset` to prepare input sequences and their corresponding
-     sequence-level regression targets. It's designed for tasks where the entire
-     sequence needs to be assigned a continuous value.
-
-     Attributes:
-         metadata: Dictionary containing dataset metadata including library information
-     """
-
-     def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
-         """
-         Initialize the dataset for sequence regression.
-
-         Args:
-             data_source: Path to the data file or a list of paths.
-                 Supported formats depend on the `OmniDataset` implementation.
-             tokenizer: The tokenizer instance to use for converting sequences into
-                 tokenized inputs.
-             max_length: The maximum sequence length for tokenization. Sequences longer
-                 than this will be truncated. If None, a default or tokenizer's
-                 max length will be used.
-             **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
-         """
-         super(OmniDatasetForSequenceRegression, self).__init__(
-             data_source, tokenizer, max_length, **kwargs
-         )
-
-         self.metadata.update(
-             {
-                 "library_name": __name__,
-                 "omnigenome_version": __version__,
-                 "task": "genome_sequence_regression",
-             }
-         )
-         for key, value in kwargs.items():
-             self.metadata[key] = value
-
-     def prepare_input(self, instance, **kwargs):
-         """
-         Prepare a single data instance for sequence regression.
-
-         This method handles both string sequences and dictionary instances
-         containing sequence and regression target information. It tokenizes
-         the input sequence and prepares sequence-level regression targets.
-
-         Args:
-             instance: A single data instance. Can be a string representing the sequence
-                 or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
-             **kwargs: Additional keyword arguments for tokenization, such as 'padding'
-                 and 'truncation'.
-
-         Returns:
-             dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
-                 and 'labels' (tensor of sequence-level regression targets).
-
-         Raises:
-             Exception: If the input instance format is unknown or if a dictionary
-                 instance does not contain a 'label' or 'labels' key.
-         """
-         labels = -100
-         if isinstance(instance, str):
-             sequence = instance
-         elif isinstance(instance, dict):
-             sequence = (
-                 instance.get("seq", None)
-                 if "seq" in instance
-                 else instance.get("sequence", None)
-             )
-             label = instance.get("label", None)
-             labels = instance.get("labels", None)
-             labels = labels if labels is not None else label
-         else:
-             raise Exception("Unknown instance format.")
-
-         tokenized_inputs = self.tokenizer(
-             sequence,
-             padding=kwargs.get("padding", "do_not_pad"),
-             truncation=kwargs.get("truncation", True),
-             max_length=self.max_length,
-             return_tensors="pt",
-         )
-         for col in tokenized_inputs:
-             tokenized_inputs[col] = tokenized_inputs[col].squeeze()
-
-         if labels is not None:
-             # Convert to float for regression
-             labels = float(labels)
-
-         tokenized_inputs["labels"] = torch.tensor(labels, dtype=torch.float32)
-         return tokenized_inputs
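
The removed docstrings above fully specify the `prepare_input` contract: each class accepts either a raw sequence string or a dict with a 'seq'/'sequence' key plus 'label'/'labels', and returns squeezed tensors with task-appropriate 'labels'. A minimal usage sketch follows, not taken from the package itself: the checkpoint name and file path are hypothetical placeholders, and it assumes these classes are re-exported from the omnigenome package root and that the `OmniDataset` base class consumes `label2id` from `**kwargs`, none of which is shown in this diff.

from transformers import AutoTokenizer

from omnigenome import (
    OmniDatasetForSequenceClassification,
    OmniDatasetForTokenRegression,
)

# Hypothetical checkpoint; any tokenizer with an HF-style __call__ works here.
tokenizer = AutoTokenizer.from_pretrained("some/genomic-model")

clf = OmniDatasetForSequenceClassification(
    data_source="train.json",   # placeholder; accepted formats depend on OmniDataset
    tokenizer=tokenizer,
    max_length=128,
    label2id={"0": 0, "1": 1},  # assumed to be wired up by the base class
)
# Sequence classification requires an integer label, mapped through label2id.
batch = clf.prepare_input({"seq": "AUGGCUACGUAG", "label": 1})
print(batch["input_ids"].shape, batch["labels"])

reg = OmniDatasetForTokenRegression(
    data_source="train.json",   # placeholder
    tokenizer=tokenizer,
    max_length=128,
)
# Per-token targets are truncated to max_length - 2, padded with -100 at both
# ends to cover the special tokens, and stored as float32.
batch = reg.prepare_input({"sequence": "AUGGCUACGUAG", "labels": [0.5] * 12})
print(batch["labels"].dtype)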
omnigenome/src/lora/__init__.py
@@ -1,13 +0,0 @@
- # -*- coding: utf-8 -*-
- # file: __init__.py
- # time: 12:35 11/06/2025
- # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
- # homepage: https://yangheng95.github.io
- # github: https://github.com/yangheng95
- # huggingface: https://huggingface.co/yangheng
- # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
- # Copyright (C) 2019-2025. All Rights Reserved.
- """
- This package contains modules for LoRA (Low-Rank Adaptation) fine-tuning.
- """
-