omnigenome 0.3.1a0__py3-none-any.whl → 0.3.3a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of omnigenome might be problematic.

Files changed (79). A sketch for reproducing this comparison locally follows the list.
  1. omnigenome/__init__.py +252 -266
  2. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/METADATA +9 -9
  3. omnigenome-0.3.3a0.dist-info/RECORD +7 -0
  4. omnigenome/auto/__init__.py +0 -3
  5. omnigenome/auto/auto_bench/__init__.py +0 -11
  6. omnigenome/auto/auto_bench/auto_bench.py +0 -494
  7. omnigenome/auto/auto_bench/auto_bench_cli.py +0 -230
  8. omnigenome/auto/auto_bench/auto_bench_config.py +0 -216
  9. omnigenome/auto/auto_bench/config_check.py +0 -34
  10. omnigenome/auto/auto_train/__init__.py +0 -12
  11. omnigenome/auto/auto_train/auto_train.py +0 -429
  12. omnigenome/auto/auto_train/auto_train_cli.py +0 -222
  13. omnigenome/auto/bench_hub/__init__.py +0 -11
  14. omnigenome/auto/bench_hub/bench_hub.py +0 -25
  15. omnigenome/cli/__init__.py +0 -12
  16. omnigenome/cli/commands/__init__.py +0 -12
  17. omnigenome/cli/commands/base.py +0 -83
  18. omnigenome/cli/commands/bench/__init__.py +0 -12
  19. omnigenome/cli/commands/bench/bench_cli.py +0 -202
  20. omnigenome/cli/commands/rna/__init__.py +0 -12
  21. omnigenome/cli/commands/rna/rna_design.py +0 -177
  22. omnigenome/cli/omnigenome_cli.py +0 -128
  23. omnigenome/src/__init__.py +0 -11
  24. omnigenome/src/abc/__init__.py +0 -11
  25. omnigenome/src/abc/abstract_dataset.py +0 -641
  26. omnigenome/src/abc/abstract_metric.py +0 -114
  27. omnigenome/src/abc/abstract_model.py +0 -690
  28. omnigenome/src/abc/abstract_tokenizer.py +0 -269
  29. omnigenome/src/dataset/__init__.py +0 -16
  30. omnigenome/src/dataset/omni_dataset.py +0 -437
  31. omnigenome/src/lora/__init__.py +0 -12
  32. omnigenome/src/lora/lora_model.py +0 -300
  33. omnigenome/src/metric/__init__.py +0 -15
  34. omnigenome/src/metric/classification_metric.py +0 -184
  35. omnigenome/src/metric/metric.py +0 -199
  36. omnigenome/src/metric/ranking_metric.py +0 -142
  37. omnigenome/src/metric/regression_metric.py +0 -191
  38. omnigenome/src/misc/__init__.py +0 -3
  39. omnigenome/src/misc/utils.py +0 -503
  40. omnigenome/src/model/__init__.py +0 -19
  41. omnigenome/src/model/augmentation/__init__.py +0 -11
  42. omnigenome/src/model/augmentation/model.py +0 -219
  43. omnigenome/src/model/classification/__init__.py +0 -11
  44. omnigenome/src/model/classification/model.py +0 -638
  45. omnigenome/src/model/embedding/__init__.py +0 -11
  46. omnigenome/src/model/embedding/model.py +0 -263
  47. omnigenome/src/model/mlm/__init__.py +0 -11
  48. omnigenome/src/model/mlm/model.py +0 -177
  49. omnigenome/src/model/module_utils.py +0 -232
  50. omnigenome/src/model/regression/__init__.py +0 -11
  51. omnigenome/src/model/regression/model.py +0 -781
  52. omnigenome/src/model/regression/resnet.py +0 -483
  53. omnigenome/src/model/rna_design/__init__.py +0 -11
  54. omnigenome/src/model/rna_design/model.py +0 -476
  55. omnigenome/src/model/seq2seq/__init__.py +0 -11
  56. omnigenome/src/model/seq2seq/model.py +0 -44
  57. omnigenome/src/tokenizer/__init__.py +0 -16
  58. omnigenome/src/tokenizer/bpe_tokenizer.py +0 -226
  59. omnigenome/src/tokenizer/kmers_tokenizer.py +0 -247
  60. omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +0 -249
  61. omnigenome/src/trainer/__init__.py +0 -14
  62. omnigenome/src/trainer/accelerate_trainer.py +0 -747
  63. omnigenome/src/trainer/hf_trainer.py +0 -75
  64. omnigenome/src/trainer/trainer.py +0 -591
  65. omnigenome/utility/__init__.py +0 -3
  66. omnigenome/utility/dataset_hub/__init__.py +0 -12
  67. omnigenome/utility/dataset_hub/dataset_hub.py +0 -178
  68. omnigenome/utility/ensemble.py +0 -324
  69. omnigenome/utility/hub_utils.py +0 -517
  70. omnigenome/utility/model_hub/__init__.py +0 -11
  71. omnigenome/utility/model_hub/model_hub.py +0 -232
  72. omnigenome/utility/pipeline_hub/__init__.py +0 -11
  73. omnigenome/utility/pipeline_hub/pipeline.py +0 -483
  74. omnigenome/utility/pipeline_hub/pipeline_hub.py +0 -129
  75. omnigenome-0.3.1a0.dist-info/RECORD +0 -78
  76. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/WHEEL +0 -0
  77. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/entry_points.txt +0 -0
  78. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/licenses/LICENSE +0 -0
  79. {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/top_level.txt +0 -0
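
Every module above except `omnigenome/__init__.py` and the dist-info metadata is deleted outright in 0.3.3a0; the diff records only the removals, not where (or whether) the code reappears. Wheels are plain zip archives, so the comparison can be reproduced locally. A minimal sketch, assuming both wheel files were already fetched into the working directory (for example via `pip download omnigenome==0.3.1a0 --no-deps`, and likewise for 0.3.3a0):

    import difflib
    import zipfile

    # Wheels are zip archives; compare their file lists and contents.
    old = zipfile.ZipFile("omnigenome-0.3.1a0-py3-none-any.whl")
    new = zipfile.ZipFile("omnigenome-0.3.3a0-py3-none-any.whl")
    old_names, new_names = set(old.namelist()), set(new.namelist())

    print("removed:", len(old_names - new_names), "files")  # the deletions listed above
    print("added:", len(new_names - old_names), "files")

    # Line-level diff for Python files present in both versions:
    for name in sorted(old_names & new_names):
        if not name.endswith(".py"):
            continue
        a = old.read(name).decode("utf-8", "replace").splitlines()
        b = new.read(name).decode("utf-8", "replace").splitlines()
        for line in difflib.unified_diff(a, b, fromfile=name, tofile=name, lineterm=""):
            print(line)

Two of the deleted files are reproduced in full below.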
omnigenome/src/dataset/omni_dataset.py (deleted)
@@ -1,437 +0,0 @@
-# -*- coding: utf-8 -*-
-# file: abstract_dataset.py
-# time: 14:13 06/04/2024
-# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
-# github: https://github.com/yangheng95
-# huggingface: https://huggingface.co/yangheng
-# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
-# Copyright (C) 2019-2024. All Rights Reserved.
-"""
-Specialized dataset classes for OmniGenome framework.
-
-This module provides specialized dataset classes for various genomic tasks,
-inheriting from the abstract `OmniDataset`. These classes handle data preparation
-for token classification, sequence classification, token regression, and sequence regression,
-integrating with tokenizers and managing metadata.
-"""
-import json
-
-import numpy as np
-import torch
-
-from ..abc.abstract_dataset import OmniDataset
-from ..misc.utils import fprint
-from ... import __name__, __version__
-
-
-class OmniDatasetForTokenClassification(OmniDataset):
-    """
-    Dataset class specifically designed for token classification tasks in genomics.
-
-    This class extends `OmniDataset` to provide functionalities for preparing input sequences
-    and their corresponding token-level labels. It's designed for tasks where each token
-    in a sequence needs to be classified independently.
-
-    Attributes:
-        metadata: Dictionary containing dataset metadata including library information
-        label2id: Mapping from label strings to integer IDs
-    """
-
-    def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
-        """
-        Initialize the dataset for token classification.
-
-        Args:
-            data_source: Path to the data file or a list of paths.
-                Supported formats depend on the `OmniDataset` implementation.
-            tokenizer: The tokenizer instance to use for converting sequences into
-                tokenized inputs.
-            max_length: The maximum sequence length for tokenization. Sequences longer
-                than this will be truncated. If None, a default or tokenizer's
-                max length will be used.
-            **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
-        """
-        super(OmniDatasetForTokenClassification, self).__init__(
-            data_source, tokenizer, max_length, **kwargs
-        )
-        self.metadata.update(
-            {
-                "library_name": __name__,
-                "omnigenome_version": __version__,
-                "task": "genome_token_classification",
-            }
-        )
-
-        for key, value in kwargs.items():
-            self.metadata[key] = value
-
-    def prepare_input(self, instance, **kwargs):
-        """
-        Prepare a single data instance for token classification.
-
-        This method handles both string sequences and dictionary instances
-        containing sequence and label information. It tokenizes the input
-        sequence and prepares token-level labels for classification.
-
-        Args:
-            instance: A single data instance. Can be a string representing the sequence
-                or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
-            **kwargs: Additional keyword arguments for tokenization, such as 'padding'
-                and 'truncation'.
-
-        Returns:
-            dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
-                and 'labels' (tensor of token-level labels).
-
-        Raises:
-            Exception: If the input instance format is unknown or if a dictionary
-                instance does not contain a 'seq' or 'sequence' key.
-        """
-        labels = -100
-        if isinstance(instance, str):
-            sequence = instance
-        elif isinstance(instance, dict):
-            sequence = (
-                instance.get("seq", None)
-                if "seq" in instance
-                else instance.get("sequence", None)
-            )
-            label = instance.get("label", None)
-            labels = instance.get("labels", None)
-            labels = labels if labels is not None else label
-            if not sequence:
-                raise Exception(
-                    "The input instance must contain a 'seq' or 'sequence' key."
-                )
-        else:
-            raise Exception("Unknown instance format.")
-
-        tokenized_inputs = self.tokenizer(
-            sequence,
-            padding=kwargs.get("padding", "do_not_pad"),
-            truncation=kwargs.get("truncation", True),
-            max_length=self.max_length,
-            return_tensors="pt",
-        )
-        for col in tokenized_inputs:
-            tokenized_inputs[col] = tokenized_inputs[col].squeeze()
-
-        if labels is not None:
-            if len(set(self.label2id.keys()) | set([str(l) for l in labels])) != len(
-                set(self.label2id.keys())
-            ):
-                fprint(
-                    f"Warning: The labels <{labels}> in the input instance do not match the label2id mapping."
-                )
-            labels = (
-                [-100]
-                + [self.label2id.get(str(l), -100) for l in labels][
-                    : self.max_length - 2
-                ]
-                + [-100]
-            )
-
-        tokenized_inputs["labels"] = torch.tensor(labels)
-        return tokenized_inputs
-
-
-class OmniDatasetForSequenceClassification(OmniDataset):
-    """
-    Dataset class for sequence classification tasks in genomics.
-
-    This class extends `OmniDataset` to prepare input sequences and their corresponding
-    sequence-level labels. It's designed for tasks where the entire sequence needs
-    to be classified into one of several categories.
-
-    Attributes:
-        metadata: Dictionary containing dataset metadata including library information
-        label2id: Mapping from label strings to integer IDs
-    """
-
-    def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
-        """
-        Initialize the dataset for sequence classification.
-
-        Args:
-            data_source: Path to the data file or a list of paths.
-                Supported formats depend on the `OmniDataset` implementation.
-            tokenizer: The tokenizer instance to use for converting sequences into
-                tokenized inputs.
-            max_length: The maximum sequence length for tokenization. Sequences longer
-                than this will be truncated. If None, a default or tokenizer's
-                max length will be used.
-            **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
-        """
-        super(OmniDatasetForSequenceClassification, self).__init__(
-            data_source, tokenizer, max_length, **kwargs
-        )
-
-        self.metadata.update(
-            {
-                "library_name": __name__,
-                "omnigenome_version": __version__,
-                "task": "genome_sequence_classification",
-            }
-        )
-        for key, value in kwargs.items():
-            self.metadata[key] = value
-
-    def prepare_input(self, instance, **kwargs):
-        """
-        Prepare a single data instance for sequence classification.
-
-        This method handles both string sequences and dictionary instances
-        containing sequence and label information. It tokenizes the input
-        sequence and prepares sequence-level labels for classification.
-
-        Args:
-            instance: A single data instance. Can be a string representing the sequence
-                or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
-            **kwargs: Additional keyword arguments for tokenization, such as 'padding'
-                and 'truncation'.
-
-        Returns:
-            dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
-                and 'labels' (tensor of sequence-level labels).
-
-        Raises:
-            Exception: If the input instance format is unknown or if a dictionary
-                instance does not contain a 'label' or 'labels' key, or if
-                the label is not an integer.
-        """
-        labels = -100
-        if isinstance(instance, str):
-            sequence = instance
-        elif isinstance(instance, dict):
-            sequence = (
-                instance.get("seq", None)
-                if "seq" in instance
-                else instance.get("sequence", None)
-            )
-            label = instance.get("label", None)
-            labels = instance.get("labels", None)
-            labels = labels if labels is not None else label
-        else:
-            raise Exception("Unknown instance format.")
-
-        tokenized_inputs = self.tokenizer(
-            sequence,
-            padding=kwargs.get("padding", "do_not_pad"),
-            truncation=kwargs.get("truncation", True),
-            max_length=self.max_length,
-            return_tensors="pt",
-        )
-        for col in tokenized_inputs:
-            tokenized_inputs[col] = tokenized_inputs[col].squeeze()
-
-        if labels is not None:
-            if not isinstance(labels, int):
-                raise Exception(
-                    "The label must be an integer for sequence classification."
-                )
-            labels = self.label2id.get(str(labels), -100)
-
-        tokenized_inputs["labels"] = torch.tensor(labels)
-        return tokenized_inputs
-
-
-class OmniDatasetForTokenRegression(OmniDataset):
-    """
-    Dataset class for token regression tasks in genomics.
-
-    This class extends `OmniDataset` to prepare input sequences and their corresponding
-    token-level regression targets. It's designed for tasks where each token in a
-    sequence needs to be assigned a continuous value.
-
-    Attributes:
-        metadata: Dictionary containing dataset metadata including library information
-    """
-
-    def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
-        """
-        Initialize the dataset for token regression.
-
-        Args:
-            data_source: Path to the data file or a list of paths.
-                Supported formats depend on the `OmniDataset` implementation.
-            tokenizer: The tokenizer instance to use for converting sequences into
-                tokenized inputs.
-            max_length: The maximum sequence length for tokenization. Sequences longer
-                than this will be truncated. If None, a default or tokenizer's
-                max length will be used.
-            **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
-        """
-        super(OmniDatasetForTokenRegression, self).__init__(
-            data_source, tokenizer, max_length, **kwargs
-        )
-
-        self.metadata.update(
-            {
-                "library_name": __name__,
-                "omnigenome_version": __version__,
-                "task": "genome_token_regression",
-            }
-        )
-        for key, value in kwargs.items():
-            self.metadata[key] = value
-
-    def prepare_input(self, instance, **kwargs):
-        """
-        Prepare a single data instance for token regression.
-
-        This method handles both string sequences and dictionary instances
-        containing sequence and regression target information. It tokenizes
-        the input sequence and prepares token-level regression targets.
-
-        Args:
-            instance: A single data instance. Can be a string representing the sequence
-                or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
-            **kwargs: Additional keyword arguments for tokenization, such as 'padding'
-                and 'truncation'.
-
-        Returns:
-            dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
-                and 'labels' (tensor of token-level regression targets).
-
-        Raises:
-            Exception: If the input instance format is unknown or if a dictionary
-                instance does not contain a 'seq' or 'sequence' key.
-        """
-        labels = -100
-        if isinstance(instance, str):
-            sequence = instance
-        elif isinstance(instance, dict):
-            sequence = (
-                instance.get("seq", None)
-                if "seq" in instance
-                else instance.get("sequence", None)
-            )
-            label = instance.get("label", None)
-            labels = instance.get("labels", None)
-            labels = labels if labels is not None else label
-            if not sequence:
-                raise Exception(
-                    "The input instance must contain a 'seq' or 'sequence' key."
-                )
-        else:
-            raise Exception("Unknown instance format.")
-
-        tokenized_inputs = self.tokenizer(
-            sequence,
-            padding=kwargs.get("padding", "do_not_pad"),
-            truncation=kwargs.get("truncation", True),
-            max_length=self.max_length,
-            return_tensors="pt",
-        )
-        for col in tokenized_inputs:
-            tokenized_inputs[col] = tokenized_inputs[col].squeeze()
-
-        if labels is not None:
-            # Handle token-level regression labels
-            if isinstance(labels, (list, tuple)):
-                # Ensure labels match sequence length
-                labels = list(labels)[
-                    : self.max_length - 2
-                ]  # Account for special tokens
-                labels = [-100] + labels + [-100]  # Add padding for special tokens
-            else:
-                # Single value for the entire sequence
-                labels = [-100] + [float(labels)] * (self.max_length - 2) + [-100]
-
-        tokenized_inputs["labels"] = torch.tensor(labels, dtype=torch.float32)
-        return tokenized_inputs
-
-
-class OmniDatasetForSequenceRegression(OmniDataset):
-    """
-    Dataset class for sequence regression tasks in genomics.
-
-    This class extends `OmniDataset` to prepare input sequences and their corresponding
-    sequence-level regression targets. It's designed for tasks where the entire
-    sequence needs to be assigned a continuous value.
-
-    Attributes:
-        metadata: Dictionary containing dataset metadata including library information
-    """
-
-    def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
-        """
-        Initialize the dataset for sequence regression.
-
-        Args:
-            data_source: Path to the data file or a list of paths.
-                Supported formats depend on the `OmniDataset` implementation.
-            tokenizer: The tokenizer instance to use for converting sequences into
-                tokenized inputs.
-            max_length: The maximum sequence length for tokenization. Sequences longer
-                than this will be truncated. If None, a default or tokenizer's
-                max length will be used.
-            **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
-        """
-        super(OmniDatasetForSequenceRegression, self).__init__(
-            data_source, tokenizer, max_length, **kwargs
-        )
-
-        self.metadata.update(
-            {
-                "library_name": __name__,
-                "omnigenome_version": __version__,
-                "task": "genome_sequence_regression",
-            }
-        )
-        for key, value in kwargs.items():
-            self.metadata[key] = value
-
-    def prepare_input(self, instance, **kwargs):
-        """
-        Prepare a single data instance for sequence regression.
-
-        This method handles both string sequences and dictionary instances
-        containing sequence and regression target information. It tokenizes
-        the input sequence and prepares sequence-level regression targets.
-
-        Args:
-            instance: A single data instance. Can be a string representing the sequence
-                or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
-            **kwargs: Additional keyword arguments for tokenization, such as 'padding'
-                and 'truncation'.
-
-        Returns:
-            dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
-                and 'labels' (tensor of sequence-level regression targets).
-
-        Raises:
-            Exception: If the input instance format is unknown or if a dictionary
-                instance does not contain a 'label' or 'labels' key.
-        """
-        labels = -100
-        if isinstance(instance, str):
-            sequence = instance
-        elif isinstance(instance, dict):
-            sequence = (
-                instance.get("seq", None)
-                if "seq" in instance
-                else instance.get("sequence", None)
-            )
-            label = instance.get("label", None)
-            labels = instance.get("labels", None)
-            labels = labels if labels is not None else label
-        else:
-            raise Exception("Unknown instance format.")
-
-        tokenized_inputs = self.tokenizer(
-            sequence,
-            padding=kwargs.get("padding", "do_not_pad"),
-            truncation=kwargs.get("truncation", True),
-            max_length=self.max_length,
-            return_tensors="pt",
-        )
-        for col in tokenized_inputs:
-            tokenized_inputs[col] = tokenized_inputs[col].squeeze()
-
-        if labels is not None:
-            # Convert to float for regression
-            labels = float(labels)
-
-        tokenized_inputs["labels"] = torch.tensor(labels, dtype=torch.float32)
-        return tokenized_inputs
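
All four dataset classes in the deleted module share one `prepare_input` contract: tokenize the sequence, `squeeze()` away the batch dimension, and emit a `labels` tensor in which special-token positions hold -100, the index that PyTorch's `CrossEntropyLoss` ignores by default; the token-regression class reuses the same sentinel for its float targets. A minimal usage sketch against the 0.3.1a0 layout follows; the checkpoint name and data file are hypothetical, and passing `label2id` through `**kwargs` is an assumption inferred from its use as `self.label2id` above:

    from transformers import AutoTokenizer

    # Import path as it existed in 0.3.1a0 (removed in 0.3.3a0):
    from omnigenome.src.dataset.omni_dataset import OmniDatasetForTokenClassification

    tokenizer = AutoTokenizer.from_pretrained("some/genomic-fm")  # hypothetical checkpoint
    dataset = OmniDatasetForTokenClassification(
        "train.json",                       # hypothetical data file
        tokenizer,
        max_length=512,
        label2id={"(": 0, ")": 1, ".": 2},  # e.g. RNA secondary-structure labels
    )
    sample = dataset.prepare_input({"seq": "GGGAAACCC", "labels": list("(((...)))")})
    print(sample["input_ids"].shape)   # 1-D after the squeeze() in prepare_input
    print(sample["labels"][0].item())  # -100: the leading special token is masked

The same masking convention explains the `[-100] + ... + [-100]` wrapping in the token-regression class, which excludes the special-token positions from the loss.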
omnigenome/src/lora/__init__.py (deleted)
@@ -1,12 +0,0 @@
-# -*- coding: utf-8 -*-
-# file: __init__.py
-# time: 12:35 11/06/2025
-# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
-# homepage: https://yangheng95.github.io
-# github: https://github.com/yangheng95
-# huggingface: https://huggingface.co/yangheng
-# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
-# Copyright (C) 2019-2025. All Rights Reserved.
-"""
-This package contains modules for LoRA (Low-Rank Adaptation) fine-tuning.
-"""