omnigenome-0.3.1a0-py3-none-any.whl → omnigenome-0.3.3a0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of omnigenome may be problematic.
- omnigenome/__init__.py +252 -266
- {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/METADATA +9 -9
- omnigenome-0.3.3a0.dist-info/RECORD +7 -0
- omnigenome/auto/__init__.py +0 -3
- omnigenome/auto/auto_bench/__init__.py +0 -11
- omnigenome/auto/auto_bench/auto_bench.py +0 -494
- omnigenome/auto/auto_bench/auto_bench_cli.py +0 -230
- omnigenome/auto/auto_bench/auto_bench_config.py +0 -216
- omnigenome/auto/auto_bench/config_check.py +0 -34
- omnigenome/auto/auto_train/__init__.py +0 -12
- omnigenome/auto/auto_train/auto_train.py +0 -429
- omnigenome/auto/auto_train/auto_train_cli.py +0 -222
- omnigenome/auto/bench_hub/__init__.py +0 -11
- omnigenome/auto/bench_hub/bench_hub.py +0 -25
- omnigenome/cli/__init__.py +0 -12
- omnigenome/cli/commands/__init__.py +0 -12
- omnigenome/cli/commands/base.py +0 -83
- omnigenome/cli/commands/bench/__init__.py +0 -12
- omnigenome/cli/commands/bench/bench_cli.py +0 -202
- omnigenome/cli/commands/rna/__init__.py +0 -12
- omnigenome/cli/commands/rna/rna_design.py +0 -177
- omnigenome/cli/omnigenome_cli.py +0 -128
- omnigenome/src/__init__.py +0 -11
- omnigenome/src/abc/__init__.py +0 -11
- omnigenome/src/abc/abstract_dataset.py +0 -641
- omnigenome/src/abc/abstract_metric.py +0 -114
- omnigenome/src/abc/abstract_model.py +0 -690
- omnigenome/src/abc/abstract_tokenizer.py +0 -269
- omnigenome/src/dataset/__init__.py +0 -16
- omnigenome/src/dataset/omni_dataset.py +0 -437
- omnigenome/src/lora/__init__.py +0 -12
- omnigenome/src/lora/lora_model.py +0 -300
- omnigenome/src/metric/__init__.py +0 -15
- omnigenome/src/metric/classification_metric.py +0 -184
- omnigenome/src/metric/metric.py +0 -199
- omnigenome/src/metric/ranking_metric.py +0 -142
- omnigenome/src/metric/regression_metric.py +0 -191
- omnigenome/src/misc/__init__.py +0 -3
- omnigenome/src/misc/utils.py +0 -503
- omnigenome/src/model/__init__.py +0 -19
- omnigenome/src/model/augmentation/__init__.py +0 -11
- omnigenome/src/model/augmentation/model.py +0 -219
- omnigenome/src/model/classification/__init__.py +0 -11
- omnigenome/src/model/classification/model.py +0 -638
- omnigenome/src/model/embedding/__init__.py +0 -11
- omnigenome/src/model/embedding/model.py +0 -263
- omnigenome/src/model/mlm/__init__.py +0 -11
- omnigenome/src/model/mlm/model.py +0 -177
- omnigenome/src/model/module_utils.py +0 -232
- omnigenome/src/model/regression/__init__.py +0 -11
- omnigenome/src/model/regression/model.py +0 -781
- omnigenome/src/model/regression/resnet.py +0 -483
- omnigenome/src/model/rna_design/__init__.py +0 -11
- omnigenome/src/model/rna_design/model.py +0 -476
- omnigenome/src/model/seq2seq/__init__.py +0 -11
- omnigenome/src/model/seq2seq/model.py +0 -44
- omnigenome/src/tokenizer/__init__.py +0 -16
- omnigenome/src/tokenizer/bpe_tokenizer.py +0 -226
- omnigenome/src/tokenizer/kmers_tokenizer.py +0 -247
- omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +0 -249
- omnigenome/src/trainer/__init__.py +0 -14
- omnigenome/src/trainer/accelerate_trainer.py +0 -747
- omnigenome/src/trainer/hf_trainer.py +0 -75
- omnigenome/src/trainer/trainer.py +0 -591
- omnigenome/utility/__init__.py +0 -3
- omnigenome/utility/dataset_hub/__init__.py +0 -12
- omnigenome/utility/dataset_hub/dataset_hub.py +0 -178
- omnigenome/utility/ensemble.py +0 -324
- omnigenome/utility/hub_utils.py +0 -517
- omnigenome/utility/model_hub/__init__.py +0 -11
- omnigenome/utility/model_hub/model_hub.py +0 -232
- omnigenome/utility/pipeline_hub/__init__.py +0 -11
- omnigenome/utility/pipeline_hub/pipeline.py +0 -483
- omnigenome/utility/pipeline_hub/pipeline_hub.py +0 -129
- omnigenome-0.3.1a0.dist-info/RECORD +0 -78
- {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/WHEEL +0 -0
- {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/entry_points.txt +0 -0
- {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/licenses/LICENSE +0 -0
- {omnigenome-0.3.1a0.dist-info → omnigenome-0.3.3a0.dist-info}/top_level.txt +0 -0
omnigenome/src/dataset/omni_dataset.py
DELETED
@@ -1,437 +0,0 @@
# -*- coding: utf-8 -*-
# file: abstract_dataset.py
# time: 14:13 06/04/2024
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2019-2024. All Rights Reserved.
"""
Specialized dataset classes for OmniGenome framework.

This module provides specialized dataset classes for various genomic tasks,
inheriting from the abstract `OmniDataset`. These classes handle data preparation
for token classification, sequence classification, token regression, and sequence regression,
integrating with tokenizers and managing metadata.
"""
import json

import numpy as np
import torch

from ..abc.abstract_dataset import OmniDataset
from ..misc.utils import fprint
from ... import __name__, __version__


class OmniDatasetForTokenClassification(OmniDataset):
    """
    Dataset class specifically designed for token classification tasks in genomics.

    This class extends `OmniDataset` to provide functionalities for preparing input sequences
    and their corresponding token-level labels. It's designed for tasks where each token
    in a sequence needs to be classified independently.

    Attributes:
        metadata: Dictionary containing dataset metadata including library information
        label2id: Mapping from label strings to integer IDs
    """

    def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
        """
        Initialize the dataset for token classification.

        Args:
            data_source: Path to the data file or a list of paths.
                Supported formats depend on the `OmniDataset` implementation.
            tokenizer: The tokenizer instance to use for converting sequences into
                tokenized inputs.
            max_length: The maximum sequence length for tokenization. Sequences longer
                than this will be truncated. If None, a default or tokenizer's
                max length will be used.
            **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
        """
        super(OmniDatasetForTokenClassification, self).__init__(
            data_source, tokenizer, max_length, **kwargs
        )
        self.metadata.update(
            {
                "library_name": __name__,
                "omnigenome_version": __version__,
                "task": "genome_token_classification",
            }
        )

        for key, value in kwargs.items():
            self.metadata[key] = value

    def prepare_input(self, instance, **kwargs):
        """
        Prepare a single data instance for token classification.

        This method handles both string sequences and dictionary instances
        containing sequence and label information. It tokenizes the input
        sequence and prepares token-level labels for classification.

        Args:
            instance: A single data instance. Can be a string representing the sequence
                or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
            **kwargs: Additional keyword arguments for tokenization, such as 'padding'
                and 'truncation'.

        Returns:
            dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
                and 'labels' (tensor of token-level labels).

        Raises:
            Exception: If the input instance format is unknown or if a dictionary
                instance does not contain a 'seq' or 'sequence' key.
        """
        labels = -100
        if isinstance(instance, str):
            sequence = instance
        elif isinstance(instance, dict):
            sequence = (
                instance.get("seq", None)
                if "seq" in instance
                else instance.get("sequence", None)
            )
            label = instance.get("label", None)
            labels = instance.get("labels", None)
            labels = labels if labels is not None else label
            if not sequence:
                raise Exception(
                    "The input instance must contain a 'seq' or 'sequence' key."
                )
        else:
            raise Exception("Unknown instance format.")

        tokenized_inputs = self.tokenizer(
            sequence,
            padding=kwargs.get("padding", "do_not_pad"),
            truncation=kwargs.get("truncation", True),
            max_length=self.max_length,
            return_tensors="pt",
        )
        for col in tokenized_inputs:
            tokenized_inputs[col] = tokenized_inputs[col].squeeze()

        if labels is not None:
            if len(set(self.label2id.keys()) | set([str(l) for l in labels])) != len(
                set(self.label2id.keys())
            ):
                fprint(
                    f"Warning: The labels <{labels}> in the input instance do not match the label2id mapping."
                )
            labels = (
                [-100]
                + [self.label2id.get(str(l), -100) for l in labels][
                    : self.max_length - 2
                ]
                + [-100]
            )

        tokenized_inputs["labels"] = torch.tensor(labels)
        return tokenized_inputs


class OmniDatasetForSequenceClassification(OmniDataset):
    """
    Dataset class for sequence classification tasks in genomics.

    This class extends `OmniDataset` to prepare input sequences and their corresponding
    sequence-level labels. It's designed for tasks where the entire sequence needs
    to be classified into one of several categories.

    Attributes:
        metadata: Dictionary containing dataset metadata including library information
        label2id: Mapping from label strings to integer IDs
    """

    def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
        """
        Initialize the dataset for sequence classification.

        Args:
            data_source: Path to the data file or a list of paths.
                Supported formats depend on the `OmniDataset` implementation.
            tokenizer: The tokenizer instance to use for converting sequences into
                tokenized inputs.
            max_length: The maximum sequence length for tokenization. Sequences longer
                than this will be truncated. If None, a default or tokenizer's
                max length will be used.
            **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
        """
        super(OmniDatasetForSequenceClassification, self).__init__(
            data_source, tokenizer, max_length, **kwargs
        )

        self.metadata.update(
            {
                "library_name": __name__,
                "omnigenome_version": __version__,
                "task": "genome_sequence_classification",
            }
        )
        for key, value in kwargs.items():
            self.metadata[key] = value

    def prepare_input(self, instance, **kwargs):
        """
        Prepare a single data instance for sequence classification.

        This method handles both string sequences and dictionary instances
        containing sequence and label information. It tokenizes the input
        sequence and prepares sequence-level labels for classification.

        Args:
            instance: A single data instance. Can be a string representing the sequence
                or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
            **kwargs: Additional keyword arguments for tokenization, such as 'padding'
                and 'truncation'.

        Returns:
            dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
                and 'labels' (tensor of sequence-level labels).

        Raises:
            Exception: If the input instance format is unknown or if a dictionary
                instance does not contain a 'label' or 'labels' key, or if
                the label is not an integer.
        """
        labels = -100
        if isinstance(instance, str):
            sequence = instance
        elif isinstance(instance, dict):
            sequence = (
                instance.get("seq", None)
                if "seq" in instance
                else instance.get("sequence", None)
            )
            label = instance.get("label", None)
            labels = instance.get("labels", None)
            labels = labels if labels is not None else label
        else:
            raise Exception("Unknown instance format.")

        tokenized_inputs = self.tokenizer(
            sequence,
            padding=kwargs.get("padding", "do_not_pad"),
            truncation=kwargs.get("truncation", True),
            max_length=self.max_length,
            return_tensors="pt",
        )
        for col in tokenized_inputs:
            tokenized_inputs[col] = tokenized_inputs[col].squeeze()

        if labels is not None:
            if not isinstance(labels, int):
                raise Exception(
                    "The label must be an integer for sequence classification."
                )
            labels = self.label2id.get(str(labels), -100)

        tokenized_inputs["labels"] = torch.tensor(labels)
        return tokenized_inputs


class OmniDatasetForTokenRegression(OmniDataset):
    """
    Dataset class for token regression tasks in genomics.

    This class extends `OmniDataset` to prepare input sequences and their corresponding
    token-level regression targets. It's designed for tasks where each token in a
    sequence needs to be assigned a continuous value.

    Attributes:
        metadata: Dictionary containing dataset metadata including library information
    """

    def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
        """
        Initialize the dataset for token regression.

        Args:
            data_source: Path to the data file or a list of paths.
                Supported formats depend on the `OmniDataset` implementation.
            tokenizer: The tokenizer instance to use for converting sequences into
                tokenized inputs.
            max_length: The maximum sequence length for tokenization. Sequences longer
                than this will be truncated. If None, a default or tokenizer's
                max length will be used.
            **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
        """
        super(OmniDatasetForTokenRegression, self).__init__(
            data_source, tokenizer, max_length, **kwargs
        )

        self.metadata.update(
            {
                "library_name": __name__,
                "omnigenome_version": __version__,
                "task": "genome_token_regression",
            }
        )
        for key, value in kwargs.items():
            self.metadata[key] = value

    def prepare_input(self, instance, **kwargs):
        """
        Prepare a single data instance for token regression.

        This method handles both string sequences and dictionary instances
        containing sequence and regression target information. It tokenizes
        the input sequence and prepares token-level regression targets.

        Args:
            instance: A single data instance. Can be a string representing the sequence
                or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
            **kwargs: Additional keyword arguments for tokenization, such as 'padding'
                and 'truncation'.

        Returns:
            dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
                and 'labels' (tensor of token-level regression targets).

        Raises:
            Exception: If the input instance format is unknown or if a dictionary
                instance does not contain a 'seq' or 'sequence' key.
        """
        labels = -100
        if isinstance(instance, str):
            sequence = instance
        elif isinstance(instance, dict):
            sequence = (
                instance.get("seq", None)
                if "seq" in instance
                else instance.get("sequence", None)
            )
            label = instance.get("label", None)
            labels = instance.get("labels", None)
            labels = labels if labels is not None else label
            if not sequence:
                raise Exception(
                    "The input instance must contain a 'seq' or 'sequence' key."
                )
        else:
            raise Exception("Unknown instance format.")

        tokenized_inputs = self.tokenizer(
            sequence,
            padding=kwargs.get("padding", "do_not_pad"),
            truncation=kwargs.get("truncation", True),
            max_length=self.max_length,
            return_tensors="pt",
        )
        for col in tokenized_inputs:
            tokenized_inputs[col] = tokenized_inputs[col].squeeze()

        if labels is not None:
            # Handle token-level regression labels
            if isinstance(labels, (list, tuple)):
                # Ensure labels match sequence length
                labels = list(labels)[
                    : self.max_length - 2
                ]  # Account for special tokens
                labels = [-100] + labels + [-100]  # Add padding for special tokens
            else:
                # Single value for the entire sequence
                labels = [-100] + [float(labels)] * (self.max_length - 2) + [-100]

        tokenized_inputs["labels"] = torch.tensor(labels, dtype=torch.float32)
        return tokenized_inputs


class OmniDatasetForSequenceRegression(OmniDataset):
    """
    Dataset class for sequence regression tasks in genomics.

    This class extends `OmniDataset` to prepare input sequences and their corresponding
    sequence-level regression targets. It's designed for tasks where the entire
    sequence needs to be assigned a continuous value.

    Attributes:
        metadata: Dictionary containing dataset metadata including library information
    """

    def __init__(self, data_source, tokenizer, max_length=None, **kwargs):
        """
        Initialize the dataset for sequence regression.

        Args:
            data_source: Path to the data file or a list of paths.
                Supported formats depend on the `OmniDataset` implementation.
            tokenizer: The tokenizer instance to use for converting sequences into
                tokenized inputs.
            max_length: The maximum sequence length for tokenization. Sequences longer
                than this will be truncated. If None, a default or tokenizer's
                max length will be used.
            **kwargs: Additional keyword arguments to be stored in the dataset's metadata.
        """
        super(OmniDatasetForSequenceRegression, self).__init__(
            data_source, tokenizer, max_length, **kwargs
        )

        self.metadata.update(
            {
                "library_name": __name__,
                "omnigenome_version": __version__,
                "task": "genome_sequence_regression",
            }
        )
        for key, value in kwargs.items():
            self.metadata[key] = value

    def prepare_input(self, instance, **kwargs):
        """
        Prepare a single data instance for sequence regression.

        This method handles both string sequences and dictionary instances
        containing sequence and regression target information. It tokenizes
        the input sequence and prepares sequence-level regression targets.

        Args:
            instance: A single data instance. Can be a string representing the sequence
                or a dictionary with 'seq'/'sequence' and 'labels'/'label' keys.
            **kwargs: Additional keyword arguments for tokenization, such as 'padding'
                and 'truncation'.

        Returns:
            dict: A dictionary of tokenized inputs, including 'input_ids', 'attention_mask',
                and 'labels' (tensor of sequence-level regression targets).

        Raises:
            Exception: If the input instance format is unknown or if a dictionary
                instance does not contain a 'label' or 'labels' key.
        """
        labels = -100
        if isinstance(instance, str):
            sequence = instance
        elif isinstance(instance, dict):
            sequence = (
                instance.get("seq", None)
                if "seq" in instance
                else instance.get("sequence", None)
            )
            label = instance.get("label", None)
            labels = instance.get("labels", None)
            labels = labels if labels is not None else label
        else:
            raise Exception("Unknown instance format.")

        tokenized_inputs = self.tokenizer(
            sequence,
            padding=kwargs.get("padding", "do_not_pad"),
            truncation=kwargs.get("truncation", True),
            max_length=self.max_length,
            return_tensors="pt",
        )
        for col in tokenized_inputs:
            tokenized_inputs[col] = tokenized_inputs[col].squeeze()

        if labels is not None:
            # Convert to float for regression
            labels = float(labels)

        tokenized_inputs["labels"] = torch.tensor(labels, dtype=torch.float32)
        return tokenized_inputs
omnigenome/src/lora/__init__.py
DELETED
@@ -1,12 +0,0 @@
# -*- coding: utf-8 -*-
# file: __init__.py
# time: 12:35 11/06/2025
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
# homepage: https://yangheng95.github.io
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2019-2025. All Rights Reserved.
"""
This package contains modules for LoRA (Low-Rank Adaptation) fine-tuning.
"""