omnigenome 0.3.0a1__py3-none-any.whl → 0.3.1a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omnigenome/__init__.py +16 -8
- omnigenome/auto/auto_bench/__init__.py +0 -1
- omnigenome/auto/auto_bench/auto_bench.py +24 -14
- omnigenome/auto/auto_train/__init__.py +0 -1
- omnigenome/auto/auto_train/auto_train.py +11 -12
- omnigenome/auto/bench_hub/__init__.py +0 -1
- omnigenome/auto/bench_hub/bench_hub.py +1 -1
- omnigenome/cli/__init__.py +0 -1
- omnigenome/cli/commands/__init__.py +0 -1
- omnigenome/cli/commands/base.py +10 -10
- omnigenome/cli/commands/bench/__init__.py +0 -1
- omnigenome/cli/commands/bench/bench_cli.py +10 -10
- omnigenome/cli/commands/rna/__init__.py +0 -1
- omnigenome/cli/commands/rna/rna_design.py +10 -11
- omnigenome/src/__init__.py +0 -1
- omnigenome/src/abc/__init__.py +0 -1
- omnigenome/src/abc/abstract_dataset.py +38 -19
- omnigenome/src/abc/abstract_metric.py +7 -7
- omnigenome/src/abc/abstract_model.py +15 -14
- omnigenome/src/abc/abstract_tokenizer.py +9 -7
- omnigenome/src/dataset/omni_dataset.py +16 -14
- omnigenome/src/lora/__init__.py +0 -1
- omnigenome/src/lora/lora_model.py +47 -41
- omnigenome/src/metric/classification_metric.py +11 -11
- omnigenome/src/metric/metric.py +19 -19
- omnigenome/src/metric/ranking_metric.py +15 -15
- omnigenome/src/metric/regression_metric.py +18 -18
- omnigenome/src/misc/utils.py +40 -36
- omnigenome/src/model/augmentation/__init__.py +0 -1
- omnigenome/src/model/augmentation/model.py +17 -17
- omnigenome/src/model/classification/__init__.py +0 -1
- omnigenome/src/model/classification/model.py +28 -32
- omnigenome/src/model/embedding/__init__.py +0 -1
- omnigenome/src/model/embedding/model.py +35 -35
- omnigenome/src/model/mlm/__init__.py +0 -1
- omnigenome/src/model/mlm/model.py +13 -13
- omnigenome/src/model/module_utils.py +17 -17
- omnigenome/src/model/regression/__init__.py +0 -1
- omnigenome/src/model/regression/model.py +72 -77
- omnigenome/src/model/regression/resnet.py +32 -32
- omnigenome/src/model/rna_design/__init__.py +0 -1
- omnigenome/src/model/rna_design/model.py +65 -58
- omnigenome/src/model/seq2seq/__init__.py +0 -1
- omnigenome/src/model/seq2seq/model.py +4 -4
- omnigenome/src/tokenizer/bpe_tokenizer.py +27 -27
- omnigenome/src/tokenizer/kmers_tokenizer.py +22 -22
- omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +11 -11
- omnigenome/src/trainer/accelerate_trainer.py +40 -32
- omnigenome/src/trainer/hf_trainer.py +8 -8
- omnigenome/src/trainer/trainer.py +37 -25
- omnigenome/utility/dataset_hub/__init__.py +0 -1
- omnigenome/utility/dataset_hub/dataset_hub.py +13 -13
- omnigenome/utility/ensemble.py +26 -26
- omnigenome/utility/hub_utils.py +8 -8
- omnigenome/utility/model_hub/__init__.py +0 -1
- omnigenome/utility/model_hub/model_hub.py +26 -25
- omnigenome/utility/pipeline_hub/__init__.py +0 -1
- omnigenome/utility/pipeline_hub/pipeline.py +49 -49
- omnigenome/utility/pipeline_hub/pipeline_hub.py +17 -17
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/METADATA +2 -2
- omnigenome-0.3.1a0.dist-info/RECORD +78 -0
- omnigenome-0.3.0a1.dist-info/RECORD +0 -78
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/WHEEL +0 -0
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/entry_points.txt +0 -0
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/licenses/LICENSE +0 -0
- {omnigenome-0.3.0a1.dist-info → omnigenome-0.3.1a0.dist-info}/top_level.txt +0 -0
omnigenome/__init__.py
CHANGED
|
@@ -14,7 +14,7 @@ Use dir(omnigenome) to see all available APIs.
|
|
|
14
14
|
Key API Entries:
|
|
15
15
|
----------------
|
|
16
16
|
- AutoBench: Automated benchmarking of genomic models
|
|
17
|
-
- AutoTrain: Automated training of genomic models
|
|
17
|
+
- AutoTrain: Automated training of genomic models
|
|
18
18
|
- BenchHub: Hub for accessing benchmarks
|
|
19
19
|
- ModelHub: Hub for accessing pre-trained models
|
|
20
20
|
- PipelineHub: Hub for accessing pipelines
|
|
@@ -26,7 +26,7 @@ Key API Entries:
|
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
28
|
__name__ = "omnigenbench"
|
|
29
|
-
__version__ = "0.3.
|
|
29
|
+
__version__ = "0.3.1alpha"
|
|
30
30
|
|
|
31
31
|
__author__ = "YANG, HENG"
|
|
32
32
|
__email__ = "yangheng2021@gmail.com"
|
|
@@ -117,10 +117,18 @@ from .src.abc.abstract_tokenizer import OmniTokenizer as OmniGenomeTokenizer
|
|
|
117
117
|
from .src.abc.abstract_dataset import OmniDataset as OmniGenomeDataset
|
|
118
118
|
from .src.abc.abstract_metric import OmniMetric as OmniGenomeMetric
|
|
119
119
|
from .src.abc.abstract_model import OmniModel as OmniGenomeModel
|
|
120
|
-
from .src.dataset.omni_dataset import
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
from .src.dataset.omni_dataset import
|
|
120
|
+
from .src.dataset.omni_dataset import (
|
|
121
|
+
OmniDatasetForSequenceClassification as OmniGenomeDatasetForSequenceClassification,
|
|
122
|
+
)
|
|
123
|
+
from .src.dataset.omni_dataset import (
|
|
124
|
+
OmniDatasetForSequenceRegression as OmniGenomeDatasetForSequenceRegression,
|
|
125
|
+
)
|
|
126
|
+
from .src.dataset.omni_dataset import (
|
|
127
|
+
OmniDatasetForTokenClassification as OmniGenomeDatasetForTokenClassification,
|
|
128
|
+
)
|
|
129
|
+
from .src.dataset.omni_dataset import (
|
|
130
|
+
OmniDatasetForTokenRegression as OmniGenomeDatasetForTokenRegression,
|
|
131
|
+
)
|
|
124
132
|
from .src.lora.lora_model import OmniLoraModel as OmniGenomeLoraModel
|
|
125
133
|
from .src.model import (
|
|
126
134
|
OmniModelForSequenceClassification as OmniGenomeModelForSequenceClassification,
|
|
@@ -136,10 +144,10 @@ from .src.model import (
|
|
|
136
144
|
OmniModelForRNADesign as OmniGenomeModelForRNADesign,
|
|
137
145
|
OmniModelForEmbedding as OmniGenomeModelForEmbedding,
|
|
138
146
|
OmniModelForAugmentation as OmniGenomeModelForAugmentation,
|
|
139
|
-
|
|
140
147
|
)
|
|
141
148
|
|
|
142
149
|
from .utility.ensemble import VoteEnsemblePredictor
|
|
150
|
+
|
|
143
151
|
# ------------------------------------------------------------------------------
|
|
144
152
|
|
|
145
153
|
|
|
@@ -192,7 +200,7 @@ __all__ = [
|
|
|
192
200
|
"download_benchmark",
|
|
193
201
|
"download_model",
|
|
194
202
|
"download_pipeline",
|
|
195
|
-
"VoteEnsemblePredictor"
|
|
203
|
+
"VoteEnsemblePredictor",
|
|
196
204
|
]
|
|
197
205
|
|
|
198
206
|
|
|
@@ -34,18 +34,18 @@ from ... import __version__ as omnigenome_version
|
|
|
34
34
|
class AutoBench:
|
|
35
35
|
"""
|
|
36
36
|
AutoBench is a class for automatically benchmarking genomic foundation models.
|
|
37
|
-
|
|
37
|
+
|
|
38
38
|
This class provides a comprehensive framework for evaluating genomic models
|
|
39
39
|
across multiple benchmarks and tasks. It handles loading benchmarks, models,
|
|
40
40
|
tokenizers, and running evaluations with proper metric tracking and result
|
|
41
41
|
visualization.
|
|
42
|
-
|
|
42
|
+
|
|
43
43
|
AutoBench supports various evaluation scenarios including:
|
|
44
44
|
- Single model evaluation across multiple benchmarks
|
|
45
45
|
- Multi-seed evaluation for robustness testing
|
|
46
46
|
- Different trainer backends (native, accelerate, huggingface)
|
|
47
47
|
- Automatic metric visualization and result tracking
|
|
48
|
-
|
|
48
|
+
|
|
49
49
|
Attributes:
|
|
50
50
|
benchmark (str): The name or path of the benchmark to use.
|
|
51
51
|
model_name_or_path (str): The name or path of the model to evaluate.
|
|
@@ -73,19 +73,19 @@ class AutoBench:
|
|
|
73
73
|
model_name_or_path (str): The name or path of the model to evaluate.
|
|
74
74
|
tokenizer: The tokenizer to use. If None, it will be loaded from the model path.
|
|
75
75
|
**kwargs: Additional keyword arguments.
|
|
76
|
-
- autocast (str): The autocast precision to use ('fp16', 'bf16', etc.).
|
|
76
|
+
- autocast (str): The autocast precision to use ('fp16', 'bf16', etc.).
|
|
77
77
|
Defaults to 'fp16'.
|
|
78
|
-
- overwrite (bool): Whether to overwrite existing evaluation results.
|
|
78
|
+
- overwrite (bool): Whether to overwrite existing evaluation results.
|
|
79
79
|
Defaults to False.
|
|
80
|
-
- trainer (str): The trainer to use ('native', 'accelerate', 'hf_trainer').
|
|
80
|
+
- trainer (str): The trainer to use ('native', 'accelerate', 'hf_trainer').
|
|
81
81
|
Defaults to 'native'.
|
|
82
82
|
|
|
83
83
|
Example:
|
|
84
84
|
>>> # Initialize with a benchmark and model
|
|
85
85
|
>>> bench = AutoBench("RGB", "model_name")
|
|
86
|
-
|
|
86
|
+
|
|
87
87
|
>>> # Initialize with custom settings
|
|
88
|
-
>>> bench = AutoBench("RGB", "model_name",
|
|
88
|
+
>>> bench = AutoBench("RGB", "model_name",
|
|
89
89
|
... autocast="bf16", trainer="accelerate")
|
|
90
90
|
"""
|
|
91
91
|
self.benchmark = benchmark.rstrip("/")
|
|
@@ -137,7 +137,7 @@ class AutoBench:
|
|
|
137
137
|
def bench_info(self):
|
|
138
138
|
"""
|
|
139
139
|
Prints and returns information about the current benchmark setup.
|
|
140
|
-
|
|
140
|
+
|
|
141
141
|
This method provides a comprehensive overview of the current
|
|
142
142
|
benchmark configuration, including benchmark details, model information,
|
|
143
143
|
and evaluation settings.
|
|
@@ -161,7 +161,7 @@ class AutoBench:
|
|
|
161
161
|
def run(self, **kwargs):
|
|
162
162
|
"""
|
|
163
163
|
Runs the benchmarking process.
|
|
164
|
-
|
|
164
|
+
|
|
165
165
|
This method iterates through the tasks in the benchmark, loads the corresponding
|
|
166
166
|
configurations, initializes the model, tokenizer, and datasets, and then
|
|
167
167
|
trains and evaluates the model. It supports multiple evaluation seeds and
|
|
@@ -174,7 +174,7 @@ class AutoBench:
|
|
|
174
174
|
Example:
|
|
175
175
|
>>> # Run benchmarking with default settings
|
|
176
176
|
>>> bench.run()
|
|
177
|
-
|
|
177
|
+
|
|
178
178
|
>>> # Run with custom parameters
|
|
179
179
|
>>> bench.run(learning_rate=1e-4, batch_size=16)
|
|
180
180
|
"""
|
|
@@ -218,7 +218,11 @@ class AutoBench:
|
|
|
218
218
|
for key, value in _kwargs.items():
|
|
219
219
|
if key in bench_config:
|
|
220
220
|
fprint(
|
|
221
|
-
"Override",
|
|
221
|
+
"Override",
|
|
222
|
+
key,
|
|
223
|
+
"with",
|
|
224
|
+
value,
|
|
225
|
+
"according to the input kwargs",
|
|
222
226
|
)
|
|
223
227
|
bench_config.update({key: value})
|
|
224
228
|
|
|
@@ -239,7 +243,11 @@ class AutoBench:
|
|
|
239
243
|
for key, value in _kwargs.items():
|
|
240
244
|
if key in bench_config:
|
|
241
245
|
fprint(
|
|
242
|
-
"Override",
|
|
246
|
+
"Override",
|
|
247
|
+
key,
|
|
248
|
+
"with",
|
|
249
|
+
value,
|
|
250
|
+
"according to the input kwargs",
|
|
243
251
|
)
|
|
244
252
|
bench_config.update({key: value})
|
|
245
253
|
|
|
@@ -290,7 +298,9 @@ class AutoBench:
|
|
|
290
298
|
fprint(f"\n{model}")
|
|
291
299
|
|
|
292
300
|
if kwargs.get("lora_config", None) is not None:
|
|
293
|
-
fprint(
|
|
301
|
+
fprint(
|
|
302
|
+
"Applying LoRA to the model with config:", kwargs["lora_config"]
|
|
303
|
+
)
|
|
294
304
|
model = OmniLoraModel(model, **kwargs.get("lora_config", {}))
|
|
295
305
|
|
|
296
306
|
# Init Trainer
|
|
@@ -33,17 +33,17 @@ autotrain_evaluations = "./autotrain_evaluations"
|
|
|
33
33
|
class AutoTrain:
|
|
34
34
|
"""
|
|
35
35
|
AutoTrain is a class for automatically training genomic foundation models on a given dataset.
|
|
36
|
-
|
|
36
|
+
|
|
37
37
|
This class provides a comprehensive framework for training genomic models
|
|
38
38
|
on various datasets with minimal configuration. It handles dataset loading,
|
|
39
39
|
model initialization, training configuration, and result tracking.
|
|
40
|
-
|
|
40
|
+
|
|
41
41
|
AutoTrain supports various training scenarios including:
|
|
42
42
|
- Single dataset training with multiple seeds
|
|
43
43
|
- Different trainer backends (native, accelerate, huggingface)
|
|
44
44
|
- Automatic metric visualization and result tracking
|
|
45
45
|
- Configurable training parameters
|
|
46
|
-
|
|
46
|
+
|
|
47
47
|
Attributes:
|
|
48
48
|
dataset (str): The name or path of the dataset to use for training.
|
|
49
49
|
model_name_or_path (str): The name or path of the model to train.
|
|
@@ -70,19 +70,19 @@ class AutoTrain:
|
|
|
70
70
|
model_name_or_path (str): The name or path of the model to train.
|
|
71
71
|
tokenizer: The tokenizer to use. If None, it will be loaded from the model path.
|
|
72
72
|
**kwargs: Additional keyword arguments.
|
|
73
|
-
- autocast (str): The autocast precision to use ('fp16', 'bf16', etc.).
|
|
73
|
+
- autocast (str): The autocast precision to use ('fp16', 'bf16', etc.).
|
|
74
74
|
Defaults to 'fp16'.
|
|
75
|
-
- overwrite (bool): Whether to overwrite existing training results.
|
|
75
|
+
- overwrite (bool): Whether to overwrite existing training results.
|
|
76
76
|
Defaults to False.
|
|
77
|
-
- trainer (str): The trainer to use ('native', 'accelerate', 'hf_trainer').
|
|
77
|
+
- trainer (str): The trainer to use ('native', 'accelerate', 'hf_trainer').
|
|
78
78
|
Defaults to 'accelerate'.
|
|
79
79
|
|
|
80
80
|
Example:
|
|
81
81
|
>>> # Initialize with a dataset and model
|
|
82
82
|
>>> trainer = AutoTrain("dataset_name", "model_name")
|
|
83
|
-
|
|
83
|
+
|
|
84
84
|
>>> # Initialize with custom settings
|
|
85
|
-
>>> trainer = AutoTrain("dataset_name", "model_name",
|
|
85
|
+
>>> trainer = AutoTrain("dataset_name", "model_name",
|
|
86
86
|
... autocast="bf16", trainer="accelerate")
|
|
87
87
|
"""
|
|
88
88
|
self.dataset = dataset.rstrip("/")
|
|
@@ -118,7 +118,7 @@ class AutoTrain:
|
|
|
118
118
|
def bench_info(self):
|
|
119
119
|
"""
|
|
120
120
|
Print and return information about the current training setup.
|
|
121
|
-
|
|
121
|
+
|
|
122
122
|
This method provides a comprehensive overview of the current
|
|
123
123
|
training configuration, including dataset details, model information,
|
|
124
124
|
and training settings.
|
|
@@ -140,7 +140,7 @@ class AutoTrain:
|
|
|
140
140
|
def run(self, **kwargs):
|
|
141
141
|
"""
|
|
142
142
|
Run the training process.
|
|
143
|
-
|
|
143
|
+
|
|
144
144
|
This method loads the dataset configuration, initializes the model and
|
|
145
145
|
tokenizer, and runs training across multiple seeds. It supports various
|
|
146
146
|
training backends and automatic result tracking.
|
|
@@ -152,12 +152,11 @@ class AutoTrain:
|
|
|
152
152
|
Example:
|
|
153
153
|
>>> # Run training with default settings
|
|
154
154
|
>>> trainer.run()
|
|
155
|
-
|
|
155
|
+
|
|
156
156
|
>>> # Run with custom parameters
|
|
157
157
|
>>> trainer.run(learning_rate=1e-4, batch_size=16)
|
|
158
158
|
"""
|
|
159
159
|
|
|
160
|
-
|
|
161
160
|
clean_temp_checkpoint(1) # clean temp checkpoint older than 1 day
|
|
162
161
|
|
|
163
162
|
_kwargs = kwargs.copy()
|
omnigenome/cli/__init__.py
CHANGED
omnigenome/cli/commands/base.py
CHANGED
|
@@ -13,15 +13,15 @@ from abc import ABC, abstractmethod
|
|
|
13
13
|
class BaseCommand(ABC):
|
|
14
14
|
"""
|
|
15
15
|
Abstract base class for all CLI commands in OmniGenome.
|
|
16
|
-
|
|
16
|
+
|
|
17
17
|
This class provides a common interface for all command-line interface
|
|
18
18
|
commands in the OmniGenome framework. It defines the structure that
|
|
19
19
|
all command classes must follow, including registration and common
|
|
20
20
|
argument handling.
|
|
21
|
-
|
|
21
|
+
|
|
22
22
|
Subclasses must implement the `register_command` method to define
|
|
23
23
|
their specific command-line interface and arguments.
|
|
24
|
-
|
|
24
|
+
|
|
25
25
|
Example:
|
|
26
26
|
>>> class MyCommand(BaseCommand):
|
|
27
27
|
... @classmethod
|
|
@@ -29,7 +29,7 @@ class BaseCommand(ABC):
|
|
|
29
29
|
... parser = subparsers.add_parser("mycommand", help="My command")
|
|
30
30
|
... parser.add_argument("--input", required=True)
|
|
31
31
|
... parser.set_defaults(func=cls.execute)
|
|
32
|
-
...
|
|
32
|
+
...
|
|
33
33
|
... @staticmethod
|
|
34
34
|
... def execute(args):
|
|
35
35
|
... print(f"Executing with input: {args.input}")
|
|
@@ -40,14 +40,14 @@ class BaseCommand(ABC):
|
|
|
40
40
|
def register_command(cls, subparsers):
|
|
41
41
|
"""
|
|
42
42
|
Register the command and its arguments with the main parser.
|
|
43
|
-
|
|
43
|
+
|
|
44
44
|
This abstract method must be implemented by all subclasses to define
|
|
45
45
|
their specific command-line interface, including arguments, help text,
|
|
46
46
|
and default functions.
|
|
47
|
-
|
|
47
|
+
|
|
48
48
|
Args:
|
|
49
49
|
subparsers: The subparsers object from the main ArgumentParser
|
|
50
|
-
|
|
50
|
+
|
|
51
51
|
Example:
|
|
52
52
|
>>> parser = argparse.ArgumentParser()
|
|
53
53
|
>>> subparsers = parser.add_subparsers()
|
|
@@ -59,13 +59,13 @@ class BaseCommand(ABC):
|
|
|
59
59
|
def add_common_arguments(cls, parser):
|
|
60
60
|
"""
|
|
61
61
|
Add common arguments to a command's parser.
|
|
62
|
-
|
|
62
|
+
|
|
63
63
|
This method adds standard arguments that are common across all
|
|
64
64
|
OmniGenome CLI commands, such as logging level and output directory.
|
|
65
|
-
|
|
65
|
+
|
|
66
66
|
Args:
|
|
67
67
|
parser: The ArgumentParser for the specific command
|
|
68
|
-
|
|
68
|
+
|
|
69
69
|
Example:
|
|
70
70
|
>>> parser = argparse.ArgumentParser()
|
|
71
71
|
>>> BaseCommand.add_common_arguments(parser)
|
|
@@ -43,18 +43,18 @@ class BenchCommand(BaseCommand):
|
|
|
43
43
|
... --bs_scale 2 \
|
|
44
44
|
... --overwrite True
|
|
45
45
|
"""
|
|
46
|
-
|
|
46
|
+
|
|
47
47
|
@classmethod
|
|
48
48
|
def register_command(cls, subparsers):
|
|
49
49
|
"""
|
|
50
50
|
Register the autobench command with the argument parser.
|
|
51
|
-
|
|
51
|
+
|
|
52
52
|
This method sets up the command-line interface for the autobench functionality,
|
|
53
53
|
including all necessary arguments and their descriptions.
|
|
54
|
-
|
|
54
|
+
|
|
55
55
|
Args:
|
|
56
56
|
subparsers: The subparsers object from argparse to add the command to
|
|
57
|
-
|
|
57
|
+
|
|
58
58
|
Example:
|
|
59
59
|
>>> parser = argparse.ArgumentParser()
|
|
60
60
|
>>> subparsers = parser.add_subparsers()
|
|
@@ -122,15 +122,15 @@ class BenchCommand(BaseCommand):
|
|
|
122
122
|
def execute(args: argparse.Namespace):
|
|
123
123
|
"""
|
|
124
124
|
Execute the autobench command with the provided arguments.
|
|
125
|
-
|
|
125
|
+
|
|
126
126
|
This method runs the automated benchmarking process using the AutoBench
|
|
127
127
|
class. It handles model and tokenizer loading, benchmark execution,
|
|
128
128
|
and result logging.
|
|
129
|
-
|
|
129
|
+
|
|
130
130
|
Args:
|
|
131
131
|
args (argparse.Namespace): Parsed command-line arguments containing
|
|
132
132
|
benchmark configuration and model settings
|
|
133
|
-
|
|
133
|
+
|
|
134
134
|
Example:
|
|
135
135
|
>>> args = parser.parse_args(['autobench', '--model', 'model_name'])
|
|
136
136
|
>>> BenchCommand.execute(args)
|
|
@@ -187,13 +187,13 @@ class BenchCommand(BaseCommand):
|
|
|
187
187
|
def register_command(subparsers):
|
|
188
188
|
"""
|
|
189
189
|
Register the autobench command with the CLI.
|
|
190
|
-
|
|
190
|
+
|
|
191
191
|
This function is a convenience wrapper for registering the BenchCommand
|
|
192
192
|
with the argument parser.
|
|
193
|
-
|
|
193
|
+
|
|
194
194
|
Args:
|
|
195
195
|
subparsers: The subparsers object from argparse to add the command to
|
|
196
|
-
|
|
196
|
+
|
|
197
197
|
Example:
|
|
198
198
|
>>> parser = argparse.ArgumentParser()
|
|
199
199
|
>>> subparsers = parser.add_subparsers()
|
|
@@ -54,13 +54,13 @@ class RNADesignCommand(BaseCommand):
|
|
|
54
54
|
def register_command(cls, subparsers):
|
|
55
55
|
"""
|
|
56
56
|
Register the RNA design command with the argument parser.
|
|
57
|
-
|
|
57
|
+
|
|
58
58
|
This method sets up the command-line interface for RNA sequence design,
|
|
59
59
|
including all necessary arguments and their descriptions.
|
|
60
|
-
|
|
60
|
+
|
|
61
61
|
Args:
|
|
62
62
|
subparsers: The subparsers object from argparse to add the command to
|
|
63
|
-
|
|
63
|
+
|
|
64
64
|
Example:
|
|
65
65
|
>>> parser = argparse.ArgumentParser()
|
|
66
66
|
>>> subparsers = parser.add_subparsers()
|
|
@@ -109,18 +109,18 @@ class RNADesignCommand(BaseCommand):
|
|
|
109
109
|
def execute(args: argparse.Namespace):
|
|
110
110
|
"""
|
|
111
111
|
Execute the RNA design command with the provided arguments.
|
|
112
|
-
|
|
112
|
+
|
|
113
113
|
This method runs the RNA sequence design process using genetic algorithms.
|
|
114
114
|
It validates parameters, loads the model, runs the design optimization,
|
|
115
115
|
and outputs or saves the results.
|
|
116
|
-
|
|
116
|
+
|
|
117
117
|
Args:
|
|
118
118
|
args (argparse.Namespace): Parsed command-line arguments containing
|
|
119
119
|
design parameters and model settings
|
|
120
|
-
|
|
120
|
+
|
|
121
121
|
Raises:
|
|
122
122
|
ValueError: If mutation_ratio is not between 0.0 and 1.0
|
|
123
|
-
|
|
123
|
+
|
|
124
124
|
Example:
|
|
125
125
|
>>> args = parser.parse_args(['design', '--structure', '(((...)))'])
|
|
126
126
|
>>> RNADesignCommand.execute(args)
|
|
@@ -162,17 +162,16 @@ class RNADesignCommand(BaseCommand):
|
|
|
162
162
|
def register_command(subparsers):
|
|
163
163
|
"""
|
|
164
164
|
Register the RNA design command with the CLI.
|
|
165
|
-
|
|
165
|
+
|
|
166
166
|
This function is a convenience wrapper for registering the RNADesignCommand
|
|
167
167
|
with the argument parser.
|
|
168
|
-
|
|
168
|
+
|
|
169
169
|
Args:
|
|
170
170
|
subparsers: The subparsers object from argparse to add the command to
|
|
171
|
-
|
|
171
|
+
|
|
172
172
|
Example:
|
|
173
173
|
>>> parser = argparse.ArgumentParser()
|
|
174
174
|
>>> subparsers = parser.add_subparsers()
|
|
175
175
|
>>> register_command(subparsers)
|
|
176
176
|
"""
|
|
177
177
|
RNADesignCommand.register_command(subparsers)
|
|
178
|
-
|
omnigenome/src/__init__.py
CHANGED
omnigenome/src/abc/__init__.py
CHANGED
|
@@ -56,7 +56,7 @@ def covert_input_to_tensor(data):
|
|
|
56
56
|
class OmniGenomeDict(dict):
|
|
57
57
|
"""
|
|
58
58
|
A dictionary subclass that allows moving all tensor values to a specified device.
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
This class extends the standard Python dictionary to provide a convenient
|
|
61
61
|
method for moving all tensor values to a specific device (CPU/GPU).
|
|
62
62
|
"""
|
|
@@ -87,14 +87,14 @@ class OmniGenomeDict(dict):
|
|
|
87
87
|
class OmniDataset(torch.utils.data.Dataset):
|
|
88
88
|
"""
|
|
89
89
|
Abstract base class for all datasets in OmniGenome.
|
|
90
|
-
|
|
90
|
+
|
|
91
91
|
This class provides a unified interface for genomic datasets in the OmniGenome
|
|
92
92
|
framework. It handles data loading, preprocessing, tokenization, and provides
|
|
93
93
|
a PyTorch-compatible dataset interface.
|
|
94
|
-
|
|
94
|
+
|
|
95
95
|
The class supports various data formats and can handle different types of
|
|
96
96
|
genomic tasks including classification, regression, and token-level tasks.
|
|
97
|
-
|
|
97
|
+
|
|
98
98
|
Attributes:
|
|
99
99
|
tokenizer: The tokenizer to use for processing sequences.
|
|
100
100
|
max_length (int): The maximum sequence length for tokenization.
|
|
@@ -118,17 +118,17 @@ class OmniDataset(torch.utils.data.Dataset):
|
|
|
118
118
|
**kwargs: Additional keyword arguments.
|
|
119
119
|
- label2id (dict): A mapping from labels to integer IDs.
|
|
120
120
|
- shuffle (bool): Whether to shuffle the data. Defaults to True.
|
|
121
|
-
- structure_in (bool): Whether to include secondary structure
|
|
121
|
+
- structure_in (bool): Whether to include secondary structure
|
|
122
122
|
information. Defaults to False.
|
|
123
|
-
- drop_long_seq (bool): Whether to drop sequences longer than
|
|
123
|
+
- drop_long_seq (bool): Whether to drop sequences longer than
|
|
124
124
|
max_length. Defaults to False.
|
|
125
125
|
|
|
126
126
|
Example:
|
|
127
127
|
>>> # Initialize with a single data file
|
|
128
128
|
>>> dataset = OmniDataset("data.json", tokenizer, max_length=512)
|
|
129
|
-
|
|
129
|
+
|
|
130
130
|
>>> # Initialize with label mapping
|
|
131
|
-
>>> dataset = OmniDataset("data.json", tokenizer,
|
|
131
|
+
>>> dataset = OmniDataset("data.json", tokenizer,
|
|
132
132
|
... label2id={"A": 0, "B": 1})
|
|
133
133
|
"""
|
|
134
134
|
super(OmniDataset, self).__init__()
|
|
@@ -158,9 +158,7 @@ class OmniDataset(torch.utils.data.Dataset):
|
|
|
158
158
|
)
|
|
159
159
|
self.max_length = self.tokenizer.max_length
|
|
160
160
|
else:
|
|
161
|
-
fprint(
|
|
162
|
-
f"No max_length detected, using default max_length=512."
|
|
163
|
-
)
|
|
161
|
+
fprint(f"No max_length detected, using default max_length=512.")
|
|
164
162
|
self.max_length = 512
|
|
165
163
|
|
|
166
164
|
self.tokenizer.max_length = self.max_length
|
|
@@ -417,23 +415,44 @@ class OmniDataset(torch.utils.data.Dataset):
|
|
|
417
415
|
lines = f.readlines()
|
|
418
416
|
for line in lines:
|
|
419
417
|
examples.append({"text": line.strip()})
|
|
420
|
-
elif data_source.endswith(
|
|
418
|
+
elif data_source.endswith(
|
|
419
|
+
(".fasta", ".fa", ".fna", ".ffn", ".faa", ".frn")
|
|
420
|
+
):
|
|
421
421
|
try:
|
|
422
422
|
from Bio import SeqIO
|
|
423
423
|
except ImportError:
|
|
424
|
-
raise ImportError(
|
|
424
|
+
raise ImportError(
|
|
425
|
+
"Biopython is required for FASTA parsing. Please install with 'pip install biopython'."
|
|
426
|
+
)
|
|
425
427
|
for record in SeqIO.parse(data_source, "fasta"):
|
|
426
|
-
examples.append(
|
|
427
|
-
|
|
428
|
+
examples.append(
|
|
429
|
+
{
|
|
430
|
+
"id": record.id,
|
|
431
|
+
"sequence": str(record.seq),
|
|
432
|
+
"description": record.description,
|
|
433
|
+
}
|
|
434
|
+
)
|
|
435
|
+
elif data_source.endswith((".fastq", ".fq")):
|
|
428
436
|
try:
|
|
429
437
|
from Bio import SeqIO
|
|
430
438
|
except ImportError:
|
|
431
|
-
raise ImportError(
|
|
439
|
+
raise ImportError(
|
|
440
|
+
"Biopython is required for FASTQ parsing. Please install with 'pip install biopython'."
|
|
441
|
+
)
|
|
432
442
|
for record in SeqIO.parse(data_source, "fastq"):
|
|
433
|
-
examples.append(
|
|
434
|
-
|
|
443
|
+
examples.append(
|
|
444
|
+
{
|
|
445
|
+
"id": record.id,
|
|
446
|
+
"sequence": str(record.seq),
|
|
447
|
+
"quality": record.letter_annotations.get(
|
|
448
|
+
"phred_quality", []
|
|
449
|
+
),
|
|
450
|
+
}
|
|
451
|
+
)
|
|
452
|
+
elif data_source.endswith(".bed"):
|
|
435
453
|
import pandas as pd
|
|
436
|
-
|
|
454
|
+
|
|
455
|
+
df = pd.read_csv(data_source, sep="\t", comment="#")
|
|
437
456
|
# Assign column names for standard BED fields
|
|
438
457
|
for _, row in df.iterrows():
|
|
439
458
|
examples.append(row.to_dict())
|
|
@@ -15,17 +15,17 @@ from ..misc.utils import env_meta_info
|
|
|
15
15
|
class OmniMetric:
|
|
16
16
|
"""
|
|
17
17
|
Abstract base class for all metrics in OmniGenome, based on scikit-learn.
|
|
18
|
-
|
|
18
|
+
|
|
19
19
|
This class provides a unified interface for evaluation metrics in the OmniGenome
|
|
20
20
|
framework. It integrates with scikit-learn's metric functions and provides
|
|
21
21
|
additional functionality for handling genomic data evaluation.
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
The class automatically exposes all scikit-learn metrics as attributes,
|
|
24
24
|
making them easily accessible for evaluation tasks.
|
|
25
|
-
|
|
25
|
+
|
|
26
26
|
Attributes:
|
|
27
27
|
metric_func (callable): A callable metric function from `sklearn.metrics`.
|
|
28
|
-
ignore_y (any): A value in the ground truth labels to be ignored during
|
|
28
|
+
ignore_y (any): A value in the ground truth labels to be ignored during
|
|
29
29
|
metric computation.
|
|
30
30
|
metadata (dict): Metadata about the metric including version info.
|
|
31
31
|
"""
|
|
@@ -35,10 +35,10 @@ class OmniMetric:
|
|
|
35
35
|
Initializes the metric.
|
|
36
36
|
|
|
37
37
|
Args:
|
|
38
|
-
metric_func (callable, optional): A callable metric function from
|
|
38
|
+
metric_func (callable, optional): A callable metric function from
|
|
39
39
|
`sklearn.metrics`. If None, subclasses
|
|
40
40
|
should implement their own compute method.
|
|
41
|
-
ignore_y (any, optional): A value in the ground truth labels to be
|
|
41
|
+
ignore_y (any, optional): A value in the ground truth labels to be
|
|
42
42
|
ignored during metric computation.
|
|
43
43
|
*args: Additional positional arguments.
|
|
44
44
|
**kwargs: Additional keyword arguments.
|
|
@@ -46,7 +46,7 @@ class OmniMetric:
|
|
|
46
46
|
Example:
|
|
47
47
|
>>> # Initialize with a specific metric function
|
|
48
48
|
>>> metric = OmniMetric(metrics.accuracy_score)
|
|
49
|
-
|
|
49
|
+
|
|
50
50
|
>>> # Initialize with ignore value
|
|
51
51
|
>>> metric = OmniMetric(ignore_y=-100)
|
|
52
52
|
"""
|