omnigenome 0.3.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of omnigenome might be problematic. Click here for more details.
- omnigenome/__init__.py +281 -0
- omnigenome/auto/__init__.py +3 -0
- omnigenome/auto/auto_bench/__init__.py +12 -0
- omnigenome/auto/auto_bench/auto_bench.py +484 -0
- omnigenome/auto/auto_bench/auto_bench_cli.py +230 -0
- omnigenome/auto/auto_bench/auto_bench_config.py +216 -0
- omnigenome/auto/auto_bench/config_check.py +34 -0
- omnigenome/auto/auto_train/__init__.py +13 -0
- omnigenome/auto/auto_train/auto_train.py +430 -0
- omnigenome/auto/auto_train/auto_train_cli.py +222 -0
- omnigenome/auto/bench_hub/__init__.py +12 -0
- omnigenome/auto/bench_hub/bench_hub.py +25 -0
- omnigenome/cli/__init__.py +13 -0
- omnigenome/cli/commands/__init__.py +13 -0
- omnigenome/cli/commands/base.py +83 -0
- omnigenome/cli/commands/bench/__init__.py +13 -0
- omnigenome/cli/commands/bench/bench_cli.py +202 -0
- omnigenome/cli/commands/rna/__init__.py +13 -0
- omnigenome/cli/commands/rna/rna_design.py +178 -0
- omnigenome/cli/omnigenome_cli.py +128 -0
- omnigenome/src/__init__.py +12 -0
- omnigenome/src/abc/__init__.py +12 -0
- omnigenome/src/abc/abstract_dataset.py +622 -0
- omnigenome/src/abc/abstract_metric.py +114 -0
- omnigenome/src/abc/abstract_model.py +689 -0
- omnigenome/src/abc/abstract_tokenizer.py +267 -0
- omnigenome/src/dataset/__init__.py +16 -0
- omnigenome/src/dataset/omni_dataset.py +435 -0
- omnigenome/src/lora/__init__.py +13 -0
- omnigenome/src/lora/lora_model.py +294 -0
- omnigenome/src/metric/__init__.py +15 -0
- omnigenome/src/metric/classification_metric.py +184 -0
- omnigenome/src/metric/metric.py +199 -0
- omnigenome/src/metric/ranking_metric.py +142 -0
- omnigenome/src/metric/regression_metric.py +191 -0
- omnigenome/src/misc/__init__.py +3 -0
- omnigenome/src/misc/utils.py +439 -0
- omnigenome/src/model/__init__.py +19 -0
- omnigenome/src/model/augmentation/__init__.py +12 -0
- omnigenome/src/model/augmentation/model.py +219 -0
- omnigenome/src/model/classification/__init__.py +12 -0
- omnigenome/src/model/classification/model.py +642 -0
- omnigenome/src/model/embedding/__init__.py +12 -0
- omnigenome/src/model/embedding/model.py +263 -0
- omnigenome/src/model/mlm/__init__.py +12 -0
- omnigenome/src/model/mlm/model.py +177 -0
- omnigenome/src/model/module_utils.py +232 -0
- omnigenome/src/model/regression/__init__.py +12 -0
- omnigenome/src/model/regression/model.py +786 -0
- omnigenome/src/model/regression/resnet.py +483 -0
- omnigenome/src/model/rna_design/__init__.py +12 -0
- omnigenome/src/model/rna_design/model.py +426 -0
- omnigenome/src/model/seq2seq/__init__.py +12 -0
- omnigenome/src/model/seq2seq/model.py +44 -0
- omnigenome/src/tokenizer/__init__.py +16 -0
- omnigenome/src/tokenizer/bpe_tokenizer.py +226 -0
- omnigenome/src/tokenizer/kmers_tokenizer.py +247 -0
- omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +249 -0
- omnigenome/src/trainer/__init__.py +14 -0
- omnigenome/src/trainer/accelerate_trainer.py +739 -0
- omnigenome/src/trainer/hf_trainer.py +75 -0
- omnigenome/src/trainer/trainer.py +579 -0
- omnigenome/utility/__init__.py +3 -0
- omnigenome/utility/dataset_hub/__init__.py +13 -0
- omnigenome/utility/dataset_hub/dataset_hub.py +178 -0
- omnigenome/utility/ensemble.py +324 -0
- omnigenome/utility/hub_utils.py +517 -0
- omnigenome/utility/model_hub/__init__.py +12 -0
- omnigenome/utility/model_hub/model_hub.py +231 -0
- omnigenome/utility/pipeline_hub/__init__.py +12 -0
- omnigenome/utility/pipeline_hub/pipeline.py +483 -0
- omnigenome/utility/pipeline_hub/pipeline_hub.py +129 -0
- omnigenome-0.3.0a0.dist-info/METADATA +224 -0
- omnigenome-0.3.0a0.dist-info/RECORD +85 -0
- omnigenome-0.3.0a0.dist-info/WHEEL +5 -0
- omnigenome-0.3.0a0.dist-info/entry_points.txt +3 -0
- omnigenome-0.3.0a0.dist-info/licenses/LICENSE +201 -0
- omnigenome-0.3.0a0.dist-info/top_level.txt +2 -0
- tests/__init__.py +9 -0
- tests/conftest.py +160 -0
- tests/test_dataset_patterns.py +291 -0
- tests/test_examples_syntax.py +83 -0
- tests/test_model_loading.py +183 -0
- tests/test_rna_functions.py +255 -0
- tests/test_training_patterns.py +302 -0
omnigenome/__init__.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# file: __init__.py
|
|
3
|
+
# time: 14:53 06/04/2024
|
|
4
|
+
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
|
|
5
|
+
# github: https://github.com/yangheng95
|
|
6
|
+
# huggingface: https://huggingface.co/yangheng
|
|
7
|
+
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
|
|
8
|
+
# Copyright (C) 2019-2024. All Rights Reserved.
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
OmniGenome: A comprehensive toolkit for genomic foundation models.
|
|
12
|
+
|
|
13
|
+
This package provides a suite of tools for working with genomic data, including:
|
|
14
|
+
- Automated benchmarking and training pipelines.
|
|
15
|
+
- A hub for accessing pre-trained models, datasets, and pipelines.
|
|
16
|
+
- A flexible and extensible framework for building custom models and tasks.
|
|
17
|
+
|
|
18
|
+
This __init__.py file exposes the core components of the library for easy access.
|
|
19
|
+
|
|
20
|
+
Key Components:
|
|
21
|
+
---------------
|
|
22
|
+
- AutoBench: Automated benchmarking of genomic models
|
|
23
|
+
- AutoTrain: Automated training of genomic models
|
|
24
|
+
- BenchHub: Hub for accessing benchmarks
|
|
25
|
+
- ModelHub: Hub for accessing pre-trained models
|
|
26
|
+
- PipelineHub: Hub for accessing pipelines
|
|
27
|
+
- Various model classes for different genomic tasks
|
|
28
|
+
- Dataset classes for different data formats
|
|
29
|
+
- Tokenizer classes for different sequence representations
|
|
30
|
+
- Metric classes for evaluation
|
|
31
|
+
- Trainer classes for model training
|
|
32
|
+
|
|
33
|
+
Example Usage:
|
|
34
|
+
--------------
|
|
35
|
+
```python
|
|
36
|
+
from omnigenome import AutoBench, AutoTrain, OmniModelForSequenceClassification
|
|
37
|
+
|
|
38
|
+
# Run automated benchmarking
|
|
39
|
+
bench = AutoBench("RGB", "model_name")
|
|
40
|
+
bench.run()
|
|
41
|
+
|
|
42
|
+
# Train a model
|
|
43
|
+
trainer = AutoTrain("RGB", "model_name")
|
|
44
|
+
trainer.run()
|
|
45
|
+
|
|
46
|
+
# Use a specific model
|
|
47
|
+
model = OmniModelForSequenceClassification("model_path", tokenizer)
|
|
48
|
+
```
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
__name__ = "omnigenome"
|
|
52
|
+
__version__ = "0.3.0alpha"
|
|
53
|
+
|
|
54
|
+
__author__ = "YANG, HENG"
|
|
55
|
+
__email__ = "yangheng2021@gmail.com"
|
|
56
|
+
__license__ = "Apache-2.0"
|
|
57
|
+
|
|
58
|
+
# Import core auto components
|
|
59
|
+
from .auto.auto_bench.auto_bench import AutoBench
|
|
60
|
+
from .auto.auto_bench.auto_bench_config import AutoBenchConfig
|
|
61
|
+
from .auto.bench_hub.bench_hub import BenchHub
|
|
62
|
+
from .auto.auto_train.auto_train import AutoTrain
|
|
63
|
+
from .auto.auto_bench.auto_bench_cli import run_bench, bench_command
|
|
64
|
+
from .auto.auto_train.auto_train_cli import run_train, train_command
|
|
65
|
+
|
|
66
|
+
# Import source modules
|
|
67
|
+
from .src import dataset as dataset
|
|
68
|
+
from .src import metric as metric
|
|
69
|
+
from .src import model as model
|
|
70
|
+
from .src import tokenizer as tokenizer
|
|
71
|
+
|
|
72
|
+
# Import abstract base classes
|
|
73
|
+
from .src.abc.abstract_dataset import OmniDataset
|
|
74
|
+
from .src.abc.abstract_metric import OmniMetric
|
|
75
|
+
from .src.abc.abstract_model import OmniModel
|
|
76
|
+
from .src.abc.abstract_tokenizer import OmniTokenizer
|
|
77
|
+
from .src.abc.abstract_tokenizer import OmniTokenizer as AutoTokenizer
|
|
78
|
+
|
|
79
|
+
# Import dataset classes
|
|
80
|
+
from .src.dataset.omni_dataset import OmniDatasetForSequenceClassification
|
|
81
|
+
from .src.dataset.omni_dataset import OmniDatasetForSequenceRegression
|
|
82
|
+
from .src.dataset.omni_dataset import OmniDatasetForTokenClassification
|
|
83
|
+
from .src.dataset.omni_dataset import OmniDatasetForTokenRegression
|
|
84
|
+
|
|
85
|
+
# Import metric classes
|
|
86
|
+
from .src.metric import ClassificationMetric, RegressionMetric, RankingMetric
|
|
87
|
+
|
|
88
|
+
# Import utility functions
|
|
89
|
+
from .src.misc import utils as utils
|
|
90
|
+
from .src.misc.utils import clean_temp_dir_pt_files
|
|
91
|
+
|
|
92
|
+
# Import model classes
|
|
93
|
+
from .src.model import (
|
|
94
|
+
OmniModelForSequenceClassification,
|
|
95
|
+
OmniModelForMultiLabelSequenceClassification,
|
|
96
|
+
OmniModelForTokenClassification,
|
|
97
|
+
OmniModelForSequenceRegression,
|
|
98
|
+
OmniModelForTokenRegression,
|
|
99
|
+
OmniModelForStructuralImputation,
|
|
100
|
+
OmniModelForMatrixRegression,
|
|
101
|
+
OmniModelForMatrixClassification,
|
|
102
|
+
OmniModelForMLM,
|
|
103
|
+
OmniModelForSeq2Seq,
|
|
104
|
+
OmniModelForRNADesign,
|
|
105
|
+
OmniModelForEmbedding,
|
|
106
|
+
OmniModelForAugmentation,
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# Import LoRA model
|
|
110
|
+
from .src.lora.lora_model import OmniLoraModel
|
|
111
|
+
|
|
112
|
+
# Import tokenizer classes
|
|
113
|
+
from .src.tokenizer import OmniBPETokenizer
|
|
114
|
+
from .src.tokenizer import OmniKmersTokenizer
|
|
115
|
+
from .src.tokenizer import OmniSingleNucleotideTokenizer
|
|
116
|
+
|
|
117
|
+
# Import trainer classes
|
|
118
|
+
from .src.trainer.hf_trainer import HFTrainer
|
|
119
|
+
from .src.trainer.trainer import Trainer
|
|
120
|
+
from .src.trainer.accelerate_trainer import AccelerateTrainer
|
|
121
|
+
|
|
122
|
+
# Import hub utilities
|
|
123
|
+
from .utility.hub_utils import download_benchmark
|
|
124
|
+
from .utility.hub_utils import download_model
|
|
125
|
+
from .utility.hub_utils import download_pipeline
|
|
126
|
+
from .utility import hub_utils as hub_utils
|
|
127
|
+
|
|
128
|
+
# Import hub classes
|
|
129
|
+
from .utility.model_hub.model_hub import ModelHub
|
|
130
|
+
from .utility.dataset_hub.dataset_hub import load_benchmark_datasets
|
|
131
|
+
from .utility.pipeline_hub.pipeline import Pipeline
|
|
132
|
+
from .utility.pipeline_hub.pipeline_hub import PipelineHub
|
|
133
|
+
|
|
134
|
+
# Import module utilities
|
|
135
|
+
from .src.model.module_utils import OmniPooling
|
|
136
|
+
|
|
137
|
+
# --------------------------------------------------------------------------------
|
|
138
|
+
# For backward compatibility with version 0.2.7alpha and earlier
|
|
139
|
+
from .src.abc.abstract_tokenizer import OmniTokenizer as OmniGenomeTokenizer
|
|
140
|
+
from .src.abc.abstract_dataset import OmniDataset as OmniGenomeDataset
|
|
141
|
+
from .src.abc.abstract_metric import OmniMetric as OmniGenomeMetric
|
|
142
|
+
from .src.abc.abstract_model import OmniModel as OmniGenomeModel
|
|
143
|
+
from .src.dataset.omni_dataset import OmniDatasetForSequenceClassification as OmniGenomeDatasetForSequenceClassification
|
|
144
|
+
from .src.dataset.omni_dataset import OmniDatasetForSequenceRegression as OmniGenomeDatasetForSequenceRegression
|
|
145
|
+
from .src.dataset.omni_dataset import OmniDatasetForTokenClassification as OmniGenomeDatasetForTokenClassification
|
|
146
|
+
from .src.dataset.omni_dataset import OmniDatasetForTokenRegression as OmniGenomeDatasetForTokenRegression
|
|
147
|
+
from .src.lora.lora_model import OmniLoraModel as OmniGenomeLoraModel
|
|
148
|
+
from .src.model import (
|
|
149
|
+
OmniModelForSequenceClassification as OmniGenomeModelForSequenceClassification,
|
|
150
|
+
OmniModelForMultiLabelSequenceClassification as OmniGenomeModelForMultiLabelSequenceClassification,
|
|
151
|
+
OmniModelForTokenClassification as OmniGenomeModelForTokenClassification,
|
|
152
|
+
OmniModelForSequenceRegression as OmniGenomeModelForSequenceRegression,
|
|
153
|
+
OmniModelForTokenRegression as OmniGenomeModelForTokenRegression,
|
|
154
|
+
OmniModelForStructuralImputation as OmniGenomeModelForStructuralImputation,
|
|
155
|
+
OmniModelForMatrixRegression as OmniGenomeModelForMatrixRegression,
|
|
156
|
+
OmniModelForMatrixClassification as OmniGenomeModelForMatrixClassification,
|
|
157
|
+
OmniModelForMLM as OmniGenomeModelForMLM,
|
|
158
|
+
OmniModelForSeq2Seq as OmniGenomeModelForSeq2Seq,
|
|
159
|
+
OmniModelForRNADesign as OmniGenomeModelForRNADesign,
|
|
160
|
+
OmniModelForEmbedding as OmniGenomeModelForEmbedding,
|
|
161
|
+
OmniModelForAugmentation as OmniGenomeModelForAugmentation,
|
|
162
|
+
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
from .utility.ensemble import VoteEnsemblePredictor
|
|
166
|
+
# ------------------------------------------------------------------------------
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
__all__ = [
|
|
170
|
+
"load_benchmark_datasets",
|
|
171
|
+
"OmniDataset",
|
|
172
|
+
"OmniModel",
|
|
173
|
+
"OmniMetric",
|
|
174
|
+
"AutoTokenizer",
|
|
175
|
+
"OmniTokenizer",
|
|
176
|
+
"OmniKmersTokenizer",
|
|
177
|
+
"OmniSingleNucleotideTokenizer",
|
|
178
|
+
"OmniBPETokenizer",
|
|
179
|
+
"ModelHub",
|
|
180
|
+
"Pipeline",
|
|
181
|
+
"PipelineHub",
|
|
182
|
+
"BenchHub",
|
|
183
|
+
"AutoBench",
|
|
184
|
+
"AutoBenchConfig",
|
|
185
|
+
"utils",
|
|
186
|
+
"model",
|
|
187
|
+
"tokenizer",
|
|
188
|
+
"dataset",
|
|
189
|
+
"OmniModelForSequenceClassification",
|
|
190
|
+
"OmniModelForMultiLabelSequenceClassification",
|
|
191
|
+
"OmniModelForTokenClassification",
|
|
192
|
+
"OmniModelForSequenceRegression",
|
|
193
|
+
"OmniModelForTokenRegression",
|
|
194
|
+
"OmniModelForRNADesign",
|
|
195
|
+
"OmniModelForEmbedding",
|
|
196
|
+
"OmniModelForAugmentation",
|
|
197
|
+
"OmniModelForStructuralImputation",
|
|
198
|
+
"OmniModelForMatrixRegression",
|
|
199
|
+
"OmniModelForMatrixClassification",
|
|
200
|
+
"OmniModelForMLM",
|
|
201
|
+
"OmniModelForSeq2Seq",
|
|
202
|
+
"OmniDatasetForTokenClassification",
|
|
203
|
+
"OmniDatasetForTokenRegression",
|
|
204
|
+
"OmniDatasetForSequenceClassification",
|
|
205
|
+
"OmniDatasetForSequenceRegression",
|
|
206
|
+
"OmniLoraModel",
|
|
207
|
+
"ClassificationMetric",
|
|
208
|
+
"RegressionMetric",
|
|
209
|
+
"RankingMetric",
|
|
210
|
+
"Trainer",
|
|
211
|
+
"HFTrainer",
|
|
212
|
+
"AccelerateTrainer",
|
|
213
|
+
"AutoBenchConfig",
|
|
214
|
+
"AutoBench",
|
|
215
|
+
"download_benchmark",
|
|
216
|
+
"download_model",
|
|
217
|
+
"download_pipeline",
|
|
218
|
+
"VoteEnsemblePredictor"
|
|
219
|
+
]
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
LOGO1 = r"""
|
|
223
|
+
**@@ #========= @@** ___ _
|
|
224
|
+
**@@ +----- @@** / _ \ _ __ ___ _ __ (_)
|
|
225
|
+
**@@ = @@** | | | || '_ ` _ \ | '_ \ | |
|
|
226
|
+
**@@ | |_| || | | | | || | | || |
|
|
227
|
+
@@** = **@@ \___/ |_| |_| |_||_| |_||_|
|
|
228
|
+
@@** ------+ **@@
|
|
229
|
+
@@** =========# **@@ ____
|
|
230
|
+
@@ ---------------+ @@ / ___| ___ _ __ ___ _ __ ___ ___
|
|
231
|
+
@@ ================== @@ | | _ / _ \| '_ \ / _ \ | '_ ` _ \ / _ \
|
|
232
|
+
@@ +--------------- @@ | |_| || __/| | | || (_) || | | | | || __/
|
|
233
|
+
@@** #========= **@@ \____| \___||_| |_| \___/ |_| |_| |_| \___|
|
|
234
|
+
@@** +------ **@@
|
|
235
|
+
@@** = **@@
|
|
236
|
+
@@** ____ _
|
|
237
|
+
**@@ = @@** | __ ) ___ _ __ ___ | |__
|
|
238
|
+
**@@ -----+ @@** | _ \ / _ \| '_ \ / __|| '_ \
|
|
239
|
+
**@@ ==========# @@** | |_) || __/| | | || (__ | | | |
|
|
240
|
+
@@ --------------+ @@** |____/ \___||_| |_| \___||_| |_|
|
|
241
|
+
"""
|
|
242
|
+
|
|
243
|
+
LOGO2 = r"""
|
|
244
|
+
|
|
245
|
+
** +----------- ** ___ _
|
|
246
|
+
@@ @@ / _ \ _ __ ___ _ __ (_)
|
|
247
|
+
@@* #============== *@@ | | | || '_ ` _ \ | '_ \ | |
|
|
248
|
+
@@* *@@ | |_| || | | | | || | | || |
|
|
249
|
+
*@@ +------------ *@@ \___/ |_| |_| |_||_| |_||_|
|
|
250
|
+
*@* @@*
|
|
251
|
+
*@@ #========= @@*
|
|
252
|
+
*@@* *@@*
|
|
253
|
+
*@@ +---@@@* ____
|
|
254
|
+
*@@* ** / ___| ___ _ __ ___ _ __ ___ ___
|
|
255
|
+
**@** | | _ / _ \| '_ \ / _ \ | '_ ` _ \ / _ \
|
|
256
|
+
*@@* *@@* | |_| || __/| | | || (_) || | | | | || __/
|
|
257
|
+
*@@ ---+ @@* \____| \___||_| |_| \___/ |_| |_| |_| \___|
|
|
258
|
+
*@@* *@@*
|
|
259
|
+
*@@ =========# @@*
|
|
260
|
+
*@@ @@*
|
|
261
|
+
*@@ -------------+ @@* ____ _
|
|
262
|
+
@@ @@ | __ ) ___ _ __ ___ | |__
|
|
263
|
+
@@ ===============# @@ | _ \ / _ \| '_ \ / __|| '_ \
|
|
264
|
+
@@ @@ | |_) || __/| | | || (__ | | | |
|
|
265
|
+
** -----------+ ** |____/ \___||_| |_| \___||_| |_|
|
|
266
|
+
"""
|
|
267
|
+
|
|
268
|
+
art_dna_color_map = {
|
|
269
|
+
"*": "blue", # Bases represented by '*'
|
|
270
|
+
"@": "white", # Bases represented by '@'
|
|
271
|
+
"-": "yellow", # Hydrogen bonds, assuming '-' represents a bond
|
|
272
|
+
"=": "light_cyan", # Hydrogen bonds, assuming '=' represents a bond
|
|
273
|
+
"+": "yellow", # '+' symbols in yellow
|
|
274
|
+
" ": "black", # Space characters are mapped to black
|
|
275
|
+
}
|
|
276
|
+
import random
|
|
277
|
+
|
|
278
|
+
LOGO = random.choice([LOGO1, LOGO2])
|
|
279
|
+
print(LOGO)
|
|
280
|
+
|
|
281
|
+
clean_temp_dir_pt_files()
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# file: __init__.py
|
|
3
|
+
# time: 18:28 11/04/2024
|
|
4
|
+
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
|
|
5
|
+
# github: https://github.com/yangheng95
|
|
6
|
+
# huggingface: https://huggingface.co/yangheng
|
|
7
|
+
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
|
|
8
|
+
# Copyright (C) 2019-2024. All Rights Reserved.
|
|
9
|
+
"""
|
|
10
|
+
This package contains modules for automated benchmarking of models.
|
|
11
|
+
"""
|
|
12
|
+
|