omnigenome 0.3.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of omnigenome might be problematic. Click here for more details.
- omnigenome/__init__.py +281 -0
- omnigenome/auto/__init__.py +3 -0
- omnigenome/auto/auto_bench/__init__.py +12 -0
- omnigenome/auto/auto_bench/auto_bench.py +484 -0
- omnigenome/auto/auto_bench/auto_bench_cli.py +230 -0
- omnigenome/auto/auto_bench/auto_bench_config.py +216 -0
- omnigenome/auto/auto_bench/config_check.py +34 -0
- omnigenome/auto/auto_train/__init__.py +13 -0
- omnigenome/auto/auto_train/auto_train.py +430 -0
- omnigenome/auto/auto_train/auto_train_cli.py +222 -0
- omnigenome/auto/bench_hub/__init__.py +12 -0
- omnigenome/auto/bench_hub/bench_hub.py +25 -0
- omnigenome/cli/__init__.py +13 -0
- omnigenome/cli/commands/__init__.py +13 -0
- omnigenome/cli/commands/base.py +83 -0
- omnigenome/cli/commands/bench/__init__.py +13 -0
- omnigenome/cli/commands/bench/bench_cli.py +202 -0
- omnigenome/cli/commands/rna/__init__.py +13 -0
- omnigenome/cli/commands/rna/rna_design.py +178 -0
- omnigenome/cli/omnigenome_cli.py +128 -0
- omnigenome/src/__init__.py +12 -0
- omnigenome/src/abc/__init__.py +12 -0
- omnigenome/src/abc/abstract_dataset.py +622 -0
- omnigenome/src/abc/abstract_metric.py +114 -0
- omnigenome/src/abc/abstract_model.py +689 -0
- omnigenome/src/abc/abstract_tokenizer.py +267 -0
- omnigenome/src/dataset/__init__.py +16 -0
- omnigenome/src/dataset/omni_dataset.py +435 -0
- omnigenome/src/lora/__init__.py +13 -0
- omnigenome/src/lora/lora_model.py +294 -0
- omnigenome/src/metric/__init__.py +15 -0
- omnigenome/src/metric/classification_metric.py +184 -0
- omnigenome/src/metric/metric.py +199 -0
- omnigenome/src/metric/ranking_metric.py +142 -0
- omnigenome/src/metric/regression_metric.py +191 -0
- omnigenome/src/misc/__init__.py +3 -0
- omnigenome/src/misc/utils.py +439 -0
- omnigenome/src/model/__init__.py +19 -0
- omnigenome/src/model/augmentation/__init__.py +12 -0
- omnigenome/src/model/augmentation/model.py +219 -0
- omnigenome/src/model/classification/__init__.py +12 -0
- omnigenome/src/model/classification/model.py +642 -0
- omnigenome/src/model/embedding/__init__.py +12 -0
- omnigenome/src/model/embedding/model.py +263 -0
- omnigenome/src/model/mlm/__init__.py +12 -0
- omnigenome/src/model/mlm/model.py +177 -0
- omnigenome/src/model/module_utils.py +232 -0
- omnigenome/src/model/regression/__init__.py +12 -0
- omnigenome/src/model/regression/model.py +786 -0
- omnigenome/src/model/regression/resnet.py +483 -0
- omnigenome/src/model/rna_design/__init__.py +12 -0
- omnigenome/src/model/rna_design/model.py +426 -0
- omnigenome/src/model/seq2seq/__init__.py +12 -0
- omnigenome/src/model/seq2seq/model.py +44 -0
- omnigenome/src/tokenizer/__init__.py +16 -0
- omnigenome/src/tokenizer/bpe_tokenizer.py +226 -0
- omnigenome/src/tokenizer/kmers_tokenizer.py +247 -0
- omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +249 -0
- omnigenome/src/trainer/__init__.py +14 -0
- omnigenome/src/trainer/accelerate_trainer.py +739 -0
- omnigenome/src/trainer/hf_trainer.py +75 -0
- omnigenome/src/trainer/trainer.py +579 -0
- omnigenome/utility/__init__.py +3 -0
- omnigenome/utility/dataset_hub/__init__.py +13 -0
- omnigenome/utility/dataset_hub/dataset_hub.py +178 -0
- omnigenome/utility/ensemble.py +324 -0
- omnigenome/utility/hub_utils.py +517 -0
- omnigenome/utility/model_hub/__init__.py +12 -0
- omnigenome/utility/model_hub/model_hub.py +231 -0
- omnigenome/utility/pipeline_hub/__init__.py +12 -0
- omnigenome/utility/pipeline_hub/pipeline.py +483 -0
- omnigenome/utility/pipeline_hub/pipeline_hub.py +129 -0
- omnigenome-0.3.0a0.dist-info/METADATA +224 -0
- omnigenome-0.3.0a0.dist-info/RECORD +85 -0
- omnigenome-0.3.0a0.dist-info/WHEEL +5 -0
- omnigenome-0.3.0a0.dist-info/entry_points.txt +3 -0
- omnigenome-0.3.0a0.dist-info/licenses/LICENSE +201 -0
- omnigenome-0.3.0a0.dist-info/top_level.txt +2 -0
- tests/__init__.py +9 -0
- tests/conftest.py +160 -0
- tests/test_dataset_patterns.py +291 -0
- tests/test_examples_syntax.py +83 -0
- tests/test_model_loading.py +183 -0
- tests/test_rna_functions.py +255 -0
- tests/test_training_patterns.py +302 -0
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# file: pipeline.py
|
|
3
|
+
# time: 18:38 12/04/2024
|
|
4
|
+
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
|
|
5
|
+
# github: https://github.com/yangheng95
|
|
6
|
+
# huggingface: https://huggingface.co/yangheng
|
|
7
|
+
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
|
|
8
|
+
# Copyright (C) 2019-2024. All Rights Reserved.
|
|
9
|
+
"""
|
|
10
|
+
Pipeline Module
|
|
11
|
+
|
|
12
|
+
This module provides the Pipeline class for creating and managing complete
|
|
13
|
+
machine learning workflows that combine models, tokenizers, datasets, and
|
|
14
|
+
trainers. Pipelines provide a unified interface for training, inference,
|
|
15
|
+
and model management.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
|
|
21
|
+
import autocuda
|
|
22
|
+
from transformers import AutoConfig, AutoTokenizer
|
|
23
|
+
|
|
24
|
+
from ..hub_utils import download_pipeline
|
|
25
|
+
from ..model_hub.model_hub import ModelHub
|
|
26
|
+
from ...src.abc.abstract_model import OmniModel
|
|
27
|
+
from ...src.misc.utils import env_meta_info, fprint
|
|
28
|
+
from ...src.trainer.trainer import Trainer
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Pipeline:
|
|
32
|
+
"""
|
|
33
|
+
Complete machine learning pipeline combining model, tokenizer, datasets, and trainer.
|
|
34
|
+
|
|
35
|
+
The Pipeline class provides a unified interface for managing complete machine
|
|
36
|
+
learning workflows. It handles model initialization, training, inference, and
|
|
37
|
+
persistence. Pipelines can be loaded from pre-built configurations or created
|
|
38
|
+
from scratch with custom components.
|
|
39
|
+
|
|
40
|
+
Attributes:
|
|
41
|
+
model (OmniModel): The underlying model for the pipeline.
|
|
42
|
+
tokenizer: Tokenizer for preprocessing input sequences.
|
|
43
|
+
dataset (dict): Dictionary containing train/validation/test datasets.
|
|
44
|
+
metadata (dict): Environment and pipeline metadata.
|
|
45
|
+
trainer (Trainer): Trainer instance for model training.
|
|
46
|
+
device (str): Target device for model execution (CPU/GPU).
|
|
47
|
+
name (str): Name identifier for the pipeline.
|
|
48
|
+
|
|
49
|
+
Example:
|
|
50
|
+
>>> from omnigenome import Pipeline, OmniModelForSequenceClassification
|
|
51
|
+
>>> # Create pipeline from model
|
|
52
|
+
>>> model = OmniModelForSequenceClassification("model_path", tokenizer)
|
|
53
|
+
>>> pipeline = Pipeline("my_pipeline", model_name_or_path=model)
|
|
54
|
+
>>> # Use for inference
|
|
55
|
+
>>> predictions = pipeline("ATCGATCG")
|
|
56
|
+
>>> # Train the model
|
|
57
|
+
>>> pipeline.train(datasets)
|
|
58
|
+
>>> # Save pipeline
|
|
59
|
+
>>> pipeline.save("./saved_pipeline")
|
|
60
|
+
|
|
61
|
+
Note:
|
|
62
|
+
- Pipelines automatically handle device placement and model optimization
|
|
63
|
+
- Environment metadata is collected for reproducibility
|
|
64
|
+
- Pipelines can be saved and loaded for easy deployment
|
|
65
|
+
- Supports both local models and hub-based model loading
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
model: OmniModel = None
|
|
69
|
+
tokenizer = None
|
|
70
|
+
dataset: dict = None
|
|
71
|
+
metadata: dict = None
|
|
72
|
+
|
|
73
|
+
def __init__(
    self,
    name,
    *,
    model_name_or_path,
    tokenizer=None,
    datasets=None,
    trainer=None,
    **kwargs,
):
    """
    Build a pipeline around a model instance or a model identifier.

    Args:
        name (str): Name identifier stored on the pipeline.
        model_name_or_path: Either a string, which is handed to
            ``init_pipeline`` for loading, or an already constructed
            model object, which is used as-is.
        tokenizer (optional): Tokenizer forwarded to ``init_pipeline``
            when loading from a string. When a model object is given,
            the model's own ``tokenizer`` attribute replaces this value.
        datasets (optional): Stored unchanged as ``self.datasets`` (note
            the class-level annotation is spelled ``dataset``).
        trainer (optional): Stored unchanged as ``self.trainer``.
        **kwargs: ``device`` selects the target device; when it is
            missing or None, ``autocuda.auto_cuda()`` picks one. In the
            string case every keyword argument is also forwarded to
            ``init_pipeline``.

    Note:
        - With a model object, ``self.metadata`` is rebound to the
          model's ``metadata`` attribute, replacing the
          ``env_meta_info()`` result collected at the start.
        - In both cases the model is moved to ``self.device`` last.
    """
    self.metadata = env_meta_info()
    self.name = name
    self.tokenizer = tokenizer
    self.datasets = datasets
    self.trainer = trainer

    # An explicit, non-None ``device`` wins; otherwise auto-select one.
    requested_device = kwargs.get("device")
    if requested_device is None:
        self.device = autocuda.auto_cuda()
    else:
        self.device = requested_device

    if isinstance(model_name_or_path, str):
        self.init_pipeline(
            model_name_or_path=model_name_or_path, tokenizer=tokenizer, **kwargs
        )
    else:
        # A ready-made model brings its own tokenizer and metadata.
        self.model = model_name_or_path
        self.tokenizer = self.model.tokenizer
        self.metadata = self.model.metadata

    self.model.to(self.device)
|
|
139
|
+
|
|
140
|
+
def __call__(self, inputs, *args, **kwargs):
    """
    Run inference by delegating to ``self.model.inference``.

    Args:
        inputs: Input data handed to the model's ``inference`` method.
        *args: Extra positional arguments forwarded to ``inference``.
        **kwargs: Extra keyword arguments forwarded to ``inference``.

    Returns:
        Whatever ``self.model.inference`` returns.

    Example:
        >>> pipeline = Pipeline("my_pipeline", model_name_or_path=model)
        >>> results = pipeline("ATCGATCG")
    """
    # Positional extras used to be accepted but silently dropped; forward
    # them so the call matches the documented contract.
    return self.model.inference(inputs, *args, **kwargs)
|
|
161
|
+
|
|
162
|
+
def to(self, device):
    """
    Relocate the wrapped model and remember the new device.

    Args:
        device: Device specifier handed to ``self.model.to``
            (for example ``"cpu"`` or ``"cuda:0"``).

    Returns:
        Pipeline: This same pipeline object, so calls can be chained.

    Example:
        >>> pipeline.to("cuda:0").to("cpu")
    """
    target = device
    # Move the model first; the recorded device is only updated afterwards.
    self.model.to(target)
    self.device = target
    return self
|
|
180
|
+
|
|
181
|
+
def init_pipeline(self, *, model_name_or_path, tokenizer=None, **kwargs):
    """
    Load the model (and its tokenizer) for this pipeline from an identifier.

    ``ModelHub.load`` is tried first. If it raises, the error is printed
    and loading falls back to ``AutoConfig.from_pretrained``,
    ``AutoTokenizer.from_pretrained`` (only when no tokenizer was given)
    and ``OmniModel.from_pretrained``.

    Args:
        model_name_or_path (str): Path or identifier of the model to load.
        tokenizer (optional): Tokenizer used by the fallback path. When
            None, one is loaded from ``model_name_or_path``.
        **kwargs: Forwarded unchanged to ``ModelHub.load``. In the
            fallback path they are forwarded to
            ``OmniModel.from_pretrained`` as well, except for
            ``trust_remote_code``, which is read here (default True) and
            passed explicitly.

    Returns:
        Pipeline: Self for method chaining.

    Example:
        >>> pipeline.init_pipeline(model_name_or_path="yangheng/OmniGenome-186M")

    Note:
        On success ``self.model`` and ``self.tokenizer`` are set and
        ``self.metadata`` is updated with the model's metadata.
    """
    trust_remote_code = kwargs.get("trust_remote_code", True)
    try:  # for the models saved by OmniGenome and served by the model hub
        self.model = ModelHub.load(model_name_or_path, **kwargs)
        self.tokenizer = self.model.tokenizer
        self.metadata.update(self.model.metadata)
    except Exception as e:
        # Deliberate best-effort: any hub failure triggers the fallback.
        fprint(f"Fail to load the model from the model hub, the error is: {e}")

        # ``trust_remote_code`` is passed explicitly below. Leaving it in
        # the forwarded kwargs would pass it twice and raise
        # "TypeError: got multiple values for keyword argument".
        fallback_kwargs = {
            key: value
            for key, value in kwargs.items()
            if key != "trust_remote_code"
        }

        config = AutoConfig.from_pretrained(
            model_name_or_path, trust_remote_code=trust_remote_code
        )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name_or_path, trust_remote_code=trust_remote_code
            )
        self.model = OmniModel.from_pretrained(
            model_name_or_path,
            config=config,
            tokenizer=tokenizer,
            trust_remote_code=trust_remote_code,
            **fallback_kwargs,
        )
        self.tokenizer = self.model.tokenizer
        self.metadata.update(self.model.metadata)
    fprint(f"The pipeline has been initialized from {model_name_or_path}.")
    return self
|
|
240
|
+
|
|
241
|
+
def train(self, datasets: dict = None, trainer=None, **kwargs):
    """
    Run training through the pipeline's trainer.

    Args:
        datasets (dict, optional): Accepted for interface compatibility;
            this method does not currently read it. The trainer works
            with whatever data it was constructed with.
        trainer (Trainer, optional): When given, replaces
            ``self.trainer`` before training starts.
        **kwargs: Accepted but not currently used.

    Raises:
        ValueError: If no trainer was passed here and the pipeline has
            none of its own.
        AssertionError: If ``trainer`` is given but is not a ``Trainer``.

    Example:
        >>> custom_trainer = Trainer(model, train_dataset=train_data)
        >>> pipeline.train(trainer=custom_trainer)
    """
    if trainer is not None:
        assert isinstance(trainer, Trainer)
        self.trainer = trainer

    # Fail with the documented error instead of an AttributeError on None.
    if self.trainer is None:
        raise ValueError(
            "No trainer is available: pass `trainer=` to train() or "
            "construct the Pipeline with a trainer."
        )

    self.trainer.train()
|
|
283
|
+
|
|
284
|
+
def predict(self, inputs, **kwargs):
    """
    Produce predictions by delegating to ``self.model.predict``.

    Args:
        inputs: Input data handed to the model's ``predict`` method.
        **kwargs: Extra keyword arguments forwarded to ``predict``.

    Returns:
        Whatever ``self.model.predict`` returns.

    Example:
        >>> result = pipeline.predict("ATCGATCG")
        >>> results = pipeline.predict(["ATCGATCG", "GCTAGCTA"])
    """
    model = self.model
    outputs = model.predict(inputs, **kwargs)
    return outputs
|
|
320
|
+
|
|
321
|
+
def inference(self, inputs, **kwargs):
    """
    Run inference by delegating to ``self.model.inference``.

    Args:
        inputs: Input data handed to the model's ``inference`` method.
        **kwargs: Extra keyword arguments forwarded to ``inference``.

    Returns:
        Whatever ``self.model.inference`` returns.

    Example:
        >>> results = pipeline.inference("ATCGATCG")
    """
    model = self.model
    outputs = model.inference(inputs, **kwargs)
    return outputs
|
|
361
|
+
|
|
362
|
+
@staticmethod
def load(pipeline_name_or_path, local_only=False, **kwargs):
    """
    Load a pipeline from a local directory or from the hub.

    If ``pipeline_name_or_path`` exists on disk it is used directly;
    otherwise ``download_pipeline`` resolves it to a local path. The
    directory must contain ``datasets.pkl`` and ``trainer.pkl`` (as
    written by ``save``); the model is loaded with ``ModelHub.load`` and
    its ``tokenizer`` attribute becomes the pipeline tokenizer.

    Args:
        pipeline_name_or_path (str): Local directory or hub identifier.
        local_only (bool, optional): Forwarded to ``download_pipeline``
            and ``ModelHub.load``. Defaults to False.
        **kwargs: ``name`` sets the pipeline name (defaults to
            ``pipeline_name_or_path``) and is consumed here. Everything
            else is forwarded to ``download_pipeline``, ``ModelHub.load``
            and the ``Pipeline`` constructor.

    Returns:
        Pipeline: The reconstructed pipeline.

    Example:
        >>> pipeline = Pipeline.load("./saved_pipeline")
        >>> pipeline = Pipeline.load("./saved_pipeline", name="demo")
    """
    import dill

    # ``name`` must be removed from kwargs: it is passed explicitly to the
    # constructor below, and leaving it in ``**kwargs`` would raise
    # "TypeError: got multiple values for keyword argument 'name'".
    name = kwargs.pop("name", None)
    if name is None:
        name = pipeline_name_or_path

    if os.path.exists(pipeline_name_or_path):
        path = pipeline_name_or_path
    else:
        path = download_pipeline(
            pipeline_name_or_path, local_only=local_only, **kwargs
        )

    # NOTE(security): unpickling with dill can execute arbitrary code.
    # Only load pipelines obtained from sources you trust.
    with open(f"{path}/datasets.pkl", "rb") as f:
        datasets = dill.load(f)
    with open(f"{path}/trainer.pkl", "rb") as f:
        trainer = dill.load(f)

    model = ModelHub.load(path, local_only=local_only, **kwargs)
    tokenizer = model.tokenizer
    pipeline = Pipeline(
        name=name,
        model_name_or_path=model,
        tokenizer=tokenizer,
        datasets=datasets,
        trainer=trainer,
        **kwargs,
    )
    return pipeline
|
|
429
|
+
|
|
430
|
+
def save(self, path, overwrite=False, **kwargs):
    """
    Save the pipeline to a directory.

    Writes ``datasets.pkl``, ``metadata.json``, ``tokenizer.pkl`` and
    ``trainer.pkl`` into ``path`` and then calls ``self.model.save``.
    The inner model (``self.model.model``) is moved to CPU while saving
    and moved back to its previous device afterwards, even if saving
    fails part-way.

    Args:
        path (str): Directory to write into; created if missing.
        overwrite (bool, optional): Allow writing into an existing path.
            Also forwarded to ``self.model.save``. Defaults to False.
        **kwargs: Extra keyword arguments forwarded to ``self.model.save``.

    Raises:
        FileExistsError: If ``path`` exists and ``overwrite`` is False.

    Example:
        >>> pipeline.save("./trained_pipeline", overwrite=True)
        >>> loaded_pipeline = Pipeline.load("./trained_pipeline")
    """
    # Check the cheap precondition before importing the serializer.
    if os.path.exists(path) and not overwrite:
        raise FileExistsError(
            f"The path {path} already exists, please set overwrite=True to overwrite it."
        )

    import dill

    os.makedirs(path, exist_ok=True)

    device = self.model.model.device
    self.model.model.to("cpu")
    try:
        with open(f"{path}/datasets.pkl", "wb") as f:
            dill.dump(self.datasets, f)
        with open(f"{path}/metadata.json", "w") as f:
            json.dump(self.metadata, f)
        with open(f"{path}/tokenizer.pkl", "wb") as f:
            dill.dump(self.tokenizer, f)
        with open(f"{path}/trainer.pkl", "wb") as f:
            dill.dump(self.trainer, f)
        self.model.save(path, overwrite=overwrite, **kwargs)
    finally:
        # Always restore the original device, even when a write fails,
        # so a failed save does not leave the model stranded on CPU.
        self.model.model.to(device)
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# file: pipeline_hub.py
|
|
3
|
+
# time: 22:26 08/04/2024
|
|
4
|
+
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
|
|
5
|
+
# github: https://github.com/yangheng95
|
|
6
|
+
# huggingface: https://huggingface.co/yangheng
|
|
7
|
+
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
|
|
8
|
+
# Copyright (C) 2019-2024. All Rights Reserved.
|
|
9
|
+
"""
|
|
10
|
+
Pipeline Hub Module
|
|
11
|
+
|
|
12
|
+
This module provides the PipelineHub class for managing and loading pre-built
|
|
13
|
+
pipelines from the OmniGenome hub. Pipelines combine models, tokenizers,
|
|
14
|
+
datasets, and trainers into ready-to-use workflows.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .pipeline import Pipeline
|
|
18
|
+
from ...src.misc.utils import env_meta_info
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PipelineHub:
|
|
22
|
+
"""
|
|
23
|
+
Hub for managing and loading pre-built OmniGenome pipelines.
|
|
24
|
+
|
|
25
|
+
The PipelineHub provides a centralized interface for accessing pre-built
|
|
26
|
+
pipelines that combine models, tokenizers, datasets, and training
|
|
27
|
+
configurations. It handles automatic downloading and loading of pipelines
|
|
28
|
+
from the OmniGenome hub.
|
|
29
|
+
|
|
30
|
+
Attributes:
|
|
31
|
+
metadata (dict): Environment metadata including system information,
|
|
32
|
+
package versions, and hardware details.
|
|
33
|
+
|
|
34
|
+
Example:
|
|
35
|
+
>>> from omnigenome import PipelineHub
|
|
36
|
+
>>> hub = PipelineHub()
|
|
37
|
+
>>> pipeline = hub.load("yangheng/OmniGenome-RNA-Classification")
|
|
38
|
+
>>> predictions = pipeline("ATCGATCG")
|
|
39
|
+
>>> print(predictions['predictions'])
|
|
40
|
+
|
|
41
|
+
Note:
|
|
42
|
+
- Pipelines can be loaded from local paths or downloaded from the hub
|
|
43
|
+
- The hub automatically handles model, tokenizer, and dataset loading
|
|
44
|
+
- Environment metadata is collected for reproducibility
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
def __init__(self, *args, **kwargs):
    """
    Initialize the PipelineHub.

    Args:
        *args: Accepted for compatibility; currently unused.
        **kwargs: Accepted for compatibility; currently unused.

    Note:
        Stores the result of ``env_meta_info()`` as ``self.metadata``.
    """
    # The only base class is ``object``, whose ``__init__`` rejects extra
    # arguments, so the unused args must not be forwarded.
    super(PipelineHub, self).__init__()
    self.metadata = env_meta_info()
|
|
61
|
+
|
|
62
|
+
@staticmethod
def load(pipeline_name_or_path, local_only=False, **kwargs):
    """
    Load a pipeline by delegating to ``Pipeline.load``.

    Args:
        pipeline_name_or_path (str): Local directory path or hub
            identifier of the pipeline.
        local_only (bool, optional): Forwarded to ``Pipeline.load``.
            Defaults to False.
        **kwargs: Extra keyword arguments forwarded to ``Pipeline.load``.

    Returns:
        Pipeline: The object returned by ``Pipeline.load``.

    Example:
        >>> pipeline = PipelineHub.load("./my_pipeline", local_only=True)
        >>> results = pipeline("ATCGATCG")
    """
    loaded = Pipeline.load(
        pipeline_name_or_path,
        local_only=local_only,
        **kwargs,
    )
    return loaded
|
|
107
|
+
|
|
108
|
+
def push(self, pipeline, **kwargs):
    """
    Push a pipeline to the hub (not yet implemented).

    Args:
        pipeline (Pipeline): Pipeline instance intended for upload.
        **kwargs: Reserved for upload options; currently unused.

    Raises:
        NotImplementedError: Always; uploading is not implemented.
    """
    raise NotImplementedError("This method has not been implemented yet.")
|