omnigenome 0.3.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of omnigenome might be problematic.

Files changed (85):
  1. omnigenome/__init__.py +281 -0
  2. omnigenome/auto/__init__.py +3 -0
  3. omnigenome/auto/auto_bench/__init__.py +12 -0
  4. omnigenome/auto/auto_bench/auto_bench.py +484 -0
  5. omnigenome/auto/auto_bench/auto_bench_cli.py +230 -0
  6. omnigenome/auto/auto_bench/auto_bench_config.py +216 -0
  7. omnigenome/auto/auto_bench/config_check.py +34 -0
  8. omnigenome/auto/auto_train/__init__.py +13 -0
  9. omnigenome/auto/auto_train/auto_train.py +430 -0
  10. omnigenome/auto/auto_train/auto_train_cli.py +222 -0
  11. omnigenome/auto/bench_hub/__init__.py +12 -0
  12. omnigenome/auto/bench_hub/bench_hub.py +25 -0
  13. omnigenome/cli/__init__.py +13 -0
  14. omnigenome/cli/commands/__init__.py +13 -0
  15. omnigenome/cli/commands/base.py +83 -0
  16. omnigenome/cli/commands/bench/__init__.py +13 -0
  17. omnigenome/cli/commands/bench/bench_cli.py +202 -0
  18. omnigenome/cli/commands/rna/__init__.py +13 -0
  19. omnigenome/cli/commands/rna/rna_design.py +178 -0
  20. omnigenome/cli/omnigenome_cli.py +128 -0
  21. omnigenome/src/__init__.py +12 -0
  22. omnigenome/src/abc/__init__.py +12 -0
  23. omnigenome/src/abc/abstract_dataset.py +622 -0
  24. omnigenome/src/abc/abstract_metric.py +114 -0
  25. omnigenome/src/abc/abstract_model.py +689 -0
  26. omnigenome/src/abc/abstract_tokenizer.py +267 -0
  27. omnigenome/src/dataset/__init__.py +16 -0
  28. omnigenome/src/dataset/omni_dataset.py +435 -0
  29. omnigenome/src/lora/__init__.py +13 -0
  30. omnigenome/src/lora/lora_model.py +294 -0
  31. omnigenome/src/metric/__init__.py +15 -0
  32. omnigenome/src/metric/classification_metric.py +184 -0
  33. omnigenome/src/metric/metric.py +199 -0
  34. omnigenome/src/metric/ranking_metric.py +142 -0
  35. omnigenome/src/metric/regression_metric.py +191 -0
  36. omnigenome/src/misc/__init__.py +3 -0
  37. omnigenome/src/misc/utils.py +439 -0
  38. omnigenome/src/model/__init__.py +19 -0
  39. omnigenome/src/model/augmentation/__init__.py +12 -0
  40. omnigenome/src/model/augmentation/model.py +219 -0
  41. omnigenome/src/model/classification/__init__.py +12 -0
  42. omnigenome/src/model/classification/model.py +642 -0
  43. omnigenome/src/model/embedding/__init__.py +12 -0
  44. omnigenome/src/model/embedding/model.py +263 -0
  45. omnigenome/src/model/mlm/__init__.py +12 -0
  46. omnigenome/src/model/mlm/model.py +177 -0
  47. omnigenome/src/model/module_utils.py +232 -0
  48. omnigenome/src/model/regression/__init__.py +12 -0
  49. omnigenome/src/model/regression/model.py +786 -0
  50. omnigenome/src/model/regression/resnet.py +483 -0
  51. omnigenome/src/model/rna_design/__init__.py +12 -0
  52. omnigenome/src/model/rna_design/model.py +426 -0
  53. omnigenome/src/model/seq2seq/__init__.py +12 -0
  54. omnigenome/src/model/seq2seq/model.py +44 -0
  55. omnigenome/src/tokenizer/__init__.py +16 -0
  56. omnigenome/src/tokenizer/bpe_tokenizer.py +226 -0
  57. omnigenome/src/tokenizer/kmers_tokenizer.py +247 -0
  58. omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +249 -0
  59. omnigenome/src/trainer/__init__.py +14 -0
  60. omnigenome/src/trainer/accelerate_trainer.py +739 -0
  61. omnigenome/src/trainer/hf_trainer.py +75 -0
  62. omnigenome/src/trainer/trainer.py +579 -0
  63. omnigenome/utility/__init__.py +3 -0
  64. omnigenome/utility/dataset_hub/__init__.py +13 -0
  65. omnigenome/utility/dataset_hub/dataset_hub.py +178 -0
  66. omnigenome/utility/ensemble.py +324 -0
  67. omnigenome/utility/hub_utils.py +517 -0
  68. omnigenome/utility/model_hub/__init__.py +12 -0
  69. omnigenome/utility/model_hub/model_hub.py +231 -0
  70. omnigenome/utility/pipeline_hub/__init__.py +12 -0
  71. omnigenome/utility/pipeline_hub/pipeline.py +483 -0
  72. omnigenome/utility/pipeline_hub/pipeline_hub.py +129 -0
  73. omnigenome-0.3.0a0.dist-info/METADATA +224 -0
  74. omnigenome-0.3.0a0.dist-info/RECORD +85 -0
  75. omnigenome-0.3.0a0.dist-info/WHEEL +5 -0
  76. omnigenome-0.3.0a0.dist-info/entry_points.txt +3 -0
  77. omnigenome-0.3.0a0.dist-info/licenses/LICENSE +201 -0
  78. omnigenome-0.3.0a0.dist-info/top_level.txt +2 -0
  79. tests/__init__.py +9 -0
  80. tests/conftest.py +160 -0
  81. tests/test_dataset_patterns.py +291 -0
  82. tests/test_examples_syntax.py +83 -0
  83. tests/test_model_loading.py +183 -0
  84. tests/test_rna_functions.py +255 -0
  85. tests/test_training_patterns.py +302 -0
omnigenome/utility/pipeline_hub/pipeline.py
@@ -0,0 +1,483 @@
+ # -*- coding: utf-8 -*-
+ # file: pipeline.py
+ # time: 18:38 12/04/2024
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+ # github: https://github.com/yangheng95
+ # huggingface: https://huggingface.co/yangheng
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+ # Copyright (C) 2019-2024. All Rights Reserved.
+ """
+ Pipeline Module
+
+ This module provides the Pipeline class for creating and managing complete
+ machine learning workflows that combine models, tokenizers, datasets, and
+ trainers. Pipelines provide a unified interface for training, inference,
+ and model management.
+ """
+
+ import json
+ import os
+
+ import autocuda
+ from transformers import AutoConfig, AutoTokenizer
+
+ from ..hub_utils import download_pipeline
+ from ..model_hub.model_hub import ModelHub
+ from ...src.abc.abstract_model import OmniModel
+ from ...src.misc.utils import env_meta_info, fprint
+ from ...src.trainer.trainer import Trainer
+
+
+ class Pipeline:
+     """
+     Complete machine learning pipeline combining model, tokenizer, datasets, and trainer.
+
+     The Pipeline class provides a unified interface for managing complete machine
+     learning workflows. It handles model initialization, training, inference, and
+     persistence. Pipelines can be loaded from pre-built configurations or created
+     from scratch with custom components.
+
+     Attributes:
+         model (OmniModel): The underlying model for the pipeline.
+         tokenizer: Tokenizer for preprocessing input sequences.
+         dataset (dict): Dictionary containing train/validation/test datasets.
+         metadata (dict): Environment and pipeline metadata.
+         trainer (Trainer): Trainer instance for model training.
+         device (str): Target device for model execution (CPU/GPU).
+         name (str): Name identifier for the pipeline.
+
+     Example:
+         >>> from omnigenome import Pipeline, OmniModelForSequenceClassification
+         >>> # Create pipeline from model
+         >>> model = OmniModelForSequenceClassification("model_path", tokenizer)
+         >>> pipeline = Pipeline("my_pipeline", model_name_or_path=model)
+         >>> # Use for inference
+         >>> predictions = pipeline("ATCGATCG")
+         >>> # Train the model
+         >>> pipeline.train(datasets)
+         >>> # Save pipeline
+         >>> pipeline.save("./saved_pipeline")
+
+     Note:
+         - Pipelines automatically handle device placement and model optimization
+         - Environment metadata is collected for reproducibility
+         - Pipelines can be saved and loaded for easy deployment
+         - Supports both local models and hub-based model loading
+     """
+
+     model: OmniModel = None
+     tokenizer = None
+     dataset: dict = None
+     metadata: dict = None
+
+     def __init__(
+         self,
+         name,
+         *,
+         model_name_or_path,
+         tokenizer=None,
+         datasets=None,
+         trainer=None,
+         **kwargs,
+     ):
+         """
+         Initialize a Pipeline instance.
+
+         Args:
+             name (str): Name identifier for the pipeline.
+             model_name_or_path (Union[str, OmniModel]): Model to use in the pipeline.
+                 Can be a string path/identifier or an OmniModel instance.
+             tokenizer (optional): Tokenizer for preprocessing. If None, will be
+                 loaded from the model or model path. Defaults to None.
+             datasets (dict, optional): Dictionary containing train/validation/test
+                 datasets. Keys should be 'train', 'valid', 'test'. Defaults to None.
+             trainer (Trainer, optional): Trainer instance for model training.
+                 If None, a default trainer will be created. Defaults to None.
+             **kwargs: Additional keyword arguments including:
+                 - device (str): Target device for model execution
+                 - trust_remote_code (bool): Whether to trust remote code in tokenizers
+                 - Other model-specific configuration parameters
+
+         Raises:
+             ValueError: If model initialization fails.
+             ImportError: If required dependencies are not available.
+             FileNotFoundError: If the model path is invalid.
+
+         Example:
+             >>> # Create from model path
+             >>> pipeline = Pipeline("rna_classification",
+             ...     model_name_or_path="yangheng/OmniGenome-186M")
+             >>> # Create from model instance
+             >>> model = OmniModelForSequenceClassification("model_path", tokenizer)
+             >>> pipeline = Pipeline("custom_pipeline", model_name_or_path=model)
+
+         Note:
+             - The pipeline automatically handles model loading and device placement
+             - Environment metadata is collected for tracking system information
+             - If a model instance is provided, its tokenizer and metadata are used
+         """
+         self.metadata = env_meta_info()
+         self.name = name
+         self.tokenizer = tokenizer
+         self.datasets = datasets
+         self.trainer = trainer
+         self.device = (
+             autocuda.auto_cuda()
+             if kwargs.get("device") is None
+             else kwargs.get("device")
+         )
+         if not isinstance(model_name_or_path, str):
+             self.model = model_name_or_path
+             self.tokenizer = self.model.tokenizer
+             self.metadata = self.model.metadata
+         else:
+             self.init_pipeline(
+                 model_name_or_path=model_name_or_path, tokenizer=tokenizer, **kwargs
+             )
+
+         self.model.to(self.device)
+
+     def __call__(self, inputs, *args, **kwargs):
+         """
+         Call the pipeline for inference.
+
+         This method provides a convenient interface for running inference
+         through the pipeline. It delegates to the model's inference method.
+
+         Args:
+             inputs: Input data for inference (can be string, list, or tensor).
+             *args: Additional positional arguments passed to model inference.
+             **kwargs: Additional keyword arguments passed to model inference.
+
+         Returns:
+             dict: Inference results including predictions and confidence scores.
+
+         Example:
+             >>> pipeline = Pipeline("my_pipeline", model_name_or_path=model)
+             >>> results = pipeline("ATCGATCG")
+             >>> print(results['predictions'])
+         """
+         return self.model.inference(inputs, **kwargs)
+
+     def to(self, device):
+         """
+         Move the pipeline to a specific device.
+
+         Args:
+             device (str): Target device ('cpu', 'cuda', 'cuda:0', etc.).
+
+         Returns:
+             Pipeline: Self for method chaining.
+
+         Example:
+             >>> pipeline = Pipeline("my_pipeline", model_name_or_path=model)
+             >>> pipeline.to("cuda:0")  # Move to GPU
+             >>> pipeline.to("cpu")  # Move to CPU
+         """
+         self.model.to(device)
+         self.device = device
+         return self
+
+     def init_pipeline(self, *, model_name_or_path, tokenizer=None, **kwargs):
+         """
+         Initialize the pipeline components from a model path.
+
+         This method handles loading the model, tokenizer, and configuration
+         from a model path or identifier. It tries to load from the ModelHub
+         first, then falls back to HuggingFace transformers.
+
+         Args:
+             model_name_or_path (str): Path or identifier of the model to load.
+             tokenizer (optional): Tokenizer instance. If None, will be loaded
+                 from the model path. Defaults to None.
+             **kwargs: Additional keyword arguments for model loading including:
+                 - trust_remote_code (bool): Whether to trust remote code
+                 - device (str): Target device for the model
+                 - Other model-specific parameters
+
+         Returns:
+             Pipeline: Self for method chaining.
+
+         Raises:
+             ValueError: If model loading fails.
+             ImportError: If required dependencies are not available.
+
+         Example:
+             >>> pipeline = Pipeline("my_pipeline")
+             >>> pipeline.init_pipeline(model_name_or_path="yangheng/OmniGenome-186M")
+
+         Note:
+             - First attempts to load from the OmniGenome ModelHub
+             - Falls back to HuggingFace transformers if ModelHub fails
+             - Automatically handles tokenizer loading and configuration
+         """
+         trust_remote_code = kwargs.get("trust_remote_code", True)
+         try:  # for the models saved by OmniGenome and served by the model hub
+             self.model = ModelHub.load(model_name_or_path, **kwargs)
+             self.tokenizer = self.model.tokenizer
+             self.metadata.update(self.model.metadata)
+         except Exception as e:
+             fprint(f"Fail to load the model from the model hub, the error is: {e}")
+
+             config = AutoConfig.from_pretrained(
+                 model_name_or_path, trust_remote_code=trust_remote_code
+             )
+             if tokenizer is None:
+                 tokenizer = AutoTokenizer.from_pretrained(
+                     model_name_or_path, trust_remote_code=trust_remote_code
+                 )
+             self.model = OmniModel.from_pretrained(
+                 model_name_or_path,
+                 config=config,
+                 tokenizer=tokenizer,
+                 trust_remote_code=trust_remote_code,
+                 **kwargs,
+             )
+             self.tokenizer = self.model.tokenizer
+             self.metadata.update(self.model.metadata)
+         fprint(f"The pipeline has been initialized from {model_name_or_path}.")
+         return self
+
+     def train(self, datasets: dict = None, trainer=None, **kwargs):
+         """
+         Train the model in the pipeline.
+
+         This method initiates training of the model using the provided datasets
+         and trainer configuration. If no trainer is provided, the pipeline's
+         existing trainer will be used.
+
+         Args:
+             datasets (dict, optional): Dictionary containing train/validation/test
+                 datasets. If None, uses the pipeline's existing datasets.
+                 Keys should be 'train', 'valid', 'test'. Defaults to None.
+             trainer (Trainer, optional): Trainer instance to use for training.
+                 If None, uses the pipeline's existing trainer. Defaults to None.
+             **kwargs: Additional keyword arguments passed to the trainer.
+
+         Raises:
+             ValueError: If no trainer is available or datasets are invalid.
+             RuntimeError: If training fails.
+
+         Example:
+             >>> pipeline = Pipeline("my_pipeline", model_name_or_path=model)
+             >>> # Train with existing datasets
+             >>> pipeline.train()
+             >>> # Train with custom datasets
+             >>> custom_datasets = {'train': train_data, 'valid': valid_data}
+             >>> pipeline.train(datasets=custom_datasets)
+             >>> # Train with custom trainer
+             >>> from omnigenome import Trainer
+             >>> custom_trainer = Trainer(model, train_dataset=train_data)
+             >>> pipeline.train(trainer=custom_trainer)
+
+         Note:
+             - Training uses the pipeline's current model and device
+             - Progress and metrics are logged during training
+             - The trained model is automatically saved in the pipeline
+         """
+         if trainer is not None:
+             assert isinstance(trainer, Trainer)
+             self.trainer = trainer
+
+         self.trainer.train()
+
+     def predict(self, inputs, **kwargs):
+         """
+         Generate predictions for input data.
+
+         This method provides a high-level interface for generating predictions
+         from the pipeline's model. It handles preprocessing and postprocessing
+         automatically.
+
+         Args:
+             inputs: Input data for prediction. Can be:
+                 - str: Single sequence string
+                 - list: List of sequence strings
+                 - tensor: Preprocessed input tensors
+             **kwargs: Additional keyword arguments passed to model prediction.
+
+         Returns:
+             dict: Prediction results including:
+                 - predictions: Predicted labels or values
+                 - confidence: Confidence scores (if available)
+                 - logits: Raw model outputs (if requested)
+
+         Example:
+             >>> pipeline = Pipeline("my_pipeline", model_name_or_path=model)
+             >>> # Single prediction
+             >>> result = pipeline.predict("ATCGATCG")
+             >>> print(result['predictions'])
+             >>> # Batch prediction
+             >>> results = pipeline.predict(["ATCGATCG", "GCTAGCTA"])
+             >>> print(results['predictions'])
+
+         Note:
+             - Input preprocessing is handled automatically
+             - Results are formatted consistently across different model types
+             - Confidence scores are included when available
+         """
+         return self.model.predict(inputs, **kwargs)
+
+     def inference(self, inputs, **kwargs):
+         """
+         Run the full inference pipeline on input data.
+
+         This method provides the complete inference pipeline including
+         preprocessing, model forward pass, and postprocessing. It is the
+         recommended method for production inference.
+
+         Args:
+             inputs: Input data for inference. Can be:
+                 - str: Single sequence string
+                 - list: List of sequence strings
+                 - tensor: Preprocessed input tensors
+             **kwargs: Additional keyword arguments for inference including:
+                 - return_attention: Whether to return attention weights
+                 - return_hidden_states: Whether to return hidden states
+                 - temperature: Temperature for sampling (if applicable)
+
+         Returns:
+             dict: Complete inference results including:
+                 - predictions: Final predictions
+                 - confidence: Confidence scores
+                 - attention: Attention weights (if requested)
+                 - hidden_states: Hidden states (if requested)
+
+         Example:
+             >>> pipeline = Pipeline("my_pipeline", model_name_or_path=model)
+             >>> # Basic inference
+             >>> results = pipeline.inference("ATCGATCG")
+             >>> print(results['predictions'])
+             >>> # Inference with attention
+             >>> results = pipeline.inference("ATCGATCG", return_attention=True)
+             >>> print(results['attention'].shape)
+
+         Note:
+             - This is the most comprehensive inference method
+             - Handles all preprocessing and postprocessing automatically
+             - Returns rich information about the model's internal states
+         """
+         return self.model.inference(inputs, **kwargs)
+
+     @staticmethod
+     def load(pipeline_name_or_path, local_only=False, **kwargs):
+         """
+         Load a pipeline from disk or hub.
+
+         This static method loads a complete pipeline including model, tokenizer,
+         datasets, and trainer from a saved pipeline directory or hub identifier.
+
+         Args:
+             pipeline_name_or_path (str): Path to a saved pipeline directory or
+                 hub identifier for downloading.
+             local_only (bool, optional): If True, only load from local paths.
+                 If False, download from hub if not found locally. Defaults to False.
+             **kwargs: Additional keyword arguments for pipeline initialization:
+                 - device: Target device for the model
+                 - name: Custom name for the pipeline
+                 - trust_remote_code: Whether to trust remote code
+
+         Returns:
+             Pipeline: Loaded pipeline instance ready for use.
+
+         Raises:
+             FileNotFoundError: If the pipeline cannot be found locally and
+                 local_only is True.
+             ValueError: If pipeline files are corrupted or invalid.
+             ImportError: If required dependencies are not available.
+
+         Example:
+             >>> # Load from local path
+             >>> pipeline = Pipeline.load("./saved_pipeline")
+             >>> # Load from hub
+             >>> pipeline = Pipeline.load("yangheng/OmniGenome-RNA-Classification")
+             >>> # Use loaded pipeline
+             >>> results = pipeline("ATCGATCG")
+
+         Note:
+             - Loads all pipeline components (model, tokenizer, datasets, trainer)
+             - Automatically handles device placement
+             - Preserves all training configurations and metadata
+         """
+         import dill
+
+         if os.path.exists(pipeline_name_or_path):
+             path = pipeline_name_or_path
+         else:
+             path = download_pipeline(
+                 pipeline_name_or_path, local_only=local_only, **kwargs
+             )
+         with open(f"{path}/datasets.pkl", "rb") as f:
+             datasets = dill.load(f)
+         with open(f"{path}/trainer.pkl", "rb") as f:
+             trainer = dill.load(f)
+         model = ModelHub.load(path, local_only=local_only, **kwargs)
+         tokenizer = model.tokenizer
+         pipeline = Pipeline(
+             name=(
+                 pipeline_name_or_path
+                 if kwargs.get("name") is None
+                 else kwargs.get("name")
+             ),
+             model_name_or_path=model,
+             tokenizer=tokenizer,
+             datasets=datasets,
+             trainer=trainer,
+             **kwargs,
+         )
+         return pipeline
+
+     def save(self, path, overwrite=False, **kwargs):
+         """
+         Save the pipeline to disk.
+
+         This method saves the complete pipeline including model, tokenizer,
+         datasets, trainer, and metadata to a directory. The saved pipeline
+         can be loaded later using Pipeline.load().
+
+         Args:
+             path (str): Directory path where the pipeline is saved.
+             overwrite (bool, optional): If True, overwrite an existing directory.
+                 If False, raise an error if the directory exists. Defaults to False.
+             **kwargs: Additional keyword arguments for model saving.
+
+         Raises:
+             FileExistsError: If path exists and overwrite is False.
+             OSError: If there are issues creating the directory or writing files.
+             RuntimeError: If saving fails due to model or data issues.
+
+         Example:
+             >>> pipeline = Pipeline("my_pipeline", model_name_or_path=model)
+             >>> # Train the pipeline
+             >>> pipeline.train(datasets)
+             >>> # Save the trained pipeline
+             >>> pipeline.save("./trained_pipeline", overwrite=True)
+             >>> # Load the saved pipeline later
+             >>> loaded_pipeline = Pipeline.load("./trained_pipeline")
+
+         Note:
+             - Saves all pipeline components (model, tokenizer, datasets, trainer)
+             - Preserves training configurations and metadata
+             - The model is temporarily moved to CPU during saving to avoid GPU memory issues
+             - Creates a complete, self-contained pipeline directory
+         """
+         import dill
+
+         if os.path.exists(path) and not overwrite:
+             raise FileExistsError(
+                 f"The path {path} already exists, please set overwrite=True to overwrite it."
+             )
+         if not os.path.exists(path):
+             os.makedirs(path)
+         device = self.model.model.device
+         self.model.model.to("cpu")
+         with open(f"{path}/datasets.pkl", "wb") as f:
+             dill.dump(self.datasets, f)
+         with open(f"{path}/metadata.json", "w") as f:
+             json.dump(self.metadata, f)
+         with open(f"{path}/tokenizer.pkl", "wb") as f:
+             dill.dump(self.tokenizer, f)
+         with open(f"{path}/trainer.pkl", "wb") as f:
+             dill.dump(self.trainer, f)
+         self.model.save(path, overwrite=overwrite, **kwargs)
+         self.model.model.to(device)
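
The save()/load() pair above round-trips a pipeline through dill-pickled datasets, tokenizer, and trainer files plus the model's own save format. A minimal end-to-end sketch of that contract, assuming the top-level exports shown in the docstrings; the checkpoint identifier and local paths are illustrative placeholders taken from the documentation, not a verified recipe:

# Sketch only: names and paths mirror the docstrings above and are placeholders.
from omnigenome import Pipeline

# Build from a hub identifier; init_pipeline() tries ModelHub.load() first,
# then falls back to AutoConfig/AutoTokenizer + OmniModel.from_pretrained().
pipeline = Pipeline("demo", model_name_or_path="yangheng/OmniGenome-186M")

result = pipeline("ATCGATCG")  # __call__ delegates to model.inference()

# save() writes datasets.pkl, tokenizer.pkl, trainer.pkl, metadata.json,
# plus the model itself; load() reverses the process via ModelHub.load().
pipeline.save("./demo_pipeline", overwrite=True)
restored = Pipeline.load("./demo_pipeline")

Note that dill is imported lazily inside save() and load(), so it is only required when persisting a pipeline.
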
omnigenome/utility/pipeline_hub/pipeline_hub.py
@@ -0,0 +1,129 @@
+ # -*- coding: utf-8 -*-
+ # file: pipeline_hub.py
+ # time: 22:26 08/04/2024
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+ # github: https://github.com/yangheng95
+ # huggingface: https://huggingface.co/yangheng
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+ # Copyright (C) 2019-2024. All Rights Reserved.
+ """
+ Pipeline Hub Module
+
+ This module provides the PipelineHub class for managing and loading pre-built
+ pipelines from the OmniGenome hub. Pipelines combine models, tokenizers,
+ datasets, and trainers into ready-to-use workflows.
+ """
+
+ from .pipeline import Pipeline
+ from ...src.misc.utils import env_meta_info
+
+
+ class PipelineHub:
+     """
+     Hub for managing and loading pre-built OmniGenome pipelines.
+
+     The PipelineHub provides a centralized interface for accessing pre-built
+     pipelines that combine models, tokenizers, datasets, and training
+     configurations. It handles automatic downloading and loading of pipelines
+     from the OmniGenome hub.
+
+     Attributes:
+         metadata (dict): Environment metadata including system information,
+             package versions, and hardware details.
+
+     Example:
+         >>> from omnigenome import PipelineHub
+         >>> hub = PipelineHub()
+         >>> pipeline = hub.load("yangheng/OmniGenome-RNA-Classification")
+         >>> predictions = pipeline("ATCGATCG")
+         >>> print(predictions['predictions'])
+
+     Note:
+         - Pipelines can be loaded from local paths or downloaded from the hub
+         - The hub automatically handles model, tokenizer, and dataset loading
+         - Environment metadata is collected for reproducibility
+     """
+
+     def __init__(self, *args, **kwargs):
+         """
+         Initialize the PipelineHub.
+
+         Args:
+             *args: Variable length argument list (currently unused).
+             **kwargs: Arbitrary keyword arguments (currently unused).
+
+         Note:
+             The constructor initializes environment metadata for tracking
+             system information and package versions.
+         """
+         super(PipelineHub, self).__init__(*args, **kwargs)
+         self.metadata = env_meta_info()
+
+     @staticmethod
+     def load(pipeline_name_or_path, local_only=False, **kwargs):
+         """
+         Load a pipeline from the hub or a local path.
+
+         This method loads a complete pipeline including the model, tokenizer,
+         datasets, and trainer configuration. If the pipeline doesn't exist
+         locally and local_only is False, it will be downloaded from the hub.
+
+         Args:
+             pipeline_name_or_path (str): Name or path of the pipeline to load.
+                 Can be a local directory path or a hub identifier.
+             local_only (bool, optional): If True, only load from local paths.
+                 If False, download from hub if not found locally. Defaults to False.
+             **kwargs: Additional keyword arguments passed to the Pipeline constructor.
+                 Common options include:
+                 - device: Target device for the model
+                 - trust_remote_code: Whether to trust remote code in tokenizers
+                 - name: Custom name for the pipeline
+
+         Returns:
+             Pipeline: Loaded pipeline instance with model, tokenizer, datasets,
+                 and trainer ready for use.
+
+         Raises:
+             FileNotFoundError: If the pipeline cannot be found locally and
+                 local_only is True.
+             ValueError: If the pipeline configuration is invalid.
+             ImportError: If required dependencies are not available.
+
+         Example:
+             >>> hub = PipelineHub()
+             >>> # Load from hub
+             >>> pipeline = hub.load("yangheng/OmniGenome-RNA-Classification")
+             >>> # Load from local path
+             >>> pipeline = hub.load("./my_pipeline", local_only=True)
+             >>> # Use pipeline for inference
+             >>> results = pipeline("ATCGATCG")
+
+         Note:
+             - The pipeline includes all necessary components for training and inference
+             - Model weights, tokenizer, and datasets are automatically loaded
+             - The pipeline can be used immediately for inference or fine-tuning
+         """
+         return Pipeline.load(pipeline_name_or_path, local_only=local_only, **kwargs)
+
+     def push(self, pipeline, **kwargs):
+         """
+         Push a pipeline to the hub (not yet implemented).
+
+         This method is intended to upload custom pipelines to the OmniGenome hub
+         for sharing and distribution. Currently not implemented.
+
+         Args:
+             pipeline (Pipeline): Pipeline instance to upload to the hub.
+             **kwargs: Additional keyword arguments for the upload process.
+
+         Raises:
+             NotImplementedError: This method has not been implemented yet.
+
+         Note:
+             Future implementation will support:
+             - Pipeline metadata and documentation
+             - Model weights and configuration
+             - Tokenizer and dataset specifications
+             - Training configurations and results
+         """
+         raise NotImplementedError("This method has not implemented yet.")
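
For completeness, a hedged usage sketch of PipelineHub as documented above: load() is a static passthrough to Pipeline.load(), so the class-level call works as well as the instance form shown in the docstring. The hub identifier below is the one used in the docstring example and is assumed, not verified:

# Assumes the top-level re-export shown in the docstring example.
from omnigenome import PipelineHub

# Static passthrough: PipelineHub.load(...) is equivalent to Pipeline.load(...)
pipeline = PipelineHub.load("yangheng/OmniGenome-RNA-Classification")
print(pipeline("ATCGATCG"))

# push() is declared in this release but raises NotImplementedError.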