omnigenome 0.3.13a0__tar.gz → 0.3.17a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of omnigenome might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: omnigenome
3
- Version: 0.3.13a0
3
+ Version: 0.3.17a0
4
4
  Summary: OmniGenome: A comprehensive toolkit for genome analysis.
5
5
  Home-page: https://github.com/yangheng95/OmniGenBench
6
6
  Author: Yang, Heng
@@ -145,7 +145,8 @@ seeds = [0, 1, 2, 3, 4]
145
145
  bench = AutoBench(benchmark=benchmark, model_name_or_path=gfm, overwrite=False)
146
146
  bench.run(autocast=False, batch_size=bench_size, seeds=seeds)
147
147
  ```
148
- You can find an example of AutoBench via Python API [here](examples/autobench/AutoBench_Tutorial.ipynb).
148
+ You can find an example of AutoBench via Python API [here](examples/autobench_gfm_evaluation/benchmarking_with_lora.ipynb).
149
+
149
150
 
150
151
  ## Supported Models
151
152
 
@@ -187,9 +188,9 @@ RNA design is a fundamental problem in synthetic biology,
187
188
  where the goal is to design RNA sequences that fold into a target structure.
188
189
  In this demo, we show how to use OmniGenoBench to design RNA sequences
189
190
  that fold into a target structure using a pre-trained model.
190
- The tutorials of RNA Design Demo can be found in [RNA_Design_Tutorial.ipynb](examples/rna_design/RNA_Design_Tutorial.ipynb).
191
+ The tutorials of RNA Design Demo can be found in [RNA_Design_Tutorial.ipynb](examples/rna_sequence_design/RNA_Design_Tutorial.ipynb).
191
192
 
192
- You can find a visual example of RNA Design [here](asset/RNA_Design.gif).
193
+ You can find a visual example of RNA Design [here](asset/RNADesign-Demo.gif).
193
194
 
194
195
  ### RNA Secondary Structure Prediction
195
196
 
@@ -199,10 +200,10 @@ In this demo, we show how to use OmniGenoBench to predict the secondary structur
199
200
  The tutorials of RNA Secondary Structure Prediction can be found in
200
201
  [Secondary_Structure_Prediction_Tutorial.ipynb](examples/rna_secondary_structure_prediction/Secondary_Structure_Prediction_Tutorial.ipynb).
201
202
 
202
- You can find a visual example of RNA Secondary Structure Prediction [here](asset/RNA_Structure_Prediction.gif).
203
+ You can find a visual example of RNA Secondary Structure Prediction [here](asset/RNASSP-Demo.gif).
203
204
 
204
205
  ### More Tutorials
205
- Please find more usage tutorials in [examples/tutorials](examples/tutorials).
206
+ Please find more usage tutorials in [examples](examples).
206
207
 
207
208
  ## Citation
208
209
  ```bibtex
@@ -115,23 +115,23 @@ try:
115
115
  from omnigenbench.src.trainer.accelerate_trainer import AccelerateTrainer
116
116
 
117
117
  # Import hub utilities
118
- from omnigenbench.utility.hub_utils import (
118
+ from omnigenbench.src.utility.hub_utils import (
119
119
  download_benchmark,
120
120
  download_model,
121
121
  download_pipeline,
122
122
  query_models_info,
123
123
  )
124
- from omnigenbench.utility import hub_utils
124
+ from omnigenbench.src.utility import hub_utils
125
125
 
126
126
  # Import hub classes
127
- from omnigenbench.utility.model_hub.model_hub import ModelHub
128
- from omnigenbench.utility.dataset_hub.dataset_hub import load_benchmark_datasets
129
- from omnigenbench.utility.pipeline_hub.pipeline import Pipeline
130
- from omnigenbench.utility.pipeline_hub.pipeline_hub import PipelineHub
127
+ from omnigenbench.src.utility.model_hub.model_hub import ModelHub
128
+ from omnigenbench.src.utility.dataset_hub import load_benchmark_datasets
129
+ from omnigenbench.src.utility.pipeline_hub import Pipeline
130
+ from omnigenbench.src.utility.pipeline_hub.pipeline_hub import PipelineHub
131
131
 
132
132
  # Import module utilities
133
133
  from omnigenbench.src.model.module_utils import OmniPooling
134
- from omnigenbench.utility.ensemble import VoteEnsemblePredictor
134
+ from omnigenbench.src.utility import VoteEnsemblePredictor
135
135
 
136
136
  # For backward compatibility version 0.2.7alpha and earlier
137
137
  from omnigenbench.auto.config.auto_config import AutoBenchConfig
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: omnigenome
3
- Version: 0.3.13a0
3
+ Version: 0.3.17a0
4
4
  Summary: OmniGenome: A comprehensive toolkit for genome analysis.
5
5
  Home-page: https://github.com/yangheng95/OmniGenBench
6
6
  Author: Yang, Heng
@@ -145,7 +145,8 @@ seeds = [0, 1, 2, 3, 4]
145
145
  bench = AutoBench(benchmark=benchmark, model_name_or_path=gfm, overwrite=False)
146
146
  bench.run(autocast=False, batch_size=bench_size, seeds=seeds)
147
147
  ```
148
- You can find an example of AutoBench via Python API [here](examples/autobench/AutoBench_Tutorial.ipynb).
148
+ You can find an example of AutoBench via Python API [here](examples/autobench_gfm_evaluation/benchmarking_with_lora.ipynb).
149
+
149
150
 
150
151
  ## Supported Models
151
152
 
@@ -187,9 +188,9 @@ RNA design is a fundamental problem in synthetic biology,
187
188
  where the goal is to design RNA sequences that fold into a target structure.
188
189
  In this demo, we show how to use OmniGenoBench to design RNA sequences
189
190
  that fold into a target structure using a pre-trained model.
190
- The tutorials of RNA Design Demo can be found in [RNA_Design_Tutorial.ipynb](examples/rna_design/RNA_Design_Tutorial.ipynb).
191
+ The tutorials of RNA Design Demo can be found in [RNA_Design_Tutorial.ipynb](examples/rna_sequence_design/RNA_Design_Tutorial.ipynb).
191
192
 
192
- You can find a visual example of RNA Design [here](asset/RNA_Design.gif).
193
+ You can find a visual example of RNA Design [here](asset/RNADesign-Demo.gif).
193
194
 
194
195
  ### RNA Secondary Structure Prediction
195
196
 
@@ -199,10 +200,10 @@ In this demo, we show how to use OmniGenoBench to predict the secondary structur
199
200
  The tutorials of RNA Secondary Structure Prediction can be found in
200
201
  [Secondary_Structure_Prediction_Tutorial.ipynb](examples/rna_secondary_structure_prediction/Secondary_Structure_Prediction_Tutorial.ipynb).
201
202
 
202
- You can find a visual example of RNA Secondary Structure Prediction [here](asset/RNA_Structure_Prediction.gif).
203
+ You can find a visual example of RNA Secondary Structure Prediction [here](asset/RNASSP-Demo.gif).
203
204
 
204
205
  ### More Tutorials
205
- Please find more usage tutorials in [examples/tutorials](examples/tutorials).
206
+ Please find more usage tutorials in [examples](examples).
206
207
 
207
208
  ## Citation
208
209
  ```bibtex
@@ -10,6 +10,7 @@ omnigenome.egg-info/requires.txt
10
10
  omnigenome.egg-info/top_level.txt
11
11
  tests/test_dataset_patterns.py
12
12
  tests/test_examples_syntax.py
13
+ tests/test_inference_with_dataset.py
13
14
  tests/test_model_loading.py
14
15
  tests/test_rna_functions.py
15
16
  tests/test_training_patterns.py
@@ -0,0 +1,330 @@
1
+ """
2
+ Test inference functionality with dataset preprocessing.
3
+ """
4
+ import pytest
5
+ import tempfile
6
+ import os
7
+ from unittest.mock import patch, MagicMock, Mock
8
+ import warnings
9
+
10
+ try:
11
+ import torch
12
+ from transformers import BatchEncoding
13
+ except ImportError:
14
+ torch = None
15
+ BatchEncoding = None
16
+
17
+ # Mark as slow tests - can be run with --run-slow
18
+ pytestmark = pytest.mark.slow
19
+
20
+
21
class TestInferenceWithDataset:
    """Test inference with dataset preprocessing functionality.

    The model-building tests replace ``AutoModel`` and ``AutoConfig`` (as
    looked up in ``omnigenbench.src.abc.abstract_model``) with mocks and then
    construct ``OmniModelForSequenceClassification`` with or without a
    ``dataset_class``.  Tests skip when torch or omnigenbench is unavailable.
    """

    # Patch targets shared by every model-building test.
    _AUTO_MODEL_TARGET = 'omnigenbench.src.abc.abstract_model.AutoModel'
    _AUTO_CONFIG_TARGET = 'omnigenbench.src.abc.abstract_model.AutoConfig'
    # Label mapping passed to every model; copied per call in _new_model.
    _LABEL2ID = {"negative": 0, "positive": 1}

    @staticmethod
    def _require_model_class():
        """Return ``OmniModelForSequenceClassification`` or skip the test.

        The calling test is skipped when torch was not importable at module
        load time or when omnigenbench cannot be imported.
        """
        if torch is None:
            pytest.skip("torch not available")
        try:
            from omnigenbench import OmniModelForSequenceClassification
        except ImportError:
            pytest.skip("omnigenbench not available")
        return OmniModelForSequenceClassification

    @staticmethod
    def _wire_backbone(mock_auto_model, mock_auto_config, mock_config):
        """Make both patched ``from_pretrained`` calls return mocks.

        Args:
            mock_auto_model: The mock that replaces ``AutoModel``.
            mock_auto_config: The mock that replaces ``AutoConfig``.
            mock_config: Config object returned by
                ``AutoConfig.from_pretrained`` and attached to the mocked
                backbone.
        """
        mock_auto_config.from_pretrained.return_value = mock_config
        backbone = MagicMock()
        backbone.config = mock_config
        backbone.device = torch.device('cpu')
        mock_auto_model.from_pretrained.return_value = backbone

    @classmethod
    def _new_model(cls, model_cls, tokenizer, **kwargs):
        """Construct ``model_cls`` with the arguments shared by all tests.

        A fresh copy of the label mapping is passed on every call so tests
        never share one dict instance.  Extra keyword arguments (for example
        ``dataset_class``) are forwarded unchanged.
        """
        return model_cls(
            "test_model",
            tokenizer,
            label2id=dict(cls._LABEL2ID),
            **kwargs
        )

    @pytest.fixture
    def mock_tokenizer(self):
        """Mock tokenizer for testing."""
        # Fixtures are set up before the test body runs, so the torch guard
        # inside each test cannot protect the torch.tensor calls below.
        if torch is None:
            pytest.skip("torch not available")

        tokenizer = MagicMock()
        tokenizer.return_value = {
            'input_ids': torch.tensor([[1, 2, 3, 4, 5]]),
            'attention_mask': torch.tensor([[1, 1, 1, 1, 1]])
        }
        tokenizer.pad_token_id = 0
        return tokenizer

    @pytest.fixture
    def mock_config(self):
        """Mock model config."""
        config = MagicMock()
        config.hidden_size = 768
        config.num_labels = 2
        config.label2id = {"negative": 0, "positive": 1}
        config.id2label = {0: "negative", 1: "positive"}
        config.pad_token_id = 0
        return config

    @pytest.fixture
    def mock_dataset_class(self, mock_tokenizer):
        """Mock dataset class with prepare_input method."""
        class MockDataset:
            def __init__(self, dataset_name, tokenizer, max_length=None, **kwargs):
                self.dataset_name = dataset_name
                self.tokenizer = tokenizer
                self.max_length = max_length or 1024

            def prepare_input(self, instance, **kwargs):
                """Return fixed mock tensors for a dict or str instance.

                A dict contributes its ``'label'`` value (``-100`` when
                absent); a str always gets ``-100``.

                Raises:
                    ValueError: If ``instance`` is neither a dict nor a str.
                """
                if isinstance(instance, dict):
                    label = instance.get('label', -100)
                elif isinstance(instance, str):
                    label = -100
                else:
                    raise ValueError("Unsupported instance type")

                # Mock tokenization
                return {
                    'input_ids': torch.tensor([[1, 2, 3, 4, 5]]),
                    'attention_mask': torch.tensor([[1, 1, 1, 1, 1]]),
                    'labels': torch.tensor([label])
                }

        return MockDataset

    def test_model_init_with_dataset_class(self, mock_tokenizer, mock_config, mock_dataset_class):
        """Test model initialization with dataset_class parameter."""
        model_cls = self._require_model_class()

        with patch(self._AUTO_MODEL_TARGET) as mock_auto_model, \
                patch(self._AUTO_CONFIG_TARGET) as mock_auto_config:
            self._wire_backbone(mock_auto_model, mock_auto_config, mock_config)

            # Initialize model with dataset_class
            model = self._new_model(
                model_cls, mock_tokenizer, dataset_class=mock_dataset_class
            )

            # Verify dataset_class is set
            assert hasattr(model, 'dataset_class')
            assert model.dataset_class == mock_dataset_class
            assert 'dataset_cls' in model.metadata

    def test_inference_with_string_input(self, mock_tokenizer, mock_config):
        """Test inference with string input (traditional way)."""
        model_cls = self._require_model_class()

        with patch(self._AUTO_MODEL_TARGET) as mock_auto_model, \
                patch(self._AUTO_CONFIG_TARGET) as mock_auto_config:
            self._wire_backbone(mock_auto_model, mock_auto_config, mock_config)

            model = self._new_model(model_cls, mock_tokenizer)

            # NOTE(review): no inference call is made here; only construction
            # without dataset_class is exercised.  The assertion mirrors
            # test_backward_compatibility_without_dataset_class so the test
            # checks something about the model it builds.
            assert not hasattr(model, 'dataset_class')

    def test_inference_with_dict_input_and_dataset_class(self, mock_tokenizer, mock_config, mock_dataset_class):
        """Test inference with dict input when dataset_class is set."""
        model_cls = self._require_model_class()

        with patch(self._AUTO_MODEL_TARGET) as mock_auto_model, \
                patch(self._AUTO_CONFIG_TARGET) as mock_auto_config:
            self._wire_backbone(mock_auto_model, mock_auto_config, mock_config)

            model = self._new_model(
                model_cls, mock_tokenizer, dataset_class=mock_dataset_class
            )

            # NOTE(review): dict inputs are expected to go through
            # dataset.prepare_input, but no inference call is made here.
            # The assertion mirrors test_model_init_with_dataset_class.
            assert model.dataset_class == mock_dataset_class

    def test_metadata_contains_dataset_info(self, mock_tokenizer, mock_config, mock_dataset_class):
        """Test that metadata contains dataset information."""
        model_cls = self._require_model_class()

        with patch(self._AUTO_MODEL_TARGET) as mock_auto_model, \
                patch(self._AUTO_CONFIG_TARGET) as mock_auto_config:
            self._wire_backbone(mock_auto_model, mock_auto_config, mock_config)

            model = self._new_model(
                model_cls, mock_tokenizer, dataset_class=mock_dataset_class
            )

            # Check metadata
            assert 'dataset_cls' in model.metadata
            assert 'dataset_module' in model.metadata
            assert model.metadata['dataset_cls'] == mock_dataset_class.__name__

    def test_backward_compatibility_without_dataset_class(self, mock_tokenizer, mock_config):
        """Test that models work without dataset_class (backward compatibility)."""
        model_cls = self._require_model_class()

        with patch(self._AUTO_MODEL_TARGET) as mock_auto_model, \
                patch(self._AUTO_CONFIG_TARGET) as mock_auto_config:
            self._wire_backbone(mock_auto_model, mock_auto_config, mock_config)

            # Initialize without dataset_class
            model = self._new_model(model_cls, mock_tokenizer)

            # Should not have dataset_class attribute
            assert not hasattr(model, 'dataset_class')
            # Metadata should not have dataset info
            assert 'dataset_cls' not in model.metadata

    def test_dataset_class_persistence(self, mock_tokenizer, mock_config, mock_dataset_class):
        """Test that dataset_class information is saved in metadata."""
        model_cls = self._require_model_class()

        with patch(self._AUTO_MODEL_TARGET) as mock_auto_model, \
                patch(self._AUTO_CONFIG_TARGET) as mock_auto_config:
            self._wire_backbone(mock_auto_model, mock_auto_config, mock_config)

            model = self._new_model(
                model_cls, mock_tokenizer, dataset_class=mock_dataset_class
            )

            # Collect metadata
            metadata = model._collect_metadata()

            # Check dataset info is in metadata
            assert 'dataset_cls' in metadata
            assert 'dataset_module' in metadata
            assert metadata['dataset_cls'] == mock_dataset_class.__name__

    def test_input_format_detection(self):
        """Test that different input formats are correctly detected."""
        # NOTE(review): conceptual placeholder - it checks Python literals
        # only and does not call _forward_from_raw_input.

        # String input
        assert isinstance("ATCGATCG", str)

        # List input
        assert isinstance(["ATCG", "GCTA"], list)

        # Dict input
        assert isinstance({"sequence": "ATCG", "label": 1}, dict)

        # Check dict has expected keys
        test_dict = {"sequence": "ATCG", "label": 1}
        assert "sequence" in test_dict or "seq" in test_dict

    def test_fallback_mechanism(self):
        """Test that fallback to tokenizer works when dataset processing fails."""
        # NOTE(review): conceptual placeholder - the implementation is said
        # to fall back to the tokenizer when dataset.prepare_input fails,
        # but that code path is not invoked here.
        has_dataset = False
        use_tokenizer = not has_dataset
        assert use_tokenizer

    def test_load_dataset_class_method_exists(self):
        """Test that _load_dataset_class method exists in OmniModel."""
        # Only the import sits inside the try, so the ImportError handler
        # can never be confused with a failure of the assertion itself.
        try:
            from omnigenbench.src.abc.abstract_model import OmniModel
        except ImportError:
            pytest.skip("omnigenbench not available")
        assert hasattr(OmniModel, '_load_dataset_class')
293
+
294
+
295
class TestDatasetPreprocessingIntegration:
    """Import-level integration checks for dataset preprocessing.

    Each test skips when the required omnigenbench names cannot be imported.
    """

    def test_dataset_import_patterns(self):
        """The four dataset classes are importable from ``omnigenbench``."""
        try:
            from omnigenbench import (
                OmniDatasetForSequenceClassification,
                OmniDatasetForTokenClassification,
                OmniDatasetForSequenceRegression,
                OmniDatasetForTokenRegression,
            )
        except ImportError:
            pytest.skip("omnigenbench dataset classes not available")
        else:
            # A successful import is the whole check.
            assert True

    def test_dataset_prepare_input_exists(self):
        """The sequence-classification dataset exposes ``prepare_input``."""
        try:
            from omnigenbench import OmniDatasetForSequenceClassification
        except ImportError:
            pytest.skip("omnigenbench not available")
        else:
            assert hasattr(OmniDatasetForSequenceClassification, 'prepare_input')

    def test_model_dataset_compatibility(self):
        """The model and dataset classes can be imported together."""
        try:
            from omnigenbench import (
                OmniModelForSequenceClassification,
                OmniDatasetForSequenceClassification,
            )
        except ImportError:
            pytest.skip("omnigenbench not available")
        else:
            # If both can be imported, they should be compatible
            assert True
330
+
File without changes
File without changes
File without changes