PyPI - omnigenome - Versions diffs - 0.3.0a0__py3-none-any.whl → 0.3.1a0__py3-none-any.whl - Mend

omnigenome 0.3.0a0__py3-none-any.whl → 0.3.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73)

omnigenome/__init__.py +29 -44
omnigenome/auto/auto_bench/__init__.py +0 -1
omnigenome/auto/auto_bench/auto_bench.py +24 -14
omnigenome/auto/auto_train/__init__.py +0 -1
omnigenome/auto/auto_train/auto_train.py +11 -12
omnigenome/auto/bench_hub/__init__.py +0 -1
omnigenome/auto/bench_hub/bench_hub.py +1 -1
omnigenome/cli/__init__.py +0 -1
omnigenome/cli/commands/__init__.py +0 -1
omnigenome/cli/commands/base.py +10 -10
omnigenome/cli/commands/bench/__init__.py +0 -1
omnigenome/cli/commands/bench/bench_cli.py +10 -10
omnigenome/cli/commands/rna/__init__.py +0 -1
omnigenome/cli/commands/rna/rna_design.py +10 -11
omnigenome/src/__init__.py +0 -1
omnigenome/src/abc/__init__.py +0 -1
omnigenome/src/abc/abstract_dataset.py +38 -19
omnigenome/src/abc/abstract_metric.py +7 -7
omnigenome/src/abc/abstract_model.py +15 -14
omnigenome/src/abc/abstract_tokenizer.py +9 -7
omnigenome/src/dataset/omni_dataset.py +16 -14
omnigenome/src/lora/__init__.py +0 -1
omnigenome/src/lora/lora_model.py +47 -41
omnigenome/src/metric/classification_metric.py +11 -11
omnigenome/src/metric/metric.py +19 -19
omnigenome/src/metric/ranking_metric.py +15 -15
omnigenome/src/metric/regression_metric.py +18 -18
omnigenome/src/misc/utils.py +214 -150
omnigenome/src/model/augmentation/__init__.py +0 -1
omnigenome/src/model/augmentation/model.py +17 -17
omnigenome/src/model/classification/__init__.py +0 -1
omnigenome/src/model/classification/model.py +28 -32
omnigenome/src/model/embedding/__init__.py +0 -1
omnigenome/src/model/embedding/model.py +35 -35
omnigenome/src/model/mlm/__init__.py +0 -1
omnigenome/src/model/mlm/model.py +13 -13
omnigenome/src/model/module_utils.py +17 -17
omnigenome/src/model/regression/__init__.py +0 -1
omnigenome/src/model/regression/model.py +72 -77
omnigenome/src/model/regression/resnet.py +32 -32
omnigenome/src/model/rna_design/__init__.py +0 -1
omnigenome/src/model/rna_design/model.py +168 -118
omnigenome/src/model/seq2seq/__init__.py +0 -1
omnigenome/src/model/seq2seq/model.py +4 -4
omnigenome/src/tokenizer/bpe_tokenizer.py +27 -27
omnigenome/src/tokenizer/kmers_tokenizer.py +22 -22
omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +11 -11
omnigenome/src/trainer/accelerate_trainer.py +40 -32
omnigenome/src/trainer/hf_trainer.py +8 -8
omnigenome/src/trainer/trainer.py +37 -25
omnigenome/utility/dataset_hub/__init__.py +0 -1
omnigenome/utility/dataset_hub/dataset_hub.py +13 -13
omnigenome/utility/ensemble.py +26 -26
omnigenome/utility/hub_utils.py +8 -8
omnigenome/utility/model_hub/__init__.py +0 -1
omnigenome/utility/model_hub/model_hub.py +26 -25
omnigenome/utility/pipeline_hub/__init__.py +0 -1
omnigenome/utility/pipeline_hub/pipeline.py +49 -49
omnigenome/utility/pipeline_hub/pipeline_hub.py +17 -17
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/METADATA +3 -3
omnigenome-0.3.1a0.dist-info/RECORD +78 -0
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/top_level.txt +0 -1
omnigenome-0.3.0a0.dist-info/RECORD +0 -85
tests/__init__.py +0 -9
tests/conftest.py +0 -160
tests/test_dataset_patterns.py +0 -291
tests/test_examples_syntax.py +0 -83
tests/test_model_loading.py +0 -183
tests/test_rna_functions.py +0 -255
tests/test_training_patterns.py +0 -302
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/WHEEL +0 -0
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/entry_points.txt +0 -0
{omnigenome-0.3.0a0.dist-info → omnigenome-0.3.1a0.dist-info}/licenses/LICENSE +0 -0

tests/conftest.py DELETED Viewed

@@ -1,160 +0,0 @@
-"""
-Pytest configuration and shared fixtures for OmniGenBench tests.
-"""
-import pytest
-import sys
-import os
-from pathlib import Path
-# Add the project root to Python path
-ROOT_DIR = Path(__file__).parent.parent
-sys.path.insert(0, str(ROOT_DIR))
-def pytest_configure(config):
-    """Configure pytest with custom markers."""
-    config.addinivalue_line(
-        "markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')"
-    )
-    config.addinivalue_line(
-        "markers", "gpu: marks tests that require GPU (deselect with '-m \"not gpu\"')"
-    )
-    config.addinivalue_line(
-        "markers", "integration: marks tests as integration tests"
-    )
-def pytest_collection_modifyitems(config, items):
-    """Auto-mark slow tests and skip GPU tests if CUDA not available."""
-    try:
-        import torch
-        cuda_available = torch.cuda.is_available()
-    except ImportError:
-        cuda_available = False
-    for item in items:
-        # Auto-mark slow tests
-        if "slow" in item.nodeid or "model_loading" in item.nodeid:
-            item.add_marker(pytest.mark.slow)
-        # Skip GPU tests if CUDA not available
-        if item.get_closest_marker("gpu") and not cuda_available:
-            item.add_marker(pytest.mark.skip(reason="CUDA not available"))
-@pytest.fixture
-def sample_rna_sequences():
-    """Sample RNA sequences for testing."""
-    return [
-        "AUGGCUACG",
-        "CGGAUACGGC",
-        "UGGCCAAGUC",
-        "AUGCUGCUAUGCUA"
-    ]
-@pytest.fixture
-def sample_rna_structures():
-    """Sample RNA secondary structures for testing."""
-    return [
-        "(((())))",
-        "(((...)))",
-        "........",
-        "((..))"
-    ]
-@pytest.fixture
-def sample_dataset_entries():
-    """Sample dataset entries in the format used by examples."""
-    return [
-        {"seq": "AUCG", "label": "(..)"},
-        {"seq": "AUGC", "label": "().."},
-        {"seq": "CGAU", "label": "(())"},
-        {"seq": "GAUC", "label": "...."}
-    ]
-@pytest.fixture
-def mock_model_config():
-    """Mock model configuration for testing."""
-    from unittest.mock import MagicMock
-    config = MagicMock()
-    config.hidden_size = 768
-    config.num_labels = 2
-    config.vocab_size = 32
-    config.max_position_embeddings = 512
-    return config
-@pytest.fixture
-def mock_tokenizer():
-    """Mock tokenizer for testing."""
-    from unittest.mock import MagicMock
-    tokenizer = MagicMock()
-    tokenizer.encode.return_value = [1, 2, 3, 4, 5]
-    tokenizer.decode.return_value = "AUGC"
-    tokenizer.convert_ids_to_tokens.return_value = ["A", "U", "G", "C"]
-    tokenizer.vocab_size = 32
-    tokenizer.pad_token_id = 0
-    tokenizer.eos_token_id = 2
-    return tokenizer
-@pytest.fixture
-def temp_data_dir(tmp_path):
-    """Create temporary directory with sample data files."""
-    data_dir = tmp_path / "data"
-    data_dir.mkdir()
-    # Create sample train.json
-    train_file = data_dir / "train.json"
-    train_data = [
-        '{"seq": "AUCG", "label": "(..)"}',
-        '{"seq": "AUGC", "label": "().."}',
-        '{"seq": "CGAU", "label": "(())"}'
-    ]
-    train_file.write_text("\n".join(train_data))
-    # Create sample test.json
-    test_file = data_dir / "test.json"
-    test_data = [
-        '{"seq": "GAUC", "label": "...."}',
-        '{"seq": "UCGA", "label": "(.)"}'
-    ]
-    test_file.write_text("\n".join(test_data))
-    # Create sample config.py
-    config_file = data_dir / "config.py"
-    config_content = '''
-# Dataset configuration
-max_length = 512
-num_labels = 4
-task_type = "classification"
-'''
-    config_file.write_text(config_content)
-    return data_dir
-@pytest.fixture(scope="session")
-def examples_dir():
-    """Path to examples directory."""
-    return ROOT_DIR / "examples"
-@pytest.fixture
-def skip_if_no_omnigenome():
-    """Skip test if omnigenome package is not available."""
-    try:
-        import omnigenome
-        return False
-    except ImportError:
-        pytest.skip("omnigenome package not available")
-# Custom pytest markers
-pytestmark = [
-    pytest.mark.filterwarnings("ignore:.*:DeprecationWarning"),
-    pytest.mark.filterwarnings("ignore:.*:UserWarning"),
-]

tests/test_dataset_patterns.py DELETED Viewed

@@ -1,291 +0,0 @@
-"""
-Test dataset loading and processing patterns based on examples.
-"""
-import pytest
-import json
-import tempfile
-import os
-from unittest.mock import patch, MagicMock, mock_open
-class TestDatasetPatterns:
-    """Test dataset patterns from examples."""
-    def test_dataset_imports(self):
-        """Test dataset class imports as shown in examples."""
-        try:
-            from omnigenome import (
-                OmniGenomeDatasetForSequenceClassification,
-                OmniGenomeDatasetForSequenceRegression,
-                OmniGenomeDatasetForTokenClassification,
-                OmniGenomeDatasetForTokenRegression,
-            )
-            assert True
-        except ImportError:
-            pytest.skip("omnigenome not available or missing dependencies")
-    def test_json_dataset_format(self):
-        """Test JSON dataset format used in examples."""
-        # Sample data format from toy_datasets
-        sample_data = [
-            {"seq": "AUCG", "label": "(...)"},
-            {"seq": "AUGC", "label": "(..)"},
-            {"seq": "CGAU", "label": "().."},
-        ]
-        # Verify format
-        for item in sample_data:
-            assert "seq" in item
-            assert "label" in item
-            assert isinstance(item["seq"], str)
-            assert len(item["seq"]) > 0
-    @patch("builtins.open", new_callable=mock_open)
-    @patch("json.loads")
-    def test_dataset_loading_pattern(self, mock_json_loads, mock_file):
-        """Test dataset loading pattern from examples."""
-        # Mock data similar to examples
-        mock_data = [
-            {"seq": "AUCG", "label": "(..)"},
-            {"seq": "AUGC", "label": "()"},
-        ]
-        mock_json_loads.return_value = mock_data[0]
-        mock_file.return_value.__iter__ = lambda self: iter([
-            '{"seq": "AUCG", "label": "(..)"}\n',
-            '{"seq": "AUGC", "label": "()"}\n'
-        ])
-        # Pattern from examples for loading test data
-        def load_test_data(file_path):
-            """Pattern from Secondary_Structure_Prediction.py."""
-            data = []
-            with open(file_path) as f:
-                for line in f:
-                    data.append(json.loads(line))
-            return data
-        # Test the pattern
-        result = load_test_data("test_file.json")
-        assert len(result) == 2
-    def test_config_file_structure(self):
-        """Test config.py structure from toy_datasets."""
-        # Common config patterns from examples
-        config_patterns = {
-            "max_length": [128, 256, 512, 1024],
-            "num_labels": [2, 3, 4, 5],
-            "task_type": ["classification", "regression", "token_classification"],
-        }
-        for key, valid_values in config_patterns.items():
-            assert isinstance(key, str)
-            assert isinstance(valid_values, list)
-            assert len(valid_values) > 0
-    def test_sample_data_extraction_pattern(self):
-        """Test sample data extraction pattern from examples."""
-        import random
-        try:
-            import numpy as np
-        except ImportError:
-            pytest.skip("numpy not available")
-        def sample_rna_sequence_pattern():
-            """Pattern from Secondary_Structure_Prediction.py."""
-            try:
-                # Mock data similar to toy_datasets/Archive2/test.json
-                mock_examples = [
-                    {"seq": "AUCG", "label": "(..)"},
-                    {"seq": "AUGC", "label": "().."},
-                    {"seq": "CGAU", "label": "(())"},
-                ]
-                ex = mock_examples[np.random.randint(len(mock_examples))]
-                return ex['seq'], ex.get('label', '')
-            except Exception as e:
-                return f"Error loading sample: {e}", ""
-        # Test the pattern
-        seq, label = sample_rna_sequence_pattern()
-        assert isinstance(seq, str)
-        assert isinstance(label, str)
-    def test_data_validation_patterns(self):
-        """Test data validation patterns from examples."""
-        def validate_sequence_label_pair(seq, label):
-            """Validate sequence-label pair format."""
-            if not isinstance(seq, str) or not isinstance(label, str):
-                return False
-            if len(seq) == 0:
-                return False
-            # RNA sequence validation
-            if not all(base in "AUCG" for base in seq):
-                return False
-            # Structure validation (if applicable)
-            if label and not all(c in "()." for c in label):
-                return False
-            return True
-        # Test valid pairs
-        valid_pairs = [
-            ("AUCG", "(..)"),
-            ("AUG", "..."),
-            ("AU", "()"),
-            ("A", "."),
-        ]
-        for seq, label in valid_pairs:
-            assert validate_sequence_label_pair(seq, label)
-        # Test invalid pairs
-        invalid_pairs = [
-            ("", ""),           # Empty sequence
-            ("AUXG", "(..)"),   # Invalid base X
-            ("AUCG", "(.)X"),   # Invalid structure char
-            (123, "(..)"),      # Non-string sequence
-            ("AUCG", 123),      # Non-string label
-        ]
-        for seq, label in invalid_pairs:
-            assert not validate_sequence_label_pair(seq, label)
-    def test_train_test_split_patterns(self):
-        """Test train/test split patterns from examples."""
-        # Mock dataset similar to toy_datasets structure
-        mock_data = [
-            {"seq": "AUCG", "label": "(..)"},
-            {"seq": "AUGC", "label": "().."},
-            {"seq": "CGAU", "label": "(())"},
-            {"seq": "GAUC", "label": "...."},
-        ]
-        def split_data_pattern(data, train_ratio=0.8):
-            """Simple train/test split pattern."""
-            import random
-            random.shuffle(data)
-            split_idx = int(len(data) * train_ratio)
-            return data[:split_idx], data[split_idx:]
-        train_data, test_data = split_data_pattern(mock_data.copy())
-        # Verify split
-        assert len(train_data) + len(test_data) == len(mock_data)
-        assert len(train_data) >= len(test_data)  # With 80/20 split
-    def test_dataset_file_patterns(self):
-        """Test dataset file naming patterns from examples."""
-        expected_files = ["train.json", "test.json", "valid.json", "config.py"]
-        for filename in expected_files:
-            # Verify naming patterns
-            if filename.endswith(".json"):
-                assert filename in ["train.json", "test.json", "valid.json"]
-            elif filename.endswith(".py"):
-                assert filename == "config.py"
-    def test_dataset_initialization_pattern(self):
-        """Test dataset initialization pattern from examples."""
-        try:
-            from omnigenome import OmniGenomeDatasetForSequenceClassification
-        except ImportError:
-            pytest.skip("omnigenome not available")
-        with patch("omnigenome.OmniGenomeDatasetForSequenceClassification") as mock_dataset:
-            mock_dataset.return_value = MagicMock()
-            # Create a single mock tokenizer instance to use in both call and assertion
-            mock_tokenizer_instance = MagicMock()
-            # Pattern from examples
-            dataset = OmniGenomeDatasetForSequenceClassification(
-                train_file="path/to/train.json",
-                test_file="path/to/test.json",
-                tokenizer=mock_tokenizer_instance,
-                max_length=512
-            )
-            # Verify the call was made with the expected arguments
-            mock_dataset.assert_called_once()
-            call_args = mock_dataset.call_args
-            assert call_args[1]["train_file"] == "path/to/train.json"
-            assert call_args[1]["test_file"] == "path/to/test.json"
-            assert call_args[1]["max_length"] == 512
-    def test_benchmark_dataset_structure(self):
-        """Test benchmark dataset structure from examples."""
-        # RGB benchmark structure from examples
-        rgb_tasks = [
-            "RNA-mRNA",
-            "RNA-SNMD",
-            "RNA-SNMR",
-            "RNA-SSP-Archive2",
-            "RNA-SSP-bpRNA",
-            "RNA-SSP-rnastralign"
-        ]
-        for task in rgb_tasks:
-            assert isinstance(task, str)
-            assert "RNA" in task
-            assert len(task) > 3
-    def test_eterna_dataset_pattern(self):
-        """Test Eterna dataset pattern from RNA design examples."""
-        # Pattern from eterna100_vienna2.txt usage
-        def load_eterna_pattern():
-            """Mock Eterna dataset loading pattern."""
-            # This would normally read from eterna100_vienna2.txt
-            mock_eterna_data = [
-                "(((...)))",
-                "(((())))",
-                "........",
-                "((..))"
-            ]
-            return mock_eterna_data
-        eterna_structures = load_eterna_pattern()
-        for structure in eterna_structures:
-            assert isinstance(structure, str)
-            assert all(c in "()." for c in structure)
-    def test_solved_sequences_format(self):
-        """Test solved sequences format from RNA design examples."""
-        # Format from solved_sequences.json in RNA design
-        solved_format = {
-            "puzzle_1": {
-                "sequence": "AUCG",
-                "structure": "(..)",
-                "energy": -5.2
-            },
-            "puzzle_2": {
-                "sequence": "AUGC",
-                "structure": "().",
-                "energy": -3.1
-            }
-        }
-        for puzzle_id, data in solved_format.items():
-            assert isinstance(puzzle_id, str)
-            assert "sequence" in data
-            assert "structure" in data
-            assert "energy" in data
-            assert isinstance(data["energy"], (int, float))
-    def test_data_loading_error_handling(self):
-        """Test error handling patterns from examples."""
-        def safe_load_pattern(file_path):
-            """Safe loading pattern from examples."""
-            try:
-                # Mock successful load
-                return [{"seq": "AUCG", "label": "(..)"}]
-            except FileNotFoundError:
-                return []
-            except json.JSONDecodeError:
-                return []
-            except Exception as e:
-                print(f"Unexpected error: {e}")
-                return []
-        # Test error handling
-        result = safe_load_pattern("nonexistent.json")
-        assert isinstance(result, list)

tests/test_examples_syntax.py DELETED Viewed

@@ -1,83 +0,0 @@
-import os
-import glob
-import ast
-import py_compile
-import nbformat
-import pytest
-# Root directory of the repository (two levels up from this test file)
-ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
-EXAMPLES_DIR = os.path.join(ROOT_DIR, "examples")
-# -----------------------------------------------------------------------------
-# Helper collectors
-# -----------------------------------------------------------------------------
-def _collect_example_py_files():
-    """Return list of all *.py files under examples/ recursively."""
-    pattern = os.path.join(EXAMPLES_DIR, "**", "*.py")
-    return [path for path in glob.glob(pattern, recursive=True) if os.path.isfile(path)]
-def _collect_example_notebooks():
-    """Return list of all *.ipynb files under examples/ recursively."""
-    pattern = os.path.join(EXAMPLES_DIR, "**", "*.ipynb")
-    return [path for path in glob.glob(pattern, recursive=True) if os.path.isfile(path)]
-# -----------------------------------------------------------------------------
-# Tests for Python scripts
-# -----------------------------------------------------------------------------
-@pytest.mark.parametrize("py_path", _collect_example_py_files())
-def test_example_python_files_compile(py_path):
-    """Ensure each example Python script has valid syntax.
-    This uses ``py_compile`` so the file is parsed by CPython without execution
-    of the module-level code, avoiding heavy runtime dependencies.
-    """
-    # doraise=True raises a ``py_compile.PyCompileError`` on failure which
-    # pytest will treat as a test failure.
-    py_compile.compile(py_path, doraise=True)
-# -----------------------------------------------------------------------------
-# Tests for Jupyter notebooks
-# -----------------------------------------------------------------------------
-def _clean_code(source: str) -> str:
-    """Remove Jupyter magics / shell escapes so source can be parsed by ``ast``.
-    Lines starting with ``%`` or ``!`` are stripped because they are not valid
-    Python syntax outside a notebook environment.
-    """
-    cleaned_lines = []
-    for line in source.splitlines():
-        stripped = line.lstrip()
-        if stripped.startswith("%") or stripped.startswith("!"):
-            # Skip IPython magic or shell command
-            continue
-        cleaned_lines.append(line)
-    return "\n".join(cleaned_lines)
-@pytest.mark.parametrize("nb_path", _collect_example_notebooks())
-def test_example_notebook_cells_parse(nb_path):
-    """Validate that each code cell in the example notebooks can be parsed.
-    Instead of executing potentially heavy code, we parse the cleaned source of
-    each code cell with the ``ast`` module to ensure syntactic correctness.
-    """
-    nb = nbformat.read(nb_path, as_version=4)
-    for cell in nb.cells:
-        if cell.cell_type != "code":
-            continue
-        cleaned = _clean_code(cell.source)
-        if cleaned.strip() == "":
-            # Skip empty cells after cleaning
-            continue
-        # ``ast.parse`` raises ``SyntaxError`` on invalid Python code which will
-        # fail the test if encountered.
-        ast.parse(cleaned, filename=nb_path)

omnigenome 0.3.0a0__py3-none-any.whl → 0.3.1a0__py3-none-any.whl

omnigenome 0.3.0a0__py3-none-any.whl → 0.3.1a0__py3-none-any.whl