omnigenome 0.3.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of omnigenome might be problematic. Click here for more details.
- omnigenome/__init__.py +281 -0
- omnigenome/auto/__init__.py +3 -0
- omnigenome/auto/auto_bench/__init__.py +12 -0
- omnigenome/auto/auto_bench/auto_bench.py +484 -0
- omnigenome/auto/auto_bench/auto_bench_cli.py +230 -0
- omnigenome/auto/auto_bench/auto_bench_config.py +216 -0
- omnigenome/auto/auto_bench/config_check.py +34 -0
- omnigenome/auto/auto_train/__init__.py +13 -0
- omnigenome/auto/auto_train/auto_train.py +430 -0
- omnigenome/auto/auto_train/auto_train_cli.py +222 -0
- omnigenome/auto/bench_hub/__init__.py +12 -0
- omnigenome/auto/bench_hub/bench_hub.py +25 -0
- omnigenome/cli/__init__.py +13 -0
- omnigenome/cli/commands/__init__.py +13 -0
- omnigenome/cli/commands/base.py +83 -0
- omnigenome/cli/commands/bench/__init__.py +13 -0
- omnigenome/cli/commands/bench/bench_cli.py +202 -0
- omnigenome/cli/commands/rna/__init__.py +13 -0
- omnigenome/cli/commands/rna/rna_design.py +178 -0
- omnigenome/cli/omnigenome_cli.py +128 -0
- omnigenome/src/__init__.py +12 -0
- omnigenome/src/abc/__init__.py +12 -0
- omnigenome/src/abc/abstract_dataset.py +622 -0
- omnigenome/src/abc/abstract_metric.py +114 -0
- omnigenome/src/abc/abstract_model.py +689 -0
- omnigenome/src/abc/abstract_tokenizer.py +267 -0
- omnigenome/src/dataset/__init__.py +16 -0
- omnigenome/src/dataset/omni_dataset.py +435 -0
- omnigenome/src/lora/__init__.py +13 -0
- omnigenome/src/lora/lora_model.py +294 -0
- omnigenome/src/metric/__init__.py +15 -0
- omnigenome/src/metric/classification_metric.py +184 -0
- omnigenome/src/metric/metric.py +199 -0
- omnigenome/src/metric/ranking_metric.py +142 -0
- omnigenome/src/metric/regression_metric.py +191 -0
- omnigenome/src/misc/__init__.py +3 -0
- omnigenome/src/misc/utils.py +439 -0
- omnigenome/src/model/__init__.py +19 -0
- omnigenome/src/model/augmentation/__init__.py +12 -0
- omnigenome/src/model/augmentation/model.py +219 -0
- omnigenome/src/model/classification/__init__.py +12 -0
- omnigenome/src/model/classification/model.py +642 -0
- omnigenome/src/model/embedding/__init__.py +12 -0
- omnigenome/src/model/embedding/model.py +263 -0
- omnigenome/src/model/mlm/__init__.py +12 -0
- omnigenome/src/model/mlm/model.py +177 -0
- omnigenome/src/model/module_utils.py +232 -0
- omnigenome/src/model/regression/__init__.py +12 -0
- omnigenome/src/model/regression/model.py +786 -0
- omnigenome/src/model/regression/resnet.py +483 -0
- omnigenome/src/model/rna_design/__init__.py +12 -0
- omnigenome/src/model/rna_design/model.py +426 -0
- omnigenome/src/model/seq2seq/__init__.py +12 -0
- omnigenome/src/model/seq2seq/model.py +44 -0
- omnigenome/src/tokenizer/__init__.py +16 -0
- omnigenome/src/tokenizer/bpe_tokenizer.py +226 -0
- omnigenome/src/tokenizer/kmers_tokenizer.py +247 -0
- omnigenome/src/tokenizer/single_nucleotide_tokenizer.py +249 -0
- omnigenome/src/trainer/__init__.py +14 -0
- omnigenome/src/trainer/accelerate_trainer.py +739 -0
- omnigenome/src/trainer/hf_trainer.py +75 -0
- omnigenome/src/trainer/trainer.py +579 -0
- omnigenome/utility/__init__.py +3 -0
- omnigenome/utility/dataset_hub/__init__.py +13 -0
- omnigenome/utility/dataset_hub/dataset_hub.py +178 -0
- omnigenome/utility/ensemble.py +324 -0
- omnigenome/utility/hub_utils.py +517 -0
- omnigenome/utility/model_hub/__init__.py +12 -0
- omnigenome/utility/model_hub/model_hub.py +231 -0
- omnigenome/utility/pipeline_hub/__init__.py +12 -0
- omnigenome/utility/pipeline_hub/pipeline.py +483 -0
- omnigenome/utility/pipeline_hub/pipeline_hub.py +129 -0
- omnigenome-0.3.0a0.dist-info/METADATA +224 -0
- omnigenome-0.3.0a0.dist-info/RECORD +85 -0
- omnigenome-0.3.0a0.dist-info/WHEEL +5 -0
- omnigenome-0.3.0a0.dist-info/entry_points.txt +3 -0
- omnigenome-0.3.0a0.dist-info/licenses/LICENSE +201 -0
- omnigenome-0.3.0a0.dist-info/top_level.txt +2 -0
- tests/__init__.py +9 -0
- tests/conftest.py +160 -0
- tests/test_dataset_patterns.py +291 -0
- tests/test_examples_syntax.py +83 -0
- tests/test_model_loading.py +183 -0
- tests/test_rna_functions.py +255 -0
- tests/test_training_patterns.py +302 -0
|
@@ -0,0 +1,517 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# file: hub_utils.py
|
|
3
|
+
# time: 16:54 13/04/2024
|
|
4
|
+
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
|
|
5
|
+
# github: https://github.com/yangheng95
|
|
6
|
+
# huggingface: https://huggingface.co/yangheng
|
|
7
|
+
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
|
|
8
|
+
# Copyright (C) 2019-2024. All Rights Reserved.
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
from typing import Union, Dict, Any
|
|
13
|
+
|
|
14
|
+
import findfile
|
|
15
|
+
import requests
|
|
16
|
+
import tqdm
|
|
17
|
+
from packaging.version import Version
|
|
18
|
+
from termcolor import colored
|
|
19
|
+
|
|
20
|
+
from omnigenome import __version__ as current_version
|
|
21
|
+
from omnigenome.src.misc.utils import fprint, default_omnigenome_repo
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def unzip_checkpoint(checkpoint_path):
    """
    Unzips a checkpoint file.

    Extracts a zipped checkpoint archive into a sibling directory (the
    archive path without its ``.zip`` suffix), making it ready for use by
    the model loading functions.

    Args:
        checkpoint_path (str): The path to the ``.zip`` checkpoint file.

    Returns:
        str: The path to the extracted checkpoint directory.

    Example:
        >>> extracted_path = unzip_checkpoint("model.zip")
        >>> print(extracted_path)  # "model"
    """
    import zipfile

    # BUG FIX: str.strip(".zip") removes any of the characters {'.', 'z',
    # 'i', 'p'} from both ends, not the ".zip" suffix — e.g. "pizza.zip"
    # would become "a".  Remove the suffix explicitly instead.
    if checkpoint_path.endswith(".zip"):
        extract_dir = checkpoint_path[: -len(".zip")]
    else:
        extract_dir = checkpoint_path

    with zipfile.ZipFile(checkpoint_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)

    return extract_dir
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def query_models_info(
    keyword: Union[list, str], repo: str = None, local_only: bool = False, **kwargs
) -> Dict[str, Any]:
    """
    Queries information about available models from the hub.

    Retrieves the model index either from a remote repository or from the
    local cache file ``./models_info.json``, then filters the entries by
    keyword.

    Args:
        keyword (Union[list, str]): A keyword (substring of the model name)
            or a list of keywords; when a list is given, a model is kept
            only if every keyword appears in its name.
        repo (str, optional): The repository URL to query. If None, uses the
            default hub space.
        local_only (bool): Whether to use only the local cache. Defaults to False.
        **kwargs: Additional keyword arguments (unused; kept for API compatibility).

    Returns:
        Dict[str, Any]: A dictionary of model information entries matching the keyword(s).

    Example:
        >>> # Query all models
        >>> models = query_models_info("")
        >>> print(len(models))  # Number of available models

        >>> # Query specific models
        >>> models = query_models_info("DNA")
        >>> print(models.keys())  # Models containing "DNA"
    """
    if local_only:
        with open("./models_info.json", "r", encoding="utf8") as f:
            models_info = json.load(f)
    else:
        repo = repo if repo else "https://huggingface.co/spaces/anonymous8/gfm_hub/"
        try:
            response = requests.get(repo + "models_info.json")
            models_info = response.json()
            with open("./models_info.json", "w", encoding="utf8") as f:
                json.dump(models_info, f)
        except Exception as e:
            fprint(
                "Fail to download models info from huggingface space, the error is: {}".format(
                    e
                )
            )
            # Fall back to the local cache when the remote fetch fails.
            with open("./models_info.json", "r", encoding="utf8") as f:
                models_info = json.load(f)

    # BUG FIX: a list of keywords was previously ignored (the whole index
    # was returned despite the documented contract); now every keyword must
    # appear in the model name.  String behavior is unchanged.
    keywords = [keyword] if isinstance(keyword, str) else list(keyword)
    return {
        name: info
        for name, info in models_info.items()
        if all(kw in name for kw in keywords)
    }
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def query_pipelines_info(
    keyword: Union[list, str], repo: str = None, local_only: bool = False, **kwargs
) -> Dict[str, Any]:
    """
    Queries information about available pipelines from the hub.

    Retrieves the pipeline index either from a remote repository or from the
    local cache file ``./pipelines_info.json``, then filters the entries by
    keyword.

    Args:
        keyword (Union[list, str]): A keyword (substring of the pipeline name)
            or a list of keywords; when a list is given, a pipeline is kept
            only if every keyword appears in its name.
        repo (str, optional): The repository URL to query. If None, uses the default hub.
        local_only (bool): Whether to use only the local cache. Defaults to False.
        **kwargs: Additional keyword arguments (unused; kept for API compatibility).

    Returns:
        Dict[str, Any]: A dictionary of pipeline information entries matching the keyword(s).

    Example:
        >>> # Query all pipelines
        >>> pipelines = query_pipelines_info("")
        >>> print(len(pipelines))  # Number of available pipelines

        >>> # Query specific pipelines
        >>> pipelines = query_pipelines_info("classification")
        >>> print(pipelines.keys())  # Pipelines containing "classification"
    """
    if local_only:
        with open("./pipelines_info.json", "r", encoding="utf8") as f:
            pipelines_info = json.load(f)
    else:
        repo = (repo if repo else default_omnigenome_repo) + "resolve/main/"
        try:
            response = requests.get(repo + "pipelines_info.json")
            pipelines_info = response.json()
            with open("./pipelines_info.json", "w", encoding="utf8") as f:
                json.dump(pipelines_info, f)
        except Exception as e:
            fprint(
                "Fail to download pipelines info from huggingface space, the error is: {}".format(
                    e
                )
            )
            # Fall back to the local cache when the remote fetch fails.
            with open("./pipelines_info.json", "r", encoding="utf8") as f:
                pipelines_info = json.load(f)

    # BUG FIX: a list of keywords was previously ignored (the whole index
    # was returned despite the documented contract); now every keyword must
    # appear in the pipeline name.  String behavior is unchanged.
    keywords = [keyword] if isinstance(keyword, str) else list(keyword)
    return {
        name: info
        for name, info in pipelines_info.items()
        if all(kw in name for kw in keywords)
    }
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def query_benchmarks_info(
    keyword: Union[list, str], repo: str = None, local_only: bool = False, **kwargs
) -> Dict[str, Any]:
    """
    Queries information about available benchmarks from the hub.

    Retrieves the benchmark index either from a remote repository or from the
    local cache file ``./benchmarks_info.json``, then filters the entries by
    keyword.

    Args:
        keyword (Union[list, str]): A keyword (substring of the benchmark name)
            or a list of keywords; when a list is given, a benchmark is kept
            only if every keyword appears in its name.
        repo (str, optional): The repository URL to query. If None, uses the default hub.
        local_only (bool): Whether to use only the local cache. Defaults to False.
        **kwargs: Additional keyword arguments (unused; kept for API compatibility).

    Returns:
        Dict[str, Any]: A dictionary of benchmark information entries matching the keyword(s).

    Example:
        >>> # Query all benchmarks
        >>> benchmarks = query_benchmarks_info("")
        >>> print(len(benchmarks))  # Number of available benchmarks

        >>> # Query specific benchmarks
        >>> benchmarks = query_benchmarks_info("RGB")
        >>> print(benchmarks.keys())  # Benchmarks containing "RGB"
    """
    if local_only:
        with open("./benchmarks_info.json", "r", encoding="utf8") as f:
            benchmarks_info = json.load(f)
    else:
        repo = (repo if repo else default_omnigenome_repo) + "resolve/main/"
        try:
            response = requests.get(repo + "benchmarks_info.json")
            benchmarks_info = response.json()
            with open("./benchmarks_info.json", "w", encoding="utf8") as f:
                json.dump(benchmarks_info, f)
        except Exception as e:
            # BUG FIX: the message previously said "datasets info" although
            # this function fetches the benchmarks index.
            fprint(
                "Fail to download benchmarks info from huggingface space, the error is: {}".format(
                    e
                )
            )
            # Fall back to the local cache when the remote fetch fails.
            with open("./benchmarks_info.json", "r", encoding="utf8") as f:
                benchmarks_info = json.load(f)

    # BUG FIX: a list of keywords was previously ignored (the whole index
    # was returned despite the documented contract); now every keyword must
    # appear in the benchmark name.  String behavior is unchanged.
    keywords = [keyword] if isinstance(keyword, str) else list(keyword)
    return {
        name: info
        for name, info in benchmarks_info.items()
        if all(kw in name for kw in keywords)
    }
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def download_model(
    model_name_or_path: str, local_only: bool = False, repo: str = None, cache_dir=None
) -> str:
    """
    Downloads a model from the hub and caches it locally.

    Looks for a previously extracted checkpoint in the cache directory first;
    otherwise resolves the model in the hub's model index, downloads the
    zipped checkpoint, and extracts it.

    Args:
        model_name_or_path (str): The name or path of the model to download.
        local_only (bool): If True, read the model index from the local cache
            file instead of fetching it from the hub. Defaults to False.
        repo (str, optional): The URL of the repository to download the model from.
        cache_dir (str, optional): The directory to cache the downloaded model.
            If None, uses "__OMNIGENOME_DATA__/models/".

    Returns:
        str: The path to the downloaded and extracted model.

    Raises:
        ConnectionError: If the model download fails.
        ValueError: If the model is not found in the repository.

    Example:
        >>> model_path = download_model("DNABERT-2")
        >>> print(model_path)  # Path to the downloaded model
    """
    cache_dir = (cache_dir if cache_dir else "__OMNIGENOME_DATA__") + "/models/"
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    # Reuse a previously extracted checkpoint if one exists in the cache.
    # NOTE(review): this matches *any* config.json under cache_dir, not one
    # belonging to model_name_or_path — a differently named cached model
    # would be returned; confirm this is intended.
    ckpt_config = findfile.find_files(cache_dir, ["config.json"])
    if ckpt_config:
        return os.path.dirname(ckpt_config[0])

    # BUG FIX: the repository URL is needed for the actual model download in
    # BOTH branches below; previously it was only set in the non-local
    # branch, so local_only=True produced the URL "None/models/...".
    repo = (repo if repo else default_omnigenome_repo) + "resolve/main/"
    if local_only:
        with open("./models_info.json", "r", encoding="utf8") as f:
            models_info = json.load(f)
    else:
        try:
            response = requests.get(repo + "models_info.json")
            models_info = response.json()
            with open("./models_info.json", "w", encoding="utf8") as f:
                json.dump(models_info, f)
        except Exception as e:
            fprint(
                "Fail to download models info from huggingface space, the error is: {}".format(
                    e
                )
            )
            # Fall back to the local cache when the remote fetch fails.
            with open("./models_info.json", "r", encoding="utf8") as f:
                models_info = json.load(f)

    if model_name_or_path not in models_info:
        raise ValueError("Model not found in the repository.")

    model_info = models_info[model_name_or_path]
    try:
        model_url = f'{repo}/models/{model_info["filename"]}'
        response = requests.get(model_url, stream=True)
        cache_path = os.path.join(cache_dir, f"{model_info['filename']}")
        # The content-length header may be absent; use an indeterminate
        # progress bar in that case instead of raising KeyError.
        content_length = response.headers.get("content-length")
        total_mb = int(content_length) // 1024 // 1024 if content_length else None
        with open(cache_path, "wb") as f:
            for chunk in tqdm.tqdm(
                response.iter_content(chunk_size=1024 * 1024),
                unit="MB",
                total=total_mb,
                desc="Downloading model",
            ):
                f.write(chunk)
    except Exception as e:
        raise ConnectionError("Fail to download model: {}".format(e))

    return unzip_checkpoint(cache_path)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def download_pipeline(
    pipeline_name_or_path: str,
    local_only: bool = False,
    repo: str = None,
    cache_dir=None,
) -> str:
    """
    Downloads a pipeline from the hub and caches it locally.

    Looks for a previously extracted pipeline in the cache directory first;
    otherwise resolves the pipeline in the hub's pipeline index, downloads
    the zipped archive, and extracts it.

    Args:
        pipeline_name_or_path (str): The name or path of the pipeline to download.
        local_only (bool): If True, read the pipeline index from the local cache
            file instead of fetching it from the hub. Defaults to False.
        repo (str, optional): The URL of the repository to download the pipeline from.
        cache_dir (str, optional): The directory to cache the downloaded pipeline.
            If None, uses "__OMNIGENOME_DATA__/pipelines/".

    Returns:
        str: The path to the downloaded and extracted pipeline.

    Raises:
        ConnectionError: If the pipeline download fails.
        ValueError: If the pipeline is not found in the repository.

    Example:
        >>> pipeline_path = download_pipeline("classification_pipeline")
        >>> print(pipeline_path)  # Path to the downloaded pipeline
    """
    cache_dir = (cache_dir if cache_dir else "__OMNIGENOME_DATA__") + "/pipelines/"
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    # Reuse a previously extracted pipeline if one exists in the cache.
    # NOTE(review): this matches *any* config.json under cache_dir, not one
    # belonging to pipeline_name_or_path; confirm this is intended.
    ckpt_config = findfile.find_files(cache_dir, ["config.json"])
    if ckpt_config:
        return os.path.dirname(ckpt_config[0])

    # BUG FIX: the repository URL is needed for the actual pipeline download
    # in BOTH branches below; previously it was only set in the non-local
    # branch, so local_only=True produced the URL "None/pipelines/...".
    repo = (repo if repo else default_omnigenome_repo) + "resolve/main/"
    if local_only:
        with open("./pipelines_info.json", "r", encoding="utf8") as f:
            pipelines_info = json.load(f)
    else:
        try:
            response = requests.get(repo + "pipelines_info.json")
            pipelines_info = response.json()
            with open("./pipelines_info.json", "w", encoding="utf8") as f:
                json.dump(pipelines_info, f)
        except Exception as e:
            fprint(
                "Fail to download pipelines info from huggingface space, the error is: {}".format(
                    e
                )
            )
            # Fall back to the local cache when the remote fetch fails.
            with open("./pipelines_info.json", "r", encoding="utf8") as f:
                pipelines_info = json.load(f)

    if pipeline_name_or_path not in pipelines_info:
        raise ValueError("Pipeline not found in the repository.")

    pipeline_info = pipelines_info[pipeline_name_or_path]
    try:
        pipeline_url = f'{repo}/pipelines/{pipeline_info["filename"]}'
        response = requests.get(pipeline_url, stream=True)
        cache_path = os.path.join(cache_dir, f"{pipeline_info['filename']}")
        # The content-length header may be absent; use an indeterminate
        # progress bar in that case instead of raising KeyError.
        content_length = response.headers.get("content-length")
        total_mb = int(content_length) // 1024 // 1024 if content_length else None
        with open(cache_path, "wb") as f:
            for chunk in tqdm.tqdm(
                response.iter_content(chunk_size=1024 * 1024),
                unit="MB",
                total=total_mb,
                desc="Downloading pipeline",
            ):
                f.write(chunk)
    except Exception as e:
        raise ConnectionError("Fail to download pipeline: {}".format(e))

    return unzip_checkpoint(cache_path)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def download_benchmark(
    benchmark_name_or_path: str,
    local_only: bool = False,
    repo: str = None,
    cache_dir=None,
) -> str:
    """
    Downloads a benchmark from the hub and caches it locally.

    Looks for a previously extracted benchmark (identified by its
    ``metadata.py``) in the cache directory first; otherwise resolves the
    benchmark in the hub's benchmark index, downloads the zipped archive,
    and extracts it.

    Args:
        benchmark_name_or_path (str): The name or path of the benchmark to download.
        local_only (bool): If True, read the benchmark index from the local cache
            file instead of fetching it from the hub. Defaults to False.
        repo (str, optional): The URL of the repository to download the benchmark from.
        cache_dir (str, optional): The directory to cache the downloaded benchmark.
            If None, uses "__OMNIGENOME_DATA__/benchmarks/".

    Returns:
        str: The path to the downloaded and extracted benchmark.

    Raises:
        ConnectionError: If the benchmark download fails.
        ValueError: If the benchmark is not found in the repository.

    Example:
        >>> benchmark_path = download_benchmark("RGB")
        >>> print(benchmark_path)  # Path to the downloaded benchmark
    """
    cache_dir = (cache_dir if cache_dir else "__OMNIGENOME_DATA__") + "/benchmarks/"
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    # Reuse a previously extracted benchmark matching this name if cached.
    bench_config = findfile.find_file(
        cache_dir, [benchmark_name_or_path, "metadata.py"]
    )
    if bench_config:
        return os.path.dirname(bench_config)

    # BUG FIX: the repository URL is needed for the actual benchmark download
    # in BOTH branches below; previously it was only set in the non-local
    # branch, so local_only=True produced the URL "None/benchmarks/...".
    repo = (repo if repo else default_omnigenome_repo) + "resolve/main/"
    if local_only:
        with open("./benchmarks_info.json", "r", encoding="utf8") as f:
            benchmarks_info = json.load(f)
    else:
        try:
            response = requests.get(repo + "benchmarks_info.json")
            benchmarks_info = response.json()
            with open("./benchmarks_info.json", "w", encoding="utf8") as f:
                json.dump(benchmarks_info, f)
        except Exception as e:
            # BUG FIX: the message previously said "datasets info" although
            # this function fetches the benchmarks index.
            fprint(
                "Fail to download benchmarks info from huggingface space, the error is: {}".format(
                    e
                )
            )
            # Fall back to the local cache when the remote fetch fails.
            with open("./benchmarks_info.json", "r", encoding="utf8") as f:
                benchmarks_info = json.load(f)

    if benchmark_name_or_path not in benchmarks_info:
        raise ValueError("Benchmark not found in the repository.")

    benchmarks_info_item = benchmarks_info[benchmark_name_or_path]
    try:
        benchmark_url = f'{repo}/benchmarks/{benchmarks_info_item["filename"]}'
        response = requests.get(benchmark_url, stream=True)
        cache_path = os.path.join(cache_dir, f"{benchmarks_info_item['filename']}")
        # The content-length header may be absent; use an indeterminate
        # progress bar in that case instead of raising KeyError.
        content_length = response.headers.get("content-length")
        total_mb = int(content_length) // 1024 // 1024 if content_length else None
        with open(cache_path, "wb") as f:
            for chunk in tqdm.tqdm(
                response.iter_content(chunk_size=1024 * 1024),
                unit="MB",
                total=total_mb,
                desc="Downloading benchmark",
            ):
                f.write(chunk)
    except Exception as e:
        raise ConnectionError("Fail to download benchmark: {}".format(e))

    return unzip_checkpoint(cache_path)
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def check_version(repo: str = None) -> None:
    """
    Checks the version compatibility between local and remote OmniGenome.

    Fetches the version advertised by the remote repository, compares it
    with the locally installed OmniGenome version, and prints a colored
    status message (yellow for a mismatch, green when up to date, red when
    the check itself fails).

    Args:
        repo (str, optional): The repository URL to check. If None, uses the default hub.

    Example:
        >>> check_version()  # Check version compatibility
    """
    base_url = (repo if repo else default_omnigenome_repo) + "resolve/main/"
    try:
        version_info = requests.get(base_url + "version.json").json()
        remote_version = version_info["version"]
        local, remote = Version(current_version), Version(remote_version)
        if local < remote:
            message, color = (
                f"Warning: Your local OmniGenome version ({current_version}) "
                f"is older than the remote version ({remote_version}). "
                f"Please consider updating.",
                "yellow",
            )
        elif local > remote:
            message, color = (
                f"Warning: Your local OmniGenome version ({current_version}) "
                f"is newer than the remote version ({remote_version}). "
                f"This might cause compatibility issues.",
                "yellow",
            )
        else:
            message, color = (
                f"OmniGenome version ({current_version}) is up to date.",
                "green",
            )
        fprint(colored(message, color))
    except Exception as e:
        fprint(colored(f"Failed to check version: {e}", "red"))
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# file: __init__.py
|
|
3
|
+
# time: 18:27 11/04/2024
|
|
4
|
+
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
|
|
5
|
+
# github: https://github.com/yangheng95
|
|
6
|
+
# huggingface: https://huggingface.co/yangheng
|
|
7
|
+
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
|
|
8
|
+
# Copyright (C) 2019-2024. All Rights Reserved.
|
|
9
|
+
"""
|
|
10
|
+
This package contains modules for the model hub.
|
|
11
|
+
"""
|
|
12
|
+
|