deepchopper-1.3.0-cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepchopper/__init__.py +9 -0
- deepchopper/__init__.pyi +67 -0
- deepchopper/__main__.py +4 -0
- deepchopper/cli.py +260 -0
- deepchopper/data/__init__.py +15 -0
- deepchopper/data/components/__init__.py +1 -0
- deepchopper/data/encode_fq.py +41 -0
- deepchopper/data/fq_datamodule.py +352 -0
- deepchopper/data/hg_data.py +39 -0
- deepchopper/data/only_fq.py +388 -0
- deepchopper/deepchopper.abi3.so +0 -0
- deepchopper/eval.py +86 -0
- deepchopper/models/__init__.py +4 -0
- deepchopper/models/basic_module.py +243 -0
- deepchopper/models/callbacks.py +57 -0
- deepchopper/models/cnn.py +54 -0
- deepchopper/models/components/__init__.py +1 -0
- deepchopper/models/dc_hg.py +163 -0
- deepchopper/models/llm/__init__.py +32 -0
- deepchopper/models/llm/caduceus.py +55 -0
- deepchopper/models/llm/components.py +99 -0
- deepchopper/models/llm/head.py +102 -0
- deepchopper/models/llm/hyena.py +41 -0
- deepchopper/models/llm/metric.py +44 -0
- deepchopper/models/llm/tokenizer.py +205 -0
- deepchopper/models/transformer.py +107 -0
- deepchopper/py.typed +0 -0
- deepchopper/train.py +109 -0
- deepchopper/ui/__init__.py +1 -0
- deepchopper/ui/main.py +189 -0
- deepchopper/utils/__init__.py +37 -0
- deepchopper/utils/instantiators.py +54 -0
- deepchopper/utils/logging_utils.py +53 -0
- deepchopper/utils/preprocess.py +62 -0
- deepchopper/utils/print.py +102 -0
- deepchopper/utils/pylogger.py +57 -0
- deepchopper/utils/rich_utils.py +100 -0
- deepchopper/utils/utils.py +138 -0
- deepchopper-1.3.0.dist-info/METADATA +254 -0
- deepchopper-1.3.0.dist-info/RECORD +43 -0
- deepchopper-1.3.0.dist-info/WHEEL +4 -0
- deepchopper-1.3.0.dist-info/entry_points.txt +2 -0
- deepchopper-1.3.0.dist-info/licenses/LICENSE +201 -0
deepchopper/utils/rich_utils.py
@@ -0,0 +1,100 @@
from collections.abc import Sequence
from pathlib import Path

import rich
import rich.syntax
import rich.tree
from hydra.core.hydra_config import HydraConfig
from lightning_utilities.core.rank_zero import rank_zero_only
from omegaconf import DictConfig, OmegaConf, open_dict
from rich.prompt import Prompt

from . import pylogger

log = pylogger.RankedLogger(__name__, rank_zero_only=True)


@rank_zero_only
def print_config_tree(
    cfg: DictConfig,
    print_order: Sequence[str] = (
        "data",
        "model",
        "callbacks",
        "logger",
        "trainer",
        "paths",
        "extras",
    ),
    *,
    resolve: bool = False,
    save_to_file: bool = False,
) -> None:
    """Prints the contents of a DictConfig as a tree structure using the Rich library.

    :param cfg: A DictConfig composed by Hydra.
    :param print_order: Determines in what order config components are printed. Default is ``("data", "model",
        "callbacks", "logger", "trainer", "paths", "extras")``.
    :param resolve: Whether to resolve reference fields of DictConfig. Default is ``False``.
    :param save_to_file: Whether to export config to the hydra output folder. Default is ``False``.
    """
    style = "dim"
    tree = rich.tree.Tree("CONFIG", style=style, guide_style=style)

    queue = []

    # add fields from `print_order` to queue
    for field in print_order:
        if field in cfg:
            queue.append(field)
        else:
            log.warning(
                f"Field '{field}' not found in config. Skipping '{field}' config printing..."
            )

    # add all the other fields to queue (not specified in `print_order`)
    for field in cfg:
        if field not in queue:
            queue.append(field)

    # generate config tree from queue
    for field in queue:
        branch = tree.add(field, style=style, guide_style=style)

        config_group = cfg[field]
        if isinstance(config_group, DictConfig):
            branch_content = OmegaConf.to_yaml(config_group, resolve=resolve)
        else:
            branch_content = str(config_group)

        branch.add(rich.syntax.Syntax(branch_content, "yaml"))

    # print config tree
    rich.print(tree)

    # save config tree to file
    if save_to_file:
        with Path(cfg.paths.output_dir, "config_tree.log").open("w") as file:
            rich.print(tree, file=file)


@rank_zero_only
def enforce_tags(cfg: DictConfig, *, save_to_file: bool = False) -> None:
    """Prompts user to input tags from command line if no tags are provided in config.

    :param cfg: A DictConfig composed by Hydra.
    :param save_to_file: Whether to export tags to the hydra output folder. Default is ``False``.
    """
    if not cfg.get("tags"):
        if "id" in HydraConfig().cfg.hydra.job:
            raise ValueError("Specify tags before launching a multirun!")

        log.warning("No tags provided in config. Prompting user to input tags...")
        tags = Prompt.ask("Enter a list of comma separated tags", default="dev")
        tags = [t.strip() for t in tags.split(",") if t != ""]

        with open_dict(cfg):
            cfg.tags = tags

        log.info(f"Tags: {cfg.tags}")

    if save_to_file:
        with Path(cfg.paths.output_dir, "tags.log").open("w") as file:
            rich.print(cfg.tags, file=file)
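For orientation, a minimal sketch of how `enforce_tags` and `print_config_tree` are typically driven from a Hydra entry point. The `configs/train.yaml` config path, the config keys, and the presence of `cfg.paths.output_dir` (required when `save_to_file=True`) are assumptions for illustration, not part of this package.

```python
# Hypothetical Hydra entry point exercising the helpers above;
# config names and keys are illustrative assumptions.
import hydra
from omegaconf import DictConfig

from deepchopper.utils import rich_utils


@hydra.main(version_base="1.3", config_path="configs", config_name="train")
def main(cfg: DictConfig) -> None:
    # Prompt for tags if the config defines none, then render the config
    # tree with Rich; both are also written under cfg.paths.output_dir.
    rich_utils.enforce_tags(cfg, save_to_file=True)
    rich_utils.print_config_tree(cfg, resolve=True, save_to_file=True)


if __name__ == "__main__":
    main()
```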
deepchopper/utils/utils.py
@@ -0,0 +1,138 @@
from __future__ import annotations

import warnings
from importlib.util import find_spec
from typing import TYPE_CHECKING, Any

import torch

from . import pylogger, rich_utils

if TYPE_CHECKING:
    from collections.abc import Callable

    from omegaconf import DictConfig

log = pylogger.RankedLogger(__name__, rank_zero_only=True)


def extras(cfg: DictConfig) -> None:
    """Applies optional utilities before the task is started.

    Utilities:
        - Ignoring python warnings
        - Setting tags from command line
        - Rich config printing

    :param cfg: A DictConfig object containing the config tree.
    """
    # return if no `extras` config
    if not cfg.get("extras"):
        log.warning("Extras config not found! <cfg.extras=null>")
        return

    # disable python warnings
    if cfg.extras.get("ignore_warnings"):
        log.info("Disabling python warnings! <cfg.extras.ignore_warnings=True>")
        warnings.filterwarnings("ignore")

    # prompt user to input tags from command line if none are provided in the config
    if cfg.extras.get("enforce_tags"):
        log.info("Enforcing tags! <cfg.extras.enforce_tags=True>")
        rich_utils.enforce_tags(cfg, save_to_file=True)

    # pretty print config tree using Rich library
    if cfg.extras.get("print_config"):
        log.info("Printing config tree with Rich! <cfg.extras.print_config=True>")
        rich_utils.print_config_tree(cfg, resolve=True, save_to_file=True)


def task_wrapper(task_func: Callable) -> Callable:
    """Optional decorator that controls the failure behavior when executing the task function.

    This wrapper can be used to:
        - make sure loggers are closed even if the task function raises an exception (prevents multirun failure)
        - save the exception to a `.log` file
        - mark the run as failed with a dedicated file in the `logs/` folder (so we can find and rerun it later)
        - etc. (adjust depending on your needs)

    Example:
        ```
        @utils.task_wrapper
        def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
            ...
            return metric_dict, object_dict
        ```

    :param task_func: The task function to be wrapped.

    :return: The wrapped task function.
    """

    def wrap(cfg: DictConfig) -> tuple[dict[str, Any], dict[str, Any]]:
        # execute the task
        try:
            metric_dict, object_dict = task_func(cfg=cfg)

        # things to do if exception occurs
        except Exception as ex:
            # save exception to `.log` file
            log.exception("")

            # some hyperparameter combinations might be invalid or cause out-of-memory errors
            # so when using hparam search plugins like Optuna, you might want to disable
            # raising the below exception to avoid multirun failure
            raise ex

        # things to always do after either success or exception
        finally:
            # display output dir path in terminal
            log.info(f"Output dir: {cfg.paths.output_dir}")

            # always close wandb run (even if exception occurs so multirun won't fail)
            if find_spec("wandb"):  # check if wandb is installed
                import wandb

                if wandb.run:
                    log.info("Closing wandb!")
                    wandb.finish()

        return metric_dict, object_dict

    return wrap


def get_metric_value(metric_dict: dict[str, Any], metric_name: str | None) -> float | None:
    """Safely retrieves value of the metric logged in LightningModule.

    :param metric_dict: A dict containing metric values.
    :param metric_name: If provided, the name of the metric to retrieve.
    :return: If a metric name was provided, the value of the metric.
    """
    if not metric_name:
        log.info("Metric name is None! Skipping metric value retrieval...")
        return None

    if metric_name not in metric_dict:
        msg = (
            f"Metric value not found! <metric_name={metric_name}>\n"
            "Make sure metric name logged in LightningModule is correct!\n"
            "Make sure `optimized_metric` name in `hparams_search` config is correct!"
        )
        raise Exception(msg)

    metric_value = metric_dict[metric_name].item()
    log.info(f"Retrieved metric value! <{metric_name}={metric_value}>")

    return metric_value


def device():
    """Returns the appropriate device to use for training."""
    if torch.cuda.is_available():
        return torch.device("cuda")

    if torch.backends.mps.is_available():
        return torch.device("mps")

    return torch.device("cpu")
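A short, hedged sketch of how `task_wrapper`, `get_metric_value`, and `device` compose in a training script. The `paths.output_dir` field and the `"val/loss"` metric name are illustrative assumptions, and the dummy tensor stands in for metrics a LightningModule would log.

```python
# Illustrative composition of the utilities above; config fields and the
# metric name are assumptions, not part of this package's API.
import torch
from omegaconf import OmegaConf

from deepchopper.utils import utils


@utils.task_wrapper
def train(cfg) -> tuple[dict, dict]:
    # A real task would build the datamodule/model/trainer from cfg;
    # here we just pretend a metric was logged on the selected device.
    metric_dict = {"val/loss": torch.tensor(0.123, device=utils.device())}
    return metric_dict, {"cfg": cfg}


cfg = OmegaConf.create({"paths": {"output_dir": "/tmp/deepchopper_run"}})
metric_dict, _ = train(cfg=cfg)
print(utils.get_metric_value(metric_dict, "val/loss"))  # 0.123
```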
deepchopper-1.3.0.dist-info/METADATA
@@ -0,0 +1,254 @@
Metadata-Version: 2.4
Name: deepchopper
Version: 1.3.0
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Console
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Rust
Classifier: Topic :: Scientific/Engineering
Requires-Dist: torch>=2.6.0
Requires-Dist: lightning>=2.1.2
Requires-Dist: torchmetrics>=1.2.1
Requires-Dist: rich>=13.7.0
Requires-Dist: transformers>=4.37.2
Requires-Dist: safetensors>=0.4.2
Requires-Dist: datasets>=3.0.0
Requires-Dist: evaluate>=0.4.3
Requires-Dist: typer>=0.12.0
Requires-Dist: gradio==5.0.1
Requires-Dist: fastapi==0.112.2
Requires-Dist: scikit-learn>=1.5.2
Requires-Dist: hydra-core>=1.3.2
Requires-Dist: omegaconf>=2.3.0
Requires-Dist: pyfastx>=2.2.0
Requires-Dist: deepchopper-cli>=1.3.0
License-File: LICENSE
Summary: Genomic language model mitigates chimera artifacts in nanopore direct RNA sequencing
Keywords: deep learning,bioinformatics,rust
Home-Page: https://github.com/ylab-hi/DeepChopper
Author-email: Yangyang Li <yangyang.li@northwestern.edu>, Ting-you Wang <tywang@northwestern.edu>
Requires-Python: >=3.10
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
Project-URL: changelog, https://github.com/ylab-hi/DeepChopper/README.md
Project-URL: documentation, https://github.com/ylab-hi/DeepChopper
Project-URL: homepage, https://github.com/ylab-hi/DeepChopper
Project-URL: repository, https://github.com/ylab-hi/DeepChopper

|
|
46
|
+
|
|
47
|
+
[](https://pypi.python.org/pypi/deepchopper)
|
|
48
|
+
[](https://pypi.org/project/deepchopper/#files)
|
|
49
|
+
[](https://github.com/ylab-hi/DeepChopper/blob/main/LICENSE)
|
|
50
|
+
[](https://pypi.python.org/pypi/deepbiop)
|
|
51
|
+
[](https://pypi.org/project/deepchopper/#files)
|
|
52
|
+
[](https://github.com/ylab-hi/DeepChopper/actions)
|
|
53
|
+
[](https://huggingface.co/spaces/yangliz5/deepchopper)
|
|
54
|
+
|
|
55
|
+
<!--toc:start-->
|
|
56
|
+
|
|
57
|
+
- [**DeepChopper**](#-deepchopper-)
|
|
58
|
+
- [🚀 Quick Start: Try DeepChopper Online](#-quick-start-try-deepchopper-online)
|
|
59
|
+
- [📦 Installation](#-installation)
|
|
60
|
+
- [Compatibility and Support](#compatibility-and-support)
|
|
61
|
+
- [PyPI Support](#pypi-support)
|
|
62
|
+
- [🛠️ Usage](#%EF%B8%8F-usage)
|
|
63
|
+
- [Command-Line Interface](#command-line-interface)
|
|
64
|
+
- [Python Library](#python-library)
|
|
65
|
+
- [📚 Cite](#-cite)
|
|
66
|
+
- [🤝 Contribution](#-contribution)
|
|
67
|
+
- [Build Environment](#build-environment)
|
|
68
|
+
- [📬 Support](#-support)
|
|
69
|
+
|
|
70
|
+
<!--toc:end-->
|
|
71
|
+
|
|
72
|
+
🧬 DeepChopper leverages a language model to accurately detect and chop artificial sequences that may cause chimeric reads, ensuring higher quality and more reliable sequencing results.
|
|
73
|
+
By integrating seamlessly with existing workflows, DeepChopper provides a robust solution for researchers and bioinformaticians working with Nanopore direct-RNA sequencing data.
|
|
74
|
+
|
|
75
|
+
📘 **FEATURED:** We provide a comprehensive tutorial that includes an example dataset in our [full documentation](./documentation/tutorial.md).
|
|
76
|
+
|
|
77
|
+
## 🚀 Quick Start: Try DeepChopper Online
|
|
78
|
+
|
|
79
|
+
Experience DeepChopper instantly through our user-friendly web interface. No installation required!
|
|
80
|
+
Simply click the button below to launch the web application and start exploring DeepChopper's capabilities:
|
|
81
|
+
|
|
82
|
+
[](https://huggingface.co/spaces/yangliz5/deepchopper)
|
|
83
|
+
|
|
84
|
+
**What you can do online:**
|
|
85
|
+
|
|
86
|
+
- 📤 Upload your sequencing data
|
|
87
|
+
- 🔬 Run DeepChopper's analysis
|
|
88
|
+
- 📊 Visualize results
|
|
89
|
+
- 🎛️ Experiment with different parameters
|
|
90
|
+
|
|
91
|
+
Perfect for quick tests or demonstrations! However, for extensive analyses or custom workflows, we recommend installing DeepChopper locally.
|
|
92
|
+
|
|
93
|
+
> ⚠️ Note: The online version is limited to one FASTQ record at a time and may not be suitable for large-scale projects.
|
|
94
|
+
|
|
95
|
+
## 📦 Installation
|
|
96
|
+
|
|
97
|
+
DeepChopper can be installed using pip, the Python package installer.
|
|
98
|
+
Follow these steps to install:
|
|
99
|
+
|
|
100
|
+
1. Ensure you have Python 3.10 or later installed on your system.
|
|
101
|
+
|
|
102
|
+
2. Create a virtual environment (recommended):
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
python -m venv deepchopper_env
|
|
106
|
+
source deepchopper_env/bin/activate # On Windows use `deepchopper_env\Scripts\activate`
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
3. Install DeepChopper:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
pip install deepchopper
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
4. Verify the installation:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
deepchopper --help
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Compatibility and Support
|
|
122
|
+
|
|
123
|
+
DeepChopper is designed to work across various platforms and Python versions.
|
|
124
|
+
Below are the compatibility matrices for PyPI installations:
|
|
125
|
+
|
|
126
|
+
#### [PyPI Support][pypi]
|
|
127
|
+
|
|
128
|
+
| Python Version | Linux x86_64 | macOS Intel | macOS Apple Silicon | Windows x86_64 |
|
|
129
|
+
| :------------: | :----------: | :---------: | :-----------------: | :------------: |
|
|
130
|
+
| 3.10 | ✅ | ✅ | ✅ | ✅ |
|
|
131
|
+
| 3.11 | ✅ | ✅ | ✅ | ✅ |
|
|
132
|
+
| 3.12 | ✅ | ✅ | ✅ | ✅ |
|
|
133
|
+
|
|
134
|
+
🆘 Trouble installing? Check our [Troubleshooting Guide](https://github.com/ylab-hi/DeepChopper/blob/main/documentation/tutorial.md#troubleshooting) or [open an issue](https://github.com/ylab-hi/DeepChopper/issues).
|
|
135
|
+
|
|
136
|
+
## 🛠️ Usage
|
|
137
|
+
|
|
138
|
+
For a comprehensive guide, check out our [full tutorial](./documentation/tutorial.md).
|
|
139
|
+
Here's a quick overview:
|
|
140
|
+
|
|
141
|
+
### Command-Line Interface
|
|
142
|
+
|
|
143
|
+
DeepChopper offers three main commands: `encode`, `predict`, and `chop`.
|
|
144
|
+
|
|
145
|
+
1. **Encode** your input data:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
deepchopper encode <input.fq>
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
2. **Predict** chimera artifacts:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
deepchopper predict <input.parquet> --output predictions
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Using GPUs? Add the `--gpus` flag:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
deepchopper predict <input.parquet> --output predictions --gpus 2
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
3. **Chop** chimera artifacts:
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
deepchopper chop <predictions> raw.fq
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
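Putting the three commands together, an end-to-end run might look like the sketch below. The encoded `.parquet` file name and the `predictions` output directory are assumptions carried over from the snippets above; check the paths that `encode` and `predict` actually report on your data.

```bash
# Hypothetical end-to-end pipeline; file names follow the snippets above
# and may differ from what `deepchopper encode` writes for your input.
deepchopper encode input.fq                              # encode reads to a .parquet file
deepchopper predict input.parquet --output predictions   # write model predictions
deepchopper chop predictions input.fq                    # chop artifacts in the original FASTQ
```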
**Memory Optimization:** For large datasets (>5M reads), use the `--chunk-size` parameter to control memory usage:

```bash
# Low memory (~1-2GB): Slower but memory-efficient
deepchopper chop <predictions> raw.fq --chunk-size 1000

# Balanced (default, ~5-10GB): Good balance of speed and memory
deepchopper chop <predictions> raw.fq --chunk-size 10000

# High performance (~20-50GB): Fastest, requires more memory
deepchopper chop <predictions> raw.fq --chunk-size 50000
```

The chop command uses **streaming mode** to minimize memory usage. Instead of loading all reads into memory at once (which can require 100GB+ for 20M reads), it processes records in configurable chunks and writes results incrementally.
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
deepchopper web
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Python Library
|
|
191
|
+
|
|
192
|
+
Integrate DeepChopper into your Python scripts:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
import deepchopper
|
|
196
|
+
|
|
197
|
+
model = deepchopper.DeepChopper.from_pretrained("yangliz5/deepchopper")
|
|
198
|
+
# Your analysis code here
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## 📚 Cite
|
|
202
|
+
|
|
203
|
+
If DeepChopper aids your research, please cite [our paper](https://www.nature.com/articles/s41467-026-68571-5):
|
|
204
|
+
|
|
205
|
+
```bibtex
|
|
206
|
+
@article{li2026genomic,
|
|
207
|
+
title = {Genomic Language Model Mitigates Chimera Artifacts in Nanopore Direct {{RNA}} Sequencing},
|
|
208
|
+
author = {Li, Yangyang and Wang, Ting-You and Guo, Qingxiang and Ren, Yanan and Lu, Xiaotong and Cao, Qi and Yang, Rendong},
|
|
209
|
+
date = {2026-01-19},
|
|
210
|
+
journaltitle = {Nature Communications},
|
|
211
|
+
shortjournal = {Nat Commun},
|
|
212
|
+
publisher = {Nature Publishing Group},
|
|
213
|
+
issn = {2041-1723},
|
|
214
|
+
doi = {10.1038/s41467-026-68571-5},
|
|
215
|
+
url = {https://www.nature.com/articles/s41467-026-68571-5},
|
|
216
|
+
urldate = {2026-01-20}
|
|
217
|
+
}
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## 🤝 Contribution
|
|
221
|
+
|
|
222
|
+
We welcome contributions! Here's how to set up your development environment:
|
|
223
|
+
|
|
224
|
+
### Build Environment
|
|
225
|
+
|
|
226
|
+
Install [UV](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) and [Rust](https://www.rust-lang.org/tools/install)
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
git clone https://github.com/ylab-hi/DeepChopper.git
|
|
230
|
+
cd DeepChopper
|
|
231
|
+
|
|
232
|
+
# Install dependencies
|
|
233
|
+
uv sync
|
|
234
|
+
|
|
235
|
+
# Run DeepChopper
|
|
236
|
+
uv run deepchopper --help
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
🎉 Ready to contribute? Check out our [Contribution Guidelines](./CONTRIBUTING.md) to get started!
|
|
240
|
+
|
|
241
|
+
## 📬 Support
|
|
242
|
+
|
|
243
|
+
Need help? Have questions?
|
|
244
|
+
|
|
245
|
+
- 📖 Check our [Documentation](./documentation/tutorial.md)
|
|
246
|
+
- 🐛 [Report issues](https://github.com/ylab-hi/DeepChopper/issues)
|
|
247
|
+
|
|
248
|
+
______________________________________________________________________
|
|
249
|
+
|
|
250
|
+
DeepChopper is developed with ❤️ by the YLab team.
|
|
251
|
+
Happy sequencing! 🧬🔬
|
|
252
|
+
|
|
253
|
+
[pypi]: https://pypi.python.org/pypi/deepchopper
|
|
254
|
+
|
|
deepchopper-1.3.0.dist-info/RECORD
@@ -0,0 +1,43 @@
deepchopper/__init__.py,sha256=vVC8c-WYPrzqVzUh9-GmUlVQ1HU3F7waE2_7KS4XVV8,279
deepchopper/__init__.pyi,sha256=RWptrMQYYQDw3TuecAxtIE3eYkI-aMDvTKK_qVL7GtA,2150
deepchopper/__main__.py,sha256=Qd-f8z2Q2vpiEP2x6PBFsJrpACWDVxFKQk820MhFmHo,59
deepchopper/cli.py,sha256=MKRGZaby1RWHTkicV5Yd5wuKrwwS1ZO5lTeY4oqALbs,8920
deepchopper/data/__init__.py,sha256=hRcMeIfZEFVRmpASnXxs3RXjA-C0Zcwfu3I99Xe-V8k,405
deepchopper/data/components/__init__.py,sha256=hrrThPUqyvY_CNXCMFD3-6-mJeJxxT97kD44uCh19B4,26
deepchopper/data/encode_fq.py,sha256=d8r-zM_G37yJ4GH2d-vpml5z5UZDxNR_ZhfRVNeHeLM,1180
deepchopper/data/fq_datamodule.py,sha256=J74qpymotjalamZoydx9DIMMgT5yAKzVu08qNaGCvG8,14279
deepchopper/data/hg_data.py,sha256=it6Q6zKmhGIp4U1D3r6-dKwKkS3BDLALVIxwzGdnIbc,1399
deepchopper/data/only_fq.py,sha256=UF6pDLJvukjzuOaIwNsdTrAe9nwNFWEzh-uyIS3lk9I,15549
deepchopper/deepchopper.abi3.so,sha256=YWVdrXYYKO_1WAlRn_7L9_9XhmHdrZH5yPMGA4VIf0A,17820848
deepchopper/eval.py,sha256=AV6QaL5I2ZZheXAozC8Y1x9lr8F8m_EzeWwITlXlrlY,2756
deepchopper/models/__init__.py,sha256=qjs3MzLBYRen1P57hPV5hBeJofPYq1sMVjrpNYV7hog,84
deepchopper/models/basic_module.py,sha256=EPq4yCbx2oHaX2QyYyQyEHSJPvaa_YDSg8wS0jjhEko,10011
deepchopper/models/callbacks.py,sha256=RF7C-vCyXL-HT65NC-D9Jtdqsz9V3SdOWvhS6aCxkeo,2296
deepchopper/models/cnn.py,sha256=ZImrwd-Tob8DkHyoVa2z5r9DI64A-cp6qixsNrlfdJQ,1918
deepchopper/models/components/__init__.py,sha256=QB4-xNJc70QgSy45ovk3pblXN7VONB7WJq7e9QTbLF4,24
deepchopper/models/dc_hg.py,sha256=aQQH3CQjk6t4HtTkISIvfCXhiZWcK8d8wEg84TOJolQ,6246
deepchopper/models/llm/__init__.py,sha256=bpyFSsmN2hqKgZurkDXWo-ruk8OO6VAjQWdxX4CLFkA,921
deepchopper/models/llm/caduceus.py,sha256=luJ4TXlFq5akHi3GzA2bZ0HJ2IUrvRPJHjiXIG04Tfc,1595
deepchopper/models/llm/components.py,sha256=Djvez8W32E9OdfLTS43ZESidGd8C5w9a6bhgBpxiVBs,3078
deepchopper/models/llm/head.py,sha256=TDkzTfTaSy91e13eP6qHCTPGfVzCrCQjCw9Qfd4BY6U,4280
deepchopper/models/llm/hyena.py,sha256=NhOIhHyGldA9WySWTSJ39rd5rV6F_SDj7mm7TV7jOSM,1161
deepchopper/models/llm/metric.py,sha256=0YTmk0UA2bZS8A6_YS-ff2whmOKvt_jt64tm04A6YM0,1502
deepchopper/models/llm/tokenizer.py,sha256=OSohXxQqEOAiBLa5VsRSqBmRtRJ7f_lIF69RRDhOYHs,8382
deepchopper/models/transformer.py,sha256=PP2t7ISMbBlwXgETXy0jFxdVrRuXcZY8jp6QUjU6I3w,3689
deepchopper/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
deepchopper/train.py,sha256=xc9h3NE-KkRCrAG9KjbX6pR5RzaGUMXH2NhRqzGEYN4,3597
deepchopper/ui/__init__.py,sha256=-bBNFYOq80A2Egtpo5V5zWJtYOxQfRZFQ_feve5lkFU,23
deepchopper/ui/main.py,sha256=89amRC-u2ikmNTNFQBTr_ZStmpaF9knMopi2g29yx4o,8805
deepchopper/utils/__init__.py,sha256=2SbnAXWCw6DPRPz_MC86MhJdFerMIjX-SML3SrGTEMs,959
deepchopper/utils/instantiators.py,sha256=qZpYHR9sfTgo4-6B9Bc9UFWccf9mJDw8Y00wHfz1hys,1741
deepchopper/utils/logging_utils.py,sha256=KHFfwHqmwI-RxyCtdFwNyLKljozZS6PNviIgM_qjr7E,1722
deepchopper/utils/preprocess.py,sha256=2XNL1ol-1JRHs7oxkUa-lmbAQgNTy0LNGGP_hA5rysY,2129
deepchopper/utils/print.py,sha256=6WibNTPeZzBWmM6vfE_BqYKjWsZZIU9liG1yNsaKzlg,3072
deepchopper/utils/pylogger.py,sha256=_09Z3fnoSI2udjkwR_sCMO8fVImYh1HMS-k4KrCxrKI,2512
deepchopper/utils/rich_utils.py,sha256=FFgIxnetoe5b9YNSN9ASaqRSZiztB7zJDWd6EBW3x2o,3283
deepchopper/utils/utils.py,sha256=reIctC8RP3fQeGpD8tw9ifgOh4MWPi8t0nxWN17fT6s,4623
deepchopper-1.3.0.dist-info/METADATA,sha256=IOpSt1JY7JCZALRhw7fy_pbfatutXh5uOTglfhXa0Lc,9420
deepchopper-1.3.0.dist-info/WHEEL,sha256=vZ12AMAE5CVtd8oYbYGrz3omfHuIZCNO_3P50V00s00,104
deepchopper-1.3.0.dist-info/entry_points.txt,sha256=sX2f2aZ_wguUzA9yw3Lr17DBBjjV9DBdcIuKeKk3r78,50
deepchopper-1.3.0.dist-info/licenses/LICENSE,sha256=Yo7RMo69x4zRi_3gBAdmi55FFQWk3MVJghEU3H2eDkE,11354
deepchopper-1.3.0.dist-info/RECORD,,