deepchopper-1.3.0-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. deepchopper/__init__.py +9 -0
  2. deepchopper/__init__.pyi +67 -0
  3. deepchopper/__main__.py +4 -0
  4. deepchopper/cli.py +260 -0
  5. deepchopper/data/__init__.py +15 -0
  6. deepchopper/data/components/__init__.py +1 -0
  7. deepchopper/data/encode_fq.py +41 -0
  8. deepchopper/data/fq_datamodule.py +352 -0
  9. deepchopper/data/hg_data.py +39 -0
  10. deepchopper/data/only_fq.py +388 -0
  11. deepchopper/deepchopper.abi3.so +0 -0
  12. deepchopper/eval.py +86 -0
  13. deepchopper/models/__init__.py +4 -0
  14. deepchopper/models/basic_module.py +243 -0
  15. deepchopper/models/callbacks.py +57 -0
  16. deepchopper/models/cnn.py +54 -0
  17. deepchopper/models/components/__init__.py +1 -0
  18. deepchopper/models/dc_hg.py +163 -0
  19. deepchopper/models/llm/__init__.py +32 -0
  20. deepchopper/models/llm/caduceus.py +55 -0
  21. deepchopper/models/llm/components.py +99 -0
  22. deepchopper/models/llm/head.py +102 -0
  23. deepchopper/models/llm/hyena.py +41 -0
  24. deepchopper/models/llm/metric.py +44 -0
  25. deepchopper/models/llm/tokenizer.py +205 -0
  26. deepchopper/models/transformer.py +107 -0
  27. deepchopper/py.typed +0 -0
  28. deepchopper/train.py +109 -0
  29. deepchopper/ui/__init__.py +1 -0
  30. deepchopper/ui/main.py +189 -0
  31. deepchopper/utils/__init__.py +37 -0
  32. deepchopper/utils/instantiators.py +54 -0
  33. deepchopper/utils/logging_utils.py +53 -0
  34. deepchopper/utils/preprocess.py +62 -0
  35. deepchopper/utils/print.py +102 -0
  36. deepchopper/utils/pylogger.py +57 -0
  37. deepchopper/utils/rich_utils.py +100 -0
  38. deepchopper/utils/utils.py +138 -0
  39. deepchopper-1.3.0.dist-info/METADATA +254 -0
  40. deepchopper-1.3.0.dist-info/RECORD +43 -0
  41. deepchopper-1.3.0.dist-info/WHEEL +4 -0
  42. deepchopper-1.3.0.dist-info/entry_points.txt +2 -0
  43. deepchopper-1.3.0.dist-info/licenses/LICENSE +201 -0
deepchopper/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """DeepChopper package."""
+
+ from . import cli, data, eval, models, train, ui, utils
+ from .deepchopper import *  # noqa: F403
+ from .models import DeepChopper
+
+ __version__ = "1.3.0"
+
+ __all__ = ["DeepChopper", "__version__", "cli", "data", "eval", "models", "train", "ui", "utils"]
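For orientation, a minimal sketch of the top-level API this `__init__.py` exposes; not taken from the wheel itself. The checkpoint id is the one referenced later in cli.py, and downloading it requires network access:

    import deepchopper

    print(deepchopper.__version__)  # "1.3.0"

    # Same Hugging Face checkpoint id that `deepchopper predict` uses for the default rna002 model.
    model = deepchopper.DeepChopper.from_pretrained("yangliz5/deepchopper")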
deepchopper/__init__.pyi ADDED
@@ -0,0 +1,67 @@
+ from pathlib import Path
+
+ import numpy as np
+
+ class FqEncoder:
+     option: FqEncoderOption
+     kmer2id_table: dict[str, int]
+     id2kmer_table: dict[int, str]
+
+     def __init__(self, option: FqEncoderOption) -> None: ...
+
+ class FqEncoderOption:
+     kmer_size: int
+     qual_offset: int
+     bases: list[int]
+     vectorized_target: bool
+     max_width: int
+     max_seq_len: int
+
+     def __init__(
+         self,
+         kmer_size: int,
+         qual_offset: int,
+         bases: list[int],
+         vectorized_target: bool,
+         max_width: int | None,
+         max_seq_len: int | None,
+     ) -> None: ...
+
+ class RecordData:
+     def __init__(self, id: str, seq: str, qual: str) -> None: ...
+     def id(self) -> str: ...
+     def set_id(self, id: str) -> None: ...
+     def seq(self) -> str: ...
+     def set_seq(self, seq: str) -> None: ...
+     def qual(self) -> str: ...
+     def set_qual(self, qual: str) -> None: ...
+
+ def seq_to_kmers(seq: str, k: int) -> list[str]: ...
+ def kmers_to_seq(kmers: list[str]) -> str: ...
+ def generate_kmers_table(base: str, k: int): ...
+ def generate_kmers(base: str, k: int) -> list[str]: ...
+ def to_kmer_target_region(start: int, end: int, k: int, seq_len: int | None) -> tuple[int, int]: ...
+ def to_original_target_region(start: int, end: int, k: int) -> tuple[int, int]: ...
+ def kmerids_to_seq(kmer_ids: list[int], id2kmer_table: dict[int, str]) -> str: ...
+ def write_fq(records_data: list[RecordData], file_path: Path | None): ...
+ def write_fq_parallel(records_data: list[RecordData], file_path: Path, threads: int): ...
+ def encode_fq_path(
+     fq_path: Path,
+     k: int,
+     bases: str,
+     qual_offset: int,
+     vectorized_target: bool,
+     max_width: int | None = None,
+     max_seq_len: int | None = None,
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray, dict[str, int]]: ...
+ def encode_fq_paths(
+     fq_paths: list[Path],
+     k: int,
+     bases: str,
+     qual_offset: int,
+     vectorized_target: bool,
+     parallel_for_files: bool,
+     max_width: int | None = None,
+     max_seq_len: int | None = None,
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray, dict[str, int]]: ...
+ def summary_record_len(path: Path) -> list[int]: ...
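For orientation, a minimal sketch exercising a few of the stubbed helpers above; not taken from the wheel, and it assumes the Rust extension's symbols are re-exported at the package top level (as the wildcard import in `__init__.py` suggests). Values in comments are illustrative and the tuple element names are guesses:

    from pathlib import Path

    import deepchopper

    # Split a read into 3-mers and reassemble it.
    kmers = deepchopper.seq_to_kmers("ACGTACGT", 3)
    seq = deepchopper.kmers_to_seq(kmers)

    # Encode a whole FASTQ file into numpy arrays plus a k-mer -> id table,
    # following the stub's annotation; argument values here are illustrative.
    inputs, quals, targets, kmer2id = deepchopper.encode_fq_path(
        Path("reads.fastq"), 3, "ACGTN", 33, False
    )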
deepchopper/__main__.py ADDED
@@ -0,0 +1,4 @@
+ from .cli import app
+
+ if __name__ == "__main__":
+     app()
deepchopper/cli.py ADDED
@@ -0,0 +1,260 @@
+ import logging
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import lightning
+ import torch
+ import typer
+ from click import Context
+ from rich import print
+ from rich.logging import RichHandler
+ from typer.core import TyperGroup
+
+ import deepchopper
+
+ from .utils import (
+     highlight_target,
+ )
+
+ if TYPE_CHECKING:
+     from lightning.pytorch import LightningDataModule
+
+
+ def set_logging_level(level: int = logging.INFO):
+     """Set the logging level.
+
+     Parameters:
+         level (int): The logging level to set.
+     """
+     FORMAT = "%(message)s"
+     logging.basicConfig(
+         level=level,
+         format=FORMAT,
+         handlers=[RichHandler()],
+     )
+
+
+ def random_show_seq(dataset, sample: int = 3):
+     """Randomly selects 'sample' number of sequences from the given dataset and prints their IDs and targets.
+
+     Parameters:
+         dataset: A list of dictionaries where each dictionary represents a sequence with keys 'id', 'seq', and 'target'.
+         sample (int): The number of sequences to randomly select from the dataset. Default is 3.
+     """
+     total = len(dataset)
+     import secrets
+
+     highlight_ids = (secrets.randbelow(total) for _ in range(sample))
+     for highlight_id in highlight_ids:
+         print(f"id: {dataset[highlight_id]['id']}")
+         highlight_target(dataset[highlight_id]["seq"], *dataset[highlight_id]["target"])
+
+
+ def encode(
+     fastq_path: Path = typer.Argument(None, help="DEPRECATED: Use 'deepchopper predict' instead"),
+ ):
+     """DEPRECATED: Please use `deepchopper predict fastq_path` directly."""
+     typer.secho(
+         "❌ Error: The 'encode' command is deprecated.\n Please use 'deepchopper predict <fastq_path>' instead.",
+         fg=typer.colors.RED,
+         err=True,
+     )
+     raise typer.Exit(1)
+
+
+ def predict(
+     data_path: Path = typer.Argument(..., help="Path to the dataset"),
+     gpus: int = typer.Option(0, "--gpus", "-g", help="Number of GPUs to use"),
+     output_path: Path | None = typer.Option(None, "--output", "-o", help="Output path for predictions"),
+     batch_size: int = typer.Option(12, "--batch-size", "-b", help="Batch size"),
+     num_workers: int = typer.Option(0, "--workers", "-w", help="Number of workers"),
+     model: str = typer.Option(
+         "rna002",
+         "--model",
+         "-m",
+         help="Model name (choices: rna002, rna004)",
+         show_choices=True,
+         case_sensitive=False,
+         metavar="MODEL",
+         rich_help_panel="Model",
+         callback=lambda v: v.lower()
+         if v.lower() in {"rna002", "rna004"}
+         else typer.BadParameter("Model must be one of: rna002, rna004"),
+     ),
+     limit_predict_batches: int | None = typer.Option(None, "--limit-batches", help="Limit prediction batches"),
+     max_sample: int | None = typer.Option(None, "--max-sample", help="Maximum number of samples to process"),
+     verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"),
+ ):
+     """Predict the given dataset using DeepChopper."""
+     if verbose:
+         set_logging_level(logging.INFO)
+
+     # Path validation
+     if isinstance(data_path, str):
+         data_path = Path(data_path)
+
+     if not data_path.exists():
+         typer.secho(f"❌ Error: Data path '{data_path}' does not exist.", fg=typer.colors.RED, err=True)
+         raise typer.Exit(1)
+
+     # Set seed only after validation passes
+     lightning.seed_everything(42, workers=True)
+
+     tokenizer = deepchopper.models.llm.load_tokenizer_from_hyena_model(model_name="hyenadna-small-32k-seqlen")
+     datamodule: LightningDataModule = deepchopper.data.OnlyFqDataModule(
+         train_data_path="dummy.parquet",
+         tokenizer=tokenizer,
+         predict_data_path=data_path.as_posix(),
+         batch_size=batch_size,
+         num_workers=num_workers,
+         max_predict_samples=max_sample,
+     )
+
+     model = (
+         deepchopper.DeepChopper.from_pretrained("yangliz5/deepchopper")
+         if model == "rna002"
+         else deepchopper.DeepChopper.from_pretrained("yangliz5/deepchopper-rna004")
+     )
+     output_path = Path(output_path or "predictions")
+     callbacks = [deepchopper.models.callbacks.CustomWriter(output_dir=output_path, write_interval="batch")]
+
+     if gpus > 0:
+         if torch.cuda.is_available():
+             accelerator = "gpu"
+             devices = min(gpus, torch.cuda.device_count())
+         elif torch.backends.mps.is_available():
+             accelerator = "mps"
+             devices = "auto"  # MPS currently supports only one device
+         else:
+             accelerator = "cpu"
+             devices = "auto"
+     else:
+         accelerator = "cpu"
+         devices = "auto"
+
+     trainer = lightning.pytorch.trainer.Trainer(
+         accelerator=accelerator,
+         devices=devices,
+         callbacks=callbacks,
+         deterministic=True,
+         logger=False,
+         limit_predict_batches=limit_predict_batches,
+     )
+
+     import multiprocess.context as ctx
+
+     ctx._force_start_method("spawn")
+     trainer.predict(model=model, dataloaders=datamodule, return_predictions=False)
+
+
+ def chop(
+     predicts: list[Path] = typer.Argument(..., help="Paths to prediction files"),
+     fq: Path = typer.Argument(..., help="Path to FASTQ file"),
+     smooth_window_size: int = typer.Option(21, "--smooth-window", help="Smooth window size"),
+     min_interval_size: int = typer.Option(13, "--min-interval-size", help="Minimum interval size"),
+     approved_interval_number: int = typer.Option(20, "--approved-intervals", help="Number of approved intervals"),
+     max_process_intervals: int = typer.Option(4, "--max-process-intervals", help="Maximum process intervals"),
+     min_read_length_after_chop: int = typer.Option(20, "--min-read-length", help="Minimum read length after chop"),
+     output_chopped_seqs: bool = typer.Option(False, "--output-chopped", help="Output chopped sequences"),
+     chop_type: str = typer.Option("all", "--chop-type", help="Chop type"),
+     threads: int = typer.Option(2, "--threads", help="Number of threads"),
+     verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"),
+     output_prefix: str | None = typer.Option(None, "--prefix", "-o", help="Output prefix"),
+     max_batch_size: int | None = typer.Option(None, "--max-batch", help="Maximum batch size"),
+ ):
+     """Chop sequences based on predictions."""
+     if verbose:
+         set_logging_level(logging.INFO)
+
+     import subprocess
+     from shutil import which
+
+     if which("deepchopper-chop") is None:
+         print("deepchopper-chop is not installed. Please use `cargo install deepchopper-chop` to install it.")
+         raise SystemExit
+
+     predict_files = " ".join([f"--pdt {predict}" for predict in predicts])
+
+     command = f"deepchopper-chop {predict_files} --fq {fq} -t {threads} -s {smooth_window_size} --mis {min_interval_size} -a {approved_interval_number} --mpi {max_process_intervals} --mcr {min_read_length_after_chop} --ct {chop_type} "
+
+     if output_chopped_seqs:
+         command += "--ocq "
+
+     if output_prefix is not None:
+         command += f"-o {output_prefix} "
+
+     if max_batch_size is not None:
+         command += f"-m {max_batch_size} "
+
+     try:
+         subprocess.run(command.split(), check=True)
+     except subprocess.CalledProcessError as e:
+         logging.error(f"Error: Chopping failed with exit code {e.returncode}")
+         raise e
+
+
+ def web():
+     """Run the web interface."""
+     deepchopper.ui.main()
+
+
+ class OrderCommands(TyperGroup):
+     """Order commands in the order they appear."""
+
+     def list_commands(self, ctx: Context):
+         """Return the list of commands in the order they appear."""
+         return list(self.commands)  # get commands using self.commands
+
+
+ def version_callback(value: bool):
+     """Print the version and exit."""
+     if value:
+         print(f"DeepChopper Version: {deepchopper.__version__}")
+         raise typer.Exit()
+
+
+ app = typer.Typer(
+     cls=OrderCommands,
+     context_settings={"help_option_names": ["-h", "--help"]},
+     help="DeepChopper: A genomic language model to identify artificial sequences.",
+ )
+
+
+ # Add the version option to the main app
+ @app.callback()
+ def main(
+     version: bool | None = typer.Option(
+         None,
+         "--version",
+         "-V",
+         help="Show the application's version and exit.",
+         callback=version_callback,
+         is_eager=True,
+     ),
+ ):
+     """DeepChopper CLI."""
+
+
+ app.command(
+     help="DeepChopper: encode the given fastq (DEPRECATED)",
+     epilog="DEPRECATED: Please use `deepchopper predict fastq_path` directly.",
+ )(encode)
+
+ app.command(
+     help="DeepChopper: predict the given dataset",
+     epilog="Example: deepchopper predict fastq_path --gpus 1 --output predictions",
+ )(predict)
+
+ app.command(
+     help="DeepChopper: chop the given predictions!",
+     epilog="Example: deepchopper chop predictions/0 fastq_path",
+ )(chop)
+
+ app.command(
+     help="DeepChopper: a web ui!",
+     epilog="Example: deepchopper web",
+ )(web)
+
+
+ if __name__ == "__main__":
+     app()
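For orientation, a minimal smoke-test sketch for the Typer app defined above; not part of the wheel contents, and the FASTQ path is a placeholder:

    from typer.testing import CliRunner

    from deepchopper.cli import app

    runner = CliRunner()

    # Eager --version option defined on the main callback.
    result = runner.invoke(app, ["--version"])
    print(result.output)

    # Predict on a (hypothetical) FASTQ file, then chop using the written predictions,
    # mirroring the epilog examples above.
    runner.invoke(app, ["predict", "example.fastq", "--output", "predictions"])
    runner.invoke(app, ["chop", "predictions/0", "example.fastq"])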
deepchopper/data/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """Data."""
+
+ from . import fq_datamodule
+ from .encode_fq import encode_fq_files_in_folder_to_parquet, encode_one_fq_file
+ from .hg_data import load_and_split_dataset
+ from .only_fq import OnlyFqDataModule, parse_fastq_file
+
+ __all__ = [
+     "OnlyFqDataModule",
+     "encode_fq_files_in_folder_to_parquet",
+     "encode_one_fq_file",
+     "fq_datamodule",
+     "load_and_split_dataset",
+     "parse_fastq_file",
+ ]
deepchopper/data/components/__init__.py ADDED
@@ -0,0 +1 @@
+ """Components of Data."""
deepchopper/data/encode_fq.py ADDED
@@ -0,0 +1,41 @@
+ import logging
+ from pathlib import Path
+
+ from rich.logging import RichHandler
+
+ from deepchopper.deepchopper import encode_fq_path_to_parquet
+
+
+ def encode_one_fq_file(
+     fq_file: Path,
+     kmer_size: int = 3,  # unused when encoding to parquet
+     qual_offset: int = 33,
+     bases: str = "ACGTN",
+ ):
+     """Encode the sequences in a single FASTQ file into numerical representations and save the encoded data."""
+     encode_fq_path_to_parquet(fq_file, kmer_size, bases, qual_offset, vectorized_target=False)
+
+
+ def encode_fq_files_in_folder_to_parquet(data_folder: Path):
+     """Encode all FASTQ files in a given folder.
+
+     Args:
+         data_folder (Path): The folder containing the FASTQ files to encode.
+
+     Raises:
+         FileNotFoundError: If the specified data_folder does not exist.
+     """
+     FORMAT = "%(message)s"
+     logging.basicConfig(
+         level=logging.INFO,
+         format=FORMAT,
+         handlers=[RichHandler()],
+     )
+
+     if not data_folder.exists():
+         msg = f"Folder {data_folder} does not exist."
+         logging.error(msg)
+         raise FileNotFoundError(msg)
+     for fq_file in (*data_folder.glob("*.fq"), *data_folder.glob("*.fastq")):
+         logging.info(f"Encoding {fq_file}")
+         encode_one_fq_file(fq_file)
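For orientation, a minimal sketch of how these helpers might be called; not taken from the wheel, assuming the Rust-backed encode_fq_path_to_parquet writes the encoded Parquet output for the given FASTQ input, and the paths are placeholders:

    from pathlib import Path

    from deepchopper.data import encode_fq_files_in_folder_to_parquet, encode_one_fq_file

    # Encode a single FASTQ file via the Rust extension.
    encode_one_fq_file(Path("sample.fastq"))

    # Encode every *.fq / *.fastq file found in a folder.
    encode_fq_files_in_folder_to_parquet(Path("reads/"))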