PyPI - tetss2 - Versions diffs - 0.1.0__py3-none-any.whl - Mend

tetss2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

tetss2/__init__.py +3 -0
tetss2/assets/best_model.pth +0 -0
tetss2/cli.py +108 -0
tetss2/model.py +27 -0
tetss2/predictor.py +139 -0
tetss2-0.1.0.dist-info/METADATA +375 -0
tetss2-0.1.0.dist-info/RECORD +10 -0
tetss2-0.1.0.dist-info/WHEEL +5 -0
tetss2-0.1.0.dist-info/entry_points.txt +2 -0
tetss2-0.1.0.dist-info/top_level.txt +1 -0

tetss2/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .predictor import TETSS2Predictor
+__all__ = ["TETSS2Predictor"]

tetss2/assets/best_model.pth ADDED Viewed

Binary file

tetss2/cli.py ADDED Viewed

@@ -0,0 +1,108 @@
+import os
+os.environ["MKL_THREADING_LAYER"] = "GNU"
+import argparse
+import json
+from .predictor import TETSS2Predictor
+def main():
+    parser = argparse.ArgumentParser(
+        description="TETSS2.0: predict TE-TSS activity from 201 bp DNA sequence."
+    )
+    subparsers = parser.add_subparsers(dest="command")
+    predict_parser = subparsers.add_parser(
+        "predict",
+        help="Predict one DNA sequence."
+    )
+    predict_parser.add_argument(
+        "--sequence",
+        required=True,
+        help="Input DNA sequence. TETSS2.0 expects exactly 201 bp."
+    )
+    predict_parser.add_argument(
+        "--threshold",
+        type=float,
+        default=0.5,
+        help="Classification threshold. Default: 0.5"
+    )
+    predict_parser.add_argument(
+        "--no-length-check",
+        action="store_true",
+        help="Disable 201 bp length check. Mainly for debugging."
+    )
+    file_parser = subparsers.add_parser(
+        "predict-file",
+        help="Predict sequences from a TSV or CSV file."
+    )
+    file_parser.add_argument(
+        "--input",
+        required=True,
+        help="Input file path. The file should contain a sequence column."
+    )
+    file_parser.add_argument(
+        "--output",
+        required=True,
+        help="Output file path."
+    )
+    file_parser.add_argument(
+        "--sequence-column",
+        default="sequence",
+        help="Column name containing DNA sequences. Default: sequence"
+    )
+    file_parser.add_argument(
+        "--threshold",
+        type=float,
+        default=0.5,
+        help="Classification threshold. Default: 0.5"
+    )
+    file_parser.add_argument(
+        "--sep",
+        default="\t",
+        help="File separator. Default: tab. Use ',' for CSV."
+    )
+    file_parser.add_argument(
+        "--no-length-check",
+        action="store_true",
+        help="Disable 201 bp length check. Mainly for debugging."
+    )
+    args = parser.parse_args()
+    if args.command == "predict":
+        predictor = TETSS2Predictor()
+        result = predictor.predict(
+            args.sequence,
+            threshold=args.threshold,
+            check_length=not args.no_length_check,
+        )
+        print(json.dumps(result, indent=2))
+    elif args.command == "predict-file":
+        sep = args.sep
+        if sep == "\\t":
+            sep = "\t"
+        predictor = TETSS2Predictor()
+        predictor.predict_file(
+            input_path=args.input,
+            output_path=args.output,
+            sequence_column=args.sequence_column,
+            threshold=args.threshold,
+            sep=sep,
+            check_length=not args.no_length_check,
+        )
+        print(f"Prediction finished. Results saved to: {args.output}")
+    else:
+        parser.print_help()
+# Run the command-line interface when this module is executed as a script.
+if __name__ == "__main__":
+    main()

tetss2/model.py ADDED Viewed

@@ -0,0 +1,27 @@
+import torch
+import torch.nn as nn
+class TETSSClassifier(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.features = nn.Sequential(
+            nn.Conv1d(4, 64, kernel_size=15, padding=7),
+            nn.BatchNorm1d(64), nn.ReLU(), nn.MaxPool1d(2),
+            nn.Conv1d(64, 128, kernel_size=15, padding=7),
+            nn.BatchNorm1d(128), nn.ReLU(), nn.MaxPool1d(2),
+            nn.Conv1d(128, 128, kernel_size=15, padding=7),
+            nn.BatchNorm1d(128), nn.ReLU(), nn.MaxPool1d(2),
+            nn.Conv1d(128, 256, kernel_size=7, padding=3),
+            nn.BatchNorm1d(256), nn.ReLU(), nn.AdaptiveMaxPool1d(1)
+        )
+        self.classifier = nn.Sequential(
+            nn.Flatten(),
+            nn.Linear(256, 128),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(128, 1)
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.classifier(self.features(x)).squeeze(1)

tetss2/predictor.py ADDED Viewed

@@ -0,0 +1,139 @@
+import os
+os.environ["MKL_THREADING_LAYER"] = "GNU"
+import torch
+import numpy as np
+import pandas as pd
+from importlib.resources import files
+from .model import TETSSClassifier
+# Sequence length in bp that validate_sequence() requires by default.
+EXPECTED_SEQUENCE_LENGTH = 201
+def clean_sequence(seq: str) -> str:
+    seq = str(seq).strip().upper()
+    return seq
+def validate_sequence(seq: str, expected_length: int = EXPECTED_SEQUENCE_LENGTH) -> None:
+    allowed_bases = set("ACGTN")
+    invalid_bases = sorted(set(seq) - allowed_bases)
+    if invalid_bases:
+        raise ValueError(
+            f"Invalid bases found: {invalid_bases}. "
+            "Only A, C, G, T, and N are allowed."
+        )
+    if len(seq) != expected_length:
+        raise ValueError(
+            f"Invalid sequence length: {len(seq)} bp. "
+            f"TETSS2.0 expects exactly {expected_length} bp."
+        )
+def seq_to_onehot(seq: str) -> np.ndarray:
+    mapping = {"A": 0, "C": 1, "G": 2, "T": 3}
+    arr = np.zeros((4, len(seq)), dtype=np.float32)
+    for i, base in enumerate(seq.upper()):
+        idx = mapping.get(base)
+        if idx is not None:
+            arr[idx, i] = 1.0
+    return arr
+class TETSS2Predictor:
+    def __init__(self, model_path=None, device=None):
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        if model_path is None:
+            model_path = files("tetss2").joinpath("assets/best_model.pth")
+        self.model = TETSSClassifier().to(self.device)
+        state_dict = torch.load(model_path, map_location=self.device)
+        self.model.load_state_dict(state_dict)
+        self.model.eval()
+    def predict_proba(self, sequence: str, check_length: bool = True) -> float:
+        sequence = clean_sequence(sequence)
+        if check_length:
+            validate_sequence(sequence)
+        x = seq_to_onehot(sequence)
+        x = torch.from_numpy(x).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            logits = self.model(x)
+            prob = torch.sigmoid(logits).item()
+        return float(prob)
+    def predict(
+        self,
+        sequence: str,
+        threshold: float = 0.5,
+        check_length: bool = True,
+    ) -> dict:
+        sequence = clean_sequence(sequence)
+        prob = self.predict_proba(sequence, check_length=check_length)
+        label = int(prob > threshold)
+        return {
+            "model": "TETSS2.0",
+            "sequence": sequence,
+            "sequence_length": len(sequence),
+            "probability": prob,
+            "prediction": label,
+            "threshold": threshold,
+        }
+    def predict_file(
+        self,
+        input_path: str,
+        output_path: str,
+        sequence_column: str = "sequence",
+        threshold: float = 0.5,
+        sep: str = "\t",
+        check_length: bool = True,
+    ) -> None:
+        df = pd.read_csv(input_path, sep=sep)
+        if sequence_column not in df.columns:
+            raise ValueError(
+                f"Cannot find sequence column '{sequence_column}' in input file. "
+                f"Available columns are: {list(df.columns)}"
+            )
+        probabilities = []
+        predictions = []
+        sequence_lengths = []
+        for idx, seq in enumerate(df[sequence_column], start=1):
+            seq = clean_sequence(seq)
+            try:
+                prob = self.predict_proba(seq, check_length=check_length)
+            except ValueError as e:
+                raise ValueError(
+                    f"Error in row {idx}: {e}"
+                )
+            pred = int(prob > threshold)
+            probabilities.append(prob)
+            predictions.append(pred)
+            sequence_lengths.append(len(seq))
+        df["tetss2_sequence_length"] = sequence_lengths
+        df["tetss2_probability"] = probabilities
+        df["tetss2_prediction"] = predictions
+        df["tetss2_threshold"] = threshold
+        df.to_csv(output_path, sep=sep, index=False)

tetss2-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,375 @@
+Metadata-Version: 2.4
+Name: tetss2
+Version: 0.1.0
+Summary: TETSS2.0: a PyTorch model for predicting TE-TSS activity from DNA sequences.
+Author-email: Moriyaa Cui <2311459@tongji.edu.cn>
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: numpy>=1.21
+Requires-Dist: pandas>=1.3
+Requires-Dist: torch>=1.10
+# TETSS2.0
+**TETSS2.0** is a deep learning model for predicting TE-derived transcription start site (TE-TSS) activity from DNA sequences.
+The Python package name and command-line tool name are **`tetss2`**. The model name shown in documents, figures, and the website is **TETSS2.0**.
+---
+## Overview
+TETSS2.0 is a PyTorch-based convolutional neural network classifier designed to predict whether an input DNA sequence is associated with TE-TSS activity.
+The model takes a DNA sequence as input and returns:
+* a prediction probability
+* a binary prediction label
+* the classification threshold used for prediction
+By default, TETSS2.0 uses a threshold of `0.5`.
+---
+## Input requirement
+TETSS2.0 expects DNA sequences of exactly **201 bp**.
+Allowed bases:
+```text
+A, C, G, T, N
+```
+Notes:
+* Input sequences are automatically converted to uppercase.
+* `N` is allowed but is encoded as an all-zero position in the one-hot representation.
+* Sequences shorter or longer than 201 bp are rejected by default.
+* The option `--no-length-check` is available only for debugging and is not recommended for normal prediction.
+---
+## Output
+For each input sequence, TETSS2.0 outputs:
+| Column                   | Description                              |
+| ------------------------ | ---------------------------------------- |
+| `tetss2_sequence_length` | Length of the input sequence             |
+| `tetss2_probability`     | Predicted probability score              |
+| `tetss2_prediction`      | Binary prediction result, `0` or `1`     |
+| `tetss2_threshold`       | Threshold used for binary classification |
+---
+## Installation
+### Option 1: Install from a local source directory
+If you have downloaded or cloned this package locally, enter the package directory and install it with:
+```bash
+cd tetss_rampage_package
+pip install -e .
+```
+After installation, check whether the command-line tool is available:
+```bash
+tetss2 --help
+```
+### Option 2: Recommended conda environment
+We recommend creating a clean conda environment before installation:
+```bash
+conda create -n tetss2 python=3.9
+conda activate tetss2
+conda install numpy pandas scikit-learn
+pip install torch==1.10.2
+pip install -e .
+```
+A future public release may support:
+```bash
+pip install tetss2
+```
+---
+## Command-line usage
+### 1. Predict a single sequence
+```bash
+tetss2 predict --sequence ACGTACGTACGTACGT --no-length-check
+```
+For normal use, the input sequence should be exactly 201 bp:
+```bash
+tetss2 predict --sequence YOUR_201BP_DNA_SEQUENCE
+```
+Example output:
+```json
+{
+  "model": "TETSS2.0",
+  "sequence": "ACGTACGTACGTACGT",
+  "sequence_length": 16,
+  "probability": 0.35490313172340393,
+  "prediction": 0,
+  "threshold": 0.5
+}
+```
+Note: the example above uses a short sequence only to demonstrate the command format. A sequence that is not exactly 201 bp is rejected unless `--no-length-check` is passed. For biological prediction, please use a 201 bp sequence.
+---
+### 2. Batch prediction from a TSV file
+Prepare an input file containing a `sequence` column.
+Example input file: `input.tsv`
+```tsv
+sample_id	sequence
+sample1	ACGT...
+sample2	TTTT...
+sample3	GCGC...
+```
+Run batch prediction:
+```bash
+tetss2 predict-file \
+  --input input.tsv \
+  --output tetss2_predictions.tsv
+```
+The output file will contain the original columns plus TETSS2.0 prediction results.
+Example output:
+```tsv
+sample_id	sequence	tetss2_sequence_length	tetss2_probability	tetss2_prediction	tetss2_threshold
+sample1	ACGT...	201	0.3549	0	0.5
+sample2	TTTT...	201	0.8123	1	0.5
+sample3	GCGC...	201	0.4471	0	0.5
+```
+---
+### 3. Batch prediction from a CSV file
+If your input file is comma-separated, use `--sep ","`:
+```bash
+tetss2 predict-file \
+  --input input.csv \
+  --output tetss2_predictions.csv \
+  --sep ","
+```
+---
+### 4. Use a custom sequence column name
+If the sequence column is not named `sequence`, specify it with `--sequence-column`.
+For example, if the input file contains a column named `dna`:
+```bash
+tetss2 predict-file \
+  --input input.tsv \
+  --output tetss2_predictions.tsv \
+  --sequence-column dna
+```
+---
+## Python API usage
+TETSS2.0 can also be used directly in Python.
+```python
+from tetss2 import TETSS2Predictor
+predictor = TETSS2Predictor()
+result = predictor.predict("YOUR_201BP_DNA_SEQUENCE")
+print(result)
+```
+Example output:
+```python
+{
+    "model": "TETSS2.0",
+    "sequence": "YOUR_201BP_DNA_SEQUENCE",
+    "sequence_length": 201,
+    "probability": 0.73,
+    "prediction": 1,
+    "threshold": 0.5,
+}
+```
+---
+## Model architecture
+TETSS2.0 uses a one-dimensional convolutional neural network for DNA sequence classification.
+The model contains:
+* one-hot encoding of DNA sequences
+* multiple 1D convolutional layers
+* batch normalization
+* ReLU activation
+* max pooling
+* adaptive max pooling
+* fully connected classification layers
+The model outputs a single logit, which is converted to a probability using the sigmoid function.
+---
+## Model files
+The package includes the trained model weight file:
+```text
+best_model.pth
+```
+The original training output directory also contains:
+```text
+best_model.pth
+run_config.json
+train_history.tsv
+final_val_metrics.json
+split_summary.json
+train_split.tsv
+val_split.tsv
+```
+These files can be used to document model configuration, validation performance, and data splitting information.
+---
+## Validation performance
+Please fill in the following values using `final_val_metrics.json`:
+| Metric          | Value |
+| --------------- | ----: |
+| AUROC           |  TODO |
+| AUPRC           |  TODO |
+| Accuracy        |  TODO |
+| Precision       |  TODO |
+| Recall          |  TODO |
+| True negatives  |  TODO |
+| False positives |  TODO |
+| False negatives |  TODO |
+| True positives  |  TODO |
+To view the metrics file:
+```bash
+cat final_val_metrics.json
+```
+---
+## Troubleshooting
+### MKL threading error
+If you see an error similar to:
+```text
+mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1
+```
+try running:
+```bash
+export MKL_THREADING_LAYER=GNU
+```
+Then run the prediction command again.
+---
+### Cannot find sequence column
+If you see an error like:
+```text
+Cannot find sequence column 'sequence'
+```
+check that your input file really uses the separator given by `--sep` (a real tab by default; pass `--sep ","` for comma-separated files).
+For a TSV file, you can check tabs with:
+```bash
+cat -A input.tsv
+```
+A real tab will appear as:
+```text
+^I
+```
+A correct TSV file should look like:
+```text
+sample_id^Isequence$
+sample1^IACGT...$
+```
+---
+## Citation
+If you use TETSS2 in your research, please cite:
+```bibtex
+@software{tetss2,
+  title={TETSS2: Deep Learning Model for TE-TSS Prediction},
+  year={2026}
+}
+```
+---
+## Contact
+For questions or issues, please contact the developer or open an issue in the project repository.
+---
+## License
+License information will be added later.
+## Repository
+https://github.com/MoriyaaCui/TETSS2.git
+## Live Demo
+A Gradio demo is available:
+```bash
+python demo/app.py
+```

tetss2-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+tetss2/__init__.py,sha256=wUcpvTkelXY7ojAkjJEPFtsrXsMCj62yF3p1No1AbS4,69
+tetss2/cli.py,sha256=V-DwHpLYpMwze_J1nvRTqZlAzfdpxzTaOEHjepBL7e0,2797
+tetss2/model.py,sha256=2xk9li3HeqyYISGtM48vQ6iqdNWZr-89GvTR0La_ItY,964
+tetss2/predictor.py,sha256=HepcNNtg0ByVAuv9hsdN4JwB6zFBULPd2BAxZ1jvXlY,3905
+tetss2/assets/best_model.pth,sha256=8hgt7gTBuEW4OJbZrsW80ImSSjcmKwFOwf5rDB0ppRc,2560965
+tetss2-0.1.0.dist-info/METADATA,sha256=GKBdNlmMabgpKyqkO_ljmsrgfrcx35JnQB3Gpc7_grA,7242
+tetss2-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+tetss2-0.1.0.dist-info/entry_points.txt,sha256=AihRYcxX3CsK_d5rM870SAfq-EnA1ysRbpNXjqW2KMc,43
+tetss2-0.1.0.dist-info/top_level.txt,sha256=XbiPCKEihTZkQbisfvwjJb7-SHteuzSyyVH3MHjCFAI,7
+tetss2-0.1.0.dist-info/RECORD,,

tetss2-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

tetss2-0.1.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+tetss2 = tetss2.cli:main

tetss2-0.1.0.dist-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+tetss2