tetss2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tetss2/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .predictor import TETSS2Predictor
2
+
3
+ __all__ = ["TETSS2Predictor"]
Binary file
tetss2/cli.py ADDED
@@ -0,0 +1,108 @@
1
+ import os
2
+
3
+ os.environ["MKL_THREADING_LAYER"] = "GNU"
4
+
5
+ import argparse
6
+ import json
7
+
8
+ from .predictor import TETSS2Predictor
9
+
10
+
11
def _threshold(text):
    """Parse a ``--threshold`` value and require it to lie within [0, 1].

    The threshold is compared against a probability, so a value outside that
    interval (or NaN) would silently force every prediction to the same
    label. Rejecting it at parse time surfaces the mistake immediately.

    Raises:
        argparse.ArgumentTypeError: If ``text`` is not a float in [0, 1].
    """
    try:
        value = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid float value: {text!r}"
        ) from None
    # The chained comparison is False for NaN, so NaN is rejected as well.
    if not 0.0 <= value <= 1.0:
        raise argparse.ArgumentTypeError(
            f"threshold must be between 0 and 1, got {text!r}"
        )
    return value


def _build_parser():
    """Create the ``tetss2`` argument parser with its two sub-commands."""
    parser = argparse.ArgumentParser(
        description="TETSS2.0: predict TE-TSS activity from 201 bp DNA sequence."
    )

    subparsers = parser.add_subparsers(dest="command")

    # --- "predict": score a single sequence given on the command line ---
    predict_parser = subparsers.add_parser(
        "predict",
        help="Predict one DNA sequence."
    )
    predict_parser.add_argument(
        "--sequence",
        required=True,
        help="Input DNA sequence. TETSS2.0 expects exactly 201 bp."
    )
    predict_parser.add_argument(
        "--threshold",
        type=_threshold,
        default=0.5,
        help="Classification threshold. Default: 0.5"
    )
    predict_parser.add_argument(
        "--no-length-check",
        action="store_true",
        help="Disable 201 bp length check. Mainly for debugging."
    )

    # --- "predict-file": score every row of a delimited text file ---
    file_parser = subparsers.add_parser(
        "predict-file",
        help="Predict sequences from a TSV or CSV file."
    )
    file_parser.add_argument(
        "--input",
        required=True,
        help="Input file path. The file should contain a sequence column."
    )
    file_parser.add_argument(
        "--output",
        required=True,
        help="Output file path."
    )
    file_parser.add_argument(
        "--sequence-column",
        default="sequence",
        help="Column name containing DNA sequences. Default: sequence"
    )
    file_parser.add_argument(
        "--threshold",
        type=_threshold,
        default=0.5,
        help="Classification threshold. Default: 0.5"
    )
    file_parser.add_argument(
        "--sep",
        default="\t",
        help="File separator. Default: tab. Use ',' for CSV."
    )
    file_parser.add_argument(
        "--no-length-check",
        action="store_true",
        help="Disable 201 bp length check. Mainly for debugging."
    )

    return parser


def main(argv=None):
    """Run the ``tetss2`` command-line interface.

    Args:
        argv: Optional list of argument strings. ``None`` (the default, used
            by the console-script entry point) makes argparse read
            ``sys.argv[1:]``.

    With no sub-command the help text is printed. ``predict`` prints the
    prediction as indented JSON; ``predict-file`` writes the output file and
    prints its path.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command == "predict":
        predictor = TETSS2Predictor()
        result = predictor.predict(
            args.sequence,
            threshold=args.threshold,
            check_length=not args.no_length_check,
        )
        print(json.dumps(result, indent=2))

    elif args.command == "predict-file":
        # A shell passes --sep "\t" as the two characters backslash + "t";
        # translate that spelling into a real tab.
        sep = "\t" if args.sep == "\\t" else args.sep

        predictor = TETSS2Predictor()
        predictor.predict_file(
            input_path=args.input,
            output_path=args.output,
            sequence_column=args.sequence_column,
            threshold=args.threshold,
            sep=sep,
            check_length=not args.no_length_check,
        )

        print(f"Prediction finished. Results saved to: {args.output}")

    else:
        parser.print_help()
105
+
106
+
107
+ if __name__ == "__main__":
108
+ main()
tetss2/model.py ADDED
@@ -0,0 +1,27 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class TETSSClassifier(nn.Module):
    """1D convolutional classifier that emits one logit per input.

    ``features`` is four Conv1d -> BatchNorm1d -> ReLU -> pooling stages; the
    first three pool with ``MaxPool1d(2)`` and the last collapses the length
    axis with ``AdaptiveMaxPool1d(1)``. ``classifier`` flattens that result
    and maps 256 -> 128 -> 1.

    The modules are added to each ``nn.Sequential`` in a fixed order so the
    positional sub-module names (``features.0`` ... ``features.15``,
    ``classifier.0`` ... ``classifier.4``) stay stable for saved state dicts.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, kernel_size) of each convolution stage.
        conv_stages = (
            (4, 64, 15),
            (64, 128, 15),
            (128, 128, 15),
            (128, 256, 7),
        )
        final_stage = len(conv_stages) - 1

        layers = []
        for stage, (n_in, n_out, width) in enumerate(conv_stages):
            # padding = width // 2 gives 7 for width 15 and 3 for width 7.
            layers.append(
                nn.Conv1d(n_in, n_out, kernel_size=width, padding=width // 2)
            )
            layers.append(nn.BatchNorm1d(n_out))
            layers.append(nn.ReLU())
            if stage == final_stage:
                layers.append(nn.AdaptiveMaxPool1d(1))
            else:
                layers.append(nn.MaxPool1d(2))
        self.features = nn.Sequential(*layers)

        head = [
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the logits for ``x`` with dimension 1 squeezed away.

        The first convolution takes 4 input channels; ``x`` is presumably
        shaped (batch, 4, length) -- confirm against callers.
        """
        pooled = self.features(x)
        logits = self.classifier(pooled)
        return logits.squeeze(1)
tetss2/predictor.py ADDED
@@ -0,0 +1,139 @@
1
+ import os
2
+
3
+ os.environ["MKL_THREADING_LAYER"] = "GNU"
4
+
5
+ import torch
6
+ import numpy as np
7
+ import pandas as pd
8
+ from importlib.resources import files
9
+
10
+ from .model import TETSSClassifier
11
+
12
+
13
+ EXPECTED_SEQUENCE_LENGTH = 201
14
+
15
+
16
def clean_sequence(seq: str) -> str:
    """Normalise a raw sequence value.

    The value is coerced to ``str``, surrounding whitespace is trimmed, and
    the result is upper-cased.
    """
    text = str(seq)
    trimmed = text.strip()
    return trimmed.upper()
19
+
20
+
21
def validate_sequence(seq: str, expected_length: int = EXPECTED_SEQUENCE_LENGTH) -> None:
    """Check the alphabet and the length of ``seq``.

    The alphabet is checked first, so a sequence that is wrong on both counts
    reports its invalid bases.

    Args:
        seq: Sequence to check; compared case-sensitively against ``ACGTN``.
        expected_length: Exact number of characters ``seq`` must have.

    Raises:
        ValueError: If ``seq`` contains characters other than A, C, G, T, N,
            or if its length differs from ``expected_length``.
    """
    unexpected = sorted(set(seq).difference("ACGTN"))
    if unexpected:
        message = (
            f"Invalid bases found: {unexpected}. "
            "Only A, C, G, T, and N are allowed."
        )
        raise ValueError(message)

    actual_length = len(seq)
    if actual_length == expected_length:
        return

    raise ValueError(
        f"Invalid sequence length: {actual_length} bp. "
        f"TETSS2.0 expects exactly {expected_length} bp."
    )
36
+
37
+
38
def seq_to_onehot(seq: str) -> np.ndarray:
    """One-hot encode ``seq`` as a float32 array with 4 rows.

    Rows 0-3 correspond to A, C, G and T; the array has ``len(seq)`` columns.
    Matching is done on the upper-cased sequence, and any other character
    (for example N) leaves its column all zeros.
    """
    channel_of = {"A": 0, "C": 1, "G": 2, "T": 3}
    onehot = np.zeros((4, len(seq)), dtype=np.float32)

    # Collect the (row, column) coordinates of every recognised base.
    hits = [
        (channel_of[base], position)
        for position, base in enumerate(seq.upper())
        if base in channel_of
    ]
    if hits:
        rows, cols = zip(*hits)
        onehot[list(rows), list(cols)] = 1.0

    return onehot
48
+
49
+
50
class TETSS2Predictor:
    """Load a ``TETSSClassifier`` checkpoint and score DNA sequences with it."""

    def __init__(self, model_path=None, device=None):
        """Build the network, load its weights and switch to eval mode.

        Args:
            model_path: Checkpoint to load. ``None`` selects the
                ``assets/best_model.pth`` resource of the ``tetss2`` package.
            device: Torch device to run on. A falsy value selects ``"cuda"``
                when ``torch.cuda.is_available()``, otherwise ``"cpu"``.
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        if model_path is None:
            model_path = files("tetss2").joinpath("assets/best_model.pth")

        self.model = TETSSClassifier().to(self.device)
        # NOTE(security): depending on the installed torch version,
        # torch.load may unpickle arbitrary objects from the checkpoint.
        # Only pass a ``model_path`` that comes from a trusted source.
        state_dict = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def predict_proba(self, sequence: str, check_length: bool = True) -> float:
        """Return the sigmoid of the model's logit for ``sequence``.

        Args:
            sequence: DNA sequence; normalised with ``clean_sequence`` first.
            check_length: When true, the sequence must be exactly
                ``EXPECTED_SEQUENCE_LENGTH`` characters long. When false only
                the length requirement is waived; the base alphabet is still
                validated.

        Raises:
            ValueError: If the sequence is empty, contains characters that
                ``validate_sequence`` rejects, or (with ``check_length``) has
                the wrong length.
        """
        sequence = clean_sequence(sequence)

        # Passing the sequence's own length makes the length test a no-op
        # while keeping the alphabet check, so disabling the length check no
        # longer lets arbitrary characters through as all-zero columns.
        expected_length = EXPECTED_SEQUENCE_LENGTH if check_length else len(sequence)
        validate_sequence(sequence, expected_length=expected_length)

        # Only reachable with check_length=False; a zero-length input would
        # otherwise fail inside the network with an unrelated torch error.
        if not sequence:
            raise ValueError("Empty sequence: nothing to predict.")

        x = seq_to_onehot(sequence)
        # unsqueeze(0) adds the leading batch axis for a single sequence.
        x = torch.from_numpy(x).unsqueeze(0).to(self.device)

        with torch.no_grad():
            logits = self.model(x)
            prob = torch.sigmoid(logits).item()

        return float(prob)

    def predict(
        self,
        sequence: str,
        threshold: float = 0.5,
        check_length: bool = True,
    ) -> dict:
        """Score one sequence and return a result dictionary.

        Args:
            sequence: DNA sequence; normalised with ``clean_sequence`` first.
            threshold: The label is 1 when the probability is strictly
                greater than this value, otherwise 0.
            check_length: Forwarded to ``predict_proba``.

        Returns:
            A dict with the keys ``model``, ``sequence`` (normalised),
            ``sequence_length``, ``probability``, ``prediction`` and
            ``threshold``.

        Raises:
            ValueError: Propagated from ``predict_proba``.
        """
        sequence = clean_sequence(sequence)

        prob = self.predict_proba(sequence, check_length=check_length)
        label = int(prob > threshold)

        return {
            "model": "TETSS2.0",
            "sequence": sequence,
            "sequence_length": len(sequence),
            "probability": prob,
            "prediction": label,
            "threshold": threshold,
        }

    def predict_file(
        self,
        input_path: str,
        output_path: str,
        sequence_column: str = "sequence",
        threshold: float = 0.5,
        sep: str = "\t",
        check_length: bool = True,
    ) -> None:
        """Score every row of a delimited file and write an annotated copy.

        The output keeps the input columns and adds
        ``tetss2_sequence_length``, ``tetss2_probability``,
        ``tetss2_prediction`` and ``tetss2_threshold``. Every row is scored
        before anything is written, so a failing row produces no output file.

        Args:
            input_path: File read with ``pandas.read_csv``.
            output_path: Destination written with ``DataFrame.to_csv``.
            sequence_column: Name of the column holding the sequences.
            threshold: The label is 1 when the probability is strictly
                greater than this value, otherwise 0.
            sep: Field separator used for both reading and writing.
            check_length: Forwarded to ``predict_proba``.

        Raises:
            ValueError: If ``sequence_column`` is absent, or if a row has a
                missing or invalid sequence. Row numbers in the message are
                1-based and count data rows.
        """
        df = pd.read_csv(input_path, sep=sep)

        if sequence_column not in df.columns:
            raise ValueError(
                f"Cannot find sequence column '{sequence_column}' in input file. "
                f"Available columns are: {list(df.columns)}"
            )

        probabilities = []
        predictions = []
        sequence_lengths = []

        for idx, raw in enumerate(df[sequence_column], start=1):
            # An empty cell is read as NaN, and str(NaN) upper-cases to
            # "NAN", which consists solely of allowed bases. Report it as
            # missing instead of scoring or mis-describing it.
            if pd.isna(raw):
                raise ValueError(f"Error in row {idx}: missing sequence value.")

            seq = clean_sequence(raw)

            try:
                prob = self.predict_proba(seq, check_length=check_length)
            except ValueError as e:
                # Chain the cause so the original traceback is preserved.
                raise ValueError(
                    f"Error in row {idx}: {e}"
                ) from e

            pred = int(prob > threshold)

            probabilities.append(prob)
            predictions.append(pred)
            sequence_lengths.append(len(seq))

        df["tetss2_sequence_length"] = sequence_lengths
        df["tetss2_probability"] = probabilities
        df["tetss2_prediction"] = predictions
        df["tetss2_threshold"] = threshold

        df.to_csv(output_path, sep=sep, index=False)
@@ -0,0 +1,375 @@
1
+ Metadata-Version: 2.4
2
+ Name: tetss2
3
+ Version: 0.1.0
4
+ Summary: TETSS2.0: a PyTorch model for predicting TE-TSS activity from DNA sequences.
5
+ Author-email: Moriyaa Cui <2311459@tongji.edu.cn>
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy>=1.21
9
+ Requires-Dist: pandas>=1.3
10
+ Requires-Dist: torch>=1.10
11
+
12
+ # TETSS2.0
13
+
14
+ **TETSS2.0** is a deep learning model for predicting TE-derived transcription start site (TE-TSS) activity from DNA sequences.
15
+
16
+ The Python package name and command-line tool name are **`tetss2`**. The model name shown in documents, figures, and the website is **TETSS2.0**.
17
+
18
+ ---
19
+
20
+ ## Overview
21
+
22
+ TETSS2.0 is a PyTorch-based convolutional neural network classifier designed to predict whether an input DNA sequence is associated with TE-TSS activity.
23
+
24
+ The model takes a DNA sequence as input and returns:
25
+
26
+ * a prediction probability
27
+ * a binary prediction label
28
+ * the classification threshold used for prediction
29
+
30
+ By default, TETSS2.0 uses a threshold of `0.5`.
31
+
32
+ ---
33
+
34
+ ## Input requirement
35
+
36
+ TETSS2.0 expects DNA sequences of exactly **201 bp**.
37
+
38
+ Allowed bases:
39
+
40
+ ```text
41
+ A, C, G, T, N
42
+ ```
43
+
44
+ Notes:
45
+
46
+ * Input sequences are automatically converted to uppercase.
47
+ * `N` is allowed but is encoded as an all-zero position in the one-hot representation.
48
+ * Sequences shorter or longer than 201 bp are rejected by default.
49
+ * The option `--no-length-check` is available only for debugging and is not recommended for normal prediction.
50
+
51
+ ---
52
+
53
+ ## Output
54
+
55
+ For each input sequence, TETSS2.0 outputs:
56
+
57
+ | Column | Description |
58
+ | ------------------------ | ---------------------------------------- |
59
+ | `tetss2_sequence_length` | Length of the input sequence |
60
+ | `tetss2_probability` | Predicted probability score |
61
+ | `tetss2_prediction` | Binary prediction result, `0` or `1` |
62
+ | `tetss2_threshold` | Threshold used for binary classification |
63
+
64
+ ---
65
+
66
+ ## Installation
67
+
68
+ ### Option 1: Install from a local source directory
69
+
70
+ If you have downloaded or cloned this package locally, enter the package directory and install it with:
71
+
72
+ ```bash
73
+ cd tetss_rampage_package
74
+ pip install -e .
75
+ ```
76
+
77
+ After installation, check whether the command-line tool is available:
78
+
79
+ ```bash
80
+ tetss2 --help
81
+ ```
82
+
83
+ ### Option 2: Recommended conda environment
84
+
85
+ We recommend creating a clean conda environment before installation:
86
+
87
+ ```bash
88
+
89
+ conda create -n tetss2 python=3.9
90
+ conda activate tetss2
91
+ conda install numpy pandas scikit-learn
92
+ pip install torch==1.10.2
93
+
94
+ pip install -e .
95
+
96
+ ```
97
+
98
+ A future public release may support:
99
+
100
+ ```bash
101
+ pip install tetss2
102
+ ```
103
+
104
+ ---
105
+
106
+ ## Command-line usage
107
+
108
+ ### 1. Predict a single sequence
109
+
110
+ ```bash
111
+ tetss2 predict --sequence ACGTACGTACGTACGT --no-length-check
112
+ ```
113
+
114
+ For normal use, the input sequence should be exactly 201 bp:
115
+
116
+ ```bash
117
+ tetss2 predict --sequence YOUR_201BP_DNA_SEQUENCE
118
+ ```
119
+
120
+ Example output:
121
+
122
+ ```json
123
+ {
124
+ "model": "TETSS2.0",
125
+ "sequence": "ACGTACGTACGTACGT",
126
+ "sequence_length": 16,
127
+ "probability": 0.35490313172340393,
128
+ "prediction": 0,
129
+ "threshold": 0.5
130
+ }
131
+ ```
132
+
133
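+ Note: the example above uses a short sequence only to demonstrate the command format; because sequences that are not exactly 201 bp are rejected by default, it runs only when `--no-length-check` is added. For biological prediction, please use a 201 bp sequence.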
134
+
135
+ ---
136
+
137
+ ### 2. Batch prediction from a TSV file
138
+
139
+ Prepare an input file containing a `sequence` column.
140
+
141
+ Example input file: `input.tsv`
142
+
143
+ ```tsv
144
+ sample_id sequence
145
+ sample1 ACGT...
146
+ sample2 TTTT...
147
+ sample3 GCGC...
148
+ ```
149
+
150
+ Run batch prediction:
151
+
152
+ ```bash
153
+ tetss2 predict-file \
154
+ --input input.tsv \
155
+ --output tetss2_predictions.tsv
156
+ ```
157
+
158
+ The output file will contain the original columns plus TETSS2.0 prediction results.
159
+
160
+ Example output:
161
+
162
+ ```tsv
163
+ sample_id sequence tetss2_sequence_length tetss2_probability tetss2_prediction tetss2_threshold
164
+ sample1 ACGT... 201 0.3549 0 0.5
165
+ sample2 TTTT... 201 0.8123 1 0.5
166
+ sample3 GCGC... 201 0.4471 0 0.5
167
+ ```
168
+
169
+ ---
170
+
171
+ ### 3. Batch prediction from a CSV file
172
+
173
+ If your input file is comma-separated, use `--sep ","`:
174
+
175
+ ```bash
176
+ tetss2 predict-file \
177
+ --input input.csv \
178
+ --output tetss2_predictions.csv \
179
+ --sep ","
180
+ ```
181
+
182
+ ---
183
+
184
+ ### 4. Use a custom sequence column name
185
+
186
+ If the sequence column is not named `sequence`, specify it with `--sequence-column`.
187
+
188
+ For example, if the input file contains a column named `dna`:
189
+
190
+ ```bash
191
+ tetss2 predict-file \
192
+ --input input.tsv \
193
+ --output tetss2_predictions.tsv \
194
+ --sequence-column dna
195
+ ```
196
+
197
+ ---
198
+
199
+ ## Python API usage
200
+
201
+ TETSS2.0 can also be used directly in Python.
202
+
203
+ ```python
204
+ from tetss2 import TETSS2Predictor
205
+
206
+ predictor = TETSS2Predictor()
207
+
208
+ result = predictor.predict("YOUR_201BP_DNA_SEQUENCE")
209
+ print(result)
210
+ ```
211
+
212
+ Example output:
213
+
214
+ ```python
215
+ {
216
+ "model": "TETSS2.0",
217
+ "sequence": "YOUR_201BP_DNA_SEQUENCE",
218
+ "sequence_length": 201,
219
+ "probability": 0.73,
220
+ "prediction": 1,
221
+ "threshold": 0.5,
222
+ }
223
+ ```
224
+
225
+ ---
226
+
227
+ ## Model architecture
228
+
229
+ TETSS2.0 uses a one-dimensional convolutional neural network for DNA sequence classification.
230
+
231
+ The model contains:
232
+
233
+ * one-hot encoding of DNA sequences
234
+ * multiple 1D convolutional layers
235
+ * batch normalization
236
+ * ReLU activation
237
+ * max pooling
238
+ * adaptive max pooling
239
+ * fully connected classification layers
240
+
241
+ The model outputs a single logit, which is converted to a probability using the sigmoid function.
242
+
243
+ ---
244
+
245
+ ## Model files
246
+
247
+ The package includes the trained model weight file:
248
+
249
+ ```text
250
+ best_model.pth
251
+ ```
252
+
253
+ The original training output directory also contains:
254
+
255
+ ```text
256
+ best_model.pth
257
+ run_config.json
258
+ train_history.tsv
259
+ final_val_metrics.json
260
+ split_summary.json
261
+ train_split.tsv
262
+ val_split.tsv
263
+ ```
264
+
265
+ These files can be used to document model configuration, validation performance, and data splitting information.
266
+
267
+ ---
268
+
269
+ ## Validation performance
270
+
271
+ Please fill in the following values using `final_val_metrics.json`:
272
+
273
+ | Metric | Value |
274
+ | --------------- | ----: |
275
+ | AUROC | TODO |
276
+ | AUPRC | TODO |
277
+ | Accuracy | TODO |
278
+ | Precision | TODO |
279
+ | Recall | TODO |
280
+ | True negatives | TODO |
281
+ | False positives | TODO |
282
+ | False negatives | TODO |
283
+ | True positives | TODO |
284
+
285
+ To view the metrics file:
286
+
287
+ ```bash
288
+ cat final_val_metrics.json
289
+ ```
290
+
291
+ ---
292
+
293
+ ## Troubleshooting
294
+
295
+ ### MKL threading error
296
+
297
+ If you see an error similar to:
298
+
299
+ ```text
300
+ mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1
301
+ ```
302
+
303
+ try running:
304
+
305
+ ```bash
306
+ export MKL_THREADING_LAYER=GNU
307
+ ```
308
+
309
+ Then run the prediction command again.
310
+
311
+ ---
312
+
313
+ ### Cannot find sequence column
314
+
315
+ If you see an error like:
316
+
317
+ ```text
318
+ Cannot find sequence column 'sequence'
319
+ ```
320
+
321
+ check whether your input file contains a real tab or comma separator.
322
+
323
+ For a TSV file, you can check tabs with:
324
+
325
+ ```bash
326
+ cat -A input.tsv
327
+ ```
328
+
329
+ A real tab will appear as:
330
+
331
+ ```text
332
+ ^I
333
+ ```
334
+
335
+ A correct TSV file should look like:
336
+
337
+ ```text
338
+ sample_id^Isequence$
339
+ sample1^IACGT...$
340
+ ```
341
+
342
+ ---
343
+
344
+ ## Citation
345
+
346
+ If you use TETSS2.0 in your research, please cite:
347
+
348
+ @software{tetss2,
349
+ title={TETSS2: Deep Learning Model for TE-TSS Prediction},
350
+ year={2026}
351
+ }
352
+
353
+ ---
354
+
355
+ ## Contact
356
+
357
+ For questions or issues, please contact the developer or open an issue in the project repository.
358
+
359
+ ---
360
+
361
+ ## License
362
+
363
+ License information will be added later.
364
+
365
+ ## Repository
366
+
367
+ https://github.com/MoriyaaCui/TETSS2.git
368
+
369
+
370
+ ## Live Demo
371
+
372
+ A Gradio demo is available:
373
+
374
+ ```bash
375
+ python demo/app.py
+ ```
@@ -0,0 +1,10 @@
1
+ tetss2/__init__.py,sha256=wUcpvTkelXY7ojAkjJEPFtsrXsMCj62yF3p1No1AbS4,69
2
+ tetss2/cli.py,sha256=V-DwHpLYpMwze_J1nvRTqZlAzfdpxzTaOEHjepBL7e0,2797
3
+ tetss2/model.py,sha256=2xk9li3HeqyYISGtM48vQ6iqdNWZr-89GvTR0La_ItY,964
4
+ tetss2/predictor.py,sha256=HepcNNtg0ByVAuv9hsdN4JwB6zFBULPd2BAxZ1jvXlY,3905
5
+ tetss2/assets/best_model.pth,sha256=8hgt7gTBuEW4OJbZrsW80ImSSjcmKwFOwf5rDB0ppRc,2560965
6
+ tetss2-0.1.0.dist-info/METADATA,sha256=GKBdNlmMabgpKyqkO_ljmsrgfrcx35JnQB3Gpc7_grA,7242
7
+ tetss2-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ tetss2-0.1.0.dist-info/entry_points.txt,sha256=AihRYcxX3CsK_d5rM870SAfq-EnA1ysRbpNXjqW2KMc,43
9
+ tetss2-0.1.0.dist-info/top_level.txt,sha256=XbiPCKEihTZkQbisfvwjJb7-SHteuzSyyVH3MHjCFAI,7
10
+ tetss2-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ tetss2 = tetss2.cli:main
@@ -0,0 +1 @@
1
+ tetss2