genbenchQC 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genbenchqc-1.0.0/LICENSE +21 -0
- genbenchqc-1.0.0/PKG-INFO +25 -0
- genbenchqc-1.0.0/README.md +147 -0
- genbenchqc-1.0.0/setup.cfg +4 -0
- genbenchqc-1.0.0/setup.py +40 -0
- genbenchqc-1.0.0/src/genbenchQC/__init__.py +0 -0
- genbenchqc-1.0.0/src/genbenchQC/evaluate_dataset.py +287 -0
- genbenchqc-1.0.0/src/genbenchQC/evaluate_sequences.py +147 -0
- genbenchqc-1.0.0/src/genbenchQC/evaluate_split.py +185 -0
- genbenchqc-1.0.0/src/genbenchQC/report/__init__.py +0 -0
- genbenchqc-1.0.0/src/genbenchQC/report/dataset_html_report.py +370 -0
- genbenchqc-1.0.0/src/genbenchQC/report/dataset_plots.py +331 -0
- genbenchqc-1.0.0/src/genbenchQC/report/report_generator.py +240 -0
- genbenchqc-1.0.0/src/genbenchQC/report/sequence_html_report.py +328 -0
- genbenchqc-1.0.0/src/genbenchQC/report/sequences_plots.py +166 -0
- genbenchqc-1.0.0/src/genbenchQC/report/split_html_report.py +152 -0
- genbenchqc-1.0.0/src/genbenchQC/utils/__init__.py +0 -0
- genbenchqc-1.0.0/src/genbenchQC/utils/input_utils.py +100 -0
- genbenchqc-1.0.0/src/genbenchQC/utils/statistics.py +159 -0
- genbenchqc-1.0.0/src/genbenchQC/utils/testing.py +144 -0
- genbenchqc-1.0.0/src/genbenchQC.egg-info/PKG-INFO +25 -0
- genbenchqc-1.0.0/src/genbenchQC.egg-info/SOURCES.txt +24 -0
- genbenchqc-1.0.0/src/genbenchQC.egg-info/dependency_links.txt +1 -0
- genbenchqc-1.0.0/src/genbenchQC.egg-info/entry_points.txt +4 -0
- genbenchqc-1.0.0/src/genbenchQC.egg-info/requires.txt +11 -0
- genbenchqc-1.0.0/src/genbenchQC.egg-info/top_level.txt +1 -0
genbenchqc-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Katarina Gresova
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: genbenchQC
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Genomic Benchmarks QC: Automated Quality Control for Genomic Machine Learning Datasets
|
|
5
|
+
Author: Katarina Gresova
|
|
6
|
+
Author-email: gresova11@gmail.com
|
|
7
|
+
Keywords: genomic benchmarks,deep learning,machine learning,computational biology,bioinformatics,genomics,quality control
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: numpy>=1.23
|
|
10
|
+
Requires-Dist: pandas>=1.5
|
|
11
|
+
Requires-Dist: matplotlib>=3.6
|
|
12
|
+
Requires-Dist: seaborn>=0.12
|
|
13
|
+
Requires-Dist: biopython>=1.8
|
|
14
|
+
Requires-Dist: scikit-learn>=1.2
|
|
15
|
+
Requires-Dist: cdhit-reader==0.2.0
|
|
16
|
+
Requires-Dist: statsmodels>=0.13
|
|
17
|
+
Provides-Extra: develop
|
|
18
|
+
Requires-Dist: pytest>=3; extra == "develop"
|
|
19
|
+
Dynamic: author
|
|
20
|
+
Dynamic: author-email
|
|
21
|
+
Dynamic: keywords
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
Dynamic: provides-extra
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: summary
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# Genomic Benchmarks QC: Automated Quality Control for Genomic Machine Learning Datasets
|
|
2
|
+
|
|
3
|
+
Genomic Benchmarks QC is a Python package and CLI toolkit for automated quality control of genomic datasets used in machine learning.
|
|
4
|
+
It helps detect biases, inconsistencies, and potential data leakage across sequences, dataset classes, and train-test splits — ensuring your datasets are reliable before model training.
|
|
5
|
+
|
|
6
|
+
## Features
|
|
7
|
+
|
|
8
|
+
### Provided Tools
|
|
9
|
+
- **evaluate_sequences** – QC of a single dataset or dataset subset.
|
|
10
|
+
- **evaluate_dataset** – QC across multiple dataset classes.
|
|
11
|
+
- **evaluate_split** – Train–test split leakage detection.
|
|
12
|
+
|
|
13
|
+
### General Features
|
|
14
|
+
- [**Sequence-level QC**](#evaluate-sequences) – Evaluate nucleotide composition, sequence length distribution, GC content, and more.
|
|
15
|
+
- [**Class-level QC**](#evaluate-dataset) – Compare multiple classes for feature similarity or bias.
|
|
16
|
+
- [**Train–test split validation**](#evaluate-split) – Detect potential data leakage through sequence similarity and clustering.
|
|
17
|
+
- [**Multiple input formats**](#supported-input-file-formats) – Supports FASTA, CSV, and TSV datasets.
|
|
18
|
+
- **Customizable reporting** – Generate JSON, HTML, or simple text summaries.
|
|
19
|
+
- **Integration-ready** – Available as both CLI tools and a Python API.
|
|
20
|
+
- **Flexible sequence handling** – Works with single or multiple sequence columns.
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
Install Genomic Benchmarks QC using pip:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install genbenchQC
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
If you plan to use `evaluate_split`, install [cd-hit](https://www.bioinformatics.org/cd-hit/cd-hit-user-guide):
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
conda install -c bioconda cd-hit
|
|
34
|
+
# or follow: https://github.com/weizhongli/cdhit/wiki/2.-Installation
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
Clone the repository to access example datasets:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
git clone https://github.com/katarinagresova/GenBenchQC.git
|
|
43
|
+
cd GenBenchQC
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Evaluate Sequences
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
evaluate_sequences \
|
|
50
|
+
--input example_datasets/G4_positives.fasta \
|
|
51
|
+
--format fasta \
|
|
52
|
+
--out_folder example_outputs/G4_dataset_positives
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
The same evaluation would be executed from Python as follows:
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from genbenchQC import evaluate_sequences
|
|
59
|
+
|
|
60
|
+
evaluate_sequences.run(
|
|
61
|
+
input='example_datasets/G4_positives.fasta',
|
|
62
|
+
format='fasta',
|
|
63
|
+
out_folder='example_outputs/G4_dataset_positives'
|
|
64
|
+
)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Outputs with their description are in [example_outputs/G4_dataset_positives](example_outputs/G4_dataset_positives).
|
|
68
|
+
|
|
69
|
+
### Evaluate Dataset
|
|
70
|
+
|
|
71
|
+
Running from CLI with fasta file:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
evaluate_dataset \
|
|
75
|
+
--input example_datasets/G4_positives.fasta example_datasets/G4_negatives.fasta \
|
|
76
|
+
--format fasta \
|
|
77
|
+
--out_folder example_outputs/G4_dataset
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Outputs with their description are in [example_outputs/G4_dataset](example_outputs/G4_dataset).
|
|
81
|
+
|
|
82
|
+
Running from Python with CSV file with multiple sequence columns:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from genbenchQC import evaluate_dataset
|
|
86
|
+
|
|
87
|
+
evaluate_dataset.run(
|
|
88
|
+
input=['example_datasets/miRNA_mRNA_pairs_dataset.tsv'],
|
|
89
|
+
format='tsv',
|
|
90
|
+
out_folder='example_outputs/miRNA_mRNA_dataset',
|
|
91
|
+
sequence_column=['gene', 'noncodingRNA'],
|
|
92
|
+
label_column='label',
|
|
93
|
+
label_list=['0', '1']
|
|
94
|
+
)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Outputs with their description are in [example_outputs/miRNA_mRNA_dataset](example_outputs/miRNA_mRNA_dataset).
|
|
98
|
+
|
|
99
|
+
### Evaluate Split
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
evaluate_split \
|
|
103
|
+
--train_input example_datasets/enhancers_train.csv \
|
|
104
|
+
--test_input example_datasets/enhancers_test.csv \
|
|
105
|
+
--format csv \
|
|
106
|
+
--sequence_column sequence \
|
|
107
|
+
--out_folder example_outputs/enhancers_dataset
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
The same evaluation would be executed from Python as follows:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from genbenchQC import evaluate_split
|
|
114
|
+
|
|
115
|
+
evaluate_split.run(
|
|
116
|
+
train_files=['example_datasets/enhancers_train.csv'],
|
|
117
|
+
test_files=['example_datasets/enhancers_test.csv'],
|
|
118
|
+
format='csv',
|
|
119
|
+
sequence_column='sequence',
|
|
120
|
+
out_folder='example_outputs/enhancers_dataset'
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Outputs with their description are in [example_outputs/enhancers_dataset](example_outputs/enhancers_dataset).
|
|
125
|
+
|
|
126
|
+
## Supported input file formats
|
|
127
|
+
|
|
128
|
+
You can choose to run the tool while having different dataset formats:
|
|
129
|
+
- **FASTA**: The input is a FASTA file / list of FASTA files. One file needs to contain sequences of one class if running *evaluate_sequences* mode.
|
|
130
|
+
- **CSV/TSV**: The input is a CSV/TSV file, and you provide the name of the column containing sequences. You can have either:
|
|
131
|
+
- **multiple files**, each one containing sequences from one class (similar as with FASTA input)
|
|
132
|
+
- **one file** containing sequences from multiple classes. In this case, when running *evaluate_sequences* mode, you need to provide the name of the column containing class labels so the tool can split the dataset into parts. The label classes can then be inferred, or you can specify their list by yourself. The dataset will then be split into pieces containing sequences with corresponding labels and analysis will be performed similarly as with multiple files.
|
|
133
|
+
- **CSV.GZ/TSV.GZ**: Functionality is the same as CSV/TSV files
|
|
134
|
+
|
|
135
|
+
When having CSV/TSV/CSV.GZ/TSV.GZ input, you can also decide to provide multiple sequence columns to analyze. In this case, the analysis in modes *evaluate_sequences* and *evaluate_dataset* will be performed for each column separately and lastly for sequences made by concatenating sequences throughout all the columns.
|
|
136
|
+
*evaluate_split* mode will run only on the concatenated sequences.
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
## Development
|
|
140
|
+
|
|
141
|
+
If you want to help with the development of Genomic Benchmarks QC, you are more than welcome to join in!
|
|
142
|
+
|
|
143
|
+
For guidance, have a look at [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
144
|
+
|
|
145
|
+
## License
|
|
146
|
+
|
|
147
|
+
Genomic Benchmarks QC is MIT-style licensed, as found in the [LICENSE](LICENSE) file.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from setuptools import find_packages, setup
|
|
2
|
+
|
|
3
|
+
requirements = [
|
|
4
|
+
'numpy>=1.23',
|
|
5
|
+
'pandas>=1.5',
|
|
6
|
+
'matplotlib>=3.6',
|
|
7
|
+
'seaborn>=0.12',
|
|
8
|
+
'biopython>=1.8',
|
|
9
|
+
'scikit-learn>=1.2',
|
|
10
|
+
'cdhit-reader==0.2.0',
|
|
11
|
+
'statsmodels>=0.13',
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
test_requirements = [
|
|
15
|
+
'pytest>=3',
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
setup(
|
|
19
|
+
name='genbenchQC',
|
|
20
|
+
version='1.0.0',
|
|
21
|
+
description='Genomic Benchmarks QC: Automated Quality Control for Genomic Machine Learning Datasets',
|
|
22
|
+
author="Katarina Gresova",
|
|
23
|
+
author_email='gresova11@gmail.com',
|
|
24
|
+
packages=find_packages("src"),
|
|
25
|
+
package_dir={"": "src"},
|
|
26
|
+
install_requires=requirements,
|
|
27
|
+
extras_require={
|
|
28
|
+
"develop": test_requirements,
|
|
29
|
+
},
|
|
30
|
+
tests_require=["pytest"],
|
|
31
|
+
test_suite='tests',
|
|
32
|
+
entry_points='''
|
|
33
|
+
[console_scripts]
|
|
34
|
+
evaluate_sequences=genbenchQC.evaluate_sequences:main
|
|
35
|
+
evaluate_dataset=genbenchQC.evaluate_dataset:main
|
|
36
|
+
evaluate_split=genbenchQC.evaluate_split:main
|
|
37
|
+
''',
|
|
38
|
+
keywords=["genomic benchmarks", "deep learning", "machine learning",
|
|
39
|
+
"computational biology", "bioinformatics", "genomics", "quality control"],
|
|
40
|
+
)
|
|
File without changes
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from itertools import combinations
|
|
5
|
+
from typing import Optional
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from genbenchQC.utils.statistics import SequenceStatistics
|
|
9
|
+
from genbenchQC.utils.testing import flag_significant_differences
|
|
10
|
+
from genbenchQC.report.report_generator import generate_json_report, generate_sequence_html_report, generate_simple_report, generate_dataset_html_report
|
|
11
|
+
from genbenchQC.utils.input_utils import read_fasta, read_sequences_from_df, read_multisequence_df, read_csv_file, setup_logger
|
|
12
|
+
|
|
13
|
+
def run_analysis(input_statistics, out_folder, report_types, seq_report_types, plot_type, flag_threshold):
    """Run per-group statistics and pairwise comparison analysis, writing reports.

    @param input_statistics: List of SequenceStatistics objects, one per sequence group.
    @param out_folder: Output folder (created by the caller) where reports are written.
    @param report_types: Report formats for pairwise comparisons ('simple', 'html').
    @param seq_report_types: Report formats for individual groups ('json', 'html'), or falsy to skip.
    @param plot_type: Plot style passed through to the HTML report generators.
    @param flag_threshold: Threshold for flagging significant differences.
    @return: None
    """

    out_folder = Path(out_folder)

    # run individual analysis
    for s in input_statistics:
        stats, end_position = s.compute()

        if seq_report_types:

            # Build a unique per-group report stem: <file>[_<column>][_<label>]
            filename = Path(s.filename).stem
            if s.seq_column is not None:
                filename += f'_{s.seq_column}'
            if s.label is not None:
                filename += f'_{s.label}'

            if 'json' in seq_report_types:
                json_report_path = out_folder / Path(filename + '_report.json')
                generate_json_report(stats, json_report_path)

            if 'html' in seq_report_types:
                html_report_path = out_folder / Path(filename + '_report.html')
                plots_path = out_folder / Path(filename + '_plots')
                generate_sequence_html_report(stats, html_report_path, plots_path, end_position, plot_type)

    # pairwise comparison needs at least two groups
    if len(input_statistics) < 2:
        return

    # run pair comparison analysis with all combinations
    for stat1, stat2 in combinations(input_statistics, 2):
        filename = "dataset_report"
        if stat1.seq_column is not None:
            filename += f'_{stat1.seq_column}'
        if stat1.label is not None and stat2.label is not None:
            filename += f'_label_{stat1.label}_vs_{stat2.label}'
            logging.debug(f"Comparing datasets label: {stat1.label} vs {stat2.label}")
        else:
            filename += f'_{Path(stat1.filename).stem}_{Path(stat2.filename).stem}'
            logging.debug(
                f"Comparing datasets: {stat1.filename} vs {stat2.filename}")

        results = flag_significant_differences(
            stat1.sequences, stat1.stats,
            stat2.sequences, stat2.stats,
            threshold=flag_threshold,
            # per-position statistics are only comparable up to the shorter end position
            end_position=min(stat1.end_position, stat2.end_position)
        )

        if 'simple' in report_types:
            # BUGFIX: report paths now use the per-pair `filename` stem built above
            # (it was previously unused and the paths contained a placeholder).
            simple_report_path = out_folder / Path(f'{filename}.csv')
            generate_simple_report(results, simple_report_path)

        if 'html' in report_types:
            html_report_path = out_folder / Path(f'{filename}.html')
            plots_path = out_folder / Path(f'{filename}_plots')
            generate_dataset_html_report(
                stat1, stat2, results,
                html_report_path,
                plots_path=plots_path,
                threshold=flag_threshold,
                end_position=min(stat1.end_position, stat2.end_position),
                plot_type=plot_type
            )
|
|
76
|
+
|
|
77
|
+
def run(input,
        format,
        out_folder='.',
        sequence_column: Optional[list[str]] = None,
        label_column='label',
        label_list: Optional[list[str]] = None,
        regression: Optional[bool] = False,
        report_types: Optional[list[str]] = None,
        seq_report_types: Optional[list[str]] = None,
        end_position: Optional[int] = None,
        plot_type: Optional[str] = 'boxen',
        flag_threshold: Optional[float] = 0.015,
        log_level: Optional[str] = 'INFO',
        log_file: Optional[str] = None
        ):
    """Run the dataset evaluation.

    This function reads sequences from the provided input files, performs analysis, and generates reports about the sequences.

    @param input: List of paths to input files. Can be a list of files, each containing sequences from one class.
    @param format: Format of the input files (fasta, csv, csv.gz, tsv, tsv.gz).
    @param out_folder: Path to the output folder. Default: '.'.
    @param sequence_column: Name of the columns with sequences to analyze for datasets in CSV/TSV format.
        Either one column or list of columns. Default: ['sequences']
    @param label_column: Name of the label column for datasets in CSV/TSV format. Default: 'label'.
    @param label_list: List of label classes to consider or "infer" to parse different labels automatically from label column.
        For datasets in CSV/TSV format. Default: ['infer'].
    @param regression: If True, label column is considered as a regression target and values are split into 2 classes.
    @param report_types: Types of reports to generate. Default: ['html', 'simple'].
    @param seq_report_types: Types of reports to generate for individual groups of sequences. Default: None.
    @param end_position: End position of the sequences to consider in per position statistics.
        If not provided, 75th percentile of sequence lengths will be used. Default: None.
    @param plot_type: Type of plot to use for visualizations. For bigger datasets, "boxen" is recommended. Default: 'boxen'.
    @param flag_threshold: Threshold for flagging significant differences in sequence statistics. Default: 0.015
    @param log_level: Logging level, default to INFO.
    @param log_file: Path to the log file. If provided, logs will be written to this file as well as to the console.
    @return: None
    """

    # Resolve list defaults per call instead of using mutable default arguments,
    # which would be shared across invocations.
    sequence_column = ['sequences'] if sequence_column is None else sequence_column
    label_list = ['infer'] if label_list is None else label_list
    report_types = ['html', 'simple'] if report_types is None else report_types

    setup_logger(log_level, log_file)
    logging.info("Starting dataset evaluation.")

    if not Path(out_folder).exists():
        logging.info(f"Output folder {out_folder} does not exist. Creating it.")
        Path(out_folder).mkdir(parents=True, exist_ok=True)

    # we have multiple fasta files with one label each
    if format == 'fasta':
        seq_stats = []
        for input_file in input:
            sequences = read_fasta(input_file)
            logging.debug(f"Read {len(sequences)} sequences from FASTA file {input_file}.")
            seq_stats += [SequenceStatistics(sequences, filename=Path(input_file).name,
                                             label=Path(input_file).stem, end_position=end_position)]
        run_analysis(
            input_statistics = seq_stats,
            out_folder = out_folder,
            report_types = report_types,
            seq_report_types = seq_report_types,
            plot_type = plot_type,
            flag_threshold = flag_threshold
        )

    # we have CSV/TSV
    else:
        # we have one file with multiple labels or regression target
        if len(input) == 1:
            df = read_csv_file(input[0], format, sequence_column, label_column)

            # if regression is True, we split the label column into two classes
            if regression:
                # convert the label column to numeric if it is not already
                if not pd.api.types.is_numeric_dtype(df[label_column]):
                    logging.debug(f"Converting label column '{label_column}' to numeric type for regression.")
                    df[label_column] = pd.to_numeric(df[label_column], errors='coerce')
                # infer the threshold as the median of the label column
                threshold = df[label_column].median()
                logging.debug(f"Inferred threshold for regression: {threshold}")
                df[label_column] = df[label_column].apply(lambda x: 'high' if x >= threshold else 'low')
                labels = ['high', 'low']

            # get the list of labels to consider
            elif len(label_list) == 1 and label_list[0] == 'infer':
                labels = df[label_column].unique().tolist()
                logging.debug(f"Inferred labels: {labels}")
            else:
                labels = label_list

            # loop over sequences with specific label and run statistics
            for seq_col in sequence_column:
                seq_stats = []
                for label in labels:
                    sequences = read_sequences_from_df(df, seq_col, label_column, label)
                    logging.debug(f"Read {len(sequences)} sequences for label '{label}' from column '{seq_col}'.")
                    seq_stats += [SequenceStatistics(sequences, filename=Path(input[0]).name, label=label,
                                                     seq_column=seq_col, end_position=end_position)]
                run_analysis(
                    input_statistics = seq_stats,
                    out_folder = out_folder,
                    report_types = report_types,
                    seq_report_types = seq_report_types,
                    plot_type = plot_type,
                    flag_threshold = flag_threshold
                )

            # handle multiple sequence columns by concatenating sequences and running statistics on them
            if len(sequence_column) > 1:
                seq_stats = []
                for label in labels:
                    sequences = read_multisequence_df(df, sequence_column, label_column, label)
                    # BUGFIX: pass end_position here as well, for consistency with the
                    # multi-file branch below (it was previously omitted).
                    seq_stats += [SequenceStatistics(sequences, filename=Path(input[0]).name, label=label,
                                                     seq_column='_'.join(sequence_column),
                                                     end_position=end_position)]
                run_analysis(
                    input_statistics = seq_stats,
                    out_folder = out_folder,
                    report_types = report_types,
                    seq_report_types = seq_report_types,
                    plot_type = plot_type,
                    flag_threshold = flag_threshold
                )

        # we have multiple files with one label each
        else:
            # run statistics across input files
            for seq_col in sequence_column:
                seq_stats = []
                for input_file in input:
                    sequences = read_sequences_from_df(read_csv_file(input_file, format, seq_col), seq_col)
                    logging.debug(f"Read {len(sequences)} sequences from file {input_file} in column '{seq_col}'.")
                    seq_stats += [SequenceStatistics(sequences, filename=Path(input_file).name,
                                                     label=Path(input_file).stem, seq_column=seq_col,
                                                     end_position=end_position)]
                run_analysis(
                    input_statistics = seq_stats,
                    out_folder = out_folder,
                    report_types = report_types,
                    seq_report_types = seq_report_types,
                    plot_type = plot_type,
                    flag_threshold = flag_threshold
                )

            # handle multiple sequence columns
            if len(sequence_column) > 1:
                seq_stats = []
                for input_file in input:
                    sequences = read_multisequence_df(read_csv_file(input_file, format, sequence_column), sequence_column)
                    seq_stats += [SequenceStatistics(sequences, filename=Path(input_file).name, label=Path(input_file).stem,
                                                     seq_column='_'.join(sequence_column), end_position=end_position)]
                run_analysis(
                    input_statistics = seq_stats,
                    out_folder = out_folder,
                    report_types = report_types,
                    seq_report_types = seq_report_types,
                    plot_type = plot_type,
                    flag_threshold = flag_threshold
                )

    logging.info("Dataset evaluation successfully completed.")
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def parse_args():
    """Parse and validate command-line arguments for the dataset evaluation tool.

    @return: argparse.Namespace with the parsed arguments.
    """
    parser = argparse.ArgumentParser(description='A tool for evaluating sequence datasets.')
    parser.add_argument('--input', type=str, help='Path to the dataset file. '
                        'Can be a list of files, each containing sequences from one class.', nargs='+', required=True)
    parser.add_argument('--format', help="Format of the input files.", choices=['fasta', 'csv', 'csv.gz', 'tsv', 'tsv.gz'], required=True) # potentially add HF support
    # NOTE(review): the CLI default is ['sequence'] while run()'s default is ['sequences'] —
    # confirm which one is intended; kept as-is to avoid a behavior change.
    parser.add_argument('--sequence_column', type=str, help='Name of the columns with sequences to analyze for datasets in CSV/TSV format. '
                        'Either one column or list of columns.', nargs='+', default=['sequence'])
    parser.add_argument('--label_column', type=str, help='Name of the label column for datasets in CSV/TSV format.', default='label')
    parser.add_argument('--label_list', type=str, nargs='+', help='List of label classes to consider or "infer" to parse different labels automatically from label column.'
                        ' For datasets in CSV/TSV format.', default=['infer'])
    parser.add_argument('--regression', action='store_true', help='If True, label column is considered as a regression target and values are split into 2 classes')
    parser.add_argument('--out_folder', type=str, help='Path to the output folder.', default='.')
    parser.add_argument('--report_types', type=str, nargs='+', choices=['json', 'html', 'simple'], default=['html', 'simple'],
                        help='Types of reports to generate. Default: [html, simple].')
    parser.add_argument('--seq_report_types', type=str, nargs='+', choices=['json', 'html'], default=[],
                        help='Types of reports to generate for individual groups of sequences. Default: [].')
    parser.add_argument('--end_position', type=int, default=None,
                        help='End position of the sequences to consider in per position statistics. If not provided, 75th percentile of sequence lengths will be used.')
    parser.add_argument('--plot_type', type=str, help='Type of plot to use for visualizations. For bigger datasets, "boxen" is recommended. Default: boxen.',
                        choices=['boxen', 'violin'], default='boxen')
    parser.add_argument('--flag_threshold', type=float, default=0.015,
                        help='Threshold for flagging significant differences in sequence statistics. Default: 0.015')
    parser.add_argument('--log_level', type=str, help='Logging level, default to INFO.', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], default='INFO')
    parser.add_argument('--log_file', type=str, help='Path to the log file. If provided, logs will be written to this file as well as to the console.', default=None)
    args = parser.parse_args()

    # FASTA carries no label column, so each class must come as its own file.
    if args.format == 'fasta' and len(args.input) < 2:
        parser.error("When format is 'fasta', the input must contain individual files for each class.")

    return args
|
|
267
|
+
|
|
268
|
+
def main():
    """CLI entry point: parse command-line arguments and run the dataset evaluation."""
    # The argparse namespace attribute names match run()'s parameter names
    # one-to-one, so the namespace can be forwarded directly as keyword arguments.
    run(**vars(parse_args()))

if __name__ == '__main__':
    main()
|