profact 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- profact-0.1.0/LICENSE +21 -0
- profact-0.1.0/PKG-INFO +184 -0
- profact-0.1.0/README.md +171 -0
- profact-0.1.0/profact/__init__.py +0 -0
- profact-0.1.0/profact/cli.py +209 -0
- profact-0.1.0/profact/compare.py +128 -0
- profact-0.1.0/profact/duplicates.py +57 -0
- profact-0.1.0/profact/parser.py +139 -0
- profact-0.1.0/profact/reporter.py +656 -0
- profact-0.1.0/profact/stats.py +91 -0
- profact-0.1.0/profact.egg-info/PKG-INFO +184 -0
- profact-0.1.0/profact.egg-info/SOURCES.txt +21 -0
- profact-0.1.0/profact.egg-info/dependency_links.txt +1 -0
- profact-0.1.0/profact.egg-info/entry_points.txt +2 -0
- profact-0.1.0/profact.egg-info/requires.txt +3 -0
- profact-0.1.0/profact.egg-info/top_level.txt +1 -0
- profact-0.1.0/pyproject.toml +26 -0
- profact-0.1.0/setup.cfg +4 -0
- profact-0.1.0/tests/test_compare.py +64 -0
- profact-0.1.0/tests/test_duplicates.py +40 -0
- profact-0.1.0/tests/test_parser.py +145 -0
- profact-0.1.0/tests/test_reporter.py +117 -0
- profact-0.1.0/tests/test_stats.py +114 -0
profact-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Wojtek Laskowski
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
profact-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: profact
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Protein FASTA analysis and comparison tool
|
|
5
|
+
Author: Wojciech Laskowski, Wojciech Moryl, Karolina Winczewska
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Provides-Extra: test
|
|
11
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
12
|
+
Dynamic: license-file
|
|
13
|
+
|
|
14
|
+
# ProFACT - Protein FASTA Analysis and Comparison Tool
|
|
15
|
+
|
|
16
|
+
> A pip-installable Python CLI for protein FASTA quality control, duplicate detection and dataset comparison.
|
|
17
|
+
|
|
18
|
+
## Overview
|
|
19
|
+
|
|
20
|
+
Bioinformatics workflows often use protein FASTA files after downloading data from UniProt, Swiss-Prot, RefSeq or Ensembl/NCBI, filtering protein datasets, or running protein prediction pipelines. Checking quality and comparing two FASTA datasets often requires separate shell commands or custom scripts.
|
|
21
|
+
|
|
22
|
+
ProFACT combines common protein FASTA quality-control tasks into one small, testable Python CLI tool.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
|
|
28
|
+
- Plain and gzipped protein FASTA support (`.fa`, `.fasta`, `.faa`, `.fa.gz`, `.fasta.gz`, `.faa.gz`)
|
|
29
|
+
- Protein-specific statistics: length distribution, N50 and amino acid composition
|
|
30
|
+
- Quality checks: `X`, `*`, invalid amino acid characters and empty records
|
|
31
|
+
- Duplicate detection: repeated IDs and identical sequences under different IDs
|
|
32
|
+
- Exact duplicate clustering
|
|
33
|
+
- Comparison of two protein FASTA files:
|
|
34
|
+
- added IDs
|
|
35
|
+
- removed IDs
|
|
36
|
+
- changed sequences
|
|
37
|
+
- changed sequence lengths
|
|
38
|
+
- changed duplicate clusters
|
|
39
|
+
- Output formats: text, TSV and JSON for all commands; HTML for statistics and full reports
|
|
40
|
+
- Pure Python, no heavy bioinformatics dependencies
|
|
41
|
+
- Local installation via `pip install -e .`
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install -e .
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
For running tests:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install -e ".[test]"
|
|
55
|
+
python3 -m pytest -q
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Quick Start With Included Data
|
|
61
|
+
|
|
62
|
+
The repository includes two sample UniProt proteome FASTA files in `data/`.
|
|
63
|
+
|
|
64
|
+
To run tests and generate all example outputs:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
bash scripts/run_examples.sh
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
This writes text, TSV, JSON and HTML outputs to `output/`.
|
|
71
|
+
|
|
72
|
+
You can also run selected analyses manually:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
python3 -m profact.cli stats \
|
|
76
|
+
-i data/uniprotkb_proteome_UP000000625_2026_06_06.fasta \
|
|
77
|
+
-fmt html \
|
|
78
|
+
-o output/stats_UP000000625.html
|
|
79
|
+
|
|
80
|
+
python3 -m profact.cli report \
|
|
81
|
+
-i data/uniprotkb_proteome_UP000000625_2026_06_06.fasta \
|
|
82
|
+
-fmt html \
|
|
83
|
+
-o output/report_UP000000625.html
|
|
84
|
+
|
|
85
|
+
python3 -m profact.cli compare \
|
|
86
|
+
-f1 data/uniprotkb_proteome_UP000000625_2026_06_06.fasta \
|
|
87
|
+
-f2 data/uniprotkb_proteome_UP000001570_2026_06_06.fasta \
|
|
88
|
+
-fmt json \
|
|
89
|
+
-o output/compare_UP000000625_vs_UP000001570.json
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Usage
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# Inspect a protein FASTA file
|
|
98
|
+
profact stats -i swissprot_subset.fasta
|
|
99
|
+
|
|
100
|
+
# Export statistics as JSON
|
|
101
|
+
profact stats -i swissprot_subset.fasta -fmt json -o output/stats.json
|
|
102
|
+
|
|
103
|
+
# Validate protein records
|
|
104
|
+
profact validate -i proteins.fasta.gz
|
|
105
|
+
|
|
106
|
+
# Detect identical protein sequences
|
|
107
|
+
profact duplicates -i proteins.fasta
|
|
108
|
+
|
|
109
|
+
# Compare raw and filtered protein datasets
|
|
110
|
+
profact compare -f1 raw_proteins.fasta -f2 filtered_proteins.fasta
|
|
111
|
+
|
|
112
|
+
# Export comparison as JSON
|
|
113
|
+
profact compare -f1 old_uniprot.fasta -f2 new_uniprot.fasta -fmt json -o output/compare.json
|
|
114
|
+
|
|
115
|
+
# Generate a full HTML report
|
|
116
|
+
profact report -i proteins.fasta -fmt html -o output/report.html
|
|
117
|
+
|
|
118
|
+
# Generate a full text report
|
|
119
|
+
profact report -i proteins.fasta
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Generated files can be kept in `output/`, for example:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
profact stats -i data/proteins.fasta -fmt json -o output/stats.json
|
|
126
|
+
profact validate -i data/proteins.fasta -fmt tsv -o output/validation.tsv
|
|
127
|
+
profact compare -f1 data/old.fasta -f2 data/new.fasta -fmt json -o output/compare.json
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
The `validate` command exits with status code `1` when invalid records are found.
|
|
131
|
+
The `compare` command exits with status code `1` when the two files differ.
|
|
132
|
+
In both cases this is expected behavior: the report is still written (to the `-o` file, or to stdout when `-o` is omitted) before the command exits.
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Project Structure
|
|
137
|
+
|
|
138
|
+
```text
|
|
139
|
+
profact/ Python package and CLI implementation
|
|
140
|
+
tests/ Automated tests
|
|
141
|
+
data/ Sample FASTA datasets
|
|
142
|
+
output/ Generated reports and command outputs
|
|
143
|
+
scripts/ Helper scripts for reproducible example runs
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## What We Implemented
|
|
149
|
+
|
|
150
|
+
We created ProFACT as a new bioinformatics-related Python CLI project for protein FASTA quality control and comparison.
|
|
151
|
+
|
|
152
|
+
The implementation includes FASTA parsing, validation, sequence statistics, duplicate detection, pairwise dataset comparison, report generation, automated tests, sample FASTA datasets and a reproducible example script.
|
|
153
|
+
|
|
154
|
+
The tool was needed because common FASTA quality checks are often done with separate commands or custom scripts. ProFACT provides these checks in one consistent package.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Scope and Limitations
|
|
159
|
+
|
|
160
|
+
ProFACT works with **protein FASTA files**, not FASTQ files, raw sequencing reads or nucleotide QC.
|
|
161
|
+
|
|
162
|
+
The first version is intended for small and medium protein datasets, such as custom FASTA files, proteomes, Swiss-Prot subsets or filtered UniProt downloads. Very large databases such as full UniProt or NCBI NR are outside the main scope of the first version.
|
|
163
|
+
|
|
164
|
+
The comparison is based on sequence IDs and exact sequence content. ProFACT does not perform BLAST searches, multiple sequence alignment or similarity-based clustering. Duplicate clustering means exact grouping of identical protein sequences.
|
|
165
|
+
|
|
166
|
+
The `report` command generates a combined report for one FASTA file. Pairwise dataset comparison is handled separately by the `compare` command.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Similar Tools
|
|
171
|
+
|
|
172
|
+
Similar tools already exist, including `seqkit`, `pyfastx`, BioPython and FastQC. ProFACT does not aim to replace them or introduce a new bioinformatics algorithm.
|
|
173
|
+
|
|
174
|
+
Its value is integration: it combines common protein FASTA checks into one small, focused and testable CLI with structured outputs and automated tests.
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Team Members
|
|
179
|
+
|
|
180
|
+
| Name | GitHub |
|
|
181
|
+
|------|--------|
|
|
182
|
+
| Wojciech Laskowski | [@wlaskowski](https://github.com/wlaskowski) |
|
|
183
|
+
| Wojciech Moryl | [@wojciech-moryl](https://github.com/Fair0n) |
|
|
184
|
+
| Karolina Winczewska | [@KarolinaWinczewska](https://github.com/KaWinczewska) |
|
profact-0.1.0/README.md
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# ProFACT - Protein FASTA Analysis and Comparison Tool
|
|
2
|
+
|
|
3
|
+
> A pip-installable Python CLI for protein FASTA quality control, duplicate detection and dataset comparison.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Bioinformatics workflows often use protein FASTA files after downloading data from UniProt, Swiss-Prot, RefSeq or Ensembl/NCBI, filtering protein datasets, or running protein prediction pipelines. Checking quality and comparing two FASTA datasets often requires separate shell commands or custom scripts.
|
|
8
|
+
|
|
9
|
+
ProFACT combines common protein FASTA quality-control tasks into one small, testable Python CLI tool.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- Plain and gzipped protein FASTA support (`.fa`, `.fasta`, `.faa`, `.fa.gz`, `.fasta.gz`, `.faa.gz`)
|
|
16
|
+
- Protein-specific statistics: length distribution, N50 and amino acid composition
|
|
17
|
+
- Quality checks: `X`, `*`, invalid amino acid characters and empty records
|
|
18
|
+
- Duplicate detection: repeated IDs and identical sequences under different IDs
|
|
19
|
+
- Exact duplicate clustering
|
|
20
|
+
- Comparison of two protein FASTA files:
|
|
21
|
+
- added IDs
|
|
22
|
+
- removed IDs
|
|
23
|
+
- changed sequences
|
|
24
|
+
- changed sequence lengths
|
|
25
|
+
- changed duplicate clusters
|
|
26
|
+
- Output formats: text, TSV and JSON for all commands; HTML for statistics and full reports
|
|
27
|
+
- Pure Python, no heavy bioinformatics dependencies
|
|
28
|
+
- Local installation via `pip install -e .`
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install -e .
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
For running tests:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install -e ".[test]"
|
|
42
|
+
python3 -m pytest -q
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Quick Start With Included Data
|
|
48
|
+
|
|
49
|
+
The repository includes two sample UniProt proteome FASTA files in `data/`.
|
|
50
|
+
|
|
51
|
+
To run tests and generate all example outputs:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
bash scripts/run_examples.sh
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
This writes text, TSV, JSON and HTML outputs to `output/`.
|
|
58
|
+
|
|
59
|
+
You can also run selected analyses manually:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
python3 -m profact.cli stats \
|
|
63
|
+
-i data/uniprotkb_proteome_UP000000625_2026_06_06.fasta \
|
|
64
|
+
-fmt html \
|
|
65
|
+
-o output/stats_UP000000625.html
|
|
66
|
+
|
|
67
|
+
python3 -m profact.cli report \
|
|
68
|
+
-i data/uniprotkb_proteome_UP000000625_2026_06_06.fasta \
|
|
69
|
+
-fmt html \
|
|
70
|
+
-o output/report_UP000000625.html
|
|
71
|
+
|
|
72
|
+
python3 -m profact.cli compare \
|
|
73
|
+
-f1 data/uniprotkb_proteome_UP000000625_2026_06_06.fasta \
|
|
74
|
+
-f2 data/uniprotkb_proteome_UP000001570_2026_06_06.fasta \
|
|
75
|
+
-fmt json \
|
|
76
|
+
-o output/compare_UP000000625_vs_UP000001570.json
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Usage
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
# Inspect a protein FASTA file
|
|
85
|
+
profact stats -i swissprot_subset.fasta
|
|
86
|
+
|
|
87
|
+
# Export statistics as JSON
|
|
88
|
+
profact stats -i swissprot_subset.fasta -fmt json -o output/stats.json
|
|
89
|
+
|
|
90
|
+
# Validate protein records
|
|
91
|
+
profact validate -i proteins.fasta.gz
|
|
92
|
+
|
|
93
|
+
# Detect identical protein sequences
|
|
94
|
+
profact duplicates -i proteins.fasta
|
|
95
|
+
|
|
96
|
+
# Compare raw and filtered protein datasets
|
|
97
|
+
profact compare -f1 raw_proteins.fasta -f2 filtered_proteins.fasta
|
|
98
|
+
|
|
99
|
+
# Export comparison as JSON
|
|
100
|
+
profact compare -f1 old_uniprot.fasta -f2 new_uniprot.fasta -fmt json -o output/compare.json
|
|
101
|
+
|
|
102
|
+
# Generate a full HTML report
|
|
103
|
+
profact report -i proteins.fasta -fmt html -o output/report.html
|
|
104
|
+
|
|
105
|
+
# Generate a full text report
|
|
106
|
+
profact report -i proteins.fasta
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Generated files can be kept in `output/`, for example:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
profact stats -i data/proteins.fasta -fmt json -o output/stats.json
|
|
113
|
+
profact validate -i data/proteins.fasta -fmt tsv -o output/validation.tsv
|
|
114
|
+
profact compare -f1 data/old.fasta -f2 data/new.fasta -fmt json -o output/compare.json
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
The `validate` command exits with status code `1` when invalid records are found.
|
|
118
|
+
The `compare` command exits with status code `1` when the two files differ.
|
|
119
|
+
In both cases this is expected behavior: the report is still written (to the `-o` file, or to stdout when `-o` is omitted) before the command exits.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Project Structure
|
|
124
|
+
|
|
125
|
+
```text
|
|
126
|
+
profact/ Python package and CLI implementation
|
|
127
|
+
tests/ Automated tests
|
|
128
|
+
data/ Sample FASTA datasets
|
|
129
|
+
output/ Generated reports and command outputs
|
|
130
|
+
scripts/ Helper scripts for reproducible example runs
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## What We Implemented
|
|
136
|
+
|
|
137
|
+
We created ProFACT as a new bioinformatics-related Python CLI project for protein FASTA quality control and comparison.
|
|
138
|
+
|
|
139
|
+
The implementation includes FASTA parsing, validation, sequence statistics, duplicate detection, pairwise dataset comparison, report generation, automated tests, sample FASTA datasets and a reproducible example script.
|
|
140
|
+
|
|
141
|
+
The tool was needed because common FASTA quality checks are often done with separate commands or custom scripts. ProFACT provides these checks in one consistent package.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Scope and Limitations
|
|
146
|
+
|
|
147
|
+
ProFACT works with **protein FASTA files**, not FASTQ files, raw sequencing reads or nucleotide QC.
|
|
148
|
+
|
|
149
|
+
The first version is intended for small and medium protein datasets, such as custom FASTA files, proteomes, Swiss-Prot subsets or filtered UniProt downloads. Very large databases such as full UniProt or NCBI NR are outside the main scope of the first version.
|
|
150
|
+
|
|
151
|
+
The comparison is based on sequence IDs and exact sequence content. ProFACT does not perform BLAST searches, multiple sequence alignment or similarity-based clustering. Duplicate clustering means exact grouping of identical protein sequences.
|
|
152
|
+
|
|
153
|
+
The `report` command generates a combined report for one FASTA file. Pairwise dataset comparison is handled separately by the `compare` command.
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Similar Tools
|
|
158
|
+
|
|
159
|
+
Similar tools already exist, including `seqkit`, `pyfastx`, BioPython and FastQC. ProFACT does not aim to replace them or introduce a new bioinformatics algorithm.
|
|
160
|
+
|
|
161
|
+
Its value is integration: it combines common protein FASTA checks into one small, focused and testable CLI with structured outputs and automated tests.
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Team Members
|
|
166
|
+
|
|
167
|
+
| Name | GitHub |
|
|
168
|
+
|------|--------|
|
|
169
|
+
| Wojciech Laskowski | [@wlaskowski](https://github.com/wlaskowski) |
|
|
170
|
+
| Wojciech Moryl | [@wojciech-moryl](https://github.com/Fair0n) |
|
|
171
|
+
| Karolina Winczewska | [@KarolinaWinczewska](https://github.com/KaWinczewska) |
|
|
File without changes
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from .parser import read_fasta, validate_record, FastaParseError
|
|
5
|
+
from .duplicates import analyze_duplicates
|
|
6
|
+
from .compare import compare_files
|
|
7
|
+
from .stats import analyze_stats
|
|
8
|
+
from .reporter import (
|
|
9
|
+
stats_to_text,
|
|
10
|
+
stats_to_tsv,
|
|
11
|
+
stats_to_json,
|
|
12
|
+
stats_to_html,
|
|
13
|
+
format_validation_report,
|
|
14
|
+
build_full_report_data,
|
|
15
|
+
full_report_to_text,
|
|
16
|
+
full_report_to_tsv,
|
|
17
|
+
full_report_to_json,
|
|
18
|
+
full_report_to_html,
|
|
19
|
+
format_duplicates_report,
|
|
20
|
+
format_compare_report,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def write_output(output_str, output_file):
    """Write a rendered report to a file, or print it when no file is given.

    Args:
        output_str: The fully rendered report text.
        output_file: Destination path. A falsy value (``None`` or ``""``)
            sends the text to standard output instead.

    Missing parent directories of ``output_file`` are created on demand.
    """
    if not output_file:
        print(output_str)
        return

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding: without it write_text() uses the locale's preferred
    # encoding, which makes the output platform-dependent and can raise
    # UnicodeEncodeError on non-ASCII text (e.g. in FASTA descriptions).
    output_path.write_text(output_str, encoding="utf-8")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def cmd_validate(args):
    """Validate every record of a FASTA file and emit a validation report.

    Reads ``args.fasta_file`` with ``read_fasta``, runs ``validate_record``
    on each record, renders the result with ``format_validation_report`` in
    ``args.format`` and hands it to ``write_output`` (``args.output``).

    Exits the process with status 2 when reading/parsing raises, 1 when at
    least one record is invalid, and 0 when all records are valid.
    """
    source = args.fasta_file
    fmt = args.format
    destination = args.output

    try:
        records = list(read_fasta(source))
    except FileNotFoundError as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(2)
    except FastaParseError as exc:
        print(f"ERROR: Invalid FASTA format: {exc}", file=sys.stderr)
        sys.exit(2)
    except Exception as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(2)

    def describe(record):
        # Flatten one record plus its validation result into a report row.
        outcome = validate_record(record)
        return {
            "id": record.id,
            "description": record.description,
            "sequence_length": len(record.sequence),
            "valid": outcome["valid"],
            "errors": outcome["errors"],
            "warnings": outcome["warnings"],
            "has_x": outcome["has_x"],
            "has_stop": outcome["has_stop"],
            "is_empty": outcome["is_empty"],
            "invalid_chars": sorted(outcome["invalid_chars"]),
            "non_standard": sorted(outcome["non_standard"]),
        }

    rows = [describe(record) for record in records]

    def count_flagged(flag):
        # Number of rows whose ``flag`` entry is truthy.
        return len([row for row in rows if row[flag]])

    n_total = len(rows)
    n_valid = count_flagged("valid")
    n_invalid = n_total - n_valid

    summary = {
        "total_records": n_total,
        "valid_records": n_valid,
        "invalid_records": n_invalid,
        "records_with_X": count_flagged("has_x"),
        "records_with_stop": count_flagged("has_stop"),
        "empty_records": count_flagged("is_empty"),
    }

    report = format_validation_report(source, rows, summary, fmt)
    write_output(report, destination)

    # A non-zero status signals that invalid records were found.
    sys.exit(1 if n_invalid else 0)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def cmd_duplicates(args):
    """Run duplicate detection on a FASTA file and emit the report.

    Calls ``analyze_duplicates`` on ``args.fasta_file``, renders the result
    with ``format_duplicates_report`` in ``args.format`` and passes it to
    ``write_output`` (``args.output``).

    Exits with status 2 if the analysis raises ``OSError`` or
    ``FastaParseError``; otherwise exits with status 0.
    """
    try:
        data = analyze_duplicates(args.fasta_file)
    except (OSError, FastaParseError) as e:
        # OSError (the base class of FileNotFoundError) also covers
        # unreadable paths, directories and corrupt gzip input, so those
        # produce a clean error message instead of a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    output_str = format_duplicates_report(data, args.fasta_file, args.format)
    write_output(output_str, args.output)
    sys.exit(0)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def cmd_compare(args):
    """Compare two FASTA files and emit the comparison report.

    Calls ``compare_files(args.old_fasta, args.new_fasta)``, renders the
    result with ``format_compare_report`` in ``args.format`` and passes it
    to ``write_output`` (``args.output``).

    Exits with status 2 if the comparison raises ``OSError`` or
    ``FastaParseError``, 1 if any change counter in ``data["summary"]`` is
    non-zero, and 0 if none is.
    """
    try:
        data = compare_files(args.old_fasta, args.new_fasta)
    except (OSError, FastaParseError) as e:
        # OSError (the base class of FileNotFoundError) also covers
        # unreadable paths, directories and corrupt gzip input, so those
        # produce a clean error message instead of a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    output_str = format_compare_report(data, args.format)
    write_output(output_str, args.output)

    summary = data["summary"]
    change_counters = (
        "added_count",
        "removed_count",
        "changed_sequence_count",
        "added_duplicate_cluster_count",
        "removed_duplicate_cluster_count",
        "changed_duplicate_cluster_count",
    )
    # A list (not a generator) so every counter is looked up, as before.
    has_changes = any([summary[key] for key in change_counters])
    sys.exit(1 if has_changes else 0)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def cmd_stats(args):
    """Compute statistics for a FASTA file and emit them.

    Calls ``analyze_stats`` on ``args.fasta_file`` and renders the result
    with the ``stats_to_*`` function matching ``args.format`` (plain text
    for any value other than ``json``, ``tsv`` or ``html``), then passes it
    to ``write_output`` (``args.output``).

    Exits with status 2 if the analysis raises ``OSError`` or
    ``FastaParseError``; otherwise exits with status 0.
    """
    try:
        data = analyze_stats(args.fasta_file)
    except (OSError, FastaParseError) as e:
        # OSError (the base class of FileNotFoundError) also covers
        # unreadable paths, directories and corrupt gzip input, so those
        # produce a clean error message instead of a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    renderers = {
        "json": stats_to_json,
        "tsv": stats_to_tsv,
        "html": stats_to_html,
    }
    render = renderers.get(args.format, stats_to_text)

    write_output(render(data), args.output)
    sys.exit(0)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def cmd_report(args):
    """Build the full report for a FASTA file and emit it.

    Calls ``build_full_report_data`` on ``args.fasta_file`` and renders the
    result with the ``full_report_to_*`` function matching ``args.format``
    (plain text for any value other than ``json``, ``tsv`` or ``html``),
    then passes it to ``write_output`` (``args.output``).

    Exits with status 2 if building the report raises ``OSError`` or
    ``FastaParseError``; otherwise exits with status 0.
    """
    try:
        data = build_full_report_data(args.fasta_file)
    except (OSError, FastaParseError) as e:
        # OSError (the base class of FileNotFoundError) also covers
        # unreadable paths, directories and corrupt gzip input, so those
        # produce a clean error message instead of a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    renderers = {
        "json": full_report_to_json,
        "tsv": full_report_to_tsv,
        "html": full_report_to_html,
    }
    render = renderers.get(args.format, full_report_to_text)

    write_output(render(data), args.output)
    sys.exit(0)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _add_input_argument(subparser):
    """Add the required single-file ``-i/--input`` option to *subparser*."""
    subparser.add_argument(
        "-i",
        "--input",
        dest="fasta_file",
        required=True,
        help="Input FASTA file (.fa, .fasta, .faa, .gz)",
    )


def _add_format_and_output_arguments(subparser, formats):
    """Add ``-fmt/--format`` (limited to *formats*) and ``-o/--output``."""
    subparser.add_argument(
        "-fmt",
        "--format",
        choices=formats,
        default="text",
        help="Output format",
    )
    subparser.add_argument("-o", "--output", help="Output file (default: stdout)")


def main(argv=None):
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is what a no-argument
            ``main()`` call does.

    The chosen ``cmd_*`` handler is stored by ``set_defaults(func=...)`` and
    called with the parsed namespace.
    """
    basic_formats = ["text", "json", "tsv"]
    html_formats = ["text", "json", "tsv", "html"]

    parser = argparse.ArgumentParser(prog="profact", description="Protein FASTA analysis tool")
    subparsers = parser.add_subparsers(dest="command", required=True)

    val_parser = subparsers.add_parser("validate", help="Validate protein FASTA records")
    _add_input_argument(val_parser)
    _add_format_and_output_arguments(val_parser, basic_formats)
    val_parser.set_defaults(func=cmd_validate)

    dup_parser = subparsers.add_parser("duplicates", help="Detect duplicate IDs and identical sequences")
    _add_input_argument(dup_parser)
    _add_format_and_output_arguments(dup_parser, basic_formats)
    dup_parser.set_defaults(func=cmd_duplicates)

    # compare takes two input files instead of the shared -i option.
    cmp_parser = subparsers.add_parser("compare", help="Compare two protein FASTA files")
    cmp_parser.add_argument("-f1", "--file_1", metavar="FILE_1", dest="old_fasta", required=True, help="First FASTA file")
    cmp_parser.add_argument("-f2", "--file_2", metavar="FILE_2", dest="new_fasta", required=True, help="Second FASTA file")
    _add_format_and_output_arguments(cmp_parser, basic_formats)
    cmp_parser.set_defaults(func=cmd_compare)

    stats_parser = subparsers.add_parser("stats", help="Compute protein FASTA statistics")
    _add_input_argument(stats_parser)
    _add_format_and_output_arguments(stats_parser, html_formats)
    stats_parser.set_defaults(func=cmd_stats)

    report_parser = subparsers.add_parser("report", help="Generate full protein FASTA report")
    _add_input_argument(report_parser)
    _add_format_and_output_arguments(report_parser, html_formats)
    report_parser.set_defaults(func=cmd_report)

    args = parser.parse_args(argv)
    args.func(args)


if __name__ == "__main__":
    main()
|