profact 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- profact/__init__.py +0 -0
- profact/cli.py +209 -0
- profact/compare.py +128 -0
- profact/duplicates.py +57 -0
- profact/parser.py +139 -0
- profact/reporter.py +656 -0
- profact/stats.py +91 -0
- profact-0.1.0.dist-info/METADATA +184 -0
- profact-0.1.0.dist-info/RECORD +13 -0
- profact-0.1.0.dist-info/WHEEL +5 -0
- profact-0.1.0.dist-info/entry_points.txt +2 -0
- profact-0.1.0.dist-info/licenses/LICENSE +21 -0
- profact-0.1.0.dist-info/top_level.txt +1 -0
profact/__init__.py
ADDED
|
File without changes
|
profact/cli.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from .parser import read_fasta, validate_record, FastaParseError
|
|
5
|
+
from .duplicates import analyze_duplicates
|
|
6
|
+
from .compare import compare_files
|
|
7
|
+
from .stats import analyze_stats
|
|
8
|
+
from .reporter import (
|
|
9
|
+
stats_to_text,
|
|
10
|
+
stats_to_tsv,
|
|
11
|
+
stats_to_json,
|
|
12
|
+
stats_to_html,
|
|
13
|
+
format_validation_report,
|
|
14
|
+
build_full_report_data,
|
|
15
|
+
full_report_to_text,
|
|
16
|
+
full_report_to_tsv,
|
|
17
|
+
full_report_to_json,
|
|
18
|
+
full_report_to_html,
|
|
19
|
+
format_duplicates_report,
|
|
20
|
+
format_compare_report,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def write_output(output_str, output_file):
    """Emit a rendered report to a file or to standard output.

    Args:
        output_str: Report text to emit.
        output_file: Destination path. Any falsy value (``None``, ``""``)
            sends the text to stdout through ``print``.

    When a path is given, missing parent directories are created first and
    the text is written as UTF-8, so the bytes on disk do not depend on the
    locale's default encoding.
    """
    if not output_file:
        print(output_str)
        return

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: without it write_text() uses the platform default.
    output_path.write_text(output_str, encoding="utf-8")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def cmd_validate(args):
    """Handle the ``validate`` subcommand.

    Reads every record from ``args.fasta_file``, validates each one, renders
    the outcome with ``format_validation_report`` in ``args.format`` and
    emits it through ``write_output`` (destination ``args.output``).

    The function always terminates the process: exit status 2 when the file
    cannot be read or parsed, 1 when at least one record is invalid, and 0
    otherwise.
    """
    source = args.fasta_file

    try:
        records = list(read_fasta(source))
    except FastaParseError as exc:
        print(f"ERROR: Invalid FASTA format: {exc}", file=sys.stderr)
        sys.exit(2)
    except Exception as exc:
        # CLI boundary: every other failure (a missing file included) is
        # reported on stderr instead of surfacing a traceback.
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(2)

    def describe(record):
        """Merge a record's identity with its validation outcome."""
        outcome = validate_record(record)
        return {
            "id": record.id,
            "description": record.description,
            "sequence_length": len(record.sequence),
            "valid": outcome["valid"],
            "errors": outcome["errors"],
            "warnings": outcome["warnings"],
            "has_x": outcome["has_x"],
            "has_stop": outcome["has_stop"],
            "is_empty": outcome["is_empty"],
            # Sets are sorted so the report output is deterministic.
            "invalid_chars": sorted(outcome["invalid_chars"]),
            "non_standard": sorted(outcome["non_standard"]),
        }

    validated = [describe(record) for record in records]

    def tally(flag):
        """Count validated entries whose ``flag`` field is truthy."""
        return sum(1 for entry in validated if entry[flag])

    total = len(validated)
    valid_count = tally("valid")
    invalid_count = total - valid_count

    summary = {
        "total_records": total,
        "valid_records": valid_count,
        "invalid_records": invalid_count,
        "records_with_X": tally("has_x"),
        "records_with_stop": tally("has_stop"),
        "empty_records": tally("is_empty"),
    }

    report = format_validation_report(source, validated, summary, args.format)
    write_output(report, args.output)
    sys.exit(1 if invalid_count else 0)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def cmd_duplicates(args):
    """Handle the ``duplicates`` subcommand.

    Runs ``analyze_duplicates`` on ``args.fasta_file``, renders the result
    with ``format_duplicates_report`` in ``args.format`` and emits it through
    ``write_output`` (destination ``args.output``).

    The function always terminates the process: exit status 2 when the input
    cannot be read or parsed, 0 otherwise.
    """
    try:
        data = analyze_duplicates(args.fasta_file)
    except (OSError, FastaParseError) as e:
        # OSError is the base class of FileNotFoundError and also covers
        # other I/O failures (a directory given as input, missing
        # permissions, a corrupt gzip stream), so they are reported on
        # stderr rather than as a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    output_str = format_duplicates_report(data, args.fasta_file, args.format)

    write_output(output_str, args.output)
    sys.exit(0)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def cmd_compare(args):
    """Handle the ``compare`` subcommand.

    Compares ``args.old_fasta`` with ``args.new_fasta`` via ``compare_files``,
    renders the result with ``format_compare_report`` in ``args.format`` and
    emits it through ``write_output`` (destination ``args.output``).

    The function always terminates the process: exit status 2 when either
    input cannot be read or parsed, 1 when any difference counter in the
    summary is non-zero, 0 when the files are equivalent.
    """
    try:
        data = compare_files(args.old_fasta, args.new_fasta)
    except (OSError, FastaParseError) as e:
        # OSError is the base class of FileNotFoundError and also covers
        # other I/O failures (a directory given as input, missing
        # permissions, a corrupt gzip stream), so they are reported on
        # stderr rather than as a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    output_str = format_compare_report(data, args.format)

    write_output(output_str, args.output)

    # Summary counters that, when non-zero, mean the two files differ.
    change_counters = (
        "added_count",
        "removed_count",
        "changed_sequence_count",
        "added_duplicate_cluster_count",
        "removed_duplicate_cluster_count",
        "changed_duplicate_cluster_count",
    )
    summary = data["summary"]
    has_changes = any(summary[name] for name in change_counters)
    sys.exit(1 if has_changes else 0)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def cmd_stats(args):
    """Handle the ``stats`` subcommand.

    Runs ``analyze_stats`` on ``args.fasta_file`` and renders the result as
    JSON, TSV or HTML according to ``args.format``; any other value falls
    back to the plain-text renderer. The text is emitted through
    ``write_output`` (destination ``args.output``).

    The function always terminates the process: exit status 2 when the input
    cannot be read or parsed, 0 otherwise.
    """
    try:
        data = analyze_stats(args.fasta_file)
    except (OSError, FastaParseError) as e:
        # OSError is the base class of FileNotFoundError and also covers
        # other I/O failures (a directory given as input, missing
        # permissions, a corrupt gzip stream), so they are reported on
        # stderr rather than as a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    renderers = {
        "json": stats_to_json,
        "tsv": stats_to_tsv,
        "html": stats_to_html,
    }
    render = renderers.get(args.format, stats_to_text)
    output_str = render(data)

    write_output(output_str, args.output)
    sys.exit(0)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def cmd_report(args):
    """Handle the ``report`` subcommand.

    Builds the combined report data for ``args.fasta_file`` with
    ``build_full_report_data`` and renders it as JSON, TSV or HTML according
    to ``args.format``; any other value falls back to the plain-text
    renderer. The text is emitted through ``write_output`` (destination
    ``args.output``).

    The function always terminates the process: exit status 2 when the input
    cannot be read or parsed, 0 otherwise.
    """
    try:
        data = build_full_report_data(args.fasta_file)
    except (OSError, FastaParseError) as e:
        # OSError is the base class of FileNotFoundError and also covers
        # other I/O failures (a directory given as input, missing
        # permissions, a corrupt gzip stream), so they are reported on
        # stderr rather than as a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    renderers = {
        "json": full_report_to_json,
        "tsv": full_report_to_tsv,
        "html": full_report_to_html,
    }
    render = renderers.get(args.format, full_report_to_text)
    output_str = render(data)

    write_output(output_str, args.output)
    sys.exit(0)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def main():
    """Parse the command line and dispatch to the selected subcommand.

    Registers the ``validate``, ``duplicates``, ``compare``, ``stats`` and
    ``report`` subcommands, each with ``-fmt/--format`` and ``-o/--output``
    options, then calls the handler stored in ``args.func``.
    """
    parser = argparse.ArgumentParser(prog="profact", description="Protein FASTA analysis tool")
    subparsers = parser.add_subparsers(dest="command", required=True)

    def add_single_input(sub):
        """Add the one-file ``-i/--input`` option."""
        sub.add_argument("-i", "--input", dest="fasta_file", required=True, help="Input FASTA file (.fa, .fasta, .faa, .gz)")

    def add_file_pair(sub):
        """Add the two-file options used by ``compare``."""
        sub.add_argument("-f1", "--file_1", metavar="FILE_1", dest="old_fasta", required=True, help="First FASTA file")
        sub.add_argument("-f2", "--file_2", metavar="FILE_2", dest="new_fasta", required=True, help="Second FASTA file")

    basic_formats = ["text", "json", "tsv"]
    rich_formats = ["text", "json", "tsv", "html"]

    # (name, help text, input-option installer, allowed formats, handler);
    # the order here is the order in which the subcommands are registered.
    command_table = (
        ("validate", "Validate protein FASTA records", add_single_input, basic_formats, cmd_validate),
        ("duplicates", "Detect duplicate IDs and identical sequences", add_single_input, basic_formats, cmd_duplicates),
        ("compare", "Compare two protein FASTA files", add_file_pair, basic_formats, cmd_compare),
        ("stats", "Compute protein FASTA statistics", add_single_input, rich_formats, cmd_stats),
        ("report", "Generate full protein FASTA report", add_single_input, rich_formats, cmd_report),
    )

    for name, help_text, add_inputs, formats, handler in command_table:
        sub = subparsers.add_parser(name, help=help_text)
        add_inputs(sub)
        sub.add_argument("-fmt", "--format", choices=formats, default="text", help="Output format")
        sub.add_argument("-o", "--output", help="Output file (default: stdout)")
        sub.set_defaults(func=handler)

    args = parser.parse_args()
    args.func(args)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
if __name__ == "__main__":
|
|
209
|
+
main()
|
profact/compare.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Comparison of two protein FASTA files."""
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from typing import Dict, Iterable, List, Any, Tuple
|
|
4
|
+
|
|
5
|
+
from .parser import FastaRecord, read_fasta
|
|
6
|
+
from .duplicates import duplicate_summary
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def records_by_id(records: Iterable[FastaRecord]) -> Dict[str, FastaRecord]:
    """
    Index records by their ID.

    When an ID occurs more than once, the record seen last replaces the
    earlier ones. Repeated IDs are not lost from reporting: they are
    surfaced separately by duplicate_summary().
    """
    index = {}
    for entry in records:
        index[entry.id] = entry
    return index
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def sequence_cluster_signature(records: Iterable[FastaRecord]) -> Dict[str, Tuple[str, ...]]:
    """Map each sequence shared by two or more records to its sorted ID tuple."""
    ids_for = defaultdict(list)
    for entry in records:
        ids_for[entry.sequence].append(entry.id)

    signature = {}
    for residues, members in ids_for.items():
        # A sequence carried by a single record is not a duplicate cluster.
        if len(members) < 2:
            continue
        signature[residues] = tuple(sorted(members))
    return signature
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def compare_duplicate_clusters(
    old_records: Iterable[FastaRecord],
    new_records: Iterable[FastaRecord],
) -> Dict[str, Any]:
    """
    Report how exact-duplicate sequence clusters differ between two record sets.

    Returns a dict with three lists, each ordered by sequence:
        added_clusters    clusters whose sequence is duplicated only in the new set
        removed_clusters  clusters whose sequence is duplicated only in the old set
        changed_clusters  clusters present in both whose ID tuples differ, with
                          the IDs that joined and left
    """
    before = sequence_cluster_signature(old_records)
    after = sequence_cluster_signature(new_records)

    def describe(residues, members):
        """Describe a cluster that exists on only one side."""
        return {"sequence": residues, "length": len(residues), "ids": list(members)}

    changed_clusters = []
    for residues in sorted(before.keys() & after.keys()):
        prior_ids = before[residues]
        current_ids = after[residues]
        if prior_ids == current_ids:
            continue
        prior_set = set(prior_ids)
        current_set = set(current_ids)
        changed_clusters.append({
            "sequence": residues,
            "length": len(residues),
            "old_ids": list(prior_ids),
            "new_ids": list(current_ids),
            "added_ids": sorted(current_set - prior_set),
            "removed_ids": sorted(prior_set - current_set),
        })

    added_clusters = [
        describe(residues, after[residues])
        for residues in sorted(after.keys() - before.keys())
    ]
    removed_clusters = [
        describe(residues, before[residues])
        for residues in sorted(before.keys() - after.keys())
    ]

    return {
        "added_clusters": added_clusters,
        "removed_clusters": removed_clusters,
        "changed_clusters": changed_clusters,
    }
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def compare_records(old_records: Iterable[FastaRecord], new_records: Iterable[FastaRecord]) -> Dict[str, Any]:
    """
    Diff two FASTA record collections by ID and by exact sequence.

    Both inputs are materialised once, so one-shot iterators are accepted.
    The result holds a "summary" of counters plus the detailed lists of added
    IDs, removed IDs, records whose sequence changed, the subset of those
    whose length changed, the duplicate summaries of both sides and the
    duplicate-cluster changes.
    """
    old_list, new_list = list(old_records), list(new_records)
    old_by_id = records_by_id(old_list)
    new_by_id = records_by_id(new_list)

    only_new = sorted(new_by_id.keys() - old_by_id.keys())
    only_old = sorted(old_by_id.keys() - new_by_id.keys())

    changed_sequences = []
    for seq_id in sorted(old_by_id.keys() & new_by_id.keys()):
        before = old_by_id[seq_id].sequence
        after = new_by_id[seq_id].sequence
        if before == after:
            continue
        changed_sequences.append({
            "id": seq_id,
            "old_length": len(before),
            "new_length": len(after),
        })

    # Length changes are the subset of sequence changes with differing sizes;
    # the same dict objects are reused in both lists.
    changed_lengths = [
        item for item in changed_sequences
        if item["old_length"] != item["new_length"]
    ]

    duplicate_changes = compare_duplicate_clusters(old_list, new_list)

    summary = {
        "old_total_records": len(old_list),
        "new_total_records": len(new_list),
        "added_count": len(only_new),
        "removed_count": len(only_old),
        "changed_sequence_count": len(changed_sequences),
        "changed_length_count": len(changed_lengths),
        "added_duplicate_cluster_count": len(duplicate_changes["added_clusters"]),
        "removed_duplicate_cluster_count": len(duplicate_changes["removed_clusters"]),
        "changed_duplicate_cluster_count": len(duplicate_changes["changed_clusters"]),
    }

    return {
        "summary": summary,
        "added_ids": [
            {"id": seq_id, "new_length": len(new_by_id[seq_id].sequence)}
            for seq_id in only_new
        ],
        "removed_ids": [
            {"id": seq_id, "old_length": len(old_by_id[seq_id].sequence)}
            for seq_id in only_old
        ],
        "changed_sequences": changed_sequences,
        "changed_lengths": changed_lengths,
        "old_duplicates": duplicate_summary(old_list),
        "new_duplicates": duplicate_summary(new_list),
        "duplicate_cluster_changes": duplicate_changes,
    }
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def compare_files(old_file: str, new_file: str) -> Dict[str, Any]:
    """Parse both FASTA files and return the compare_records() result for them."""
    old_records = read_fasta(old_file)
    new_records = read_fasta(new_file)
    return compare_records(old_records, new_records)
|
profact/duplicates.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Duplicate detection for protein FASTA records."""
|
|
2
|
+
from collections import Counter, defaultdict
|
|
3
|
+
from typing import Dict, Iterable, List, Any
|
|
4
|
+
|
|
5
|
+
from .parser import FastaRecord, read_fasta
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def find_duplicate_ids(records: Iterable[FastaRecord]) -> Dict[str, int]:
    """Map every ID that appears at least twice to its number of occurrences."""
    occurrences = Counter()
    for entry in records:
        occurrences[entry.id] += 1

    repeated = {}
    for seq_id, seen in occurrences.items():
        if seen >= 2:
            repeated[seq_id] = seen
    return repeated
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def cluster_identical_sequences(records: Iterable[FastaRecord]) -> List[Dict[str, Any]]:
    """
    Cluster records that carry exactly the same sequence.

    Sequences held by a single record are left out. Each cluster lists its
    IDs in sorted order, and clusters are ordered by sequence length and then
    by ID list, so the output is stable for tests and CLI reports.
    """
    members_by_sequence = defaultdict(list)
    for entry in records:
        members_by_sequence[entry.sequence].append(entry.id)

    groups = [
        {
            "sequence": residues,
            "length": len(residues),
            "count": len(members),
            "ids": sorted(members),
        }
        for residues, members in members_by_sequence.items()
        if len(members) >= 2
    ]
    groups.sort(key=lambda group: (group["length"], group["ids"]))
    return groups
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def duplicate_summary(records: Iterable[FastaRecord]) -> Dict[str, Any]:
    """
    Summarise repeated IDs and identical-sequence clusters.

    The input is materialised once so that a one-shot iterator can feed both
    the ID count and the sequence clustering.
    """
    materialized = list(records)
    duplicate_ids = find_duplicate_ids(materialized)
    clusters = cluster_identical_sequences(materialized)

    # NOTE(review): this counts distinct IDs, so a repeated ID that shares a
    # sequence is counted once. Confirm that is the intended meaning of
    # "records in clusters".
    clustered_ids = set()
    for cluster in clusters:
        clustered_ids.update(cluster["ids"])

    return {
        "total_records": len(materialized),
        "duplicate_id_count": len(duplicate_ids),
        "duplicate_ids": duplicate_ids,
        "identical_sequence_cluster_count": len(clusters),
        "records_in_identical_sequence_clusters": len(clustered_ids),
        "identical_sequence_clusters": clusters,
    }
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def analyze_duplicates(file_path: str) -> Dict[str, Any]:
    """Parse the FASTA file at file_path and return its duplicate_summary()."""
    records = read_fasta(file_path)
    return duplicate_summary(records)
|
profact/parser.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""FASTA parsing and validation for protein sequences."""
|
|
2
|
+
import gzip
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
# One-letter codes of the 20 standard amino acids.
STD_AA = set("ACDEFGHIKLMNPQRSTVWY")
# Symbols accepted with a warning: X = unknown residue, * = stop.
WARN_AA = set("X*")
# Non-standard codes that are sometimes seen (B, Z, J); also warnings.
NON_STD = set("BZJ")
# Every symbol that validation does not reject outright.
VALID_CHARS = set().union(STD_AA, WARN_AA, NON_STD)
|
|
16
|
+
|
|
17
|
+
class FastaRecord:
    """One FASTA entry: an identifier, a description and a sequence string."""

    # Fixed attribute set; instances carry no per-instance __dict__.
    __slots__ = ("id", "description", "sequence")

    def __init__(self, id: str, description: str, sequence: str):
        """Store the three fields exactly as given."""
        self.id, self.description, self.sequence = id, description, sequence

    def __repr__(self):
        """Show the ID and the sequence length rather than the full sequence."""
        return f"FastaRecord(id={self.id!r}, seq_len={len(self.sequence)})"
|
|
26
|
+
|
|
27
|
+
class FastaParseError(Exception):
    """Signals that the input is not well-formed FASTA."""
|
|
30
|
+
|
|
31
|
+
def read_fasta(file_path: str) -> Iterator[FastaRecord]:
    """
    Yield FastaRecord objects from a FASTA file (plain or .gz).

    This is a generator: nothing is checked or opened until iteration starts,
    so the exceptions below surface on iteration, not on the call itself.

    A header line starts with '>'. Its first whitespace-delimited token is the
    record ID and the rest is the description. The lines up to the next header
    are joined into the sequence with all whitespace removed. Empty lines are
    skipped. A file with no lines at all yields nothing.

    Raises:
        FileNotFoundError: if file_path does not exist.
        FastaParseError: if a header is empty after '>', if sequence data
            appears before the first '>' header, or if a non-empty file
            contains no header at all.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # A '.gz' suffix selects gzip decompression, read in text mode.
    is_gzip = path.suffix == '.gz'
    open_func = gzip.open if is_gzip else open
    mode = 'rt' if is_gzip else 'r'

    with open_func(path, mode) as f:
        current_id = None
        current_desc = ""
        current_seq_lines = []
        line_num = 0  # stays 0 only when the file has no lines at all

        for line_num, line in enumerate(f, start=1):
            line = line.rstrip('\n\r')
            if not line:
                continue

            if line[0] == '>':
                # A new header closes the record collected so far.
                if current_id is not None:
                    yield FastaRecord(current_id, current_desc, ''.join(current_seq_lines))
                header = line[1:].strip()
                if not header:
                    raise FastaParseError(f"Empty header after '>' at line {line_num}")
                parts = header.split(maxsplit=1)
                current_id = parts[0]
                current_desc = parts[1] if len(parts) > 1 else ""
                current_seq_lines = []
                continue

            # Sequence line: remove all whitespace.
            residues = re.sub(r'\s+', '', line)
            if current_id is not None:
                current_seq_lines.append(residues)
            elif residues:
                # Text before any header belongs to no record. Dropping it
                # silently would hide a malformed file, so fail with the
                # line number instead.
                raise FastaParseError(
                    f"Sequence data before the first '>' header at line {line_num}"
                )

        if current_id is not None:
            # Last record in the file.
            yield FastaRecord(current_id, current_desc, ''.join(current_seq_lines))
        elif line_num == 0:
            # Empty file: no records and no error.
            return
        else:
            raise FastaParseError("File ended without a record (missing '>'?)")
|
|
80
|
+
|
|
81
|
+
def validate_record(record: FastaRecord) -> Dict[str, Any]:
    """
    Validate a single protein FASTA record.

    Characters outside VALID_CHARS and an empty sequence are errors. 'X',
    '*' and the codes in NON_STD are accepted and reported as warnings only.

    Returns dict with:
        valid (bool)          True when no error was recorded
        errors (list of str)
        warnings (list of str)
        has_x (bool)
        has_stop (bool)
        invalid_chars (set)
        non_standard (set)
        is_empty (bool)
    """
    seq = record.sequence
    observed = set(seq)  # distinct characters, computed once for both checks

    has_x = 'X' in seq
    has_stop = '*' in seq
    is_empty = len(seq) == 0

    errors = []
    # Invalid characters are anything not in VALID_CHARS.
    invalid_chars = observed - VALID_CHARS
    if invalid_chars:
        errors.append(f"Invalid character(s): {', '.join(sorted(invalid_chars))}")
    if is_empty:
        errors.append("Empty sequence")

    warnings = []
    if has_x:
        warnings.append("Contains unknown residue 'X'")
    if has_stop:
        warnings.append("Contains stop symbol '*'")

    non_standard = observed & NON_STD
    if non_standard:
        warnings.append(f"Non-standard amino acid(s): {', '.join(sorted(non_standard))} (B=Asx, Z=Glx, J=Xle)")

    return {
        "valid": len(errors) == 0,
        "errors": errors,
        "warnings": warnings,
        "has_x": has_x,
        "has_stop": has_stop,
        "invalid_chars": invalid_chars,
        "non_standard": non_standard,
        "is_empty": is_empty,
    }
|
|
131
|
+
|
|
132
|
+
def validate_file(file_path: str) -> Tuple[List[FastaRecord], List[Dict]]:
    """
    Read a FASTA file completely, then validate every record in it.

    Returns (records, validation_results); the two lists are index-aligned,
    so validation_results[i] describes records[i].
    """
    parsed = list(read_fasta(file_path))
    outcomes = list(map(validate_record, parsed))
    return parsed, outcomes
|