profact 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
profact/__init__.py ADDED
File without changes
profact/cli.py ADDED
@@ -0,0 +1,209 @@
1
+ import argparse
2
+ import sys
3
+ from pathlib import Path
4
+ from .parser import read_fasta, validate_record, FastaParseError
5
+ from .duplicates import analyze_duplicates
6
+ from .compare import compare_files
7
+ from .stats import analyze_stats
8
+ from .reporter import (
9
+ stats_to_text,
10
+ stats_to_tsv,
11
+ stats_to_json,
12
+ stats_to_html,
13
+ format_validation_report,
14
+ build_full_report_data,
15
+ full_report_to_text,
16
+ full_report_to_tsv,
17
+ full_report_to_json,
18
+ full_report_to_html,
19
+ format_duplicates_report,
20
+ format_compare_report,
21
+ )
22
+
23
+
24
def write_output(output_str, output_file):
    """Write a rendered report to a file, or to stdout when no file is given.

    Args:
        output_str: The complete report text.
        output_file: Destination path, or a falsy value (``None`` / ``""``)
            to print the report to standard output instead.

    Missing parent directories of ``output_file`` are created. The file is
    always written as UTF-8 so the result does not depend on the platform's
    default encoding.
    """
    if not output_file:
        print(output_str)
        return

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: the locale default (e.g. cp1252 on Windows) can raise
    # on non-ASCII header text and makes reports differ between machines.
    output_path.write_text(output_str, encoding="utf-8")
31
+
32
+
33
+
34
def cmd_validate(args):
    """Run the ``validate`` sub-command and terminate with a status code.

    Reads every record from ``args.fasta_file``, validates each one, renders
    the result in ``args.format`` and writes it to ``args.output`` (stdout
    when unset).

    This function never returns; it always calls ``sys.exit`` with:
        0 - every record is valid
        1 - at least one record is invalid
        2 - the input could not be read or parsed
    """
    file_path, output_format, output_file = args.fasta_file, args.format, args.output

    try:
        records = list(read_fasta(file_path))
    except FastaParseError as exc:
        print(f"ERROR: Invalid FASTA format: {exc}", file=sys.stderr)
        sys.exit(2)
    except Exception as exc:
        # Catch-all at the CLI boundary: any other read failure (including a
        # missing file) is reported as a plain error line with exit code 2.
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(2)

    def describe(rec):
        # Flatten one record and its validation outcome into a report row.
        outcome = validate_record(rec)
        return {
            "id": rec.id,
            "description": rec.description,
            "sequence_length": len(rec.sequence),
            "valid": outcome["valid"],
            "errors": outcome["errors"],
            "warnings": outcome["warnings"],
            "has_x": outcome["has_x"],
            "has_stop": outcome["has_stop"],
            "is_empty": outcome["is_empty"],
            # Sets are sorted so the report is stable between runs.
            "invalid_chars": sorted(outcome["invalid_chars"]),
            "non_standard": sorted(outcome["non_standard"]),
        }

    validated = [describe(rec) for rec in records]

    def count_where(flag):
        # Number of rows whose ``flag`` entry is truthy.
        return sum(1 for row in validated if row[flag])

    total = len(validated)
    valid_count = count_where("valid")
    invalid_count = total - valid_count

    summary = {
        "total_records": total,
        "valid_records": valid_count,
        "invalid_records": invalid_count,
        "records_with_X": count_where("has_x"),
        "records_with_stop": count_where("has_stop"),
        "empty_records": count_where("is_empty"),
    }

    report = format_validation_report(file_path, validated, summary, output_format)
    write_output(report, output_file)
    sys.exit(1 if invalid_count else 0)
91
+
92
+
93
def cmd_duplicates(args):
    """Run the ``duplicates`` sub-command and terminate with a status code.

    Analyses ``args.fasta_file`` for duplicate IDs and identical sequences,
    renders the result in ``args.format`` and writes it to ``args.output``
    (stdout when unset).

    This function never returns; it always calls ``sys.exit`` with:
        0 - the report was produced
        2 - the input could not be read or parsed
    """
    try:
        data = analyze_duplicates(args.fasta_file)
    except (OSError, UnicodeDecodeError, EOFError, FastaParseError) as e:
        # OSError covers a missing file as well as permission problems and a
        # corrupt gzip header; EOFError is a truncated gzip stream and
        # UnicodeDecodeError is undecodable text. All of them are input
        # problems and get a clean error line instead of a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    output_str = format_duplicates_report(data, args.fasta_file, args.format)

    write_output(output_str, args.output)
    sys.exit(0)
104
+
105
+
106
def cmd_compare(args):
    """Run the ``compare`` sub-command and terminate with a status code.

    Compares ``args.old_fasta`` against ``args.new_fasta``, renders the result
    in ``args.format`` and writes it to ``args.output`` (stdout when unset).

    This function never returns; it always calls ``sys.exit`` with:
        0 - no differences were found
        1 - at least one difference was found
        2 - an input could not be read or parsed
    """
    try:
        data = compare_files(args.old_fasta, args.new_fasta)
    except (OSError, UnicodeDecodeError, EOFError, FastaParseError) as e:
        # OSError covers a missing file as well as permission problems and a
        # corrupt gzip header; EOFError is a truncated gzip stream and
        # UnicodeDecodeError is undecodable text. All of them are input
        # problems and get a clean error line instead of a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    output_str = format_compare_report(data, args.format)
    write_output(output_str, args.output)

    # Any non-zero counter among these means the two files differ.
    summary = data["summary"]
    change_counts = [
        summary["added_count"],
        summary["removed_count"],
        summary["changed_sequence_count"],
        summary["added_duplicate_cluster_count"],
        summary["removed_duplicate_cluster_count"],
        summary["changed_duplicate_cluster_count"],
    ]
    sys.exit(1 if any(change_counts) else 0)
127
+
128
+
129
def cmd_stats(args):
    """Run the ``stats`` sub-command and terminate with a status code.

    Computes statistics for ``args.fasta_file``, renders them in
    ``args.format`` (text when the format is not json/tsv/html) and writes
    them to ``args.output`` (stdout when unset).

    This function never returns; it always calls ``sys.exit`` with:
        0 - the report was produced
        2 - the input could not be read or parsed
    """
    try:
        data = analyze_stats(args.fasta_file)
    except (OSError, UnicodeDecodeError, EOFError, FastaParseError) as e:
        # OSError covers a missing file as well as permission problems and a
        # corrupt gzip header; EOFError is a truncated gzip stream and
        # UnicodeDecodeError is undecodable text. All of them are input
        # problems and get a clean error line instead of a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    renderers = {
        "json": stats_to_json,
        "tsv": stats_to_tsv,
        "html": stats_to_html,
    }
    # Anything that is not an explicitly listed format renders as plain text.
    render = renderers.get(args.format, stats_to_text)

    write_output(render(data), args.output)
    sys.exit(0)
147
+
148
+
149
def cmd_report(args):
    """Run the ``report`` sub-command and terminate with a status code.

    Builds the full report data for ``args.fasta_file``, renders it in
    ``args.format`` (text when the format is not json/tsv/html) and writes it
    to ``args.output`` (stdout when unset).

    This function never returns; it always calls ``sys.exit`` with:
        0 - the report was produced
        2 - the input could not be read or parsed
    """
    try:
        data = build_full_report_data(args.fasta_file)
    except (OSError, UnicodeDecodeError, EOFError, FastaParseError) as e:
        # OSError covers a missing file as well as permission problems and a
        # corrupt gzip header; EOFError is a truncated gzip stream and
        # UnicodeDecodeError is undecodable text. All of them are input
        # problems and get a clean error line instead of a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    renderers = {
        "json": full_report_to_json,
        "tsv": full_report_to_tsv,
        "html": full_report_to_html,
    }
    # Anything that is not an explicitly listed format renders as plain text.
    render = renderers.get(args.format, full_report_to_text)

    write_output(render(data), args.output)
    sys.exit(0)
167
+
168
+
169
def main():
    """Parse the command line and dispatch to the selected sub-command.

    Registers the ``validate``, ``duplicates``, ``compare``, ``stats`` and
    ``report`` sub-commands (a sub-command is required), then calls the
    handler stored in ``args.func`` with the parsed arguments.
    """
    parser = argparse.ArgumentParser(prog="profact", description="Protein FASTA analysis tool")
    subparsers = parser.add_subparsers(dest="command", required=True)

    def add_single_input(sub):
        # Sub-commands that analyse exactly one FASTA file.
        sub.add_argument("-i", "--input", dest="fasta_file", required=True, help="Input FASTA file (.fa, .fasta, .faa, .gz)")

    def add_file_pair(sub):
        # ``compare`` takes two files instead of a single input.
        sub.add_argument("-f1", "--file_1", metavar="FILE_1", dest="old_fasta", required=True, help="First FASTA file")
        sub.add_argument("-f2", "--file_2", metavar="FILE_2", dest="new_fasta", required=True, help="Second FASTA file")

    basic_formats = ["text", "json", "tsv"]
    rich_formats = ["text", "json", "tsv", "html"]

    # (name, help text, input-argument installer, allowed formats, handler),
    # listed in the order the sub-commands appear in ``--help``.
    commands = (
        ("validate", "Validate protein FASTA records", add_single_input, basic_formats, cmd_validate),
        ("duplicates", "Detect duplicate IDs and identical sequences", add_single_input, basic_formats, cmd_duplicates),
        ("compare", "Compare two protein FASTA files", add_file_pair, basic_formats, cmd_compare),
        ("stats", "Compute protein FASTA statistics", add_single_input, rich_formats, cmd_stats),
        ("report", "Generate full protein FASTA report", add_single_input, rich_formats, cmd_report),
    )

    for name, help_text, add_inputs, formats, handler in commands:
        sub = subparsers.add_parser(name, help=help_text)
        add_inputs(sub)
        sub.add_argument("-fmt", "--format", choices=formats, default="text", help="Output format")
        sub.add_argument("-o", "--output", help="Output file (default: stdout)")
        sub.set_defaults(func=handler)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
profact/compare.py ADDED
@@ -0,0 +1,128 @@
1
+ """Comparison of two protein FASTA files."""
2
+ from collections import defaultdict
3
+ from typing import Dict, Iterable, List, Any, Tuple
4
+
5
+ from .parser import FastaRecord, read_fasta
6
+ from .duplicates import duplicate_summary
7
+
8
+
9
def records_by_id(records: Iterable[FastaRecord]) -> Dict[str, FastaRecord]:
    """
    Index records by their ID.

    When an ID occurs more than once, the record seen last replaces the
    earlier ones; the key keeps the position of its first occurrence.
    """
    index = {}
    for rec in records:
        index[rec.id] = rec
    return index
17
+
18
+
19
def sequence_cluster_signature(records: Iterable[FastaRecord]) -> Dict[str, Tuple[str, ...]]:
    """Map each sequence shared by two or more records to its sorted ID tuple."""
    ids_for_sequence = {}
    for rec in records:
        ids_for_sequence.setdefault(rec.sequence, []).append(rec.id)

    signature = {}
    for seq, members in ids_for_sequence.items():
        if len(members) < 2:
            # A sequence seen only once is not a duplicate cluster.
            continue
        signature[seq] = tuple(sorted(members))
    return signature
29
+
30
+
31
def compare_duplicate_clusters(
    old_records: Iterable[FastaRecord],
    new_records: Iterable[FastaRecord],
) -> Dict[str, Any]:
    """Report duplicate-sequence clusters that were added, removed or changed.

    A cluster is identified by its exact sequence. It counts as "changed"
    when the sequence is duplicated in both inputs but the tuple of member
    IDs differs.
    """
    before = sequence_cluster_signature(old_records)
    after = sequence_cluster_signature(new_records)

    def cluster_entry(seq, members):
        return {"sequence": seq, "length": len(seq), "ids": list(members)}

    added = [cluster_entry(seq, after[seq]) for seq in sorted(set(after) - set(before))]
    removed = [cluster_entry(seq, before[seq]) for seq in sorted(set(before) - set(after))]

    changed = []
    for seq in sorted(set(before) & set(after)):
        prev_members, curr_members = before[seq], after[seq]
        if prev_members == curr_members:
            continue
        prev_set, curr_set = set(prev_members), set(curr_members)
        changed.append({
            "sequence": seq,
            "length": len(seq),
            "old_ids": list(prev_members),
            "new_ids": list(curr_members),
            "added_ids": sorted(curr_set - prev_set),
            "removed_ids": sorted(prev_set - curr_set),
        })

    return {
        "added_clusters": added,
        "removed_clusters": removed,
        "changed_clusters": changed,
    }
68
+
69
+
70
def compare_records(old_records: Iterable[FastaRecord], new_records: Iterable[FastaRecord]) -> Dict[str, Any]:
    """Diff two collections of FASTA records by ID and exact sequence.

    Both inputs are materialised once, so generators are accepted. The result
    holds a ``summary`` of counts plus the detailed lists of added, removed
    and changed records and the duplicate analysis of each side.
    """
    old_list, new_list = list(old_records), list(new_records)
    old_by_id, new_by_id = records_by_id(old_list), records_by_id(new_list)

    only_new = set(new_by_id) - set(old_by_id)
    only_old = set(old_by_id) - set(new_by_id)
    shared = set(old_by_id) & set(new_by_id)

    changed_sequences = []
    for seq_id in sorted(shared):
        before = old_by_id[seq_id].sequence
        after = new_by_id[seq_id].sequence
        if before == after:
            continue
        changed_sequences.append({
            "id": seq_id,
            "old_length": len(before),
            "new_length": len(after),
        })

    # Subset of the changed records whose length differs as well; the entries
    # are the same dict objects that appear in ``changed_sequences``.
    changed_lengths = [
        entry for entry in changed_sequences
        if entry["old_length"] != entry["new_length"]
    ]

    duplicate_changes = compare_duplicate_clusters(old_list, new_list)

    summary = {
        "old_total_records": len(old_list),
        "new_total_records": len(new_list),
        "added_count": len(only_new),
        "removed_count": len(only_old),
        "changed_sequence_count": len(changed_sequences),
        "changed_length_count": len(changed_lengths),
        "added_duplicate_cluster_count": len(duplicate_changes["added_clusters"]),
        "removed_duplicate_cluster_count": len(duplicate_changes["removed_clusters"]),
        "changed_duplicate_cluster_count": len(duplicate_changes["changed_clusters"]),
    }
    added_ids = [
        {"id": seq_id, "new_length": len(new_by_id[seq_id].sequence)}
        for seq_id in sorted(only_new)
    ]
    removed_ids = [
        {"id": seq_id, "old_length": len(old_by_id[seq_id].sequence)}
        for seq_id in sorted(only_old)
    ]

    return {
        "summary": summary,
        "added_ids": added_ids,
        "removed_ids": removed_ids,
        "changed_sequences": changed_sequences,
        "changed_lengths": changed_lengths,
        "old_duplicates": duplicate_summary(old_list),
        "new_duplicates": duplicate_summary(new_list),
        "duplicate_cluster_changes": duplicate_changes,
    }
124
+
125
+
126
def compare_files(old_file: str, new_file: str) -> Dict[str, Any]:
    """Parse both FASTA files and return the comparison of their records."""
    old_records = read_fasta(old_file)
    new_records = read_fasta(new_file)
    return compare_records(old_records, new_records)
profact/duplicates.py ADDED
@@ -0,0 +1,57 @@
1
+ """Duplicate detection for protein FASTA records."""
2
+ from collections import Counter, defaultdict
3
+ from typing import Dict, Iterable, List, Any
4
+
5
+ from .parser import FastaRecord, read_fasta
6
+
7
+
8
def find_duplicate_ids(records: Iterable[FastaRecord]) -> Dict[str, int]:
    """Map every ID seen at least twice to the number of times it occurs."""
    tally = {}
    for rec in records:
        tally[rec.id] = tally.get(rec.id, 0) + 1
    return {seq_id: seen for seq_id, seen in tally.items() if seen >= 2}
12
+
13
+
14
def cluster_identical_sequences(records: Iterable[FastaRecord]) -> List[Dict[str, Any]]:
    """
    Cluster records that share exactly the same sequence.

    Sequences carried by a single record are left out. Each cluster lists its
    IDs in sorted order, and clusters are ordered by (length, ids), so the
    result is deterministic.
    """
    ids_for_sequence = {}
    for rec in records:
        ids_for_sequence.setdefault(rec.sequence, []).append(rec.id)

    clusters = [
        {
            "sequence": seq,
            "length": len(seq),
            "count": len(members),
            "ids": sorted(members),
        }
        for seq, members in ids_for_sequence.items()
        if len(members) >= 2
    ]
    clusters.sort(key=lambda entry: (entry["length"], entry["ids"]))
    return clusters
36
+
37
+
38
def duplicate_summary(records: Iterable[FastaRecord]) -> Dict[str, Any]:
    """Summarise duplicate IDs and identical-sequence clusters.

    Args:
        records: Records to analyse; consumed exactly once, so a generator
            is accepted.

    Returns:
        A dict with the total record count, the duplicated IDs with their
        occurrence counts, and the identical-sequence clusters together with
        the number of clusters and the number of records they contain.
    """
    materialized = list(records)
    duplicate_ids = find_duplicate_ids(materialized)
    clusters = cluster_identical_sequences(materialized)
    # Count records, not distinct IDs: two records that share both ID and
    # sequence are two members of a cluster, which a set of IDs would
    # collapse into one and so disagree with the cluster's own "count".
    records_in_clusters = sum(cluster["count"] for cluster in clusters)

    return {
        "total_records": len(materialized),
        "duplicate_id_count": len(duplicate_ids),
        "duplicate_ids": duplicate_ids,
        "identical_sequence_cluster_count": len(clusters),
        "records_in_identical_sequence_clusters": records_in_clusters,
        "identical_sequence_clusters": clusters,
    }
53
+
54
+
55
def analyze_duplicates(file_path: str) -> Dict[str, Any]:
    """Parse the FASTA file at ``file_path`` and summarise its duplicates."""
    records = read_fasta(file_path)
    return duplicate_summary(records)
profact/parser.py ADDED
@@ -0,0 +1,139 @@
1
+ """FASTA parsing and validation for protein sequences."""
2
+ import gzip
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Iterator, Dict, Set, List, Tuple, Optional
6
+ import re
7
+
8
# One-letter codes of the 20 standard amino acids.
STD_AA = {*"ACDEFGHIKLMNPQRSTVWY"}
# Accepted, but reported as warnings: X = unknown residue, * = stop.
WARN_AA = {"X", "*"}
# Non-standard codes that are sometimes seen (B, Z, J); also warnings.
NON_STD = {"B", "Z", "J"}
# Every character that does not make a sequence invalid.
VALID_CHARS = STD_AA.union(WARN_AA, NON_STD)
16
+
17
class FastaRecord:
    """One FASTA entry: its ID, the rest of the header line, and the sequence."""

    # Fixed attribute set; instances carry no per-instance ``__dict__``.
    __slots__ = ("id", "description", "sequence")

    def __init__(self, id: str, description: str, sequence: str):
        self.id, self.description, self.sequence = id, description, sequence

    def __repr__(self):
        # Show the length rather than the sequence itself.
        return f"FastaRecord(id={self.id!r}, seq_len={len(self.sequence)})"
26
+
27
class FastaParseError(Exception):
    """Signals that the input is not well-formed FASTA."""
30
+
31
def read_fasta(file_path: str) -> Iterator[FastaRecord]:
    """
    Lazily yield FastaRecord objects from a FASTA file (plain text or .gz).

    The first whitespace-delimited token of a header becomes the record ID and
    the remainder becomes the description. Blank lines are skipped and all
    whitespace inside sequence lines is removed. An empty file yields nothing.

    Because this is a generator, the errors below surface when iteration
    starts, not when the function is called.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        FastaParseError: if a header is empty after '>', if sequence data
            appears before the first header, or if the file has lines but
            no record at all.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    is_gzip = path.suffix == '.gz'
    open_func = gzip.open if is_gzip else open
    mode = 'rt' if is_gzip else 'r'

    with open_func(path, mode) as f:
        current_id = None
        current_desc = ""
        current_seq_lines = []
        line_num = 0  # stays 0 only when the file has no lines at all

        for line_num, line in enumerate(f, start=1):
            line = line.rstrip('\n\r')
            if not line:
                continue

            if line[0] == '>':
                # A new header closes the record collected so far.
                if current_id is not None:
                    yield FastaRecord(current_id, current_desc, ''.join(current_seq_lines))
                header = line[1:].strip()
                if not header:
                    raise FastaParseError(f"Empty header after '>' at line {line_num}")
                parts = header.split(maxsplit=1)
                current_id = parts[0]
                current_desc = parts[1] if len(parts) > 1 else ""
                current_seq_lines = []
                continue

            # Sequence line: remove all whitespace.
            residues = re.sub(r'\s+', '', line)
            if current_id is None:
                # Residues that belong to no record used to be dropped
                # silently once a header followed; report them instead.
                # Whitespace-only lines stay harmless.
                if residues:
                    raise FastaParseError(
                        f"Sequence data before first header at line {line_num} (missing '>'?)"
                    )
                continue
            current_seq_lines.append(residues)

        if current_id is not None:
            # Last record in the file.
            yield FastaRecord(current_id, current_desc, ''.join(current_seq_lines))
        elif line_num:
            # The file had lines (blank or whitespace only) but no header.
            raise FastaParseError("File ended without a record (missing '>'?)")
80
+
81
def validate_record(record: FastaRecord) -> Dict[str, object]:
    """
    Validate a single protein FASTA record.

    A record is invalid when its sequence is empty or contains a character
    outside VALID_CHARS. 'X', '*' and the NON_STD codes only produce warnings.
    Matching is case-sensitive.

    Returns a dict with:
        valid (bool)          - True when ``errors`` is empty
        errors (list of str)
        warnings (list of str)
        has_x (bool)
        has_stop (bool)
        invalid_chars (set)
        non_standard (set)
        is_empty (bool)
    """
    errors = []
    warnings = []
    seq = record.sequence
    # Build the set of distinct characters once; both checks below use it.
    seq_chars = set(seq)
    has_x = 'X' in seq_chars
    has_stop = '*' in seq_chars
    is_empty = len(seq) == 0

    # Find invalid characters (anything not in VALID_CHARS)
    invalid_chars = seq_chars - VALID_CHARS
    if invalid_chars:
        errors.append(f"Invalid character(s): {', '.join(sorted(invalid_chars))}")

    if is_empty:
        errors.append("Empty sequence")

    # Warnings
    if has_x:
        warnings.append("Contains unknown residue 'X'")
    if has_stop:
        warnings.append("Contains stop symbol '*'")

    non_standard = seq_chars & NON_STD
    if non_standard:
        warnings.append(f"Non-standard amino acid(s): {', '.join(sorted(non_standard))} (B=Asx, Z=Glx, J=Xle)")

    return {
        "valid": not errors,
        "errors": errors,
        "warnings": warnings,
        "has_x": has_x,
        "has_stop": has_stop,
        "invalid_chars": invalid_chars,
        "non_standard": non_standard,
        "is_empty": is_empty,
    }
131
+
132
def validate_file(file_path: str) -> Tuple[List[FastaRecord], List[Dict]]:
    """
    Read a whole FASTA file, then validate every record in it.

    Returns two parallel lists, (records, validation_results): the entry at
    index i of the second list is the validation of the record at index i of
    the first.
    """
    parsed = [rec for rec in read_fasta(file_path)]
    outcomes = list(map(validate_record, parsed))
    return parsed, outcomes