ORForise 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ import collections
2
+ import logging
3
+
4
+
5
+
6
def _make_feature(seqid, source, type_, start, end, score, strand, phase, attributes):
    """Render one nine-column, tab-separated feature line.

    The first eight arguments are interpolated unchanged into columns 1-8.
    ``attributes`` is a mapping; each item becomes a ``key=value`` pair and
    the pairs are joined with ``;`` (in the mapping's iteration order) to
    form column 9. The returned string is terminated by a newline.
    """
    attribute_column = ';'.join(f"{key}={value}" for key, value in attributes.items())
    columns = (seqid, source, type_, start, end, score, strand, phase, attribute_column)
    return '\t'.join(f"{column}" for column in columns) + '\n'
11
+
12
+
13
def parse_blast_tab6(path, genome_seq, gene_ident=None):
    """Parse a 12-column tab-separated BLAST hit table.

    Blank lines and lines starting with ``#`` are ignored. Lines with fewer
    than 12 tab-separated fields, or with non-integer ``sstart``/``send``
    values, are skipped with a logged warning.

    Args:
        path: Path of the BLAST tabular file to read.
        genome_seq: Accepted for signature parity; not used by this function.
        gene_ident: Accepted for signature parity; not used by this function.

    Returns:
        An ``OrderedDict`` keyed by ``"start,end"`` (the smaller and larger
        subject coordinate). Each value is the list
        ``[strand, '.', 'similarity', attrs]`` where ``strand`` is ``'+'``
        when ``sstart <= send`` and ``'-'`` otherwise. A later hit with the
        same coordinates replaces the earlier entry.
    """
    hits = collections.OrderedDict()
    hit_index = 0
    with open(path, 'r') as handle:
        for line_no, raw in enumerate(handle, 1):
            record = raw.strip()
            if record == '' or record.startswith('#'):
                continue

            fields = record.split('\t')
            n_fields = len(fields)
            if n_fields < 12:
                logging.warning(f"Line {line_no}: unexpected BLAST line with {n_fields} columns")
                continue

            (qseqid, _sseqid, pident, length, _mismatch, _gapopen,
             qstart, qend, sstart, send, evalue, bitscore) = fields[:12]
            try:
                subject_from, subject_to = int(sstart), int(send)
            except ValueError:
                logging.warning(f"Line {line_no}: non-integer coordinates in BLAST sstart/send")
                continue

            # Subject coordinates running high-to-low denote a reverse-strand hit.
            orientation = '-' if subject_from > subject_to else '+'
            low, high = sorted((subject_from, subject_to))

            hit_attrs = {
                'ID': f'blast_hit{hit_index}',
                'Target': f'{qseqid} {qstart} {qend}',
                'pident': pident,
                'length': length,
                'evalue': evalue,
                'bitscore': bitscore,
            }
            hits[f"{low},{high}"] = [orientation, '.', 'similarity', hit_attrs]
            hit_index += 1
    return hits
46
+
47
+
48
def parse_abricate(path, genome_seq, gene_ident=None):
    """Parse an Abricate-style tab-separated report into a coordinate map.

    The most recent line starting with ``#`` is taken as the header; its
    tab-separated fields (including the leading ``#`` on the first one) name
    the columns of the rows that follow. Any content before the first header
    line is ignored. Rows whose field count differs from the header, or whose
    ``START``/``END`` values are not integers, are skipped with a logged
    warning.

    Args:
        path: Path of the report file to read.
        genome_seq: Accepted for signature parity; not used by this function.
        gene_ident: Accepted for signature parity; not used by this function.

    Returns:
        An ``OrderedDict`` keyed by ``"START,END"``. Each value is a dict
        with the keys ``seqid``, ``start``, ``end``, ``strand``, ``gene``,
        ``accession``, ``database``, ``identity``, ``coverage``, ``product``
        and ``resistance``. Missing or empty ``ACCESSION``, ``DATABASE``,
        ``PRODUCT`` and ``RESISTANCE`` values become ``'unknown'``. A later
        row with the same coordinates replaces the earlier entry.
    """
    # NOTE(review): values here are plain dicts, whereas parse_blast_tab6 and
    # parse_genemark store [strand, '.', type, attrs] lists - confirm that
    # callers of TabToGFF handle both shapes.
    results = collections.OrderedDict()
    header = None
    # A plain (unparenthesised) ``with`` keeps this module importable on
    # interpreters that predate parenthesised context managers.
    with open(path, 'r') as fh:
        for i, line in enumerate(fh, 1):
            line = line.rstrip('\n')
            if not line:
                continue
            if line.startswith('#'):
                header = line.split('\t')
                continue
            if header is None:
                # skip any pre-header content until a header line is encountered
                continue

            parts = line.split('\t')
            if len(parts) != len(header):
                logging.warning(f"Line {i}: unexpected number of columns in Abricate line")
                continue
            row = dict(zip(header, parts))

            try:
                start = int(row.get('START', '0'))
                end = int(row.get('END', '0'))
            except ValueError:
                logging.warning(f"Line {i}: invalid START/END in Abricate line")
                continue

            results[f"{start},{end}"] = {
                'seqid': row.get('SEQUENCE'),
                'start': start,
                'end': end,
                'strand': row.get('STRAND'),
                'gene': row.get('GENE'),
                'accession': row.get('ACCESSION') or 'unknown',
                'database': row.get('DATABASE') or 'unknown',
                'identity': row.get('%IDENTITY'),
                'coverage': row.get('%COVERAGE'),
                # Fallback was previously misspelled as 'unkown'.
                'product': row.get('PRODUCT') or 'unknown',
                'resistance': row.get('RESISTANCE') or 'unknown',
            }
    return results
102
+
103
+
104
def parse_genemark(path, genome_seq, gene_ident=None):
    """Parse whitespace-separated GeneMark prediction rows.

    A row is kept only when it has at least three whitespace-separated
    tokens and the first two parse as integers; every other line (blank
    lines, headers, free text) is skipped silently.

    Args:
        path: Path of the GeneMark output file to read.
        genome_seq: Accepted for signature parity; not used by this function.
        gene_ident: Accepted for signature parity; not used by this function.

    Returns:
        An ``OrderedDict`` keyed by ``"start,stop"`` (first and second token,
        in file order). Each value is the list ``[strand, '.', 'CDS', attrs]``
        where ``strand`` is ``'-'`` when the third token contains
        ``complement`` and ``'+'`` otherwise. A later row with the same
        coordinates replaces the earlier entry.
    """
    predictions = collections.OrderedDict()
    kept = 0
    with open(path, 'r') as handle:
        for raw in handle:
            # split() with no separator already discards surrounding
            # whitespace, so blank lines simply yield no tokens.
            tokens = raw.split()
            if len(tokens) < 3:
                continue
            try:
                left = int(tokens[0])
                right = int(tokens[1])
            except ValueError:
                continue
            orientation = '-' if 'complement' in tokens[2] else '+'
            predictions[f"{left},{right}"] = [
                orientation,
                '.',
                'CDS',
                {'ID': f'genemark_hit{kept}', 'tool': 'GeneMark'},
            ]
            kept += 1
    return predictions
129
+
130
+
131
def TabToGFF(input_file, genome_seq, gene_ident='CDS', fmt='blast'):
    """Dispatch ``input_file`` to the parser matching ``fmt``.

    ``fmt`` is compared case-insensitively. Recognised names are
    ``blast``/``blast_tab6``/``tab6``, ``abricate``/``abricate_tsv``/
    ``abricate_format`` and ``genemark``/``gene_mark``.

    Returns:
        Whatever the selected parser returns for
        ``(input_file, genome_seq, gene_ident)``.

    Raises:
        ValueError: If ``fmt`` is not one of the recognised names.
    """
    # Should be cleaned up to use consistent format names
    fmt = fmt.lower()
    if fmt in ('blast', 'blast_tab6', 'tab6'):
        parser = parse_blast_tab6
    elif fmt in ('abricate', 'abricate_tsv', 'abricate_format'):
        parser = parse_abricate
    elif fmt in ('genemark', 'gene_mark'):
        parser = parse_genemark
    else:
        raise ValueError(f"Unknown format: {fmt}")
    return parser(input_file, genome_seq, gene_ident)
File without changes
ORForise/utils.py CHANGED
@@ -4,7 +4,7 @@ import collections
4
4
  # Constants
5
5
  SHORT_ORF_LENGTH = 300
6
6
  MIN_COVERAGE = 75
7
- ORForise_Version = 'v1.5.0'
7
+ ORForise_Version = 'v1.6.0'
8
8
 
9
9
 
10
10
  def revCompIterative(watson): # Gets Reverse Complement