protk 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/augustus_to_proteindb.rb +30 -17
- data/bin/interprophet.rb +3 -1
- data/bin/protxml_to_gff.rb +26 -6
- data/lib/protk/error.rb +7 -0
- data/lib/protk/galaxy_util.rb +24 -0
- data/lib/protk/peptide.rb +13 -16
- data/lib/protk/tandem_search_tool.rb +4 -25
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7329f51a45b5449ec979e76aca5727c6714a5bc8
|
4
|
+
data.tar.gz: e96f553b27c61c7ba1935d379e01086e9cb00725
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb933aa9ce0cc6fabb19b0a731bb8d74f23456937ec4d08973477f8f956eb733fcb44b26b66fc61242f5ea6c617d0c3a178b83f16dc568ad5f330db9dcd27c1d
|
7
|
+
data.tar.gz: 5b2b370cea53d3a3ec9eee9d5916df8f910ef12181c0bce9b660c1d82d042e00a5b66ce7fcba6a864cd1a6266aa2fbffec998ae26700f4ddd7903ab141ba3241
|
data/bin/augustus_to_proteindb.rb
CHANGED
@@ -110,13 +110,25 @@ def sequence_fasta_header(transcript_line,coding_sequences,scaffold)
|
|
110
110
|
|
111
111
|
tmatch=transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1}).*?ID=(.*?);/)
|
112
112
|
# require 'debugger'; debugger
|
113
|
-
tstart=
|
114
|
-
|
115
|
-
|
116
|
-
|
113
|
+
tstart,tend,tstrand = transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1})/).captures
|
114
|
+
|
115
|
+
# tstart=tmatch[1]
|
116
|
+
# tend=tmatch[2]
|
117
|
+
tsidfield = transcript_line.split("\t")[8]
|
118
|
+
tid = nil
|
119
|
+
if tsidfield =~ /ID=/
|
120
|
+
tid = tsidfield.match(/ID=(.*?);/).captures[0]
|
121
|
+
else
|
122
|
+
tid = tsidfield.gsub(" ","_").gsub(";","_")
|
123
|
+
end
|
124
|
+
|
125
|
+
# require 'byebug';byebug
|
126
|
+
|
127
|
+
tstrandfr="fwd"
|
128
|
+
tstrandfr = "rev" if tstrand=="-"
|
117
129
|
|
118
|
-
tid=tmatch[4]
|
119
|
-
header=">lcl|#{sanitize_scaffold_idstring(scaffold)}_#{
|
130
|
+
# tid=tmatch[4]
|
131
|
+
header=">lcl|#{sanitize_scaffold_idstring(scaffold)}_#{tstrandfr}_#{tid} #{tstart}|#{tend}"
|
120
132
|
if $add_transcript_info
|
121
133
|
coding_sequences.each { |coding_sequence| header << " #{cds_to_header_text(coding_sequence,tid)}" }
|
122
134
|
end
|
@@ -135,13 +147,14 @@ end
|
|
135
147
|
def parse_gene(gene_lines)
|
136
148
|
|
137
149
|
geneid=gene_lines[0].match(/start gene (.*)/)[1]
|
150
|
+
scaffold_id = gene_lines[1].split("\t")[0]
|
138
151
|
transcripts=get_transcript_lines(gene_lines)
|
139
152
|
coding_sequences=get_cds_lines(gene_lines)
|
140
153
|
proteins=get_protein_sequence_lines(gene_lines)
|
141
154
|
fasta_string=""
|
142
155
|
throw "transcripts/protein mismatch" unless transcripts.length == proteins.length
|
143
156
|
transcripts.each_with_index do |ts, i|
|
144
|
-
fh=sequence_fasta_header(ts,coding_sequences
|
157
|
+
fh=sequence_fasta_header(ts,coding_sequences,scaffold_id)
|
145
158
|
fasta_string << "#{fh}\n"
|
146
159
|
ps=protein_sequence(proteins[i])
|
147
160
|
fasta_string << "#{ps}\n"
|
@@ -152,14 +165,14 @@ def parse_gene(gene_lines)
|
|
152
165
|
fasta_string
|
153
166
|
end
|
154
167
|
|
155
|
-
def capture_scaffold(line)
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
end
|
168
|
+
# def capture_scaffold(line)
|
169
|
+
# if line =~ /-- prediction on sequence number.*?name = (.*)\)/
|
170
|
+
# $current_scaffold=line.match(/-- prediction on sequence number.*?name = (.*)\)/)[1]
|
171
|
+
# if ( $print_progress)
|
172
|
+
# puts $current_scaffold
|
173
|
+
# end
|
174
|
+
# end
|
175
|
+
# end
|
163
176
|
|
164
177
|
def capture_gene_start(line)
|
165
178
|
if line =~ /# start gene/
|
@@ -174,14 +187,14 @@ def at_gene_end(line)
|
|
174
187
|
return false
|
175
188
|
end
|
176
189
|
|
177
|
-
$current_scaffold=""
|
190
|
+
# $current_scaffold=""
|
178
191
|
gene_lines=[]
|
179
192
|
$capturing_gene=false
|
180
193
|
|
181
194
|
|
182
195
|
File.open(inname).each_with_index do |line, line_i|
|
183
196
|
line.chomp!
|
184
|
-
capture_scaffold(line)
|
197
|
+
# capture_scaffold(line)
|
185
198
|
capture_gene_start(line)
|
186
199
|
|
187
200
|
if at_gene_end(line)
|
data/bin/interprophet.rb
CHANGED
@@ -65,9 +65,11 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
|
|
65
65
|
|
66
66
|
if for_galaxy
|
67
67
|
inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
|
68
|
+
input_files = inputs.collect { |e| e.staged_path }
|
69
|
+
else
|
70
|
+
input_files = inputs
|
68
71
|
end
|
69
72
|
|
70
|
-
input_files = inputs.collect { |e| e.staged_path }
|
71
73
|
|
72
74
|
cmd << " #{input_files.join(" ")} #{output_file}"
|
73
75
|
|
data/bin/protxml_to_gff.rb
CHANGED
@@ -11,12 +11,12 @@ require 'protk/gffdb'
|
|
11
11
|
require 'protk/protein'
|
12
12
|
require 'protk/peptide'
|
13
13
|
require 'protk/tool'
|
14
|
+
require 'protk/error'
|
14
15
|
require 'libxml'
|
15
16
|
require 'bio'
|
16
17
|
|
17
18
|
include LibXML
|
18
19
|
|
19
|
-
|
20
20
|
class NoGFFEntryFoundError < StandardError
|
21
21
|
end
|
22
22
|
|
@@ -26,6 +26,9 @@ end
|
|
26
26
|
class MultipleGFFEntriesForProteinError < StandardError
|
27
27
|
end
|
28
28
|
|
29
|
+
class GFFIDRegexNotMatchedError < ProtkError
|
30
|
+
end
|
31
|
+
|
29
32
|
def parse_proteins(protxml_file)
|
30
33
|
protxml_parser=XML::Parser.file(protxml_file)
|
31
34
|
protxml_doc=protxml_parser.parse
|
@@ -35,7 +38,14 @@ end
|
|
35
38
|
|
36
39
|
def protein_id_to_gffid(protein_id,gff_idregex)
|
37
40
|
return protein_id if gff_idregex.nil?
|
38
|
-
|
41
|
+
|
42
|
+
m = protein_id.match(/#{gff_idregex}/)
|
43
|
+
if m
|
44
|
+
return m.captures[0]
|
45
|
+
else
|
46
|
+
raise GFFIDRegexNotMatchedError.new("Unable to parse gff_id from #{protein_id} using regex #{gff_idregex}")
|
47
|
+
end
|
48
|
+
|
39
49
|
end
|
40
50
|
|
41
51
|
def protein_id_to_genomeid(protein_id,genome_idregex)
|
@@ -103,6 +113,8 @@ input_protxml=ARGV[0]
|
|
103
113
|
$protk.log "Creating GFFDB", :info
|
104
114
|
gffdb = GFFDB.create(tool.coords_file) if tool.coords_file
|
105
115
|
|
116
|
+
#require 'byebug';byebug
|
117
|
+
|
106
118
|
# genome_db = prepare_fasta(tool.genome,'nucl')
|
107
119
|
$protk.log "Preparing FASTA index", :info
|
108
120
|
prot_db = prepare_fasta(tool.database,'prot')
|
@@ -157,14 +169,22 @@ proteins.each do |protein|
|
|
157
169
|
rescue ProteinNotInDBError
|
158
170
|
$protk.log "No entry for #{parsed_name_for_protdb}", :info
|
159
171
|
rescue MultipleGFFEntriesForProteinError
|
160
|
-
$protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :
|
161
|
-
|
162
|
-
|
172
|
+
$protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :warn
|
173
|
+
# require 'byebug';byebug
|
174
|
+
# puts gff_parent_entries
|
175
|
+
rescue PeptideNotInProteinError => e
|
176
|
+
# This is generally not fatal. It can happen because of Leucine Isoleucine issues
|
177
|
+
#
|
178
|
+
$protk.log "#{e.message}. Parent protein ID #{protein.protein_name}" , :info
|
179
|
+
# require 'byebug';byebug
|
180
|
+
# puts protein.protein_name
|
181
|
+
rescue GFFIDRegexNotMatchedError => e
|
182
|
+
$protk.log e.message, :info
|
163
183
|
end
|
164
184
|
end
|
165
185
|
end
|
166
186
|
|
167
187
|
if num_missing_gff_entries>0
|
168
|
-
$protk.log "Failed to lookup gff entries. Try setting --gff-idregex" if tool.gff_idregex.nil?
|
188
|
+
$protk.log "Failed to lookup gff entries. Try setting --gff-idregex", :error if tool.gff_idregex.nil?
|
169
189
|
end
|
170
190
|
|
data/lib/protk/error.rb
ADDED
data/lib/protk/galaxy_util.rb
CHANGED
@@ -23,6 +23,30 @@ class GalaxyUtil
|
|
23
23
|
end
|
24
24
|
|
25
25
|
|
26
|
+
# Galaxy changes things like @ to __at__ we need to change it back
|
27
|
+
#
|
28
|
+
def self.decode_galaxy_string!(mstring)
|
29
|
+
mstring.gsub!("__at__","@")
|
30
|
+
mstring.gsub!("__oc__","{")
|
31
|
+
mstring.gsub!("__cc__","}")
|
32
|
+
mstring.gsub!("__ob__","[")
|
33
|
+
mstring.gsub!("__cb__","]")
|
34
|
+
mstring.gsub!("__gt__",">")
|
35
|
+
mstring.gsub!("__lt__","<")
|
36
|
+
mstring.gsub!("__sq__","'")
|
37
|
+
mstring.gsub!("__dq__","\"")
|
38
|
+
mstring.gsub!("__cn__","\n")
|
39
|
+
mstring.gsub!("__cr__","\r")
|
40
|
+
mstring.gsub!("__tc__","\t")
|
41
|
+
mstring.gsub!("__pd__","#")
|
42
|
+
|
43
|
+
# For characters not allowed at all by galaxy
|
44
|
+
mstring.gsub!("__pc__","|")
|
45
|
+
|
46
|
+
mstring
|
47
|
+
end
|
48
|
+
|
49
|
+
|
26
50
|
# Unused
|
27
51
|
|
28
52
|
# def self.stage_protxml(input_protxml_path)
|
data/lib/protk/peptide.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
require 'libxml'
|
2
2
|
require 'bio'
|
3
3
|
require 'protk/bio_gff3_extensions'
|
4
|
+
require 'protk/error'
|
5
|
+
|
4
6
|
include LibXML
|
5
7
|
|
6
|
-
class PeptideNotInProteinError <
|
8
|
+
class PeptideNotInProteinError < ProtkError
|
7
9
|
end
|
8
10
|
|
9
11
|
class Peptide
|
@@ -43,11 +45,11 @@ class Peptide
|
|
43
45
|
def coords_in_protein(prot_seq,reverse=false)
|
44
46
|
if reverse
|
45
47
|
pep_index = prot_seq.reverse.index(self.sequence.reverse)
|
46
|
-
raise PeptideNotInProteinError if pep_index.nil?
|
48
|
+
raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_index.nil?
|
47
49
|
pep_start_i = pep_index
|
48
50
|
else
|
49
51
|
pep_start_i = prot_seq.index(self.sequence)
|
50
|
-
raise PeptideNotInProteinError if pep_start_i.nil?
|
52
|
+
raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_start_i.nil?
|
51
53
|
end
|
52
54
|
pep_end_i = pep_start_i+self.sequence.length
|
53
55
|
{:start => pep_start_i,:end => pep_end_i}
|
@@ -91,31 +93,26 @@ class Peptide
|
|
91
93
|
pep_end_i = pep_start_i+self.sequence.length*3
|
92
94
|
fragments=[]
|
93
95
|
ordered_cds_records.each do |cds_record|
|
94
|
-
|
96
|
+
|
95
97
|
fragment = nil
|
96
98
|
fragment_len = 0
|
97
99
|
if on_reverse_strand
|
98
100
|
|
99
101
|
in_peptide = (i<pep_end_i) && (i>=pep_start_i)
|
100
102
|
before_len = [pep_start_i-i,0].max
|
101
|
-
# puts before_len
|
102
|
-
# puts in_peptide
|
103
|
-
# puts "i #{i} pi #{pep_end_i} psi #{pep_start_i}"
|
104
|
-
if in_peptide
|
105
103
|
|
104
|
+
if in_peptide
|
106
105
|
fragment_end = cds_record.end
|
107
106
|
fragment_len = [cds_record.length,pep_end_i-i].min
|
108
107
|
fragment_start = fragment_end-fragment_len+1
|
109
|
-
# fragment = {:start=>fragment_start,:end=>fragment_end}
|
110
108
|
fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
|
111
|
-
|
112
109
|
elsif before_len>0
|
113
110
|
fragment_end = cds_record.end - before_len
|
114
111
|
fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
|
115
|
-
# puts "Frag len #{fragment_len}"
|
116
112
|
fragment_start = fragment_end - fragment_len + 1
|
117
|
-
|
118
|
-
|
113
|
+
if fragment_len>0
|
114
|
+
fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
|
115
|
+
end
|
119
116
|
else
|
120
117
|
fragment=nil
|
121
118
|
end
|
@@ -126,14 +123,14 @@ class Peptide
|
|
126
123
|
fragment_start = cds_record.start
|
127
124
|
fragment_len = [cds_record.length,pep_end_i-i].min
|
128
125
|
fragment_end = fragment_start+fragment_len-1
|
129
|
-
# fragment = {:start=>fragment_start,:end=>fragment_end}
|
130
126
|
fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
|
131
127
|
elsif before_len>0
|
132
128
|
fragment_start = cds_record.start + before_len
|
133
129
|
fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
|
134
130
|
fragment_end = fragment_start + fragment_len-1
|
135
|
-
|
136
|
-
|
131
|
+
if fragment_len>0
|
132
|
+
fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
|
133
|
+
end
|
137
134
|
else
|
138
135
|
fragment=nil
|
139
136
|
end
|
data/lib/protk/tandem_search_tool.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'protk/search_tool'
|
2
|
+
require 'protk/galaxy_util'
|
2
3
|
|
3
4
|
class String
|
4
5
|
def xtandem_modification_motif?
|
@@ -70,28 +71,6 @@ class TandemSearchTool < SearchTool
|
|
70
71
|
end
|
71
72
|
|
72
73
|
private
|
73
|
-
# Galaxy changes things like @ to __at__ we need to change it back
|
74
|
-
#
|
75
|
-
def decode_galaxy_string(mstring)
|
76
|
-
mstring.gsub!("__at__","@")
|
77
|
-
mstring.gsub!("__oc__","{")
|
78
|
-
mstring.gsub!("__cc__","}")
|
79
|
-
mstring.gsub!("__ob__","[")
|
80
|
-
mstring.gsub!("__cb__","]")
|
81
|
-
mstring.gsub!("__gt__",">")
|
82
|
-
mstring.gsub!("__lt__","<")
|
83
|
-
mstring.gsub!("__sq__","'")
|
84
|
-
mstring.gsub!("__dq__","\"")
|
85
|
-
mstring.gsub!("__cn__","\n")
|
86
|
-
mstring.gsub!("__cr__","\r")
|
87
|
-
mstring.gsub!("__tc__","\t")
|
88
|
-
mstring.gsub!("__pd__","#")
|
89
|
-
|
90
|
-
# For characters not allowed at all by galaxy
|
91
|
-
mstring.gsub!("__pc__","|")
|
92
|
-
|
93
|
-
mstring
|
94
|
-
end
|
95
74
|
|
96
75
|
def set_option(std_params, tandem_key, value)
|
97
76
|
notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
|
@@ -180,7 +159,7 @@ class TandemSearchTool < SearchTool
|
|
180
159
|
if opt_val.is_a?(TrueClass) || opt_val.is_a?(FalseClass)
|
181
160
|
opt_val = opt_val ? "yes" : "no"
|
182
161
|
end
|
183
|
-
append_option(std_params,xtandem_key,decode_galaxy_string(opt_val.to_s))
|
162
|
+
append_option(std_params,xtandem_key,GalaxyUtil.decode_galaxy_string!(opt_val.to_s))
|
184
163
|
end
|
185
164
|
end
|
186
165
|
|
@@ -208,7 +187,7 @@ class TandemSearchTool < SearchTool
|
|
208
187
|
#
|
209
188
|
|
210
189
|
var_mods = self.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
|
211
|
-
var_mods=var_mods.collect {|mod| decode_galaxy_string(mod) }
|
190
|
+
var_mods=var_mods.collect {|mod| GalaxyUtil.decode_galaxy_string!(mod) }
|
212
191
|
|
213
192
|
# var_mods allows motif's as well as standard mods. These should be in a separate array
|
214
193
|
var_motifs = [].replace(var_mods)
|
@@ -216,7 +195,7 @@ class TandemSearchTool < SearchTool
|
|
216
195
|
var_motifs.keep_if {|mod| mod.xtandem_modification_motif? }
|
217
196
|
|
218
197
|
fix_mods = self.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
|
219
|
-
fix_mods=fix_mods.collect {|mod| decode_galaxy_string(mod)}
|
198
|
+
fix_mods=fix_mods.collect {|mod| GalaxyUtil.decode_galaxy_string!(mod)}
|
220
199
|
|
221
200
|
# We also support the --glyco and --methionineo shortcuts.
|
222
201
|
# Add these here. No check is made for duplication
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: protk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.0
|
4
|
+
version: 1.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ira Cooke
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: open4
|
@@ -266,6 +266,7 @@ files:
|
|
266
266
|
- lib/protk/data/uniprot_accessions_table.txt
|
267
267
|
- lib/protk/data/uniprot_input_accessions.loc
|
268
268
|
- lib/protk/data/yum_packages.yaml
|
269
|
+
- lib/protk/error.rb
|
269
270
|
- lib/protk/fastadb.rb
|
270
271
|
- lib/protk/galaxy_stager.rb
|
271
272
|
- lib/protk/galaxy_util.rb
|