protk 1.4.0 → 1.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/augustus_to_proteindb.rb +30 -17
- data/bin/interprophet.rb +3 -1
- data/bin/protxml_to_gff.rb +26 -6
- data/lib/protk/error.rb +7 -0
- data/lib/protk/galaxy_util.rb +24 -0
- data/lib/protk/peptide.rb +13 -16
- data/lib/protk/tandem_search_tool.rb +4 -25
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7329f51a45b5449ec979e76aca5727c6714a5bc8
|
4
|
+
data.tar.gz: e96f553b27c61c7ba1935d379e01086e9cb00725
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb933aa9ce0cc6fabb19b0a731bb8d74f23456937ec4d08973477f8f956eb733fcb44b26b66fc61242f5ea6c617d0c3a178b83f16dc568ad5f330db9dcd27c1d
|
7
|
+
data.tar.gz: 5b2b370cea53d3a3ec9eee9d5916df8f910ef12181c0bce9b660c1d82d042e00a5b66ce7fcba6a864cd1a6266aa2fbffec998ae26700f4ddd7903ab141ba3241
|
data/bin/augustus_to_proteindb.rb
CHANGED
@@ -110,13 +110,25 @@ def sequence_fasta_header(transcript_line,coding_sequences,scaffold)
|
|
110
110
|
|
111
111
|
tmatch=transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1}).*?ID=(.*?);/)
|
112
112
|
# require 'debugger'; debugger
|
113
|
-
tstart=tmatch[1]
|
114
|
-
|
115
|
-
|
116
|
-
|
113
|
+
tstart,tend,tstrand = transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1})/).captures
|
114
|
+
|
115
|
+
# tstart=tmatch[1]
|
116
|
+
# tend=tmatch[2]
|
117
|
+
tsidfield = transcript_line.split("\t")[8]
|
118
|
+
tid = nil
|
119
|
+
if tsidfield =~ /ID=/
|
120
|
+
tid = tsidfield.match(/ID=(.*?);/).captures[0]
|
121
|
+
else
|
122
|
+
tid = tsidfield.gsub(" ","_").gsub(";","_")
|
123
|
+
end
|
124
|
+
|
125
|
+
# require 'byebug';byebug
|
126
|
+
|
127
|
+
tstrandfr="fwd"
|
128
|
+
tstrandfr = "rev" if tstrand=="-"
|
117
129
|
|
118
|
-
tid=tmatch[4]
|
119
|
-
header=">lcl|#{sanitize_scaffold_idstring(scaffold)}_#{
|
130
|
+
# tid=tmatch[4]
|
131
|
+
header=">lcl|#{sanitize_scaffold_idstring(scaffold)}_#{tstrandfr}_#{tid} #{tstart}|#{tend}"
|
120
132
|
if $add_transcript_info
|
121
133
|
coding_sequences.each { |coding_sequence| header << " #{cds_to_header_text(coding_sequence,tid)}" }
|
122
134
|
end
|
@@ -135,13 +147,14 @@ end
|
|
135
147
|
def parse_gene(gene_lines)
|
136
148
|
|
137
149
|
geneid=gene_lines[0].match(/start gene (.*)/)[1]
|
150
|
+
scaffold_id = gene_lines[1].split("\t")[0]
|
138
151
|
transcripts=get_transcript_lines(gene_lines)
|
139
152
|
coding_sequences=get_cds_lines(gene_lines)
|
140
153
|
proteins=get_protein_sequence_lines(gene_lines)
|
141
154
|
fasta_string=""
|
142
155
|
throw "transcripts/protein mismatch" unless transcripts.length == proteins.length
|
143
156
|
transcripts.each_with_index do |ts, i|
|
144
|
-
fh=sequence_fasta_header(ts,coding_sequences,$current_scaffold)
|
157
|
+
fh=sequence_fasta_header(ts,coding_sequences,scaffold_id)
|
145
158
|
fasta_string << "#{fh}\n"
|
146
159
|
ps=protein_sequence(proteins[i])
|
147
160
|
fasta_string << "#{ps}\n"
|
@@ -152,14 +165,14 @@ def parse_gene(gene_lines)
|
|
152
165
|
fasta_string
|
153
166
|
end
|
154
167
|
|
155
|
-
def capture_scaffold(line)
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
end
|
168
|
+
# def capture_scaffold(line)
|
169
|
+
# if line =~ /-- prediction on sequence number.*?name = (.*)\)/
|
170
|
+
# $current_scaffold=line.match(/-- prediction on sequence number.*?name = (.*)\)/)[1]
|
171
|
+
# if ( $print_progress)
|
172
|
+
# puts $current_scaffold
|
173
|
+
# end
|
174
|
+
# end
|
175
|
+
# end
|
163
176
|
|
164
177
|
def capture_gene_start(line)
|
165
178
|
if line =~ /# start gene/
|
@@ -174,14 +187,14 @@ def at_gene_end(line)
|
|
174
187
|
return false
|
175
188
|
end
|
176
189
|
|
177
|
-
$current_scaffold=""
|
190
|
+
# $current_scaffold=""
|
178
191
|
gene_lines=[]
|
179
192
|
$capturing_gene=false
|
180
193
|
|
181
194
|
|
182
195
|
File.open(inname).each_with_index do |line, line_i|
|
183
196
|
line.chomp!
|
184
|
-
capture_scaffold(line)
|
197
|
+
# capture_scaffold(line)
|
185
198
|
capture_gene_start(line)
|
186
199
|
|
187
200
|
if at_gene_end(line)
|
data/bin/interprophet.rb
CHANGED
@@ -65,9 +65,11 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
|
|
65
65
|
|
66
66
|
if for_galaxy
|
67
67
|
inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
|
68
|
+
input_files = inputs.collect { |e| e.staged_path }
|
69
|
+
else
|
70
|
+
input_files = inputs
|
68
71
|
end
|
69
72
|
|
70
|
-
input_files = inputs.collect { |e| e.staged_path }
|
71
73
|
|
72
74
|
cmd << " #{input_files.join(" ")} #{output_file}"
|
73
75
|
|
data/bin/protxml_to_gff.rb
CHANGED
@@ -11,12 +11,12 @@ require 'protk/gffdb'
|
|
11
11
|
require 'protk/protein'
|
12
12
|
require 'protk/peptide'
|
13
13
|
require 'protk/tool'
|
14
|
+
require 'protk/error'
|
14
15
|
require 'libxml'
|
15
16
|
require 'bio'
|
16
17
|
|
17
18
|
include LibXML
|
18
19
|
|
19
|
-
|
20
20
|
class NoGFFEntryFoundError < StandardError
|
21
21
|
end
|
22
22
|
|
@@ -26,6 +26,9 @@ end
|
|
26
26
|
class MultipleGFFEntriesForProteinError < StandardError
|
27
27
|
end
|
28
28
|
|
29
|
+
class GFFIDRegexNotMatchedError < ProtkError
|
30
|
+
end
|
31
|
+
|
29
32
|
def parse_proteins(protxml_file)
|
30
33
|
protxml_parser=XML::Parser.file(protxml_file)
|
31
34
|
protxml_doc=protxml_parser.parse
|
@@ -35,7 +38,14 @@ end
|
|
35
38
|
|
36
39
|
def protein_id_to_gffid(protein_id,gff_idregex)
|
37
40
|
return protein_id if gff_idregex.nil?
|
38
|
-
|
41
|
+
|
42
|
+
m = protein_id.match(/#{gff_idregex}/)
|
43
|
+
if m
|
44
|
+
return m.captures[0]
|
45
|
+
else
|
46
|
+
raise GFFIDRegexNotMatchedError.new("Unable to parse gff_id from #{protein_id} using regex #{gff_idregex}")
|
47
|
+
end
|
48
|
+
|
39
49
|
end
|
40
50
|
|
41
51
|
def protein_id_to_genomeid(protein_id,genome_idregex)
|
@@ -103,6 +113,8 @@ input_protxml=ARGV[0]
|
|
103
113
|
$protk.log "Creating GFFDB", :info
|
104
114
|
gffdb = GFFDB.create(tool.coords_file) if tool.coords_file
|
105
115
|
|
116
|
+
#require 'byebug';byebug
|
117
|
+
|
106
118
|
# genome_db = prepare_fasta(tool.genome,'nucl')
|
107
119
|
$protk.log "Preparing FASTA index", :info
|
108
120
|
prot_db = prepare_fasta(tool.database,'prot')
|
@@ -157,14 +169,22 @@ proteins.each do |protein|
|
|
157
169
|
rescue ProteinNotInDBError
|
158
170
|
$protk.log "No entry for #{parsed_name_for_protdb}", :info
|
159
171
|
rescue MultipleGFFEntriesForProteinError
|
160
|
-
$protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :
|
161
|
-
|
162
|
-
|
172
|
+
$protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :warn
|
173
|
+
# require 'byebug';byebug
|
174
|
+
# puts gff_parent_entries
|
175
|
+
rescue PeptideNotInProteinError => e
|
176
|
+
# This is generally not fatal. It can happen because of Leucine Isoleucine issues
|
177
|
+
#
|
178
|
+
$protk.log "#{e.message}. Parent protein ID #{protein.protein_name}" , :info
|
179
|
+
# require 'byebug';byebug
|
180
|
+
# puts protein.protein_name
|
181
|
+
rescue GFFIDRegexNotMatchedError => e
|
182
|
+
$protk.log e.message, :info
|
163
183
|
end
|
164
184
|
end
|
165
185
|
end
|
166
186
|
|
167
187
|
if num_missing_gff_entries>0
|
168
|
-
$protk.log "Failed to lookup gff entries. Try setting --gff-idregex" if tool.gff_idregex.nil?
|
188
|
+
$protk.log "Failed to lookup gff entries. Try setting --gff-idregex", :error if tool.gff_idregex.nil?
|
169
189
|
end
|
170
190
|
|
data/lib/protk/error.rb
ADDED
data/lib/protk/galaxy_util.rb
CHANGED
@@ -23,6 +23,30 @@ class GalaxyUtil
|
|
23
23
|
end
|
24
24
|
|
25
25
|
|
26
|
+
# Galaxy changes things like @ to __at__ we need to change it back
|
27
|
+
#
|
28
|
+
def self.decode_galaxy_string!(mstring)
|
29
|
+
mstring.gsub!("__at__","@")
|
30
|
+
mstring.gsub!("__oc__","{")
|
31
|
+
mstring.gsub!("__cc__","}")
|
32
|
+
mstring.gsub!("__ob__","[")
|
33
|
+
mstring.gsub!("__cb__","]")
|
34
|
+
mstring.gsub!("__gt__",">")
|
35
|
+
mstring.gsub!("__lt__","<")
|
36
|
+
mstring.gsub!("__sq__","'")
|
37
|
+
mstring.gsub!("__dq__","\"")
|
38
|
+
mstring.gsub!("__cn__","\n")
|
39
|
+
mstring.gsub!("__cr__","\r")
|
40
|
+
mstring.gsub!("__tc__","\t")
|
41
|
+
mstring.gsub!("__pd__","#")
|
42
|
+
|
43
|
+
# For characters not allowed at all by galaxy
|
44
|
+
mstring.gsub!("__pc__","|")
|
45
|
+
|
46
|
+
mstring
|
47
|
+
end
|
48
|
+
|
49
|
+
|
26
50
|
# Unused
|
27
51
|
|
28
52
|
# def self.stage_protxml(input_protxml_path)
|
data/lib/protk/peptide.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
require 'libxml'
|
2
2
|
require 'bio'
|
3
3
|
require 'protk/bio_gff3_extensions'
|
4
|
+
require 'protk/error'
|
5
|
+
|
4
6
|
include LibXML
|
5
7
|
|
6
|
-
class PeptideNotInProteinError < StandardError
|
8
|
+
class PeptideNotInProteinError < ProtkError
|
7
9
|
end
|
8
10
|
|
9
11
|
class Peptide
|
@@ -43,11 +45,11 @@ class Peptide
|
|
43
45
|
def coords_in_protein(prot_seq,reverse=false)
|
44
46
|
if reverse
|
45
47
|
pep_index = prot_seq.reverse.index(self.sequence.reverse)
|
46
|
-
raise PeptideNotInProteinError if pep_index.nil?
|
48
|
+
raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_index.nil?
|
47
49
|
pep_start_i = pep_index
|
48
50
|
else
|
49
51
|
pep_start_i = prot_seq.index(self.sequence)
|
50
|
-
raise PeptideNotInProteinError if pep_start_i.nil?
|
52
|
+
raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_start_i.nil?
|
51
53
|
end
|
52
54
|
pep_end_i = pep_start_i+self.sequence.length
|
53
55
|
{:start => pep_start_i,:end => pep_end_i}
|
@@ -91,31 +93,26 @@ class Peptide
|
|
91
93
|
pep_end_i = pep_start_i+self.sequence.length*3
|
92
94
|
fragments=[]
|
93
95
|
ordered_cds_records.each do |cds_record|
|
94
|
-
|
96
|
+
|
95
97
|
fragment = nil
|
96
98
|
fragment_len = 0
|
97
99
|
if on_reverse_strand
|
98
100
|
|
99
101
|
in_peptide = (i<pep_end_i) && (i>=pep_start_i)
|
100
102
|
before_len = [pep_start_i-i,0].max
|
101
|
-
# puts before_len
|
102
|
-
# puts in_peptide
|
103
|
-
# puts "i #{i} pi #{pep_end_i} psi #{pep_start_i}"
|
104
|
-
if in_peptide
|
105
103
|
|
104
|
+
if in_peptide
|
106
105
|
fragment_end = cds_record.end
|
107
106
|
fragment_len = [cds_record.length,pep_end_i-i].min
|
108
107
|
fragment_start = fragment_end-fragment_len+1
|
109
|
-
# fragment = {:start=>fragment_start,:end=>fragment_end}
|
110
108
|
fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
|
111
|
-
|
112
109
|
elsif before_len>0
|
113
110
|
fragment_end = cds_record.end - before_len
|
114
111
|
fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
|
115
|
-
# puts "Frag len #{fragment_len}"
|
116
112
|
fragment_start = fragment_end - fragment_len + 1
|
117
|
-
|
118
|
-
|
113
|
+
if fragment_len>0
|
114
|
+
fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
|
115
|
+
end
|
119
116
|
else
|
120
117
|
fragment=nil
|
121
118
|
end
|
@@ -126,14 +123,14 @@ class Peptide
|
|
126
123
|
fragment_start = cds_record.start
|
127
124
|
fragment_len = [cds_record.length,pep_end_i-i].min
|
128
125
|
fragment_end = fragment_start+fragment_len-1
|
129
|
-
# fragment = {:start=>fragment_start,:end=>fragment_end}
|
130
126
|
fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
|
131
127
|
elsif before_len>0
|
132
128
|
fragment_start = cds_record.start + before_len
|
133
129
|
fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
|
134
130
|
fragment_end = fragment_start + fragment_len-1
|
135
|
-
|
136
|
-
|
131
|
+
if fragment_len>0
|
132
|
+
fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
|
133
|
+
end
|
137
134
|
else
|
138
135
|
fragment=nil
|
139
136
|
end
|
data/lib/protk/tandem_search_tool.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'protk/search_tool'
|
2
|
+
require 'protk/galaxy_util'
|
2
3
|
|
3
4
|
class String
|
4
5
|
def xtandem_modification_motif?
|
@@ -70,28 +71,6 @@ class TandemSearchTool < SearchTool
|
|
70
71
|
end
|
71
72
|
|
72
73
|
private
|
73
|
-
# Galaxy changes things like @ to __at__ we need to change it back
|
74
|
-
#
|
75
|
-
def decode_galaxy_string(mstring)
|
76
|
-
mstring.gsub!("__at__","@")
|
77
|
-
mstring.gsub!("__oc__","{")
|
78
|
-
mstring.gsub!("__cc__","}")
|
79
|
-
mstring.gsub!("__ob__","[")
|
80
|
-
mstring.gsub!("__cb__","]")
|
81
|
-
mstring.gsub!("__gt__",">")
|
82
|
-
mstring.gsub!("__lt__","<")
|
83
|
-
mstring.gsub!("__sq__","'")
|
84
|
-
mstring.gsub!("__dq__","\"")
|
85
|
-
mstring.gsub!("__cn__","\n")
|
86
|
-
mstring.gsub!("__cr__","\r")
|
87
|
-
mstring.gsub!("__tc__","\t")
|
88
|
-
mstring.gsub!("__pd__","#")
|
89
|
-
|
90
|
-
# For characters not allowed at all by galaxy
|
91
|
-
mstring.gsub!("__pc__","|")
|
92
|
-
|
93
|
-
mstring
|
94
|
-
end
|
95
74
|
|
96
75
|
def set_option(std_params, tandem_key, value)
|
97
76
|
notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
|
@@ -180,7 +159,7 @@ class TandemSearchTool < SearchTool
|
|
180
159
|
if opt_val.is_a?(TrueClass) || opt_val.is_a?(FalseClass)
|
181
160
|
opt_val = opt_val ? "yes" : "no"
|
182
161
|
end
|
183
|
-
append_option(std_params,xtandem_key,decode_galaxy_string(opt_val.to_s))
|
162
|
+
append_option(std_params,xtandem_key,GalaxyUtil.decode_galaxy_string!(opt_val.to_s))
|
184
163
|
end
|
185
164
|
end
|
186
165
|
|
@@ -208,7 +187,7 @@ class TandemSearchTool < SearchTool
|
|
208
187
|
#
|
209
188
|
|
210
189
|
var_mods = self.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
|
211
|
-
var_mods=var_mods.collect {|mod| decode_galaxy_string(mod) }
|
190
|
+
var_mods=var_mods.collect {|mod| GalaxyUtil.decode_galaxy_string!(mod) }
|
212
191
|
|
213
192
|
# var_mods allows motif's as well as standard mods. These should be in a separate array
|
214
193
|
var_motifs = [].replace(var_mods)
|
@@ -216,7 +195,7 @@ class TandemSearchTool < SearchTool
|
|
216
195
|
var_motifs.keep_if {|mod| mod.xtandem_modification_motif? }
|
217
196
|
|
218
197
|
fix_mods = self.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
|
219
|
-
fix_mods=fix_mods.collect {|mod| decode_galaxy_string(mod)}
|
198
|
+
fix_mods=fix_mods.collect {|mod| GalaxyUtil.decode_galaxy_string!(mod)}
|
220
199
|
|
221
200
|
# We also support the --glyco and --methionineo shortcuts.
|
222
201
|
# Add these here. No check is made for duplication
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: protk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.0
|
4
|
+
version: 1.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ira Cooke
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: open4
|
@@ -266,6 +266,7 @@ files:
|
|
266
266
|
- lib/protk/data/uniprot_accessions_table.txt
|
267
267
|
- lib/protk/data/uniprot_input_accessions.loc
|
268
268
|
- lib/protk/data/yum_packages.yaml
|
269
|
+
- lib/protk/error.rb
|
269
270
|
- lib/protk/fastadb.rb
|
270
271
|
- lib/protk/galaxy_stager.rb
|
271
272
|
- lib/protk/galaxy_util.rb
|