protk 1.4.0 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 22d2c990e46bf29f08cbf00dc2ecd9a759fae457
4
- data.tar.gz: 09dd0159d8b564d9d297a987c74e13833fabbc24
3
+ metadata.gz: 7329f51a45b5449ec979e76aca5727c6714a5bc8
4
+ data.tar.gz: e96f553b27c61c7ba1935d379e01086e9cb00725
5
5
  SHA512:
6
- metadata.gz: 2b08c1086187da5755e4b0d98dcfebcff80893eb291e09291ea3853fa848ade7f464f83fc97a8fe362d2b9a96e915e2aa1945ddd6e0f39ada32d1a747a0c7d73
7
- data.tar.gz: 767b474d9f0b890342f783bff0e8633434542cf151d34f239f9ed52dab43a1909b683fa285b25954af54856d68493ab67e4776d152830bc938e62a932c63e9b3
6
+ metadata.gz: fb933aa9ce0cc6fabb19b0a731bb8d74f23456937ec4d08973477f8f956eb733fcb44b26b66fc61242f5ea6c617d0c3a178b83f16dc568ad5f330db9dcd27c1d
7
+ data.tar.gz: 5b2b370cea53d3a3ec9eee9d5916df8f910ef12181c0bce9b660c1d82d042e00a5b66ce7fcba6a864cd1a6266aa2fbffec998ae26700f4ddd7903ab141ba3241
@@ -110,13 +110,25 @@ def sequence_fasta_header(transcript_line,coding_sequences,scaffold)
110
110
 
111
111
  tmatch=transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1}).*?ID=(.*?);/)
112
112
  # require 'debugger'; debugger
113
- tstart=tmatch[1]
114
- tend=tmatch[2]
115
- tstrand="fwd"
116
- tstrand = "rev" if tmatch[3]=="-"
113
+ tstart,tend,tstrand = transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1})/).captures
114
+
115
+ # tstart=tmatch[1]
116
+ # tend=tmatch[2]
117
+ tsidfield = transcript_line.split("\t")[8]
118
+ tid = nil
119
+ if tsidfield =~ /ID=/
120
+ tid = tsidfield.match(/ID=(.*?);/).captures[0]
121
+ else
122
+ tid = tsidfield.gsub(" ","_").gsub(";","_")
123
+ end
124
+
125
+ # require 'byebug';byebug
126
+
127
+ tstrandfr="fwd"
128
+ tstrandfr = "rev" if tstrand=="-"
117
129
 
118
- tid=tmatch[4]
119
- header=">lcl|#{sanitize_scaffold_idstring(scaffold)}_#{tstrand}_#{tid} #{tstart}|#{tend}"
130
+ # tid=tmatch[4]
131
+ header=">lcl|#{sanitize_scaffold_idstring(scaffold)}_#{tstrandfr}_#{tid} #{tstart}|#{tend}"
120
132
  if $add_transcript_info
121
133
  coding_sequences.each { |coding_sequence| header << " #{cds_to_header_text(coding_sequence,tid)}" }
122
134
  end
@@ -135,13 +147,14 @@ end
135
147
  def parse_gene(gene_lines)
136
148
 
137
149
  geneid=gene_lines[0].match(/start gene (.*)/)[1]
150
+ scaffold_id = gene_lines[1].split("\t")[0]
138
151
  transcripts=get_transcript_lines(gene_lines)
139
152
  coding_sequences=get_cds_lines(gene_lines)
140
153
  proteins=get_protein_sequence_lines(gene_lines)
141
154
  fasta_string=""
142
155
  throw "transcripts/protein mismatch" unless transcripts.length == proteins.length
143
156
  transcripts.each_with_index do |ts, i|
144
- fh=sequence_fasta_header(ts,coding_sequences,$current_scaffold)
157
+ fh=sequence_fasta_header(ts,coding_sequences,scaffold_id)
145
158
  fasta_string << "#{fh}\n"
146
159
  ps=protein_sequence(proteins[i])
147
160
  fasta_string << "#{ps}\n"
@@ -152,14 +165,14 @@ def parse_gene(gene_lines)
152
165
  fasta_string
153
166
  end
154
167
 
155
- def capture_scaffold(line)
156
- if line =~ /-- prediction on sequence number.*?name = (.*)\)/
157
- $current_scaffold=line.match(/-- prediction on sequence number.*?name = (.*)\)/)[1]
158
- if ( $print_progress)
159
- puts $current_scaffold
160
- end
161
- end
162
- end
168
+ # def capture_scaffold(line)
169
+ # if line =~ /-- prediction on sequence number.*?name = (.*)\)/
170
+ # $current_scaffold=line.match(/-- prediction on sequence number.*?name = (.*)\)/)[1]
171
+ # if ( $print_progress)
172
+ # puts $current_scaffold
173
+ # end
174
+ # end
175
+ # end
163
176
 
164
177
  def capture_gene_start(line)
165
178
  if line =~ /# start gene/
@@ -174,14 +187,14 @@ def at_gene_end(line)
174
187
  return false
175
188
  end
176
189
 
177
- $current_scaffold=""
190
+ # $current_scaffold=""
178
191
  gene_lines=[]
179
192
  $capturing_gene=false
180
193
 
181
194
 
182
195
  File.open(inname).each_with_index do |line, line_i|
183
196
  line.chomp!
184
- capture_scaffold(line)
197
+ # capture_scaffold(line)
185
198
  capture_gene_start(line)
186
199
 
187
200
  if at_gene_end(line)
@@ -65,9 +65,11 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
65
65
 
66
66
  if for_galaxy
67
67
  inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
68
+ input_files = inputs.collect { |e| e.staged_path }
69
+ else
70
+ input_files = inputs
68
71
  end
69
72
 
70
- input_files = inputs.collect { |e| e.staged_path }
71
73
 
72
74
  cmd << " #{input_files.join(" ")} #{output_file}"
73
75
 
@@ -11,12 +11,12 @@ require 'protk/gffdb'
11
11
  require 'protk/protein'
12
12
  require 'protk/peptide'
13
13
  require 'protk/tool'
14
+ require 'protk/error'
14
15
  require 'libxml'
15
16
  require 'bio'
16
17
 
17
18
  include LibXML
18
19
 
19
-
20
20
  class NoGFFEntryFoundError < StandardError
21
21
  end
22
22
 
@@ -26,6 +26,9 @@ end
26
26
  class MultipleGFFEntriesForProteinError < StandardError
27
27
  end
28
28
 
29
+ class GFFIDRegexNotMatchedError < ProtkError
30
+ end
31
+
29
32
  def parse_proteins(protxml_file)
30
33
  protxml_parser=XML::Parser.file(protxml_file)
31
34
  protxml_doc=protxml_parser.parse
@@ -35,7 +38,14 @@ end
35
38
 
36
39
  def protein_id_to_gffid(protein_id,gff_idregex)
37
40
  return protein_id if gff_idregex.nil?
38
- return protein_id.match(/#{gff_idregex}/)[1]
41
+
42
+ m = protein_id.match(/#{gff_idregex}/)
43
+ if m
44
+ return m.captures[0]
45
+ else
46
+ raise GFFIDRegexNotMatchedError.new("Unable to parse gff_id from #{protein_id} using regex #{gff_idregex}")
47
+ end
48
+
39
49
  end
40
50
 
41
51
  def protein_id_to_genomeid(protein_id,genome_idregex)
@@ -103,6 +113,8 @@ input_protxml=ARGV[0]
103
113
  $protk.log "Creating GFFDB", :info
104
114
  gffdb = GFFDB.create(tool.coords_file) if tool.coords_file
105
115
 
116
+ #require 'byebug';byebug
117
+
106
118
  # genome_db = prepare_fasta(tool.genome,'nucl')
107
119
  $protk.log "Preparing FASTA index", :info
108
120
  prot_db = prepare_fasta(tool.database,'prot')
@@ -157,14 +169,22 @@ proteins.each do |protein|
157
169
  rescue ProteinNotInDBError
158
170
  $protk.log "No entry for #{parsed_name_for_protdb}", :info
159
171
  rescue MultipleGFFEntriesForProteinError
160
- $protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :info
161
- rescue PeptideNotInProteinError
162
- $protk.log "A peptide was not found in its parent protein #{protein.protein_name}" , :warn
172
+ $protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :warn
173
+ # require 'byebug';byebug
174
+ # puts gff_parent_entries
175
+ rescue PeptideNotInProteinError => e
176
+ # This is generally not fatal. It can happen because of Leucine Isoleucine issues
177
+ #
178
+ $protk.log "#{e.message}. Parent protein ID #{protein.protein_name}" , :info
179
+ # require 'byebug';byebug
180
+ # puts protein.protein_name
181
+ rescue GFFIDRegexNotMatchedError => e
182
+ $protk.log e.message, :info
163
183
  end
164
184
  end
165
185
  end
166
186
 
167
187
  if num_missing_gff_entries>0
168
- $protk.log "Failed to lookup gff entries. Try setting --gff-idregex" if tool.gff_idregex.nil?
188
+ $protk.log "Failed to lookup gff entries. Try setting --gff-idregex", :error if tool.gff_idregex.nil?
169
189
  end
170
190
 
@@ -0,0 +1,7 @@
1
+
2
# Base class for protk-specific errors.
#
# The message given at construction is exposed through both +message+ and
# +to_s+, and can be replaced afterwards with +message=+.
class ProtkError < StandardError
  # Writer kept so existing code that assigns +error.message = ...+ still works.
  attr_writer :message

  # message - description of the failure. Optional, so that a bare
  #           `raise ProtkError` (or a bare raise of a subclass) works
  #           instead of failing with ArgumentError.
  def initialize(message = nil)
    # Hand the message to StandardError as well, so the standard exception
    # machinery sees it rather than only this class's instance variable.
    super
    @message = message
  end

  # Returns the stored message, or Ruby's default (the class name) when
  # no message has been set.
  def message
    @message || super
  end

  # Keeps +to_s+ in step with +message+, including after +message=+.
  def to_s
    @message.nil? ? super : @message.to_s
  end
end
@@ -23,6 +23,30 @@ class GalaxyUtil
23
23
  end
24
24
 
25
25
 
26
+ # Galaxy changes things like @ to __at__ we need to change it back
27
+ #
28
# Replaces Galaxy's escape tokens (for example "__at__") with the characters
# they stand for.
#
# mstring - the String to decode. It is modified in place, unless it is
#           frozen, in which case a decoded copy is produced instead of
#           raising FrozenError.
#
# Returns the decoded String (the same object as +mstring+ when it was
# not frozen).
def self.decode_galaxy_string!(mstring)
  # Ordered list of token/character pairs. Substitutions run one after
  # another in this order, so overlapping tokens resolve as they always have.
  escapes = [
    ["__at__", "@"],
    ["__oc__", "{"],
    ["__cc__", "}"],
    ["__ob__", "["],
    ["__cb__", "]"],
    ["__gt__", ">"],
    ["__lt__", "<"],
    ["__sq__", "'"],
    ["__dq__", "\""],
    ["__cn__", "\n"],
    ["__cr__", "\r"],
    ["__tc__", "\t"],
    ["__pd__", "#"],
    # For characters not allowed at all by galaxy
    ["__pc__", "|"]
  ]

  # gsub! cannot modify a frozen string, so work on a copy in that case.
  target = mstring.frozen? ? mstring.dup : mstring
  escapes.each { |token, char| target.gsub!(token, char) }
  target
end
48
+
49
+
26
50
  # Unused
27
51
 
28
52
  # def self.stage_protxml(input_protxml_path)
@@ -1,9 +1,11 @@
1
1
  require 'libxml'
2
2
  require 'bio'
3
3
  require 'protk/bio_gff3_extensions'
4
+ require 'protk/error'
5
+
4
6
  include LibXML
5
7
 
6
- class PeptideNotInProteinError < StandardError
8
+ class PeptideNotInProteinError < ProtkError
7
9
  end
8
10
 
9
11
  class Peptide
@@ -43,11 +45,11 @@ class Peptide
43
45
  def coords_in_protein(prot_seq,reverse=false)
44
46
  if reverse
45
47
  pep_index = prot_seq.reverse.index(self.sequence.reverse)
46
- raise PeptideNotInProteinError if pep_index.nil?
48
+ raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_index.nil?
47
49
  pep_start_i = pep_index
48
50
  else
49
51
  pep_start_i = prot_seq.index(self.sequence)
50
- raise PeptideNotInProteinError if pep_start_i.nil?
52
+ raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_start_i.nil?
51
53
  end
52
54
  pep_end_i = pep_start_i+self.sequence.length
53
55
  {:start => pep_start_i,:end => pep_end_i}
@@ -91,31 +93,26 @@ class Peptide
91
93
  pep_end_i = pep_start_i+self.sequence.length*3
92
94
  fragments=[]
93
95
  ordered_cds_records.each do |cds_record|
94
- # puts cds_record
96
+
95
97
  fragment = nil
96
98
  fragment_len = 0
97
99
  if on_reverse_strand
98
100
 
99
101
  in_peptide = (i<pep_end_i) && (i>=pep_start_i)
100
102
  before_len = [pep_start_i-i,0].max
101
- # puts before_len
102
- # puts in_peptide
103
- # puts "i #{i} pi #{pep_end_i} psi #{pep_start_i}"
104
- if in_peptide
105
103
 
104
+ if in_peptide
106
105
  fragment_end = cds_record.end
107
106
  fragment_len = [cds_record.length,pep_end_i-i].min
108
107
  fragment_start = fragment_end-fragment_len+1
109
- # fragment = {:start=>fragment_start,:end=>fragment_end}
110
108
  fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
111
-
112
109
  elsif before_len>0
113
110
  fragment_end = cds_record.end - before_len
114
111
  fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
115
- # puts "Frag len #{fragment_len}"
116
112
  fragment_start = fragment_end - fragment_len + 1
117
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
118
- # fragment = {:start=>fragment_start,:end=>fragment_end}
113
+ if fragment_len>0
114
+ fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
115
+ end
119
116
  else
120
117
  fragment=nil
121
118
  end
@@ -126,14 +123,14 @@ class Peptide
126
123
  fragment_start = cds_record.start
127
124
  fragment_len = [cds_record.length,pep_end_i-i].min
128
125
  fragment_end = fragment_start+fragment_len-1
129
- # fragment = {:start=>fragment_start,:end=>fragment_end}
130
126
  fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
131
127
  elsif before_len>0
132
128
  fragment_start = cds_record.start + before_len
133
129
  fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
134
130
  fragment_end = fragment_start + fragment_len-1
135
- # fragment = {:start=>fragment_start,:end=>fragment_end}
136
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
131
+ if fragment_len>0
132
+ fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
133
+ end
137
134
  else
138
135
  fragment=nil
139
136
  end
@@ -1,4 +1,5 @@
1
1
  require 'protk/search_tool'
2
+ require 'protk/galaxy_util'
2
3
 
3
4
  class String
4
5
  def xtandem_modification_motif?
@@ -70,28 +71,6 @@ class TandemSearchTool < SearchTool
70
71
  end
71
72
 
72
73
  private
73
- # Galaxy changes things like @ to __at__ we need to change it back
74
- #
75
- def decode_galaxy_string(mstring)
76
- mstring.gsub!("__at__","@")
77
- mstring.gsub!("__oc__","{")
78
- mstring.gsub!("__cc__","}")
79
- mstring.gsub!("__ob__","[")
80
- mstring.gsub!("__cb__","]")
81
- mstring.gsub!("__gt__",">")
82
- mstring.gsub!("__lt__","<")
83
- mstring.gsub!("__sq__","'")
84
- mstring.gsub!("__dq__","\"")
85
- mstring.gsub!("__cn__","\n")
86
- mstring.gsub!("__cr__","\r")
87
- mstring.gsub!("__tc__","\t")
88
- mstring.gsub!("__pd__","#")
89
-
90
- # For characters not allowed at all by galaxy
91
- mstring.gsub!("__pc__","|")
92
-
93
- mstring
94
- end
95
74
 
96
75
  def set_option(std_params, tandem_key, value)
97
76
  notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
@@ -180,7 +159,7 @@ class TandemSearchTool < SearchTool
180
159
  if opt_val.is_a?(TrueClass) || opt_val.is_a?(FalseClass)
181
160
  opt_val = opt_val ? "yes" : "no"
182
161
  end
183
- append_option(std_params,xtandem_key,decode_galaxy_string(opt_val.to_s))
162
+ append_option(std_params,xtandem_key,GalaxyUtil.decode_galaxy_string!(opt_val.to_s))
184
163
  end
185
164
  end
186
165
 
@@ -208,7 +187,7 @@ class TandemSearchTool < SearchTool
208
187
  #
209
188
 
210
189
  var_mods = self.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
211
- var_mods=var_mods.collect {|mod| decode_galaxy_string(mod) }
190
+ var_mods=var_mods.collect {|mod| GalaxyUtil.decode_galaxy_string!(mod) }
212
191
 
213
192
  # var_mods allows motif's as well as standard mods. These should be in a separate array
214
193
  var_motifs = [].replace(var_mods)
@@ -216,7 +195,7 @@ class TandemSearchTool < SearchTool
216
195
  var_motifs.keep_if {|mod| mod.xtandem_modification_motif? }
217
196
 
218
197
  fix_mods = self.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
219
- fix_mods=fix_mods.collect {|mod| decode_galaxy_string(mod)}
198
+ fix_mods=fix_mods.collect {|mod| GalaxyUtil.decode_galaxy_string!(mod)}
220
199
 
221
200
  # We also support the --glyco and --methionineo shortcuts.
222
201
  # Add these here. No check is made for duplication
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ira Cooke
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-01 00:00:00.000000000 Z
11
+ date: 2015-03-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: open4
@@ -266,6 +266,7 @@ files:
266
266
  - lib/protk/data/uniprot_accessions_table.txt
267
267
  - lib/protk/data/uniprot_input_accessions.loc
268
268
  - lib/protk/data/yum_packages.yaml
269
+ - lib/protk/error.rb
269
270
  - lib/protk/fastadb.rb
270
271
  - lib/protk/galaxy_stager.rb
271
272
  - lib/protk/galaxy_util.rb