protk 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 22d2c990e46bf29f08cbf00dc2ecd9a759fae457
4
- data.tar.gz: 09dd0159d8b564d9d297a987c74e13833fabbc24
3
+ metadata.gz: 7329f51a45b5449ec979e76aca5727c6714a5bc8
4
+ data.tar.gz: e96f553b27c61c7ba1935d379e01086e9cb00725
5
5
  SHA512:
6
- metadata.gz: 2b08c1086187da5755e4b0d98dcfebcff80893eb291e09291ea3853fa848ade7f464f83fc97a8fe362d2b9a96e915e2aa1945ddd6e0f39ada32d1a747a0c7d73
7
- data.tar.gz: 767b474d9f0b890342f783bff0e8633434542cf151d34f239f9ed52dab43a1909b683fa285b25954af54856d68493ab67e4776d152830bc938e62a932c63e9b3
6
+ metadata.gz: fb933aa9ce0cc6fabb19b0a731bb8d74f23456937ec4d08973477f8f956eb733fcb44b26b66fc61242f5ea6c617d0c3a178b83f16dc568ad5f330db9dcd27c1d
7
+ data.tar.gz: 5b2b370cea53d3a3ec9eee9d5916df8f910ef12181c0bce9b660c1d82d042e00a5b66ce7fcba6a864cd1a6266aa2fbffec998ae26700f4ddd7903ab141ba3241
@@ -110,13 +110,25 @@ def sequence_fasta_header(transcript_line,coding_sequences,scaffold)
110
110
 
111
111
  tmatch=transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1}).*?ID=(.*?);/)
112
112
  # require 'debugger'; debugger
113
- tstart=tmatch[1]
114
- tend=tmatch[2]
115
- tstrand="fwd"
116
- tstrand = "rev" if tmatch[3]=="-"
113
+ tstart,tend,tstrand = transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1})/).captures
114
+
115
+ # tstart=tmatch[1]
116
+ # tend=tmatch[2]
117
+ tsidfield = transcript_line.split("\t")[8]
118
+ tid = nil
119
+ if tsidfield =~ /ID=/
120
+ tid = tsidfield.match(/ID=(.*?);/).captures[0]
121
+ else
122
+ tid = tsidfield.gsub(" ","_").gsub(";","_")
123
+ end
124
+
125
+ # require 'byebug';byebug
126
+
127
+ tstrandfr="fwd"
128
+ tstrandfr = "rev" if tstrand=="-"
117
129
 
118
- tid=tmatch[4]
119
- header=">lcl|#{sanitize_scaffold_idstring(scaffold)}_#{tstrand}_#{tid} #{tstart}|#{tend}"
130
+ # tid=tmatch[4]
131
+ header=">lcl|#{sanitize_scaffold_idstring(scaffold)}_#{tstrandfr}_#{tid} #{tstart}|#{tend}"
120
132
  if $add_transcript_info
121
133
  coding_sequences.each { |coding_sequence| header << " #{cds_to_header_text(coding_sequence,tid)}" }
122
134
  end
@@ -135,13 +147,14 @@ end
135
147
  def parse_gene(gene_lines)
136
148
 
137
149
  geneid=gene_lines[0].match(/start gene (.*)/)[1]
150
+ scaffold_id = gene_lines[1].split("\t")[0]
138
151
  transcripts=get_transcript_lines(gene_lines)
139
152
  coding_sequences=get_cds_lines(gene_lines)
140
153
  proteins=get_protein_sequence_lines(gene_lines)
141
154
  fasta_string=""
142
155
  throw "transcripts/protein mismatch" unless transcripts.length == proteins.length
143
156
  transcripts.each_with_index do |ts, i|
144
- fh=sequence_fasta_header(ts,coding_sequences,$current_scaffold)
157
+ fh=sequence_fasta_header(ts,coding_sequences,scaffold_id)
145
158
  fasta_string << "#{fh}\n"
146
159
  ps=protein_sequence(proteins[i])
147
160
  fasta_string << "#{ps}\n"
@@ -152,14 +165,14 @@ def parse_gene(gene_lines)
152
165
  fasta_string
153
166
  end
154
167
 
155
- def capture_scaffold(line)
156
- if line =~ /-- prediction on sequence number.*?name = (.*)\)/
157
- $current_scaffold=line.match(/-- prediction on sequence number.*?name = (.*)\)/)[1]
158
- if ( $print_progress)
159
- puts $current_scaffold
160
- end
161
- end
162
- end
168
+ # def capture_scaffold(line)
169
+ # if line =~ /-- prediction on sequence number.*?name = (.*)\)/
170
+ # $current_scaffold=line.match(/-- prediction on sequence number.*?name = (.*)\)/)[1]
171
+ # if ( $print_progress)
172
+ # puts $current_scaffold
173
+ # end
174
+ # end
175
+ # end
163
176
 
164
177
  def capture_gene_start(line)
165
178
  if line =~ /# start gene/
@@ -174,14 +187,14 @@ def at_gene_end(line)
174
187
  return false
175
188
  end
176
189
 
177
- $current_scaffold=""
190
+ # $current_scaffold=""
178
191
  gene_lines=[]
179
192
  $capturing_gene=false
180
193
 
181
194
 
182
195
  File.open(inname).each_with_index do |line, line_i|
183
196
  line.chomp!
184
- capture_scaffold(line)
197
+ # capture_scaffold(line)
185
198
  capture_gene_start(line)
186
199
 
187
200
  if at_gene_end(line)
@@ -65,9 +65,11 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
65
65
 
66
66
  if for_galaxy
67
67
  inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
68
+ input_files = inputs.collect { |e| e.staged_path }
69
+ else
70
+ input_files = inputs
68
71
  end
69
72
 
70
- input_files = inputs.collect { |e| e.staged_path }
71
73
 
72
74
  cmd << " #{input_files.join(" ")} #{output_file}"
73
75
 
@@ -11,12 +11,12 @@ require 'protk/gffdb'
11
11
  require 'protk/protein'
12
12
  require 'protk/peptide'
13
13
  require 'protk/tool'
14
+ require 'protk/error'
14
15
  require 'libxml'
15
16
  require 'bio'
16
17
 
17
18
  include LibXML
18
19
 
19
-
20
20
  class NoGFFEntryFoundError < StandardError
21
21
  end
22
22
 
@@ -26,6 +26,9 @@ end
26
26
  class MultipleGFFEntriesForProteinError < StandardError
27
27
  end
28
28
 
29
+ class GFFIDRegexNotMatchedError < ProtkError
30
+ end
31
+
29
32
  def parse_proteins(protxml_file)
30
33
  protxml_parser=XML::Parser.file(protxml_file)
31
34
  protxml_doc=protxml_parser.parse
@@ -35,7 +38,14 @@ end
35
38
 
36
39
  def protein_id_to_gffid(protein_id,gff_idregex)
37
40
  return protein_id if gff_idregex.nil?
38
- return protein_id.match(/#{gff_idregex}/)[1]
41
+
42
+ m = protein_id.match(/#{gff_idregex}/)
43
+ if m
44
+ return m.captures[0]
45
+ else
46
+ raise GFFIDRegexNotMatchedError.new("Unable to parse gff_id from #{protein_id} using regex #{gff_idregex}")
47
+ end
48
+
39
49
  end
40
50
 
41
51
  def protein_id_to_genomeid(protein_id,genome_idregex)
@@ -103,6 +113,8 @@ input_protxml=ARGV[0]
103
113
  $protk.log "Creating GFFDB", :info
104
114
  gffdb = GFFDB.create(tool.coords_file) if tool.coords_file
105
115
 
116
+ #require 'byebug';byebug
117
+
106
118
  # genome_db = prepare_fasta(tool.genome,'nucl')
107
119
  $protk.log "Preparing FASTA index", :info
108
120
  prot_db = prepare_fasta(tool.database,'prot')
@@ -157,14 +169,22 @@ proteins.each do |protein|
157
169
  rescue ProteinNotInDBError
158
170
  $protk.log "No entry for #{parsed_name_for_protdb}", :info
159
171
  rescue MultipleGFFEntriesForProteinError
160
- $protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :info
161
- rescue PeptideNotInProteinError
162
- $protk.log "A peptide was not found in its parent protein #{protein.protein_name}" , :warn
172
+ $protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :warn
173
+ # require 'byebug';byebug
174
+ # puts gff_parent_entries
175
+ rescue PeptideNotInProteinError => e
176
+ # This is generally not fatal. It can happen because of Leucine Isoleucine issues
177
+ #
178
+ $protk.log "#{e.message}. Parent protein ID #{protein.protein_name}" , :info
179
+ # require 'byebug';byebug
180
+ # puts protein.protein_name
181
+ rescue GFFIDRegexNotMatchedError => e
182
+ $protk.log e.message, :info
163
183
  end
164
184
  end
165
185
  end
166
186
 
167
187
  if num_missing_gff_entries>0
168
- $protk.log "Failed to lookup gff entries. Try setting --gff-idregex" if tool.gff_idregex.nil?
188
+ $protk.log "Failed to lookup gff entries. Try setting --gff-idregex", :error if tool.gff_idregex.nil?
169
189
  end
170
190
 
@@ -0,0 +1,7 @@
1
+
2
+ class ProtkError < StandardError
3
+ attr_accessor :message
4
+ def initialize(message)
5
+ @message=message
6
+ end
7
+ end
@@ -23,6 +23,30 @@ class GalaxyUtil
23
23
  end
24
24
 
25
25
 
26
+ # Galaxy changes things like @ to __at__ we need to change it back
27
+ #
28
+ def self.decode_galaxy_string!(mstring)
29
+ mstring.gsub!("__at__","@")
30
+ mstring.gsub!("__oc__","{")
31
+ mstring.gsub!("__cc__","}")
32
+ mstring.gsub!("__ob__","[")
33
+ mstring.gsub!("__cb__","]")
34
+ mstring.gsub!("__gt__",">")
35
+ mstring.gsub!("__lt__","<")
36
+ mstring.gsub!("__sq__","'")
37
+ mstring.gsub!("__dq__","\"")
38
+ mstring.gsub!("__cn__","\n")
39
+ mstring.gsub!("__cr__","\r")
40
+ mstring.gsub!("__tc__","\t")
41
+ mstring.gsub!("__pd__","#")
42
+
43
+ # For characters not allowed at all by galaxy
44
+ mstring.gsub!("__pc__","|")
45
+
46
+ mstring
47
+ end
48
+
49
+
26
50
  # Unused
27
51
 
28
52
  # def self.stage_protxml(input_protxml_path)
@@ -1,9 +1,11 @@
1
1
  require 'libxml'
2
2
  require 'bio'
3
3
  require 'protk/bio_gff3_extensions'
4
+ require 'protk/error'
5
+
4
6
  include LibXML
5
7
 
6
- class PeptideNotInProteinError < StandardError
8
+ class PeptideNotInProteinError < ProtkError
7
9
  end
8
10
 
9
11
  class Peptide
@@ -43,11 +45,11 @@ class Peptide
43
45
  def coords_in_protein(prot_seq,reverse=false)
44
46
  if reverse
45
47
  pep_index = prot_seq.reverse.index(self.sequence.reverse)
46
- raise PeptideNotInProteinError if pep_index.nil?
48
+ raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_index.nil?
47
49
  pep_start_i = pep_index
48
50
  else
49
51
  pep_start_i = prot_seq.index(self.sequence)
50
- raise PeptideNotInProteinError if pep_start_i.nil?
52
+ raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_start_i.nil?
51
53
  end
52
54
  pep_end_i = pep_start_i+self.sequence.length
53
55
  {:start => pep_start_i,:end => pep_end_i}
@@ -91,31 +93,26 @@ class Peptide
91
93
  pep_end_i = pep_start_i+self.sequence.length*3
92
94
  fragments=[]
93
95
  ordered_cds_records.each do |cds_record|
94
- # puts cds_record
96
+
95
97
  fragment = nil
96
98
  fragment_len = 0
97
99
  if on_reverse_strand
98
100
 
99
101
  in_peptide = (i<pep_end_i) && (i>=pep_start_i)
100
102
  before_len = [pep_start_i-i,0].max
101
- # puts before_len
102
- # puts in_peptide
103
- # puts "i #{i} pi #{pep_end_i} psi #{pep_start_i}"
104
- if in_peptide
105
103
 
104
+ if in_peptide
106
105
  fragment_end = cds_record.end
107
106
  fragment_len = [cds_record.length,pep_end_i-i].min
108
107
  fragment_start = fragment_end-fragment_len+1
109
- # fragment = {:start=>fragment_start,:end=>fragment_end}
110
108
  fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
111
-
112
109
  elsif before_len>0
113
110
  fragment_end = cds_record.end - before_len
114
111
  fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
115
- # puts "Frag len #{fragment_len}"
116
112
  fragment_start = fragment_end - fragment_len + 1
117
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
118
- # fragment = {:start=>fragment_start,:end=>fragment_end}
113
+ if fragment_len>0
114
+ fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
115
+ end
119
116
  else
120
117
  fragment=nil
121
118
  end
@@ -126,14 +123,14 @@ class Peptide
126
123
  fragment_start = cds_record.start
127
124
  fragment_len = [cds_record.length,pep_end_i-i].min
128
125
  fragment_end = fragment_start+fragment_len-1
129
- # fragment = {:start=>fragment_start,:end=>fragment_end}
130
126
  fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
131
127
  elsif before_len>0
132
128
  fragment_start = cds_record.start + before_len
133
129
  fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
134
130
  fragment_end = fragment_start + fragment_len-1
135
- # fragment = {:start=>fragment_start,:end=>fragment_end}
136
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
131
+ if fragment_len>0
132
+ fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
133
+ end
137
134
  else
138
135
  fragment=nil
139
136
  end
@@ -1,4 +1,5 @@
1
1
  require 'protk/search_tool'
2
+ require 'protk/galaxy_util'
2
3
 
3
4
  class String
4
5
  def xtandem_modification_motif?
@@ -70,28 +71,6 @@ class TandemSearchTool < SearchTool
70
71
  end
71
72
 
72
73
  private
73
- # Galaxy changes things like @ to __at__ we need to change it back
74
- #
75
- def decode_galaxy_string(mstring)
76
- mstring.gsub!("__at__","@")
77
- mstring.gsub!("__oc__","{")
78
- mstring.gsub!("__cc__","}")
79
- mstring.gsub!("__ob__","[")
80
- mstring.gsub!("__cb__","]")
81
- mstring.gsub!("__gt__",">")
82
- mstring.gsub!("__lt__","<")
83
- mstring.gsub!("__sq__","'")
84
- mstring.gsub!("__dq__","\"")
85
- mstring.gsub!("__cn__","\n")
86
- mstring.gsub!("__cr__","\r")
87
- mstring.gsub!("__tc__","\t")
88
- mstring.gsub!("__pd__","#")
89
-
90
- # For characters not allowed at all by galaxy
91
- mstring.gsub!("__pc__","|")
92
-
93
- mstring
94
- end
95
74
 
96
75
  def set_option(std_params, tandem_key, value)
97
76
  notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
@@ -180,7 +159,7 @@ class TandemSearchTool < SearchTool
180
159
  if opt_val.is_a?(TrueClass) || opt_val.is_a?(FalseClass)
181
160
  opt_val = opt_val ? "yes" : "no"
182
161
  end
183
- append_option(std_params,xtandem_key,decode_galaxy_string(opt_val.to_s))
162
+ append_option(std_params,xtandem_key,GalaxyUtil.decode_galaxy_string!(opt_val.to_s))
184
163
  end
185
164
  end
186
165
 
@@ -208,7 +187,7 @@ class TandemSearchTool < SearchTool
208
187
  #
209
188
 
210
189
  var_mods = self.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
211
- var_mods=var_mods.collect {|mod| decode_galaxy_string(mod) }
190
+ var_mods=var_mods.collect {|mod| GalaxyUtil.decode_galaxy_string!(mod) }
212
191
 
213
192
  # var_mods allows motif's as well as standard mods. These should be in a separate array
214
193
  var_motifs = [].replace(var_mods)
@@ -216,7 +195,7 @@ class TandemSearchTool < SearchTool
216
195
  var_motifs.keep_if {|mod| mod.xtandem_modification_motif? }
217
196
 
218
197
  fix_mods = self.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
219
- fix_mods=fix_mods.collect {|mod| decode_galaxy_string(mod)}
198
+ fix_mods=fix_mods.collect {|mod| GalaxyUtil.decode_galaxy_string!(mod)}
220
199
 
221
200
  # We also support the --glyco and --methionineo shortcuts.
222
201
  # Add these here. No check is made for duplication
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ira Cooke
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-01 00:00:00.000000000 Z
11
+ date: 2015-03-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: open4
@@ -266,6 +266,7 @@ files:
266
266
  - lib/protk/data/uniprot_accessions_table.txt
267
267
  - lib/protk/data/uniprot_input_accessions.loc
268
268
  - lib/protk/data/yum_packages.yaml
269
+ - lib/protk/error.rb
269
270
  - lib/protk/fastadb.rb
270
271
  - lib/protk/galaxy_stager.rb
271
272
  - lib/protk/galaxy_util.rb