protk 1.3.0 → 1.3.1.pre2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
1
+
2
+ module ProteinToGenomeMapper
3
+
4
+ # gene_seq should already have been reverse_complemented if on reverse strand
5
+
6
+
7
+
8
+ end
@@ -10,6 +10,8 @@ class ProtXMLToGFFTool < Tool
10
10
 
11
11
  add_value_option(:database,nil,['-d filename','--database filename','Database used for ms/ms searches (Fasta Format)'])
12
12
  add_value_option(:genome,nil,['-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)'])
13
+ add_value_option(:coords_file,nil,['-c filename','--coords-file filename.gff3', 'If genomic coordinates are not encoded in protein db entries look them up from a supplied gff file'])
14
+ # add_value_option(:contig_regex,nil,['--contig-regex expression','Regular expression with a single capture group to get contig ids from protein ids'])
13
15
  add_value_option(:protein_find,nil,['-f term','--find term', 'Restrict output to proteins whose name matches the specified string'])
14
16
  add_value_option(:nterm_minlen,7,['-n len','--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len'])
15
17
  add_boolean_option(:skip_fasta_indexing,false,['--skip-index','Don\'t index database (Index should already exist)'])
@@ -317,7 +319,7 @@ class ProtXMLToGFFTool < Tool
317
319
 
318
320
  check_seq = protein_info.strand=='-' ? concat_seq.reverse_complement.translate : concat_seq.translate
319
321
  if ( check_seq != peptide_seq)
320
- require 'debugger';debugger
322
+ # require 'debugger';debugger
321
323
  puts "Fragment seqs not equal to peptide seqs"
322
324
  end
323
325
 
@@ -13,15 +13,6 @@ require 'optparse'
13
13
  require 'pathname'
14
14
  require 'protk/tool'
15
15
 
16
- class FastaDatabase
17
- attr :name
18
- attr :path
19
- def initialize(name,path)
20
- @name=name
21
- @path=path
22
- end
23
- end
24
-
25
16
  class SearchTool < Tool
26
17
 
27
18
  # Initializes commandline options common to all search tools.
@@ -30,9 +21,9 @@ class SearchTool < Tool
30
21
  def initialize(option_support=[])
31
22
  super(option_support)
32
23
 
33
- if (option_support.include? :database)
34
- add_value_option(:database,"sphuman",['-d', '--database dbname', 'Specify the database to use for this search. Can be a named protk database or the path to a fasta file'])
35
- end
24
+ # if (option_support.include? :database)
25
+ # add_value_option(:database,"sphuman",['-d', '--database dbname', 'Specify the database to use for this search. Can be a named protk database or the path to a fasta file'])
26
+ # end
36
27
 
37
28
  if ( option_support.include? :enzyme )
38
29
  add_value_option(:enzyme,"Trypsin",['--enzyme enz', 'Enzyme'])
@@ -115,18 +106,6 @@ class SearchTool < Tool
115
106
  end
116
107
 
117
108
 
118
- def database_info
119
- case
120
- when Pathname.new(@options.database).exist? # It's an explicitly named db
121
- db_path=Pathname.new(@options.database).expand_path.to_s
122
- db_name=Pathname.new(@options.database).basename.to_s
123
- else
124
- db_path=Constants.new.current_database_for_name @options.database
125
- db_name=@options.database
126
- end
127
- FastaDatabase.new(db_name,db_path)
128
- end
129
-
130
109
  end
131
110
 
132
111
 
@@ -7,27 +7,18 @@ require 'pathname'
7
7
  #
8
8
  class SwissprotDatabase
9
9
 
10
- def initialize(env=nil,database="swissprot")
11
- if ( env!=nil)
12
- @genv=env
13
- else
14
- @genv=Constants.new
15
- end
10
+ def initialize(datfile_path,skip_indexing=false)
16
11
 
12
+ dbpath=Pathname.new(datfile_path)
13
+ dbclass=Bio::SPTR
17
14
 
18
- dbpath=Pathname.new(database)
19
-
20
- if ( dbpath.exist? )
21
- # require 'debugger';debugger
22
- dbclass=Bio::SPTR
15
+ unless skip_indexing
23
16
  parser = Bio::FlatFileIndex::Indexer::Parser.new(dbclass, nil, nil)
24
- Bio::FlatFileIndex::Indexer::makeindexFlat(dbpath.realpath.dirname.to_s, parser, {}, dbpath.realpath.to_s)
25
- @db_object=Bio::FlatFileIndex.new("#{dbpath.realpath.dirname.to_s}")
26
- elsif ( database=="swissprot")
27
- @db_object=Bio::FlatFileIndex.new("#{@genv.protein_database_root}/#{@genv.uniprot_sprot_annotation_database}")
28
- else
29
- @db_object=Bio::FlatFileIndex.new("#{@genv.protein_database_root}/#{@genv.uniprot_trembl_annotation_database}")
17
+ Bio::FlatFileIndex::Indexer::makeindexFlat(dbpath.realpath.dirname.to_s, parser, {}, \
18
+ dbpath.realpath.to_s)
30
19
  end
20
+
21
+ @db_object=Bio::FlatFileIndex.new("#{dbpath.realpath.dirname.to_s}")
31
22
 
32
23
  @db_object.always_check_consistency=false
33
24
  end
@@ -36,9 +27,6 @@ class SwissprotDatabase
36
27
  def get_entry_for_name(name)
37
28
  result=@db_object.get_by_id(name)
38
29
  if result==""
39
- if ( @genv!=nil)
40
- @genv.log("Failed to find UniProt entry for protein named #{name} in database",:warn)
41
- end
42
30
  return nil
43
31
  else
44
32
  Bio::SPTR.new(result)
data/lib/protk/tool.rb CHANGED
@@ -10,6 +10,17 @@ require 'optparse'
10
10
  require 'pathname'
11
11
  require 'protk/command_runner'
12
12
 
13
+
14
+ class FastaDatabase
15
+ attr :name
16
+ attr :path
17
+ def initialize(name,path)
18
+ @name=name
19
+ @path=path
20
+ end
21
+ end
22
+
23
+
13
24
  class Tool
14
25
 
15
26
  # Options set from the command-line
@@ -108,7 +119,13 @@ class Tool
108
119
  add_value_option(:threads,1,['-n','--threads num','Number of processing threads to use. Set to 0 to autodetect an appropriate value'])
109
120
  end
110
121
 
122
+ if ( option_support.include? :database)
123
+ add_value_option(:database,"sphuman",['-d', '--database dbname', 'Specify the database to use for this search. Can be a named protk database or the path to a fasta file'])
124
+ end
111
125
 
126
+ if (option_support.include? :debug)
127
+ add_boolean_option(:debug,false,['--debug','Run in debug mode'])
128
+ end
112
129
 
113
130
  end
114
131
 
@@ -147,6 +164,10 @@ class Tool
147
164
  # Checking for required options
148
165
  begin
149
166
  self.option_parser.parse!
167
+
168
+ if has_override
169
+ return true
170
+ end
150
171
  missing = mandatory.select{ |param| self.send(param).nil? }
151
172
  if not missing.empty?
152
173
  puts "Missing options: #{missing.join(', ')}"
@@ -175,5 +196,19 @@ class Tool
175
196
  cmd_runner.run_local(cmd)
176
197
  end
177
198
 
178
-
199
+
200
+ def database_info
201
+ case
202
+ when Pathname.new(@options.database).exist? # It's an explicitly named db
203
+ db_path=Pathname.new(@options.database).expand_path.to_s
204
+ db_name=Pathname.new(@options.database).basename.to_s
205
+ else
206
+ db_path=Constants.new.current_database_for_name @options.database
207
+ db_name=@options.database
208
+ end
209
+ FastaDatabase.new(db_name,db_path)
210
+ end
211
+
212
+
213
+
179
214
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.3.1.pre2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ira Cooke
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-24 00:00:00.000000000 Z
11
+ date: 2014-10-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: open4
@@ -152,6 +152,26 @@ dependencies:
152
152
  - - ~>
153
153
  - !ruby/object:Gem::Version
154
154
  version: '0'
155
+ - !ruby/object:Gem::Dependency
156
+ name: debugger
157
+ requirement: !ruby/object:Gem::Requirement
158
+ requirements:
159
+ - - ~>
160
+ - !ruby/object:Gem::Version
161
+ version: '1.6'
162
+ - - '>='
163
+ - !ruby/object:Gem::Version
164
+ version: 1.6.0
165
+ type: :development
166
+ prerelease: false
167
+ version_requirements: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - ~>
170
+ - !ruby/object:Gem::Version
171
+ version: '1.6'
172
+ - - '>='
173
+ - !ruby/object:Gem::Version
174
+ version: 1.6.0
155
175
  - !ruby/object:Gem::Dependency
156
176
  name: sqlite3
157
177
  requirement: !ruby/object:Gem::Requirement
@@ -190,11 +210,44 @@ executables:
190
210
  - augustus_to_proteindb.rb
191
211
  - protxml_to_gff.rb
192
212
  - protxml_to_table.rb
213
+ - swissprot_to_table.rb
214
+ - protxml_to_psql.rb
193
215
  extensions:
194
216
  - ext/decoymaker/extconf.rb
195
217
  extra_rdoc_files: []
196
218
  files:
197
- - README.md
219
+ - lib/protk/bio_gff3_extensions.rb
220
+ - lib/protk/bio_sptr_extensions.rb
221
+ - lib/protk/command_runner.rb
222
+ - lib/protk/constants.rb
223
+ - lib/protk/convert_util.rb
224
+ - lib/protk/data/make_uniprot_table.rb
225
+ - lib/protk/fastadb.rb
226
+ - lib/protk/galaxy_stager.rb
227
+ - lib/protk/galaxy_util.rb
228
+ - lib/protk/gffdb.rb
229
+ - lib/protk/manage_db_tool.rb
230
+ - lib/protk/mascot_util.rb
231
+ - lib/protk/mzml_parser.rb
232
+ - lib/protk/omssa_util.rb
233
+ - lib/protk/openms_defaults.rb
234
+ - lib/protk/peptide.rb
235
+ - lib/protk/pepxml.rb
236
+ - lib/protk/plasmodb.rb
237
+ - lib/protk/prophet_tool.rb
238
+ - lib/protk/protein.rb
239
+ - lib/protk/protein_to_genome_mapper.rb
240
+ - lib/protk/protxml_to_gff_tool.rb
241
+ - lib/protk/randomize.rb
242
+ - lib/protk/search_tool.rb
243
+ - lib/protk/setup_tool.rb
244
+ - lib/protk/swissprot_database.rb
245
+ - lib/protk/tandem_search_tool.rb
246
+ - lib/protk/tool.rb
247
+ - lib/protk/uniprot_mapper.rb
248
+ - lib/protk.rb
249
+ - lib/protk/manage_db_rakefile.rake
250
+ - lib/protk/setup_rakefile.rake
198
251
  - bin/add_retention_times.rb
199
252
  - bin/augustus_to_proteindb.rb
200
253
  - bin/interprophet.rb
@@ -213,25 +266,19 @@ files:
213
266
  - bin/protxml_to_table.rb
214
267
  - bin/repair_run_summary.rb
215
268
  - bin/sixframe.rb
269
+ - bin/swissprot_to_table.rb
216
270
  - bin/tandem_search.rb
217
271
  - bin/tandem_to_pepxml.rb
218
272
  - bin/unimod_to_loc.rb
219
273
  - bin/uniprot_mapper.rb
220
- - ext/decoymaker/decoymaker.c
221
- - ext/decoymaker/extconf.rb
222
- - lib/protk.rb
223
- - lib/protk/bio_sptr_extensions.rb
224
- - lib/protk/command_runner.rb
225
- - lib/protk/constants.rb
226
- - lib/protk/convert_util.rb
227
- - lib/protk/data/ExecutePipeline.trf
228
- - lib/protk/data/FeatureFinderCentroided.ini
229
- - lib/protk/data/FeatureFinderIsotopeWavelet.ini
274
+ - README.md
230
275
  - lib/protk/data/apt-get_packages.yaml
231
276
  - lib/protk/data/brew_packages.yaml
232
277
  - lib/protk/data/default_config.yml
278
+ - lib/protk/data/ExecutePipeline.trf
279
+ - lib/protk/data/FeatureFinderCentroided.ini
280
+ - lib/protk/data/FeatureFinderIsotopeWavelet.ini
233
281
  - lib/protk/data/galaxyenv.sh
234
- - lib/protk/data/make_uniprot_table.rb
235
282
  - lib/protk/data/predefined_db.crap.yaml
236
283
  - lib/protk/data/predefined_db.sphuman.yaml
237
284
  - lib/protk/data/predefined_db.swissprot_annotation.yaml
@@ -246,30 +293,11 @@ files:
246
293
  - lib/protk/data/uniprot_accessions_table.txt
247
294
  - lib/protk/data/uniprot_input_accessions.loc
248
295
  - lib/protk/data/yum_packages.yaml
249
- - lib/protk/fastadb.rb
250
- - lib/protk/galaxy_stager.rb
251
- - lib/protk/galaxy_util.rb
252
- - lib/protk/manage_db_rakefile.rake
253
- - lib/protk/manage_db_tool.rb
254
- - lib/protk/mascot_util.rb
255
- - lib/protk/mzml_parser.rb
256
- - lib/protk/omssa_util.rb
257
- - lib/protk/openms_defaults.rb
258
- - lib/protk/pepxml.rb
259
- - lib/protk/plasmodb.rb
260
- - lib/protk/prophet_tool.rb
261
- - lib/protk/protxml.rb
262
- - lib/protk/protxml_to_gff_tool.rb
263
- - lib/protk/randomize.rb
264
- - lib/protk/search_tool.rb
265
- - lib/protk/setup_rakefile.rake
266
- - lib/protk/setup_tool.rb
267
- - lib/protk/swissprot_database.rb
268
- - lib/protk/tandem_search_tool.rb
269
- - lib/protk/tool.rb
270
- - lib/protk/uniprot_mapper.rb
296
+ - ext/decoymaker/decoymaker.c
297
+ - ext/decoymaker/extconf.rb
271
298
  homepage: http://rubygems.org/gems/protk
272
- licenses: []
299
+ licenses:
300
+ - LGPL-2.1
273
301
  metadata: {}
274
302
  post_install_message: Now run protk_setup.rb to install third party tools
275
303
  rdoc_options: []
@@ -282,14 +310,13 @@ required_ruby_version: !ruby/object:Gem::Requirement
282
310
  version: '0'
283
311
  required_rubygems_version: !ruby/object:Gem::Requirement
284
312
  requirements:
285
- - - '>='
313
+ - - '>'
286
314
  - !ruby/object:Gem::Version
287
- version: '0'
315
+ version: 1.3.1
288
316
  requirements: []
289
317
  rubyforge_project:
290
- rubygems_version: 2.2.1
318
+ rubygems_version: 2.0.14
291
319
  signing_key:
292
320
  specification_version: 4
293
321
  summary: Proteomics Toolkit
294
322
  test_files: []
295
- has_rdoc:
data/lib/protk/protxml.rb DELETED
@@ -1,141 +0,0 @@
1
- require 'rubygems'
2
- require 'rexml/document'
3
- require 'rexml/xpath'
4
-
5
- class ProtXML
6
-
7
- attr_accessor :groups
8
-
9
-
10
- def indistinguishable_proteins_from_protein(protein_element)
11
- iprots=[]
12
- REXML::XPath.each(protein_element,"./indistinguishable_protein") do |ipel|
13
- ipel_attributes={}
14
- ipel.attributes.each_attribute { |att| ipel_attributes[att.expanded_name.to_sym]=att.value }
15
- iprots.push(ipel_attributes[:protein_name])
16
- end
17
- iprots
18
- end
19
-
20
- def peptides_from_protein(protein_element)
21
- peptides=[]
22
- REXML::XPath.each(protein_element,"./peptide") do |pel|
23
- peptide={}
24
-
25
- pel.attributes.each_attribute { |att| peptide[att.expanded_name.to_sym]=att.value }
26
- modifications=pel.get_elements("./modification_info")
27
- mods=modifications.collect {|mp| mp.attribute("modified_peptide").value }
28
- peptide[:modifications] = mods
29
- peptides.push(peptide)
30
- end
31
- peptides
32
- end
33
-
34
- def proteins_from_group(group_element)
35
- proteins=[]
36
- REXML::XPath.each(group_element,"./protein") do |pel|
37
- protein={}
38
- pel.attributes.each_attribute { |att| protein[att.expanded_name.to_sym]=att.value }
39
- protein[:peptides]=peptides_from_protein(pel)
40
- protein[:indistinguishable_prots]=indistinguishable_proteins_from_protein(pel)
41
- proteins.push(protein)
42
- end
43
- proteins
44
- end
45
-
46
- def init_groups
47
- @groups=[]
48
- REXML::XPath.each(@doc.root,"//protein_group") do |gel|
49
- group={}
50
- group[:group_probability]=gel.attributes["probability"].to_f
51
- group[:proteins]=proteins_from_group(gel)
52
- groups.push group
53
- end
54
- @groups
55
- end
56
-
57
-
58
- def initialize(file_name)
59
- @doc=REXML::Document.new(File.new(file_name))
60
- @groups=self.init_groups
61
- end
62
-
63
- def find_pep_xml()
64
- header = REXML::XPath.first(@doc, "//protein_summary_header")
65
- source_file = header.attributes["source_files"]
66
- end
67
-
68
- def peptide_sequences_from_protein(prot)
69
- peptides=prot[:peptides]
70
- sequences=[]
71
- peptides.each do |pep|
72
- if ( pep[:modifications].length > 0 )
73
- pep[:modifications].each {|pmod|
74
- sequences.push(pmod) }
75
- else
76
- sequences.push(pep[:peptide_sequence])
77
- end
78
- end
79
- sequences
80
- end
81
-
82
- def protein_to_row(prot)
83
- protein_row=[]
84
- protein_row.push(prot[:protein_name])
85
- protein_row.push(prot[:probability])
86
-
87
- indistinct=prot[:indistinguishable_prots]
88
- indist_string="#{prot[:protein_name]};"
89
- indistinct.each { |pr| indist_string<<"#{pr};"}
90
- indist_string.chop!
91
- protein_row.push(indist_string)
92
-
93
- protein_row.push(prot[:peptides].length)
94
-
95
- peptide_string=""
96
- peptide_sequences_from_protein(prot).each {|pep| peptide_string<<"#{pep};" }
97
- peptide_string.chop!
98
-
99
- protein_row.push(peptide_string)
100
- protein_row
101
- end
102
-
103
- # Convert the entire prot.xml document to row format
104
- # Returns an array of arrays. Each of the sub-arrays is a row.
105
- # Each row should contain a simple summary of the protein.
106
- # A separate row should be provided for every protein (including indistinguishable ones)
107
- # The first row will be the header
108
- #
109
- # Proteins with probabilities below a threshold are excluded
110
- #
111
- def as_rows(threshold_probability)
112
-
113
- rows=[]
114
- rows.push(["Accession","Probability","Indistinguishable Proteins","Num Peptides","Peptides"])
115
-
116
- proteins=[]
117
- @groups.each do |grp|
118
- grp[:proteins].each {|prot|
119
- if ( prot[:probability].to_f >= threshold_probability)
120
- proteins.push(prot)
121
- end
122
- }
123
- end
124
-
125
- proteins.each do |prot|
126
- protein_row=protein_to_row(prot)
127
- rows.push(protein_row)
128
-
129
- indistinguishables=prot[:indistinguishable_prots]
130
- indistinguishables.each do |indist|
131
- indist_row=protein_row.clone
132
- indist_row[0]=indist
133
- rows.push(indist_row)
134
- end
135
-
136
- end
137
-
138
- rows
139
- end
140
-
141
- end