protk 1.3.0.pre3 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 704282a21d38fd8d3a536fbbba9ab90eabd54355
4
- data.tar.gz: d325658a001939d222bfb7c942836df25ae9790b
3
+ metadata.gz: 5e8f8a571cb43ed61984a34b6e1fb51caf979593
4
+ data.tar.gz: b53857f75c1ff6ca850859c3985aee36533e437f
5
5
  SHA512:
6
- metadata.gz: f9902d7d48b5171470b073ab94089481f0a9125d0e65e4a33b600ed86cf622bafa774335a0c33520e955d49dd0f195ed7ae82fcab6ad18bf35a42dafe030aea3
7
- data.tar.gz: 253cabdf8bfcf009516cc2675ebe12bb9b43ec15515625eabd3701c0761be9f3fc78dcc58dc731c7f89f642f8c11cf1bd8889c5f68a565cda0d04de06fb273c3
6
+ metadata.gz: 9450fccc4a5ce59f064927d62fbc6a4342a1710c3b82707e0908dea52af7d0b50f215e64073bb067a506d204701acea11b6d28f302447494b8a30b1e7af2df2d
7
+ data.tar.gz: 1b8bc78fc09b4c81eee72fad169a6aee7145a16312c01bc95a5dd590f08cb98194b26115a166a759b9c52c7c67204a767747642e5e9331de4562d52f31eb1e11
@@ -36,7 +36,6 @@ input_file=ARGV[0]
36
36
  db_length=tool.db_length
37
37
  if ( db_length==0) #If no db length was specified use the number of entries in the input file
38
38
  db_length=Bio::FastaFormat.open(input_file).count
39
- puts "Found #{db_length} entries in input file"
40
39
  end
41
40
 
42
41
  output_file = tool.explicit_output if tool.explicit_output!=nil
@@ -65,6 +64,7 @@ end
65
64
 
66
65
  if ( tool.append )
67
66
  cmd ="awk 'FNR==1{print \"\"}1' #{input_file} #{decoys_tmp_file} > #{output_file};"
67
+ cmd << "sed -i.bak '/^$/d' #{output_file};"
68
68
  cmd << "rm #{decoys_tmp_file}"
69
69
  else
70
70
  cmd = "mv #{decoys_tmp_file} #{output_file}"
@@ -83,16 +83,17 @@ database_path=db_info.path
83
83
 
84
84
  # Database must have fasta extension
85
85
  if Pathname.new(database_path).extname.to_s.downcase != ".fasta"
86
- make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
86
+ File.symlink(database_path,"#{database_path}.fasta") unless File.exists?("#{database_path}.fasta")
87
+ # make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
87
88
  database_path="#{database_path}.fasta"
88
- db_info.path=database_path
89
+ database_path
89
90
  end
90
91
 
91
92
  # Database must be indexed
92
93
  unless FileTest.exists?("#{database_path}.canno")
93
- dbdir = Pathname.new(database_path).dirname.realpath.to_s
94
+ dbdir = Pathname.new(database_path).dirname.to_s
94
95
  tdavalue=search_tool.decoy_search ? 1 : 0;
95
- make_msgfdb_cmd << "cd #{dbdir}; java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
96
+ make_msgfdb_cmd << "java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
96
97
  end
97
98
 
98
99
 
@@ -214,7 +215,6 @@ ARGV.each do |filename|
214
215
  else
215
216
  cmd << "; mv #{mzid_output_path} #{output_path}"
216
217
  end
217
-
218
218
 
219
219
  # Up to here we've formulated the command. The rest is cleanup
220
220
  p "Running:#{cmd}"
@@ -12,8 +12,6 @@ require 'protk/command_runner'
12
12
  require 'protk/search_tool'
13
13
  require 'protk/galaxy_util'
14
14
 
15
- for_galaxy = GalaxyUtil.for_galaxy?
16
-
17
15
  # Setup specific command-line options for this tool. Other options are inherited from SearchTool
18
16
  #
19
17
  search_tool=SearchTool.new([
@@ -94,22 +92,6 @@ ARGV.each do |filename|
94
92
  #
95
93
  cmd << " -v #{search_tool.missed_cleavages}"
96
94
 
97
- # If this is for Galaxy and a data directory has been specified
98
- # look for a common unimod.xml file.
99
- if for_galaxy
100
- galaxy_index_dir = search_tool.galaxy_index_dir
101
- if galaxy_index_dir
102
- galaxy_mods = File.join(galaxy_index_dir, "mods.xml")
103
- if( FileTest.exists?(galaxy_mods) )
104
- cmd << " -mx #{galaxy_mods}"
105
- end
106
- galaxy_usermods = File.join(galaxy_index_dir, "usermods.xml")
107
- if( FileTest.exists?(galaxy_usermods) )
108
- cmd << " -mux #{galaxy_usermods}"
109
- end
110
- end
111
- end
112
-
113
95
  if ( search_tool.omx_output )
114
96
  cmd << " -ox #{search_tool.omx_output} "
115
97
  end
@@ -51,10 +51,11 @@ throw "When --output and -F options are set only one file at a time can be run"
51
51
  # Obtain a global environment object
52
52
  genv=Constants.new
53
53
 
54
-
54
+ input_stagers=[]
55
55
  inputs=ARGV.collect { |file_name| file_name.chomp}
56
56
  if for_galaxy
57
- inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
57
+ input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
58
+ inputs=input_stagers.collect { |sg| sg.staged_path }
58
59
  end
59
60
 
60
61
  # Interrogate all the input files to obtain the database and search engine from them
@@ -212,7 +213,13 @@ else
212
213
 
213
214
  cmd=generate_command(genv,prophet_tool,inputs,output_file_name,database,engine)
214
215
  run_peptide_prophet(genv,prophet_tool,cmd,output_file_name,engine)
215
-
216
+
216
217
  end
217
218
 
219
+ if (for_galaxy)
220
+ input_stagers.each do |sg|
221
+ sg.restore_references(output_file_name)
222
+ sg.restore_references(output_file_name,{:base_only => true})
223
+ end
224
+ end
218
225
 
@@ -40,7 +40,13 @@ exit unless prophet_tool.check_options(true)
40
40
  # Obtain a global environment object
41
41
  genv=Constants.new
42
42
 
43
- inputs = ARGV.collect {|file_name| file_name.chomp }
43
+ input_stagers=[]
44
+ inputs=ARGV.collect { |file_name| file_name.chomp}
45
+ if for_galaxy
46
+ input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
47
+ inputs=input_stagers.collect { |sg| sg.staged_path }
48
+ end
49
+
44
50
 
45
51
  if ( prophet_tool.explicit_output )
46
52
  output_file=prophet_tool.explicit_output
@@ -52,11 +58,6 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
52
58
 
53
59
  cmd="ProteinProphet "
54
60
 
55
- if for_galaxy
56
- inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
57
- end
58
-
59
-
60
61
  cmd << " #{inputs.join(" ")} #{output_file}"
61
62
 
62
63
  if ( prophet_tool.glyco )
@@ -71,11 +72,13 @@ else
71
72
  genv.log("Protein Prophet output file #{output_file} already exists. Run with -r option to replace",:warn)
72
73
  end
73
74
 
74
- # if for_galaxy
75
- # Restore references to peptide prophet xml so downstream tools like
76
- # libra can find it.
77
- # input_stager.restore_references("protein_prophet_results.prot.xml")
78
- # end
75
+
76
+ if (for_galaxy)
77
+ input_stagers.each do |sg|
78
+ sg.restore_references(output_file)
79
+ sg.restore_references(output_file,{:base_only => true})
80
+ end
81
+ end
79
82
 
80
83
 
81
84
 
@@ -0,0 +1,399 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 18/1/2011
5
+ #
6
+ # Convert a protein/peptide xml file to sqlite database
7
+ #
8
+ #
9
+
10
+ require 'libxml'
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/tool'
14
+ require 'protk/fastadb'
15
+ require 'sqlite3'
16
+ require 'protk/mzml_parser'
17
+
18
+ include LibXML
19
+
20
+ def prepare_fasta(database_path,type)
21
+
22
+ db_filename = nil
23
+ case
24
+ when Pathname.new(database_path).exist? # It's an explicitly named db
25
+ db_filename = Pathname.new(database_path).expand_path.to_s
26
+ else
27
+ db_filename=Constants.new.current_database_for_name(database_path)
28
+ end
29
+
30
+ db_indexfilename = "#{db_filename}.pin"
31
+
32
+ if File.exist?(db_indexfilename)
33
+ puts "Using existing indexed database"
34
+ orf_lookup = FastaDB.new(db_filename)
35
+ else
36
+ puts "Indexing database"
37
+ orf_lookup = FastaDB.create(db_filename,db_filename,type)
38
+ end
39
+ orf_lookup
40
+ end
41
+
42
+ def get_fasta_record(protein_name,fastadb)
43
+ # puts "Looking up #{protein_name}"
44
+ entry = fastadb.get_by_id protein_name
45
+ if ( entry == nil)
46
+ puts "Failed lookup for #{protein_name}"
47
+ raise KeyError
48
+ end
49
+ entry
50
+ end
51
+
52
+ def initialize_db()
53
+ result = $outputdb.execute <<-SQL
54
+ CREATE TABLE ProteinGroups (
55
+ ID INT,
56
+ Probability REAL
57
+ );
58
+ SQL
59
+
60
+ result = $outputdb.execute <<-SQL
61
+ CREATE TABLE Proteins (
62
+ ID INT,
63
+ ProteinGroupID INT,
64
+ Probability REAL,
65
+ Name TEXT,
66
+ Description TEXT,
67
+ Coverage REAL,
68
+ NumPeptides INT,
69
+ Indistinguishables TEXT,
70
+ Sequence TEXT
71
+ );
72
+ SQL
73
+
74
+ result = $outputdb.execute <<-SQL
75
+ CREATE TABLE Peptides (
76
+ ID INT,
77
+ ProteinID INT,
78
+ Probability REAL,
79
+ Sequence TEXT,
80
+ Start INT,
81
+ End INT,
82
+ ModifiedSequence TEXT
83
+ );
84
+ SQL
85
+
86
+ # This has the role of a join table for the Peptides <-> Spectra many to many relationship
87
+ result = $outputdb.execute <<-SQL
88
+ CREATE TABLE PeptideSpectrumMatches (
89
+ PeptideSequence TEXT,
90
+ PeptideModifiedSequence TEXT,
91
+ SpectrumID INT,
92
+ ScanNum INT,
93
+ RetentionTime REAL,
94
+ PrecursorNeutralMass REAL,
95
+ MassDeviation REAL,
96
+ PrevAA TEXT,
97
+ NextAA TEXT
98
+ );
99
+ SQL
100
+
101
+ result = $outputdb.execute <<-SQL
102
+ CREATE TABLE Spectra (
103
+ ID INTEGER PRIMARY KEY,
104
+ MZData TEXT,
105
+ IntensityData TEXT,
106
+ PrecursorMass REAL,
107
+ PrecursorCharge INT,
108
+ SpectrumType INT,
109
+ SpectrumTitle TEXT
110
+ );
111
+ SQL
112
+
113
+ end
114
+
115
+ def insert_protein_group(group_node)
116
+ group_number=group_node.attributes['group_number']
117
+ group_prob=group_node.attributes['probability']
118
+ $outputdb.execute <<-SQL
119
+ INSERT INTO ProteinGroups(ID,Probability) VALUES(
120
+ #{group_number},#{group_prob}
121
+ );
122
+ SQL
123
+
124
+ proteins=group_node.find("./#{$protxml_ns_prefix}protein", $protxml_ns)
125
+
126
+ proteins.each do |protein|
127
+ insert_protein(protein,group_number)
128
+ end
129
+ end
130
+
131
+ def protein_dbid_from_name(protein_name)
132
+ protein_name #TODO: Allow user defined regex to parse this
133
+ end
134
+
135
+ def insert_protein(protein,group_id)
136
+
137
+ indis_proteins=protein.find("./#{$protxml_ns_prefix}indistinguishable_protein", $protxml_ns)
138
+ indis_proteins_summary=""
139
+ indis_proteins.each { |iprot| indis_proteins_summary<<"#{iprot.attributes['protein_name']};" }
140
+
141
+ annot_descr=protein.find("./#{$protxml_ns_prefix}annotation[@protein_description]", $protxml_ns)
142
+
143
+
144
+ protein_prob=protein.attributes['probability']
145
+ protein_name=protein.attributes['protein_name']
146
+
147
+ begin
148
+ protein_description=annot_descr[0].attributes['protein_description'].chomp.gsub("'","")
149
+ rescue
150
+ puts "No protein_description"
151
+ end
152
+ protein_coverage=protein.attributes['percent_coverage']
153
+ protein_npep = protein.attributes['total_number_peptides']
154
+ protein_indis = indis_proteins_summary
155
+
156
+ protein_coverage="NULL" unless protein_coverage
157
+ protein_indis="NULL" unless protein_indis
158
+ protein_description="NULL" unless protein_description
159
+
160
+ if $fasta_lookup
161
+ begin
162
+ entry=get_fasta_record(protein_name,$fasta_lookup)
163
+ protein_seq=entry.aaseq
164
+ rescue
165
+ puts "Warning: No entry found for #{protein_name}"
166
+ protein_seq="NULL"
167
+ end
168
+ end
169
+
170
+ begin
171
+ $outputdb.execute <<-SQL
172
+ INSERT INTO Proteins(ID,ProteinGroupID,Probability,Name,Description,Coverage,NumPeptides,Indistinguishables,Sequence)
173
+ VALUES(#{$protein_id},#{group_id},#{protein_prob},\'#{protein_name}\',\'#{protein_description}\',#{protein_coverage},
174
+ #{protein_npep},\'#{protein_indis}\','#{protein_seq}');
175
+ SQL
176
+ rescue
177
+ throw "Unable to insert #{protein_description}\n"
178
+ end
179
+ peptides=protein.find("./#{$protxml_ns_prefix}peptide",$protxml_ns)
180
+
181
+ peptides.each do |peptide|
182
+ insert_peptide(peptide,$protein_id,protein_seq)
183
+ end
184
+ $protein_id+=1
185
+ end
186
+
187
+ def insert_peptide(peptide,protein_id,protein_seq)
188
+ nsp_adjusted_probability=peptide.attributes['nsp_adjusted_probability']
189
+ sequence=peptide.attributes['peptide_sequence']
190
+
191
+ start_pos="NULL"
192
+ end_pos="NULL"
193
+ begin
194
+ if protein_seq!="NULL"
195
+ start_pos = protein_seq.index(sequence)
196
+ end_pos = start_pos+sequence.length
197
+ end
198
+ rescue
199
+ puts "Unable to locate peptide #{sequence} in protein #{protein_seq} for #{$protein_id}\n"
200
+ start_pos="NULL"
201
+ end_pos="NULL"
202
+ end
203
+ mod_info=peptide.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
204
+
205
+ throw "More than one modification_info object for a peptide" unless mod_info.length<=1
206
+ mod_seq=format_modified_peptide(mod_info)
207
+
208
+ $outputdb.execute <<-SQL
209
+ INSERT INTO Peptides(ID,ProteinID,Probability,Sequence,Start,End,ModifiedSequence)
210
+ VALUES(#{$peptide_id},#{protein_id},#{nsp_adjusted_probability},\'#{sequence}\',
211
+ #{start_pos},#{end_pos},\'#{mod_seq}\')
212
+ SQL
213
+ $peptide_id+=1
214
+
215
+ end
216
+
217
+ def format_modified_peptide(mod_info)
218
+ mod_seq="NULL"
219
+ if mod_info.length==1
220
+ mod_seq=mod_info[0].attributes['modified_peptide']
221
+ mod_seq.gsub!(/\[/,"\{")
222
+ mod_seq.gsub!(/\]/,"\}")
223
+ end
224
+ mod_seq
225
+ end
226
+
227
+ def insert_psms_from_file(filepath)
228
+ $pepxml_ns_prefix="xmlns:"
229
+ $pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"
230
+
231
+ pepxml_parser=XML::Parser.file("#{filepath}")
232
+ puts "Parsing #{filepath}"
233
+ pepxml_doc=pepxml_parser.parse
234
+ if not pepxml_doc.root.namespaces.default
235
+ $pepxml_ns_prefix=""
236
+ $pepxml_ns=nil
237
+ end
238
+
239
+ matched_spectra=[]
240
+
241
+ spectrum_queries=pepxml_doc.find("//#{$pepxml_ns_prefix}spectrum_query", $pepxml_ns)
242
+
243
+ spectrum_queries.each do |query|
244
+
245
+ spectrum_name = query.attributes['spectrum'].chomp.gsub("0","").sub(/\.\d+$/,"")
246
+
247
+ start_scan=query.attributes['start_scan'].to_i
248
+ end_scan=query.attributes['end_scan'].to_i
249
+ throw "Don't know how to deal with multi scan spectra" unless start_scan==end_scan
250
+
251
+ retention_time=query.attributes['retention_time_sec'].chomp.to_f
252
+ neutral_mass=query.attributes['precursor_neutral_mass'].to_f
253
+ assumed_charge=query.attributes['assumed_charge'].to_i
254
+
255
+
256
+ top_search_hit=query.find("./#{$pepxml_ns_prefix}search_result/#{$pepxml_ns_prefix}search_hit",$pepxml_ns)[0]
257
+ peptide=top_search_hit.attributes['peptide']
258
+
259
+ mod_info=top_search_hit.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
260
+
261
+ throw "More than one modification_info object for a peptide" unless mod_info.length<=1
262
+ modified_peptide=format_modified_peptide(mod_info)
263
+
264
+ calc_neutral_pep_mass=top_search_hit.attributes['calc_neutral_pep_mass'].to_f
265
+ massdiff = top_search_hit.attributes['massdiff'].to_f
266
+ prevaa = top_search_hit.attributes['peptide_prev_aa']
267
+ nextaa = top_search_hit.attributes['peptide_next_aa']
268
+
269
+ spectrum_name="NULL" unless spectrum_name
270
+ retention_time="NULL" unless retention_time
271
+ assumed_charge="NULL" unless assumed_charge
272
+ calc_neutral_pep_mass="NULL" unless calc_neutral_pep_mass
273
+ massdiff = "NULL" unless massdiff
274
+ prevaa = "NULL" unless prevaa
275
+ nextaa = "NULL" unless nextaa
276
+
277
+
278
+ $outputdb.execute <<-SQL
279
+ INSERT INTO PeptideSpectrumMatches(PeptideSequence,PeptideModifiedSequence,SpectrumID,ScanNum,RetentionTime,PrecursorNeutralMass,MassDeviation,PrevAA,NextAA)
280
+ VALUES('#{peptide}','#{modified_peptide}','#{spectrum_name}','#{start_scan}','#{retention_time.to_f}'\
281
+ ,'#{calc_neutral_pep_mass}','#{massdiff}','#{prevaa}','#{nextaa}')
282
+ SQL
283
+
284
+ matched_spectra<<{:name => spectrum_name, :scan_num => start_scan}
285
+
286
+ end
287
+
288
+ matched_spectra
289
+ end
290
+
291
+
292
+ def lookup_spectra_from_files(file_list,matched_spectra)
293
+
294
+ titles_to_match = matched_spectra.collect { |s| s[:name] }
295
+
296
+ # require 'debugger';debugger
297
+
298
+ queries_with_spectra=Array.new.replace(titles_to_match)
299
+
300
+ num_matched=0
301
+ total_spectra=0
302
+
303
+ file_list.each do |file|
304
+ mzml_parser = MzMLParser.new(file)
305
+
306
+ spec = mzml_parser.next_spectrum
307
+
308
+
309
+ while (spec) do
310
+ total_spectra+=1
311
+ if titles_to_match.include? spec[:title]
312
+ num_matched+=1
313
+ queries_with_spectra.delete(spec[:title])
314
+
315
+ $outputdb.execute <<-SQL
316
+ INSERT INTO Spectra(MZData,IntensityData,SpectrumTitle,PrecursorMass)
317
+ VALUES('#{spec[:mz]}','#{spec[:intensity]}','#{spec[:title]}','#{spec[:precursormz]}')
318
+ SQL
319
+
320
+ else
321
+
322
+ end
323
+ spec = mzml_parser.next_spectrum
324
+ end
325
+
326
+ end
327
+ puts "Found #{num_matched} matching spectra"
328
+ puts "Total in spectrum files #{total_spectra}"
329
+ puts "Total psms #{titles_to_match.length}"
330
+ puts "Unmatched psms #{queries_with_spectra.length}"
331
+
332
+
333
+
334
+ end
335
+
336
+ # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
337
+ #
338
+ tool=Tool.new([:explicit_output,:over_write])
339
+ tool.option_parser.banner = "Convert a protXML file to a sqlite database.\n\nUsage: protxml_to_psql.rb [options] file1.protXML"
340
+
341
+ tool.add_value_option(:database,nil,['-d','--database path','A Fasta file where full protein sequences can be looked up'])
342
+
343
+ # require 'debugger';debugger
344
+
345
+ exit unless tool.check_options(true,[:explicit_output])
346
+
347
+ input_file=ARGV.shift
348
+
349
+
350
+ if File.exists? tool.explicit_output
351
+ throw "Cant overwrite existing db #{tool.explicit_output}" unless tool.over_write
352
+ File.delete(tool.explicit_output)
353
+ end
354
+
355
+ $fasta_lookup=nil
356
+ if tool.database
357
+ $fasta_lookup=prepare_fasta(tool.database,'prot')
358
+ end
359
+
360
+ $outputdb = SQLite3::Database.new tool.explicit_output
361
+
362
+ initialize_db
363
+
364
+ XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
365
+
366
+ protxml_parser=XML::Parser.file("#{input_file}")
367
+
368
+ $protxml_ns_prefix="xmlns:"
369
+ $protxml_ns="xmlns:http://regis-web.systemsbiology.net/protXML"
370
+
371
+
372
+ protxml_doc=protxml_parser.parse
373
+ if not protxml_doc.root.namespaces.default
374
+ $protxml_ns_prefix=""
375
+ $protxml_ns=nil
376
+ end
377
+
378
+ $protein_id=0
379
+ $peptide_id=0
380
+
381
+ headers_with_inputs=protxml_doc.find("//#{$protxml_ns_prefix}protein_summary_header[@source_files]",$protxml_ns)
382
+
383
+ matched_spectra=[]
384
+
385
+ headers_with_inputs.each do |header|
386
+ pepxml_files = header.attributes['source_files'].split(",")
387
+ pepxml_files.each do |pepxml_file|
388
+ matched_spectra.concat insert_psms_from_file(pepxml_file)
389
+ end
390
+ end
391
+
392
+ lookup_spectra_from_files(ARGV.collect { |file| file.chomp },matched_spectra)
393
+
394
+ protein_groups=protxml_doc.find("//#{$protxml_ns_prefix}protein_group", $protxml_ns)
395
+
396
+ protein_groups.each do |g|
397
+ insert_protein_group(g)
398
+ end
399
+
@@ -20,24 +20,40 @@
20
20
 
21
21
  #define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
22
22
  #define NOT_AMINO_ACIDS "BJOUXZ*"
23
- #define MAX_SEQUENCE_LENGTH 20000
24
- #define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
23
+ #define MAX_SEQUENCE_LENGTH 2000
24
+ #define MAX_LINE_LENGTH 200000 /* large enough to read in long header lines */
25
+
26
+ void RemoveSpaces(char* source)
27
+ {
28
+ char* i = source;
29
+ char* j = source;
30
+ while(*j != 0)
31
+ {
32
+ *i = *j++;
33
+ if(*i != ' ')
34
+ i++;
35
+ }
36
+ *i = 0;
37
+ }
25
38
 
26
39
 
27
40
  static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
28
- VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
29
-
30
- char *input_file = RSTRING_PTR(input_file_in);
41
+ VALUE db_length_in,VALUE output_file_in,VALUE prefix_string_in)
42
+ {
43
+
44
+ char *infile = StringValueCStr(input_file_in);
31
45
  long sequences_to_generate = NUM2INT(db_length_in);
32
- char * output_file = RSTRING_PTR(output_file_in);
46
+ char * outfile = StringValueCStr(output_file_in);
47
+ char *prefix_string = StringValueCStr(prefix_string_in);
33
48
 
34
49
  char line[MAX_LINE_LENGTH];
35
- char settings_line[60][70];
36
- char infile[255], outfile[255]; /* for reading input and writing output */
37
- char prefix_string[255];
50
+ // char settings_line[60][70];
51
+
38
52
  char *p,**index;
39
- char *sequence;
40
- char one_sequence[MAX_SEQUENCE_LENGTH],random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)],random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
53
+
54
+ char one_sequence[MAX_SEQUENCE_LENGTH];
55
+ char random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)];
56
+ char random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
41
57
  char *temp_sequence;
42
58
  int a;
43
59
  FILE *inp, *outp;
@@ -50,63 +66,57 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
50
66
  double x;
51
67
 
52
68
  /* scanning sequence database */
53
-
54
- strcpy(infile,input_file);
55
69
 
56
70
  if ((inp = fopen(infile, "r"))==NULL) {
57
71
  printf("error opening sequence database %s\n",infile);return -1;
58
72
  }
59
73
 
60
- printf("scanning sequence database \n%s\n",infile);
61
- fflush(stdout);
62
-
63
- i=0;n=0;k=0;
74
+ long total_sequence_len=0;
75
+ n=0;
64
76
 
65
77
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
66
- i++;
67
- if(line[0]=='>') {
68
- if (!(n%1000)) {
69
- printf(".");
70
- fflush(stdout);
71
- n++;
72
- }
73
- }
78
+ total_sequence_len+=strlen(line);
79
+
80
+ // printf("%ld\n",i);fflush(stdout);
81
+ if (line[0]=='>') { n++; }
74
82
  }
75
-
83
+
76
84
  n_sequences=n;
77
85
 
78
86
 
79
87
  /* reading sequence database */
80
88
 
81
89
  temp_sequence=(char*)calloc(sizeof(char),MAX_SEQUENCE_LENGTH);
82
- sequence=(char*)malloc(sizeof(char)*(i*80)); /* allocate enough memory for 80 characters per line in FASTA database */
90
+
91
+ char *sequence_block=(char*)malloc(sizeof(char)*(total_sequence_len+2));
92
+
83
93
  index=(char**)malloc(sizeof(char*)*n_sequences);
84
- index[0]=sequence; /* set first index pointer to beginning of first database sequence */
94
+ index[0]=sequence_block; /* set first index pointer to beginning of first database sequence */
85
95
 
86
96
  if ((inp = fopen(infile, "r"))==NULL) {
87
97
  printf("error opening sequence database %s\n",infile);
88
98
  return -1;
89
99
  }
90
100
 
91
- printf("done\nreading sequence database \n%s\n",infile);
92
- fflush(stdout);
93
-
94
101
  n=-1;
95
102
  strcpy(temp_sequence,"\0");
96
103
 
97
104
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
98
- {
99
- if (strcmp(line,"\n")==0) {
105
+ {
106
+ RemoveSpaces(line);
107
+
108
+ if (strcmp(line,"\n")==0) { // Skips blank lines
100
109
  continue;
101
110
  }
111
+
102
112
  if (line[0]=='>') {
103
113
  if (n>=0) {
104
- if (!(n%1000)&&n>0) {
105
- printf(".");fflush(stdout);
106
- }
114
+
107
115
  strcpy(index[n],temp_sequence);
108
- n++; index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
116
+ n++;
117
+ index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
109
118
  strcpy(temp_sequence,"\0");
119
+
110
120
  }
111
121
  else
112
122
  {
@@ -116,7 +126,9 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
116
126
  }
117
127
  else
118
128
  {
119
- if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) continue;
129
+ if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) {
130
+ continue;
131
+ }
120
132
  strncat(temp_sequence,line,strlen(line)-1);
121
133
  }
122
134
  }
@@ -127,16 +139,18 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
127
139
 
128
140
  n_sequences=n+1;
129
141
 
130
- printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
142
+ // printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
143
+
144
+ // measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
145
+
131
146
 
132
- measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
133
147
 
134
148
 
135
149
 
136
150
  /* generating Markov probabilities */
137
151
 
138
- printf("generating Markov probability matrix...");
139
- fflush(stdout);
152
+ // printf("generating Markov probability matrix...");
153
+ // fflush(stdout);
140
154
 
141
155
  srand(time(0)); /* replace with constant to re-generate identical random databases */
142
156
 
@@ -146,52 +160,53 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
146
160
  }
147
161
  }
148
162
  for(j=0;j<=20;j++) {
149
- measured_aa_freq[j]=0;generated_aa_freq[j]=0;
163
+ measured_aa_freq[j]=0;
164
+ generated_aa_freq[j]=0;
150
165
  }
151
166
 
167
+
152
168
  for(protein=0;protein<n_sequences;protein++)
153
169
  {
154
- if (!(protein%1000)) {
155
- printf(".");
156
- fflush(stdout);
170
+ if (protein<(n_sequences-1))
171
+ {
172
+ long len_one_seq = (index[protein+1]-index[protein])/sizeof(char);
173
+ if ( len_one_seq > MAX_SEQUENCE_LENGTH ){
174
+ printf("Seq is longer than max len \n");fflush(stdout);
175
+ len_one_seq=MAX_SEQUENCE_LENGTH;
176
+ }
177
+ strncpy(one_sequence,index[protein],len_one_seq);
178
+
179
+ one_sequence[len_one_seq]='\0'; // NULL terminate the string
180
+
181
+ } else {
182
+ strcpy(one_sequence,index[protein]);
157
183
  }
158
184
 
159
- if (protein<(n_sequences-1))
185
+ pl=strlen(one_sequence);
186
+ n=1;
187
+ one_index=0;
188
+
189
+ for(i=0;i<pl;i++)
160
190
  {
161
- strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
162
- one_sequence[(index[protein+1]-index[protein])/sizeof(char)]='\0';
163
- }
164
- else strcpy(one_sequence,index[protein]);
165
- pl=strlen(one_sequence);
166
- n=1;one_index=0;
167
-
168
- for(i=0;i<pl;i++)
169
- {
170
- if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
171
- {
172
- if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
191
+ if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
173
192
  {
174
- printf("Unknown amino acid %c",one_sequence[i]);
193
+ if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
194
+ {
195
+ printf("Unknown amino acid %c",one_sequence[i]);
196
+ } else {
197
+ a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
198
+ MP[a][i]++;
199
+ measured_aa_freq[a]++;
200
+ }
175
201
  } else {
176
- a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
177
- MP[a][i]++;
202
+ a=floor(20*(float)rand()/RAND_MAX);
203
+ MP[a][i]++;
178
204
  measured_aa_freq[a]++;
179
- }
180
- }
181
- else {
182
- a=floor(20*(float)rand()/RAND_MAX);
183
- MP[a][i]++;
184
- measured_aa_freq[a]++;
185
- } // replace B, X, Z etc. with random amino acid to preserve size distribution
186
- }
187
- MP[20][pl]++;
188
- measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
205
+ } // replace B, X, Z etc. with random amino acid to preserve size distribution
189
206
  }
190
-
191
- printf("done\n");
192
- fflush(stdout);
193
-
194
-
207
+ MP[20][pl]++;
208
+ measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
209
+ }
195
210
 
196
211
  for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
197
212
  row_sum[i]=0;
@@ -204,41 +219,38 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
204
219
  }
205
220
 
206
221
 
207
- /* generate random protein sequences through Markov chain */
208
-
209
- strcpy(outfile,output_file);
210
222
 
211
- if ((outp = fopen(outfile, "w"))==NULL) {
212
- printf("error opening output file %s\n",outfile);
213
- return -1;
214
- }
223
+ /* generate random protein sequences through Markov chain */
215
224
 
216
- printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
217
225
 
218
- strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
226
+ if ((outp = fopen(outfile, "w"))==NULL) {
227
+ printf("error opening output file %s\n",outfile);
228
+ return -1;
229
+ }
219
230
 
220
- for(protein=0;protein<sequences_to_generate;protein++)
221
- {
222
- if (!(protein%1000)) {
223
- printf(".");fflush(stdout);
224
- }
231
+ for(protein=0;protein<sequences_to_generate;protein++)
232
+ {
225
233
 
226
- i=0; j=0;
227
- while (1)
234
+ i=0; j=0;
235
+ while (1)
236
+ {
237
+ x=(double)row_sum[j]*((double)rand()/RAND_MAX);
238
+ partial_sum=MP[0][j]; i=1;
239
+
240
+ while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
241
+
242
+ if (j>=MAX_SEQUENCE_LENGTH) { i=21; }/* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
243
+
244
+ if (i<21)
228
245
  {
229
- x=(double)row_sum[j]*((double)rand()/RAND_MAX);
230
- partial_sum=MP[0][j]; i=1;
231
- while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
232
- if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
233
- if (i<21)
234
- {
235
- random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
236
- }
237
- else /* i==21, i.e. protein sequence terminated */
238
- {
239
- k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
240
- for(l=0;l<j;l++)
241
- {
246
+ random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
247
+ } else { /* i==21, i.e. protein sequence terminated */
248
+ k=0;
249
+ generated_aa_freq[20]++;
250
+ generated_pl_sum+=j;
251
+
252
+ for(l=0;l<j;l++)
253
+ {
242
254
  random_sequence_output[k]=random_sequence[l]; k++;
243
255
  if (!((k+1)%61))
244
256
  {
@@ -256,19 +268,13 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
256
268
 
257
269
  fclose(outp);
258
270
 
259
-
260
- /* freeing some memory... */
261
-
262
- free(index);
263
271
 
264
- printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
272
+ // printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
265
273
 
266
274
  k=0;l=0;
267
275
  for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
268
- // printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
269
- // for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
270
276
 
271
- printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
277
+ // printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
272
278
 
273
279
  return 0;
274
280
 
@@ -20,14 +20,13 @@ class GalaxyStager
20
20
  end
21
21
  end
22
22
 
23
- def replace_references(in_file, options = {})
24
- options = { :base_only => false }.merge(options)
25
- replacement = options[:base_only] ? @staged_base : @staged_path
23
+ def replace_references(in_file)
26
24
  GalaxyStager.replace_references(in_file, @original_path, replacement)
27
25
  end
28
26
 
29
- def restore_references(in_file)
30
- GalaxyStager.replace_references(in_file, @staged_path, @original_path)
27
+ def restore_references(in_file, options = {})
28
+ path = options[:base_only] ? @staged_path.gsub(/#{@extension}/,"") : @staged_path
29
+ GalaxyStager.replace_references(in_file, path, @original_path)
31
30
  end
32
31
 
33
32
  def self.replace_references(in_file, from_path, to_path)
@@ -14,7 +14,7 @@ class GalaxyUtil
14
14
 
15
15
  def self.stage_pepxml(input_pepxml_path)
16
16
  stager = GalaxyStager.new(input_pepxml_path, :extension => ".pep.xml")
17
- stager.staged_path
17
+ stager
18
18
  end
19
19
 
20
20
  def self.stage_protxml(input_protxml_path)
@@ -0,0 +1,67 @@
1
+ require 'libxml'
2
+
3
+ include LibXML
4
+
5
+ class MzMLParser < Object
6
+
7
+
8
+ def initialize(path)
9
+ @namespace=
10
+ @mzml_ns_prefix="xmlns:"
11
+ @mzml_ns="xmlns:http://psi.hupo.org/ms/mzml"
12
+
13
+ doc=XML::Document.file(path)
14
+ @file_reader=XML::Reader.document(doc)
15
+ end
16
+
17
+ def next_spectrum()
18
+
19
+ until @file_reader.name=="spectrum"
20
+ if !@file_reader.read()
21
+ return nil
22
+ end
23
+ end
24
+
25
+ this_spect=spectrum_as_hash(@file_reader.expand)
26
+
27
+ @file_reader.next_sibling
28
+
29
+ return this_spect
30
+ end
31
+
32
+ def spectrum_as_hash(spectrum)
33
+ index=spectrum.attributes['index']
34
+ sid = spectrum.attributes['id']
35
+ precursor_mz_param = spectrum.find(".//#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000744\"]",@mzml_ns)[0]
36
+ mslevel_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000511\"]",@mzml_ns)[0]
37
+
38
+ title_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000796\"]",@mzml_ns)[0]
39
+
40
+ # prec_mz = spectrum.find(".//#{@mz}")
41
+
42
+ precursor_mz_mz = precursor_mz_param.attributes['value'] if precursor_mz_param
43
+ mslevel = mslevel_param.attributes['value'] if mslevel_param
44
+ spectrum_title = title_param['value'] if title_param
45
+
46
+ data_arrays = spectrum.find("./#{@mzml_ns_prefix}binaryDataArrayList/#{@mzml_ns_prefix}binaryDataArray",@mzml_ns)
47
+
48
+ data={}
49
+ data_arrays.each do |arr|
50
+ the_data = arr.find("./#{@mzml_ns_prefix}binary",@mzml_ns)[0].content
51
+ mzaccession = arr.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000514\"]",@mzml_ns)
52
+ if ( mzaccession.length==1 )
53
+ data[:mz] = the_data
54
+ else
55
+ data[:intensity] = the_data
56
+ end
57
+ end
58
+ data[:title]=spectrum_title
59
+ data[:mzlevel]=mslevel
60
+ data[:index]=index
61
+ data[:precursormz]=precursor_mz_mz
62
+ data[:id]=sid
63
+
64
+ data
65
+ end
66
+
67
+ end
@@ -53,7 +53,7 @@ class Tool
53
53
  end
54
54
 
55
55
 
56
- def add_value_option(symbol,default_value,opts)
56
+ def add_value_option(symbol,default_value,opts)
57
57
  @options[symbol]=default_value
58
58
  @option_parser.on(*opts) do |val|
59
59
  @options[symbol]=val
@@ -108,6 +108,8 @@ class Tool
108
108
  add_value_option(:threads,1,['-n','--threads num','Number of processing threads to use. Set to 0 to autodetect an appropriate value'])
109
109
  end
110
110
 
111
+
112
+
111
113
  end
112
114
 
113
115
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0.pre3
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ira Cooke
@@ -152,6 +152,20 @@ dependencies:
152
152
  - - ~>
153
153
  - !ruby/object:Gem::Version
154
154
  version: '0'
155
+ - !ruby/object:Gem::Dependency
156
+ name: sqlite3
157
+ requirement: !ruby/object:Gem::Requirement
158
+ requirements:
159
+ - - ~>
160
+ - !ruby/object:Gem::Version
161
+ version: '0'
162
+ type: :runtime
163
+ prerelease: false
164
+ version_requirements: !ruby/object:Gem::Requirement
165
+ requirements:
166
+ - - ~>
167
+ - !ruby/object:Gem::Version
168
+ version: '0'
155
169
  description: A bunch of tools for proteomics
156
170
  email: iracooke@gmail.com
157
171
  executables:
@@ -195,6 +209,7 @@ files:
195
209
  - bin/protein_prophet.rb
196
210
  - bin/protk_setup.rb
197
211
  - bin/protxml_to_gff.rb
212
+ - bin/protxml_to_psql.rb
198
213
  - bin/protxml_to_table.rb
199
214
  - bin/repair_run_summary.rb
200
215
  - bin/sixframe.rb
@@ -237,6 +252,7 @@ files:
237
252
  - lib/protk/manage_db_rakefile.rake
238
253
  - lib/protk/manage_db_tool.rb
239
254
  - lib/protk/mascot_util.rb
255
+ - lib/protk/mzml_parser.rb
240
256
  - lib/protk/omssa_util.rb
241
257
  - lib/protk/openms_defaults.rb
242
258
  - lib/protk/pepxml.rb
@@ -266,9 +282,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
266
282
  version: '0'
267
283
  required_rubygems_version: !ruby/object:Gem::Requirement
268
284
  requirements:
269
- - - '>'
285
+ - - '>='
270
286
  - !ruby/object:Gem::Version
271
- version: 1.3.1
287
+ version: '0'
272
288
  requirements: []
273
289
  rubyforge_project:
274
290
  rubygems_version: 2.2.1