protk 1.3.0.pre3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 704282a21d38fd8d3a536fbbba9ab90eabd54355
4
- data.tar.gz: d325658a001939d222bfb7c942836df25ae9790b
3
+ metadata.gz: 5e8f8a571cb43ed61984a34b6e1fb51caf979593
4
+ data.tar.gz: b53857f75c1ff6ca850859c3985aee36533e437f
5
5
  SHA512:
6
- metadata.gz: f9902d7d48b5171470b073ab94089481f0a9125d0e65e4a33b600ed86cf622bafa774335a0c33520e955d49dd0f195ed7ae82fcab6ad18bf35a42dafe030aea3
7
- data.tar.gz: 253cabdf8bfcf009516cc2675ebe12bb9b43ec15515625eabd3701c0761be9f3fc78dcc58dc731c7f89f642f8c11cf1bd8889c5f68a565cda0d04de06fb273c3
6
+ metadata.gz: 9450fccc4a5ce59f064927d62fbc6a4342a1710c3b82707e0908dea52af7d0b50f215e64073bb067a506d204701acea11b6d28f302447494b8a30b1e7af2df2d
7
+ data.tar.gz: 1b8bc78fc09b4c81eee72fad169a6aee7145a16312c01bc95a5dd590f08cb98194b26115a166a759b9c52c7c67204a767747642e5e9331de4562d52f31eb1e11
@@ -36,7 +36,6 @@ input_file=ARGV[0]
36
36
  db_length=tool.db_length
37
37
  if ( db_length==0) #If no db length was specified use the number of entries in the input file
38
38
  db_length=Bio::FastaFormat.open(input_file).count
39
- puts "Found #{db_length} entries in input file"
40
39
  end
41
40
 
42
41
  output_file = tool.explicit_output if tool.explicit_output!=nil
@@ -65,6 +64,7 @@ end
65
64
 
66
65
  if ( tool.append )
67
66
  cmd ="awk 'FNR==1{print \"\"}1' #{input_file} #{decoys_tmp_file} > #{output_file};"
67
+ cmd << "sed -i.bak '/^$/d' #{output_file};"
68
68
  cmd << "rm #{decoys_tmp_file}"
69
69
  else
70
70
  cmd = "mv #{decoys_tmp_file} #{output_file}"
@@ -83,16 +83,17 @@ database_path=db_info.path
83
83
 
84
84
  # Database must have fasta extension
85
85
  if Pathname.new(database_path).extname.to_s.downcase != ".fasta"
86
- make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
86
+ File.symlink(database_path,"#{database_path}.fasta") unless File.exists?("#{database_path}.fasta")
87
+ # make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
87
88
  database_path="#{database_path}.fasta"
88
- db_info.path=database_path
89
+ database_path
89
90
  end
90
91
 
91
92
  # Database must be indexed
92
93
  unless FileTest.exists?("#{database_path}.canno")
93
- dbdir = Pathname.new(database_path).dirname.realpath.to_s
94
+ dbdir = Pathname.new(database_path).dirname.to_s
94
95
  tdavalue=search_tool.decoy_search ? 1 : 0;
95
- make_msgfdb_cmd << "cd #{dbdir}; java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
96
+ make_msgfdb_cmd << "java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
96
97
  end
97
98
 
98
99
 
@@ -214,7 +215,6 @@ ARGV.each do |filename|
214
215
  else
215
216
  cmd << "; mv #{mzid_output_path} #{output_path}"
216
217
  end
217
-
218
218
 
219
219
  # Up to here we've formulated the command. The rest is cleanup
220
220
  p "Running:#{cmd}"
@@ -12,8 +12,6 @@ require 'protk/command_runner'
12
12
  require 'protk/search_tool'
13
13
  require 'protk/galaxy_util'
14
14
 
15
- for_galaxy = GalaxyUtil.for_galaxy?
16
-
17
15
  # Setup specific command-line options for this tool. Other options are inherited from SearchTool
18
16
  #
19
17
  search_tool=SearchTool.new([
@@ -94,22 +92,6 @@ ARGV.each do |filename|
94
92
  #
95
93
  cmd << " -v #{search_tool.missed_cleavages}"
96
94
 
97
- # If this is for Galaxy and a data directory has been specified
98
- # look for a common unimod.xml file.
99
- if for_galaxy
100
- galaxy_index_dir = search_tool.galaxy_index_dir
101
- if galaxy_index_dir
102
- galaxy_mods = File.join(galaxy_index_dir, "mods.xml")
103
- if( FileTest.exists?(galaxy_mods) )
104
- cmd << " -mx #{galaxy_mods}"
105
- end
106
- galaxy_usermods = File.join(galaxy_index_dir, "usermods.xml")
107
- if( FileTest.exists?(galaxy_usermods) )
108
- cmd << " -mux #{galaxy_usermods}"
109
- end
110
- end
111
- end
112
-
113
95
  if ( search_tool.omx_output )
114
96
  cmd << " -ox #{search_tool.omx_output} "
115
97
  end
@@ -51,10 +51,11 @@ throw "When --output and -F options are set only one file at a time can be run"
51
51
  # Obtain a global environment object
52
52
  genv=Constants.new
53
53
 
54
-
54
+ input_stagers=[]
55
55
  inputs=ARGV.collect { |file_name| file_name.chomp}
56
56
  if for_galaxy
57
- inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
57
+ input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
58
+ inputs=input_stagers.collect { |sg| sg.staged_path }
58
59
  end
59
60
 
60
61
  # Interrogate all the input files to obtain the database and search engine from them
@@ -212,7 +213,13 @@ else
212
213
 
213
214
  cmd=generate_command(genv,prophet_tool,inputs,output_file_name,database,engine)
214
215
  run_peptide_prophet(genv,prophet_tool,cmd,output_file_name,engine)
215
-
216
+
216
217
  end
217
218
 
219
+ if (for_galaxy)
220
+ input_stagers.each do |sg|
221
+ sg.restore_references(output_file_name)
222
+ sg.restore_references(output_file_name,{:base_only => true})
223
+ end
224
+ end
218
225
 
@@ -40,7 +40,13 @@ exit unless prophet_tool.check_options(true)
40
40
  # Obtain a global environment object
41
41
  genv=Constants.new
42
42
 
43
- inputs = ARGV.collect {|file_name| file_name.chomp }
43
+ input_stagers=[]
44
+ inputs=ARGV.collect { |file_name| file_name.chomp}
45
+ if for_galaxy
46
+ input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
47
+ inputs=input_stagers.collect { |sg| sg.staged_path }
48
+ end
49
+
44
50
 
45
51
  if ( prophet_tool.explicit_output )
46
52
  output_file=prophet_tool.explicit_output
@@ -52,11 +58,6 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
52
58
 
53
59
  cmd="ProteinProphet "
54
60
 
55
- if for_galaxy
56
- inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
57
- end
58
-
59
-
60
61
  cmd << " #{inputs.join(" ")} #{output_file}"
61
62
 
62
63
  if ( prophet_tool.glyco )
@@ -71,11 +72,13 @@ else
71
72
  genv.log("Protein Prophet output file #{output_file} already exists. Run with -r option to replace",:warn)
72
73
  end
73
74
 
74
- # if for_galaxy
75
- # Restore references to peptide prophet xml so downstream tools like
76
- # libra can find it.
77
- # input_stager.restore_references("protein_prophet_results.prot.xml")
78
- # end
75
+
76
+ if (for_galaxy)
77
+ input_stagers.each do |sg|
78
+ sg.restore_references(output_file)
79
+ sg.restore_references(output_file,{:base_only => true})
80
+ end
81
+ end
79
82
 
80
83
 
81
84
 
@@ -0,0 +1,399 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 18/1/2011
5
+ #
6
+ # Convert a protein/peptide xml file to sqlite database
7
+ #
8
+ #
9
+
10
+ require 'libxml'
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/tool'
14
+ require 'protk/fastadb'
15
+ require 'sqlite3'
16
+ require 'protk/mzml_parser'
17
+
18
+ include LibXML
19
+
20
+ def prepare_fasta(database_path,type)
21
+
22
+ db_filename = nil
23
+ case
24
+ when Pathname.new(database_path).exist? # It's an explicitly named db
25
+ db_filename = Pathname.new(database_path).expand_path.to_s
26
+ else
27
+ db_filename=Constants.new.current_database_for_name(database_path)
28
+ end
29
+
30
+ db_indexfilename = "#{db_filename}.pin"
31
+
32
+ if File.exist?(db_indexfilename)
33
+ puts "Using existing indexed database"
34
+ orf_lookup = FastaDB.new(db_filename)
35
+ else
36
+ puts "Indexing database"
37
+ orf_lookup = FastaDB.create(db_filename,db_filename,type)
38
+ end
39
+ orf_lookup
40
+ end
41
+
42
+ def get_fasta_record(protein_name,fastadb)
43
+ # puts "Looking up #{protein_name}"
44
+ entry = fastadb.get_by_id protein_name
45
+ if ( entry == nil)
46
+ puts "Failed lookup for #{protein_name}"
47
+ raise KeyError
48
+ end
49
+ entry
50
+ end
51
+
52
+ def initialize_db()
53
+ result = $outputdb.execute <<-SQL
54
+ CREATE TABLE ProteinGroups (
55
+ ID INT,
56
+ Probability REAL
57
+ );
58
+ SQL
59
+
60
+ result = $outputdb.execute <<-SQL
61
+ CREATE TABLE Proteins (
62
+ ID INT,
63
+ ProteinGroupID INT,
64
+ Probability REAL,
65
+ Name TEXT,
66
+ Description TEXT,
67
+ Coverage REAL,
68
+ NumPeptides INT,
69
+ Indistinguishables TEXT,
70
+ Sequence TEXT
71
+ );
72
+ SQL
73
+
74
+ result = $outputdb.execute <<-SQL
75
+ CREATE TABLE Peptides (
76
+ ID INT,
77
+ ProteinID INT,
78
+ Probability REAL,
79
+ Sequence TEXT,
80
+ Start INT,
81
+ End INT,
82
+ ModifiedSequence TEXT
83
+ );
84
+ SQL
85
+
86
+ # This has the role of a join table for the Peptides <-> Spectra many to many relationship
87
+ result = $outputdb.execute <<-SQL
88
+ CREATE TABLE PeptideSpectrumMatches (
89
+ PeptideSequence TEXT,
90
+ PeptideModifiedSequence TEXT,
91
+ SpectrumID INT,
92
+ ScanNum INT,
93
+ RetentionTime REAL,
94
+ PrecursorNeutralMass REAL,
95
+ MassDeviation REAL,
96
+ PrevAA TEXT,
97
+ NextAA TEXT
98
+ );
99
+ SQL
100
+
101
+ result = $outputdb.execute <<-SQL
102
+ CREATE TABLE Spectra (
103
+ ID INTEGER PRIMARY KEY,
104
+ MZData TEXT,
105
+ IntensityData TEXT,
106
+ PrecursorMass REAL,
107
+ PrecursorCharge INT,
108
+ SpectrumType INT,
109
+ SpectrumTitle TEXT
110
+ );
111
+ SQL
112
+
113
+ end
114
+
115
+ def insert_protein_group(group_node)
116
+ group_number=group_node.attributes['group_number']
117
+ group_prob=group_node.attributes['probability']
118
+ $outputdb.execute <<-SQL
119
+ INSERT INTO ProteinGroups(ID,Probability) VALUES(
120
+ #{group_number},#{group_prob}
121
+ );
122
+ SQL
123
+
124
+ proteins=group_node.find("./#{$protxml_ns_prefix}protein", $protxml_ns)
125
+
126
+ proteins.each do |protein|
127
+ insert_protein(protein,group_number)
128
+ end
129
+ end
130
+
131
+ def protein_dbid_from_name(protein_name)
132
+ protein_name #TODO: Allow user defined regex to parse this
133
+ end
134
+
135
+ def insert_protein(protein,group_id)
136
+
137
+ indis_proteins=protein.find("./#{$protxml_ns_prefix}indistinguishable_protein", $protxml_ns)
138
+ indis_proteins_summary=""
139
+ indis_proteins.each { |iprot| indis_proteins_summary<<"#{iprot.attributes['protein_name']};" }
140
+
141
+ annot_descr=protein.find("./#{$protxml_ns_prefix}annotation[@protein_description]", $protxml_ns)
142
+
143
+
144
+ protein_prob=protein.attributes['probability']
145
+ protein_name=protein.attributes['protein_name']
146
+
147
+ begin
148
+ protein_description=annot_descr[0].attributes['protein_description'].chomp.gsub("'","")
149
+ rescue
150
+ puts "No protein_description"
151
+ end
152
+ protein_coverage=protein.attributes['percent_coverage']
153
+ protein_npep = protein.attributes['total_number_peptides']
154
+ protein_indis = indis_proteins_summary
155
+
156
+ protein_coverage="NULL" unless protein_coverage
157
+ protein_indis="NULL" unless protein_indis
158
+ protein_description="NULL" unless protein_description
159
+
160
+ if $fasta_lookup
161
+ begin
162
+ entry=get_fasta_record(protein_name,$fasta_lookup)
163
+ protein_seq=entry.aaseq
164
+ rescue
165
+ puts "Warning: No entry found for #{protein_name}"
166
+ protein_seq="NULL"
167
+ end
168
+ end
169
+
170
+ begin
171
+ $outputdb.execute <<-SQL
172
+ INSERT INTO Proteins(ID,ProteinGroupID,Probability,Name,Description,Coverage,NumPeptides,Indistinguishables,Sequence)
173
+ VALUES(#{$protein_id},#{group_id},#{protein_prob},\'#{protein_name}\',\'#{protein_description}\',#{protein_coverage},
174
+ #{protein_npep},\'#{protein_indis}\','#{protein_seq}');
175
+ SQL
176
+ rescue
177
+ throw "Unable to insert #{protein_description}\n"
178
+ end
179
+ peptides=protein.find("./#{$protxml_ns_prefix}peptide",$protxml_ns)
180
+
181
+ peptides.each do |peptide|
182
+ insert_peptide(peptide,$protein_id,protein_seq)
183
+ end
184
+ $protein_id+=1
185
+ end
186
+
187
+ def insert_peptide(peptide,protein_id,protein_seq)
188
+ nsp_adjusted_probability=peptide.attributes['nsp_adjusted_probability']
189
+ sequence=peptide.attributes['peptide_sequence']
190
+
191
+ start_pos="NULL"
192
+ end_pos="NULL"
193
+ begin
194
+ if protein_seq!="NULL"
195
+ start_pos = protein_seq.index(sequence)
196
+ end_pos = start_pos+sequence.length
197
+ end
198
+ rescue
199
+ puts "Unable to locate peptide #{sequence} in protein #{protein_seq} for #{$protein_id}\n"
200
+ start_pos="NULL"
201
+ end_pos="NULL"
202
+ end
203
+ mod_info=peptide.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
204
+
205
+ throw "More than one modification_info object for a peptide" unless mod_info.length<=1
206
+ mod_seq=format_modified_peptide(mod_info)
207
+
208
+ $outputdb.execute <<-SQL
209
+ INSERT INTO Peptides(ID,ProteinID,Probability,Sequence,Start,End,ModifiedSequence)
210
+ VALUES(#{$peptide_id},#{protein_id},#{nsp_adjusted_probability},\'#{sequence}\',
211
+ #{start_pos},#{end_pos},\'#{mod_seq}\')
212
+ SQL
213
+ $peptide_id+=1
214
+
215
+ end
216
+
217
+ def format_modified_peptide(mod_info)
218
+ mod_seq="NULL"
219
+ if mod_info.length==1
220
+ mod_seq=mod_info[0].attributes['modified_peptide']
221
+ mod_seq.gsub!(/\[/,"\{")
222
+ mod_seq.gsub!(/\]/,"\}")
223
+ end
224
+ mod_seq
225
+ end
226
+
227
+ def insert_psms_from_file(filepath)
228
+ $pepxml_ns_prefix="xmlns:"
229
+ $pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"
230
+
231
+ pepxml_parser=XML::Parser.file("#{filepath}")
232
+ puts "Parsing #{filepath}"
233
+ pepxml_doc=pepxml_parser.parse
234
+ if not pepxml_doc.root.namespaces.default
235
+ $pepxml_ns_prefix=""
236
+ $pepxml_ns=nil
237
+ end
238
+
239
+ matched_spectra=[]
240
+
241
+ spectrum_queries=pepxml_doc.find("//#{$pepxml_ns_prefix}spectrum_query", $pepxml_ns)
242
+
243
+ spectrum_queries.each do |query|
244
+
245
+ spectrum_name = query.attributes['spectrum'].chomp.gsub("0","").sub(/\.\d+$/,"")
246
+
247
+ start_scan=query.attributes['start_scan'].to_i
248
+ end_scan=query.attributes['end_scan'].to_i
249
+ throw "Don't know how to deal with multi scan spectra" unless start_scan==end_scan
250
+
251
+ retention_time=query.attributes['retention_time_sec'].chomp.to_f
252
+ neutral_mass=query.attributes['precursor_neutral_mass'].to_f
253
+ assumed_charge=query.attributes['assumed_charge'].to_i
254
+
255
+
256
+ top_search_hit=query.find("./#{$pepxml_ns_prefix}search_result/#{$pepxml_ns_prefix}search_hit",$pepxml_ns)[0]
257
+ peptide=top_search_hit.attributes['peptide']
258
+
259
+ mod_info=top_search_hit.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
260
+
261
+ throw "More than one modification_info object for a peptide" unless mod_info.length<=1
262
+ modified_peptide=format_modified_peptide(mod_info)
263
+
264
+ calc_neutral_pep_mass=top_search_hit.attributes['calc_neutral_pep_mass'].to_f
265
+ massdiff = top_search_hit.attributes['massdiff'].to_f
266
+ prevaa = top_search_hit.attributes['peptide_prev_aa']
267
+ nextaa = top_search_hit.attributes['peptide_next_aa']
268
+
269
+ spectrum_name="NULL" unless spectrum_name
270
+ retention_time="NULL" unless retention_time
271
+ assumed_charge="NULL" unless assumed_charge
272
+ calc_neutral_pep_mass="NULL" unless calc_neutral_pep_mass
273
+ massdiff = "NULL" unless massdiff
274
+ prevaa = "NULL" unless prevaa
275
+ nextaa = "NULL" unless nextaa
276
+
277
+
278
+ $outputdb.execute <<-SQL
279
+ INSERT INTO PeptideSpectrumMatches(PeptideSequence,PeptideModifiedSequence,SpectrumID,ScanNum,RetentionTime,PrecursorNeutralMass,MassDeviation,PrevAA,NextAA)
280
+ VALUES('#{peptide}','#{modified_peptide}','#{spectrum_name}','#{start_scan}','#{retention_time.to_f}'\
281
+ ,'#{calc_neutral_pep_mass}','#{massdiff}','#{prevaa}','#{nextaa}')
282
+ SQL
283
+
284
+ matched_spectra<<{:name => spectrum_name, :scan_num => start_scan}
285
+
286
+ end
287
+
288
+ matched_spectra
289
+ end
290
+
291
+
292
+ def lookup_spectra_from_files(file_list,matched_spectra)
293
+
294
+ titles_to_match = matched_spectra.collect { |s| s[:name] }
295
+
296
+ # require 'debugger';debugger
297
+
298
+ queries_with_spectra=Array.new.replace(titles_to_match)
299
+
300
+ num_matched=0
301
+ total_spectra=0
302
+
303
+ file_list.each do |file|
304
+ mzml_parser = MzMLParser.new(file)
305
+
306
+ spec = mzml_parser.next_spectrum
307
+
308
+
309
+ while (spec) do
310
+ total_spectra+=1
311
+ if titles_to_match.include? spec[:title]
312
+ num_matched+=1
313
+ queries_with_spectra.delete(spec[:title])
314
+
315
+ $outputdb.execute <<-SQL
316
+ INSERT INTO Spectra(MZData,IntensityData,SpectrumTitle,PrecursorMass)
317
+ VALUES('#{spec[:mz]}','#{spec[:intensity]}','#{spec[:title]}','#{spec[:precursormz]}')
318
+ SQL
319
+
320
+ else
321
+
322
+ end
323
+ spec = mzml_parser.next_spectrum
324
+ end
325
+
326
+ end
327
+ puts "Found #{num_matched} matching spectra"
328
+ puts "Total in spectrum files #{total_spectra}"
329
+ puts "Total psms #{titles_to_match.length}"
330
+ puts "Unmatched psms #{queries_with_spectra.length}"
331
+
332
+
333
+
334
+ end
335
+
336
+ # Set up specific command-line options for this tool. Other options are inherited from Tool
337
+ #
338
+ tool=Tool.new([:explicit_output,:over_write])
339
+ tool.option_parser.banner = "Convert a protXML file to a sqlite database.\n\nUsage: protxml_to_psql.rb [options] file1.protXML"
340
+
341
+ tool.add_value_option(:database,nil,['-d','--database path','A Fasta file where full protein sequences can be looked up'])
342
+
343
+ # require 'debugger';debugger
344
+
345
+ exit unless tool.check_options(true,[:explicit_output])
346
+
347
+ input_file=ARGV.shift
348
+
349
+
350
+ if File.exists? tool.explicit_output
351
+ throw "Cant overwrite existing db #{tool.explicit_output}" unless tool.over_write
352
+ File.delete(tool.explicit_output)
353
+ end
354
+
355
+ $fasta_lookup=nil
356
+ if tool.database
357
+ $fasta_lookup=prepare_fasta(tool.database,'prot')
358
+ end
359
+
360
+ $outputdb = SQLite3::Database.new tool.explicit_output
361
+
362
+ initialize_db
363
+
364
+ XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
365
+
366
+ protxml_parser=XML::Parser.file("#{input_file}")
367
+
368
+ $protxml_ns_prefix="xmlns:"
369
+ $protxml_ns="xmlns:http://regis-web.systemsbiology.net/protXML"
370
+
371
+
372
+ protxml_doc=protxml_parser.parse
373
+ if not protxml_doc.root.namespaces.default
374
+ $protxml_ns_prefix=""
375
+ $protxml_ns=nil
376
+ end
377
+
378
+ $protein_id=0
379
+ $peptide_id=0
380
+
381
+ headers_with_inputs=protxml_doc.find("//#{$protxml_ns_prefix}protein_summary_header[@source_files]",$protxml_ns)
382
+
383
+ matched_spectra=[]
384
+
385
+ headers_with_inputs.each do |header|
386
+ pepxml_files = header.attributes['source_files'].split(",")
387
+ pepxml_files.each do |pepxml_file|
388
+ matched_spectra.concat insert_psms_from_file(pepxml_file)
389
+ end
390
+ end
391
+
392
+ lookup_spectra_from_files(ARGV.collect { |file| file.chomp },matched_spectra)
393
+
394
+ protein_groups=protxml_doc.find("//#{$protxml_ns_prefix}protein_group", $protxml_ns)
395
+
396
+ protein_groups.each do |g|
397
+ insert_protein_group(g)
398
+ end
399
+
@@ -20,24 +20,40 @@
20
20
 
21
21
  #define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
22
22
  #define NOT_AMINO_ACIDS "BJOUXZ*"
23
- #define MAX_SEQUENCE_LENGTH 20000
24
- #define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
23
+ #define MAX_SEQUENCE_LENGTH 2000
24
+ #define MAX_LINE_LENGTH 200000 /* large enough to read in long header lines */
25
+
26
+ void RemoveSpaces(char* source)
27
+ {
28
+ char* i = source;
29
+ char* j = source;
30
+ while(*j != 0)
31
+ {
32
+ *i = *j++;
33
+ if(*i != ' ')
34
+ i++;
35
+ }
36
+ *i = 0;
37
+ }
25
38
 
26
39
 
27
40
  static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
28
- VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
29
-
30
- char *input_file = RSTRING_PTR(input_file_in);
41
+ VALUE db_length_in,VALUE output_file_in,VALUE prefix_string_in)
42
+ {
43
+
44
+ char *infile = StringValueCStr(input_file_in);
31
45
  long sequences_to_generate = NUM2INT(db_length_in);
32
- char * output_file = RSTRING_PTR(output_file_in);
46
+ char * outfile = StringValueCStr(output_file_in);
47
+ char *prefix_string = StringValueCStr(prefix_string_in);
33
48
 
34
49
  char line[MAX_LINE_LENGTH];
35
- char settings_line[60][70];
36
- char infile[255], outfile[255]; /* for reading input and writing output */
37
- char prefix_string[255];
50
+ // char settings_line[60][70];
51
+
38
52
  char *p,**index;
39
- char *sequence;
40
- char one_sequence[MAX_SEQUENCE_LENGTH],random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)],random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
53
+
54
+ char one_sequence[MAX_SEQUENCE_LENGTH];
55
+ char random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)];
56
+ char random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
41
57
  char *temp_sequence;
42
58
  int a;
43
59
  FILE *inp, *outp;
@@ -50,63 +66,57 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
50
66
  double x;
51
67
 
52
68
  /* scanning sequence database */
53
-
54
- strcpy(infile,input_file);
55
69
 
56
70
  if ((inp = fopen(infile, "r"))==NULL) {
57
71
  printf("error opening sequence database %s\n",infile);return -1;
58
72
  }
59
73
 
60
- printf("scanning sequence database \n%s\n",infile);
61
- fflush(stdout);
62
-
63
- i=0;n=0;k=0;
74
+ long total_sequence_len=0;
75
+ n=0;
64
76
 
65
77
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
66
- i++;
67
- if(line[0]=='>') {
68
- if (!(n%1000)) {
69
- printf(".");
70
- fflush(stdout);
71
- n++;
72
- }
73
- }
78
+ total_sequence_len+=strlen(line);
79
+
80
+ // printf("%ld\n",i);fflush(stdout);
81
+ if (line[0]=='>') { n++; }
74
82
  }
75
-
83
+
76
84
  n_sequences=n;
77
85
 
78
86
 
79
87
  /* reading sequence database */
80
88
 
81
89
  temp_sequence=(char*)calloc(sizeof(char),MAX_SEQUENCE_LENGTH);
82
- sequence=(char*)malloc(sizeof(char)*(i*80)); /* allocate enough memory for 80 characters per line in FASTA database */
90
+
91
+ char *sequence_block=(char*)malloc(sizeof(char)*(total_sequence_len+2));
92
+
83
93
  index=(char**)malloc(sizeof(char*)*n_sequences);
84
- index[0]=sequence; /* set first index pointer to beginning of first database sequence */
94
+ index[0]=sequence_block; /* set first index pointer to beginning of first database sequence */
85
95
 
86
96
  if ((inp = fopen(infile, "r"))==NULL) {
87
97
  printf("error opening sequence database %s\n",infile);
88
98
  return -1;
89
99
  }
90
100
 
91
- printf("done\nreading sequence database \n%s\n",infile);
92
- fflush(stdout);
93
-
94
101
  n=-1;
95
102
  strcpy(temp_sequence,"\0");
96
103
 
97
104
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
98
- {
99
- if (strcmp(line,"\n")==0) {
105
+ {
106
+ RemoveSpaces(line);
107
+
108
+ if (strcmp(line,"\n")==0) { // Skips blank lines
100
109
  continue;
101
110
  }
111
+
102
112
  if (line[0]=='>') {
103
113
  if (n>=0) {
104
- if (!(n%1000)&&n>0) {
105
- printf(".");fflush(stdout);
106
- }
114
+
107
115
  strcpy(index[n],temp_sequence);
108
- n++; index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
116
+ n++;
117
+ index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
109
118
  strcpy(temp_sequence,"\0");
119
+
110
120
  }
111
121
  else
112
122
  {
@@ -116,7 +126,9 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
116
126
  }
117
127
  else
118
128
  {
119
- if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) continue;
129
+ if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) {
130
+ continue;
131
+ }
120
132
  strncat(temp_sequence,line,strlen(line)-1);
121
133
  }
122
134
  }
@@ -127,16 +139,18 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
127
139
 
128
140
  n_sequences=n+1;
129
141
 
130
- printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
142
+ // printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
143
+
144
+ // measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
145
+
131
146
 
132
- measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
133
147
 
134
148
 
135
149
 
136
150
  /* generating Markov probabilities */
137
151
 
138
- printf("generating Markov probability matrix...");
139
- fflush(stdout);
152
+ // printf("generating Markov probability matrix...");
153
+ // fflush(stdout);
140
154
 
141
155
  srand(time(0)); /* replace with constant to re-generate identical random databases */
142
156
 
@@ -146,52 +160,53 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
146
160
  }
147
161
  }
148
162
  for(j=0;j<=20;j++) {
149
- measured_aa_freq[j]=0;generated_aa_freq[j]=0;
163
+ measured_aa_freq[j]=0;
164
+ generated_aa_freq[j]=0;
150
165
  }
151
166
 
167
+
152
168
  for(protein=0;protein<n_sequences;protein++)
153
169
  {
154
- if (!(protein%1000)) {
155
- printf(".");
156
- fflush(stdout);
170
+ if (protein<(n_sequences-1))
171
+ {
172
+ long len_one_seq = (index[protein+1]-index[protein])/sizeof(char);
173
+ if ( len_one_seq > MAX_SEQUENCE_LENGTH ){
174
+ printf("Seq is longer than max len \n");fflush(stdout);
175
+ len_one_seq=MAX_SEQUENCE_LENGTH;
176
+ }
177
+ strncpy(one_sequence,index[protein],len_one_seq);
178
+
179
+ one_sequence[len_one_seq]='\0'; // NULL terminate the string
180
+
181
+ } else {
182
+ strcpy(one_sequence,index[protein]);
157
183
  }
158
184
 
159
- if (protein<(n_sequences-1))
185
+ pl=strlen(one_sequence);
186
+ n=1;
187
+ one_index=0;
188
+
189
+ for(i=0;i<pl;i++)
160
190
  {
161
- strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
162
- one_sequence[(index[protein+1]-index[protein])/sizeof(char)]='\0';
163
- }
164
- else strcpy(one_sequence,index[protein]);
165
- pl=strlen(one_sequence);
166
- n=1;one_index=0;
167
-
168
- for(i=0;i<pl;i++)
169
- {
170
- if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
171
- {
172
- if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
191
+ if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
173
192
  {
174
- printf("Unknown amino acid %c",one_sequence[i]);
193
+ if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
194
+ {
195
+ printf("Unknown amino acid %c",one_sequence[i]);
196
+ } else {
197
+ a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
198
+ MP[a][i]++;
199
+ measured_aa_freq[a]++;
200
+ }
175
201
  } else {
176
- a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
177
- MP[a][i]++;
202
+ a=floor(20*(float)rand()/RAND_MAX);
203
+ MP[a][i]++;
178
204
  measured_aa_freq[a]++;
179
- }
180
- }
181
- else {
182
- a=floor(20*(float)rand()/RAND_MAX);
183
- MP[a][i]++;
184
- measured_aa_freq[a]++;
185
- } // replace B, X, Z etc. with random amino acid to preserve size distribution
186
- }
187
- MP[20][pl]++;
188
- measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
205
+ } // replace B, X, Z etc. with random amino acid to preserve size distribution
189
206
  }
190
-
191
- printf("done\n");
192
- fflush(stdout);
193
-
194
-
207
+ MP[20][pl]++;
208
+ measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
209
+ }
195
210
 
196
211
  for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
197
212
  row_sum[i]=0;
@@ -204,41 +219,38 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
204
219
  }
205
220
 
206
221
 
207
- /* generate random protein sequences through Markov chain */
208
-
209
- strcpy(outfile,output_file);
210
222
 
211
- if ((outp = fopen(outfile, "w"))==NULL) {
212
- printf("error opening output file %s\n",outfile);
213
- return -1;
214
- }
223
+ /* generate random protein sequences through Markov chain */
215
224
 
216
- printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
217
225
 
218
- strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
226
+ if ((outp = fopen(outfile, "w"))==NULL) {
227
+ printf("error opening output file %s\n",outfile);
228
+ return -1;
229
+ }
219
230
 
220
- for(protein=0;protein<sequences_to_generate;protein++)
221
- {
222
- if (!(protein%1000)) {
223
- printf(".");fflush(stdout);
224
- }
231
+ for(protein=0;protein<sequences_to_generate;protein++)
232
+ {
225
233
 
226
- i=0; j=0;
227
- while (1)
234
+ i=0; j=0;
235
+ while (1)
236
+ {
237
+ x=(double)row_sum[j]*((double)rand()/RAND_MAX);
238
+ partial_sum=MP[0][j]; i=1;
239
+
240
+ while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
241
+
242
+ if (j>=MAX_SEQUENCE_LENGTH) { i=21; }/* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
243
+
244
+ if (i<21)
228
245
  {
229
- x=(double)row_sum[j]*((double)rand()/RAND_MAX);
230
- partial_sum=MP[0][j]; i=1;
231
- while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
232
- if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
233
- if (i<21)
234
- {
235
- random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
236
- }
237
- else /* i==21, i.e. protein sequence terminated */
238
- {
239
- k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
240
- for(l=0;l<j;l++)
241
- {
246
+ random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
247
+ } else { /* i==21, i.e. protein sequence terminated */
248
+ k=0;
249
+ generated_aa_freq[20]++;
250
+ generated_pl_sum+=j;
251
+
252
+ for(l=0;l<j;l++)
253
+ {
242
254
  random_sequence_output[k]=random_sequence[l]; k++;
243
255
  if (!((k+1)%61))
244
256
  {
@@ -256,19 +268,13 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
256
268
 
257
269
  fclose(outp);
258
270
 
259
-
260
- /* freeing some memory... */
261
-
262
- free(index);
263
271
 
264
- printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
272
+ // printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
265
273
 
266
274
  k=0;l=0;
267
275
  for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
268
- // printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
269
- // for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
270
276
 
271
- printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
277
+ // printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
272
278
 
273
279
  return 0;
274
280
 
@@ -20,14 +20,13 @@ class GalaxyStager
20
20
  end
21
21
  end
22
22
 
23
- def replace_references(in_file, options = {})
24
- options = { :base_only => false }.merge(options)
25
- replacement = options[:base_only] ? @staged_base : @staged_path
23
+ def replace_references(in_file)
26
24
  GalaxyStager.replace_references(in_file, @original_path, replacement)
27
25
  end
28
26
 
29
- def restore_references(in_file)
30
- GalaxyStager.replace_references(in_file, @staged_path, @original_path)
27
+ def restore_references(in_file, options = {})
28
+ path = options[:base_only] ? @staged_path.gsub(/#{@extension}/,"") : @staged_path
29
+ GalaxyStager.replace_references(in_file, path, @original_path)
31
30
  end
32
31
 
33
32
  def self.replace_references(in_file, from_path, to_path)
@@ -14,7 +14,7 @@ class GalaxyUtil
14
14
 
15
15
  def self.stage_pepxml(input_pepxml_path)
16
16
  stager = GalaxyStager.new(input_pepxml_path, :extension => ".pep.xml")
17
- stager.staged_path
17
+ stager
18
18
  end
19
19
 
20
20
  def self.stage_protxml(input_protxml_path)
@@ -0,0 +1,67 @@
1
+ require 'libxml'
2
+
3
+ include LibXML
4
+
5
+ class MzMLParser < Object
6
+
7
+
8
+ def initialize(path)
9
+ @namespace=
10
+ @mzml_ns_prefix="xmlns:"
11
+ @mzml_ns="xmlns:http://psi.hupo.org/ms/mzml"
12
+
13
+ doc=XML::Document.file(path)
14
+ @file_reader=XML::Reader.document(doc)
15
+ end
16
+
17
+ def next_spectrum()
18
+
19
+ until @file_reader.name=="spectrum"
20
+ if !@file_reader.read()
21
+ return nil
22
+ end
23
+ end
24
+
25
+ this_spect=spectrum_as_hash(@file_reader.expand)
26
+
27
+ @file_reader.next_sibling
28
+
29
+ return this_spect
30
+ end
31
+
32
+ def spectrum_as_hash(spectrum)
33
+ index=spectrum.attributes['index']
34
+ sid = spectrum.attributes['id']
35
+ precursor_mz_param = spectrum.find(".//#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000744\"]",@mzml_ns)[0]
36
+ mslevel_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000511\"]",@mzml_ns)[0]
37
+
38
+ title_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000796\"]",@mzml_ns)[0]
39
+
40
+ # prec_mz = spectrum.find(".//#{@mz}")
41
+
42
+ precursor_mz_mz = precursor_mz_param.attributes['value'] if precursor_mz_param
43
+ mslevel = mslevel_param.attributes['value'] if mslevel_param
44
+ spectrum_title = title_param['value'] if title_param
45
+
46
+ data_arrays = spectrum.find("./#{@mzml_ns_prefix}binaryDataArrayList/#{@mzml_ns_prefix}binaryDataArray",@mzml_ns)
47
+
48
+ data={}
49
+ data_arrays.each do |arr|
50
+ the_data = arr.find("./#{@mzml_ns_prefix}binary",@mzml_ns)[0].content
51
+ mzaccession = arr.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000514\"]",@mzml_ns)
52
+ if ( mzaccession.length==1 )
53
+ data[:mz] = the_data
54
+ else
55
+ data[:intensity] = the_data
56
+ end
57
+ end
58
+ data[:title]=spectrum_title
59
+ data[:mzlevel]=mslevel
60
+ data[:index]=index
61
+ data[:precursormz]=precursor_mz_mz
62
+ data[:id]=sid
63
+
64
+ data
65
+ end
66
+
67
+ end
@@ -53,7 +53,7 @@ class Tool
53
53
  end
54
54
 
55
55
 
56
- def add_value_option(symbol,default_value,opts)
56
+ def add_value_option(symbol,default_value,opts)
57
57
  @options[symbol]=default_value
58
58
  @option_parser.on(*opts) do |val|
59
59
  @options[symbol]=val
@@ -108,6 +108,8 @@ class Tool
108
108
  add_value_option(:threads,1,['-n','--threads num','Number of processing threads to use. Set to 0 to autodetect an appropriate value'])
109
109
  end
110
110
 
111
+
112
+
111
113
  end
112
114
 
113
115
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0.pre3
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ira Cooke
@@ -152,6 +152,20 @@ dependencies:
152
152
  - - ~>
153
153
  - !ruby/object:Gem::Version
154
154
  version: '0'
155
+ - !ruby/object:Gem::Dependency
156
+ name: sqlite3
157
+ requirement: !ruby/object:Gem::Requirement
158
+ requirements:
159
+ - - ~>
160
+ - !ruby/object:Gem::Version
161
+ version: '0'
162
+ type: :runtime
163
+ prerelease: false
164
+ version_requirements: !ruby/object:Gem::Requirement
165
+ requirements:
166
+ - - ~>
167
+ - !ruby/object:Gem::Version
168
+ version: '0'
155
169
  description: A bunch of tools for proteomics
156
170
  email: iracooke@gmail.com
157
171
  executables:
@@ -195,6 +209,7 @@ files:
195
209
  - bin/protein_prophet.rb
196
210
  - bin/protk_setup.rb
197
211
  - bin/protxml_to_gff.rb
212
+ - bin/protxml_to_psql.rb
198
213
  - bin/protxml_to_table.rb
199
214
  - bin/repair_run_summary.rb
200
215
  - bin/sixframe.rb
@@ -237,6 +252,7 @@ files:
237
252
  - lib/protk/manage_db_rakefile.rake
238
253
  - lib/protk/manage_db_tool.rb
239
254
  - lib/protk/mascot_util.rb
255
+ - lib/protk/mzml_parser.rb
240
256
  - lib/protk/omssa_util.rb
241
257
  - lib/protk/openms_defaults.rb
242
258
  - lib/protk/pepxml.rb
@@ -266,9 +282,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
266
282
  version: '0'
267
283
  required_rubygems_version: !ruby/object:Gem::Requirement
268
284
  requirements:
269
- - - '>'
285
+ - - '>='
270
286
  - !ruby/object:Gem::Version
271
- version: 1.3.1
287
+ version: '0'
272
288
  requirements: []
273
289
  rubyforge_project:
274
290
  rubygems_version: 2.2.1