protk 1.3.0.pre3 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/make_decoy.rb +1 -1
- data/bin/msgfplus_search.rb +5 -5
- data/bin/omssa_search.rb +0 -18
- data/bin/peptide_prophet.rb +10 -3
- data/bin/protein_prophet.rb +14 -11
- data/bin/protxml_to_psql.rb +399 -0
- data/ext/decoymaker/decoymaker.c +120 -114
- data/lib/protk/galaxy_stager.rb +4 -5
- data/lib/protk/galaxy_util.rb +1 -1
- data/lib/protk/mzml_parser.rb +67 -0
- data/lib/protk/tool.rb +3 -1
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5e8f8a571cb43ed61984a34b6e1fb51caf979593
|
4
|
+
data.tar.gz: b53857f75c1ff6ca850859c3985aee36533e437f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9450fccc4a5ce59f064927d62fbc6a4342a1710c3b82707e0908dea52af7d0b50f215e64073bb067a506d204701acea11b6d28f302447494b8a30b1e7af2df2d
|
7
|
+
data.tar.gz: 1b8bc78fc09b4c81eee72fad169a6aee7145a16312c01bc95a5dd590f08cb98194b26115a166a759b9c52c7c67204a767747642e5e9331de4562d52f31eb1e11
|
data/bin/make_decoy.rb
CHANGED
@@ -36,7 +36,6 @@ input_file=ARGV[0]
|
|
36
36
|
db_length=tool.db_length
|
37
37
|
if ( db_length==0) #If no db length was specified use the number of entries in the input file
|
38
38
|
db_length=Bio::FastaFormat.open(input_file).count
|
39
|
-
puts "Found #{db_length} entries in input file"
|
40
39
|
end
|
41
40
|
|
42
41
|
output_file = tool.explicit_output if tool.explicit_output!=nil
|
@@ -65,6 +64,7 @@ end
|
|
65
64
|
|
66
65
|
if ( tool.append )
|
67
66
|
cmd ="awk 'FNR==1{print \"\"}1' #{input_file} #{decoys_tmp_file} > #{output_file};"
|
67
|
+
cmd << "sed -i.bak '/^$/d' #{output_file};"
|
68
68
|
cmd << "rm #{decoys_tmp_file}"
|
69
69
|
else
|
70
70
|
cmd = "mv #{decoys_tmp_file} #{output_file}"
|
data/bin/msgfplus_search.rb
CHANGED
@@ -83,16 +83,17 @@ database_path=db_info.path
|
|
83
83
|
|
84
84
|
# Database must have fasta extension
|
85
85
|
if Pathname.new(database_path).extname.to_s.downcase != ".fasta"
|
86
|
-
|
86
|
+
File.symlink(database_path,"#{database_path}.fasta") unless File.exists?("#{database_path}.fasta")
|
87
|
+
# make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
|
87
88
|
database_path="#{database_path}.fasta"
|
88
|
-
|
89
|
+
database_path
|
89
90
|
end
|
90
91
|
|
91
92
|
# Database must be indexed
|
92
93
|
unless FileTest.exists?("#{database_path}.canno")
|
93
|
-
dbdir = Pathname.new(database_path).dirname.
|
94
|
+
dbdir = Pathname.new(database_path).dirname.to_s
|
94
95
|
tdavalue=search_tool.decoy_search ? 1 : 0;
|
95
|
-
make_msgfdb_cmd << "
|
96
|
+
make_msgfdb_cmd << "java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
|
96
97
|
end
|
97
98
|
|
98
99
|
|
@@ -214,7 +215,6 @@ ARGV.each do |filename|
|
|
214
215
|
else
|
215
216
|
cmd << "; mv #{mzid_output_path} #{output_path}"
|
216
217
|
end
|
217
|
-
|
218
218
|
|
219
219
|
# Up to here we've formulated the command. The rest is cleanup
|
220
220
|
p "Running:#{cmd}"
|
data/bin/omssa_search.rb
CHANGED
@@ -12,8 +12,6 @@ require 'protk/command_runner'
|
|
12
12
|
require 'protk/search_tool'
|
13
13
|
require 'protk/galaxy_util'
|
14
14
|
|
15
|
-
for_galaxy = GalaxyUtil.for_galaxy?
|
16
|
-
|
17
15
|
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
18
16
|
#
|
19
17
|
search_tool=SearchTool.new([
|
@@ -94,22 +92,6 @@ ARGV.each do |filename|
|
|
94
92
|
#
|
95
93
|
cmd << " -v #{search_tool.missed_cleavages}"
|
96
94
|
|
97
|
-
# If this is for Galaxy and a data directory has been specified
|
98
|
-
# look for a common unimod.xml file.
|
99
|
-
if for_galaxy
|
100
|
-
galaxy_index_dir = search_tool.galaxy_index_dir
|
101
|
-
if galaxy_index_dir
|
102
|
-
galaxy_mods = File.join(galaxy_index_dir, "mods.xml")
|
103
|
-
if( FileTest.exists?(galaxy_mods) )
|
104
|
-
cmd << " -mx #{galaxy_mods}"
|
105
|
-
end
|
106
|
-
galaxy_usermods = File.join(galaxy_index_dir, "usermods.xml")
|
107
|
-
if( FileTest.exists?(galaxy_usermods) )
|
108
|
-
cmd << " -mux #{galaxy_usermods}"
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
95
|
if ( search_tool.omx_output )
|
114
96
|
cmd << " -ox #{search_tool.omx_output} "
|
115
97
|
end
|
data/bin/peptide_prophet.rb
CHANGED
@@ -51,10 +51,11 @@ throw "When --output and -F options are set only one file at a time can be run"
|
|
51
51
|
# Obtain a global environment object
|
52
52
|
genv=Constants.new
|
53
53
|
|
54
|
-
|
54
|
+
input_stagers=[]
|
55
55
|
inputs=ARGV.collect { |file_name| file_name.chomp}
|
56
56
|
if for_galaxy
|
57
|
-
|
57
|
+
input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
|
58
|
+
inputs=input_stagers.collect { |sg| sg.staged_path }
|
58
59
|
end
|
59
60
|
|
60
61
|
# Interrogate all the input files to obtain the database and search engine from them
|
@@ -212,7 +213,13 @@ else
|
|
212
213
|
|
213
214
|
cmd=generate_command(genv,prophet_tool,inputs,output_file_name,database,engine)
|
214
215
|
run_peptide_prophet(genv,prophet_tool,cmd,output_file_name,engine)
|
215
|
-
|
216
|
+
|
216
217
|
end
|
217
218
|
|
219
|
+
if (for_galaxy)
|
220
|
+
input_stagers.each do |sg|
|
221
|
+
sg.restore_references(output_file_name)
|
222
|
+
sg.restore_references(output_file_name,{:base_only => true})
|
223
|
+
end
|
224
|
+
end
|
218
225
|
|
data/bin/protein_prophet.rb
CHANGED
@@ -40,7 +40,13 @@ exit unless prophet_tool.check_options(true)
|
|
40
40
|
# Obtain a global environment object
|
41
41
|
genv=Constants.new
|
42
42
|
|
43
|
-
|
43
|
+
input_stagers=[]
|
44
|
+
inputs=ARGV.collect { |file_name| file_name.chomp}
|
45
|
+
if for_galaxy
|
46
|
+
input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
|
47
|
+
inputs=input_stagers.collect { |sg| sg.staged_path }
|
48
|
+
end
|
49
|
+
|
44
50
|
|
45
51
|
if ( prophet_tool.explicit_output )
|
46
52
|
output_file=prophet_tool.explicit_output
|
@@ -52,11 +58,6 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
|
|
52
58
|
|
53
59
|
cmd="ProteinProphet "
|
54
60
|
|
55
|
-
if for_galaxy
|
56
|
-
inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
|
57
|
-
end
|
58
|
-
|
59
|
-
|
60
61
|
cmd << " #{inputs.join(" ")} #{output_file}"
|
61
62
|
|
62
63
|
if ( prophet_tool.glyco )
|
@@ -71,11 +72,13 @@ else
|
|
71
72
|
genv.log("Protein Prophet output file #{output_file} already exists. Run with -r option to replace",:warn)
|
72
73
|
end
|
73
74
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
75
|
+
|
76
|
+
if (for_galaxy)
|
77
|
+
input_stagers.each do |sg|
|
78
|
+
sg.restore_references(output_file)
|
79
|
+
sg.restore_references(output_file,{:base_only => true})
|
80
|
+
end
|
81
|
+
end
|
79
82
|
|
80
83
|
|
81
84
|
|
@@ -0,0 +1,399 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 18/1/2011
|
5
|
+
#
|
6
|
+
# Convert a protein/peptide xml file to sqlite database
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'libxml'
|
11
|
+
require 'protk/constants'
|
12
|
+
require 'protk/command_runner'
|
13
|
+
require 'protk/tool'
|
14
|
+
require 'protk/fastadb'
|
15
|
+
require 'sqlite3'
|
16
|
+
require 'protk/mzml_parser'
|
17
|
+
|
18
|
+
include LibXML
|
19
|
+
|
20
|
+
def prepare_fasta(database_path,type)
|
21
|
+
|
22
|
+
db_filename = nil
|
23
|
+
case
|
24
|
+
when Pathname.new(database_path).exist? # It's an explicitly named db
|
25
|
+
db_filename = Pathname.new(database_path).expand_path.to_s
|
26
|
+
else
|
27
|
+
db_filename=Constants.new.current_database_for_name(database_path)
|
28
|
+
end
|
29
|
+
|
30
|
+
db_indexfilename = "#{db_filename}.pin"
|
31
|
+
|
32
|
+
if File.exist?(db_indexfilename)
|
33
|
+
puts "Using existing indexed database"
|
34
|
+
orf_lookup = FastaDB.new(db_filename)
|
35
|
+
else
|
36
|
+
puts "Indexing database"
|
37
|
+
orf_lookup = FastaDB.create(db_filename,db_filename,type)
|
38
|
+
end
|
39
|
+
orf_lookup
|
40
|
+
end
|
41
|
+
|
42
|
+
def get_fasta_record(protein_name,fastadb)
|
43
|
+
# puts "Looking up #{protein_name}"
|
44
|
+
entry = fastadb.get_by_id protein_name
|
45
|
+
if ( entry == nil)
|
46
|
+
puts "Failed lookup for #{protein_name}"
|
47
|
+
raise KeyError
|
48
|
+
end
|
49
|
+
entry
|
50
|
+
end
|
51
|
+
|
52
|
+
def initialize_db()
|
53
|
+
result = $outputdb.execute <<-SQL
|
54
|
+
CREATE TABLE ProteinGroups (
|
55
|
+
ID INT,
|
56
|
+
Probability REAL
|
57
|
+
);
|
58
|
+
SQL
|
59
|
+
|
60
|
+
result = $outputdb.execute <<-SQL
|
61
|
+
CREATE TABLE Proteins (
|
62
|
+
ID INT,
|
63
|
+
ProteinGroupID INT,
|
64
|
+
Probability REAL,
|
65
|
+
Name TEXT,
|
66
|
+
Description TEXT,
|
67
|
+
Coverage REAL,
|
68
|
+
NumPeptides INT,
|
69
|
+
Indistinguishables TEXT,
|
70
|
+
Sequence TEXT
|
71
|
+
);
|
72
|
+
SQL
|
73
|
+
|
74
|
+
result = $outputdb.execute <<-SQL
|
75
|
+
CREATE TABLE Peptides (
|
76
|
+
ID INT,
|
77
|
+
ProteinID INT,
|
78
|
+
Probability REAL,
|
79
|
+
Sequence TEXT,
|
80
|
+
Start INT,
|
81
|
+
End INT,
|
82
|
+
ModifiedSequence TEXT
|
83
|
+
);
|
84
|
+
SQL
|
85
|
+
|
86
|
+
# This has the role of a join table for the Peptides <-> Spectra many to many relationship
|
87
|
+
result = $outputdb.execute <<-SQL
|
88
|
+
CREATE TABLE PeptideSpectrumMatches (
|
89
|
+
PeptideSequence TEXT,
|
90
|
+
PeptideModifiedSequence TEXT,
|
91
|
+
SpectrumID INT,
|
92
|
+
ScanNum INT,
|
93
|
+
RetentionTime REAL,
|
94
|
+
PrecursorNeutralMass REAL,
|
95
|
+
MassDeviation REAL,
|
96
|
+
PrevAA TEXT,
|
97
|
+
NextAA TEXT
|
98
|
+
);
|
99
|
+
SQL
|
100
|
+
|
101
|
+
result = $outputdb.execute <<-SQL
|
102
|
+
CREATE TABLE Spectra (
|
103
|
+
ID INTEGER PRIMARY KEY,
|
104
|
+
MZData TEXT,
|
105
|
+
IntensityData TEXT,
|
106
|
+
PrecursorMass REAL,
|
107
|
+
PrecursorCharge INT,
|
108
|
+
SpectrumType INT,
|
109
|
+
SpectrumTitle TEXT
|
110
|
+
);
|
111
|
+
SQL
|
112
|
+
|
113
|
+
end
|
114
|
+
|
115
|
+
def insert_protein_group(group_node)
|
116
|
+
group_number=group_node.attributes['group_number']
|
117
|
+
group_prob=group_node.attributes['probability']
|
118
|
+
$outputdb.execute <<-SQL
|
119
|
+
INSERT INTO ProteinGroups(ID,Probability) VALUES(
|
120
|
+
#{group_number},#{group_prob}
|
121
|
+
);
|
122
|
+
SQL
|
123
|
+
|
124
|
+
proteins=group_node.find("./#{$protxml_ns_prefix}protein", $protxml_ns)
|
125
|
+
|
126
|
+
proteins.each do |protein|
|
127
|
+
insert_protein(protein,group_number)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
def protein_dbid_from_name(protein_name)
|
132
|
+
protein_name #TODO: Allow user defined regex to parse this
|
133
|
+
end
|
134
|
+
|
135
|
+
def insert_protein(protein,group_id)
|
136
|
+
|
137
|
+
indis_proteins=protein.find("./#{$protxml_ns_prefix}indistinguishable_protein", $protxml_ns)
|
138
|
+
indis_proteins_summary=""
|
139
|
+
indis_proteins.each { |iprot| indis_proteins_summary<<"#{iprot.attributes['protein_name']};" }
|
140
|
+
|
141
|
+
annot_descr=protein.find("./#{$protxml_ns_prefix}annotation[@protein_description]", $protxml_ns)
|
142
|
+
|
143
|
+
|
144
|
+
protein_prob=protein.attributes['probability']
|
145
|
+
protein_name=protein.attributes['protein_name']
|
146
|
+
|
147
|
+
begin
|
148
|
+
protein_description=annot_descr[0].attributes['protein_description'].chomp.gsub("'","")
|
149
|
+
rescue
|
150
|
+
puts "No protein_description"
|
151
|
+
end
|
152
|
+
protein_coverage=protein.attributes['percent_coverage']
|
153
|
+
protein_npep = protein.attributes['total_number_peptides']
|
154
|
+
protein_indis = indis_proteins_summary
|
155
|
+
|
156
|
+
protein_coverage="NULL" unless protein_coverage
|
157
|
+
protein_indis="NULL" unless protein_indis
|
158
|
+
protein_description="NULL" unless protein_description
|
159
|
+
|
160
|
+
if $fasta_lookup
|
161
|
+
begin
|
162
|
+
entry=get_fasta_record(protein_name,$fasta_lookup)
|
163
|
+
protein_seq=entry.aaseq
|
164
|
+
rescue
|
165
|
+
puts "Warning: No entry found for #{protein_name}"
|
166
|
+
protein_seq="NULL"
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
begin
|
171
|
+
$outputdb.execute <<-SQL
|
172
|
+
INSERT INTO Proteins(ID,ProteinGroupID,Probability,Name,Description,Coverage,NumPeptides,Indistinguishables,Sequence)
|
173
|
+
VALUES(#{$protein_id},#{group_id},#{protein_prob},\'#{protein_name}\',\'#{protein_description}\',#{protein_coverage},
|
174
|
+
#{protein_npep},\'#{protein_indis}\','#{protein_seq}');
|
175
|
+
SQL
|
176
|
+
rescue
|
177
|
+
throw "Unable to insert #{protein_description}\n"
|
178
|
+
end
|
179
|
+
peptides=protein.find("./#{$protxml_ns_prefix}peptide",$protxml_ns)
|
180
|
+
|
181
|
+
peptides.each do |peptide|
|
182
|
+
insert_peptide(peptide,$protein_id,protein_seq)
|
183
|
+
end
|
184
|
+
$protein_id+=1
|
185
|
+
end
|
186
|
+
|
187
|
+
def insert_peptide(peptide,protein_id,protein_seq)
|
188
|
+
nsp_adjusted_probability=peptide.attributes['nsp_adjusted_probability']
|
189
|
+
sequence=peptide.attributes['peptide_sequence']
|
190
|
+
|
191
|
+
start_pos="NULL"
|
192
|
+
end_pos="NULL"
|
193
|
+
begin
|
194
|
+
if protein_seq!="NULL"
|
195
|
+
start_pos = protein_seq.index(sequence)
|
196
|
+
end_pos = start_pos+sequence.length
|
197
|
+
end
|
198
|
+
rescue
|
199
|
+
puts "Unable to locate peptide #{sequence} in protein #{protein_seq} for #{$protein_id}\n"
|
200
|
+
start_pos="NULL"
|
201
|
+
end_pos="NULL"
|
202
|
+
end
|
203
|
+
mod_info=peptide.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
|
204
|
+
|
205
|
+
throw "More than one modification_info object for a peptide" unless mod_info.length<=1
|
206
|
+
mod_seq=format_modified_peptide(mod_info)
|
207
|
+
|
208
|
+
$outputdb.execute <<-SQL
|
209
|
+
INSERT INTO Peptides(ID,ProteinID,Probability,Sequence,Start,End,ModifiedSequence)
|
210
|
+
VALUES(#{$peptide_id},#{protein_id},#{nsp_adjusted_probability},\'#{sequence}\',
|
211
|
+
#{start_pos},#{end_pos},\'#{mod_seq}\')
|
212
|
+
SQL
|
213
|
+
$peptide_id+=1
|
214
|
+
|
215
|
+
end
|
216
|
+
|
217
|
+
def format_modified_peptide(mod_info)
|
218
|
+
mod_seq="NULL"
|
219
|
+
if mod_info.length==1
|
220
|
+
mod_seq=mod_info[0].attributes['modified_peptide']
|
221
|
+
mod_seq.gsub!(/\[/,"\{")
|
222
|
+
mod_seq.gsub!(/\]/,"\}")
|
223
|
+
end
|
224
|
+
mod_seq
|
225
|
+
end
|
226
|
+
|
227
|
+
def insert_psms_from_file(filepath)
|
228
|
+
$pepxml_ns_prefix="xmlns:"
|
229
|
+
$pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"
|
230
|
+
|
231
|
+
pepxml_parser=XML::Parser.file("#{filepath}")
|
232
|
+
puts "Parsing #{filepath}"
|
233
|
+
pepxml_doc=pepxml_parser.parse
|
234
|
+
if not pepxml_doc.root.namespaces.default
|
235
|
+
$pepxml_ns_prefix=""
|
236
|
+
$pepxml_ns=nil
|
237
|
+
end
|
238
|
+
|
239
|
+
matched_spectra=[]
|
240
|
+
|
241
|
+
spectrum_queries=pepxml_doc.find("//#{$pepxml_ns_prefix}spectrum_query", $pepxml_ns)
|
242
|
+
|
243
|
+
spectrum_queries.each do |query|
|
244
|
+
|
245
|
+
spectrum_name = query.attributes['spectrum'].chomp.gsub("0","").sub(/\.\d+$/,"")
|
246
|
+
|
247
|
+
start_scan=query.attributes['start_scan'].to_i
|
248
|
+
end_scan=query.attributes['end_scan'].to_i
|
249
|
+
throw "Don't know how to deal with multi scan spectra" unless start_scan==end_scan
|
250
|
+
|
251
|
+
retention_time=query.attributes['retention_time_sec'].chomp.to_f
|
252
|
+
neutral_mass=query.attributes['precursor_neutral_mass'].to_f
|
253
|
+
assumed_charge=query.attributes['assumed_charge'].to_i
|
254
|
+
|
255
|
+
|
256
|
+
top_search_hit=query.find("./#{$pepxml_ns_prefix}search_result/#{$pepxml_ns_prefix}search_hit",$pepxml_ns)[0]
|
257
|
+
peptide=top_search_hit.attributes['peptide']
|
258
|
+
|
259
|
+
mod_info=top_search_hit.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
|
260
|
+
|
261
|
+
throw "More than one modification_info object for a peptide" unless mod_info.length<=1
|
262
|
+
modified_peptide=format_modified_peptide(mod_info)
|
263
|
+
|
264
|
+
calc_neutral_pep_mass=top_search_hit.attributes['calc_neutral_pep_mass'].to_f
|
265
|
+
massdiff = top_search_hit.attributes['massdiff'].to_f
|
266
|
+
prevaa = top_search_hit.attributes['peptide_prev_aa']
|
267
|
+
nextaa = top_search_hit.attributes['peptide_next_aa']
|
268
|
+
|
269
|
+
spectrum_name="NULL" unless spectrum_name
|
270
|
+
retention_time="NULL" unless retention_time
|
271
|
+
assumed_charge="NULL" unless assumed_charge
|
272
|
+
calc_neutral_pep_mass="NULL" unless calc_neutral_pep_mass
|
273
|
+
massdiff = "NULL" unless massdiff
|
274
|
+
prevaa = "NULL" unless prevaa
|
275
|
+
nextaa = "NULL" unless nextaa
|
276
|
+
|
277
|
+
|
278
|
+
$outputdb.execute <<-SQL
|
279
|
+
INSERT INTO PeptideSpectrumMatches(PeptideSequence,PeptideModifiedSequence,SpectrumID,ScanNum,RetentionTime,PrecursorNeutralMass,MassDeviation,PrevAA,NextAA)
|
280
|
+
VALUES('#{peptide}','#{modified_peptide}','#{spectrum_name}','#{start_scan}','#{retention_time.to_f}'\
|
281
|
+
,'#{calc_neutral_pep_mass}','#{massdiff}','#{prevaa}','#{nextaa}')
|
282
|
+
SQL
|
283
|
+
|
284
|
+
matched_spectra<<{:name => spectrum_name, :scan_num => start_scan}
|
285
|
+
|
286
|
+
end
|
287
|
+
|
288
|
+
matched_spectra
|
289
|
+
end
|
290
|
+
|
291
|
+
|
292
|
+
def lookup_spectra_from_files(file_list,matched_spectra)
|
293
|
+
|
294
|
+
titles_to_match = matched_spectra.collect { |s| s[:name] }
|
295
|
+
|
296
|
+
# require 'debugger';debugger
|
297
|
+
|
298
|
+
queries_with_spectra=Array.new.replace(titles_to_match)
|
299
|
+
|
300
|
+
num_matched=0
|
301
|
+
total_spectra=0
|
302
|
+
|
303
|
+
file_list.each do |file|
|
304
|
+
mzml_parser = MzMLParser.new(file)
|
305
|
+
|
306
|
+
spec = mzml_parser.next_spectrum
|
307
|
+
|
308
|
+
|
309
|
+
while (spec) do
|
310
|
+
total_spectra+=1
|
311
|
+
if titles_to_match.include? spec[:title]
|
312
|
+
num_matched+=1
|
313
|
+
queries_with_spectra.delete(spec[:title])
|
314
|
+
|
315
|
+
$outputdb.execute <<-SQL
|
316
|
+
INSERT INTO Spectra(MZData,IntensityData,SpectrumTitle,PrecursorMass)
|
317
|
+
VALUES('#{spec[:mz]}','#{spec[:intensity]}','#{spec[:title]}','#{spec[:precursormz]}')
|
318
|
+
SQL
|
319
|
+
|
320
|
+
else
|
321
|
+
|
322
|
+
end
|
323
|
+
spec = mzml_parser.next_spectrum
|
324
|
+
end
|
325
|
+
|
326
|
+
end
|
327
|
+
puts "Found #{num_matched} matching spectra"
|
328
|
+
puts "Total in spectrum files #{total_spectra}"
|
329
|
+
puts "Total psms #{titles_to_match.length}"
|
330
|
+
puts "Unmatched psms #{queries_with_spectra.length}"
|
331
|
+
|
332
|
+
|
333
|
+
|
334
|
+
end
|
335
|
+
|
336
|
+
# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
|
337
|
+
#
|
338
|
+
tool=Tool.new([:explicit_output,:over_write])
|
339
|
+
tool.option_parser.banner = "Convert a protXML file to a sqlite database.\n\nUsage: protxml_to_psql.rb [options] file1.protXML"
|
340
|
+
|
341
|
+
tool.add_value_option(:database,nil,['-d','--database path','A Fasta file where full protein sequences can be looked up'])
|
342
|
+
|
343
|
+
# require 'debugger';debugger
|
344
|
+
|
345
|
+
exit unless tool.check_options(true,[:explicit_output])
|
346
|
+
|
347
|
+
input_file=ARGV.shift
|
348
|
+
|
349
|
+
|
350
|
+
if File.exists? tool.explicit_output
|
351
|
+
throw "Cant overwrite existing db #{tool.explicit_output}" unless tool.over_write
|
352
|
+
File.delete(tool.explicit_output)
|
353
|
+
end
|
354
|
+
|
355
|
+
$fasta_lookup=nil
|
356
|
+
if tool.database
|
357
|
+
$fasta_lookup=prepare_fasta(tool.database,'prot')
|
358
|
+
end
|
359
|
+
|
360
|
+
$outputdb = SQLite3::Database.new tool.explicit_output
|
361
|
+
|
362
|
+
initialize_db
|
363
|
+
|
364
|
+
XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
|
365
|
+
|
366
|
+
protxml_parser=XML::Parser.file("#{input_file}")
|
367
|
+
|
368
|
+
$protxml_ns_prefix="xmlns:"
|
369
|
+
$protxml_ns="xmlns:http://regis-web.systemsbiology.net/protXML"
|
370
|
+
|
371
|
+
|
372
|
+
protxml_doc=protxml_parser.parse
|
373
|
+
if not protxml_doc.root.namespaces.default
|
374
|
+
$protxml_ns_prefix=""
|
375
|
+
$protxml_ns=nil
|
376
|
+
end
|
377
|
+
|
378
|
+
$protein_id=0
|
379
|
+
$peptide_id=0
|
380
|
+
|
381
|
+
headers_with_inputs=protxml_doc.find("//#{$protxml_ns_prefix}protein_summary_header[@source_files]",$protxml_ns)
|
382
|
+
|
383
|
+
matched_spectra=[]
|
384
|
+
|
385
|
+
headers_with_inputs.each do |header|
|
386
|
+
pepxml_files = header.attributes['source_files'].split(",")
|
387
|
+
pepxml_files.each do |pepxml_file|
|
388
|
+
matched_spectra.concat insert_psms_from_file(pepxml_file)
|
389
|
+
end
|
390
|
+
end
|
391
|
+
|
392
|
+
lookup_spectra_from_files(ARGV.collect { |file| file.chomp },matched_spectra)
|
393
|
+
|
394
|
+
protein_groups=protxml_doc.find("//#{$protxml_ns_prefix}protein_group", $protxml_ns)
|
395
|
+
|
396
|
+
protein_groups.each do |g|
|
397
|
+
insert_protein_group(g)
|
398
|
+
end
|
399
|
+
|
data/ext/decoymaker/decoymaker.c
CHANGED
@@ -20,24 +20,40 @@
|
|
20
20
|
|
21
21
|
#define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
|
22
22
|
#define NOT_AMINO_ACIDS "BJOUXZ*"
|
23
|
-
#define MAX_SEQUENCE_LENGTH
|
24
|
-
#define MAX_LINE_LENGTH
|
23
|
+
#define MAX_SEQUENCE_LENGTH 2000
|
24
|
+
#define MAX_LINE_LENGTH 200000 /* large enough to read in long header lines */
|
25
|
+
|
26
|
+
void RemoveSpaces(char* source)
|
27
|
+
{
|
28
|
+
char* i = source;
|
29
|
+
char* j = source;
|
30
|
+
while(*j != 0)
|
31
|
+
{
|
32
|
+
*i = *j++;
|
33
|
+
if(*i != ' ')
|
34
|
+
i++;
|
35
|
+
}
|
36
|
+
*i = 0;
|
37
|
+
}
|
25
38
|
|
26
39
|
|
27
40
|
static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
28
|
-
VALUE db_length_in,VALUE output_file_in,
|
29
|
-
|
30
|
-
|
41
|
+
VALUE db_length_in,VALUE output_file_in,VALUE prefix_string_in)
|
42
|
+
{
|
43
|
+
|
44
|
+
char *infile = StringValueCStr(input_file_in);
|
31
45
|
long sequences_to_generate = NUM2INT(db_length_in);
|
32
|
-
char *
|
46
|
+
char * outfile = StringValueCStr(output_file_in);
|
47
|
+
char *prefix_string = StringValueCStr(prefix_string_in);
|
33
48
|
|
34
49
|
char line[MAX_LINE_LENGTH];
|
35
|
-
char settings_line[60][70];
|
36
|
-
|
37
|
-
char prefix_string[255];
|
50
|
+
// char settings_line[60][70];
|
51
|
+
|
38
52
|
char *p,**index;
|
39
|
-
|
40
|
-
char one_sequence[MAX_SEQUENCE_LENGTH]
|
53
|
+
|
54
|
+
char one_sequence[MAX_SEQUENCE_LENGTH];
|
55
|
+
char random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)];
|
56
|
+
char random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
|
41
57
|
char *temp_sequence;
|
42
58
|
int a;
|
43
59
|
FILE *inp, *outp;
|
@@ -50,63 +66,57 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
50
66
|
double x;
|
51
67
|
|
52
68
|
/* scanning sequence database */
|
53
|
-
|
54
|
-
strcpy(infile,input_file);
|
55
69
|
|
56
70
|
if ((inp = fopen(infile, "r"))==NULL) {
|
57
71
|
printf("error opening sequence database %s\n",infile);return -1;
|
58
72
|
}
|
59
73
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
i=0;n=0;k=0;
|
74
|
+
long total_sequence_len=0;
|
75
|
+
n=0;
|
64
76
|
|
65
77
|
while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
fflush(stdout);
|
71
|
-
n++;
|
72
|
-
}
|
73
|
-
}
|
78
|
+
total_sequence_len+=strlen(line);
|
79
|
+
|
80
|
+
// printf("%ld\n",i);fflush(stdout);
|
81
|
+
if (line[0]=='>') { n++; }
|
74
82
|
}
|
75
|
-
|
83
|
+
|
76
84
|
n_sequences=n;
|
77
85
|
|
78
86
|
|
79
87
|
/* reading sequence database */
|
80
88
|
|
81
89
|
temp_sequence=(char*)calloc(sizeof(char),MAX_SEQUENCE_LENGTH);
|
82
|
-
|
90
|
+
|
91
|
+
char *sequence_block=(char*)malloc(sizeof(char)*(total_sequence_len+2));
|
92
|
+
|
83
93
|
index=(char**)malloc(sizeof(char*)*n_sequences);
|
84
|
-
index[0]=
|
94
|
+
index[0]=sequence_block; /* set first index pointer to beginning of first database sequence */
|
85
95
|
|
86
96
|
if ((inp = fopen(infile, "r"))==NULL) {
|
87
97
|
printf("error opening sequence database %s\n",infile);
|
88
98
|
return -1;
|
89
99
|
}
|
90
100
|
|
91
|
-
printf("done\nreading sequence database \n%s\n",infile);
|
92
|
-
fflush(stdout);
|
93
|
-
|
94
101
|
n=-1;
|
95
102
|
strcpy(temp_sequence,"\0");
|
96
103
|
|
97
104
|
while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
|
98
|
-
{
|
99
|
-
|
105
|
+
{
|
106
|
+
RemoveSpaces(line);
|
107
|
+
|
108
|
+
if (strcmp(line,"\n")==0) { // Skips blank lines
|
100
109
|
continue;
|
101
110
|
}
|
111
|
+
|
102
112
|
if (line[0]=='>') {
|
103
113
|
if (n>=0) {
|
104
|
-
|
105
|
-
printf(".");fflush(stdout);
|
106
|
-
}
|
114
|
+
|
107
115
|
strcpy(index[n],temp_sequence);
|
108
|
-
n++;
|
116
|
+
n++;
|
117
|
+
index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
|
109
118
|
strcpy(temp_sequence,"\0");
|
119
|
+
|
110
120
|
}
|
111
121
|
else
|
112
122
|
{
|
@@ -116,7 +126,9 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
116
126
|
}
|
117
127
|
else
|
118
128
|
{
|
119
|
-
if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH )
|
129
|
+
if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) {
|
130
|
+
continue;
|
131
|
+
}
|
120
132
|
strncat(temp_sequence,line,strlen(line)-1);
|
121
133
|
}
|
122
134
|
}
|
@@ -127,16 +139,18 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
127
139
|
|
128
140
|
n_sequences=n+1;
|
129
141
|
|
130
|
-
printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
|
142
|
+
// printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
|
143
|
+
|
144
|
+
// measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
|
145
|
+
|
131
146
|
|
132
|
-
measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
|
133
147
|
|
134
148
|
|
135
149
|
|
136
150
|
/* generating Markov probabilities */
|
137
151
|
|
138
|
-
printf("generating Markov probability matrix...");
|
139
|
-
fflush(stdout);
|
152
|
+
// printf("generating Markov probability matrix...");
|
153
|
+
// fflush(stdout);
|
140
154
|
|
141
155
|
srand(time(0)); /* replace with constant to re-generate identical random databases */
|
142
156
|
|
@@ -146,52 +160,53 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
146
160
|
}
|
147
161
|
}
|
148
162
|
for(j=0;j<=20;j++) {
|
149
|
-
measured_aa_freq[j]=0;
|
163
|
+
measured_aa_freq[j]=0;
|
164
|
+
generated_aa_freq[j]=0;
|
150
165
|
}
|
151
166
|
|
167
|
+
|
152
168
|
for(protein=0;protein<n_sequences;protein++)
|
153
169
|
{
|
154
|
-
if (
|
155
|
-
|
156
|
-
|
170
|
+
if (protein<(n_sequences-1))
|
171
|
+
{
|
172
|
+
long len_one_seq = (index[protein+1]-index[protein])/sizeof(char);
|
173
|
+
if ( len_one_seq > MAX_SEQUENCE_LENGTH ){
|
174
|
+
printf("Seq is longer than max len \n");fflush(stdout);
|
175
|
+
len_one_seq=MAX_SEQUENCE_LENGTH;
|
176
|
+
}
|
177
|
+
strncpy(one_sequence,index[protein],len_one_seq);
|
178
|
+
|
179
|
+
one_sequence[len_one_seq]='\0'; // NULL terminate the string
|
180
|
+
|
181
|
+
} else {
|
182
|
+
strcpy(one_sequence,index[protein]);
|
157
183
|
}
|
158
184
|
|
159
|
-
|
185
|
+
pl=strlen(one_sequence);
|
186
|
+
n=1;
|
187
|
+
one_index=0;
|
188
|
+
|
189
|
+
for(i=0;i<pl;i++)
|
160
190
|
{
|
161
|
-
|
162
|
-
one_sequence[(index[protein+1]-index[protein])/sizeof(char)]='\0';
|
163
|
-
}
|
164
|
-
else strcpy(one_sequence,index[protein]);
|
165
|
-
pl=strlen(one_sequence);
|
166
|
-
n=1;one_index=0;
|
167
|
-
|
168
|
-
for(i=0;i<pl;i++)
|
169
|
-
{
|
170
|
-
if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
|
171
|
-
{
|
172
|
-
if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
|
191
|
+
if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
|
173
192
|
{
|
174
|
-
|
193
|
+
if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
|
194
|
+
{
|
195
|
+
printf("Unknown amino acid %c",one_sequence[i]);
|
196
|
+
} else {
|
197
|
+
a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
|
198
|
+
MP[a][i]++;
|
199
|
+
measured_aa_freq[a]++;
|
200
|
+
}
|
175
201
|
} else {
|
176
|
-
a=20
|
177
|
-
MP[a][i]++;
|
202
|
+
a=floor(20*(float)rand()/RAND_MAX);
|
203
|
+
MP[a][i]++;
|
178
204
|
measured_aa_freq[a]++;
|
179
|
-
}
|
180
|
-
}
|
181
|
-
else {
|
182
|
-
a=floor(20*(float)rand()/RAND_MAX);
|
183
|
-
MP[a][i]++;
|
184
|
-
measured_aa_freq[a]++;
|
185
|
-
} // replace B, X, Z etc. with random amino acid to preserve size distribution
|
186
|
-
}
|
187
|
-
MP[20][pl]++;
|
188
|
-
measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
|
205
|
+
} // replace B, X, Z etc. with random amino acid to preserve size distribution
|
189
206
|
}
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
207
|
+
MP[20][pl]++;
|
208
|
+
measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
|
209
|
+
}
|
195
210
|
|
196
211
|
for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
|
197
212
|
row_sum[i]=0;
|
@@ -204,41 +219,38 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
204
219
|
}
|
205
220
|
|
206
221
|
|
207
|
-
/* generate random protein sequences through Markov chain */
|
208
|
-
|
209
|
-
strcpy(outfile,output_file);
|
210
222
|
|
211
|
-
|
212
|
-
printf("error opening output file %s\n",outfile);
|
213
|
-
return -1;
|
214
|
-
}
|
223
|
+
/* generate random protein sequences through Markov chain */
|
215
224
|
|
216
|
-
printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
|
217
225
|
|
218
|
-
|
226
|
+
if ((outp = fopen(outfile, "w"))==NULL) {
|
227
|
+
printf("error opening output file %s\n",outfile);
|
228
|
+
return -1;
|
229
|
+
}
|
219
230
|
|
220
|
-
|
221
|
-
|
222
|
-
if (!(protein%1000)) {
|
223
|
-
printf(".");fflush(stdout);
|
224
|
-
}
|
231
|
+
for(protein=0;protein<sequences_to_generate;protein++)
|
232
|
+
{
|
225
233
|
|
226
|
-
|
227
|
-
|
234
|
+
i=0; j=0;
|
235
|
+
while (1)
|
236
|
+
{
|
237
|
+
x=(double)row_sum[j]*((double)rand()/RAND_MAX);
|
238
|
+
partial_sum=MP[0][j]; i=1;
|
239
|
+
|
240
|
+
while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
|
241
|
+
|
242
|
+
if (j>=MAX_SEQUENCE_LENGTH) { i=21; }/* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
|
243
|
+
|
244
|
+
if (i<21)
|
228
245
|
{
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
else /* i==21, i.e. protein sequence terminated */
|
238
|
-
{
|
239
|
-
k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
|
240
|
-
for(l=0;l<j;l++)
|
241
|
-
{
|
246
|
+
random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
|
247
|
+
} else { /* i==21, i.e. protein sequence terminated */
|
248
|
+
k=0;
|
249
|
+
generated_aa_freq[20]++;
|
250
|
+
generated_pl_sum+=j;
|
251
|
+
|
252
|
+
for(l=0;l<j;l++)
|
253
|
+
{
|
242
254
|
random_sequence_output[k]=random_sequence[l]; k++;
|
243
255
|
if (!((k+1)%61))
|
244
256
|
{
|
@@ -256,19 +268,13 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
256
268
|
|
257
269
|
fclose(outp);
|
258
270
|
|
259
|
-
|
260
|
-
/* freeing some memory... */
|
261
|
-
|
262
|
-
free(index);
|
263
271
|
|
264
|
-
printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
|
272
|
+
// printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
|
265
273
|
|
266
274
|
k=0;l=0;
|
267
275
|
for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
|
268
|
-
// printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
|
269
|
-
// for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
|
270
276
|
|
271
|
-
printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
|
277
|
+
// printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
|
272
278
|
|
273
279
|
return 0;
|
274
280
|
|
data/lib/protk/galaxy_stager.rb
CHANGED
@@ -20,14 +20,13 @@ class GalaxyStager
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
def replace_references(in_file, options = {})
|
24
|
-
options = { :base_only => false }.merge(options)
|
25
|
-
replacement = options[:base_only] ? @staged_base : @staged_path
|
23
|
+
def replace_references(in_file)
|
26
24
|
GalaxyStager.replace_references(in_file, @original_path, replacement)
|
27
25
|
end
|
28
26
|
|
29
|
-
def restore_references(in_file)
|
30
|
-
|
27
|
+
def restore_references(in_file, options = {})
|
28
|
+
path = options[:base_only] ? @staged_path.gsub(/#{@extension}/,"") : @staged_path
|
29
|
+
GalaxyStager.replace_references(in_file, path, @original_path)
|
31
30
|
end
|
32
31
|
|
33
32
|
def self.replace_references(in_file, from_path, to_path)
|
data/lib/protk/galaxy_util.rb
CHANGED
data/lib/protk/mzml_parser.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
|
3
|
+
include LibXML
|
4
|
+
|
5
|
+
class MzMLParser < Object
|
6
|
+
|
7
|
+
|
8
|
+
def initialize(path)
|
9
|
+
@namespace=
|
10
|
+
@mzml_ns_prefix="xmlns:"
|
11
|
+
@mzml_ns="xmlns:http://psi.hupo.org/ms/mzml"
|
12
|
+
|
13
|
+
doc=XML::Document.file(path)
|
14
|
+
@file_reader=XML::Reader.document(doc)
|
15
|
+
end
|
16
|
+
|
17
|
+
def next_spectrum()
|
18
|
+
|
19
|
+
until @file_reader.name=="spectrum"
|
20
|
+
if !@file_reader.read()
|
21
|
+
return nil
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
this_spect=spectrum_as_hash(@file_reader.expand)
|
26
|
+
|
27
|
+
@file_reader.next_sibling
|
28
|
+
|
29
|
+
return this_spect
|
30
|
+
end
|
31
|
+
|
32
|
+
def spectrum_as_hash(spectrum)
|
33
|
+
index=spectrum.attributes['index']
|
34
|
+
sid = spectrum.attributes['id']
|
35
|
+
precursor_mz_param = spectrum.find(".//#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000744\"]",@mzml_ns)[0]
|
36
|
+
mslevel_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000511\"]",@mzml_ns)[0]
|
37
|
+
|
38
|
+
title_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000796\"]",@mzml_ns)[0]
|
39
|
+
|
40
|
+
# prec_mz = spectrum.find(".//#{@mz}")
|
41
|
+
|
42
|
+
precursor_mz_mz = precursor_mz_param.attributes['value'] if precursor_mz_param
|
43
|
+
mslevel = mslevel_param.attributes['value'] if mslevel_param
|
44
|
+
spectrum_title = title_param['value'] if title_param
|
45
|
+
|
46
|
+
data_arrays = spectrum.find("./#{@mzml_ns_prefix}binaryDataArrayList/#{@mzml_ns_prefix}binaryDataArray",@mzml_ns)
|
47
|
+
|
48
|
+
data={}
|
49
|
+
data_arrays.each do |arr|
|
50
|
+
the_data = arr.find("./#{@mzml_ns_prefix}binary",@mzml_ns)[0].content
|
51
|
+
mzaccession = arr.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000514\"]",@mzml_ns)
|
52
|
+
if ( mzaccession.length==1 )
|
53
|
+
data[:mz] = the_data
|
54
|
+
else
|
55
|
+
data[:intensity] = the_data
|
56
|
+
end
|
57
|
+
end
|
58
|
+
data[:title]=spectrum_title
|
59
|
+
data[:mzlevel]=mslevel
|
60
|
+
data[:index]=index
|
61
|
+
data[:precursormz]=precursor_mz_mz
|
62
|
+
data[:id]=sid
|
63
|
+
|
64
|
+
data
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
data/lib/protk/tool.rb
CHANGED
@@ -53,7 +53,7 @@ class Tool
|
|
53
53
|
end
|
54
54
|
|
55
55
|
|
56
|
-
def add_value_option(symbol,default_value,opts)
|
56
|
+
def add_value_option(symbol,default_value,opts)
|
57
57
|
@options[symbol]=default_value
|
58
58
|
@option_parser.on(*opts) do |val|
|
59
59
|
@options[symbol]=val
|
@@ -108,6 +108,8 @@ class Tool
|
|
108
108
|
add_value_option(:threads,1,['-n','--threads num','Number of processing threads to use. Set to 0 to autodetect an appropriate value'])
|
109
109
|
end
|
110
110
|
|
111
|
+
|
112
|
+
|
111
113
|
end
|
112
114
|
|
113
115
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: protk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0.pre3
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ira Cooke
|
@@ -152,6 +152,20 @@ dependencies:
|
|
152
152
|
- - ~>
|
153
153
|
- !ruby/object:Gem::Version
|
154
154
|
version: '0'
|
155
|
+
- !ruby/object:Gem::Dependency
|
156
|
+
name: sqlite3
|
157
|
+
requirement: !ruby/object:Gem::Requirement
|
158
|
+
requirements:
|
159
|
+
- - ~>
|
160
|
+
- !ruby/object:Gem::Version
|
161
|
+
version: '0'
|
162
|
+
type: :runtime
|
163
|
+
prerelease: false
|
164
|
+
version_requirements: !ruby/object:Gem::Requirement
|
165
|
+
requirements:
|
166
|
+
- - ~>
|
167
|
+
- !ruby/object:Gem::Version
|
168
|
+
version: '0'
|
155
169
|
description: A bunch of tools for proteomics
|
156
170
|
email: iracooke@gmail.com
|
157
171
|
executables:
|
@@ -195,6 +209,7 @@ files:
|
|
195
209
|
- bin/protein_prophet.rb
|
196
210
|
- bin/protk_setup.rb
|
197
211
|
- bin/protxml_to_gff.rb
|
212
|
+
- bin/protxml_to_psql.rb
|
198
213
|
- bin/protxml_to_table.rb
|
199
214
|
- bin/repair_run_summary.rb
|
200
215
|
- bin/sixframe.rb
|
@@ -237,6 +252,7 @@ files:
|
|
237
252
|
- lib/protk/manage_db_rakefile.rake
|
238
253
|
- lib/protk/manage_db_tool.rb
|
239
254
|
- lib/protk/mascot_util.rb
|
255
|
+
- lib/protk/mzml_parser.rb
|
240
256
|
- lib/protk/omssa_util.rb
|
241
257
|
- lib/protk/openms_defaults.rb
|
242
258
|
- lib/protk/pepxml.rb
|
@@ -266,9 +282,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
266
282
|
version: '0'
|
267
283
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
268
284
|
requirements:
|
269
|
-
- - '>'
|
285
|
+
- - '>='
|
270
286
|
- !ruby/object:Gem::Version
|
271
|
-
version: 1.3.1
|
287
|
+
version: '0'
|
272
288
|
requirements: []
|
273
289
|
rubyforge_project:
|
274
290
|
rubygems_version: 2.2.1
|