protk 1.3.0.pre3 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/make_decoy.rb +1 -1
- data/bin/msgfplus_search.rb +5 -5
- data/bin/omssa_search.rb +0 -18
- data/bin/peptide_prophet.rb +10 -3
- data/bin/protein_prophet.rb +14 -11
- data/bin/protxml_to_psql.rb +399 -0
- data/ext/decoymaker/decoymaker.c +120 -114
- data/lib/protk/galaxy_stager.rb +4 -5
- data/lib/protk/galaxy_util.rb +1 -1
- data/lib/protk/mzml_parser.rb +67 -0
- data/lib/protk/tool.rb +3 -1
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5e8f8a571cb43ed61984a34b6e1fb51caf979593
|
4
|
+
data.tar.gz: b53857f75c1ff6ca850859c3985aee36533e437f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9450fccc4a5ce59f064927d62fbc6a4342a1710c3b82707e0908dea52af7d0b50f215e64073bb067a506d204701acea11b6d28f302447494b8a30b1e7af2df2d
|
7
|
+
data.tar.gz: 1b8bc78fc09b4c81eee72fad169a6aee7145a16312c01bc95a5dd590f08cb98194b26115a166a759b9c52c7c67204a767747642e5e9331de4562d52f31eb1e11
|
data/bin/make_decoy.rb
CHANGED
@@ -36,7 +36,6 @@ input_file=ARGV[0]
|
|
36
36
|
db_length=tool.db_length
|
37
37
|
if ( db_length==0) #If no db length was specified use the number of entries in the input file
|
38
38
|
db_length=Bio::FastaFormat.open(input_file).count
|
39
|
-
puts "Found #{db_length} entries in input file"
|
40
39
|
end
|
41
40
|
|
42
41
|
output_file = tool.explicit_output if tool.explicit_output!=nil
|
@@ -65,6 +64,7 @@ end
|
|
65
64
|
|
66
65
|
if ( tool.append )
|
67
66
|
cmd ="awk 'FNR==1{print \"\"}1' #{input_file} #{decoys_tmp_file} > #{output_file};"
|
67
|
+
cmd << "sed -i.bak '/^$/d' #{output_file};"
|
68
68
|
cmd << "rm #{decoys_tmp_file}"
|
69
69
|
else
|
70
70
|
cmd = "mv #{decoys_tmp_file} #{output_file}"
|
data/bin/msgfplus_search.rb
CHANGED
@@ -83,16 +83,17 @@ database_path=db_info.path
|
|
83
83
|
|
84
84
|
# Database must have fasta extension
|
85
85
|
if Pathname.new(database_path).extname.to_s.downcase != ".fasta"
|
86
|
-
|
86
|
+
File.symlink(database_path,"#{database_path}.fasta") unless File.exists?("#{database_path}.fasta")
|
87
|
+
# make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
|
87
88
|
database_path="#{database_path}.fasta"
|
88
|
-
|
89
|
+
database_path
|
89
90
|
end
|
90
91
|
|
91
92
|
# Database must be indexed
|
92
93
|
unless FileTest.exists?("#{database_path}.canno")
|
93
|
-
dbdir = Pathname.new(database_path).dirname.
|
94
|
+
dbdir = Pathname.new(database_path).dirname.to_s
|
94
95
|
tdavalue=search_tool.decoy_search ? 1 : 0;
|
95
|
-
make_msgfdb_cmd << "
|
96
|
+
make_msgfdb_cmd << "java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
|
96
97
|
end
|
97
98
|
|
98
99
|
|
@@ -214,7 +215,6 @@ ARGV.each do |filename|
|
|
214
215
|
else
|
215
216
|
cmd << "; mv #{mzid_output_path} #{output_path}"
|
216
217
|
end
|
217
|
-
|
218
218
|
|
219
219
|
# Up to here we've formulated the command. The rest is cleanup
|
220
220
|
p "Running:#{cmd}"
|
data/bin/omssa_search.rb
CHANGED
@@ -12,8 +12,6 @@ require 'protk/command_runner'
|
|
12
12
|
require 'protk/search_tool'
|
13
13
|
require 'protk/galaxy_util'
|
14
14
|
|
15
|
-
for_galaxy = GalaxyUtil.for_galaxy?
|
16
|
-
|
17
15
|
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
18
16
|
#
|
19
17
|
search_tool=SearchTool.new([
|
@@ -94,22 +92,6 @@ ARGV.each do |filename|
|
|
94
92
|
#
|
95
93
|
cmd << " -v #{search_tool.missed_cleavages}"
|
96
94
|
|
97
|
-
# If this is for Galaxy and a data directory has been specified
|
98
|
-
# look for a common unimod.xml file.
|
99
|
-
if for_galaxy
|
100
|
-
galaxy_index_dir = search_tool.galaxy_index_dir
|
101
|
-
if galaxy_index_dir
|
102
|
-
galaxy_mods = File.join(galaxy_index_dir, "mods.xml")
|
103
|
-
if( FileTest.exists?(galaxy_mods) )
|
104
|
-
cmd << " -mx #{galaxy_mods}"
|
105
|
-
end
|
106
|
-
galaxy_usermods = File.join(galaxy_index_dir, "usermods.xml")
|
107
|
-
if( FileTest.exists?(galaxy_usermods) )
|
108
|
-
cmd << " -mux #{galaxy_usermods}"
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
95
|
if ( search_tool.omx_output )
|
114
96
|
cmd << " -ox #{search_tool.omx_output} "
|
115
97
|
end
|
data/bin/peptide_prophet.rb
CHANGED
@@ -51,10 +51,11 @@ throw "When --output and -F options are set only one file at a time can be run"
|
|
51
51
|
# Obtain a global environment object
|
52
52
|
genv=Constants.new
|
53
53
|
|
54
|
-
|
54
|
+
input_stagers=[]
|
55
55
|
inputs=ARGV.collect { |file_name| file_name.chomp}
|
56
56
|
if for_galaxy
|
57
|
-
|
57
|
+
input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
|
58
|
+
inputs=input_stagers.collect { |sg| sg.staged_path }
|
58
59
|
end
|
59
60
|
|
60
61
|
# Interrogate all the input files to obtain the database and search engine from them
|
@@ -212,7 +213,13 @@ else
|
|
212
213
|
|
213
214
|
cmd=generate_command(genv,prophet_tool,inputs,output_file_name,database,engine)
|
214
215
|
run_peptide_prophet(genv,prophet_tool,cmd,output_file_name,engine)
|
215
|
-
|
216
|
+
|
216
217
|
end
|
217
218
|
|
219
|
+
if (for_galaxy)
|
220
|
+
input_stagers.each do |sg|
|
221
|
+
sg.restore_references(output_file_name)
|
222
|
+
sg.restore_references(output_file_name,{:base_only => true})
|
223
|
+
end
|
224
|
+
end
|
218
225
|
|
data/bin/protein_prophet.rb
CHANGED
@@ -40,7 +40,13 @@ exit unless prophet_tool.check_options(true)
|
|
40
40
|
# Obtain a global environment object
|
41
41
|
genv=Constants.new
|
42
42
|
|
43
|
-
|
43
|
+
input_stagers=[]
|
44
|
+
inputs=ARGV.collect { |file_name| file_name.chomp}
|
45
|
+
if for_galaxy
|
46
|
+
input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
|
47
|
+
inputs=input_stagers.collect { |sg| sg.staged_path }
|
48
|
+
end
|
49
|
+
|
44
50
|
|
45
51
|
if ( prophet_tool.explicit_output )
|
46
52
|
output_file=prophet_tool.explicit_output
|
@@ -52,11 +58,6 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
|
|
52
58
|
|
53
59
|
cmd="ProteinProphet "
|
54
60
|
|
55
|
-
if for_galaxy
|
56
|
-
inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
|
57
|
-
end
|
58
|
-
|
59
|
-
|
60
61
|
cmd << " #{inputs.join(" ")} #{output_file}"
|
61
62
|
|
62
63
|
if ( prophet_tool.glyco )
|
@@ -71,11 +72,13 @@ else
|
|
71
72
|
genv.log("Protein Prophet output file #{output_file} already exists. Run with -r option to replace",:warn)
|
72
73
|
end
|
73
74
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
75
|
+
|
76
|
+
if (for_galaxy)
|
77
|
+
input_stagers.each do |sg|
|
78
|
+
sg.restore_references(output_file)
|
79
|
+
sg.restore_references(output_file,{:base_only => true})
|
80
|
+
end
|
81
|
+
end
|
79
82
|
|
80
83
|
|
81
84
|
|
@@ -0,0 +1,399 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 18/1/2011
|
5
|
+
#
|
6
|
+
# Convert a protein/peptide xml file to sqlite database
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'libxml'
|
11
|
+
require 'protk/constants'
|
12
|
+
require 'protk/command_runner'
|
13
|
+
require 'protk/tool'
|
14
|
+
require 'protk/fastadb'
|
15
|
+
require 'sqlite3'
|
16
|
+
require 'protk/mzml_parser'
|
17
|
+
|
18
|
+
include LibXML
|
19
|
+
|
20
|
+
def prepare_fasta(database_path,type)
|
21
|
+
|
22
|
+
db_filename = nil
|
23
|
+
case
|
24
|
+
when Pathname.new(database_path).exist? # It's an explicitly named db
|
25
|
+
db_filename = Pathname.new(database_path).expand_path.to_s
|
26
|
+
else
|
27
|
+
db_filename=Constants.new.current_database_for_name(database_path)
|
28
|
+
end
|
29
|
+
|
30
|
+
db_indexfilename = "#{db_filename}.pin"
|
31
|
+
|
32
|
+
if File.exist?(db_indexfilename)
|
33
|
+
puts "Using existing indexed database"
|
34
|
+
orf_lookup = FastaDB.new(db_filename)
|
35
|
+
else
|
36
|
+
puts "Indexing database"
|
37
|
+
orf_lookup = FastaDB.create(db_filename,db_filename,type)
|
38
|
+
end
|
39
|
+
orf_lookup
|
40
|
+
end
|
41
|
+
|
42
|
+
def get_fasta_record(protein_name,fastadb)
|
43
|
+
# puts "Looking up #{protein_name}"
|
44
|
+
entry = fastadb.get_by_id protein_name
|
45
|
+
if ( entry == nil)
|
46
|
+
puts "Failed lookup for #{protein_name}"
|
47
|
+
raise KeyError
|
48
|
+
end
|
49
|
+
entry
|
50
|
+
end
|
51
|
+
|
52
|
+
def initialize_db()
|
53
|
+
result = $outputdb.execute <<-SQL
|
54
|
+
CREATE TABLE ProteinGroups (
|
55
|
+
ID INT,
|
56
|
+
Probability REAL
|
57
|
+
);
|
58
|
+
SQL
|
59
|
+
|
60
|
+
result = $outputdb.execute <<-SQL
|
61
|
+
CREATE TABLE Proteins (
|
62
|
+
ID INT,
|
63
|
+
ProteinGroupID INT,
|
64
|
+
Probability REAL,
|
65
|
+
Name TEXT,
|
66
|
+
Description TEXT,
|
67
|
+
Coverage REAL,
|
68
|
+
NumPeptides INT,
|
69
|
+
Indistinguishables TEXT,
|
70
|
+
Sequence TEXT
|
71
|
+
);
|
72
|
+
SQL
|
73
|
+
|
74
|
+
result = $outputdb.execute <<-SQL
|
75
|
+
CREATE TABLE Peptides (
|
76
|
+
ID INT,
|
77
|
+
ProteinID INT,
|
78
|
+
Probability REAL,
|
79
|
+
Sequence TEXT,
|
80
|
+
Start INT,
|
81
|
+
End INT,
|
82
|
+
ModifiedSequence TEXT
|
83
|
+
);
|
84
|
+
SQL
|
85
|
+
|
86
|
+
# This has the role of a join table for the Peptides <-> Spectra many to many relationship
|
87
|
+
result = $outputdb.execute <<-SQL
|
88
|
+
CREATE TABLE PeptideSpectrumMatches (
|
89
|
+
PeptideSequence TEXT,
|
90
|
+
PeptideModifiedSequence TEXT,
|
91
|
+
SpectrumID INT,
|
92
|
+
ScanNum INT,
|
93
|
+
RetentionTime REAL,
|
94
|
+
PrecursorNeutralMass REAL,
|
95
|
+
MassDeviation REAL,
|
96
|
+
PrevAA TEXT,
|
97
|
+
NextAA TEXT
|
98
|
+
);
|
99
|
+
SQL
|
100
|
+
|
101
|
+
result = $outputdb.execute <<-SQL
|
102
|
+
CREATE TABLE Spectra (
|
103
|
+
ID INTEGER PRIMARY KEY,
|
104
|
+
MZData TEXT,
|
105
|
+
IntensityData TEXT,
|
106
|
+
PrecursorMass REAL,
|
107
|
+
PrecursorCharge INT,
|
108
|
+
SpectrumType INT,
|
109
|
+
SpectrumTitle TEXT
|
110
|
+
);
|
111
|
+
SQL
|
112
|
+
|
113
|
+
end
|
114
|
+
|
115
|
+
def insert_protein_group(group_node)
|
116
|
+
group_number=group_node.attributes['group_number']
|
117
|
+
group_prob=group_node.attributes['probability']
|
118
|
+
$outputdb.execute <<-SQL
|
119
|
+
INSERT INTO ProteinGroups(ID,Probability) VALUES(
|
120
|
+
#{group_number},#{group_prob}
|
121
|
+
);
|
122
|
+
SQL
|
123
|
+
|
124
|
+
proteins=group_node.find("./#{$protxml_ns_prefix}protein", $protxml_ns)
|
125
|
+
|
126
|
+
proteins.each do |protein|
|
127
|
+
insert_protein(protein,group_number)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
def protein_dbid_from_name(protein_name)
|
132
|
+
protein_name #TODO: Allow user defined regex to parse this
|
133
|
+
end
|
134
|
+
|
135
|
+
def insert_protein(protein,group_id)
|
136
|
+
|
137
|
+
indis_proteins=protein.find("./#{$protxml_ns_prefix}indistinguishable_protein", $protxml_ns)
|
138
|
+
indis_proteins_summary=""
|
139
|
+
indis_proteins.each { |iprot| indis_proteins_summary<<"#{iprot.attributes['protein_name']};" }
|
140
|
+
|
141
|
+
annot_descr=protein.find("./#{$protxml_ns_prefix}annotation[@protein_description]", $protxml_ns)
|
142
|
+
|
143
|
+
|
144
|
+
protein_prob=protein.attributes['probability']
|
145
|
+
protein_name=protein.attributes['protein_name']
|
146
|
+
|
147
|
+
begin
|
148
|
+
protein_description=annot_descr[0].attributes['protein_description'].chomp.gsub("'","")
|
149
|
+
rescue
|
150
|
+
puts "No protein_description"
|
151
|
+
end
|
152
|
+
protein_coverage=protein.attributes['percent_coverage']
|
153
|
+
protein_npep = protein.attributes['total_number_peptides']
|
154
|
+
protein_indis = indis_proteins_summary
|
155
|
+
|
156
|
+
protein_coverage="NULL" unless protein_coverage
|
157
|
+
protein_indis="NULL" unless protein_indis
|
158
|
+
protein_description="NULL" unless protein_description
|
159
|
+
|
160
|
+
if $fasta_lookup
|
161
|
+
begin
|
162
|
+
entry=get_fasta_record(protein_name,$fasta_lookup)
|
163
|
+
protein_seq=entry.aaseq
|
164
|
+
rescue
|
165
|
+
puts "Warning: No entry found for #{protein_name}"
|
166
|
+
protein_seq="NULL"
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
begin
|
171
|
+
$outputdb.execute <<-SQL
|
172
|
+
INSERT INTO Proteins(ID,ProteinGroupID,Probability,Name,Description,Coverage,NumPeptides,Indistinguishables,Sequence)
|
173
|
+
VALUES(#{$protein_id},#{group_id},#{protein_prob},\'#{protein_name}\',\'#{protein_description}\',#{protein_coverage},
|
174
|
+
#{protein_npep},\'#{protein_indis}\','#{protein_seq}');
|
175
|
+
SQL
|
176
|
+
rescue
|
177
|
+
throw "Unable to insert #{protein_description}\n"
|
178
|
+
end
|
179
|
+
peptides=protein.find("./#{$protxml_ns_prefix}peptide",$protxml_ns)
|
180
|
+
|
181
|
+
peptides.each do |peptide|
|
182
|
+
insert_peptide(peptide,$protein_id,protein_seq)
|
183
|
+
end
|
184
|
+
$protein_id+=1
|
185
|
+
end
|
186
|
+
|
187
|
+
def insert_peptide(peptide,protein_id,protein_seq)
|
188
|
+
nsp_adjusted_probability=peptide.attributes['nsp_adjusted_probability']
|
189
|
+
sequence=peptide.attributes['peptide_sequence']
|
190
|
+
|
191
|
+
start_pos="NULL"
|
192
|
+
end_pos="NULL"
|
193
|
+
begin
|
194
|
+
if protein_seq!="NULL"
|
195
|
+
start_pos = protein_seq.index(sequence)
|
196
|
+
end_pos = start_pos+sequence.length
|
197
|
+
end
|
198
|
+
rescue
|
199
|
+
puts "Unable to locate peptide #{sequence} in protein #{protein_seq} for #{$protein_id}\n"
|
200
|
+
start_pos="NULL"
|
201
|
+
end_pos="NULL"
|
202
|
+
end
|
203
|
+
mod_info=peptide.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
|
204
|
+
|
205
|
+
throw "More than one modification_info object for a peptide" unless mod_info.length<=1
|
206
|
+
mod_seq=format_modified_peptide(mod_info)
|
207
|
+
|
208
|
+
$outputdb.execute <<-SQL
|
209
|
+
INSERT INTO Peptides(ID,ProteinID,Probability,Sequence,Start,End,ModifiedSequence)
|
210
|
+
VALUES(#{$peptide_id},#{protein_id},#{nsp_adjusted_probability},\'#{sequence}\',
|
211
|
+
#{start_pos},#{end_pos},\'#{mod_seq}\')
|
212
|
+
SQL
|
213
|
+
$peptide_id+=1
|
214
|
+
|
215
|
+
end
|
216
|
+
|
217
|
+
def format_modified_peptide(mod_info)
|
218
|
+
mod_seq="NULL"
|
219
|
+
if mod_info.length==1
|
220
|
+
mod_seq=mod_info[0].attributes['modified_peptide']
|
221
|
+
mod_seq.gsub!(/\[/,"\{")
|
222
|
+
mod_seq.gsub!(/\]/,"\}")
|
223
|
+
end
|
224
|
+
mod_seq
|
225
|
+
end
|
226
|
+
|
227
|
+
def insert_psms_from_file(filepath)
|
228
|
+
$pepxml_ns_prefix="xmlns:"
|
229
|
+
$pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"
|
230
|
+
|
231
|
+
pepxml_parser=XML::Parser.file("#{filepath}")
|
232
|
+
puts "Parsing #{filepath}"
|
233
|
+
pepxml_doc=pepxml_parser.parse
|
234
|
+
if not pepxml_doc.root.namespaces.default
|
235
|
+
$pepxml_ns_prefix=""
|
236
|
+
$pepxml_ns=nil
|
237
|
+
end
|
238
|
+
|
239
|
+
matched_spectra=[]
|
240
|
+
|
241
|
+
spectrum_queries=pepxml_doc.find("//#{$pepxml_ns_prefix}spectrum_query", $pepxml_ns)
|
242
|
+
|
243
|
+
spectrum_queries.each do |query|
|
244
|
+
|
245
|
+
spectrum_name = query.attributes['spectrum'].chomp.gsub("0","").sub(/\.\d+$/,"")
|
246
|
+
|
247
|
+
start_scan=query.attributes['start_scan'].to_i
|
248
|
+
end_scan=query.attributes['end_scan'].to_i
|
249
|
+
throw "Don't know how to deal with multi scan spectra" unless start_scan==end_scan
|
250
|
+
|
251
|
+
retention_time=query.attributes['retention_time_sec'].chomp.to_f
|
252
|
+
neutral_mass=query.attributes['precursor_neutral_mass'].to_f
|
253
|
+
assumed_charge=query.attributes['assumed_charge'].to_i
|
254
|
+
|
255
|
+
|
256
|
+
top_search_hit=query.find("./#{$pepxml_ns_prefix}search_result/#{$pepxml_ns_prefix}search_hit",$pepxml_ns)[0]
|
257
|
+
peptide=top_search_hit.attributes['peptide']
|
258
|
+
|
259
|
+
mod_info=top_search_hit.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
|
260
|
+
|
261
|
+
throw "More than one modification_info object for a peptide" unless mod_info.length<=1
|
262
|
+
modified_peptide=format_modified_peptide(mod_info)
|
263
|
+
|
264
|
+
calc_neutral_pep_mass=top_search_hit.attributes['calc_neutral_pep_mass'].to_f
|
265
|
+
massdiff = top_search_hit.attributes['massdiff'].to_f
|
266
|
+
prevaa = top_search_hit.attributes['peptide_prev_aa']
|
267
|
+
nextaa = top_search_hit.attributes['peptide_next_aa']
|
268
|
+
|
269
|
+
spectrum_name="NULL" unless spectrum_name
|
270
|
+
retention_time="NULL" unless retention_time
|
271
|
+
assumed_charge="NULL" unless assumed_charge
|
272
|
+
calc_neutral_pep_mass="NULL" unless calc_neutral_pep_mass
|
273
|
+
massdiff = "NULL" unless massdiff
|
274
|
+
prevaa = "NULL" unless prevaa
|
275
|
+
nextaa = "NULL" unless nextaa
|
276
|
+
|
277
|
+
|
278
|
+
$outputdb.execute <<-SQL
|
279
|
+
INSERT INTO PeptideSpectrumMatches(PeptideSequence,PeptideModifiedSequence,SpectrumID,ScanNum,RetentionTime,PrecursorNeutralMass,MassDeviation,PrevAA,NextAA)
|
280
|
+
VALUES('#{peptide}','#{modified_peptide}','#{spectrum_name}','#{start_scan}','#{retention_time.to_f}'\
|
281
|
+
,'#{calc_neutral_pep_mass}','#{massdiff}','#{prevaa}','#{nextaa}')
|
282
|
+
SQL
|
283
|
+
|
284
|
+
matched_spectra<<{:name => spectrum_name, :scan_num => start_scan}
|
285
|
+
|
286
|
+
end
|
287
|
+
|
288
|
+
matched_spectra
|
289
|
+
end
|
290
|
+
|
291
|
+
|
292
|
+
def lookup_spectra_from_files(file_list,matched_spectra)
|
293
|
+
|
294
|
+
titles_to_match = matched_spectra.collect { |s| s[:name] }
|
295
|
+
|
296
|
+
# require 'debugger';debugger
|
297
|
+
|
298
|
+
queries_with_spectra=Array.new.replace(titles_to_match)
|
299
|
+
|
300
|
+
num_matched=0
|
301
|
+
total_spectra=0
|
302
|
+
|
303
|
+
file_list.each do |file|
|
304
|
+
mzml_parser = MzMLParser.new(file)
|
305
|
+
|
306
|
+
spec = mzml_parser.next_spectrum
|
307
|
+
|
308
|
+
|
309
|
+
while (spec) do
|
310
|
+
total_spectra+=1
|
311
|
+
if titles_to_match.include? spec[:title]
|
312
|
+
num_matched+=1
|
313
|
+
queries_with_spectra.delete(spec[:title])
|
314
|
+
|
315
|
+
$outputdb.execute <<-SQL
|
316
|
+
INSERT INTO Spectra(MZData,IntensityData,SpectrumTitle,PrecursorMass)
|
317
|
+
VALUES('#{spec[:mz]}','#{spec[:intensity]}','#{spec[:title]}','#{spec[:precursormz]}')
|
318
|
+
SQL
|
319
|
+
|
320
|
+
else
|
321
|
+
|
322
|
+
end
|
323
|
+
spec = mzml_parser.next_spectrum
|
324
|
+
end
|
325
|
+
|
326
|
+
end
|
327
|
+
puts "Found #{num_matched} matching spectra"
|
328
|
+
puts "Total in spectrum files #{total_spectra}"
|
329
|
+
puts "Total psms #{titles_to_match.length}"
|
330
|
+
puts "Unmatched psms #{queries_with_spectra.length}"
|
331
|
+
|
332
|
+
|
333
|
+
|
334
|
+
end
|
335
|
+
|
336
|
+
# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
|
337
|
+
#
|
338
|
+
tool=Tool.new([:explicit_output,:over_write])
|
339
|
+
tool.option_parser.banner = "Convert a protXML file to a sqlite database.\n\nUsage: protxml_to_psql.rb [options] file1.protXML"
|
340
|
+
|
341
|
+
tool.add_value_option(:database,nil,['-d','--database path','A Fasta file where full protein sequences can be looked up'])
|
342
|
+
|
343
|
+
# require 'debugger';debugger
|
344
|
+
|
345
|
+
exit unless tool.check_options(true,[:explicit_output])
|
346
|
+
|
347
|
+
input_file=ARGV.shift
|
348
|
+
|
349
|
+
|
350
|
+
if File.exists? tool.explicit_output
|
351
|
+
throw "Cant overwrite existing db #{tool.explicit_output}" unless tool.over_write
|
352
|
+
File.delete(tool.explicit_output)
|
353
|
+
end
|
354
|
+
|
355
|
+
$fasta_lookup=nil
|
356
|
+
if tool.database
|
357
|
+
$fasta_lookup=prepare_fasta(tool.database,'prot')
|
358
|
+
end
|
359
|
+
|
360
|
+
$outputdb = SQLite3::Database.new tool.explicit_output
|
361
|
+
|
362
|
+
initialize_db
|
363
|
+
|
364
|
+
XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
|
365
|
+
|
366
|
+
protxml_parser=XML::Parser.file("#{input_file}")
|
367
|
+
|
368
|
+
$protxml_ns_prefix="xmlns:"
|
369
|
+
$protxml_ns="xmlns:http://regis-web.systemsbiology.net/protXML"
|
370
|
+
|
371
|
+
|
372
|
+
protxml_doc=protxml_parser.parse
|
373
|
+
if not protxml_doc.root.namespaces.default
|
374
|
+
$protxml_ns_prefix=""
|
375
|
+
$protxml_ns=nil
|
376
|
+
end
|
377
|
+
|
378
|
+
$protein_id=0
|
379
|
+
$peptide_id=0
|
380
|
+
|
381
|
+
headers_with_inputs=protxml_doc.find("//#{$protxml_ns_prefix}protein_summary_header[@source_files]",$protxml_ns)
|
382
|
+
|
383
|
+
matched_spectra=[]
|
384
|
+
|
385
|
+
headers_with_inputs.each do |header|
|
386
|
+
pepxml_files = header.attributes['source_files'].split(",")
|
387
|
+
pepxml_files.each do |pepxml_file|
|
388
|
+
matched_spectra.concat insert_psms_from_file(pepxml_file)
|
389
|
+
end
|
390
|
+
end
|
391
|
+
|
392
|
+
lookup_spectra_from_files(ARGV.collect { |file| file.chomp },matched_spectra)
|
393
|
+
|
394
|
+
protein_groups=protxml_doc.find("//#{$protxml_ns_prefix}protein_group", $protxml_ns)
|
395
|
+
|
396
|
+
protein_groups.each do |g|
|
397
|
+
insert_protein_group(g)
|
398
|
+
end
|
399
|
+
|
data/ext/decoymaker/decoymaker.c
CHANGED
@@ -20,24 +20,40 @@
|
|
20
20
|
|
21
21
|
#define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
|
22
22
|
#define NOT_AMINO_ACIDS "BJOUXZ*"
|
23
|
-
#define MAX_SEQUENCE_LENGTH
|
24
|
-
#define MAX_LINE_LENGTH
|
23
|
+
#define MAX_SEQUENCE_LENGTH 2000
|
24
|
+
#define MAX_LINE_LENGTH 200000 /* large enough to read in long header lines */
|
25
|
+
|
26
|
+
void RemoveSpaces(char* source)
|
27
|
+
{
|
28
|
+
char* i = source;
|
29
|
+
char* j = source;
|
30
|
+
while(*j != 0)
|
31
|
+
{
|
32
|
+
*i = *j++;
|
33
|
+
if(*i != ' ')
|
34
|
+
i++;
|
35
|
+
}
|
36
|
+
*i = 0;
|
37
|
+
}
|
25
38
|
|
26
39
|
|
27
40
|
static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
28
|
-
VALUE db_length_in,VALUE output_file_in,
|
29
|
-
|
30
|
-
|
41
|
+
VALUE db_length_in,VALUE output_file_in,VALUE prefix_string_in)
|
42
|
+
{
|
43
|
+
|
44
|
+
char *infile = StringValueCStr(input_file_in);
|
31
45
|
long sequences_to_generate = NUM2INT(db_length_in);
|
32
|
-
char *
|
46
|
+
char * outfile = StringValueCStr(output_file_in);
|
47
|
+
char *prefix_string = StringValueCStr(prefix_string_in);
|
33
48
|
|
34
49
|
char line[MAX_LINE_LENGTH];
|
35
|
-
char settings_line[60][70];
|
36
|
-
|
37
|
-
char prefix_string[255];
|
50
|
+
// char settings_line[60][70];
|
51
|
+
|
38
52
|
char *p,**index;
|
39
|
-
|
40
|
-
char one_sequence[MAX_SEQUENCE_LENGTH]
|
53
|
+
|
54
|
+
char one_sequence[MAX_SEQUENCE_LENGTH];
|
55
|
+
char random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)];
|
56
|
+
char random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
|
41
57
|
char *temp_sequence;
|
42
58
|
int a;
|
43
59
|
FILE *inp, *outp;
|
@@ -50,63 +66,57 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
50
66
|
double x;
|
51
67
|
|
52
68
|
/* scanning sequence database */
|
53
|
-
|
54
|
-
strcpy(infile,input_file);
|
55
69
|
|
56
70
|
if ((inp = fopen(infile, "r"))==NULL) {
|
57
71
|
printf("error opening sequence database %s\n",infile);return -1;
|
58
72
|
}
|
59
73
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
i=0;n=0;k=0;
|
74
|
+
long total_sequence_len=0;
|
75
|
+
n=0;
|
64
76
|
|
65
77
|
while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
fflush(stdout);
|
71
|
-
n++;
|
72
|
-
}
|
73
|
-
}
|
78
|
+
total_sequence_len+=strlen(line);
|
79
|
+
|
80
|
+
// printf("%ld\n",i);fflush(stdout);
|
81
|
+
if (line[0]=='>') { n++; }
|
74
82
|
}
|
75
|
-
|
83
|
+
|
76
84
|
n_sequences=n;
|
77
85
|
|
78
86
|
|
79
87
|
/* reading sequence database */
|
80
88
|
|
81
89
|
temp_sequence=(char*)calloc(sizeof(char),MAX_SEQUENCE_LENGTH);
|
82
|
-
|
90
|
+
|
91
|
+
char *sequence_block=(char*)malloc(sizeof(char)*(total_sequence_len+2));
|
92
|
+
|
83
93
|
index=(char**)malloc(sizeof(char*)*n_sequences);
|
84
|
-
index[0]=
|
94
|
+
index[0]=sequence_block; /* set first index pointer to beginning of first database sequence */
|
85
95
|
|
86
96
|
if ((inp = fopen(infile, "r"))==NULL) {
|
87
97
|
printf("error opening sequence database %s\n",infile);
|
88
98
|
return -1;
|
89
99
|
}
|
90
100
|
|
91
|
-
printf("done\nreading sequence database \n%s\n",infile);
|
92
|
-
fflush(stdout);
|
93
|
-
|
94
101
|
n=-1;
|
95
102
|
strcpy(temp_sequence,"\0");
|
96
103
|
|
97
104
|
while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
|
98
|
-
{
|
99
|
-
|
105
|
+
{
|
106
|
+
RemoveSpaces(line);
|
107
|
+
|
108
|
+
if (strcmp(line,"\n")==0) { // Skips blank lines
|
100
109
|
continue;
|
101
110
|
}
|
111
|
+
|
102
112
|
if (line[0]=='>') {
|
103
113
|
if (n>=0) {
|
104
|
-
|
105
|
-
printf(".");fflush(stdout);
|
106
|
-
}
|
114
|
+
|
107
115
|
strcpy(index[n],temp_sequence);
|
108
|
-
n++;
|
116
|
+
n++;
|
117
|
+
index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
|
109
118
|
strcpy(temp_sequence,"\0");
|
119
|
+
|
110
120
|
}
|
111
121
|
else
|
112
122
|
{
|
@@ -116,7 +126,9 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
116
126
|
}
|
117
127
|
else
|
118
128
|
{
|
119
|
-
if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH )
|
129
|
+
if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) {
|
130
|
+
continue;
|
131
|
+
}
|
120
132
|
strncat(temp_sequence,line,strlen(line)-1);
|
121
133
|
}
|
122
134
|
}
|
@@ -127,16 +139,18 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
127
139
|
|
128
140
|
n_sequences=n+1;
|
129
141
|
|
130
|
-
printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
|
142
|
+
// printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
|
143
|
+
|
144
|
+
// measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
|
145
|
+
|
131
146
|
|
132
|
-
measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
|
133
147
|
|
134
148
|
|
135
149
|
|
136
150
|
/* generating Markov probabilities */
|
137
151
|
|
138
|
-
printf("generating Markov probability matrix...");
|
139
|
-
fflush(stdout);
|
152
|
+
// printf("generating Markov probability matrix...");
|
153
|
+
// fflush(stdout);
|
140
154
|
|
141
155
|
srand(time(0)); /* replace with constant to re-generate identical random databases */
|
142
156
|
|
@@ -146,52 +160,53 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
146
160
|
}
|
147
161
|
}
|
148
162
|
for(j=0;j<=20;j++) {
|
149
|
-
measured_aa_freq[j]=0;
|
163
|
+
measured_aa_freq[j]=0;
|
164
|
+
generated_aa_freq[j]=0;
|
150
165
|
}
|
151
166
|
|
167
|
+
|
152
168
|
for(protein=0;protein<n_sequences;protein++)
|
153
169
|
{
|
154
|
-
if (
|
155
|
-
|
156
|
-
|
170
|
+
if (protein<(n_sequences-1))
|
171
|
+
{
|
172
|
+
long len_one_seq = (index[protein+1]-index[protein])/sizeof(char);
|
173
|
+
if ( len_one_seq > MAX_SEQUENCE_LENGTH ){
|
174
|
+
printf("Seq is longer than max len \n");fflush(stdout);
|
175
|
+
len_one_seq=MAX_SEQUENCE_LENGTH;
|
176
|
+
}
|
177
|
+
strncpy(one_sequence,index[protein],len_one_seq);
|
178
|
+
|
179
|
+
one_sequence[len_one_seq]='\0'; // NULL terminate the string
|
180
|
+
|
181
|
+
} else {
|
182
|
+
strcpy(one_sequence,index[protein]);
|
157
183
|
}
|
158
184
|
|
159
|
-
|
185
|
+
pl=strlen(one_sequence);
|
186
|
+
n=1;
|
187
|
+
one_index=0;
|
188
|
+
|
189
|
+
for(i=0;i<pl;i++)
|
160
190
|
{
|
161
|
-
|
162
|
-
one_sequence[(index[protein+1]-index[protein])/sizeof(char)]='\0';
|
163
|
-
}
|
164
|
-
else strcpy(one_sequence,index[protein]);
|
165
|
-
pl=strlen(one_sequence);
|
166
|
-
n=1;one_index=0;
|
167
|
-
|
168
|
-
for(i=0;i<pl;i++)
|
169
|
-
{
|
170
|
-
if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
|
171
|
-
{
|
172
|
-
if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
|
191
|
+
if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
|
173
192
|
{
|
174
|
-
|
193
|
+
if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
|
194
|
+
{
|
195
|
+
printf("Unknown amino acid %c",one_sequence[i]);
|
196
|
+
} else {
|
197
|
+
a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
|
198
|
+
MP[a][i]++;
|
199
|
+
measured_aa_freq[a]++;
|
200
|
+
}
|
175
201
|
} else {
|
176
|
-
a=20
|
177
|
-
MP[a][i]++;
|
202
|
+
a=floor(20*(float)rand()/RAND_MAX);
|
203
|
+
MP[a][i]++;
|
178
204
|
measured_aa_freq[a]++;
|
179
|
-
}
|
180
|
-
}
|
181
|
-
else {
|
182
|
-
a=floor(20*(float)rand()/RAND_MAX);
|
183
|
-
MP[a][i]++;
|
184
|
-
measured_aa_freq[a]++;
|
185
|
-
} // replace B, X, Z etc. with random amino acid to preserve size distribution
|
186
|
-
}
|
187
|
-
MP[20][pl]++;
|
188
|
-
measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
|
205
|
+
} // replace B, X, Z etc. with random amino acid to preserve size distribution
|
189
206
|
}
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
207
|
+
MP[20][pl]++;
|
208
|
+
measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
|
209
|
+
}
|
195
210
|
|
196
211
|
for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
|
197
212
|
row_sum[i]=0;
|
@@ -204,41 +219,38 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
204
219
|
}
|
205
220
|
|
206
221
|
|
207
|
-
/* generate random protein sequences through Markov chain */
|
208
|
-
|
209
|
-
strcpy(outfile,output_file);
|
210
222
|
|
211
|
-
|
212
|
-
printf("error opening output file %s\n",outfile);
|
213
|
-
return -1;
|
214
|
-
}
|
223
|
+
/* generate random protein sequences through Markov chain */
|
215
224
|
|
216
|
-
printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
|
217
225
|
|
218
|
-
|
226
|
+
if ((outp = fopen(outfile, "w"))==NULL) {
|
227
|
+
printf("error opening output file %s\n",outfile);
|
228
|
+
return -1;
|
229
|
+
}
|
219
230
|
|
220
|
-
|
221
|
-
|
222
|
-
if (!(protein%1000)) {
|
223
|
-
printf(".");fflush(stdout);
|
224
|
-
}
|
231
|
+
for(protein=0;protein<sequences_to_generate;protein++)
|
232
|
+
{
|
225
233
|
|
226
|
-
|
227
|
-
|
234
|
+
i=0; j=0;
|
235
|
+
while (1)
|
236
|
+
{
|
237
|
+
x=(double)row_sum[j]*((double)rand()/RAND_MAX);
|
238
|
+
partial_sum=MP[0][j]; i=1;
|
239
|
+
|
240
|
+
while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
|
241
|
+
|
242
|
+
if (j>=MAX_SEQUENCE_LENGTH) { i=21; }/* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
|
243
|
+
|
244
|
+
if (i<21)
|
228
245
|
{
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
else /* i==21, i.e. protein sequence terminated */
|
238
|
-
{
|
239
|
-
k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
|
240
|
-
for(l=0;l<j;l++)
|
241
|
-
{
|
246
|
+
random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
|
247
|
+
} else { /* i==21, i.e. protein sequence terminated */
|
248
|
+
k=0;
|
249
|
+
generated_aa_freq[20]++;
|
250
|
+
generated_pl_sum+=j;
|
251
|
+
|
252
|
+
for(l=0;l<j;l++)
|
253
|
+
{
|
242
254
|
random_sequence_output[k]=random_sequence[l]; k++;
|
243
255
|
if (!((k+1)%61))
|
244
256
|
{
|
@@ -256,19 +268,13 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
|
256
268
|
|
257
269
|
fclose(outp);
|
258
270
|
|
259
|
-
|
260
|
-
/* freeing some memory... */
|
261
|
-
|
262
|
-
free(index);
|
263
271
|
|
264
|
-
printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
|
272
|
+
// printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
|
265
273
|
|
266
274
|
k=0;l=0;
|
267
275
|
for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
|
268
|
-
// printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
|
269
|
-
// for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
|
270
276
|
|
271
|
-
printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
|
277
|
+
// printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
|
272
278
|
|
273
279
|
return 0;
|
274
280
|
|
data/lib/protk/galaxy_stager.rb
CHANGED
@@ -20,14 +20,13 @@ class GalaxyStager
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
def replace_references(in_file
|
24
|
-
options = { :base_only => false }.merge(options)
|
25
|
-
replacement = options[:base_only] ? @staged_base : @staged_path
|
23
|
+
def replace_references(in_file)
|
26
24
|
GalaxyStager.replace_references(in_file, @original_path, replacement)
|
27
25
|
end
|
28
26
|
|
29
|
-
def restore_references(in_file)
|
30
|
-
|
27
|
+
def restore_references(in_file, options = {})
|
28
|
+
path = options[:base_only] ? @staged_path.gsub(/#{@extension}/,"") : @staged_path
|
29
|
+
GalaxyStager.replace_references(in_file, path, @original_path)
|
31
30
|
end
|
32
31
|
|
33
32
|
def self.replace_references(in_file, from_path, to_path)
|
data/lib/protk/galaxy_util.rb
CHANGED
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
|
3
|
+
include LibXML
|
4
|
+
|
5
|
+
class MzMLParser < Object
|
6
|
+
|
7
|
+
|
8
|
+
def initialize(path)
|
9
|
+
@namespace=
|
10
|
+
@mzml_ns_prefix="xmlns:"
|
11
|
+
@mzml_ns="xmlns:http://psi.hupo.org/ms/mzml"
|
12
|
+
|
13
|
+
doc=XML::Document.file(path)
|
14
|
+
@file_reader=XML::Reader.document(doc)
|
15
|
+
end
|
16
|
+
|
17
|
+
def next_spectrum()
|
18
|
+
|
19
|
+
until @file_reader.name=="spectrum"
|
20
|
+
if !@file_reader.read()
|
21
|
+
return nil
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
this_spect=spectrum_as_hash(@file_reader.expand)
|
26
|
+
|
27
|
+
@file_reader.next_sibling
|
28
|
+
|
29
|
+
return this_spect
|
30
|
+
end
|
31
|
+
|
32
|
+
def spectrum_as_hash(spectrum)
|
33
|
+
index=spectrum.attributes['index']
|
34
|
+
sid = spectrum.attributes['id']
|
35
|
+
precursor_mz_param = spectrum.find(".//#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000744\"]",@mzml_ns)[0]
|
36
|
+
mslevel_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000511\"]",@mzml_ns)[0]
|
37
|
+
|
38
|
+
title_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000796\"]",@mzml_ns)[0]
|
39
|
+
|
40
|
+
# prec_mz = spectrum.find(".//#{@mz}")
|
41
|
+
|
42
|
+
precursor_mz_mz = precursor_mz_param.attributes['value'] if precursor_mz_param
|
43
|
+
mslevel = mslevel_param.attributes['value'] if mslevel_param
|
44
|
+
spectrum_title = title_param['value'] if title_param
|
45
|
+
|
46
|
+
data_arrays = spectrum.find("./#{@mzml_ns_prefix}binaryDataArrayList/#{@mzml_ns_prefix}binaryDataArray",@mzml_ns)
|
47
|
+
|
48
|
+
data={}
|
49
|
+
data_arrays.each do |arr|
|
50
|
+
the_data = arr.find("./#{@mzml_ns_prefix}binary",@mzml_ns)[0].content
|
51
|
+
mzaccession = arr.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000514\"]",@mzml_ns)
|
52
|
+
if ( mzaccession.length==1 )
|
53
|
+
data[:mz] = the_data
|
54
|
+
else
|
55
|
+
data[:intensity] = the_data
|
56
|
+
end
|
57
|
+
end
|
58
|
+
data[:title]=spectrum_title
|
59
|
+
data[:mzlevel]=mslevel
|
60
|
+
data[:index]=index
|
61
|
+
data[:precursormz]=precursor_mz_mz
|
62
|
+
data[:id]=sid
|
63
|
+
|
64
|
+
data
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
data/lib/protk/tool.rb
CHANGED
@@ -53,7 +53,7 @@ class Tool
|
|
53
53
|
end
|
54
54
|
|
55
55
|
|
56
|
-
def add_value_option(symbol,default_value,opts)
|
56
|
+
def add_value_option(symbol,default_value,opts)
|
57
57
|
@options[symbol]=default_value
|
58
58
|
@option_parser.on(*opts) do |val|
|
59
59
|
@options[symbol]=val
|
@@ -108,6 +108,8 @@ class Tool
|
|
108
108
|
add_value_option(:threads,1,['-n','--threads num','Number of processing threads to use. Set to 0 to autodetect an appropriate value'])
|
109
109
|
end
|
110
110
|
|
111
|
+
|
112
|
+
|
111
113
|
end
|
112
114
|
|
113
115
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: protk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ira Cooke
|
@@ -152,6 +152,20 @@ dependencies:
|
|
152
152
|
- - ~>
|
153
153
|
- !ruby/object:Gem::Version
|
154
154
|
version: '0'
|
155
|
+
- !ruby/object:Gem::Dependency
|
156
|
+
name: sqlite3
|
157
|
+
requirement: !ruby/object:Gem::Requirement
|
158
|
+
requirements:
|
159
|
+
- - ~>
|
160
|
+
- !ruby/object:Gem::Version
|
161
|
+
version: '0'
|
162
|
+
type: :runtime
|
163
|
+
prerelease: false
|
164
|
+
version_requirements: !ruby/object:Gem::Requirement
|
165
|
+
requirements:
|
166
|
+
- - ~>
|
167
|
+
- !ruby/object:Gem::Version
|
168
|
+
version: '0'
|
155
169
|
description: A bunch of tools for proteomics
|
156
170
|
email: iracooke@gmail.com
|
157
171
|
executables:
|
@@ -195,6 +209,7 @@ files:
|
|
195
209
|
- bin/protein_prophet.rb
|
196
210
|
- bin/protk_setup.rb
|
197
211
|
- bin/protxml_to_gff.rb
|
212
|
+
- bin/protxml_to_psql.rb
|
198
213
|
- bin/protxml_to_table.rb
|
199
214
|
- bin/repair_run_summary.rb
|
200
215
|
- bin/sixframe.rb
|
@@ -237,6 +252,7 @@ files:
|
|
237
252
|
- lib/protk/manage_db_rakefile.rake
|
238
253
|
- lib/protk/manage_db_tool.rb
|
239
254
|
- lib/protk/mascot_util.rb
|
255
|
+
- lib/protk/mzml_parser.rb
|
240
256
|
- lib/protk/omssa_util.rb
|
241
257
|
- lib/protk/openms_defaults.rb
|
242
258
|
- lib/protk/pepxml.rb
|
@@ -266,9 +282,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
266
282
|
version: '0'
|
267
283
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
268
284
|
requirements:
|
269
|
-
- - '
|
285
|
+
- - '>='
|
270
286
|
- !ruby/object:Gem::Version
|
271
|
-
version:
|
287
|
+
version: '0'
|
272
288
|
requirements: []
|
273
289
|
rubyforge_project:
|
274
290
|
rubygems_version: 2.2.1
|