mspire 0.4.9 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id/bioworks.rb
DELETED
@@ -1,497 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require 'sample_enzyme'
|
4
|
-
require 'xmlparser'
|
5
|
-
require 'spec_id'
|
6
|
-
require 'zlib'
|
7
|
-
require 'hash_by'
|
8
|
-
require 'arrayclass'
|
9
|
-
require 'fasta'
|
10
|
-
|
11
|
-
## have to pre-declare some guys
|
12
|
-
module ProteinReferenceable; end
|
13
|
-
module SpecID; end
|
14
|
-
module SpecID::Prot; end
|
15
|
-
module SpecID::Pep; end
|
16
|
-
module SpecIDXML; end
|
17
|
-
|
18
|
-
class Bioworks
|
19
|
-
include SpecID
|
20
|
-
|
21
|
-
# Regular expressions
|
22
|
-
@@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
|
23
|
-
@@modifications_re = /<modifications>(.*)<\/modifications>/o
|
24
|
-
@@protein_re = /<protein>/o
|
25
|
-
@@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
|
26
|
-
@@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
|
27
|
-
|
28
|
-
|
29
|
-
attr_accessor :peps, :prots, :version, :global_filename, :origfilename, :origfilepath
|
30
|
-
# a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
|
31
|
-
attr_accessor :modifications
|
32
|
-
|
33
|
-
# Always returns false.
# NOTE(review): presumably a flag consumed by SpecID code to say whether a
# higher score means a better hit -- confirm against the SpecID module.
def hi_prob_best
  false
end
|
34
|
-
|
35
|
-
# Placeholder for SQT export; the body is empty, so it does nothing and
# returns nil.  The original note describes the intended output as one file
# per run (filename1.sqt, filename2.sqt).
# @TODO: sqt file output
def to_sqt(params_file)
  # planned step: hash peps by filename
  # planned step: hash prots by peptide
end
|
41
|
-
|
42
|
-
# Returns the number of opening <protein> tags found in +file+.
#
# If the number of opening and closing tags differ, a two-line WARNING is
# printed to stdout and the opening-tag count is still returned (no
# exception is raised).
#
# file:: path of the bioworks xml file; the whole file is read into memory.
def num_prots(file)
  # File.read instead of IO.read: IO.read treats a name starting with "|"
  # as a command to run.
  contents = File.read(file)
  begin_tags = contents.scan('<protein>').size
  end_tags = contents.scan('</protein>').size
  if begin_tags != end_tags
    puts "WARNING: #{file} doesn't have matching closing tags"
    puts "for the <protein> tag. Returning # of beginning tags."
  end
  begin_tags
end
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
# Writes the bioworks browser excel format (tab delimited text) to +file+.
# Useful when there are more than ~65,000 lines: export bioworks.xml and
# then convert it to the excel format with this method.
#
# Currently the only things not precisely identical to the real export are:
# 1. The peptide hit counts (the first number, the total number of
#    peptides, is accurate).
# 2. The precise ordering of peptides within each protein.  With output from
#    multiple runs, peptides from runs with exactly the same scan numbers
#    are not guaranteed to be in the same order.
def to_excel(file)
  update_peptide_hit_counts
  rows = [
    ['', 'Reference', '', '', '', 'Score', 'Coverage', 'MW', 'Accession', 'Peptide (Hits)', '', ' '],
    ['', '"File, Scan(s)"', 'Peptide', 'MH+', 'z', 'XC', 'DeltaCn', 'Sp', 'RSp', 'Ions', 'Count', ' ']
  ]
  @prots.each_with_index do |prot, index|
    score, coverage, weight, accession = prot.get(:consensus_score, :coverage, :weight, :accession)
    # NOTE(review): this compares against the String "0.0"; if coverage is
    # held as a Float the blanking never triggers -- confirm intended type.
    coverage = "" if coverage == "0.0"
    reference = '"' + prot.reference.split('|')[-1] + '"'
    hit_counts = prot.peptide_hit_counts
    hits = hit_counts[0].to_s + ' (' + hit_counts[1..-1].join(" ") + ')'
    rows << [index + 1, reference, '', '', '', score, coverage, weight, accession, hits, "", " "]

    # peptide rows follow their protein row, ordered by scan range
    ordered = prot.peps.sort_by { |pep| [pep.first_scan.to_i, pep.last_scan.to_i] }
    ordered.each do |pep|
      fields = pep.get(:sequence, :mass, :charge, :xcorr, :deltacn, :sp, :rsp, :ions)
      count = pep.count
      # NOTE(review): String comparison again; confirm the type of count.
      count = "" if count == '0'
      rows << ['', '"' + pep.file + '"'] + fields + [count, ' ']
    end
  end
  File.open(file, "w") do |out|
    rows.each { |row| out.print(row.join("\t"), "\n") }
  end
end
|
106
|
-
|
107
|
-
# For output to excel format (or other uses): refreshes the
# peptide_hit_counts array of every protein in @prots.  Slot 0 receives the
# total number of peptides of the protein; the following slots (rank 1, 2,
# 3, 4, 5) are counted per +file+ from the peptides ordered by xcorr.
#
# All slots are zeroed first so that calling this method repeatedly (it runs
# on every to_excel call) does not keep adding to earlier rank counts.
#
# @TODO: Can't get this to check out yet.  Perhaps they use normalized
# Xcorr?
# NOTE(review): peptides are ordered by ascending xcorr, so "rank 1" is the
# lowest xcorr in a file -- confirm whether descending order was intended.
def update_peptide_hit_counts
  @prots.each do |prot|
    counts = prot.peptide_hit_counts
    counts.fill(0)                 # reset in place; keeps array length/identity
    counts[0] = prot.peps.size
  end
  peps.hash_by(:file).sort.each do |_file, file_peps|
    ranked = file_peps.sort_by { |pep| pep.xcorr.to_f }
    # uniqueness is on sequence + charge only (not on protein)
    _uniq, prot_groups = _uniq_peps_by_sequence_charge(ranked)
    prot_groups.each_with_index do |prot_group, i|
      prot_group.each do |prot|
        # ranks beyond the end of the counts array are ignored
        prot.peptide_hit_counts[i + 1] += 1 if prot.peptide_hit_counts[i + 1]
      end
    end
  end
end
|
128
|
-
|
129
|
-
# Returns (peptides, proteins): peptides is the list of peps unique by
# sequence and charge (the first occurrence of each, in input order) and
# proteins is a parallel array holding, for each unique peptide, the
# concatenated prots of every input peptide with that sequence and charge.
#
# Each returned pep still carries only its own original prots; the combined
# list lives in the parallel array, which is a fresh array, so the input
# peptides' prots are never modified.
#
# Assumes that each peptide points to all its proteins in pep.prots.
# Runs in a single pass using a lookup keyed on [sequence, charge].
def _uniq_peps_by_sequence_charge(peps)
  uniq_peps = []
  prot_groups = []
  index_of = {}   # [sequence, charge] => position in uniq_peps/prot_groups
  peps.each do |pep|
    key = [pep.sequence, pep.charge]
    if (idx = index_of[key])
      prot_groups[idx].push(*pep.prots)
    else
      index_of[key] = uniq_peps.size
      uniq_peps << pep
      prot_groups << pep.prots.dup   # copy: do not grow pep.prots itself
    end
  end
  return uniq_peps, prot_groups
end
|
153
|
-
|
154
|
-
# file:: optional path to a bioworks xml file.  When given it is remembered
#        in @filename and parsed right away with parse_xml (the regex based
#        parser; parse_xml_by_xmlparser is the alternative).
def initialize(file=nil)
  @peps = nil
  return unless file
  @filename = file
  parse_xml(file)
end
|
162
|
-
|
163
|
-
# Parses +file+ with Bioworks::XMLParser and stores the proteins it
# collected in @prots.  (@peps is not set by this path.)
def parse_xml_by_xmlparser(file)
  parser = Bioworks::XMLParser.new
  File.open(file) { |fh| parser.parse(fh) }
  @prots = parser.prots
end
|
173
|
-
|
174
|
-
# Line-oriented regex parser for a bioworks xml export.
#
# This is highly specific to Bioworks 3.2 xml export.  In other words,
# unless the newlines, etc. are duplicated, this parser will fail!  Not
# robust, but it is faster than xmlparser (which is based on the speedy
# expat).
#
# file:: path to the export; names ending in ".gz" are read through
#        Zlib::GzipReader.
#
# Sets @origfilename, @origfilepath, @global_filename (origfilename without
# its extension), @version, @modifications, @prots and @peps.  Returns nil.
# The file handle is closed even when parsing raises.
def parse_xml(file)
  fh = file =~ /\.gz$/ ? Zlib::GzipReader.open(file) : File.open(file)
  begin
    @origfilename = get_regex_val(fh, @@origfilename_re)
    @origfilepath = get_regex_val(fh, @@origfilepath_re)
    if @origfilename
      # strip only the trailing extension, not every occurrence of its text
      ext = File.extname(@origfilename)
      @global_filename = @origfilename.sub(/#{Regexp.escape(ext)}\z/, "")
    end
    @version = get_regex_val(fh, @@bioworksinfo_re)
    @modifications = get_regex_val(fh, @@modifications_re)
    @prots, @peps = get_prots_from_xml_stream(fh)
  ensure
    fh.close
  end
  nil
end
|
195
|
-
|
196
|
-
## Reads +fh+ to the end and returns [proteins, peptides].
## Every line matching @@protein_re starts a new Bioworks::Prot, which then
## consumes its own lines from +fh+ (set_from_xml_stream).  The peptides are
## the values of the hash shared between all proteins, i.e. each unique
## peptide hit appears once.
def get_prots_from_xml_stream(fh)
  pephits_by_key = {}
  proteins = []
  while (line = fh.gets)
    next unless line =~ @@protein_re
    protein = Bioworks::Prot.new
    protein.bioworks = self
    protein.set_from_xml_stream(fh, pephits_by_key)
    proteins << protein
  end
  [proteins, pephits_by_key.values]
end
|
210
|
-
|
211
|
-
# Reads lines from +fh+ until one matches +regex+ and returns a copy of its
# first capture group, leaving +fh+ positioned just after that line.
#
# The search gives up at the first line matching @@protein_re or at end of
# file.  In both cases +fh+ is rewound to the beginning and nil is returned.
#
# Only gets and rewind are used on +fh+ (no seek), so gzip readers work too.
def get_regex_val(fh, regex)
  value = nil
  while (line = fh.gets)
    if line =~ regex
      value = $1.dup
      break
    end
    break if line =~ @@protein_re
  end
  fh.rewind unless value
  value
end
|
229
|
-
|
230
|
-
# Meant to output sequest xml files (pepxml) for the trans-proteomics
# pipeline.  At present it only returns whatever xml_version returns.
def to_pepxml
  xml_version
end
|
235
|
-
|
236
|
-
end
|
237
|
-
|
238
|
-
# Event-driven parsing via XMLParser (wrapper around Expat).
# According to the original note it is actually slower (about 25% slower)
# than the regular expression parsing.
#
# Collects Bioworks::Prot objects in #prots.  Text of every element other
# than <protein>/<peptide> is gathered in a hash that is handed to the
# current protein (when its first peptide starts) or peptide (when it ends).
class Bioworks::XMLParser < XMLParser
  @@at = '@'
  attr_accessor :prots

  def initialize
    @prots = []
    @current_obj = nil    # the Prot or Pep being filled
    @current_hash = {}    # element name => text for @current_obj
    @current_name = nil
    @current_data = nil   # most recent character data seen
  end

  def startElement(name, attrs)
    if name == "peptide"
      owner =
        if @current_obj.class == Bioworks::Prot
          # first peptide of a protein: the protein's own fields are complete
          @current_obj.set_from_xml_hash_xmlparser(@current_hash)
          @current_obj
        else
          # previous object was a peptide; reuse its protein
          @current_obj.prot
        end
      pep = Bioworks::Pep.new
      pep.prot = owner
      owner.peps << pep
      @current_obj = pep
      @current_hash = {}
    elsif name == "protein"
      @current_obj = Bioworks::Prot.new
      @current_hash = {}
      @prots << @current_obj
    else
      @current_name = name
    end
  end

  def endElement(name)
    if name == "peptide"
      @current_obj.set_from_hash_given_text(@current_hash)
    elsif name != "protein"
      @current_hash[name] = @current_data
    end
  end

  # NOTE(review): keeps only the latest chunk; if the parser delivers one
  # text node in several pieces the earlier pieces are lost -- confirm.
  def character(data)
    @current_data = data
  end
end
|
290
|
-
|
291
|
-
module Bioworks::XML
  # Matches one "<name>value</name>" line of the bioworks xml format:
  # capture 1 is the tag name, capture 2 the text between the tags.
  @@att_re = /<([\w]+)>(.*)<\/[\w]+>/o
end
|
295
|
-
|
296
|
-
class Bioworks::Prot
|
297
|
-
include ProteinReferenceable
|
298
|
-
include SpecID::Prot
|
299
|
-
include Bioworks::XML
|
300
|
-
|
301
|
-
@@end_prot_re = /<\/protein>/o
|
302
|
-
@@pep_re = /<peptide>/o
|
303
|
-
@@atts = %w(reference protein_probability consensus_score sf unified_score coverage pi weight accession peps)
|
304
|
-
attr_accessor :reference, :protein_probability, :consensus_score, :sf, :unified_score, :coverage, :pi, :weight, :accession, :peps, :bioworks, :peptide_hit_counts
|
305
|
-
|
306
|
-
# Starts with no peptides and six zeroed peptide hit count slots.
def initialize
  @peptide_hit_counts = Array.new(6, 0)
  @peps = []
end
|
310
|
-
|
311
|
-
|
312
|
-
# Returns an array with the value of each attribute named in +args+
# (symbols), in the order given.  Each name is dispatched with send.
def get(*args)
  args.map { |name| send(name) }
end
|
318
|
-
|
319
|
-
# Fills this protein from +fh+, which must be positioned just after the
# opening <protein> line, reading up to and including </protein>.
#
# fh::               line-oriented stream (responds to gets)
# uniq_pephit_hash:: hash shared by all proteins of a parse, keyed on
#                    [base_name, first_scan, charge, sequence]; a peptide
#                    hit already present is reused instead of the freshly
#                    parsed one, and this protein is appended to its prots.
#
# Returns self.  Raises RuntimeError on a line that is neither an attribute
# line, a <peptide> line nor </protein> (rather than terminating the whole
# process).
def set_from_xml_stream(fh, uniq_pephit_hash)
  hash = {}
  @peps = []
  while (line = fh.gets)
    if line =~ @@att_re
      hash[$1] = $2
    elsif line =~ @@pep_re
      ## Could do a look ahead to grab the file and sequence to check
      ## uniqueness to increase speed here.
      pep = Bioworks::Pep.new.set_from_xml_stream(fh)
      # normal search results files have a global filename while
      # multi-consensus do not; index 12 is the peptide's base_name
      pep[12] ||= bioworks.global_filename

      ## figure out uniqueness
      ky = [pep.base_name, pep.first_scan, pep.charge, pep.sequence]
      if uniq_pephit_hash.key?(ky)
        pep = uniq_pephit_hash[ky]
      else
        ## first time this hit is seen
        pep.prots = []
        uniq_pephit_hash[ky] = pep
      end
      pep.prots << self
      @peps << pep
    elsif line =~ @@end_prot_re
      set_from_xml_hash(hash)
      break
    else
      raise "Bad parsing on: #{line}"
    end
  end
  self
end
|
356
|
-
|
357
|
-
# Entry point used by the XMLParser based path: drops the wrapper element
# keys ("sequestresults", "bioworksinfo") from +hash+ and hands the rest to
# set_from_xml_hash.
#
# The "Sf" and "pI" keys are passed through unchanged because
# set_from_xml_hash reads exactly those keys; renaming them here made both
# values always come out as 0.0.
def set_from_xml_hash_xmlparser(hash)
  hash.delete("sequestresults")
  hash.delete("bioworksinfo")
  set_from_xml_hash(hash)
end
|
364
|
-
|
365
|
-
# Sets the protein attributes from a hash of element name => text.
# reference and accession are stored as given; the numeric fields are
# converted with to_f (a missing key therefore yields 0.0).  Note the key
# spelling: @sf is read from "Sf" and @pi from "pI".
def set_from_xml_hash(hash)
  @protein_probability, @consensus_score, @sf, @unified_score, @coverage, @pi, @weight =
    %w(protein_probability consensus_score Sf unified_score coverage pI weight).map { |key| hash[key].to_f }
  @reference = hash["reference"]
  @accession = hash["accession"]
end
|
378
|
-
end
|
379
|
-
|
380
|
-
Bioworks::Pep = Arrayclass.new( %w(sequence mass deltamass charge xcorr deltacn sp rsp ions count tic prots base_name first_scan last_scan peptide_probability file _num_prots _first_prot aaseq) )
|
381
|
-
# 0=sequence 1=mass 2=deltamass 3=charge 4=xcorr 5=deltacn 6=sp 7=rsp 8=ions 9=count 10=tic 11=prots 12=base_name 13=first_scan 14=last_scan 15=peptide_probability 16=file 17=_num_prots 18=_first_prot 19=aaseq
|
382
|
-
|
383
|
-
class Bioworks::Pep
|
384
|
-
include SpecID::Pep
|
385
|
-
include Bioworks::XML
|
386
|
-
include SpecIDXML
|
387
|
-
|
388
|
-
@@file_split_first_re = /, /o
|
389
|
-
@@file_split_second_re = / - /o
|
390
|
-
#@@att_re = /<(.*)>(.*)<\/(.*)>/
|
391
|
-
@@end_pep_re = /<\/peptide>/o
|
392
|
-
@@file_one_scan_re = /(.*), (\d+)/o
|
393
|
-
@@file_mult_scan_re = /(.*), (\d+) - (\d+)/o
|
394
|
-
## NOTE! the mass is really the theoretical MH+!!!!
|
395
|
-
## NOTE! ALL values stored as strings, except peptide_probability!
|
396
|
-
|
397
|
-
#ions is a string 'x/y'
|
398
|
-
|
399
|
-
## other accessors:
|
400
|
-
# Alias-style reader for slot 15 (peptide_probability).
def probability
  self[15]
end
|
401
|
-
# Reader for slot 1 (mass, which holds the theoretical MH+).
def mh
  self[1]
end
|
402
|
-
|
403
|
-
# Parts per million computed as 1.0e6 * |deltamass| / mass (slots 2 and 1).
# This is not a true ppm since it should be divided by the actual mh instead
# of the theoretical (but it is as close as we can get for this object).
def ppm
  ratio = self[2].abs / self[1]
  1.0e6 * ratio
end
|
409
|
-
|
410
|
-
# Returns an array with the value of each attribute named in +args+
# (symbols), in the order given.  Each name is dispatched with send.
def get(*args)
  args.map { |name| send(name) }
end
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
#def peptide_probability=(prob)
|
421
|
-
# @peptide_probability = prob.to_f
|
422
|
-
#end
|
423
|
-
|
424
|
-
# Splits a "file" string into its parts.  Accepts one of two forms:
#   1. file, first_scan[ - last_scan]
#   2. scan[ - last_scan]
# Returns [base_name, first_scan, last_scan].  base_name is set for form 1
# and nil for form 2; last_scan equals first_scan when no range is given.
def self.extract_file_info(arg)
  base_name, scans = arg.split(@@file_split_first_re)
  # form 2: the only piece is the scan part
  base_name, scans = nil, base_name unless scans
  first_scan, last_scan = scans.split(@@file_split_second_re)
  last_scan ||= first_scan
  [base_name, first_scan, last_scan]
end
|
445
|
-
|
446
|
-
# $VERBOSE is switched off around the definition and restored afterwards.
# NOTE(review): presumably to silence a "method redefined" warning for the
# generated file= writer -- confirm.
saved_verbose = $VERBOSE
$VERBOSE = nil
# Stores +arg+ in slot 16 (file) and fills slots 12..14 (base_name,
# first_scan, last_scan) from extract_file_info(arg).
def file=(arg)
  self[16] = arg
  self[12, 3] = self.class.extract_file_info(arg)
end
$VERBOSE = saved_verbose
|
455
|
-
|
456
|
-
undef_method :inspect
# One-line summary of the peptide; proteins are shown as a count only.
def inspect
  parts = [
    "sequence: #{sequence}",
    "mass: #{mass}",
    "deltamass: #{deltamass}",
    "charge: #{charge}",
    "xcorr: #{xcorr}",
    "deltacn: #{deltacn}",
    "prots(count):#{prots.size}",
    "base_name: #{base_name}",
    "first_scan: #{first_scan}",
    "last_scan: #{last_scan}",
    "file: #{file}",
    "peptide_probability: #{peptide_probability}",
    "aaseq:#{aaseq}"
  ]
  "<Bioworks::Pep " + parts.join(", ") + ">"
end
|
462
|
-
|
463
|
-
# Fills the peptide from a hash of element name => text.
# Slots 0..10 are set in one go (sequence and ions kept as given, the rest
# converted with to_f / to_i), then file (which also sets base_name and the
# scans), peptide_probability (slot 15, to_f) and aaseq (slot 19, derived
# from the sequence by SpecID::Pep.sequence_to_aaseq).
def set_from_hash_given_text(hash)
  self[0, 11] = [
    hash["sequence"],
    hash["mass"].to_f,
    hash["deltamass"].to_f,
    hash["charge"].to_i,
    hash["xcorr"].to_f,
    hash["deltacn"].to_f,
    hash["sp"].to_f,
    hash["rsp"].to_i,
    hash["ions"],
    hash["count"].to_i,
    hash["tic"].to_i
  ]
  self.file = hash["file"]
  self[15] = hash["peptide_probability"].to_f
  self[19] = SpecID::Pep.sequence_to_aaseq(self[0])
end
|
470
|
-
|
471
|
-
# Fills this peptide from +fh+, which must be positioned just after the
# opening <peptide> line, reading up to and including </peptide>.
# Attribute lines are collected into a hash that is applied with
# set_from_hash_given_text once the closing tag is reached.
#
# Returns self.  Raises RuntimeError on a line that is neither an attribute
# line nor </peptide> (rather than terminating the whole process).
def set_from_xml_stream(fh)
  hash = {}
  while (line = fh.gets)
    if line =~ @@att_re
      hash[$1] = $2
    elsif line =~ @@end_pep_re
      set_from_hash_given_text(hash)
      break
    else
      raise "Bad parsing on: #{line}"
    end
  end
  self
end
|
491
|
-
|
492
|
-
end
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
data/lib/spec_id/digestor.rb
DELETED
@@ -1,138 +0,0 @@
|
|
1
|
-
|
2
|
-
require 'spec_id/sequest/pepxml'
|
3
|
-
require 'spec_id/mass'
|
4
|
-
|
5
|
-
# A digestor must be able to respond to these methods:
|
6
|
-
class Digestor
|
7
|
-
|
8
|
-
# min_mh_mass = min molecular mass of peptide (M+H)+
|
9
|
-
attr_accessor :min_mh_mass
|
10
|
-
# max_mh_mass = max molecular mass of peptide (M+H)+
|
11
|
-
attr_accessor :max_mh_mass
|
12
|
-
# the number of allowable missed cleavages
|
13
|
-
attr_accessor :missed_cleavages
|
14
|
-
# sample_enzyme = SampleEnzyme object
|
15
|
-
attr_accessor :sample_enzyme
|
16
|
-
# hash of masses to use (matching keys of Mass::AVG or Mass::MONO)
|
17
|
-
# In addition, the following keys (as symbols) are recognized.
|
18
|
-
# add_C_term_protein
|
19
|
-
# add_C_term_peptide
|
20
|
-
# add_N_term_protein
|
21
|
-
# add_N_term_peptide
|
22
|
-
attr_accessor :mass_hash
|
23
|
-
|
24
|
-
# Convenience wrapper: builds a digestor configured from +params_obj+ and
# returns the list of peptide objects produced by digesting the proteins of
# +fasta_obj+ (the values of create_peptide_hash).  Uses the sequest params;
# variable mods are not supported yet.
def self.digest(fasta_obj, params_obj)
  digestor = new
  digestor.set_from_params(params_obj)
  digestor.create_peptide_hash(fasta_obj).values
end
|
31
|
-
|
32
|
-
# Nothing to set up; attributes are filled in later (see set_from_params).
def initialize; end
|
34
|
-
|
35
|
-
# Takes a parameters object and fills in the necessary values
# (@sample_enzyme, @missed_cleavages, @min_mh_mass, @max_mh_mass and
# @mass_hash).
#
# params_obj::            must be a Sequest::Params, else ArgumentError
# include_variable_mods:: must be false; true raises NotImplementedError
def set_from_params(params_obj, include_variable_mods=false)
  raise NotImplementedError, "no variable mods yet" if include_variable_mods
  unless params_obj.is_a? Sequest::Params
    raise ArgumentError, "Don't recognize params object of type: #{params_obj.class}"
  end

  @sample_enzyme = params_obj.sample_enzyme
  @missed_cleavages = params_obj.max_num_internal_cleavage_sites.to_i
  @min_mh_mass, @max_mh_mass = params_obj.digest_mass_range.split(' ').map { |v| v.to_f }
  static_mods, static_terminal_mods = Sequest::PepXML::Modifications.new.create_static_mods(params_obj)
  # '0' => not monoisotopic, '1' => monoisotopic, anything else => nil
  monoisotopic_parents =
    case params_obj.mass_type_parent
    when '0' then false
    when '1' then true
    end
  @mass_hash = Mass.add_static_masses(monoisotopic_parents, static_mods, static_terminal_mods)
end
|
53
|
-
|
54
|
-
# aka 'digestion'
# Digests every protein of +fasta_obj+ with @sample_enzyme, keeps the
# peptides that pass limit_sizes, and returns a hash of SpecID::GenericPep
# objects (with 'aaseq' and 'prots') keyed by amino acid sequence.  Each
# prot is the fasta object's protein; a peptide found in several proteins
# lists all of them.
def create_peptide_hash(fasta_obj)
  digested = fasta_obj.map { |prot| @sample_enzyme.digest(prot.aaseq, @missed_cleavages) }
  prot_aaseqs = fasta_obj.map { |prot| prot.aaseq }
  passing = limit_sizes(prot_aaseqs, digested, @min_mh_mass, @max_mh_mass, @mass_hash)

  peps_by_aaseq = {}
  fasta_obj.each_with_index do |prot, i|
    passing[i].each do |aaseq|
      unless peps_by_aaseq.key?(aaseq)
        pep = SpecID::GenericPep.new
        pep.prots = []
        pep.aaseq = aaseq
        peps_by_aaseq[aaseq] = pep
      end
      peps_by_aaseq[aaseq].prots << prot
    end
  end
  peps_by_aaseq
end
|
92
|
-
|
93
|
-
# Keeps only the peptides whose mass lies within [min_mh, max_mh]
# (both limits inclusive and in terms of the M+H(+)).
#
# h_plus:
#   On this website:
#   http://db.systemsbiology.net:8080/proteomicsToolkit/FragIonServlet.html
#   they use the mass of 'H' not 'H+' to find the (M+H)+ weight.
#   mass_hash requires the key :h_plus or :h depending on this option.
#
# The mass added to every peptide is mass_hash[:h] (or :h_plus) plus the
# :add_N_term_peptide / :add_C_term_peptide entries when present.
# The keys :add_C_term_protein / :add_N_term_protein are not supported and
# raise NotImplementedError (prot_aaseqs would be needed for them).
#
# prot_aaseqs is parallel to pep_aaseqs_ar, where each entry is the group of
# peptides belonging to one protein aaseq.  Returns another parallel array
# holding the passing peptides per protein.
def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
  if mass_hash.key?(:add_C_term_protein) || mass_hash.key?(:add_N_term_protein)
    raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
  end

  extra_add = mass_hash[h_plus ? :h_plus : :h]
  extra_add += mass_hash[:add_N_term_peptide] if mass_hash.key?(:add_N_term_peptide)
  extra_add += mass_hash[:add_C_term_peptide] if mass_hash.key?(:add_C_term_peptide)
  calculator = Mass::Calculator.new(mass_hash, extra_add)

  masses_per_group = pep_aaseqs_ar.map { |pep_aaseqs| calculator.masses(pep_aaseqs) }
  pep_aaseqs_ar.zip(masses_per_group).map do |aaseqs, masses|
    kept = aaseqs.zip(masses).select { |_aaseq, mh_plus| mh_plus >= min_mh && mh_plus <= max_mh }
    kept.map { |aaseq, _mh_plus| aaseq }
  end
end
|
137
|
-
|
138
|
-
end
|