mspire 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. data/README.rdoc +24 -0
  2. data/Rakefile +51 -0
  3. data/VERSION +1 -0
  4. data/lib/cv/description.rb +18 -0
  5. data/lib/cv/param.rb +33 -0
  6. data/lib/cv.rb +3 -0
  7. data/lib/io/bookmark.rb +13 -0
  8. data/lib/merge.rb +7 -0
  9. data/lib/ms/cvlist.rb +76 -0
  10. data/lib/ms/digester.rb +245 -0
  11. data/lib/ms/fasta.rb +86 -0
  12. data/lib/ms/ident/peptide/db.rb +243 -0
  13. data/lib/ms/ident/peptide.rb +72 -0
  14. data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
  15. data/lib/ms/ident/peptide_hit.rb +26 -0
  16. data/lib/ms/ident/pepxml/modifications.rb +83 -0
  17. data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
  18. data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
  19. data/lib/ms/ident/pepxml/parameters.rb +14 -0
  20. data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
  21. data/lib/ms/ident/pepxml/search_database.rb +49 -0
  22. data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
  23. data/lib/ms/ident/pepxml/search_hit.rb +144 -0
  24. data/lib/ms/ident/pepxml/search_result.rb +35 -0
  25. data/lib/ms/ident/pepxml/search_summary.rb +92 -0
  26. data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
  27. data/lib/ms/ident/pepxml.rb +112 -0
  28. data/lib/ms/ident/protein.rb +33 -0
  29. data/lib/ms/ident/protein_group.rb +80 -0
  30. data/lib/ms/ident/search.rb +114 -0
  31. data/lib/ms/ident.rb +37 -0
  32. data/lib/ms/isotope/aa.rb +59 -0
  33. data/lib/ms/mascot.rb +6 -0
  34. data/lib/ms/mass/aa.rb +79 -0
  35. data/lib/ms/mass.rb +55 -0
  36. data/lib/ms/mzml/index_list.rb +98 -0
  37. data/lib/ms/mzml/plms1.rb +34 -0
  38. data/lib/ms/mzml.rb +197 -0
  39. data/lib/ms/obo.rb +38 -0
  40. data/lib/ms/plms1.rb +156 -0
  41. data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
  42. data/lib/ms/quant/qspec.rb +112 -0
  43. data/lib/ms/spectrum.rb +154 -8
  44. data/lib/ms.rb +3 -10
  45. data/lib/msplat.rb +2 -0
  46. data/lib/obo/ims.rb +5 -0
  47. data/lib/obo/ms.rb +7 -0
  48. data/lib/obo/ontology.rb +41 -0
  49. data/lib/obo/unit.rb +5 -0
  50. data/lib/openany.rb +23 -0
  51. data/lib/write_file_or_string.rb +18 -0
  52. data/obo/ims.obo +562 -0
  53. data/obo/ms.obo +11677 -0
  54. data/obo/unit.obo +2563 -0
  55. data/spec/ms/cvlist_spec.rb +60 -0
  56. data/spec/ms/digester_spec.rb +351 -0
  57. data/spec/ms/fasta_spec.rb +100 -0
  58. data/spec/ms/ident/peptide/db_spec.rb +108 -0
  59. data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
  60. data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
  61. data/spec/ms/ident/pepxml_spec.rb +442 -0
  62. data/spec/ms/ident/protein_group_spec.rb +68 -0
  63. data/spec/ms/mass_spec.rb +8 -0
  64. data/spec/ms/mzml/index_list_spec.rb +122 -0
  65. data/spec/ms/mzml/plms1_spec.rb +62 -0
  66. data/spec/ms/mzml_spec.rb +50 -0
  67. data/spec/ms/plms1_spec.rb +38 -0
  68. data/spec/ms/quant/qspec_spec.rb +25 -0
  69. data/spec/msplat_spec.rb +24 -0
  70. data/spec/obo_spec.rb +25 -0
  71. data/spec/spec_helper.rb +25 -0
  72. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
  73. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
  74. data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
  75. data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
  76. data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
  77. data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
  78. data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
  79. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
  80. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
  81. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
  82. data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
  83. data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
  84. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
  85. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
  86. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
  87. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
  88. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
  89. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
  90. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
  91. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
  92. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
  93. data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
  94. data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
  95. data/spec/testfiles/plms1/output.key +0 -0
  96. metadata +157 -40
  97. data/README +0 -77
  98. data/changelog.txt +0 -196
  99. data/lib/ms/calc.rb +0 -32
  100. data/lib/ms/data/interleaved.rb +0 -60
  101. data/lib/ms/data/lazy_io.rb +0 -73
  102. data/lib/ms/data/lazy_string.rb +0 -15
  103. data/lib/ms/data/simple.rb +0 -59
  104. data/lib/ms/data/transposed.rb +0 -41
  105. data/lib/ms/data.rb +0 -57
  106. data/lib/ms/format/format_error.rb +0 -12
  107. data/lib/ms/support/binary_search.rb +0 -126
@@ -0,0 +1,181 @@
1
+
2
+ require 'spec_helper'
3
+ require 'ms/ident/pepxml/sample_enzyme'
4
+ require 'nokogiri'
5
+
6
+ describe 'creating an MS::Ident::Pepxml::SampleEnzyme' do # constructor specs: build from an enzyme name string or from an attribute hash
7
+ before do
8
+ @hash = { # attribute => expected value; both examples below compare against this table
9
+ :name => 'trypsin',
10
+ :cut => 'KR',
11
+ :no_cut => 'P',
12
+ :sense => 'C',
13
+ }
14
+ end
15
+ it 'can be set by a known enzyme name' do
16
+ se = MS::Ident::Pepxml::SampleEnzyme.new('trypsin') # only the name string is given here
17
+ @hash.each do |k,v|
18
+ se.send(k).should == v # each key of @hash is called as a reader method on the enzyme
19
+ end
20
+ end
21
+
22
+ it 'can be set manually with a hash' do
23
+ se = MS::Ident::Pepxml::SampleEnzyme.new(@hash) # same attributes, passed explicitly
24
+ @hash.each do |k,v|
25
+ se.send(k).should == v # round-trip: what was passed in must be readable back
26
+ end
27
+ end
28
+ end
29
+
30
+ describe 'an MS::Ident::Pepxml::SampleEnzyme' do
31
+ before do
32
+ @sample_enzyme = MS::Ident::Pepxml::SampleEnzyme.new(:name=>'trypsin',:cut=>'KR',:no_cut=>'P',:sense=>'C')
33
+ end
34
+ it 'generates a valid xml fragment' do
35
+ string = @sample_enzyme.to_xml
36
+ string.is_a?(String).should == true
37
+ string.should match(/<sample_enzyme name="trypsin"/)
38
+ string.should match(/<specificity/)
39
+ %w(cut="KR" no_cut="P" sense="C").each {|re| string.should match(/#{re}/) }
40
+ !string.include?('version').should == true
41
+ end
42
+ it 'adds to an xml builder object' do
43
+ builder = Nokogiri::XML::Builder.new
44
+ after = @sample_enzyme.to_xml(builder)
45
+ after.is_a?(Nokogiri::XML::Builder).should == true
46
+ after.should == builder
47
+ after.to_xml.is_a?(String).should == true
48
+ end
49
+ end
50
+
51
+ describe 'an MS::Ident::Pepxml::SampleEnzyme making enzyme digestion calculations' do # table-driven specs for num_tol_term and num_missed_cleavages
52
+ before do
53
+ @full_KRP = MS::Ident::Pepxml::SampleEnzyme.new( # cut 'KR' with no_cut 'P'
54
+ :name => 'trypsin',
55
+ :cut => 'KR',
56
+ :no_cut => 'P',
57
+ :sense => 'C',
58
+ )
59
+ @just_KR = MS::Ident::Pepxml::SampleEnzyme.new( # same enzyme but with an empty no_cut
60
+ :name => 'trypsin',
61
+ :cut => 'KR',
62
+ :no_cut => '',
63
+ :sense => 'C',
64
+ )
65
+ end
66
+
67
+ it 'calculates the number of tolerant termini' do
68
+ exp = [{ # one expectation table per enzyme, in the same order as sample_enzyme_ar below
69
+ # full KR/P
70
+ %w(K EPTIDR E) => 2,
71
+ %w(K PEPTIDR E) => 1,
72
+ %w(F EEPTIDR E) => 1,
73
+ %w(F PEPTIDW R) => 0,
74
+ },
75
+ {
76
+ # just KR
77
+ %w(K EPTIDR E) => 2,
78
+ %w(K PEPTIDR E) => 2,
79
+ %w(F EEPTIDR E) => 1,
80
+ %w(F PEPTIDW R) => 0,
81
+ }
82
+ ]
83
+ sample_enzyme_ar = [@full_KRP, @just_KR]
84
+ sample_enzyme_ar.zip(exp) do |sample_enzyme,hash| # pair each enzyme with its expectation table
85
+ hash.each do |seq, val|
86
+ sample_enzyme.num_tol_term(*seq).should == val # the three-element key is splatted into three arguments (presumably prev aa, peptide, next aa -- confirm against num_tol_term)
87
+ end
88
+ end
89
+ end
90
+
91
+ it 'calculates number of missed cleavages' do
92
+ exp = [{ # first table is checked against @full_KRP, second against @just_KR
93
+ "EPTIDR" => 0,
94
+ "PEPTIDR" => 0,
95
+ "EEPTIDR" => 0,
96
+ "PEPTIDW" => 0,
97
+ "PERPTIDW" => 0,
98
+ "PEPKPTIDW" => 0,
99
+ "PEPKTIDW" => 1,
100
+ "RTTIDR" => 1,
101
+ "RTTIKK" => 2,
102
+ "PKEPRTIDW" => 2,
103
+ "PKEPRTIDKP" => 2,
104
+ "PKEPRAALKPEERPTIDKW" => 3,
105
+ },
106
+ {
107
+ "EPTIDR" => 0,
108
+ "PEPTIDR" => 0,
109
+ "EEPTIDR" => 0,
110
+ "PEPTIDW" => 0,
111
+ "PERPTIDW" => 1,
112
+ "PEPKPTIDW" => 1,
113
+ "PEPKTIDW" => 1,
114
+ "RTTIDR" => 1,
115
+ "RTTIKK" => 2,
116
+ "PKEPRTIDW" => 2,
117
+ "PKEPRTIDKP" => 3,
118
+ "PKEPRAALKPEERPTIDKW" => 5,
119
+ }
120
+ ]
121
+
122
+ sample_enzyme_ar = [@full_KRP, @just_KR]
123
+ sample_enzyme_ar.zip(exp) do |sample_enzyme, hash| # pair each enzyme with its expectation table
124
+ hash.each do |aaseq, val|
125
+ sample_enzyme.num_missed_cleavages(aaseq).should == val # key is the peptide sequence, value the expected count
126
+ end
127
+ end
128
+ end
129
+ end
130
+
131
+ #xdescribe 'read in from an xml node' do
132
+ # # placeholder until written
133
+ #end
134
+
135
+ ### DOES this kind of functionality belong in this kind of container????
136
+ ### SHOULD it be with ms-enzyme or ms-in_silico ???????
137
+
138
+ =begin
139
+ require 'set'
140
+
141
+ describe 'MS::Ident::Pepxml::SampleEnzyme digesting sequences' do
142
+ it 'can digest with no missed cleavages' do
143
+ st = "CRGATKKTAGRPMEK"
144
+ SampleEnzyme.tryptic(st).should == %w(CR GATK K TAGRPMEK)
145
+ st = "CATRP"
146
+ SampleEnzyme.tryptic(st).should == %w(CATRP)
147
+ st = "RCATRP"
148
+ SampleEnzyme.tryptic(st).should == %w(R CATRP)
149
+ st = ""
150
+ SampleEnzyme.tryptic(st).should == []
151
+ st = "R"
152
+ SampleEnzyme.tryptic(st).should == %w(R)
153
+ end
154
+
155
+ it 'can digest with missed cleavages' do
156
+ st = "CRGATKKTAGRPMEKLLLERTKY"
157
+ zero = %w(CR GATK K TAGRPMEK LLLER TK Y)
158
+ SampleEnzyme.tryptic(st,0).to_set.should == zero.to_set
159
+ one = %w(CRGATK GATKK KTAGRPMEK TAGRPMEKLLLER LLLERTK TKY)
160
+ SampleEnzyme.tryptic(st,1).to_set.should == (zero+one).to_set
161
+ two = %w(CRGATKK GATKKTAGRPMEK KTAGRPMEKLLLER TAGRPMEKLLLERTK LLLERTKY)
162
+ all = zero + one + two
163
+ SampleEnzyme.tryptic(st,2).to_set.should == all.to_set
164
+ end
165
+
166
+ it 'contains duplicates IF there are duplicate tryptic sequences' do
167
+ st = "AAAAKCCCCKDDDDKCCCCK"
168
+ peps = SampleEnzyme.new('trypsin').digest(st, 2)
169
+ peps.select {|aaseq| aaseq == 'CCCCK'}.size.should == 2
170
+ end
171
+
172
+ end
173
+
174
+ describe SampleEnzyme, 'making enzyme calculations on sequences and aaseqs' do
175
+
176
+
177
+ end
178
+ =end
179
+
180
+
181
+
@@ -0,0 +1,37 @@
1
+ require 'spec_helper'
2
+
3
+ require 'ms/ident/pepxml/search_hit/modification_info'
4
+
5
+ describe 'MS::Ident::Pepxml::SearchHit::ModificationInfo' do # checks that a ModificationInfo built from a hash serializes the expected XML pieces
6
+
7
+ before do
8
+ modaaobjs = [[3, 150.3], [6, 345.2]].map do |ar| # each pair is splatted into ModAminoacidMass.new; the spec below expects them as position/mass attributes
9
+ MS::Ident::Pepxml::SearchHit::ModificationInfo::ModAminoacidMass.new(*ar)
10
+ end
11
+ hash = {
12
+ :mod_nterm_mass => 520.2,
13
+ :modified_peptide => "MOD*IFI^E&D",
14
+ :mod_aminoacid_masses => modaaobjs,
15
+ }
16
+ #answ = "<modification_info mod_nterm_mass=\"520.2\" modified_peptide=\"MOD*IFI^E&amp;D\">\n\t<mod_aminoacid_mass position=\"3\" mass=\"150.3\"/>\n\t<mod_aminoacid_mass position=\"6\" mass=\"345.2\"/>\n</modification_info>\n"
17
+ @obj = MS::Ident::Pepxml::SearchHit::ModificationInfo.new(hash)
18
+ end
19
+
20
+ it 'can produce valid pepxml xml' do
21
+ to_match = ['<modification_info', # literal fragments that must each occur somewhere in the output; order and whitespace are not checked
22
+ ' mod_nterm_mass="520.2"',
23
+ " modified_peptide=\"MOD*IFI^E&amp;D\"",
24
+ "<mod_aminoacid_mass",
25
+ " position=\"3\"",
26
+ " mass=\"150.3\"",
27
+ " position=\"6\"",
28
+ " mass=\"345.2\"",
29
+ "</modification_info>"]
30
+ string = @obj.to_xml
31
+ to_match.each do |re|
32
+ string.should match(Regexp.new(Regexp.escape(re))) # escaped so characters such as * and ^ match literally
33
+ end
34
+ end
35
+ end
36
+
37
+
@@ -0,0 +1,442 @@
1
+ require 'spec_helper'
2
+
3
+ require 'ms/mass'
4
+ require 'ms/mass/aa'
5
+ require 'ms/ident/pepxml'
6
+ require 'ms/ident/pepxml/modifications'
7
+ require 'ms/ident/pepxml/spectrum_query'
8
+ require 'ms/ident/pepxml/search_result'
9
+ require 'ms/ident/pepxml/search_hit'
10
+ require 'ms/ident/pepxml/search_hit/modification_info'
11
+
12
+ describe "creating an MS::Ident::Pepxml" do # builds a complete Pepxml object through nested blocks and checks the serialized XML
13
+ include MS::Ident
14
+
15
+ it "can be creating in a nested fashion reflecting internal structure" do # NOTE(review): description likely meant "can be created"; string left unchanged here
16
+ tags_that_should_be_present = %w(msms_pipeline_analysis msms_run_summary sample_enzyme search_summary spectrum_query search_result search_hit modification_info mod_aminoacid_mass search_score)
17
+
18
+ pepxml = Pepxml.new do |msms_pipeline_analysis| # each nested block yields the child objects of the enclosing element
19
+ msms_pipeline_analysis.merge!(:summary_xml => "020.xml") do |msms_run_summary|
20
+ # prep the sample enzyme and search_summary
21
+ msms_run_summary.merge!(
22
+ :base_name => '/home/jtprince/dev/mspire/020',
23
+ :ms_manufacturer => 'Thermo',
24
+ :ms_model => 'LTQ Orbitrap',
25
+ :ms_ionization => 'ESI',
26
+ :ms_mass_analyzer => 'Ion Trap',
27
+ :ms_detector => 'UNKNOWN'
28
+ ) do |sample_enzyme, search_summary, spectrum_queries|
29
+ sample_enzyme.merge!(:name=>'Trypsin',:cut=>'KR',:no_cut=>'P',:sense=>'C')
30
+ search_summary.merge!(
31
+ :base_name=>'/path/to/file/020',
32
+ :search_engine => 'SEQUEST',
33
+ :precursor_mass_type =>'monoisotopic',
34
+ :fragment_mass_type => 'average'
35
+ ) do |search_database, enzymatic_search_constraint, modifications, parameters|
36
+ search_database.merge!(:local_path => '/path/to/db.fasta', :seq_type => 'AA') # note seq_type == type
37
+ enzymatic_search_constraint.merge!(
38
+ :enzyme => 'Trypsin',
39
+ :max_num_internal_cleavages => 2,
40
+ :min_number_termini => 2
41
+ )
42
+ modifications << Pepxml::AminoacidModification.new(
43
+ :aminoacid => 'M', :massdiff => 15.9994, :mass => MS::Mass::AA::MONO['M']+15.9994,
44
+ :variable => 'Y', :symbol => '*')
45
+ # invented, for example, a protein terminating mod
46
+ modifications << Pepxml::TerminalModification.new(
47
+ :terminus => 'c', :massdiff => 23.3333, :mass => MS::Mass::MONO['oh'] + 23.3333,
48
+ :variable => 'Y', :symbol => '[', :protein_terminus => 'c',
49
+ :description => 'leave protein_terminus off if not protein mod'
50
+ )
51
+ modifications << Pepxml::TerminalModification.new(
52
+ :terminus => 'c', :massdiff => 25.42322, :mass => MS::Mass::MONO['h+'] + 25.42322,
53
+ :variable => 'N', :symbol => ']', :description => 'example: c term mod'
54
+ )
55
+ parameters.merge!(
56
+ :fragment_ion_tolerance => 1.0000,
57
+ :digest_mass_range => '600.0 3500.0',
58
+ :enzyme_info => 'Trypsin(KR/P) 1 1 KR P', # etc....
59
+ )
60
+ end
61
+ spectrum_query1 = Pepxml::SpectrumQuery.new(
62
+ :spectrum => '020.3.3.1', :start_scan => 3, :end_scan => 3,
63
+ :precursor_neutral_mass => 1120.93743421875, :assumed_charge => 1
64
+ ) do |search_results|
65
+ search_result1 = Pepxml::SearchResult.new do |search_hits|
66
+ modpositions = [[1, 243.1559], [6, 167.0581], [7,181.085]].map do |pair|
67
+ Pepxml::SearchHit::ModificationInfo::ModAminoacidMass.new(*pair)
68
+ end
69
+ # order(modified_peptide, mod_aminoacid_masses, :mod_nterm_mass, :mod_cterm_mass)
70
+ # or can be set by hash
71
+ mod_info = Pepxml::SearchHit::ModificationInfo.new('Y#RLGGS#T#K', modpositions)
72
+ search_hit1 = Pepxml::SearchHit.new(
73
+ :hit_rank=>1, :peptide=>'YRLGGSTK', :peptide_prev_aa => "R", :peptide_next_aa => "K",
74
+ :protein => "gi|16130113|ref|NP_416680.1|", :num_tot_proteins => 1, :num_matched_ions => 5,
75
+ :tot_num_ions => 35, :calc_neutral_pep_mass => 1120.93163442, :massdiff => 0.00579979875010395,
76
+ :num_tol_term => 2, :num_missed_cleavages => 1, :is_rejected => 0,
77
+ :modification_info => mod_info) do |search_scores|
78
+ search_scores.merge!(:xcorr => 0.12346, :deltacn => 0.7959, :deltacnstar => 0,
79
+ :spscore => 29.85, :sprank => 1)
80
+ end
81
+ search_hits << search_hit1 # objects built above are appended to the collection yielded by the parent block
82
+ end
83
+ search_results << search_result1
84
+ end
85
+ spectrum_queries << spectrum_query1
86
+ end
87
+ end
88
+ end
89
+ xml = pepxml.to_xml
90
+ tags_that_should_be_present.each do |tag|
91
+ xml.should match(/<#{tag} ?/) # only the opening-tag prefix is checked; the trailing space is optional
92
+ end
93
+ xml.should match( /<\?xml version="1.0" encoding="UTF-8"\?>/ )
94
+ xml.should match( %r{<\?xml-stylesheet type="text/xsl" href="/tools/bin/TPP/tpp/schema/pepXML_std.xsl"\?>} )
95
+ end
96
+ end
97
+
98
+ =begin
99
+ # splits string on ' ' and matches the line found by find_line_regexp in
100
+ # lines
101
+ def match_modline_pieces(lines, find_line_regexp, string)
102
+ pieces = string.split(' ').map {|v| /#{Regexp.escape(v)}/ }
103
+ lines.each do |line|
104
+ if line =~ find_line_regexp
105
+ pieces.each do |piece|
106
+ line.should =~ piece
107
+ end
108
+ end
109
+ end
110
+ end
111
+
112
+
113
+ it 'gets modifications right in real run' do
114
+ @out_files.each do |fn|
115
+ fn.exist_as_a_file?.should be_true
116
+ beginning = IO.read(fn)
117
+ lines = beginning.split("\n")
118
+ [
119
+ [/aminoacid="M"/, '<aminoacid_modification symbol="*" massdiff="+15.9994" aminoacid="M" variable="Y" binary="N" mass="147.192"'],
120
+
121
+ [/aminoacid="S"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="S" variable="Y" binary="N" mass="167.0581"'],
122
+ [/aminoacid="T"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="T" variable="Y" binary="N" mass="181.085"'],
123
+ [/aminoacid="Y"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="Y" variable="Y" binary="N" mass="243.1559"'],
124
+ [/parameter name="diff_search_options"/, '<parameter name="diff_search_options" value="15.999400 M 79.979900 STY 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>'],
125
+ ].each do |a,b|
126
+ match_modline_pieces(lines, a, b)
127
+ end
128
+ [
129
+ '<modification_info modified_peptide="Y#RLGGS#T#K">',
130
+ '<mod_aminoacid_mass position="1" mass="243.1559"/>',
131
+ '<mod_aminoacid_mass position="7" mass="167.0581"/>',
132
+ '</modification_info>',
133
+ '<mod_aminoacid_mass position="9" mass="181.085"/>'
134
+ ].each do |line|
135
+ beginning.should =~ /#{Regexp.escape(line)}/ # "a modification info for a peptide")
136
+ end
137
+ end
138
+ end
139
+ end
140
+ end
141
+
142
+
143
+
144
+ =begin
145
+ describe "MS::Ident::Pepxml created from small bioworks.xml" do
146
+
147
+ spec_large do
148
+ before(:all) do
149
+ tf_mzxml_path = Tfiles_l + "/yeast_gly_mzXML"
150
+
151
+ tf_params = Tfiles + "/bioworks32.params"
152
+ tf_bioworks_xml = Tfiles + "/bioworks_small.xml"
153
+ out_path = Tfiles
154
+ @pepxml_objs = Sequest::Pepxml.set_from_bioworks(tf_bioworks_xml, :params => tf_params, :ms_data => tf_mzxml_path, :out_path => out_path)
155
+ end
156
+
157
+ it 'gets some spectrum queries' do
158
+ @pepxml_objs.each do |obj|
159
+ (obj.spectrum_queries.size > 2).should be_true
160
+ (obj.spectrum_queries.first.search_results.first.search_hits.size > 0).should be_true
161
+ end
162
+ #@pepxml_objs.each do |pep| puts pep.to_pepxml end
163
+ end
164
+ end
165
+ end
166
+
167
+
168
+
169
+ describe Sequest::Pepxml, " created from large bioworks.xml" do
170
+ # assert_equal_by_pairs (really any old array)
171
+ def assert_equal_pairs(obj, arrs)
172
+ arrs.each do |arr|
173
+ #if obj.send(arr[1]) != arr[0]
174
+ # puts "HELLO"
175
+ # puts "OBJ answer"
176
+ # p obj.send(arr[1])
177
+ # puts "ar0"
178
+ # p arr[0]
179
+ # puts "ar1"
180
+ # p arr[1]
181
+ #end
182
+ if arr[0].is_a? Float
183
+ obj.send(arr[1]).should be_close(arr[0], 0.0000000001)
184
+ else
185
+ obj.send(arr[1]).should == arr[0]
186
+ end
187
+ end
188
+ end
189
+
190
+ #swap the first two guys first
191
+ def assert_equal_pairs_swapped(obj, arrs)
192
+ arrs.each do |arr|
193
+ arr[0], arr[1] = arr[1], arr[0]
194
+ end
195
+ assert_equal_pairs(obj, arrs)
196
+ end
197
+
198
+ spec_large do
199
+ before(:all) do
200
+ st = Time.new
201
+ params = Tfiles + "/opd1/sequest.3.2.params"
202
+ bioworks_xml = Tfiles_l + "/opd1/bioworks.000.oldparams.xml"
203
+ mzxml_path = Tfiles_l + "/opd1"
204
+ out_path = Tfiles
205
+ @pepxml_version = 18
206
+ @pepxml_objs = Sequest::Pepxml.set_from_bioworks_xml(bioworks_xml, params, {:ms_data => mzxml_path, :out_path => out_path, :pepxml_version => @pepxml_version})
207
+ puts "- takes #{Time.new - st} secs"
208
+ end
209
+
210
+ it 'extracts MSMSPipelineAnalysis' do
211
+ ######## HMMMMM...
212
+ Sequest::Pepxml.pepxml_version.should == @pepxml_version
213
+
214
+ # MSMSPipelineAnalysis
215
+ po = @pepxml_objs.first
216
+ msms_pipeline = po.msms_pipeline_analysis
217
+ msms_pipeline.xmlns.should == 'http://regis-web.systemsbiology.net/pepXML'
218
+ msms_pipeline.xmlns_xsi.should == 'http://www.w3.org/2001/XMLSchema-instance'
219
+ msms_pipeline.xsi_schema_location.should == 'http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v18.xsd'
220
+ msms_pipeline.summary_xml.should == '000.xml'
221
+ end
222
+
223
+ it 'extracts MSmSRunSummary' do
224
+ # MSMSRunSummary
225
+ rs = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary
226
+ rs.base_name.should =~ /\/000/
227
+ assert_equal_pairs(rs, [ ['ThermoFinnigan', :ms_manufacturer], ['LCQ Deca XP Plus', :ms_model], ['ESI', :ms_ionization], ['Ion Trap', :ms_mass_analyzer], ['UNKNOWN', :ms_detector], ['raw', :raw_data_type], ['.mzXML', :raw_data], ])
228
+ end
229
+
230
+ it 'extracts SampleEnzyme' do
231
+ # SampleEnzyme
232
+ se = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.sample_enzyme
233
+ assert_equal_pairs(se, [ ['Trypsin', :name], ['KR', :cut], [nil, :no_cut], ['C', :sense], ])
234
+ end
235
+
236
+ it 'extracts SearchSummary' do
237
+ # SearchSummary
238
+ ss = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary
239
+ ss.is_a?(Sequest::Pepxml::SearchSummary).should be_true
240
+ ss.base_name.should =~ /\/000/
241
+ ss.peptide_mass_tol.should =~ /1\.500/
242
+ assert_equal_pairs_swapped(ss, [ # normal attributes
243
+ [:search_engine, "SEQUEST"], [:precursor_mass_type, "average"], [:fragment_mass_type, "average"], [:out_data_type, "out"], [:out_data, ".tgz"], [:search_id, "1"],
244
+
245
+ # enzymatic_search_constraint
246
+ [:enzyme, 'Trypsin'], [:max_num_internal_cleavages, '2'], [:min_number_termini, '2'],
247
+
248
+ # parameters
249
+ [:fragment_ion_tol, "1.0000"], [:ion_series, "0 1 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0"], [:max_num_differential_AA_per_mod, "3"], [:nucleotide_reading_frame, "0"], [:num_output_lines, "10"], [:remove_precursor_peak, "0"], [:ion_cutoff_percentage, "0.0000"], [:match_peak_count, "0"], [:match_peak_allowed_error, "1"], [:match_peak_tolerance, "1.0000"], [:protein_mass_filter, "0 0"],
250
+ ])
251
+
252
+ end
253
+ it 'extracts SearchDatabase' do
254
+ # SearchDatabase
255
+ sd = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary.search_database
256
+ sd.is_a?(Sequest::Pepxml::SearchDatabase).should be_true
257
+ assert_equal_pairs_swapped(sd, [ [:local_path, "C:\\Xcalibur\\database\\ecoli_K12.fasta"], [:seq_type, 'AA'], ])
258
+ end
259
+
260
+ it 'returns SpectrumQueries' do
261
+ # SpectrumQueries
262
+ sq = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.spectrum_queries
263
+ spec = sq.first
264
+ assert_equal_pairs_swapped(spec, [
265
+ [:spectrum, "000.100.100.1"], [:start_scan, "100"], [:end_scan, "100"],
266
+ #[:precursor_neutral_mass, "1074.5920"], # out2summary
267
+ [:precursor_neutral_mass, 1074.666926], # mine
268
+ [:assumed_charge, 1], [:index, "1"],
269
+ ])
270
+ sh = spec.search_results.first.search_hits.first
271
+ assert_equal_pairs_swapped(sh, [
272
+ # normal attributes
273
+ [:hit_rank, 1],
274
+ [:peptide, "SIYFRNFK"],
275
+ [:peptide_prev_aa, "R"],
276
+ [:peptide_next_aa, "G"],
277
+ [:protein, "gi|16130084|ref|NP_416651.1|"],
278
+ [:num_tot_proteins, 1],
279
+ [:num_matched_ions, 4],
280
+ [:tot_num_ions, 14],
281
+ #[:calc_neutral_pep_mass, "1074.1920"], # out2summary
282
+ [:calc_neutral_pep_mass, 1074.23261], # mine
283
+ #[:massdiff, "+0.400000"], # out2summary
284
+ [:massdiff, 0.434316000000081], # mine
285
+ [:num_tol_term, 2], [:num_missed_cleavages, 1], [:is_rejected, 0],
286
+
287
+ # search_score
288
+ [:xcorr, 0.4], [:deltacn, 0.023], [:deltacnstar, "0"], [:spscore, 78.8], [:sprank, 1],
289
+ ])
290
+
291
+ spec = sq[1]
292
+ assert_equal_pairs_swapped(spec, [
293
+ [:spectrum, "000.1000.1000.1"], [:start_scan, "1000"], [:end_scan, "1000"], #[:precursor_neutral_mass, "663.1920"], # out2summary
294
+ [:precursor_neutral_mass, 663.206111], # mine
295
+ [:assumed_charge, 1], [:index, "2"],
296
+ ])
297
+
298
+ sh = spec.search_results.first.search_hits.first
299
+ assert_equal_pairs_swapped(sh, [
300
+ # normal attributes
301
+ [:hit_rank, 1], [:peptide, "ALADFK"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "S"], [:protein, "gi|16128765|ref|NP_415318.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 10],
302
+ [:num_tol_term, 2], [:num_missed_cleavages, 0], [:is_rejected, 0],
303
+ #[:massdiff, "-0.600000"], # out2summary
304
+ [:massdiff, -0.556499000000031], # mine
305
+ #[:calc_neutral_pep_mass, 663.7920], # out2summary
306
+ [:calc_neutral_pep_mass, 663.76261], # mine
307
+
308
+ # search_score
309
+ [:xcorr, 0.965], [:deltacn, 0.132], [:deltacnstar, "0"], [:spscore, 81.1], [:sprank, 1],
310
+ ])
311
+
312
+ spec = sq[9]
313
+ assert_equal_pairs_swapped(spec, [
314
+ [:spectrum, "000.1008.1008.2"], [:start_scan, "1008"], [:end_scan, "1008"], [:assumed_charge, 2],
315
+ #[:precursor_neutral_mass, "691.0920"], # out2summary
316
+ [:precursor_neutral_mass, 691.150992], # mine
317
+ ])
318
+
319
+ sh = spec.search_results.first.search_hits.first
320
+ assert_equal_pairs_swapped(sh, [
321
+ # normal attributes
322
+ [:hit_rank, 1], [:peptide, "RLFTR"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "A"], [:protein, "gi|16130457|ref|NP_417027.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 8], [:num_tol_term, 2],
323
+
324
+ #[:num_missed_cleavages, "0"], # out2summary misses this!
325
+ [:num_missed_cleavages, 1],
326
+ [:is_rejected, 0],
327
+ #[:calc_neutral_pep_mass, "691.7920"], # out2summary
328
+ [:calc_neutral_pep_mass, 691.82261], # mine
329
+ #[:massdiff, "-0.700000"], # out2summary
330
+ [:massdiff, -0.67161800000008], # mine
331
+
332
+ # search_score
333
+ [:xcorr, 0.903], [:deltacn, 0.333], [:deltacnstar, "0"], [:spscore, 172.8], [:sprank, 1],
334
+ ])
335
+ end
336
+
337
+ it 'can generate correct pepxml file' do
338
+
339
+ ## IF OUR OBJECT IS CORRECT, THEN WE GET THE OUTPUT:
340
+ string = @pepxml_objs.first.to_pepxml
341
+ ans_lines = IO.read(Tfiles + "/opd1/000.my_answer.100lines.xml").split("\n")
342
+ base_name_re = /base_name=".*?files\//o
343
+ date_re = /date=".*?"/
344
+ string.split("\n").each_with_index do |line,i|
345
+ if i > 99 ; break end
346
+ ans, exp =
347
+ if i == 1
348
+ [line.sub(date_re,''), ans_lines[i].sub(date_re,'')]
349
+ elsif i == 2
350
+ [line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t")]
351
+ elsif i == 6
352
+ [line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t\t")]
353
+ else
354
+ [line, ans_lines[i]]
355
+ end
356
+
357
+ #ans.split('').zip(exp.split('')) do |l,a|
358
+ # if l != a
359
+ # puts line
360
+ # puts ans_lines[i]
361
+ # puts l
362
+ # puts a
363
+ # end
364
+ #end
365
+ if ans != exp
366
+ puts ans
367
+ puts exp
368
+ end
369
+ ans.should == exp
370
+ #line.sub(base_name_re,'').should == ans_lines[i].sub(base_name_re,'')
371
+ end
372
+ end
373
+ end
374
+ end
375
+
376
+
377
+
378
+ describe Sequest::Pepxml::Modifications do
379
+ before(:each) do
380
+ tf_params = Tfiles + "/bioworks32.params"
381
+ @params = Sequest::Params.new(tf_params)
382
+ # The params object here is completely unnecessary for this test, except
383
+ # that it sets up the mass table
384
+ @obj = Sequest::Pepxml::Modifications.new(@params, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")
385
+ end
386
+ it 'creates a mod_symbols_hash' do
387
+ answ = {[:C, 12.0]=>"^", [:S, 80.0]=>"@", [:M, 29.0]=>"#", [:M, 15.9]=>"*", [:ct, 12.33]=>"[", [:nt, 14.2]=>"]"}
388
+ @obj.mod_symbols_hash.should == answ
389
+ ## need more here
390
+ end
391
+
392
+ it 'creates a ModificationInfo object given a special peptide sequence' do
393
+ mod_string = "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) "
394
+ @params.diff_search_options = "15.90000 M 29.00000 M 80.00000 S 12.00000 C"
395
+ @params.term_diff_search_options = "14.20000 12.33000"
396
+ mod = Sequest::Pepxml::Modifications.new(@params, mod_string)
397
+ ## no mods
398
+ peptide = "PEPTIDE"
399
+ mod.modification_info(peptide).should be_nil
400
+ peptide = "]M*EC^S@IDM#M*EMSCM["
401
+ modinfo = mod.modification_info(peptide)
402
+ modinfo.modified_peptide.should == peptide
403
+ modinfo.mod_nterm_mass.should be_close(146.40054, 0.000001)
404
+ modinfo.mod_cterm_mass.should be_close(160.52994, 0.000001)
405
+ end
406
+
407
+ end
408
+
409
+ describe Sequest::Pepxml::SearchHit::ModificationInfo do
410
+
411
+ before(:each) do
412
+ modaaobjs = [[3, 150.3], [6, 345.2]].map do |ar|
413
+ Sequest::Pepxml::SearchHit::ModificationInfo::ModAminoacidMass.new(ar)
414
+ end
415
+ hash = {
416
+ :mod_nterm_mass => 520.2,
417
+ :modified_peptide => "MOD*IFI^E&D",
418
+ :mod_aminoacid_masses => modaaobjs,
419
+ }
420
+ #answ = "<modification_info mod_nterm_mass=\"520.2\" modified_peptide=\"MOD*IFI^E&amp;D\">\n\t<mod_aminoacid_mass position=\"3\" mass=\"150.3\"/>\n\t<mod_aminoacid_mass position=\"6\" mass=\"345.2\"/>\n</modification_info>\n"
421
+ @obj = Sequest::Pepxml::SearchHit::ModificationInfo.new(hash)
422
+ end
423
+
424
+ def _re(st)
425
+ /#{Regexp.escape(st)}/
426
+ end
427
+
428
+ it 'can produce pepxml' do
429
+ answ = @obj.to_pepxml
430
+ answ.should =~ _re('<modification_info')
431
+ answ.should =~ _re(" mod_nterm_mass=\"520.2\"")
432
+ answ.should =~ _re(" modified_peptide=\"MOD*IFI^E&amp;D\"")
433
+ answ.should =~ _re("<mod_aminoacid_mass")
434
+ answ.should =~ _re(" position=\"3\"")
435
+ answ.should =~ _re(" mass=\"150.3\"")
436
+ answ.should =~ _re(" position=\"6\"")
437
+ answ.should =~ _re(" mass=\"345.2\"")
438
+ answ.should =~ _re("</modification_info>")
439
+ end
440
+ end
441
+
442
+ =end