mspire 0.5.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +24 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/lib/cv/description.rb +18 -0
- data/lib/cv/param.rb +33 -0
- data/lib/cv.rb +3 -0
- data/lib/io/bookmark.rb +13 -0
- data/lib/merge.rb +7 -0
- data/lib/ms/cvlist.rb +76 -0
- data/lib/ms/digester.rb +245 -0
- data/lib/ms/fasta.rb +86 -0
- data/lib/ms/ident/peptide/db.rb +243 -0
- data/lib/ms/ident/peptide.rb +72 -0
- data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
- data/lib/ms/ident/peptide_hit.rb +26 -0
- data/lib/ms/ident/pepxml/modifications.rb +83 -0
- data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
- data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
- data/lib/ms/ident/pepxml/parameters.rb +14 -0
- data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
- data/lib/ms/ident/pepxml/search_database.rb +49 -0
- data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
- data/lib/ms/ident/pepxml/search_hit.rb +144 -0
- data/lib/ms/ident/pepxml/search_result.rb +35 -0
- data/lib/ms/ident/pepxml/search_summary.rb +92 -0
- data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
- data/lib/ms/ident/pepxml.rb +112 -0
- data/lib/ms/ident/protein.rb +33 -0
- data/lib/ms/ident/protein_group.rb +80 -0
- data/lib/ms/ident/search.rb +114 -0
- data/lib/ms/ident.rb +37 -0
- data/lib/ms/isotope/aa.rb +59 -0
- data/lib/ms/mascot.rb +6 -0
- data/lib/ms/mass/aa.rb +79 -0
- data/lib/ms/mass.rb +55 -0
- data/lib/ms/mzml/index_list.rb +98 -0
- data/lib/ms/mzml/plms1.rb +34 -0
- data/lib/ms/mzml.rb +197 -0
- data/lib/ms/obo.rb +38 -0
- data/lib/ms/plms1.rb +156 -0
- data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
- data/lib/ms/quant/qspec.rb +112 -0
- data/lib/ms/spectrum.rb +154 -8
- data/lib/ms.rb +3 -10
- data/lib/msplat.rb +2 -0
- data/lib/obo/ims.rb +5 -0
- data/lib/obo/ms.rb +7 -0
- data/lib/obo/ontology.rb +41 -0
- data/lib/obo/unit.rb +5 -0
- data/lib/openany.rb +23 -0
- data/lib/write_file_or_string.rb +18 -0
- data/obo/ims.obo +562 -0
- data/obo/ms.obo +11677 -0
- data/obo/unit.obo +2563 -0
- data/spec/ms/cvlist_spec.rb +60 -0
- data/spec/ms/digester_spec.rb +351 -0
- data/spec/ms/fasta_spec.rb +100 -0
- data/spec/ms/ident/peptide/db_spec.rb +108 -0
- data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
- data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
- data/spec/ms/ident/pepxml_spec.rb +442 -0
- data/spec/ms/ident/protein_group_spec.rb +68 -0
- data/spec/ms/mass_spec.rb +8 -0
- data/spec/ms/mzml/index_list_spec.rb +122 -0
- data/spec/ms/mzml/plms1_spec.rb +62 -0
- data/spec/ms/mzml_spec.rb +50 -0
- data/spec/ms/plms1_spec.rb +38 -0
- data/spec/ms/quant/qspec_spec.rb +25 -0
- data/spec/msplat_spec.rb +24 -0
- data/spec/obo_spec.rb +25 -0
- data/spec/spec_helper.rb +25 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
- data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
- data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
- data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
- data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
- data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
- data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
- data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
- data/spec/testfiles/plms1/output.key +0 -0
- metadata +157 -40
- data/README +0 -77
- data/changelog.txt +0 -196
- data/lib/ms/calc.rb +0 -32
- data/lib/ms/data/interleaved.rb +0 -60
- data/lib/ms/data/lazy_io.rb +0 -73
- data/lib/ms/data/lazy_string.rb +0 -15
- data/lib/ms/data/simple.rb +0 -59
- data/lib/ms/data/transposed.rb +0 -41
- data/lib/ms/data.rb +0 -57
- data/lib/ms/format/format_error.rb +0 -12
- data/lib/ms/support/binary_search.rb +0 -126
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'ms/ident'
|
3
|
+
require 'ms/ident/pepxml/msms_pipeline_analysis'
|
4
|
+
|
5
|
+
require 'ostruct'
|
6
|
+
|
7
|
+
module MS ; module Ident ; end ; end
|
8
|
+
|
9
|
+
class Numeric
|
10
|
+
# returns a string with a + or - on the front
|
11
|
+
def to_plus_minus_string
|
12
|
+
if self >= 0
|
13
|
+
'+' << self.to_s
|
14
|
+
else
|
15
|
+
self.to_s
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
class MS::Ident::Pepxml
|
21
|
+
XML_STYLESHEET_LOCATION = '/tools/bin/TPP/tpp/schema/pepXML_std.xsl'
|
22
|
+
DEFAULT_PEPXML_VERSION = MsmsPipelineAnalysis::PEPXML_VERSION
|
23
|
+
XML_ENCODING = 'UTF-8'
|
24
|
+
|
25
|
+
attr_accessor :msms_pipeline_analysis
|
26
|
+
|
27
|
+
# returns an array of MS::Ident::Pepxml::SearchHit::Simple structs
|
28
|
+
def self.simple_search_hits(file)
|
29
|
+
hit_values = File.open(file) do |io|
|
30
|
+
doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS | Nokogiri::XML::ParseOptions::STRICT)
|
31
|
+
# we can work with namespaces, or just remove them ...
|
32
|
+
doc.remove_namespaces!
|
33
|
+
root = doc.root
|
34
|
+
search_hits = root.xpath('//search_hit')
|
35
|
+
search_hits.each_with_index.map do |search_hit,i|
|
36
|
+
aaseq = search_hit['peptide']
|
37
|
+
charge = search_hit.parent.parent['assumed_charge'].to_i
|
38
|
+
search_score_nodes = search_hit.children.select {|node| node.name == 'search_score' }
|
39
|
+
search_scores = {}
|
40
|
+
search_score_nodes.each do |node|
|
41
|
+
search_scores[node['name'].to_sym] = node['value'].to_f
|
42
|
+
end
|
43
|
+
MS::Ident::Pepxml::SearchHit::Simple.new("hit_#{i}", MS::Ident::Search.new(file.chomp(File.extname(file))), aaseq, charge, search_scores)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def pepxml_version
|
49
|
+
msms_pipeline_analysis.pepxml_version
|
50
|
+
end
|
51
|
+
|
52
|
+
# returns an array of spectrum queries
|
53
|
+
def spectrum_queries
|
54
|
+
msms_pipeline_analysis.msms_run_summary.spectrum_queries
|
55
|
+
end
|
56
|
+
|
57
|
+
# yields a new Msms_Pipeline_Analysis object if given a block
|
58
|
+
def initialize(&block)
|
59
|
+
block.call(@msms_pipeline_analysis=MsmsPipelineAnalysis.new) if block
|
60
|
+
end
|
61
|
+
|
62
|
+
# takes an xml document object and sets it with the xml stylesheet
|
63
|
+
def add_stylesheet(doc, location)
|
64
|
+
xml_stylesheet = Nokogiri::XML::ProcessingInstruction.new(doc, "xml-stylesheet", %Q{type="text/xsl" href="#{location}"})
|
65
|
+
doc.root.add_previous_sibling xml_stylesheet
|
66
|
+
doc
|
67
|
+
end
|
68
|
+
|
69
|
+
# if no options are given, an xml string is returned. If either :outdir or
|
70
|
+
# :outfile is given, the xml is written to file and the output filename is returned.
|
71
|
+
# A single string argument will be interpreted as :outfile if it ends in
|
72
|
+
# '.xml' and the :outdir otherwise. In this case, update_summary_xml is still true
|
73
|
+
#
|
74
|
+
# options:
|
75
|
+
#
|
76
|
+
# arg default
|
77
|
+
# :outdir => nil write to disk using this outdir with summary_xml basename
|
78
|
+
# :outfile => nil write to this filename (overrides outdir)
|
79
|
+
# :update_summary_xml => true update summary_xml attribute to point to the output file true/false
|
80
|
+
#
|
81
|
+
# set outdir to
|
82
|
+
# File.dirname(pepxml_obj.msms_pipeline_analysis.msms_run_summary.base_name)
|
83
|
+
# to write to the same directory as the input search file.
|
84
|
+
def to_xml(opts={})
|
85
|
+
opts ||= {}
|
86
|
+
if opts.is_a?(String)
|
87
|
+
opts = ( opts.match(/\.xml$/) ? {:outfile => opts} : {:outdir => opts } )
|
88
|
+
end
|
89
|
+
opt = {:update_summary_xml => true, :outdir => nil, :outfile => nil}.merge(opts)
|
90
|
+
|
91
|
+
if opt[:outfile]
|
92
|
+
outfile = opt[:outfile]
|
93
|
+
elsif opt[:outdir]
|
94
|
+
outfile = File.join(opt[:outdir], msms_pipeline_analysis.summary_xml.split(/[\/\\]/).last)
|
95
|
+
end
|
96
|
+
self.msms_pipeline_analysis.summary_xml = File.expand_path(outfile) if (opt[:update_summary_xml] && outfile)
|
97
|
+
|
98
|
+
builder = Nokogiri::XML::Builder.new(:encoding => XML_ENCODING)
|
99
|
+
msms_pipeline_analysis.to_xml(builder)
|
100
|
+
add_stylesheet(builder.doc, MS::Ident::Pepxml::XML_STYLESHEET_LOCATION)
|
101
|
+
string = builder.doc.to_xml
|
102
|
+
|
103
|
+
if outfile
|
104
|
+
File.open(outfile,'w') {|out| out.print(string) }
|
105
|
+
outfile
|
106
|
+
else
|
107
|
+
string
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'andand'
|
2
|
+
|
3
|
+
module MS ; end
|
4
|
+
module MS::Ident
|
5
|
+
module ProteinLike
|
6
|
+
# an id for the protein
|
7
|
+
attr_accessor :id
|
8
|
+
|
9
|
+
# the protein sequence
|
10
|
+
attr_accessor :sequence
|
11
|
+
alias_method :seq, :sequence
|
12
|
+
alias_method :seq=, :sequence=
|
13
|
+
|
14
|
+
# a description of the protein
|
15
|
+
attr_accessor :description
|
16
|
+
|
17
|
+
# if the / GN=(\w+) ?/ regexp is found in the description, returns the first
|
18
|
+
# match, or nil if not found
|
19
|
+
# Returns the gene identifier captured from a " GN=<id>" token in the
# description, or nil when there is no description or no such token.
def gene_id
  # Guard both the nil description and the failed match: indexing the
  # result of a failed match used to raise NoMethodError instead of
  # returning nil.
  md = description && description.match(/ GN=(\w+) ?/)
  md && md[1]
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# a generic protein class that is ProteinLike
|
25
|
+
class Protein
|
26
|
+
include ProteinLike
|
27
|
+
|
28
|
+
def initialize(id=nil, sequence=nil)
|
29
|
+
(@id, @sequence) = id, sequence
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module MS
|
4
|
+
module Ident
|
5
|
+
# represents a group of proteins, typically indistinguishable in the
|
6
|
+
# experiment.
|
7
|
+
class ProteinGroup < Array
|
8
|
+
attr_accessor :peptide_hits
|
9
|
+
|
10
|
+
PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
|
11
|
+
peptide_hits = protein_group_and_peptide_hits.last
|
12
|
+
num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
|
13
|
+
num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
|
14
|
+
[num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
|
15
|
+
end
|
16
|
+
|
17
|
+
# greedy algorithm to map a set of peptide_hits to protein groups. each
|
18
|
+
# peptide hit should respond to :aaseq, :charge, :proteins if a block is
|
19
|
+
# given, yields a single argument: a doublet of protein_group and peptide
|
20
|
+
# set. It expects a metric or array to sort by for creating greedy protein
|
21
|
+
# groups (the greediest proteins should sort to the back of the array). if
|
22
|
+
# no block is given, the groups are sorted by [# uniq aaseqs, # uniq
|
23
|
+
# aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS). Sets of
|
24
|
+
# peptide_hits and the objects returned by peptide_hit#proteins are used as
|
25
|
+
# hash keys. As long as each peptide hit has a unique signature (like an
|
26
|
+
# id) then any object will work. If they are Struct objects, you might
|
27
|
+
# consider redefining the #hash method to be object_id for performance and
|
28
|
+
# accuracy.
|
29
|
+
#
|
30
|
+
# returns an array of ProteinGroup objects, each set with :peptide_hits
|
31
|
+
#
|
32
|
+
# If update_peptide_hits is true, then each peptide_hit is linked to the array
|
33
|
+
# of protein_groups it is associated with using :protein_groups. A
|
34
|
+
# symbol can also be passed in, and that method will be called instead.
|
35
|
+
def self.peptide_hits_to_protein_groups(peptide_hits, update_peptide_hits=false, &sort_by)
|
36
|
+
update_peptide_hits = 'protein_groups='.to_sym if (update_peptide_hits==true)
|
37
|
+
sort_by ||= PRIORITIZE_PROTEINS
|
38
|
+
# note to self: I wrote this in 2011, so I think I know what I'm doing now
|
39
|
+
protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
|
40
|
+
peptide_hits.each do |peptide_hit|
|
41
|
+
peptide_hit.proteins.each do |protein|
|
42
|
+
protein_to_peptides[protein] << peptide_hit
|
43
|
+
end
|
44
|
+
end
|
45
|
+
peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
|
46
|
+
protein_to_peptides.each do |protein, peptide_set|
|
47
|
+
peptides_to_protein_group[peptide_set] << protein
|
48
|
+
end
|
49
|
+
peptides_to_protein_group.each do |pephits,ar_of_prots|
|
50
|
+
pg = MS::Ident::ProteinGroup.new(ar_of_prots)
|
51
|
+
pg.peptide_hits = pephits
|
52
|
+
peptides_to_protein_group[pephits] = pg
|
53
|
+
end
|
54
|
+
|
55
|
+
protein_group_to_peptides = peptides_to_protein_group.invert
|
56
|
+
greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
|
57
|
+
|
58
|
+
accounted_for = Set.new
|
59
|
+
# we are discarding the subsumed sets, but we could get them with
|
60
|
+
# partition
|
61
|
+
greedy_first.select! do |group, peptide_set|
|
62
|
+
has_an_unaccounted_peptide = false
|
63
|
+
peptide_set.each do |peptide_hit|
|
64
|
+
unless accounted_for.include?(peptide_hit)
|
65
|
+
has_an_unaccounted_peptide = true
|
66
|
+
accounted_for.add(peptide_hit)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
group.peptide_hits = peptide_set if has_an_unaccounted_peptide
|
70
|
+
has_an_unaccounted_peptide
|
71
|
+
end
|
72
|
+
if update_peptide_hits
|
73
|
+
greedy_first.each {|pg, pephits| pephits.each {|hit| hit.send(update_peptide_hits, pg) } }
|
74
|
+
end
|
75
|
+
greedy_first.map(&:first)
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
|
2
|
+
module MS
|
3
|
+
module Ident
|
4
|
+
|
5
|
+
module SearchLike
|
6
|
+
attr_accessor :id
|
7
|
+
attr_accessor :peptide_hits
|
8
|
+
alias_method :hits, :peptide_hits
|
9
|
+
alias_method :hits=, :peptide_hits=
|
10
|
+
|
11
|
+
# returns an array of peptide_hits and protein_hits that are linked to
|
12
|
+
# one another. NOTE: this will update peptide and protein
|
13
|
+
# hits :proteins and :peptide_hits attributes respectively). Assumes that each search
|
14
|
+
# responds to :peptide_hits, each peptide responds to :proteins and each protein to
|
15
|
+
# :peptide_hits. Can be done on a single file to restore protein/peptide
|
16
|
+
# linkages to their original single-file state.
|
17
|
+
# Assumes the protein is initialized with (reference, peptide_ar)
|
18
|
+
#
|
19
|
+
# yields the protein that will become the template for a new protein
|
20
|
+
# and expects a new protein hit
|
21
|
+
#def merge!(ar_of_peptide_hit_arrays)
|
22
|
+
# all_peptide_hits = []
|
23
|
+
# reference_hash = {}
|
24
|
+
# ar_of_peptide_hit_arrays.each do |peptide_hits|
|
25
|
+
# all_peptide_hits.push(*peptide_hits)
|
26
|
+
# peptide_hits.each do |peptide|
|
27
|
+
# peptide.proteins.each do |protein|
|
28
|
+
# id = protein.id
|
29
|
+
# if reference_hash.key?(id)
|
30
|
+
# reference_hash[id].peptide_hits << peptide
|
31
|
+
# reference_hash[id]
|
32
|
+
# else
|
33
|
+
# reference_hash[id] = yield(protein, [peptide])
|
34
|
+
# end
|
35
|
+
# end
|
36
|
+
# end
|
37
|
+
# end
|
38
|
+
# [all_peptide_hits, reference_hash.values]
|
39
|
+
#end
|
40
|
+
end
|
41
|
+
|
42
|
+
class Search
|
43
|
+
include SearchLike
|
44
|
+
|
45
|
+
def initialize(id=nil, peptide_hits=[])
|
46
|
+
@id = id
|
47
|
+
@peptide_hits = peptide_hits
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
module SearchGroup
|
53
|
+
|
54
|
+
# an array of search objects
|
55
|
+
attr_accessor :searches
|
56
|
+
|
57
|
+
# the group's file extension (with no leading period)
|
58
|
+
def extension
|
59
|
+
'grp'
|
60
|
+
end
|
61
|
+
|
62
|
+
def search_class
|
63
|
+
Search
|
64
|
+
end
|
65
|
+
|
66
|
+
# a simple formatted file with paths to the search files
|
67
|
+
def to_paths(file)
|
68
|
+
IO.readlines(file).grep(/\w/).reject {|v| v =~ /^#/}.map {|v| v.chomp }
|
69
|
+
end
|
70
|
+
|
71
|
+
def from_file(file)
|
72
|
+
from_filenames(to_paths(file))
|
73
|
+
end
|
74
|
+
|
75
|
+
|
76
|
+
def from_filenames(filenames)
|
77
|
+
filenames.each do |file|
|
78
|
+
if !File.exist? file
|
79
|
+
message = "File: #{file} does not exist!\n"
|
80
|
+
message << "perhaps you need to modify the file with file paths"
|
81
|
+
abort message
|
82
|
+
end
|
83
|
+
@searches << search_class.new(file)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
|
88
|
+
# takes an array of filenames or a single search filename (with
|
89
|
+
# extension defined by 'extension') or an array of objects. Passes any
|
90
|
+
# arguments to the initializer for each search
|
91
|
+
# the optional block yields the object for further processing
|
92
|
+
# Builds the group from +arg+, which may be:
#
#   * a String path ending in ".<extension>" (handed to from_file)
#   * an Array whose first element is a String (handed to from_filenames)
#   * any other Array (stored directly as the searches)
#   * nil (the group starts empty)
#
# Anything else raises ArgumentError. Yields self if a block is given.
# +opts+ is still accepted so existing callers keep working, but nothing
# in this method consumes it.
def initialize(arg=nil, opts={})
  @peptide_hits = []
  @reference_hash = {}
  @searches = []

  if arg
    # was Regexp.escap (NoMethodError)
    if arg.is_a?(String) && arg =~ /\.#{Regexp.escape(extension)}$/
      from_file(arg)
    elsif arg.is_a?(Array) && arg.first.is_a?(String)
      from_filenames(arg)
    elsif arg.is_a?(Array)
      # was `array`, an undefined local
      @searches = arg
    else
      raise ArgumentError, "must be file, array of filenames, or array of objs"
    end
    # A trailing `@searches << search_class.new(file, opts)` was removed:
    # `file` is not defined in this method, so it raised NameError for
    # every non-nil arg, and each branch above already fills @searches.
  end
  yield(self) if block_given?
end
|
111
|
+
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
data/lib/ms/ident.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
|
2
|
+
require 'ms/ident/protein_group'
|
3
|
+
require 'ms/ident/protein'
|
4
|
+
require 'ms/ident/peptide_hit'
|
5
|
+
|
6
|
+
module MS
|
7
|
+
|
8
|
+
# An MS::Ident::ProteinGroup is an array of proteins that responds to
|
9
|
+
# :peptide_hits. All protein level identifications should be stored in a
|
10
|
+
# proteingroup object.
|
11
|
+
#
|
12
|
+
# An MS::Ident::Protein is an object representing a protein (:id,
|
13
|
+
# :sequence, :description). Note, it is not a protein hit (use a
|
14
|
+
# ProteinGroup)
|
15
|
+
#
|
16
|
+
# An MS::Ident::PeptideHit is an object representing a match between an
|
17
|
+
# amino acid sequence and a spectrum.
|
18
|
+
#
|
19
|
+
# Typical usage:
|
20
|
+
#
|
21
|
+
# require 'ms/ident'
|
22
|
+
#
|
23
|
+
# hit1 = PeptideHit.new(:id => 1, :aaseq => 'PEPTIDE', :search =>
|
24
|
+
# MS::Ident::Search.new, etc...)
|
25
|
+
# peptide_hits = [hit1, hit2, ...]
|
26
|
+
#
|
27
|
+
# protein_groups = MS::Ident::ProteinGroup.peptide_hits_to_protein_groups(peptide_hits)
|
28
|
+
# protein_groups.first.peptide_hits # => the peptide hits in that group
|
29
|
+
module Ident
|
30
|
+
# returns the filetype (if possible)
|
31
|
+
def self.filetype(file)
|
32
|
+
if file =~ /\.srf$/i
|
33
|
+
:srf
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module MS
  module Isotope
    module AA
      # Per-residue-letter counts of :c, :h, :o, :n, :s, :p and :se atoms,
      # keyed by the one-letter code as a String.
      ATOM_COUNTS_STR = {
        'A' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'R' => { :c =>6, :h =>14 , :o =>2 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
        'N' => { :c =>4, :h =>8 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'D' => { :c =>4, :h =>7 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'C' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
        'E' => { :c =>5, :h =>9 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'Q' => { :c =>5, :h =>10 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'G' => { :c =>2, :h =>5 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'H' => { :c =>6, :h =>9 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
        'I' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'L' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'K' => { :c =>6, :h =>14 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'M' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
        'F' => { :c =>9, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'P' => { :c =>5, :h =>9 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'S' => { :c =>3, :h =>7 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'T' => { :c =>4, :h =>9 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'W' => { :c =>11, :h =>12 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'Y' => { :c =>9, :h =>11 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'V' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'U' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
        'O' => { :c =>12, :h =>21 , :o =>3 , :n =>3 , :s =>0 , :p =>0, :se =>0 }
      }

      # The same table keyed by Symbol. It is derived from ATOM_COUNTS_STR
      # rather than retyped so the two cannot drift apart; each inner hash
      # is dup'ed so the String- and Symbol-keyed tables keep separate
      # count hashes, as they did when both were written out literally.
      ATOM_COUNTS_SYM = Hash[ATOM_COUNTS_STR.map {|aa, counts| [aa.to_sym, counts.dup] }]

      # string and symbol access of amino acid (atoms are all lower case
      # symbols)
      #
      # (A statement that wrote into ATOM_COUNTS before this definition
      # was removed; it raised NameError when the file was loaded.)
      ATOM_COUNTS = ATOM_COUNTS_SYM.merge ATOM_COUNTS_STR
    end
  end
end
|
data/lib/ms/mascot.rb
ADDED
data/lib/ms/mass/aa.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'ms/mass'
|
2
|
+
|
3
|
+
module MS
|
4
|
+
module Mass
|
5
|
+
module AA
|
6
|
+
# amino_acids keys as strings, average masses
|
7
|
+
AVG_STRING = {
|
8
|
+
"*"=>118.88603,
|
9
|
+
"A"=>71.0779,
|
10
|
+
"B"=>172.1405,
|
11
|
+
"C"=>103.1429,
|
12
|
+
"D"=>115.0874,
|
13
|
+
"E"=>129.11398,
|
14
|
+
"F"=>147.17386,
|
15
|
+
"G"=>57.05132,
|
16
|
+
"H"=>137.13928,
|
17
|
+
"I"=>113.15764,
|
18
|
+
"K"=>128.17228,
|
19
|
+
"L"=>113.15764,
|
20
|
+
"M"=>131.19606,
|
21
|
+
"N"=>114.10264,
|
22
|
+
"O"=>211.28076,
|
23
|
+
"P"=>97.11518,
|
24
|
+
"Q"=>128.12922,
|
25
|
+
"R"=>156.18568,
|
26
|
+
"S"=>87.0773,
|
27
|
+
"T"=>101.10388,
|
28
|
+
"U"=>150.0379,
|
29
|
+
"V"=>99.13106,
|
30
|
+
"W"=>186.2099,
|
31
|
+
"X"=>118.88603,
|
32
|
+
"Y"=>163.17326,
|
33
|
+
"Z"=>128.6231
|
34
|
+
}
|
35
|
+
# amino_acids keys as strings, monoisotopic masses
|
36
|
+
MONO_STRING = {
|
37
|
+
"*"=>118.805716,
|
38
|
+
"A"=>71.0371137878,
|
39
|
+
"B"=>172.048405,
|
40
|
+
"C"=>103.0091844778,
|
41
|
+
"D"=>115.026943032,
|
42
|
+
"E"=>129.0425930962,
|
43
|
+
"F"=>147.0684139162,
|
44
|
+
"G"=>57.0214637236,
|
45
|
+
"H"=>137.0589118624,
|
46
|
+
"I"=>113.0840639804,
|
47
|
+
"K"=>128.0949630177,
|
48
|
+
"L"=>113.0840639804,
|
49
|
+
"M"=>131.0404846062,
|
50
|
+
"N"=>114.0429274472,
|
51
|
+
"O"=>211.1446528645,
|
52
|
+
"P"=>97.052763852,
|
53
|
+
"Q"=>128.0585775114,
|
54
|
+
"R"=>156.1011110281,
|
55
|
+
"S"=>87.0320284099,
|
56
|
+
"T"=>101.0476784741,
|
57
|
+
"U"=>150.9536355878,
|
58
|
+
"V"=>99.0684139162,
|
59
|
+
"W"=>186.0793129535,
|
60
|
+
"X"=>118.805716,
|
61
|
+
"Y"=>163.0633285383,
|
62
|
+
"Z"=>128.550585
|
63
|
+
}
|
64
|
+
|
65
|
+
# amino_acids keys as symbols, monoisotopic masses
|
66
|
+
MONO_SYM = Hash[MONO_STRING.map {|aa,mass| [aa.to_sym, mass] } ]
|
67
|
+
|
68
|
+
# amino_acids keys as symbols, average masses
|
69
|
+
AVG_SYM = Hash[AVG_STRING.map {|aa,mass| [aa.to_sym, mass] } ]
|
70
|
+
|
71
|
+
# Monoisotopic amino acid masses keyed as symbols and also strings (all
|
72
|
+
# upper case). Also includes MS::Mass::MONO for things like protons ('h+')
|
73
|
+
MONO = MONO_SYM.merge(MONO_STRING).merge(MS::Mass::MONO)
|
74
|
+
# Average amino acid masses keyed as symbols and also strings (all
|
75
|
+
# upper case). Also includes MS::Mass::AVG for things like protons ('h+')
|
76
|
+
AVG = AVG_SYM.merge(AVG_STRING).merge(MS::Mass::AVG)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
data/lib/ms/mass.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
|
2
|
+
module MS
|
3
|
+
module Mass
|
4
|
+
|
5
|
+
# takes a chemical formula in this format: C2BrH12O
|
6
|
+
def self.formula_to_exact_mass(formula)
|
7
|
+
# TODO: add other input methods
|
8
|
+
pairs = formula.scan(/([A-Z][a-z]?)(\d*)/).map do |match|
|
9
|
+
if match.last == ''
|
10
|
+
match[-1] = 1
|
11
|
+
end
|
12
|
+
[match[0], match[1].to_i]
|
13
|
+
end
|
14
|
+
pairs.map do |pair|
|
15
|
+
MONO[pair.first.downcase] * pair.last
|
16
|
+
end.reduce(:+)
|
17
|
+
end
|
18
|
+
|
19
|
+
H_PLUS = 1.00727646677
|
20
|
+
# + http://www.unimod.org/masses.html
|
21
|
+
MONO_STR = {
|
22
|
+
'c' => 12.0, # +
|
23
|
+
'br' => 78.9183361, # +
|
24
|
+
'd' => 2.014101779, # +
|
25
|
+
'f' => 18.99840322, # +
|
26
|
+
'n' => 14.003074, # +
|
27
|
+
'o' => 15.99491463, # +
|
28
|
+
'na' => 22.9897677, # +
|
29
|
+
'p' => 30.973762, # +
|
30
|
+
's' => 31.9720707, # +
|
31
|
+
'li' => 7.016003, # +
|
32
|
+
'cl' => 34.96885272, # +
|
33
|
+
'k' => 38.9637074, # +
|
34
|
+
'si' => 27.9769265325, # http://physics.nist.gov/cgi-bin/Compositions/stand_alone.pl?ele=Si&ascii=html&isotype=some
|
35
|
+
'i' => 126.904473, # +
|
36
|
+
'h+' => 1.00727646677,
|
37
|
+
'h' => 1.007825035, # +
|
38
|
+
'h2o' => 18.0105647,
|
39
|
+
'oh' => 17.002739665,
|
40
|
+
}
|
41
|
+
AVG_STR = {
|
42
|
+
'h+' => 1.007276, # using Mascot_H_plus mass (is this right for AVG??)
|
43
|
+
'h' => 1.00794,
|
44
|
+
'h2o' => 18.01528,
|
45
|
+
'oh' => 17.00734,
|
46
|
+
}
|
47
|
+
# sets MONO_SYM, MONO, AVG_SYM, and AVG
|
48
|
+
%w(MONO AVG).each do |type|
|
49
|
+
const_set "#{type}_SYM", Hash[ const_get("#{type}_STR").map {|k,v| [k.to_sym, v] } ]
|
50
|
+
const_set type, const_get("#{type}_STR").merge( const_get("#{type}_SYM") )
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
|
@@ -0,0 +1,98 @@
|
|
1
|
+
module MS
|
2
|
+
class Mzml
|
3
|
+
# A simple array of indices but #[] has been overloaded to find an index
|
4
|
+
# by name
|
5
|
+
#
|
6
|
+
# index_list[0] # the first index
|
7
|
+
# index_list.map(&:name) # -> [:spectrum, :chromatogram]
|
8
|
+
# index_list[:spectrum] # the spectrum index
|
9
|
+
# index_list[:chromatogram] # the chromatogram index
|
10
|
+
class IndexList < Array
  # the untouched Array#[] (kept under this name for existing callers)
  alias_method :old_bracket_slice, :'[]'

  # Looks an index up by position or by name.
  #
  # @param [Object] int_or_symbol an Integer (index number), a Range, or
  #   a name such as :spectrum or :chromatogram
  # @param [Array] rest an optional length, as in Array#[start, length]
  # @return [Object] for a name, the first element whose #name equals
  #   it (nil if none); otherwise whatever Array#[] returns
  def [](int_or_symbol, *rest)
    # Only a lone argument that is neither an Integer nor a Range is a
    # name. Everything else goes to Array#[] so list[start, length] and
    # list[range] work instead of raising or being searched by name.
    by_name = rest.empty? && !int_or_symbol.is_a?(Integer) && !int_or_symbol.is_a?(Range)
    if by_name
      self.find {|index| index.name == int_or_symbol }
    else
      old_bracket_slice(int_or_symbol, *rest)
    end
  end
end
|
24
|
+
|
25
|
+
# the array holds start bytes
|
26
|
+
class Index < Array
|
27
|
+
|
28
|
+
class << self
|
29
|
+
# returns an Integer or nil if not found
|
30
|
+
# does a single jump backwards from the tail of the file looking for
|
31
|
+
# an xml element based on tag. If it is not found, returns nil
|
32
|
+
# Reads the last +bytes_backwards+ (+1) bytes of +io+ and returns the
# integer found inside the first <tag>...</tag> element there, or nil
# if no line in that tail contains the element.
#
# io must respond to #size, #pos= and #readlines.
def index_offset(io, tag='indexListOffset', bytes_backwards=200)
  tag_re = %r{<#{tag}>([\-\d]+)</#{tag}>}
  # Clamp at 0: an io shorter than the jump gave a negative position,
  # which raises instead of simply scanning the whole io.
  io.pos = [(io.size - 1) - bytes_backwards, 0].max
  md = io.readlines("\n").map {|line| line.match(tag_re) }.compact.shift
  md[1].to_i if md
end
|
38
|
+
end
|
39
|
+
|
40
|
+
# an index indexed by scan number
|
41
|
+
attr_accessor :by_scans
|
42
|
+
|
43
|
+
# the name of the index (as a symbol)
|
44
|
+
attr_accessor :name
|
45
|
+
|
46
|
+
# a parallel array of ids (idRef's)
|
47
|
+
attr_accessor :ids
|
48
|
+
|
49
|
+
def start_byte_and_id(int)
|
50
|
+
[self[int], ids[int]]
|
51
|
+
end
|
52
|
+
|
53
|
+
# returns hash of id to start_byte
|
54
|
+
def create_id_index
|
55
|
+
Hash[self.ids.zip(self)]
|
56
|
+
end
|
57
|
+
|
58
|
+
# @return [Integer] the start byte of the spectrum
|
59
|
+
# @param [Object] an Integer (the index number) or String (an id string)
|
60
|
+
def start_byte(arg)
|
61
|
+
case arg
|
62
|
+
when Integer
|
63
|
+
self[arg]
|
64
|
+
when String
|
65
|
+
@id_index ||= create_id_index
|
66
|
+
@id_index[arg]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
# generates a scan to index hash that points from scan number to the
|
71
|
+
# spectrum index number. returns the index, nil if the scan ids
|
72
|
+
# are not present and spectra are, or false if they are not unique.
|
73
|
+
# Builds a hash from scan number (the integer after "scan=" in each id)
# to that id's position in #ids.
#
# Returns:
#   * the hash, also stored in #by_scans, when at least one id carries a
#     scan number
#   * false as soon as a scan number repeats
#   * nil when there are ids but none carries a scan number
#   * an empty hash when there are no ids (by_scans is left untouched)
def create_scan_to_index
  scan_re = /scan=(\d+)/
  scan_to_index = {}
  ids.each_with_index do |id, index|
    md = id.match(scan_re)
    next unless md
    scan_num = md[1].to_i
    return false if scan_to_index.key?(scan_num)
    scan_to_index[scan_num] = index
  end
  if scan_to_index.size > 0
    # needs the explicit receiver: a bare `by_scans = ...` only creates a
    # local variable and never reaches the attribute writer
    self.by_scans = scan_to_index
  elsif ids.size > 0
    nil # there are scans, but we did not find scan numbers
  else
    scan_to_index
  end
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|