mspire 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +24 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/lib/cv/description.rb +18 -0
- data/lib/cv/param.rb +33 -0
- data/lib/cv.rb +3 -0
- data/lib/io/bookmark.rb +13 -0
- data/lib/merge.rb +7 -0
- data/lib/ms/cvlist.rb +76 -0
- data/lib/ms/digester.rb +245 -0
- data/lib/ms/fasta.rb +86 -0
- data/lib/ms/ident/peptide/db.rb +243 -0
- data/lib/ms/ident/peptide.rb +72 -0
- data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
- data/lib/ms/ident/peptide_hit.rb +26 -0
- data/lib/ms/ident/pepxml/modifications.rb +83 -0
- data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
- data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
- data/lib/ms/ident/pepxml/parameters.rb +14 -0
- data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
- data/lib/ms/ident/pepxml/search_database.rb +49 -0
- data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
- data/lib/ms/ident/pepxml/search_hit.rb +144 -0
- data/lib/ms/ident/pepxml/search_result.rb +35 -0
- data/lib/ms/ident/pepxml/search_summary.rb +92 -0
- data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
- data/lib/ms/ident/pepxml.rb +112 -0
- data/lib/ms/ident/protein.rb +33 -0
- data/lib/ms/ident/protein_group.rb +80 -0
- data/lib/ms/ident/search.rb +114 -0
- data/lib/ms/ident.rb +37 -0
- data/lib/ms/isotope/aa.rb +59 -0
- data/lib/ms/mascot.rb +6 -0
- data/lib/ms/mass/aa.rb +79 -0
- data/lib/ms/mass.rb +55 -0
- data/lib/ms/mzml/index_list.rb +98 -0
- data/lib/ms/mzml/plms1.rb +34 -0
- data/lib/ms/mzml.rb +197 -0
- data/lib/ms/obo.rb +38 -0
- data/lib/ms/plms1.rb +156 -0
- data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
- data/lib/ms/quant/qspec.rb +112 -0
- data/lib/ms/spectrum.rb +154 -8
- data/lib/ms.rb +3 -10
- data/lib/msplat.rb +2 -0
- data/lib/obo/ims.rb +5 -0
- data/lib/obo/ms.rb +7 -0
- data/lib/obo/ontology.rb +41 -0
- data/lib/obo/unit.rb +5 -0
- data/lib/openany.rb +23 -0
- data/lib/write_file_or_string.rb +18 -0
- data/obo/ims.obo +562 -0
- data/obo/ms.obo +11677 -0
- data/obo/unit.obo +2563 -0
- data/spec/ms/cvlist_spec.rb +60 -0
- data/spec/ms/digester_spec.rb +351 -0
- data/spec/ms/fasta_spec.rb +100 -0
- data/spec/ms/ident/peptide/db_spec.rb +108 -0
- data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
- data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
- data/spec/ms/ident/pepxml_spec.rb +442 -0
- data/spec/ms/ident/protein_group_spec.rb +68 -0
- data/spec/ms/mass_spec.rb +8 -0
- data/spec/ms/mzml/index_list_spec.rb +122 -0
- data/spec/ms/mzml/plms1_spec.rb +62 -0
- data/spec/ms/mzml_spec.rb +50 -0
- data/spec/ms/plms1_spec.rb +38 -0
- data/spec/ms/quant/qspec_spec.rb +25 -0
- data/spec/msplat_spec.rb +24 -0
- data/spec/obo_spec.rb +25 -0
- data/spec/spec_helper.rb +25 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
- data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
- data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
- data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
- data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
- data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
- data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
- data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
- data/spec/testfiles/plms1/output.key +0 -0
- metadata +157 -40
- data/README +0 -77
- data/changelog.txt +0 -196
- data/lib/ms/calc.rb +0 -32
- data/lib/ms/data/interleaved.rb +0 -60
- data/lib/ms/data/lazy_io.rb +0 -73
- data/lib/ms/data/lazy_string.rb +0 -15
- data/lib/ms/data/simple.rb +0 -59
- data/lib/ms/data/transposed.rb +0 -41
- data/lib/ms/data.rb +0 -57
- data/lib/ms/format/format_error.rb +0 -12
- data/lib/ms/support/binary_search.rb +0 -126
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
require 'ms/ident'
|
|
3
|
+
require 'ms/ident/pepxml/msms_pipeline_analysis'
|
|
4
|
+
|
|
5
|
+
require 'ostruct'
|
|
6
|
+
|
|
7
|
+
module MS ; module Ident ; end ; end
|
|
8
|
+
|
|
9
|
+
class Numeric
  # Renders the number as a string that always carries an explicit sign:
  # zero and positive values are prefixed with '+', negative values keep
  # the '-' that #to_s already produces.
  def to_plus_minus_string
    self >= 0 ? "+#{self}" : to_s
  end
end
|
|
19
|
+
|
|
20
|
+
class MS::Ident::Pepxml
  XML_STYLESHEET_LOCATION = '/tools/bin/TPP/tpp/schema/pepXML_std.xsl'
  DEFAULT_PEPXML_VERSION = MsmsPipelineAnalysis::PEPXML_VERSION
  XML_ENCODING = 'UTF-8'

  attr_accessor :msms_pipeline_analysis

  # Parses the pepxml +file+ and returns one
  # MS::Ident::Pepxml::SearchHit::Simple struct per <search_hit> element.
  # Each struct is built from: an id of the form "hit_<n>" (n = position of
  # the hit in document order), a new MS::Ident::Search whose id is the file
  # name without its extension, the 'peptide' attribute, the integer
  # 'assumed_charge' of the element two levels above the hit, and a hash of
  # the hit's <search_score> children (name as Symbol => value as Float).
  def self.simple_search_hits(file)
    parse_options = Nokogiri::XML::ParseOptions::DEFAULT_XML |
      Nokogiri::XML::ParseOptions::NOBLANKS |
      Nokogiri::XML::ParseOptions::STRICT

    File.open(file) do |io|
      doc = Nokogiri::XML.parse(io, nil, nil, parse_options)
      # namespaces are stripped so the plain '//search_hit' xpath matches
      doc.remove_namespaces!
      doc.root.xpath('//search_hit').each_with_index.map do |search_hit, i|
        aaseq = search_hit['peptide']
        charge = search_hit.parent.parent['assumed_charge'].to_i
        search_scores = {}
        search_hit.children.each do |node|
          next unless node.name == 'search_score'
          search_scores[node['name'].to_sym] = node['value'].to_f
        end
        search = MS::Ident::Search.new(file.chomp(File.extname(file)))
        MS::Ident::Pepxml::SearchHit::Simple.new("hit_#{i}", search, aaseq, charge, search_scores)
      end
    end
  end

  # the pepxml version reported by the underlying msms_pipeline_analysis
  def pepxml_version
    msms_pipeline_analysis.pepxml_version
  end

  # the spectrum queries of the pipeline analysis' msms_run_summary
  def spectrum_queries
    msms_pipeline_analysis.msms_run_summary.spectrum_queries
  end

  # When a block is given, creates a new MsmsPipelineAnalysis, stores it in
  # #msms_pipeline_analysis and passes it to the block. Without a block
  # nothing is created.
  def initialize(&block)
    return unless block
    @msms_pipeline_analysis = MsmsPipelineAnalysis.new
    block.call(@msms_pipeline_analysis)
  end

  # Inserts an xml-stylesheet processing instruction pointing at +location+
  # immediately before the root element of +doc+ and returns +doc+.
  def add_stylesheet(doc, location)
    instruction = Nokogiri::XML::ProcessingInstruction.new(doc, "xml-stylesheet", %Q{type="text/xsl" href="#{location}"})
    doc.root.add_previous_sibling(instruction)
    doc
  end

  # Serializes the pipeline analysis to pepxml.
  #
  # With no options the xml string is returned. If :outfile or :outdir is
  # given the xml is written to disk and the output filename is returned.
  # A single String argument is treated as :outfile when it ends in '.xml'
  # and as :outdir otherwise (:update_summary_xml stays true in that case).
  #
  # options:
  #
  #     arg                   default
  #     :outdir              => nil   write into this directory using the summary_xml basename
  #     :outfile             => nil   write to this filename (overrides :outdir)
  #     :update_summary_xml  => true  point the summary_xml attribute at the output file
  #
  # To write next to the input search file, set :outdir to
  #   File.dirname(pepxml_obj.msms_pipeline_analysis.msms_run_summary.base_name)
  def to_xml(opts={})
    opts ||= {}
    if opts.is_a?(String)
      opts = opts.match(/\.xml$/) ? {:outfile => opts} : {:outdir => opts}
    end
    opt = {:update_summary_xml => true, :outdir => nil, :outfile => nil}.merge(opts)

    outfile =
      if opt[:outfile]
        opt[:outfile]
      elsif opt[:outdir]
        # summary_xml may use either path separator; keep only its basename
        File.join(opt[:outdir], msms_pipeline_analysis.summary_xml.split(/[\/\\]/).last)
      end

    if opt[:update_summary_xml] && outfile
      self.msms_pipeline_analysis.summary_xml = File.expand_path(outfile)
    end

    builder = Nokogiri::XML::Builder.new(:encoding => XML_ENCODING)
    msms_pipeline_analysis.to_xml(builder)
    add_stylesheet(builder.doc, MS::Ident::Pepxml::XML_STYLESHEET_LOCATION)
    xml = builder.doc.to_xml

    return xml unless outfile

    File.open(outfile,'w') {|out| out.print(xml) }
    outfile
  end
end
|
|
111
|
+
|
|
112
|
+
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
require 'andand'
|
|
2
|
+
|
|
3
|
+
module MS ; end
|
|
4
|
+
module MS::Ident
|
|
5
|
+
module ProteinLike
  # an id for the protein
  attr_accessor :id

  # the protein sequence
  attr_accessor :sequence
  alias_method :seq, :sequence
  alias_method :seq=, :sequence=

  # a description of the protein
  attr_accessor :description

  # Returns the gene name captured by / GN=(\w+) ?/ in the description
  # (the tag must be preceded by a space), or nil when there is no
  # description or the tag is absent.
  def gene_id
    # guard both the missing description and the failed match; indexing a
    # nil match result would raise NoMethodError instead of returning nil
    md = description && description.match(/ GN=(\w+) ?/)
    md && md[1]
  end
end
|
|
23
|
+
|
|
24
|
+
# a generic protein class that is ProteinLike
|
|
25
|
+
class Protein
  include ProteinLike

  # Both arguments are optional; they are stored as the protein's id and
  # sequence (the description is left unset).
  def initialize(id=nil, sequence=nil)
    @id = id
    @sequence = sequence
  end
end
|
|
32
|
+
end
|
|
33
|
+
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
require 'set'
|
|
2
|
+
|
|
3
|
+
module MS
  module Ident
    # An Array of proteins (typically indistinguishable from one another in
    # the experiment) that also carries the peptide hits supporting it.
    class ProteinGroup < Array
      attr_accessor :peptide_hits

      # Default greediness metric. Receives a [protein_group, peptide_hits]
      # doublet and returns
      # [# uniq aaseqs, # uniq aaseq+charge, # peptide_hits].
      PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
        hits = protein_group_and_peptide_hits.last
        [
          hits.map {|hit| hit.aaseq }.uniq.size,
          hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size,
          hits.size
        ]
      end

      # Greedily maps peptide_hits onto protein groups.
      #
      # Each peptide hit must respond to :proteins (and to :aaseq and
      # :charge when the default metric is used). Proteins that are hit by
      # exactly the same set of peptide hits are collected into one
      # ProteinGroup. Sets of peptide hits and the objects returned by
      # peptide_hit#proteins are used as hash keys, so each needs a unique
      # signature; for Struct objects consider redefining #hash to be
      # object_id for performance and accuracy.
      #
      # The optional block receives a [protein_group, peptide_hit_set]
      # doublet and returns the value (or array) to sort by; the greediest
      # groups should sort to the back. Without a block
      # PRIORITIZE_PROTEINS is used.
      #
      # Groups are visited greediest first and a group is kept only if it
      # contributes at least one peptide hit not already claimed by a
      # greedier group; subsumed groups are discarded.
      #
      # If update_peptide_hits is true, :protein_groups= is called on every
      # hit of every kept group, once per group, with that group as the
      # argument. A Symbol may be passed instead to name a different method
      # to call.
      #
      # Returns an array of ProteinGroup objects, each with :peptide_hits set.
      def self.peptide_hits_to_protein_groups(peptide_hits, update_peptide_hits=false, &sort_by)
        update_peptide_hits = :protein_groups= if update_peptide_hits == true
        sort_by ||= PRIORITIZE_PROTEINS

        # protein => Set of the peptide hits that point at it
        hits_by_protein = Hash.new {|hash, protein| hash[protein] = Set.new }
        peptide_hits.each do |hit|
          hit.proteins.each {|protein| hits_by_protein[protein] << hit }
        end

        # identical hit set => proteins sharing it
        proteins_by_hit_set = Hash.new {|hash, hit_set| hash[hit_set] = [] }
        hits_by_protein.each do |protein, hit_set|
          proteins_by_hit_set[hit_set] << protein
        end

        # protein group => its hit set
        hits_by_group = {}
        proteins_by_hit_set.each do |hit_set, proteins|
          group = MS::Ident::ProteinGroup.new(proteins)
          group.peptide_hits = hit_set
          hits_by_group[group] = hit_set
        end

        greedy_first = hits_by_group.sort_by(&sort_by).reverse

        # the subsumed groups are dropped here (partition would retain them)
        claimed = Set.new
        greedy_first = greedy_first.select do |group, hit_set|
          # Set#add? is non-nil only for hits nobody has claimed yet
          newly_claimed = hit_set.select {|hit| claimed.add?(hit) }
          contributes = !newly_claimed.empty?
          group.peptide_hits = hit_set if contributes
          contributes
        end

        if update_peptide_hits
          greedy_first.each do |group, hit_set|
            hit_set.each {|hit| hit.send(update_peptide_hits, group) }
          end
        end

        greedy_first.map {|group, _hit_set| group }
      end

    end
  end
end
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
|
|
2
|
+
module MS
|
|
3
|
+
module Ident
|
|
4
|
+
|
|
5
|
+
module SearchLike
  # :id identifies the search; :peptide_hits holds its peptide hits and is
  # also reachable through the shorter #hits / #hits= aliases.
  attr_accessor :id, :peptide_hits
  alias_method :hits, :peptide_hits
  alias_method :hits=, :peptide_hits=

  # Disabled sketch, retained for reference (not active code).
  #
  # Returns an array of peptide_hits and protein_hits that are linked to
  # one another. NOTE: this would update the :proteins and :peptide_hits
  # attributes of the peptide and protein hits respectively. Assumes that
  # each search responds to :peptide_hits, each peptide to :proteins and
  # each protein to :peptide_hits. Can be done on a single file to restore
  # protein/peptide linkages to their original single-file state.
  # Assumes the protein is initialized with (reference, peptide_ar).
  #
  # Yields the protein that will become the template for a new protein
  # and expects a new protein hit.
  #def merge!(ar_of_peptide_hit_arrays)
  #  all_peptide_hits = []
  #  reference_hash = {}
  #  ar_of_peptide_hit_arrays.each do |peptide_hits|
  #    all_peptide_hits.push(*peptide_hits)
  #    peptide_hits.each do |peptide|
  #      peptide.proteins.each do |protein|
  #        id = protein.id
  #        if reference_hash.key?(id)
  #          reference_hash[id].peptide_hits << peptide
  #          reference_hash[id]
  #        else
  #          reference_hash[id] = yield(protein, [peptide])
  #        end
  #      end
  #    end
  #  end
  #  [all_peptide_hits, reference_hash.values]
  #end
end
|
|
41
|
+
|
|
42
|
+
class Search
  include SearchLike

  # Stores the given id and peptide hits; a fresh empty array is used for
  # the hits when none are supplied.
  def initialize(id=nil, peptide_hits=[])
    @id, @peptide_hits = id, peptide_hits
  end
end
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
module SearchGroup

  # an array of search objects
  attr_accessor :searches

  # the group's file extension (with no leading period)
  def extension
    'grp'
  end

  # the class instantiated for each search file path
  def search_class
    Search
  end

  # Reads a simple formatted file with paths to the search files and
  # returns the paths: lines without any word character and lines starting
  # with '#' are skipped, and trailing newlines are removed.
  def to_paths(file)
    IO.readlines(file).grep(/\w/).reject {|v| v =~ /^#/}.map {|v| v.chomp }
  end

  # loads the searches listed in the group file
  def from_file(file)
    from_filenames(to_paths(file))
  end

  # Appends one search_class object per filename to #searches. Aborts the
  # program with a message if a file does not exist.
  def from_filenames(filenames)
    filenames.each do |file|
      unless File.exist? file
        message = "File: #{file} does not exist!\n"
        message << "perhaps you need to modify the file with file paths"
        abort message
      end
      @searches << search_class.new(file)
    end
  end

  # Takes a single group filename (a String ending in '.' + #extension),
  # an array of search filenames, or an array of search objects. With no
  # argument the group starts out empty. Any other argument raises
  # ArgumentError. The optional block yields the object for further
  # processing.
  #
  # NOTE(review): opts is accepted for compatibility but is not currently
  # forwarded to the searches.
  def initialize(arg=nil, opts={})
    @peptide_hits = []
    @reference_hash = {}
    @searches = []

    if arg
      if arg.is_a?(String) && arg =~ /\.#{Regexp.escape(extension)}$/
        from_file(arg)
      elsif arg.is_a?(Array) && arg.first.is_a?(String)
        from_filenames(arg)
      elsif arg.is_a?(Array)
        # an array of ready-made search objects is used as is
        @searches = arg
      else
        raise ArgumentError, "must be file, array of filenames, or array of objs"
      end
      # The original followed this with
      #   @searches << search_class.new(file, opts)
      # which referenced an undefined local (file) and so raised on every
      # call; every branch above already fills @searches.
    end
    yield(self) if block_given?
  end

end
|
|
113
|
+
end
|
|
114
|
+
end
|
data/lib/ms/ident.rb
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
|
|
2
|
+
require 'ms/ident/protein_group'
|
|
3
|
+
require 'ms/ident/protein'
|
|
4
|
+
require 'ms/ident/peptide_hit'
|
|
5
|
+
|
|
6
|
+
module MS

  # Overview of the MS::Ident namespace:
  #
  # * MS::Ident::ProteinGroup - an array of proteins that responds to
  #   :peptide_hits. All protein level identifications should be stored in
  #   a ProteinGroup object.
  # * MS::Ident::Protein - an object representing a protein (:id,
  #   :sequence, :description). Note that it is not a protein hit (use a
  #   ProteinGroup for that).
  # * MS::Ident::PeptideHit - an object representing a match between an
  #   amino acid sequence and a spectrum.
  #
  # Typical usage:
  #
  #     require 'ms/ident'
  #
  #     hit1 = PeptideHit.new(:id => 1, :aaseq => 'PEPTIDE', :search =>
  #     MS::Ident::Search.new, etc...)
  #     peptide_hits = [hit1, hit2, ...]
  #
  #     protein_groups = MS::Ident::ProteinGroup.peptide_hits_to_protein_groups(peptide_hits)
  #     protein_groups.first.peptide_hits  # => the peptide hits in that group
  module Ident
    # Returns the filetype (if possible): :srf when the name ends in
    # '.srf' (case insensitive), otherwise nil.
    def self.filetype(file)
      :srf if file =~ /\.srf$/i
    end
  end
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
module MS
  module Isotope
    module AA
      # Atom counts of each amino acid, keyed by its one-letter code as a
      # String. The inner hashes are keyed by lower case element symbols.
      ATOM_COUNTS_STR = {
        'A' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'R' => { :c =>6, :h =>14 , :o =>2 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
        'N' => { :c =>4, :h =>8 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'D' => { :c =>4, :h =>7 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'C' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
        'E' => { :c =>5, :h =>9 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'Q' => { :c =>5, :h =>10 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'G' => { :c =>2, :h =>5 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'H' => { :c =>6, :h =>9 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
        'I' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'L' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'K' => { :c =>6, :h =>14 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'M' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
        'F' => { :c =>9, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'P' => { :c =>5, :h =>9 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'S' => { :c =>3, :h =>7 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'T' => { :c =>4, :h =>9 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'W' => { :c =>11, :h =>12 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'Y' => { :c =>9, :h =>11 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'V' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'U' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
        'O' => { :c =>12, :h =>21 , :o =>3 , :n =>3 , :s =>0 , :p =>0, :se =>0 }
      }

      # The same table keyed by Symbol. It is derived from ATOM_COUNTS_STR
      # so the two can never drift apart; each inner hash is a copy, so the
      # String and Symbol tables do not share objects.
      #
      # (The original instead wrote into ATOM_COUNTS before that constant
      # was defined, which raised NameError when the file was loaded.)
      ATOM_COUNTS_SYM = Hash[ATOM_COUNTS_STR.map {|aa, counts| [aa.to_sym, counts.dup] }]

      # string and symbol access of amino acid (atoms are all lower case
      # symbols)
      ATOM_COUNTS = ATOM_COUNTS_SYM.merge ATOM_COUNTS_STR
    end
  end
end
|
data/lib/ms/mascot.rb
ADDED
data/lib/ms/mass/aa.rb
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
require 'ms/mass'
|
|
2
|
+
|
|
3
|
+
module MS
  module Mass
    module AA
      # monoisotopic masses, amino acid keys as strings
      MONO_STRING = {
        "*"=>118.805716,
        "A"=>71.0371137878,
        "B"=>172.048405,
        "C"=>103.0091844778,
        "D"=>115.026943032,
        "E"=>129.0425930962,
        "F"=>147.0684139162,
        "G"=>57.0214637236,
        "H"=>137.0589118624,
        "I"=>113.0840639804,
        "K"=>128.0949630177,
        "L"=>113.0840639804,
        "M"=>131.0404846062,
        "N"=>114.0429274472,
        "O"=>211.1446528645,
        "P"=>97.052763852,
        "Q"=>128.0585775114,
        "R"=>156.1011110281,
        "S"=>87.0320284099,
        "T"=>101.0476784741,
        "U"=>150.9536355878,
        "V"=>99.0684139162,
        "W"=>186.0793129535,
        "X"=>118.805716,
        "Y"=>163.0633285383,
        "Z"=>128.550585
      }

      # average masses, amino acid keys as strings
      AVG_STRING = {
        "*"=>118.88603,
        "A"=>71.0779,
        "B"=>172.1405,
        "C"=>103.1429,
        "D"=>115.0874,
        "E"=>129.11398,
        "F"=>147.17386,
        "G"=>57.05132,
        "H"=>137.13928,
        "I"=>113.15764,
        "K"=>128.17228,
        "L"=>113.15764,
        "M"=>131.19606,
        "N"=>114.10264,
        "O"=>211.28076,
        "P"=>97.11518,
        "Q"=>128.12922,
        "R"=>156.18568,
        "S"=>87.0773,
        "T"=>101.10388,
        "U"=>150.0379,
        "V"=>99.13106,
        "W"=>186.2099,
        "X"=>118.88603,
        "Y"=>163.17326,
        "Z"=>128.6231
      }

      # monoisotopic masses, amino acid keys as symbols
      MONO_SYM = MONO_STRING.each_with_object({}) do |(aa, mass), by_symbol|
        by_symbol[aa.to_sym] = mass
      end

      # average masses, amino acid keys as symbols
      AVG_SYM = AVG_STRING.each_with_object({}) do |(aa, mass), by_symbol|
        by_symbol[aa.to_sym] = mass
      end

      # Monoisotopic amino acid masses keyed as symbols and also strings
      # (all upper case), merged with MS::Mass::MONO so that entries such
      # as the proton ('h+') are available too.
      MONO = MONO_SYM.merge(MONO_STRING).merge(MS::Mass::MONO)

      # Average amino acid masses keyed as symbols and also strings (all
      # upper case), merged with MS::Mass::AVG so that entries such as the
      # proton ('h+') are available too.
      AVG = AVG_SYM.merge(AVG_STRING).merge(MS::Mass::AVG)
    end
  end
end
|
data/lib/ms/mass.rb
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
|
|
2
|
+
module MS
  module Mass

    # Computes the monoisotopic mass of a chemical formula given in this
    # format: C2BrH12O (an upper case letter, an optional lower case
    # letter, an optional count; a missing count means 1).
    #
    # Returns the mass as a Float, or nil if no element is found in the
    # formula. Raises ArgumentError when the formula contains an element
    # that has no entry in MONO.
    def self.formula_to_exact_mass(formula)
      # TODO: add other input methods
      formula.scan(/([A-Z][a-z]?)(\d*)/).map do |element, count|
        mass = MONO[element.downcase]
        # without this check an unknown element failed with an opaque
        # NoMethodError on nil
        unless mass
          raise ArgumentError, "unknown element '#{element}' in formula '#{formula}'"
        end
        mass * (count == '' ? 1 : count.to_i)
      end.reduce(:+)
    end

    H_PLUS = 1.00727646677

    # Monoisotopic masses keyed by lower case strings.
    # Entries marked with + are from http://www.unimod.org/masses.html
    MONO_STR = {
      'c' => 12.0,  # +
      'br' => 78.9183361,  # +
      'd' => 2.014101779,  # +
      'f' => 18.99840322,  # +
      'n' => 14.003074,  # +
      'o' => 15.99491463,  # +
      'na' => 22.9897677,  # +
      'p' => 30.973762,  # +
      's' => 31.9720707,  # +
      'li' => 7.016003,  # +
      'cl' => 34.96885272,  # +
      'k' => 38.9637074,  # +
      'si' => 27.9769265325, # http://physics.nist.gov/cgi-bin/Compositions/stand_alone.pl?ele=Si&ascii=html&isotype=some
      'i' => 126.904473,  # +
      'h+' => 1.00727646677,
      'h' => 1.007825035,  # +
      'h2o' => 18.0105647,
      'oh' => 17.002739665,
    }

    # Average masses keyed by lower case strings.
    AVG_STR = {
      'h+' => 1.007276,  # using Mascot_H_plus mass (is this right for AVG??)
      'h' => 1.00794,
      'h2o' => 18.01528,
      'oh' => 17.00734,
    }

    # the same tables keyed by symbols
    MONO_SYM = Hash[MONO_STR.map {|k, v| [k.to_sym, v] }]
    AVG_SYM = Hash[AVG_STR.map {|k, v| [k.to_sym, v] }]

    # combined tables that answer to both string and symbol keys
    MONO = MONO_STR.merge(MONO_SYM)
    AVG = AVG_STR.merge(AVG_SYM)
  end
end
|
|
54
|
+
|
|
55
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
module MS
|
|
2
|
+
class Mzml
|
|
3
|
+
# A simple array of indices but #[] has been overloaded to find an index
|
|
4
|
+
# by name
|
|
5
|
+
#
|
|
6
|
+
# index_list[0] # the first index
|
|
7
|
+
# index_list.map(&:name) # -> [:spectrum, :chromatogram]
|
|
8
|
+
# index_list[:spectrum] # the spectrum index
|
|
9
|
+
# index_list[:chromatogram] # the chromatogram index
|
|
10
|
+
class IndexList < Array
  alias_method :old_bracket_slice, :'[]'

  # Looks up an index by position or by name.
  #
  # @param [Object] int_or_symbol an Integer (index number) or a Symbol
  #   (:spectrum or :chromatogram). A Range, or an Integer start followed
  #   by a length, is passed to the normal Array slicing as well.
  # @return [MS::Mzml::Index] an index object (nil if no index has that
  #   name); an array-like slice for range and (start, length) access
  def [](int_or_symbol, *rest)
    # the original only recognised a lone Integer, which broke
    # Array-style list[0..1] and list[0, 2] access on this subclass
    positional = !rest.empty? || int_or_symbol.is_a?(Integer) || int_or_symbol.is_a?(Range)
    if positional
      old_bracket_slice(int_or_symbol, *rest)
    else
      self.find {|index| index.name == int_or_symbol }
    end
  end
end
|
|
24
|
+
|
|
25
|
+
# the array holds start bytes
|
|
26
|
+
class Index < Array

  class << self
    # Does a single jump backwards from the tail of the file and looks for
    # an xml element named tag whose content is an integer.
    #
    # Returns the content as an Integer, or nil if the element is not
    # found in the region read.
    def index_offset(io, tag='indexListOffset', bytes_backwards=200)
      tag_re = %r{<#{tag}>([\-\d]+)</#{tag}>}
      # clamp at the start of the stream: seeking to a negative position
      # fails for inputs shorter than bytes_backwards
      io.pos = [(io.size - 1) - bytes_backwards, 0].max
      md = io.readlines("\n").map {|line| line.match(tag_re) }.compact.shift
      md[1].to_i if md
    end
  end

  # an index indexed by scan number
  attr_accessor :by_scans

  # the name of the index (as a symbol)
  attr_accessor :name

  # a parallel array of ids (idRef's)
  attr_accessor :ids

  # returns [start_byte, id] for the entry at position int
  def start_byte_and_id(int)
    [self[int], ids[int]]
  end

  # returns hash of id to start_byte
  def create_id_index
    Hash[self.ids.zip(self)]
  end

  # @return [Integer] the start byte of the spectrum (nil for any other
  #   argument type)
  # @param [Object] arg an Integer (the index number) or String (an id string)
  def start_byte(arg)
    case arg
    when Integer
      self[arg]
    when String
      # the id => start_byte hash is built on first use and then reused
      @id_index ||= create_id_index
      @id_index[arg]
    end
  end

  # Generates a hash that points from scan number (parsed from
  # "scan=<digits>" in each id) to the spectrum index number.
  #
  # Returns the hash (also stored in #by_scans when it is non-empty), nil
  # if there are ids but none carries a scan number, or false if the scan
  # numbers are not unique.
  def create_scan_to_index
    scan_re = /scan=(\d+)/
    scan_to_index = {}
    ids.each_with_index do |id, index|
      md = id.match(scan_re)
      next unless md
      scan_num = md[1].to_i
      return false if scan_to_index.key?(scan_num)
      scan_to_index[scan_num] = index
    end
    if scan_to_index.size > 0
      # must go through self: a bare `by_scans = ...` only creates a local
      # variable and leaves the attribute unset
      self.by_scans = scan_to_index
    elsif ids.size > 0
      nil  # there are scans, but we did not find scan numbers
    else
      scan_to_index
    end
  end
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|